본문 바로가기
💻 Programming/Python

[python] 구글 검색 이미지 크롤링/스크래핑하기

by 뭅즤 2023. 8. 15.
반응형

구글 검색 결과 이미지를 파이썬으로 크롤링하는 방법을 정리한다.

 

Selenium

์šฐ์„  ์›นํฌ๋กค๋ง/์Šคํฌ๋ž˜ํ•‘์„ ํ•˜๋Š” ๊ฒฝ์šฐ ์ฃผ๋กœ Selenium์ด๋ผ๋Š” ํŒจํ‚ค์ง€๋ฅผ ์‚ฌ์šฉํ•˜๊ธฐ์— ๊ฐ„๋‹จํžˆ ์•Œ์•„๋ณด๊ณ  ์ง€๋‚˜๊ฐ€์ž.

 

Selenium์€ ์›น ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜์„ ํ…Œ์ŠคํŠธํ•˜๊ฑฐ๋‚˜ ์ž๋™ํ™”ํ•˜๋Š” ๋ฐ ์‚ฌ์šฉ๋˜๋Š” ํ”„๋ ˆ์ž„์›Œํฌ์ด๋‹ค. ์ฃผ๋กœ ์›น ๋ธŒ๋ผ์šฐ์ €์™€ ์ƒํ˜ธ ์ž‘์šฉํ•˜๋ฉฐ, ์›น ํŽ˜์ด์ง€๋ฅผ ์ œ์–ดํ•˜๊ณ  ํ…Œ์ŠคํŠธํ•˜๋Š” ๋ฐ ์œ ์šฉํ•œ ๋„๊ตฌ๋กœ, ๋‹ค์–‘ํ•œ ํ”„๋กœ๊ทธ๋ž˜๋ฐ ์–ธ์–ด๋กœ ์ง€์›๋˜๋ฉฐ, ์›น ํŽ˜์ด์ง€์˜ ์š”์†Œ๋ฅผ ์ฐพ๊ณ  ์กฐ์ž‘ํ•  ์ˆ˜ ์žˆ์–ด ์›น ์Šคํฌ๋ž˜ํ•‘ ๋˜๋Š” ์›น ์ž๋™ํ™”์— ํ™œ์šฉ๋œ๋‹ค.

 

<예시 코드>

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Minimal Selenium example: open Google, type a query, submit it, and close
# the browser again.

# Path to the ChromeDriver binary that was downloaded separately.
driver_path = "경로/chromedriver"

# Current Selenium 4 releases take the driver location through a Service
# object; the old `executable_path` keyword on webdriver.Chrome is gone.
driver = webdriver.Chrome(service=Service(executable_path=driver_path))

try:
    # Open the Google search page.
    driver.get("https://www.google.com")

    # Type the search term. The `find_element_by_*` helpers were removed in
    # Selenium 4, so the input is located with a By strategy instead.
    search_box = driver.find_element(By.NAME, "q")
    search_box.send_keys("Selenium 사용법")

    # Run the search.
    search_box.submit()
finally:
    # Always shut the browser down, even if one of the steps above raised.
    driver.quit()

 

구글 검색 이미지 크롤링 하기
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import urllib.request
import time
import os

def scroll_down(max_idle_rounds=3):
    """Scroll the results page down until no further results can be loaded.

    Uses the module-level ``driver``. Each round presses END on the page
    body, clicks the "show more results" button when it is visible, and
    stops as soon as the "no more content" notice is displayed.

    Args:
        max_idle_rounds: Number of consecutive rounds in which the page
            height may stay unchanged before scrolling is abandoned. This is
            a safety net for pages where the end-of-results notice never
            appears (different locale, changed markup); without it the loop
            would spin forever.
    """
    # Imported locally so this function does not depend on changes to the
    # file-level import block.
    from selenium.common.exceptions import (
        ElementClickInterceptedException,
        ElementNotInteractableException,
        NoSuchElementException,
        StaleElementReferenceException,
    )

    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    idle_rounds = 0

    while True:
        time.sleep(3)
        # Jump to the very bottom of the page.
        driver.find_element(By.XPATH, '//body').send_keys(Keys.END)
        time.sleep(3)
        try:
            # Click the "show more results" button if it is visible.
            load_more_button = driver.find_element(By.XPATH, '//*[@id="islmp"]/div/div/div/div/div[1]/div[2]/div[2]/input')
            if load_more_button.is_displayed():
                load_more_button.click()
        except (
            NoSuchElementException,
            StaleElementReferenceException,
            ElementNotInteractableException,
            ElementClickInterceptedException,
        ):
            # Best effort: the button is usually absent or not clickable yet.
            pass
        time.sleep(3)
        try:
            # Stop once the "no more content to show" notice is displayed.
            no_more_content = driver.find_element(By.XPATH, '//div[@class="K25wae"]//*[text()="더 이상 표시할 콘텐츠가 없습니다."]')
            if no_more_content.is_displayed():
                break
        except (NoSuchElementException, StaleElementReferenceException):
            pass

        # Fallback exit: give up when the page has stopped growing.
        current_height = driver.execute_script(height_script)
        if current_height == last_height:
            idle_rounds += 1
            if idle_rounds >= max_idle_rounds:
                break
        else:
            idle_rounds = 0
            last_height = current_height

if __name__ == "__main__":
    # Script entry point: ask for a search term and an image count, load the
    # Google image results for that term, and download the thumbnails that
    # expose a `data-src` URL into the `saved_image` directory.
    from urllib.parse import quote_plus

    query = input("검색어 : ")
    image_cnt = int(input("수집할 이미지 개수 : "))

    save_dir = "saved_image"  # directory the images are written to
    os.makedirs(save_dir, exist_ok=True)  # no error if it already exists

    # `driver` must stay a module-level name: scroll_down() reads it.
    driver = webdriver.Chrome()
    try:
        URL = 'https://www.google.com/search?tbm=isch&q='
        # Encode the query so spaces, '&', '#', etc. cannot break the URL.
        driver.get(URL + quote_plus(query))

        scroll_down()  # load as many results as the page will give

        # Extract the image tags from the fully scrolled page.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        image_info_list = soup.find_all('img', class_='rg_i')
        image_and_name_list = []

        print('=== 이미지 수집 시작 ===')

        file_prefix = query.replace(' ', '_')
        for image_info in image_info_list:
            # Count collected images, not inspected tags: tags without a
            # usable `data-src` are skipped and must not use up the quota.
            if len(image_and_name_list) >= image_cnt:
                break
            image_url = image_info.get('data-src')
            if not image_url:
                continue
            file_name = f"{file_prefix}_{len(image_and_name_list)}.jpg"
            image_and_name_list.append((image_url, os.path.join(save_dir, file_name)))

        # Download the images; one failing URL must not abort the batch.
        for image_url, image_path in image_and_name_list:
            try:
                urllib.request.urlretrieve(image_url, image_path)
            except (OSError, ValueError) as err:
                # URLError/HTTPError derive from OSError; ValueError covers
                # URLs urllib cannot interpret.
                print(f"download failed: {image_url} ({err})")

        print('=== 이미지 수집 종료 ===')
    finally:
        # quit() ends the whole driver session, not just the current window,
        # and runs even when a step above raised.
        driver.quit()

이 코드는 검색어와 이미지 수집 개수를 입력받아서 구글 이미지 검색 결과에서 이미지를 수집하는 기능을 수행한다. 조금 더 구체적으로는 검색어로 검색 이후 페이지를 아래로 스크롤하면서 이미지를 더 가져오고, 검색 결과 페이지에서 이미지를 추출하여 다운로드한다.

 

 

반응형