rubus0304 님의 블로그

[최종프로젝트 1일차] 본문

Data Analyst/daily

[최종프로젝트 1일차]

rubus0304 2025. 1. 7. 21:39

기획서 의견조율 및 웹크롤링 시도 

개인 맞춤형 여행코스 추천 플랫폼: 대전을 중심으로

https://docs.google.com/document/d/1ZZM8FRppvcXzF-aNP0ZY9xeCc5QgL2FcR2eS5ugMu0s/edit?tab=t.pln2up5i1p5nMYRO - AI 여행 플래너

 

import random
import re
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
 

 

class Colors:
    """ANSI escape sequences used to colorize the crawler's console output.

    Concatenate one of the color codes before a piece of text and append
    ``RESET`` after it to return the terminal to its default style.
    """

    # Foreground color codes.
    RED = "\033[91m"
    GREEN = "\033[92m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"

    # Clears any color/style set by the codes above.
    RESET = "\033[0m"

def switch_left():
    """Move the driver's focus into the ``searchIframe`` frame.

    Steps up to the parent frame first, then waits up to 10 seconds for the
    iframe element to be present in the DOM before switching into it.
    Uses the module-level ``driver``.
    """
    driver.switch_to.parent_frame()

    locator = (By.XPATH, '//*[@id="searchIframe"]')
    frame_is_present = EC.presence_of_element_located(locator)
    search_frame = WebDriverWait(driver, 10).until(frame_is_present)

    driver.switch_to.frame(search_frame)

def switch_right():
    """Move the driver's focus into the ``entryIframe`` frame.

    Steps up to the parent frame first, then waits up to 10 seconds for the
    iframe element to be present in the DOM before switching into it.
    Uses the module-level ``driver``.
    """
    driver.switch_to.parent_frame()

    locator = (By.XPATH, '//*[@id="entryIframe"]')
    frame_is_present = EC.presence_of_element_located(locator)
    entry_frame = WebDriverWait(driver, 10).until(frame_is_present)

    driver.switch_to.frame(entry_frame)

# Chrome launch options: a desktop user-agent string and a fixed window size.
options = webdriver.ChromeOptions()
for chrome_arg in (
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'window-size=1380,900',
):
    options.add_argument(chrome_arg)
driver = webdriver.Chrome(options=options)

# Naver Map search URL (the query is the percent-encoded text "대전 맛집").
URL = 'https://map.naver.com/p/search/%EB%8C%80%EC%A0%84%20%EB%A7%9B%EC%A7%91?c=10.00,0,0,0,dh'
driver.get(URL)
# Element lookups made after this point retry for up to 3 seconds.
driver.implicitly_wait(3)

# XPath of the pagination "next page" control inside the search-results frame.
_NEXT_PAGE_XPATH = '//*[@id="app-root"]/div/div[3]/div[2]/a[7]'
# XPath matching every <li> under the scrollable result list.
_STORE_ITEMS_XPATH = '//*[@id="_pcmap_list_scroll_container"]//li'


def _scrape_store_detail():
    """Scrape the detail panel of the store that was just clicked.

    Switches into the detail frame with ``switch_right()`` and reads every
    field it can find. Any failure is printed and the fields collected so far
    are returned, so one broken store never aborts the crawl.

    Returns:
        dict: the scraped record. Fields that could not be read keep their
        defaults (empty string / empty list / zero).
    """
    store = {
        'store_name': '',
        'category': '',
        'new_open': '',
        'rating': 0.0,
        'visited_review': 0,
        'blog_review': 0,
        # NOTE: nothing below populates store_id; it is always printed empty.
        'store_id': '',
        'address': '',
        'business_hours': [],
        'phone_num': '',
        'menu': [],
        'top_reviews': [],
        'theme_keywords': {'분위기': [], '인기토픽': [], '찾는목적': []},
    }

    try:
        switch_right()

        # Title area: name, category and an optional third span.
        title = driver.find_element(By.XPATH, '//div[@class="zD5Nm undefined"]')
        store['store_name'] = title.find_element(By.XPATH, './/div[1]/div[1]/span[1]').text
        store['category'] = title.find_element(By.XPATH, './/div[1]/div[1]/span[2]').text
        if len(title.find_elements(By.XPATH, './/div[1]/div[1]/span')) > 2:
            store['new_open'] = title.find_element(By.XPATH, './/div[1]/div[1]/span[3]').text

        # Review summary: only trusted when all three spans are present.
        review_spans = title.find_elements(By.XPATH, './/div[2]/span')
        if len(review_spans) > 2:
            store['rating'] = review_spans[0].text
            store['visited_review'] = review_spans[1].text
            store['blog_review'] = review_spans[2].text

        # Menu
        menu_elements = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[5]/div/div[3]/div[1]')
        store['menu'] = [menu_element.text for menu_element in menu_elements]

        # Visitor reviews (at most five)
        visitor_reviews = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div/div[7]/div[1]/div[1]/div/a/ul')
        store['top_reviews'] = [review.text for review in visitor_reviews[:5]]

        # Theme keywords, grouped by the category label shown on the page.
        theme_elements = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div/div[9]/div[1]')
        for theme_element in theme_elements:
            category_name = theme_element.find_element(By.CLASS_NAME, 'pNnVF').text
            keywords = theme_element.find_element(By.CLASS_NAME, 'sJgQj').find_elements(By.TAG_NAME, 'span')
            store['theme_keywords'][category_name] = [keyword.text for keyword in keywords]

        # Address
        store['address'] = driver.find_element(By.XPATH, '//span[@class="LDgIH"]').text

        # Business hours: best-effort, since the toggle may be missing.
        try:
            driver.find_element(By.XPATH, '//div[@class="y6tNq"]//span').click()
            sleep(2)
            parent_element = driver.find_element(By.XPATH, '//a[@class="gKP9i RMgN0"]')
            child_elements = parent_element.find_elements(By.XPATH, './*[@class="w9QyJ" or @class="w9QyJ undefined"]')
            store['business_hours'] = [child.text for child in child_elements]
        except WebDriverException:
            pass  # no hours for this store; keep the empty default

        # Phone number: looked up independently so that a missing hours
        # toggle no longer prevents it from being read.
        try:
            store['phone_num'] = driver.find_element(By.XPATH, '//span[@class="xlx7Q"]').text
        except WebDriverException:
            pass  # no phone number for this store; keep the empty default

    except Exception as ex:
        print(Colors.RED + '------------ 데이터 크롤링 오류 ------------' + Colors.RESET)
        print(ex)

    return store


def _print_store(index, store):
    """Print one scraped store record to the console using ``Colors``."""
    print(Colors.BLUE + f"{index}. {store['store_name']}" + Colors.RESET + ' · ' + store['category'])
    print('메뉴:', store['menu'])
    print('상위 5개 리뷰:', store['top_reviews'])
    print('테마 키워드:', store['theme_keywords'])
    print(Colors.MAGENTA + '-' * 50 + Colors.RESET)
    # f-string formatting: the review fields may still be their int defaults,
    # which plain string concatenation would reject with a TypeError.
    print(f"평점 {Colors.RED}{store['rating']}{Colors.RESET} / {store['visited_review']} · {store['blog_review']}")
    print(f"가게 고유 번호 -> {store['store_id']}")
    print('가게 주소 ' + Colors.GREEN + str(store['address']) + Colors.RESET)
    print(Colors.CYAN + '가게 영업 시간' + Colors.RESET)
    for hours_line in store['business_hours']:
        print(hours_line)
        print('')
    print('가게 번호 ' + Colors.GREEN + store['phone_num'] + Colors.RESET)
    print(Colors.MAGENTA + "-"*50 + Colors.RESET)


try:
    while True:
        switch_left()

        # Current page number; if no pagination marker exists, treat the
        # list as page 1 (assumption: a single-page result has no pager).
        page_markers = driver.find_elements(By.XPATH, '//a[contains(@class, "mBN2s qxokY")]')
        page_no = page_markers[0].text if page_markers else '1'

        # Store entries on this page. On page 1 the first two <li> are
        # skipped, as in the original script (presumably non-store entries
        # such as ads; confirm against the live page).
        items = driver.find_elements(By.XPATH, _STORE_ITEMS_XPATH)
        if page_no == '1':
            items = items[2:]

        store_no = 0
        for item in items:
            # Open the store's detail panel. An <li> without the expected
            # clickable title is skipped instead of aborting the crawl.
            try:
                item.find_element(By.CLASS_NAME, 'CHC5F').find_element(By.XPATH, ".//a/div/div/span").click()
            except WebDriverException:
                continue
            sleep(2)

            store_no += 1
            _print_store(store_no, _scrape_store_detail())

            switch_left()
            sleep(2)

        # Decide about paging only AFTER scraping, so the last page (whose
        # next button is disabled) is crawled too.
        next_buttons = driver.find_elements(By.XPATH, _NEXT_PAGE_XPATH)
        if not next_buttons or next_buttons[0].get_attribute('aria-disabled') != 'false':
            break
        next_buttons[0].click()
        # Give the result list time to reload before reading it again.
        sleep(2)
finally:
    # Always release the browser, even if the crawl is interrupted.
    driver.quit()

 

 

크롤링 결과, 메뉴 등 일부 항목은 생각보다 리스트에 수집되지 않는 경우가 많았음. (가게마다 상세 페이지 구조가 달라 고정 XPath가 맞지 않는 것으로 추정 — 추후 확인 필요)

 

'Data Analyst > daily' 카테고리의 다른 글

[QCC 6회차]  (0) 2025.01.10
[프로젝트 2일차]  (0) 2025.01.08
[데이터 파이프라인 1강]  (0) 2025.01.07
[라이브세션] '5분 기록 보드'로 20시간 절약하기  (0) 2025.01.06
[QCC 5회차]  (1) 2025.01.03