[프로젝트 5일차]

Notice

Recent Posts

Recent Comments

Link

« 2025/01 »
일	월	화	수	목	금	토
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30	31

Tags more

Archives

Today

Total

관리 메뉴

rubus0304 님의 블로그

[프로젝트 5일차] 본문

Data Analyst/daily

[프로젝트 5일차]

rubus0304 2025. 1. 13. 20:25

대전 명소 웹크롤링 완성

[에러코드 - 상세정보부터]

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains

import json

import time

from time import sleep

import random

import re

# List that accumulates the scraped records; the crawl loop appends one dict per place.

store_data = []

# 왼쪽 iframe 전환

def switch_left():
    """Move the driver's focus into the left-hand (search results) iframe.

    Steps up to the parent frame first, then switches into the element whose
    id is ``searchIframe``.
    """
    driver.switch_to.parent_frame()
    driver.switch_to.frame(
        driver.find_element(By.XPATH, '//*[@id="searchIframe"]')
    )

# 오른쪽 iframe 전환

def switch_right():
    """Move the driver's focus into the right-hand (place detail) iframe.

    Steps up to the parent frame first, then switches into the element whose
    id is ``entryIframe``.
    """
    driver.switch_to.parent_frame()
    driver.switch_to.frame(
        driver.find_element(By.XPATH, '//*[@id="entryIframe"]')
    )

# Naver Map URL.
# Paste the URL of the keyword search performed on the map here.
URL = 'https://map.naver.com/p/search/%EB%8C%80%EC%A0%84%20%EB%AA%85%EC%86%8C?c=10.00,0,0,0,dh'

# Chrome launch options: a fixed user-agent string and a fixed window size.
options = webdriver.ChromeOptions()
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3')
options.add_argument('window-size=1380,900')

# Start the browser and open the search page.
driver = webdriver.Chrome(options=options)
driver.get(URL)

# Implicit wait of 5 seconds for this driver session.
driver.implicitly_wait(5)

# 한 번에 끝까지 스크롤

def scroll_to_bottom():
    """Scroll the current document to the bottom until it stops growing.

    Repeatedly jumps to ``document.body.scrollHeight``, pauses two seconds so
    content can load, and returns once two consecutive height readings match.
    """
    height_script = "return document.body.scrollHeight"

    previous_height = driver.execute_script(height_script)  # initial height
    while True:
        # Jump to the end of the page.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)  # give the page time to load

        current_height = driver.execute_script(height_script)
        if current_height == previous_height:
            # Height did not change: nothing more was loaded.
            return
        previous_height = current_height

# 1000px씩 끊어서 스크롤

def scroll_to_1000px(scrollable_element):
    """Scroll inside ``scrollable_element`` in 1000 px steps until it stops growing.

    After each step the function pauses two seconds and re-reads the element's
    ``scrollHeight``; it returns once two consecutive readings are equal.

    Args:
        scrollable_element: The element whose ``scrollTop`` is advanced.
    """
    height_script = "return arguments[0].scrollHeight"

    previous_height = driver.execute_script(height_script, scrollable_element)
    while True:
        # Move down 1000 px within the element itself (not the window).
        driver.execute_script("arguments[0].scrollTop += 1000;", scrollable_element)
        sleep(2)  # give the list time to load

        current_height = driver.execute_script(height_script, scrollable_element)
        if current_height == previous_height:
            # Height did not change: nothing more was loaded.
            return
        previous_height = current_height

# Running total of crawl time over all places, in seconds.
total_elapsed_time = 0

##### If a previous run stopped midway, run the code below before the loop #####
# switch_left()
# # Copy-paste the next two lines once for every page that has to be skipped.
# driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div[2]/div[2]/a[last()]').click()
# sleep(1)
# driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div[2]/div[2]/a[last()]').click()
# sleep(1)
# driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div[2]/div[2]/a[last()]').click()
# sleep(1)
# driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div[2]/div[2]/a[last()]').click()
# sleep(1)


def _first_text(xpath, default=''):
    """Return the text of the first element matching ``xpath``, or ``default``.

    The lookup runs in whatever frame the driver is currently focused on.
    Any ordinary lookup failure yields ``default`` because the scraped fields
    are optional; KeyboardInterrupt/SystemExit are deliberately not caught.
    """
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        return default


# Crawl every result page: for each list item open its detail panel, scrape
# it, and checkpoint everything collected so far to disk.
while True:
    switch_left()

    # Scroll the result list to its end (1000 px at a time) before the items
    # are collected.
    scrollable_element = driver.find_element(By.CLASS_NAME, "Ryr1F")
    scroll_to_1000px(scrollable_element)

    # Current page number, read from the pagination link with this class.
    page_no = driver.find_element(By.XPATH, '//a[contains(@class, "mBN2s qxokY")]').text

    ##### First run: start from the first item of the page #####
    elements = driver.find_elements(By.XPATH, '//*[@id="_pcmap_list_scroll_container"]//li')
    # ##### Resuming after an interruption: adjust the numbers in the line below #####
    # elements = driver.find_elements(By.XPATH, '//*[@id="_pcmap_list_scroll_container"]//li')[7:] if page_no == '5' else driver.find_elements(By.XPATH, '//*[@id="_pcmap_list_scroll_container"]//li')

    for index, e in enumerate(elements, start=1):
        # Ad detection: is_ad is True when the item carries the hidden ad label.
        # NOTE(review): is_ad / ad_tag are only consumed by the commented-out
        # code below; with the implicit wait set on the driver this probe
        # presumably costs the full wait for every non-ad item - confirm.
        try:
            ad_tag = e.find_element(By.XPATH, './/span[@class="place_blind" and text()="광고"]')
        except Exception:
            is_ad = False
        else:
            is_ad = True

        # Record skeleton with default values for this place.
        store_info = {
            'index' : index,
            'page' : page_no,
            'store_name' : '',
            'category' : '',
            # 'is_ad' : is_ad,
            # 'rating' : 0.0,
            'visited_review' : 0,
            'blog_review' : 0,
            'address' : '',
            'link' : '',
            'phone_num' : '',
            'menu' : [],
            'keyword_reviews' : [],
        }

        # Start of the per-place timing.
        start_time = time.time()

        # Click the place name to open its detail panel.
        clickable_element = e.find_element(By.CLASS_NAME, 'ouxiq').find_element(By.XPATH, ".//a/div/div/span[1]")

        # if is_ad == True:
        #     # Get the position and size of the element
        #     location = clickable_element.location
        #     size = clickable_element.size
        #     # Width of the ad tag
        #     ad_tag_width = ad_tag.size['width']
        #     # Shift the click slightly to the right (x_offset is adjustable)
        #     x_offset = ad_tag_width + 5  # ad tag width plus 5 px from the left edge
        #     y_offset = size['height'] // 2  # vertical centre of the element
        #     # Click at the shifted position with ActionChains
        #     actions = ActionChains(driver)
        #     actions.move_to_element_with_offset(clickable_element, x_offset, y_offset).click().perform()
        # else:
        clickable_element.click()
        sleep(2)

        switch_right()

        # Reset per place so a failed header lookup can never fall back to the
        # previous place's (stale) header element.
        title = None

        try:
            # Header: place name and category.
            try:
                scroll_to_bottom()
                sleep(2)
                title = driver.find_element(By.XPATH, '//div[@class="zD5Nm undefined"]')
                store_info['store_name'] = title.find_element(By.XPATH, './/div[1]/div[1]/span[1]').text
                store_info['category'] = title.find_element(By.XPATH, './/div[1]/div[1]/span[2]').text
            except Exception:
                pass  # optional fields: keep the defaults

            # Review figures shown in the header.
            if title is not None:
                try:
                    review_elements = title.find_elements(By.XPATH, './/div[2]/span')
                    if len(review_elements) > 2:
                        # Three or more spans: rating, visitor reviews, blog
                        # reviews. Note 'rating' exists only on such records.
                        store_info['rating'] = review_elements[0].text
                        store_info['visited_review'] = review_elements[1].text
                        store_info['blog_review'] = review_elements[2].text
                    elif len(review_elements) > 1:
                        # Two spans: visitor reviews and blog reviews only.
                        store_info['visited_review'] = review_elements[0].text
                        store_info['blog_review'] = review_elements[1].text
                except Exception:
                    pass  # optional fields: keep the defaults

            # Address.
            store_info['address'] = _first_text('//span[@class="LDgIH"]', store_info['address'])

            # Phone number.
            store_info['phone_num'] = _first_text('//span[@class="xlx7Q"]', store_info['phone_num'])

            # Link: the anchor's displayed text (not its href attribute).
            store_info['link'] = _first_text('//div[@class="jO09N"]/a', store_info['link'])

            # Menu items: name, list of price texts, list of extra info texts.
            try:
                menu_elements = driver.find_elements(By.XPATH, '//ul[contains(@class, "jG53P")]/li')
                store_info['menu'] = []
                # Appended one by one so items parsed before a failure are kept.
                for menu_element in menu_elements:
                    name = menu_element.find_element(By.XPATH, './/div[@class="FtVpy"]').text
                    price = menu_element.find_elements(By.XPATH, './/span[@class="QW7xr"]')
                    price_texts = [p.text for p in price]
                    info = menu_element.find_elements(By.XPATH, './/div[contains(@class, "uNeKV")]')
                    info_texts = [i.text for i in info]
                    store_info['menu'].append({"name": name,"price": price_texts, 'info':info_texts})
            except Exception:
                pass  # optional field: keep whatever was collected

            # Visitor keyword reviews: click to expand, then collect the rows.
            try:
                driver.find_element(By.XPATH, '//div[@class="KERaF"]').click()
                sleep(2)
                parent_element_review = driver.find_element(By.XPATH, '//div[@class="wvfSn"]')
                child_elements_review = parent_element_review.find_elements(By.XPATH, './div[@class="jypaX" or @class="mrSZf"]')
                store_info['keyword_reviews'] = [child.text for child in child_elements_review]
            except Exception:
                pass  # optional field: keep the default

        except Exception as ex:
            print('------------ 데이터 크롤링 오류 ------------' )
            print(ex)

        store_data.append(store_info)

        # Back to the result list for the next item.
        switch_left()
        sleep(2)

        # Checkpoint everything collected so far as JSON after every place.
        # NOTE: mode "w" overwrites the file. When resuming in a fresh run,
        # change the file name below (e.g. add a number) so the earlier
        # output is not lost.
        with open("tourist_data.json", "w", encoding="utf-8") as f:
            json.dump(store_data, f, ensure_ascii=False, indent=4)

        # Echo the record (for testing).
        print(json.dumps(store_info, ensure_ascii=False, indent=4))

        elapsed_time = time.time() - start_time  # time spent on this place
        total_elapsed_time += elapsed_time  # update the running total

        print(f"Crawling Time: {elapsed_time:.2f} seconds")
        print(f"Total Elapsed Time: {total_elapsed_time:.2f} seconds")

    # Go to the next page: the last pagination anchor is clicked while its
    # aria-disabled attribute is 'false'; otherwise the crawl is finished.
    next_button = driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div[2]/div[2]/a[last()]')
    next_page = next_button.get_attribute('aria-disabled')
    if next_page == 'false':
        next_button.click()
    else:
        break

'Data Analyst > daily' 카테고리의 다른 글

[프로젝트 6일차] (0)	2025.01.14
[프로젝트 4일차] (0)	2025.01.10
[QCC 6회차] (0)	2025.01.10
[프로젝트 2일차] (0)	2025.01.08
[최종프로젝트 1일차] (0)	2025.01.07

'Data Analyst/daily' Related Articles

rubus0304 님의 블로그

[프로젝트 5일차] 본문

[프로젝트 5일차]

'Data Analyst > daily' 카테고리의 다른 글

티스토리툴바