selenium 과 BeautifulSoup 그리고 크롬을 이용한 예제

web page의 원하는 부분을 읽어낼때는 크롬의 F12를 눌러 DevTools 를 이용합니다.

여기에서는 daum page의 "로또당첨번호" 라는 텍스트 위치를 읽어 내보도록 하겠습니다.

DevTools에서 마우스를 클릭해가면서 원하는 부분을 찾습니다.
그리고 마우스 우측 버튼을 눌러 Copy > Copy selector를 합니다.
여기에서는 아래와 같은 값이 되며 해당값은 CSS selector로 BeautifulSoup 에서 원하는 위치를 읽어낼때 사용하게 될것입니다.

#wrapSearch > div.slide_favorsch > ul:nth-child(2) > li:nth-child(1) > a

해당 부분이 왜 저렇게 되는지는 css selector 를 검색 해보시기 바랍니다.

selenium을 이용해 html 읽기

selenium 을 통해서 html을 읽는 예제는 아래와 같이 작업하였습니다.

아래 파일은 selenium_v1.py 로 저장하겠습니다.

import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os

class selenium_v1:
	"""Small convenience wrapper around a Selenium Chrome WebDriver.

	The wrapper creates the driver, loads pages, waits for Chrome downloads
	to finish and saves the current page source to a file.  Most methods
	return -1 instead of raising when no driver has been created yet.
	"""

	def __init__(self):
		# Active WebDriver; stays None until create_web_driver_chrome() succeeds.
		self.driver = None
		# Absolute download directory; set only when a download_path is supplied.
		self.download_path = None

	def create_web_driver_chrome(self, headless=True, download_path=None):
		"""Create (or re-create) the Chrome WebDriver.

		Args:
			headless: when true, pass the 'headless' argument to Chrome.
			download_path: directory for downloads; it is stored as an
				absolute path and passed to Chrome as its default download
				directory.  None leaves the download settings untouched.

		Returns:
			The new driver, or None when Chrome could not be started (the
			exception is printed, not raised).
		"""
		options = webdriver.ChromeOptions()
		options.add_argument('disable-gpu')
		options.add_experimental_option('excludeSwitches', ['enable-logging'])

		if headless:
			options.add_argument('headless')

		if download_path is not None:
			self.download_path = os.path.abspath(download_path)
			prefs = {"download.default_directory": self.download_path}
			options.add_experimental_option("prefs", prefs)

		if self.driver is not None:
			# The old driver is being discarded, so end its whole session
			# with quit(); close() would only close the current window.
			self.driver.quit()
		self.driver = None

		chromedriver_autoinstaller.install()

		try:
			self.driver = webdriver.Chrome(options=options)
			self.driver.implicitly_wait(10)
		except Exception as e:
			# Best effort: report the problem and let the caller see None.
			print("exception", e)

		return self.driver

	def download_wait(self, timeout_min=1):
		"""Poll the download directory until no unfinished download remains.

		Files whose name ends in '.crdownload' are treated as unfinished
		downloads.  The directory is checked every 5 seconds.

		Args:
			timeout_min: overall time limit in minutes.

		Returns:
			-2 when no download path is configured;
			-1 when, after more than 10 counted seconds, the total size of
			the unfinished files did not change between two polls;
			otherwise the number of seconds counted when the loop ended
			(no unfinished file left, or the time limit was reached).
		"""
		if self.download_path is None:
			print("error can not find download path")
			return -2
		path_to_downloads = self.download_path
		seconds = 0
		dl_wait = True
		sum_after = 0
		while dl_wait and seconds < timeout_min * 60:
			time.sleep(5)
			dl_wait = False
			sum_before = sum_after
			sum_after = 0
			for fname in os.listdir(path_to_downloads):
				if not fname.endswith('.crdownload'):
					continue
				try:
					sum_after += os.stat(os.path.join(path_to_downloads, fname)).st_size
				except FileNotFoundError:
					# The file disappeared between listdir() and stat(),
					# so it is no longer an unfinished download.
					continue
				dl_wait = True
			# No growth since the previous poll means the download stalled.
			if dl_wait and seconds > 10 and sum_before == sum_after:
				print("download timeout")
				return -1
			seconds += 5
		return seconds

	def get(self, url):
		"""Load url in the browser; return -1 when no driver exists."""
		if self.driver is None:
			return -1
		return self.driver.get(url)

	def close(self):
		"""Close the current browser window; return -1 when no driver exists."""
		if self.driver is None:
			return -1
		return self.driver.close()

	def save_page_source(self, filename):
		"""Write the current page source to filename as UTF-8.

		Returns:
			0 on success, -1 when no driver exists or the file could not be
			written (the error is printed).
		"""
		if self.driver is None:
			return -1
		html = self.driver.page_source
		try:
			# 'with' closes the file even when write() fails.
			with open(filename, 'w', encoding='utf-8') as f:
				f.write(html)
		except (OSError, UnicodeError) as e:
			print("exception", e)
			return -1
		return 0
		
if __name__ == "__main__":
	# Demo: print the daum front page, download a file, then save the page source.
	sel = selenium_v1()
	# create_web_driver_chrome() returns None when Chrome could not be started;
	# stop here instead of failing later on sel.driver.page_source.
	if sel.create_web_driver_chrome(headless=True,download_path=".") is None:
		raise SystemExit("failed to create chrome web driver")
	try:
		print(sel.get("https://www.daum.net"))
		print(sel.driver.page_source)
		print(sel.get("https://www.python.org/ftp/python/3.9.11/python-3.9.11-embed-amd64.zip"))
		# Block until no unfinished download is left in the download directory.
		sel.download_wait()
		print(sel.driver.page_source)
		sel.save_page_source("test.html")
	finally:
		# Always end the browser session, even when a step above raised.
		sel.driver.quit()

이 예제는 chrome 드라이버를 이용해 파일을 다운로드하는 코드를 샘플로 작성한 것이며, 읽어온 페이지의 실제 html 은 sel.driver.page_source 에 들어 있습니다.

import 에러 발생시

ModuleNotFoundError: No module named 'chromedriver_autoinstaller'

pip install chromedriver-autoinstaller

ModuleNotFoundError: No module named 'selenium'

pip install -U selenium

sel.driver.page_source 에 html 코드가 채워지기까지는 시간이 조금 필요합니다. 그래서 driver.implicitly_wait(3) 함수를 이용해서 html 페이지가 로딩될 시간을 주도록 하겠습니다. (참고로 implicitly_wait 는 요소를 찾을 때 최대 몇 초까지 기다릴지를 설정하는 함수입니다.)

daum 페이지를 읽어 화면에 출력하는 코드

import selenium_v1


if __name__ == "__main__":
	# Load the daum front page and print its html.
	sel = selenium_v1.selenium_v1()
	# create_web_driver_chrome() returns None when Chrome could not be started;
	# stop here instead of failing later on sel.driver.
	if sel.create_web_driver_chrome(headless=True, download_path=".") is None:
		raise SystemExit("failed to create chrome web driver")
	try:
		print(sel.get("https://www.daum.net"))
		sel.driver.implicitly_wait(3)
		print(sel.driver.page_source)
	finally:
		# Always end the browser session, even when a step above raised.
		sel.driver.quit()

BeautifulSoup 로 정보 읽기

import selenium_v1
from bs4 import BeautifulSoup


if __name__ == "__main__":
	# Read the daum front page and pick one link out of it with a CSS selector.
	sel = selenium_v1.selenium_v1()
	# create_web_driver_chrome() returns None when Chrome could not be started;
	# stop here instead of failing later on sel.driver.
	if sel.create_web_driver_chrome(headless=True, download_path=".") is None:
		raise SystemExit("failed to create chrome web driver")
	try:
		# get page
		sel.get("https://www.daum.net")
		sel.driver.implicitly_wait(3)
		soup = BeautifulSoup(sel.driver.page_source, "html.parser")
	finally:
		# The html is already parsed into soup, so the browser can be shut down.
		sel.driver.quit()

	selector = '#wrapSearch > div.slide_favorsch > ul:nth-child(2) > li:nth-child(1) > a'

	# select() returns every match as a ResultSet.
	result = soup.select(selector)
	print(f'type:{type(result)},result:{result}')
	for one in result:
		# NOTE: one.href is not the href attribute. BeautifulSoup treats
		# tag.<name> as a child-tag lookup, so it prints None here; use
		# one.get("href") to read the attribute. Kept as-is so the printed
		# output is unchanged.
		print(f'type:{type(one)},result:{one},href:{one.href},text:{one.text},get:{one.get("data-tiara-layer")}')

	# select_one() returns only the first match as a single Tag.
	result = soup.select_one(selector)
	print(f'type:{type(result)},result:{result}')

실행 결과

type:<class 'bs4.element.ResultSet'>,result:[<a class="link_favorsch" data-tiara-action-name="header-search-txt" data-tiara-layer="header search txt" href="https://search.daum.net/search?w=tot&amp;q=%EB%A1%9C%EB%98%90%EB%8B%B9%EC%B2%A8%EB%B2%88%ED%98%B8&amp;DA=NPI&amp;rtmaxcoll=LOT">로또당첨번호</a>]
type:<class 'bs4.element.Tag'>,result:<a class="link_favorsch" data-tiara-action-name="header-search-txt" data-tiara-layer="header search txt" href="https://search.daum.net/search?w=tot&amp;q=%EB%A1%9C%EB%98%90%EB%8B%B9%EC%B2%A8%EB%B2%88%ED%98%B8&amp;DA=NPI&amp;rtmaxcoll=LOT">로또당첨번호</a>,href:None,text:로또당첨번호,get:header search txt
type:<class 'bs4.element.Tag'>,result:<a class="link_favorsch" data-tiara-action-name="header-search-txt" data-tiara-layer="header search txt" href="https://search.daum.net/search?w=tot&amp;q=%EB%A1%9C%EB%98%90%EB%8B%B9%EC%B2%A8%EB%B2%88%ED%98%B8&amp;DA=NPI&amp;rtmaxcoll=LOT">로또당첨번호</a>

import 에러시

ModuleNotFoundError: No module named 'bs4'

pip install beautifulsoup4

BS4로 CSS select 하기

예제에 있듯이 select, select_one 두개의 메소드를 이용가능 합니다. select 메소드를 사용하게 되면 선택한 tag가 여러개라는 가정하에 list 형태로 묶이게 됩니다.

select_one 메소드는 첫번째 결과만 tag에 저장하게 됩니다.

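결과로 돌아오는 tag의 텍스트는 text 를 이용하여 직접 읽을 수 있고, 속성 값은 get 을 이용하여 읽을 수 있습니다. 다만 실행 결과에서 보듯이 one.href 는 None 이 됩니다. BeautifulSoup 에서 tag.이름 은 그 이름의 하위 태그를 찾는 문법이기 때문이며, href 와 같은 속성은 one.get("href") 또는 one["href"] 로 읽어야 합니다.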

아래 부분 참고

print(f'type:{type(one)},result:{one},href:{one.href},text:{one.text},get:{one.get("data-tiara-layer")}')

결론적으로 우리가 원하는 텍스트는 one.text에 저장되게 됩니다.

SW정리

2022년 10월 23일 일요일

python selenium 실전 사용과 BeautifulSoup로 정보 읽기