# crawler.py
import ssl
import sys
from urllib.request import Request, urlopen
from datetime import datetime


def crawling(url='',
             encoding='utf-8',
             proc1=lambda data: data,
             proc2=lambda data: data,
             err=lambda e: print(f'{e}: {datetime.now()}', file=sys.stderr)):
    try:
        request = Request(url)
        # skip SSL certificate verification (some sites fail the default check)
        context = ssl._create_unverified_context()
        response = urlopen(request, context=context)
        receive = response.read()
        # chain the processing steps: proc2(proc1(decoded html))
        return proc2(proc1(receive.decode(encoding, errors='replace')))
    except Exception as e:
        err(e)
Extracted the crawling logic into its own function.
Tried using lambda for the default arguments.
Because the defaults are lambdas, no error occurs even when the processing functions are not passed in.
If proc1 is not passed as an argument, it defaults to the identity lambda, so the data is simply returned unchanged.
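For example, a minimal usage sketch (the URL and the lambdas here are only illustrative) showing that crawling() works both with and without the processing callbacks:

# usage sketch: crawling() with and without proc1/proc2 (illustrative URL)
from collection import crawler

# no callbacks passed: the identity defaults kick in and the raw HTML string is returned
html = crawler.crawling(url='https://example.com')

# with callbacks: proc1 transforms the HTML, proc2 consumes the result of proc1
crawler.crawling(url='https://example.com',
                 proc1=lambda html: html.lower(),
                 proc2=lambda data: print(data[:80]))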
# test.py
from itertools import count
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from collection import crawler


def ex01():
    request = Request('https://movie.naver.com/movie/sdb/rank/rmovie.nhn')
    response = urlopen(request)
    html = response.read().decode('cp949')
    # print(html)
    bs = BeautifulSoup(html, 'html.parser')
    # prettify() prints the parsed HTML nicely indented
    # print(bs.prettify())
    divs = bs.findAll('div', attrs={'class': 'tit3'})
    # print(divs)
    for index, div in enumerate(divs):
        print(index + 1, div.a.text, div.a['href'], sep=" : ")
    print('======================================')
def proc_naver_movie_rank(html):
    # processing
    bs = BeautifulSoup(html, 'html.parser')
    results = bs.findAll('div', attrs={'class': 'tit3'})
    return results


def store_naver_movie_rank(data):
    for index, div in enumerate(data):
        print(index + 1, div.a.text, div.a['href'], sep=" : ")
    return data
def ex02():
    # fetch
    crawler.crawling(url='https://movie.naver.com/movie/sdb/rank/rmovie.nhn',
                     encoding='cp949',
                     proc1=proc_naver_movie_rank,
                     proc2=lambda data: list(map(lambda div: print(div[0], div[1].a.text, div[1].a['href'], sep=' : '), enumerate(data))))
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1={0}&sido2={1}&txtsearch='.format(sido1, sido2)
            html = crawler.crawling(url)
            # detect the end: crawling() returns None when the request fails
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tags_span = tag_ul.findAll('span', attrs={'class': 'store_item'})
            for tag_span in tags_span:
                strings = list(tag_span.strings)
                name = strings[1]
                address = strings[3].replace('\r\n\t', '').strip()
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))
    for t in results:
        print(t)


crawling_kyochon()
# __name__ == '__main__' and not \
# ex01() and not \
# ex02()
>> Refactored by extracting the steps into separate functions.
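Taken one step further, the kyochon parsing and storing could also be handed to crawler.crawling as proc1/proc2, just like ex02 does for the Naver ranking page. A minimal sketch; the names proc_kyochon and store_kyochon are only illustrative:

# illustrative sketch: kyochon parsing/storing as proc1/proc2 callbacks
def proc_kyochon(html):
    # parse one shop-list page and return the store <span> tags
    bs = BeautifulSoup(html, 'html.parser')
    tag_ul = bs.find('ul', attrs={'class': 'list'})
    return tag_ul.findAll('span', attrs={'class': 'store_item'}) if tag_ul else []


def store_kyochon(tags_span):
    # turn each store tag into a (name, address, sido, gungu) tuple
    results = []
    for tag_span in tags_span:
        strings = list(tag_span.strings)
        name = strings[1]
        address = strings[3].replace('\r\n\t', '').strip()
        results.append((name, address) + tuple(address.split()[:2]))
    return results


# usage for a single page; the sido1/sido2 loop stays in the caller
# data = crawler.crawling(url=url, proc1=proc_kyochon, proc2=store_kyochon)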
Fetching pages dynamically (JavaScript-rendered content)
Install selenium via Project - Settings - Project Interpreter, or with pip install selenium.
import time
from selenium import webdriver

# open Chrome through the chromedriver executable, load a page, then grab the rendered HTML
wd = webdriver.Chrome(r'D:\cafe24\chromedriver_win32\chromedriver.exe')
wd.get('http://www.google.com')
time.sleep(2)
html = wd.page_source
print(html)
wd.quit()
# additional imports needed by crawling_goobne (datetime for logging, pandas for the store step)
from datetime import datetime
import pandas as pd


def crawling_goobne():
    results = []
    url = 'http://www.goobne.co.kr/store/search_store.jsp'
    # load the first page
    wd = webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe')
    wd.get(url)
    time.sleep(5)
    for page in count(start=1):
        # run the site's own JavaScript to load the next page of stores
        script = 'store.getList(%d)' % page
        wd.execute_script(script)
        print(f'{datetime.now()}: success for request [{url}]')
        time.sleep(3)
        # grab the HTML rendered after the script has run
        html = wd.page_source
        # parsing with bs4
        bs = BeautifulSoup(html, 'html.parser')
        tag_tbody = bs.find('tbody', attrs={"id": "store_list"})
        tags_tr = tag_tbody.findAll('tr')
        # detect last page
        if tags_tr[0].get('class') is None:
            break
        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[6]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))
    wd.quit()
    for t in results:
        print(t)
    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # table.to_csv('__result__/goobne.csv', encoding='UTF-8', mode='w', index=True)


if __name__ == '__main__':
    # pelicana
    # crawling_pelicana()
    # crawling_nene()
    crawling_goobne()
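The store step is left commented out above; below is a minimal sketch of writing the collected tuples to CSV, assuming the __result__ directory already exists (the helper name store_to_csv is only illustrative):

import pandas as pd


def store_to_csv(results, path='__result__/goobne.csv'):
    # results is a list of (name, address, sido, gungu) tuples
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table.to_csv(path, encoding='utf-8', mode='w', index=True)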