Install beautifulsoup4 first: in PyCharm, open Settings -> Project, click the + icon, and install beautifulsoup4.
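If you are not using PyCharm, the same package can be installed from the command line with pip:

pip install beautifulsoup4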
from bs4 import BeautifulSoup
html = '''
<td class="title">
<div class="tit3">
<a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
</div>
</td>
'''
# 1. Tag lookup
def ex1():
    bs = BeautifulSoup(html, 'html.parser')
    print(bs)
    print(type(bs))

if __name__ == '__main__':
    ex1()
>>
<td class="title">
<div class="tit3">
<a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
</div>
</td>
<class 'bs4.BeautifulSoup'>
# 1. Tag lookup - accessing a tag directly as an attribute
def ex1():
    bs = BeautifulSoup(html, 'html.parser')
    # print(bs)
    # print(type(bs))
    tag = bs.td
    print(tag)
    print(type(tag))
>>
<td class="title">
<div class="tit3">
<a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
</div>
</td>
<class 'bs4.element.Tag'>
tag = bs.a
print(tag)
print(type(tag))
>>
<a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
<class 'bs4.element.Tag'>
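A Tag object also exposes the tag's attributes and text. A quick sketch of the common accessors (standard bs4 API):

tag = bs.a
print(tag['href'])       # /movie/bi/mi/basic.nhn?code=161967
print(tag.get('title'))  # 기생충
print(tag.text)          # 기생충 (the tag's text content)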
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

request = Request('https://movie.naver.com/movie/sdb/rank/rmovie.nhn')
response = urlopen(request)
html = response.read().decode('cp949')
# print(html)

bs = BeautifulSoup(html, 'html.parser')
# prettify() prints the parsed HTML with nice indentation
# print(bs.prettify())

divs = bs.findAll('div', attrs={'class': 'tit3'})
# print(divs)
for index, div in enumerate(divs):
    print(index + 1, div.a.text, div.a['href'], sep=" : ")
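The same lookup can also be written with a CSS selector. select() is part of the standard bs4 API, so this sketch should produce the same ranking list:

for index, a in enumerate(bs.select('div.tit3 > a')):
    print(index + 1, a.text, a['href'], sep=" : ")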
from datetime import datetime
import ssl
import sys
from urllib.request import Request, urlopen
import pandas as pd
from bs4 import BeautifulSoup
def crawling_pelicana():
    results = []
    for page in range(1, 5):
        url = 'https://pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d' % page
        try:
            request = Request(url)
            # ssl._create_default_https_context = ssl._create_unverified_context()
            context = ssl._create_unverified_context()
            response = urlopen(request, context=context)
            receive = response.read()
            html = receive.decode('utf-8', errors='replace')
            print(f'{datetime.now()}:success for request [{url}]')
        except Exception as e:
            print(f'{e}: {datetime.now()}', file=sys.stderr)
            # continue

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table'})
        tag_body = tag_table.find('tbody')
        tags_tr = tag_body.findAll('tr')

        # detect the last page: stop when no rows are returned
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table.to_csv('__result__/pelicana.csv', encoding='UTF-8', mode='w', index=True)

    for t in results:
        print(t)
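The strings[1] / strings[3] indexing depends on the layout of each table row. A minimal sketch with hypothetical markup (mimicking the store table's column order: number, name, phone, address; the real page may also interleave whitespace nodes) shows what list(tag_tr.strings) yields:

from bs4 import BeautifulSoup

# hypothetical row markup for illustration only
row = BeautifulSoup('<tr><td>1</td><td>강남점</td><td>02-123-4567</td><td>서울특별시 강남구</td></tr>', 'html.parser').tr
print(list(row.strings))
# ['1', '강남점', '02-123-4567', '서울특별시 강남구'] -> strings[1] is the name, strings[3] the address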
def crawling_nene():
    results = []
    for page in range(1, 50):
        try:
            url = 'https://nenechicken.com/17_new/sub_shop01.asp?ex_select=1&ex_select2=&IndexSword=&GUBUN=A&page=%d' % page
            request = Request(url)
            response = urlopen(request)
            receive = response.read()
            html = receive.decode('utf-8', errors='replace')
            print(f'{datetime.now()}:success for request [{url}]')
        except Exception as e:
            print(f'{e}: {datetime.now()}', file=sys.stderr)
            # continue

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('div', attrs={'class': 'shopWrap'})
        shopInfo = tag_table.findAll('div', attrs={'class': 'shopInfo'})
        shopNames = tag_table.findAll('div', attrs={'class': 'shopName'})
        shopAddrs = tag_table.findAll('div', attrs={'class': 'shopAdd'})
        # print(shopInfo)

        for index, shop in enumerate(shopInfo):
            sidogu = shopAddrs[index].text.split()[:2]
            info = (shopNames[index].text, shopAddrs[index].text)
            results.append(info + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table.to_csv('__result__/nene.csv', encoding='UTF-8', mode='w', index=True)

    for t in results:
        print(t)
if __name__ == '__main__':
    # pelicana
    # crawling_pelicana()
    crawling_nene()