
Python Crawling Practice

by 쭈잇 2019. 6. 18.



    In PyCharm: Settings -> Project -> Project Interpreter -> click the + button -> install beautifulsoup4
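
    To confirm the package installed correctly, a quick sketch that imports it and prints the version (the version in the comment is only an example):

    # verify that beautifulsoup4 is importable
    import bs4

    print(bs4.__version__)   # e.g. 4.7.1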

     

    from bs4 import BeautifulSoup
    
    html = '''
    <td class="title">
      <div class="tit3">
        <a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
       </div>
    </td>
    
    '''
    
    
    
    # 1. Tag lookup
    def ex1():
        bs = BeautifulSoup(html, 'html.parser')
        print(bs)
        print(type(bs))
        
    
    if __name__ == '__main__':
        ex1()
        
        >>
    <td class="title">
    <div class="tit3">
    <a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
    </div>
    </td>
    
    <class 'bs4.BeautifulSoup'>
    # 1. Tag lookup - bs.td returns the first <td> element as a Tag object
    def ex1():
        bs = BeautifulSoup(html, 'html.parser')
        # print(bs)
        # print(type(bs))
        tag = bs.td
        print(tag)
        print(type(tag))
        
        >>
    <td class="title">
    <div class="tit3">
    <a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
    </div>
    </td>
    <class 'bs4.element.Tag'>
        tag = bs.a
        print(tag)
        print(type(tag))
        
        >>
    <a href="/movie/bi/mi/basic.nhn?code=161967" title="기생충">기생충</a>
    <class 'bs4.element.Tag'>
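
    A Tag object also exposes its text and attributes directly; a minimal sketch of a few more lines inside ex1() (expected output in the comments):

        tag = bs.a
        print(tag.text)      # 기생충
        print(tag['href'])   # /movie/bi/mi/basic.nhn?code=161967
        print(tag.attrs)     # {'href': '/movie/bi/mi/basic.nhn?code=161967', 'title': '기생충'}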
    from urllib.request import Request, urlopen

    from bs4 import BeautifulSoup

    # fetch the Naver movie daily-ranking page (the page is served as CP949/EUC-KR)
    request = Request('https://movie.naver.com/movie/sdb/rank/rmovie.nhn')
    response = urlopen(request)
    html = response.read().decode('cp949')
    # print(html)

    bs = BeautifulSoup(html, 'html.parser')
    # prettify() re-indents the parsed tree for readable output
    # print(bs.prettify())

    # each ranked title sits in <div class="tit3"><a href="..." title="...">title</a></div>
    divs = bs.find_all('div', attrs={'class': 'tit3'})
    # print(divs)
    for index, div in enumerate(divs):
        print(index + 1, div.a.text, div.a['href'], sep=' : ')
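
    The same list can also be pulled with a CSS selector via select(); a sketch reusing the bs object parsed above:

    for index, a in enumerate(bs.select('div.tit3 > a')):
        print(index + 1, a.text, a['href'], sep=' : ')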
    
    from datetime import datetime
    import ssl
    import sys
    from urllib.request import Request, urlopen
    import pandas as pd
    from bs4 import BeautifulSoup
    
    
    def crawling_pelicana():
        results = []
    
        for page in range(1, 5):
            url = 'https://pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d' % page
            try:
                request = Request(url)
                # alternative: ssl._create_default_https_context = ssl._create_unverified_context
                # the site's certificate fails verification, so skip SSL checks per-request
                context = ssl._create_unverified_context()
                response = urlopen(request, context=context)
                receive = response.read()
                html = receive.decode('utf-8', errors='replace')
                print(f'{datetime.now()}: success for request [{url}]')
            except Exception as e:
                print(f'{e}: {datetime.now()}', file=sys.stderr)
                continue  # skip this page; html would be undefined below

            bs = BeautifulSoup(html, 'html.parser')
            tag_table = bs.find('table', attrs={'class': 'table'})
            tag_body = tag_table.find('tbody')
            tags_tr = tag_body.find_all('tr')
    
            # stop once a page comes back with no rows (past the last page)
            if len(tags_tr) == 0:
                break
    
            for tag_tr in tags_tr:
                # .strings yields every text node in the row: index 1 is the
                # branch name column, index 3 the address column
                strings = list(tag_tr.strings)
                name = strings[1]
                address = strings[3]
                sidogu = address.split()[:2]  # first two tokens: 시/도 and 군/구
                results.append((name, address) + tuple(sidogu))
    
        # store - to_csv() does not create the __result__ directory, so it
        # must exist before this call (see the sketch after this function)
        table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
        table.to_csv('__result__/pelicana.csv', encoding='UTF-8', mode='w', index=True)
    
        for t in results:
            print(t)
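
    pandas' to_csv() fails with FileNotFoundError when the target directory does not exist, so a small sketch that creates __result__ up front (run once before either crawler):

    import os

    # no-op if the directory is already there
    os.makedirs('__result__', exist_ok=True)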
    
    
    def crawling_nene():
        results = []
        for page in range(1, 50):
            try:
                url = 'https://nenechicken.com/17_new/sub_shop01.asp?ex_select=1&ex_select2=&IndexSword=&GUBUN=A&page=%d' % page
                request = Request(url)
                response = urlopen(request)
                receive = response.read()
                html = receive.decode('utf-8', errors='replace')
                print(f'{datetime.now()}: success for request [{url}]')
            except Exception as e:
                print(f'{e}: {datetime.now()}', file=sys.stderr)
                continue  # skip this page; html would be undefined below
            bs = BeautifulSoup(html, 'html.parser')
            # every branch entry lives inside div.shopWrap; name and address
            # come from the parallel div.shopName / div.shopAdd lists
            tag_table = bs.find('div', attrs={'class': 'shopWrap'})
            shopInfo = tag_table.find_all('div', attrs={'class': 'shopInfo'})
            shopNames = tag_table.find_all('div', attrs={'class': 'shopName'})
            shopAddrs = tag_table.find_all('div', attrs={'class': 'shopAdd'})
            # print(shopInfo)
            for index, shop in enumerate(shopInfo):
                sidogu = shopAddrs[index].text.split()[:2]
                info = (shopNames[index].text, shopAddrs[index].text)
                results.append(info + tuple(sidogu))
    
        # store
        table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
        table.to_csv('__result__/nene.csv', encoding='UTF-8', mode='w', index=True)
    
        for t in results:
            print(t)
    
    
    if __name__ == '__main__':
        # crawling_pelicana()
        crawling_nene()
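
    With both CSVs produced, they can be merged into a single table; a minimal sketch, assuming both files already exist under __result__:

    import pandas as pd

    pelicana = pd.read_csv('__result__/pelicana.csv', index_col=0)
    nene = pd.read_csv('__result__/nene.csv', index_col=0)

    # stack the two store lists into one frame with a fresh index
    stores = pd.concat([pelicana, nene], ignore_index=True)
    print(stores.head())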
    