
Python crawling, fetching pages dynamically with selenium

by 쭈잇 2019. 6. 19.


    # crawler.py
    
    import ssl
    import sys
    from datetime import datetime
    from urllib.request import Request, urlopen
    
    
    def crawling(url='',
                 encoding='utf-8',
                 proc1=lambda data: data,
                 proc2=lambda data: data,
                 err=lambda e: print(f'{e}: {datetime.now()}', file=sys.stderr)):
        try:
            request = Request(url)
            # skip HTTPS certificate verification so sites with bad certs don't fail
            context = ssl._create_unverified_context()
            response = urlopen(request, context=context)
            receive = response.read()
            return proc2(proc1(receive.decode(encoding, errors='replace')))
        except Exception as e:
            err(e)
    

     

     

    Pull the crawling logic out into a function

    Try using lambda

    The lambda defaults keep the call from raising an error when no processing function is passed in

    If no proc1 is passed as an argument, the default is the identity lambda, so proc1 just returns the data unchanged
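
    For example, a minimal usage sketch of the defaults (run inside crawler.py; the URL is just a placeholder):

    # with no proc1/proc2, the identity defaults return the decoded HTML unchanged
    html = crawling(url='https://example.com')

    # with processing functions: proc1 transforms the raw HTML, proc2 consumes the result
    size = crawling(url='https://example.com',
                    proc1=lambda html: html.lower(),
                    proc2=lambda text: len(text))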

     

    # test.py
    
    from itertools import count
    from urllib.request import Request, urlopen
    
    from bs4 import BeautifulSoup
    
    from collection import crawler
    
    
    def ex01():
        request = Request('https://movie.naver.com/movie/sdb/rank/rmovie.nhn')
        response = urlopen(request)
        html = response.read().decode('cp949')
        # print(html)
    
        bs = BeautifulSoup(html, 'html.parser')
        # prettify() pretty-prints the parsed HTML
        # print(bs.prettify())
    
        divs = bs.findAll('div', attrs={'class': 'tit3'})
        # print(divs)
        for index, div in enumerate(divs):
            print(index + 1, div.a.text, div.a['href'], sep=" : ")
        print('======================================')
    
    
    def proc_naver_movie_rank(html):
        # processing
        bs = BeautifulSoup(html, 'html.parser')
        results = bs.findAll('div', attrs={'class': 'tit3'})
        return results
    
    
    def store_naver_movie_rank(data):
        for index, div in enumerate(data):
            print(index + 1, div.a.text, div.a['href'], sep=" : ")
        return data
    
    
    def ex02():
        # fetch -> proc1 parses the ranking divs -> proc2 prints each entry
        crawler.crawling(url='https://movie.naver.com/movie/sdb/rank/rmovie.nhn',
                         encoding='cp949',
                         proc1=proc_naver_movie_rank,
                         proc2=lambda data: list(map(
                             lambda item: print(item[0], item[1].a.text, item[1].a['href'], sep=' : '),
                             enumerate(data))))
    
    def crawling_kyochon():
        results = []
        for sido1 in range(1, 18):
            for sido2 in count(start=1):
                url = 'http://www.kyochon.com/shop/domestic.asp?sido1={0}&sido2={1}&txtsearch='.format(sido1, sido2)
                html = crawler.crawling(url)
    
                # detect the end: crawling() returns None once the request fails
                if html is None:
                    break
    
                bs = BeautifulSoup(html, 'html.parser')
                tag_ul = bs.find('ul', attrs={'class': 'list'})
                tags_span = tag_ul.findAll('span', attrs={'class': 'store_item'})
    
                for tag_span in tags_span:
                    strings = list(tag_span.strings)
                    name = strings[1]
                    address = strings[3].replace('\r\n\t', '').strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))
    
        for t in results:
            print(t)
    
    
    crawling_kyochon()
    
    # __name__ == '__main__' and not \
    #     ex01() and not \
    #     ex02()
    
    
    
    

    >> Pulled the fetch/parse/print steps out into functions and swapped them in

     

     

    Fetching pages dynamically

     

    Project - Settings - Project - pip install selenium (install the selenium package for the project)
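
    To check that the install worked (a quick sanity check, nothing project-specific assumed):

    import selenium
    print(selenium.__version__)  # the snippet below uses the Selenium 3 call style (driver path passed positionally)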

     

    import time
    from datetime import datetime
    from itertools import count
    
    import pandas as pd
    from bs4 import BeautifulSoup
    from selenium import webdriver
    
    # quick check: drive a real browser and dump the rendered page source
    wd = webdriver.Chrome(r'D:\cafe24\chromedriver_win32\chromedriver.exe')
    wd.get('http://www.google.com')
    
    time.sleep(2)
    html = wd.page_source
    print(html)
    
    wd.quit()
    
    def crawling_goobne():
        results = []
        url = 'http://www.goobne.co.kr/store/search_store.jsp'
    
        # load the first page
        wd = webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe')
        wd.get(url)
        time.sleep(5)
    
        for page in count(start=1):
            # run the page's own JavaScript to load the next page of the store list
            script = 'store.getList(%d)' % page
            wd.execute_script(script)
            print(f'{datetime.now()}: success for request [{url}]')
            time.sleep(3)
    
            # grab the HTML after the script's result has been rendered
            html = wd.page_source
    
            # parsing with bs4
            bs = BeautifulSoup(html, 'html.parser')
            tag_tbody = bs.find('tbody', attrs={"id": "store_list"})
            tags_tr = tag_tbody.findAll('tr')
    
            # detect last page
            if tags_tr[0].get('class') is None:
                break
    
            for tag_tr in tags_tr:
                strings = list(tag_tr.strings)
                name = strings[1]
                address = strings[6]
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))
        wd.quit()
        for t in results:
            print(t)
    
    
        # store
        table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
        # table.to_csv('__result__/goobne.csv', encoding='UTF-8', mode='w', index=True)
    
    if __name__ == '__main__':
        #pelicana
        # crawling_pelicana()
        # crawling_nene()
        crawling_goobne()
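
    Note: passing the chromedriver path positionally, as above, is the Selenium 3 call style; on Selenium 4 the path goes through a Service object, and from 4.6 on Selenium Manager can locate the driver by itself. A minimal sketch, assuming Selenium 4.6+ and a recent Chrome are installed:

    import time

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    options = Options()
    options.add_argument('--headless=new')  # no browser window (needs a recent Chrome)
    service = Service()                     # empty Service -> Selenium Manager resolves chromedriver
    wd = webdriver.Chrome(service=service, options=options)

    wd.get('http://www.goobne.co.kr/store/search_store.jsp')
    time.sleep(5)
    html = wd.page_source                   # rendered HTML, same as in the code above
    wd.quit()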