Searching Baidu using BeautifulSoup in Python 3.3

来源:互联网 发布:乐视视频mac客户端 编辑:程序博客网 时间:2024/05/16 14:47


 # -*- coding: utf-8 -*-
"""Crawl Baidu search results and save the recent hits to a CSV file."""
import datetime
import json
import re
import urllib
import urllib.parse
import urllib.request as req

import pandas as pd
from bs4 import BeautifulSoup


class BaiduCrawler:
    """Fetch Baidu result pages for a query and keep the recent entries.

    Args:
        inter_page: Number of result pages to request (pages 0..inter_page-1).
        target: Search string; it is URL-quoted before being sent.
        inter_days: Maximum age of a result in days; converted with ``int()``.
        output_path: Where ``crawler`` writes the CSV. Defaults to the path
            the script has always used.
    """

    # Base search URL; the quoted query and the ``pn`` offset are appended.
    link = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidulocal&wd='

    def __init__(self, inter_page, target, inter_days,
                 output_path='d:\\tmp\\baidu.csv'):
        self.inter_page = inter_page
        self.inter_days = datetime.timedelta(days=int(inter_days))
        self.target = target
        self.output_path = output_path
        # To go through an HTTP proxy, install an opener first, e.g.:
        #   proxy = req.ProxyHandler({'http': '127.0.0.1:3128'})
        #   req.install_opener(req.build_opener(proxy))

    def get_threads(self, soup):
        """Return every ``<td class="f">`` element found in *soup*."""
        return soup.find_all('td', {'class': 'f'})

    def baidu_search(self, page, game_name):
        """Download one result page and return it parsed by BeautifulSoup.

        Args:
            page: Zero-based page index; sent as ``pn = page * 10``.
            game_name: Already URL-quoted query string.
        """
        pn = page * 10
        print('pn is ', pn)
        url = self.link + game_name + '&pn=' + str(pn)
        print('url is ', url)
        # The context manager closes the HTTP response even if read/decode
        # fails; the original left the connection open.
        with req.urlopen(url) as conn:
            text = conn.read().decode()
        return BeautifulSoup(text, 'html.parser')

    @staticmethod
    def _parse_thread(thread):
        """Extract ``(date, title, href)`` from one result cell, or ``None``.

        The date is the second-to-last ``\\xa0``-separated token of the last
        ``<font>`` element and must be in ``YYYY-MM-DD`` form. Cells without
        a parsable date or without a link carrying ``href`` yield ``None``
        so that one odd row does not abort the whole crawl.
        """
        try:
            stamp = str(thread.select('font')[-1].text.split('\xa0')[-2])
            updated = datetime.datetime.strptime(stamp, '%Y-%m-%d').date()
            anchor = thread.select('a')[0]
            return updated, anchor.text, anchor['href']
        except (IndexError, KeyError, ValueError):
            return None

    def crawler(self):
        """Collect results no older than ``inter_days`` and write them to CSV.

        Writes a CSV with columns ``title`` and ``url`` to ``output_path``.
        An empty result set produces a header-only file instead of raising.
        """
        today = datetime.date.today()
        name = urllib.parse.quote(self.target)

        rows = []
        for page in range(self.inter_page):
            soup = self.baidu_search(page, name)
            for thread in self.get_threads(soup):
                parsed = self._parse_thread(thread)
                if parsed is None:
                    continue
                updated, title, href = parsed
                if today - updated <= self.inter_days:
                    rows.append((title, href))

        # Passing ``columns`` to the constructor also works for an empty
        # list, whereas assigning ``df.columns`` afterwards raised ValueError.
        df = pd.DataFrame(rows, columns=['title', 'url'])

        print('Saving file ...')
        df.to_csv(path_or_buf=self.output_path, encoding='utf-8')
        print('It is OK!')


def baidu():
    """Run a sample crawl with the settings hard-coded below."""
    page = 1  # how many result pages to fetch
    search = 'xxxx'  # the text to search for
    days = 30  # keep results at most this many days old
    baidu = BaiduCrawler(page, search, days)
    baidu.crawler()


if __name__ == '__main__':
    baidu()


0 0
原创粉丝点击