A Baijiahao Scraping Project



Preamble

During the holidays I took on a project to scrape Baidu Baijiahao, but for various reasons it never got finished. My skills simply weren't up to it, which was genuinely embarrassing: my first project fell through just like that. Still, if the skills aren't there yet, I can always learn them (heh heh heh).

Let me first explain what the project was about.

Project approach

The first step is to take a page like http://baijiahao.baidu.com/u?app_id=1551032544962326 and extract, for every article on it, the title, link, read count and comment count. You can see that this page is loaded dynamically: you can keep scrolling down and more articles keep appearing until there are none left. That is typical XHR loading; the mobile version of Taobao works the same way. XHR involves a fair amount of front-end knowledge that I don't know much about; for now I only know how to send the XHR request from Python and pull out what we want. Simulating the request is only a few lines of code, so there isn't much to say about it.
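To make the "few lines" concrete, here is a minimal sketch of issuing such a request with requests. The endpoint, parameter names and response fields below are placeholders, not the real Baijiahao API: the actual XHR URL and its parameters have to be copied from the browser's Network panel while scrolling an author page such as the one above.

import requests

# Placeholder XHR call: the real endpoint and parameters must be read from
# the browser's Network panel while scrolling the author page.
XHR_URL = "http://baijiahao.baidu.com/u"                 # placeholder endpoint
params = {"app_id": "1551032544962326", "page": 1}       # placeholder parameters
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "X-Requested-With": "XMLHttpRequest",                # how XHR requests are usually marked
}

resp = requests.get(XHR_URL, params=params, headers=headers)
print(resp.status_code)
# The payload (often JSON) is where the title, link, read count and
# comment count would be picked out.
print(resp.text[:500])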

The second step, getting hold of the Baijiahao app_ids, is much harder. The IDs are 16 digits long, so brute force is out of the question (that would be on the order of 10^16 candidates), and I couldn't find them anywhere else, which left me stumped for a while. Then it hit me when I looked at Baidu itself: its crawler had already done the work for us. Thanks, Baidu. So the problem changed from crawling Baijiahao into crawling Baidu search. That brought the next difficulty: Baidu search isn't something I can handle easily either, because the result links can't be used directly. I started out with PhantomJS, but it was far too inefficient: it maxed out my server's CPU and still produced only a handful of IDs per day. In the end I changed my approach. Instead of trying to collect all 760 results Baidu returns for a single query, I fetch just one page per query and vary the keyword. That makes everything simple: all it takes is sending GET requests to Baidu.
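The core transformation both scripts below rely on is resolving a Baidu result link (https://www.baidu.com/link?url=...) into the real baijiahao.baidu.com URL and then pulling the 16-digit app_id out of it. Here is a condensed sketch using the same regex idea as the code that follows; treating the id=... query parameter as the carrier of the app_id is an assumption based on the URL format shown above.

import re
import requests

def resolve_app_id(baidu_result_link):
    """Follow a Baidu result link and return the 16-digit Baijiahao app_id,
    or None if no baijiahao URL shows up in the response."""
    resp = requests.get(baidu_result_link + '&wd=', timeout=10)
    urls = re.findall(r'https?://baijiahao\.baidu\.com/\S*id=[0-9]+', resp.text)
    if not urls:
        return None
    m = re.search(r'id=([0-9]{16})', urls[0])
    return m.group(1) if m else None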

Here is my code.

bajihao.py

'''This module obtains Baijiahao article URLs and pulls the data we want out of them.'''
import re
import time

import bs4
import requests
from selenium import webdriver


class sobaidu():
    '''Get the real article URLs via Baidu search and persist them.'''

    def __init__(self):
        self.KEYFILENAME = "keylist.txt"
        self.URLFILENAME = "urllist.txt"
        self.KEYLIST = set()
        self.URLLIST = set()
        self.URLFILE = open(self.URLFILENAME, 'w')

    def _readkey(self):
        '''Read all the keywords used for the Baidu searches (one per line).'''
        with open(self.KEYFILENAME) as keyklistfile:
            for i in keyklistfile.readlines():
                self.KEYLIST.add(i.strip())  # strip the trailing newline

    def _changeurl(self, url):
        '''Turn a Baidu search-result link into the real baijiahao URL.'''
        try:
            req = requests.get(url + '&wd=')
            # time.sleep(1)
            # print(req.text)
            regx = r'http[s]*://baijiahao.baidu.com/[\S]*id=[0-9]*'
            pattern = re.compile(regx)
            match = re.findall(pattern, req.text)
            return match[0]
        except Exception as e:
            print(e)

    def _writetomysql(self):
        '''Write the real URL to the database (not implemented yet).'''
        pass

    def _writetofile(self, url):
        self.URLFILE.write(url)
        self.URLFILE.write('\n')

    def sobaidu(self):
        '''Drive the methods above: search Baidu page by page and save the URLs.'''
        # browser = webdriver.Chrome()
        browser = webdriver.PhantomJS()
        num = 0
        for key in self.KEYLIST:
            num += 1
            now_num = 0
            browser.implicitly_wait(30)
            browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com)' + key)
            while True:
                if now_num == 1:
                    # click the "next page" button (10th pager link on the first page)
                    try:
                        browser.find_element_by_xpath('//*[@id="page"]/a[10]').click()
                        time.sleep(2)
                    except Exception as e:
                        print(e)
                        print("something went wrong")
                        break
                now_num += 1
                print(now_num)
                source = browser.page_source
                soup = bs4.BeautifulSoup(source, 'lxml')
                print('next_page')
                for i in soup.findAll(class_='result c-container '):
                    url = i.find(class_='t').find('a').get('href')
                    # print(url)
                    # self.URLLIST.add(self._changeurl(url))
                    realurl = self._changeurl(url)
                    if realurl:  # _changeurl returns None when no match was found
                        self._writetofile(realurl)
                time.sleep(1)
                if now_num > 1:
                    # from page 2 onwards the "next page" button is the 11th pager link
                    try:
                        browser.find_element_by_xpath('//*[@id="page"]/a[11]').click()
                        time.sleep(1)
                    except:
                        print('next button not found, probably the last page')
                        break


class getappid:
    '''Open each saved article URL and extract the 16-digit app_id.'''

    def __init__(self):
        self.URLFILENAME = "urllist.txt"
        self.APPIDLIST = "appid.txt"
        self.URLLIST = set()
        self.APPIDFILE = open(self.APPIDLIST, 'w')

    def _readurl(self):
        '''Read the article-page URLs collected by the first stage.'''
        with open(self.URLFILENAME) as urllistfile:
            for i in urllistfile.readlines():
                self.URLLIST.add(i.strip())  # strip the trailing newline

    def _writeappid(self, appid):
        self.APPIDFILE.write(appid)
        self.APPIDFILE.write('\n')
        print("write succeeded")

    def getid(self):
        # browser = webdriver.PhantomJS()
        browser = webdriver.Chrome()
        browser.implicitly_wait(10)
        # browser.set_script_timeout(10)
        # browser.set_page_load_timeout(10)
        for url in self.URLLIST:
            browser.get(url)
            # the author link on the article page carries the app_id
            regx = r'http[s]*://baijiahao.baidu.com/u[\S]*id=[0-9]*'
            pattern = re.compile(regx)
            match = re.findall(pattern, browser.page_source)
            time.sleep(1)
            try:
                print(match[0])
                self._writeappid(match[0])
            except Exception as e:
                print('no match found')


def main():
    dsfsd = sobaidu()
    # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761')
    # print(strings)
    # # ###################### strange divider ###############
    dsfsd._readkey()
    print(len(dsfsd.KEYLIST))
    #########################  slightly silly divider #############
    dsfsd.sobaidu()
    # print(len(dsfsd.URLLIST))
    # for i in dsfsd.URLLIST:
    #     print(i)
    # ####################### my-computer-is-lagging divider ###########
    dsfsd.URLFILE.close()


def getid():
    dsfsd = getappid()
    dsfsd._readurl()
    print(len(dsfsd.URLLIST))
    dsfsd.getid()
    dsfsd.APPIDFILE.close()


if __name__ == '__main__':
    # run the two stages one at a time by uncommenting the call you need
    # main()
    # getid()
    pass
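For what it's worth, the two stages in bajihao.py are meant to be run one at a time: main() searches Baidu and fills urllist.txt with article URLs, then getid() opens each of those pages and writes the 16-digit app_ids to appid.txt. Uncomment the corresponding call under if __name__ == '__main__' to run a stage.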

get_url.py

import re
import time

import bs4
import requests
# from selenium import webdriver


class sobaidu():
    '''Get the real article URLs via Baidu search and persist them.'''

    def __init__(self):
        self.KEYFILENAME = "keylist.txt"
        self.URLFILENAME = "urllist.txt"
        self.KEYLIST = set()
        self.URLLIST = set()
        self.URLFILE = open(self.URLFILENAME, 'w')

    def _readkey(self):
        '''Read all the keywords used for the Baidu searches (one per line).'''
        with open(self.KEYFILENAME) as keyklistfile:
            for i in keyklistfile.readlines():
                self.KEYLIST.add(i.strip())  # strip the trailing newline

    def _changeurl(self, url):
        '''Turn a Baidu search-result link into the real baijiahao URL.'''
        try:
            req = requests.get(url + '&wd=')
            # time.sleep(1)
            # print(req.text)
            regx = r'http[s]*://baijiahao.baidu.com/[\S]*id=[0-9]*'
            pattern = re.compile(regx)
            match = re.findall(pattern, req.text)
            return match[0]
        except Exception as e:
            print(e)

    def _writetomysql(self):
        '''Write the real URL to the database (not implemented yet).'''
        pass

    def _writetofile(self, url):
        self.URLFILE.write(url)
        self.URLFILE.write('\n')

    def sobaidu(self):
        '''Drive the methods above: one plain GET request per keyword, first results page only.'''
        # browser = webdriver.Chrome()
        # browser = webdriver.PhantomJS()
        headers = {
            'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            'accept-encoding': "gzip, deflate, sdch, br",
            'accept-language': "zh-CN,zh;q=0.8,en;q=0.6",
            'cache-control': "no-cache",
            'connection': "keep-alive",
            'host': "www.baidu.com",
            'upgrade-insecure-requests': "1",
            'user-agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
            'postman-token': "d7d540ff-c479-210d-4d11-9b3deaba8395"
        }
        for key in self.KEYLIST:
            now_num = 0
            # browser.implicitly_wait(30)
            try:
                # browser.get('https://www.baidu.com/s?wd=site:(baijiahao.baidu.com)' + key)
                req = requests.get('http://www.baidu.com/s?wd=site:(baijiahao.baidu.com)' + key,
                                   headers=headers)
            except Exception as e:
                # back off for a while if the request fails, then move on
                time.sleep(20)
                continue
            source = req.text
            soup = bs4.BeautifulSoup(source, 'lxml')
            # print('next_page')
            for i in soup.findAll(class_='result c-container '):
                url = i.find(class_='t').find('a').get('href')
                print(url)
                try:
                    self._writetofile(url)
                except Exception as e:
                    pass
            time.sleep(1)


def main():
    dsfsd = sobaidu()
    # strings = dsfsd._changeurl('https://www.baidu.com/link?url=w8wWEQMyVf0cD3TsKcn_pTQZ92cIqLqxVZKWFtT4rYJcESE_qfhKlPJg5B7OM2mXhZoSM1H0ogmCIgi4G2EkP_&wd=&eqid=aa2c3db90000bf4c0000000458831761')
    # print(strings)
    # # ###################### strange divider ###############
    dsfsd._readkey()
    print(len(dsfsd.KEYLIST))
    #########################  slightly silly divider #############
    dsfsd.sobaidu()
    # print(len(dsfsd.URLLIST))
    # for i in dsfsd.URLLIST:
    #     print(i)
    dsfsd.URLFILE.close()


if __name__ == '__main__':
    main()
    # getid()
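Compared with bajihao.py, this version drops Selenium/PhantomJS entirely: one plain GET request per keyword, first results page only, which is exactly the simplification described above and is far lighter on the server.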

Example keylist.txt

爱好者
爱河
爱辉区
爱舰
爱将
爱克斯光
爱丽舍
爱侣
爱民区
爱女
爱妻
爱情
爱情股
爱情剧
爱情戏
爱情秀

Example results

1558203514244407
1553580958364599
1554481414859790
1554581149767353
1553311983508765
1550392115650114
1549296295514136
1554229445165637
1554203514772608
1552292596126106

PS: I converted the URLs into their app_id form so that the post wouldn't be littered with useless links.
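Each of these app_ids maps straight back to an author page of the same form as the example at the top, i.e. http://baijiahao.baidu.com/u?app_id=<app_id>.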

Copyright: copie

Original link:

http://copie.cn/index.php/archives/%E9%92%88%E5%AF%B9%E7%99%BE%E5%AE%B6%E5%8F%B7%E6%8A%93%E5%8F%96%E9%A1%B9%E7%9B%AE.html

When reposting, please cite the source and include this notice.