Scraping WeChat Articles via Sogou
The Python 2 script below searches Sogou's WeChat channel (weixin.sogou.com) for official accounts, extracts each result's profile fields plus its latest article, and appends the records to weixin_article.json:

# coding: utf8
import requests
import bs4
import json
import sys
import random

reload(sys)
sys.setdefaultencoding("utf-8")


class WXSogou:
    def __init__(self):
        self._session = requests.session()
        self._html = None
        self._bs = None

    @staticmethod
    def get_reading(dict_msg, key):
        # msg maps each account id to "reads,posts"; first field is the average read count
        if isinstance(dict_msg, dict):
            value = dict_msg.get(key, None)
            return None if not value else value.split(',')[0]

    @staticmethod
    def get_posting(dict_msg, key):
        # second field of "reads,posts": the monthly post count
        if isinstance(dict_msg, dict):
            value = dict_msg.get(key, None)
            return None if not value else value.split(',')[1]

    @staticmethod
    def get_pure_value(descendants):
        # concatenate the text nodes, skipping HTML comments
        r = ''
        for a in descendants:
            if not isinstance(a, bs4.element.Comment) \
                    and isinstance(a, bs4.element.NavigableString):
                r += str(a)
        return r

    def parse_reading_and_posting(self):
        """Fetch the monthly post count and average read count per account."""
        # the stats endpoint URL is embedded in the last <script> of the wrapper div
        account_anti_url = 'http://weixin.sogou.com' + str(
            self._bs.find('div', attrs={'class': 'wrapper'}).find_all('script')[-1]
        ).split('"')[-2]
        r = self._session.get(url=account_anti_url)
        if r.status_code == 200:
            r.encoding = 'utf8'
            json_code = json.loads(r.text)
            if json_code['code'] == 'success':
                return json_code.get('msg', None)
        return None

    def parse_data_of_html(self):
        """Parse and extract the account data on one Sogou result page."""
        msg = self.parse_reading_and_posting()
        info = []
        for li in self._bs.find('ul', attrs={'class': 'news-list2'}).find_all('li'):
            acc_name = self.get_pure_value(li.find('p', attrs={'class': 'tit'}).a.descendants)
            perm_post = self.get_posting(msg, str(li['d']))
            aver_reading = self.get_reading(msg, str(li['d']))
            acc_number = str(li.find('p', attrs={'class': 'info'}).label.contents[0])
            acc_img = str(li.find('div', attrs={'class': 'img-box'}).img['src'])
            acc_qrcode = str(li.find('div', attrs={'class': 'ew-pop'}).find_all('img')[-2]['src'])
            acc_url = str(li.find('div', attrs={'class': 'img-box'}).a['href'])
            acc_intro = self.get_pure_value(li.dl.dd.descendants)
            # latest article; the account may not have posted recently
            had_gone_to_posted = len(li.find_all('dl')) > 1
            tag_dd = li.find_all('dd')
            try:
                art_brief = None if not had_gone_to_posted else self.get_pure_value(tag_dd[-1].a.descendants)
            except:
                art_brief = None
            try:
                art_url = None if not had_gone_to_posted else str(tag_dd[-1].a['href'])
            except:
                art_url = None
            try:
                art_time = None if not had_gone_to_posted else int(tag_dd[-1].span.script.contents[0].split('\'')[-2])
            except:
                art_time = None
            info.append({
                'acc_name': acc_name,
                'perm_post': perm_post,
                'aver_reading': aver_reading,
                'acc_number': acc_number,
                'acc_img': acc_img,
                'acc_qrcode': acc_qrcode,
                'acc_url': acc_url,
                'acc_intro': acc_intro,
                'art_brief': art_brief,
                'art_url': art_url,
                'art_time': art_time
            })
        return info

    def search(self, account, page):
        # rotate through a pool of User-Agent strings to look less like a bot
        agents = [
            "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
            "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
            "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
            "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
            "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
            "Mozilla/2.02E (Win95; U)",
            "Mozilla/3.01Gold (Win95; I)",
            "Mozilla/4.8 [en] (Windows NT 5.1; U)",
            "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
        ]
        headers = {
            "Host": "weixin.sogou.com",
            "User-Agent": random.choice(agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Cookie": "your cookie here",
            "Connection": "keep-alive"
        }
        # type=1 searches official accounts (as opposed to articles)
        url = 'http://weixin.sogou.com/weixin' \
              '?type=1' \
              '&s_from=input' \
              '&query={account}' \
              '&ie=utf8' \
              '&_sug_=n' \
              '&page={page}' \
              '&_sug_type_='.format(account=account, page=page)
        print url
        r = self._session.get(url=url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf8'
            self._html = r.text
            self._bs = bs4.BeautifulSoup(markup=self._html, features='html.parser')
            info = self.parse_data_of_html()
            return info


if __name__ == '__main__':
    sogou = WXSogou()
    # without logging in, Sogou only exposes 100 results, 10 per page
    for i in range(1, 11):
        info = sogou.search('电影公众号', i)  # query: "movie official accounts"
        with open("weixin_article.json", "a") as f:
            for article in info:
                f.write(json.dumps(dict(article), ensure_ascii=False) + "," + "\n")
Note: after too many requests in a row, Sogou demands a CAPTCHA before returning results.
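One way to soften this is to detect the block and back off before retrying. A minimal sketch, assuming (unverified) that Sogou redirects blocked requests to a URL containing 'antispider'; the get_with_backoff helper is hypothetical and would wrap the session.get call inside WXSogou.search:

# coding: utf8
import time
import random

def get_with_backoff(session, url, headers, max_retries=3):
    # retry with a growing delay while Sogou serves its anti-crawler page;
    # the 'antispider' URL marker is an assumption from observed redirects
    for attempt in range(max_retries):
        r = session.get(url=url, headers=headers)
        if 'antispider' not in r.url:
            return r
        time.sleep((2 ** attempt) * 10 + random.uniform(0, 5))
    return None  # still blocked: solve the CAPTCHA manually or change IP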