Scraping WeChat Articles via Sogou

The script below is Python 2 (note the reload(sys) / sys.setdefaultencoding hack). It defines a WXSogou class that queries Sogou's WeChat search for official accounts, fetches each account's read/post statistics from an auxiliary JSON endpoint, and parses every result entry with BeautifulSoup.

# coding: utf8
import requests
import bs4
import json
import sys
import random

# Python 2 encoding hack so str/unicode values mix freely below.
reload(sys)
sys.setdefaultencoding("utf-8")


class WXSogou:

    def __init__(self):
        self._session = requests.session()
        self._html = None
        self._bs = None

    @staticmethod
    def get_reading(dict_msg, key):
        # Stats values look like "reading,posting"; the first field is
        # the average read count.
        if isinstance(dict_msg, dict):
            value = dict_msg.get(key, None)
            return None if not value else value.split(',')[0]

    @staticmethod
    def get_posting(dict_msg, key):
        # The second field is the monthly post count.
        if isinstance(dict_msg, dict):
            value = dict_msg.get(key, None)
            return None if not value else value.split(',')[1]

    @staticmethod
    def get_pure_value(descendants):
        # Concatenate the plain text nodes, skipping HTML comments.
        r = ''
        for a in descendants:
            if not isinstance(a, bs4.element.Comment) \
               and isinstance(a, bs4.element.NavigableString):
                r += str(a)
        return r

    def parse_reading_and_posting(self):
        """Fetch each account's monthly post count and average read count."""
        # The stats live behind a JSON endpoint whose URL is embedded in the
        # last <script> tag inside the page wrapper.
        tag = str(self._bs.find('div', attrs={'class': 'wrapper'}).find_all('script')[-1])
        account_anti_url = 'http://weixin.sogou.com' + tag.split('"')[-2]
        r = self._session.get(url=account_anti_url)
        if r.status_code == 200:
            r.encoding = 'utf8'
            json_code = json.loads(r.text)
            if json_code['code'] == 'success':
                return json_code.get('msg', None)
        return None

    def parse_data_of_html(self):
        """Parse one page of Sogou official-account search results."""
        msg = self.parse_reading_and_posting()
        info = []
        for li in self._bs.find('ul', attrs={'class': 'news-list2'}).find_all('li'):
            acc_name = self.get_pure_value(li.find('p', attrs={'class': 'tit'}).a.descendants)
            # Each <li> carries a 'd' attribute that keys into the stats dict.
            perm_post = self.get_posting(msg, str(li['d']))
            aver_reading = self.get_reading(msg, str(li['d']))
            acc_number = str(li.find('p', attrs={'class': 'info'}).label.contents[0])
            acc_img = str(li.find('div', attrs={'class': 'img-box'}).img['src'])
            acc_qrcode = str(li.find('div', attrs={'class': 'ew-pop'}).find_all('img')[-2]['src'])
            acc_url = str(li.find('div', attrs={'class': 'img-box'}).a['href'])
            acc_intro = self.get_pure_value(li.dl.dd.descendants)
            # An account may have no "most recent article" block.
            had_gone_to_posted = len(li.find_all('dl')) > 1
            tag_dd = li.find_all('dd')
            try:
                art_brief = None if not had_gone_to_posted else \
                    self.get_pure_value(tag_dd[-1].a.descendants)
            except Exception:
                art_brief = None
            try:
                art_url = None if not had_gone_to_posted else str(tag_dd[-1].a['href'])
            except Exception:
                art_url = None
            try:
                # The post timestamp is embedded in an inline script.
                art_time = None if not had_gone_to_posted else \
                    int(tag_dd[-1].span.script.contents[0].split('\'')[-2])
            except Exception:
                art_time = None
            info.append({
                'acc_name':     acc_name,
                'perm_post':    perm_post,
                'aver_reading': aver_reading,
                'acc_number':   acc_number,
                'acc_img':      acc_img,
                'acc_qrcode':   acc_qrcode,
                'acc_url':      acc_url,
                'acc_intro':    acc_intro,
                'art_brief':    art_brief,
                'art_url':      art_url,
                'art_time':     art_time
            })
        return info

    def search(self, account, page):
        # Rotate through a pool of desktop user agents to look less like a bot.
        agents = [
            "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
            "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
            "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
            "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
            "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
            "Mozilla/2.02E (Win95; U)",
            "Mozilla/3.01Gold (Win95; I)",
            "Mozilla/4.8 [en] (Windows NT 5.1; U)",
            "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
        ]
        headers = {
            "Host": "weixin.sogou.com",
            "User-Agent": random.choice(agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Cookie": "your cookie here",  # paste the cookie string from your browser
            "Connection": "keep-alive"
        }
        # type=1 searches official accounts rather than articles.
        url = 'http://weixin.sogou.com/weixin' \
              '?type=1' \
              '&s_from=input' \
              '&query={account}' \
              '&ie=utf8' \
              '&_sug_=n' \
              '&page={page}' \
              '&_sug_type_='.format(account=account, page=page)
        print url
        r = self._session.get(url=url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf8'
            self._html = r.text
            self._bs = bs4.BeautifulSoup(markup=self._html, features='html.parser')
            return self.parse_data_of_html()


if __name__ == '__main__':
    sogou = WXSogou()
    # Without logging in, Sogou shows at most 100 results, 10 per page.
    for i in range(1, 11):
        info = sogou.search('电影公众号', i)
        if not info:
            # Request failed (non-200 response or a CAPTCHA page); skip it.
            continue
        with open("weixin_article.json", "a") as f:
            for article in info:
                f.write(json.dumps(article, ensure_ascii=False) + "," + "\n")
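
Note that each line written to weixin_article.json is a JSON object followed by a trailing comma, so the file as a whole is not valid JSON. A minimal sketch for loading the records back, assuming a file produced by the script above:

import json

records = []
with open("weixin_article.json") as f:
    for line in f:
        line = line.strip().rstrip(',')  # drop the per-line trailing comma
        if line:
            records.append(json.loads(line))
print(len(records))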

Note: if you crawl too many times, Sogou will start demanding a CAPTCHA.
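
One way to soften this is to pause between pages and stop as soon as the block page shows up. A minimal sketch follows; the 'antispider' URL marker and the prompt text are assumptions about what Sogou's block page looks like, so verify them against a real blocked response before relying on them.

import time
import random

def looks_blocked(response):
    # Assumed markers of Sogou's anti-spider page; confirm before trusting.
    return 'antispider' in response.url or u'验证码' in response.text

# Inside the page loop, before parsing:
#     time.sleep(random.uniform(3, 8))   # random delay between pages
#     if looks_blocked(r):
#         break                          # back off, or solve the CAPTCHA manually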
