二. Scrapy 抓取百度新闻排行榜，并推送到指定邮箱

来源:互联网 发布:steam mac免费游戏推荐 编辑:程序博客网 时间:2024/05/16 19:27
#encoding=utf-8
import smtplib
import time
from email.mime.text import MIMEText

import requests
import scrapy
from pymongo import MongoClient
from scrapy.http import Request
from scrapy.spiders import Spider

from .. import settings
from ..items import FirstoneItem


class MaimaiSpider(Spider):
    """Crawl the Baidu top-news boards and e-mail a short digest afterwards.

    ``parse`` reads the board links from the start page and schedules one
    request per board; ``parse_item`` turns each ranking-table row into a
    ``FirstoneItem``; ``close`` reads the records back from the MongoDB
    collection ``tb`` and mails those ranked 1-3.

    NOTE(review): this spider never writes to MongoDB itself, so the records
    read in ``close`` are presumably stored by an item pipeline defined
    elsewhere in the project -- confirm.
    """

    # MongoDB handles, created once when the class body is executed, using
    # the host/port/database/collection names from the project settings.
    cn = MongoClient(settings.MONGODB_HOST, settings.MONGODB_PORT)
    db = cn[settings.MONGODB_DB]
    tb = db[settings.MONGODB_TABLE]

    name = 'baidunews'
    allowed_domains = ['baidu.com']
    start_urls = ['http://top.baidu.com/buzz?b=341']
    mainurl = 'http://top.baidu.com/'

    def parse(self, response):
        """Yield one ``Request`` per board link found on the start page.

        The first link is skipped, as in the original code. Each request
        carries the link's ``title`` attribute in ``meta['news_type']``.
        """
        # Read href and title from the same <a> node, so the two can never
        # be mis-paired (the previous index-based lookup could pick the
        # wrong title when two links shared an href).
        anchors = response.xpath('//div[@class="hblock"]/ul/li/a[@href]')
        for anchor in anchors[1:]:
            href = anchor.xpath('@href').extract_first()
            news_type = anchor.xpath('@title').extract_first()
            # The first character of the href is dropped before it is
            # appended to mainurl.
            # NOTE(review): presumably the hrefs look like "./buzz?b=N" --
            # confirm against the live page.
            yield Request(
                url=self.mainurl + href[1:],
                callback=self.parse_item,
                meta={'news_type': news_type},
            )

    def parse_item(self, response):
        """Yield a ``FirstoneItem`` for every ranking row of a board page.

        Rows without a ``td.first`` cell are ignored.
        """
        for row in response.xpath('//table[@class="list-table"]/tr'):
            if not row.xpath('.//td[@class="first"]'):
                continue
            keyword_link = row.xpath('.//td[@class="keyword"]/a')
            item = FirstoneItem()
            item['num'] = row.xpath(
                './/td[@class="first"]/span/text()').extract_first()
            # The headline text is stored under '_id'.
            item['_id'] = keyword_link.xpath('text()').extract_first()
            item['news_type'] = response.meta['news_type']
            item['baidu_url'] = keyword_link.xpath('@href').extract_first()
            item['focus_num'] = row.xpath(
                './/td[@class="last"]/span/text()').extract_first()
            yield item

    def close(self, reason):
        """E-mail the digest when the crawl ended with reason ``'finished'``.

        Any other ``reason`` is ignored and nothing is sent.
        """
        if reason != 'finished':
            return
        self._send_digest(self._build_digest_html())

    def _build_digest_html(self):
        """Return the HTML e-mail body listing every stored record ranked 1-3."""
        header = ('<html><head><meta http-equiv="Content-Type" '
                  'content="text/html; charset=utf-8" /></head><body>'
                  '<table border="0" cellspacing="0" cellpadding="3" '
                  'align="left" >')
        tail = '</table></body></html>'
        row_tpl = '<tr align="left"><td colspan="6">%s</td></tr>'

        rows = []
        for data in self.tb.find():
            try:
                rank = int(data['num'])
            except (KeyError, TypeError, ValueError):
                # A record without a usable rank must not abort the digest.
                continue
            if rank > 3:
                continue
            # NOTE(review): these values come from scraped pages and are
            # inserted into the HTML without escaping.
            rows.extend([
                row_tpl % ('*' * 10),
                row_tpl % data['news_type'],
                row_tpl % data['num'],
                row_tpl % data['_id'],
                row_tpl % data['baidu_url'],
            ])
        return header + ''.join(rows) + tail

    def _send_digest(self, body):
        """Send ``body`` as an HTML e-mail using the SMTP values in settings.

        ``smtplib.SMTPException`` is caught and reported on stdout; it is
        not re-raised.
        """
        msg = MIMEText(body, 'html', 'utf-8')
        msg["Subject"] = "[%s]BaiduTopNews" % time.strftime('%Y-%m-%d %H:%M:%S')
        msg["From"] = settings.email_From
        msg["To"] = ','.join(settings.email_To)
        try:
            server = smtplib.SMTP_SSL(settings.smtpHost, settings.smtpPort)
            try:
                server.login(settings.email_From, settings.email_pwd)
                # sendmail() needs the recipients as a list; a comma-joined
                # string would be treated as one single address.
                server.sendmail(settings.email_From,
                                list(settings.email_To),
                                msg.as_string())
                server.quit()
            finally:
                # Release the socket even when login/sendmail raised.
                server.close()
            print("Success!")
        except smtplib.SMTPException as e:
            print("sendemail_Falied,the reson is %s" % e)
阅读全文
2 0