基于python的豆瓣“我看过的电影”的爬虫

来源:互联网 发布:森田玻尿酸乳液知乎 编辑:程序博客网 时间:2024/05/21 08:36
#!/usr/bin/env python# -*- coding: cp936 -*-# Filename: backup_ver1.pyimport urllib2import reimport sysimport xlwtimport time wbk=xlwt.Workbook()sheet1=wbk.add_sheet("my_sheet1")sheet1.write(0,0,u'影片代码')sheet1.write(0,1,u'影片名称')sheet1.write(0,2,u'星级')sheet1.write(0,3,u'日期')sheet1.write(0,4,u'标签')sheet1.write(0,5,u'短评')reload(sys)sys.setdefaultencoding( "utf-8" )# 打印系统初始化界面print u"""  ---------------------------------------     程序:豆瓣爬虫     版本:2.0     作者:anzic   日期:2014-11-13     语言:Python 2.7     功能:按提示输入后提取指定用户的豆列---------------------------------------  """print u'请输入用户代号,回车则为默认'ainput=raw_input()if ainput=='':    person=60683287else:    person=ainputprint u'请输入要打印的页数'page_num=raw_input()user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = { 'User-Agent' : user_agent }i=0for page in range(1,int(page_num)+1):    myUrl = "http://m.douban.com/movie/people/"+str(person)+"/watched?page="+str(page)    req = urllib2.Request(myUrl,headers = headers)      response = urllib2.urlopen(req)      the_page = response.read()      #f = file(u'抓取网页.html', 'w')    #f.write(the_page)    #f.close    uPage = the_page.decode("utf-8")    #re.findall()返回list    myItems = re.findall('<a href="/movie/subject/.*?<br>.*?</div>',uPage,re.S)    for item in myItems:        # code为电影对应代码        code1=re.findall('\d+',item,re.S)        code=code1[0]                # name为电影名称        item=item.replace(ur':',ur'')        name=re.findall(ur'">(.*?)</a>',item,re.S)        # mark为电影评分        mark1=re.findall(ur'span>\(([1-5])[\u4e00-\u9fa5]',item,re.S)        if len(mark1)>0:            mark=mark1[0]        else:            mark=[]        # date为观看日期        date1=re.findall(ur'<br>(\d{4}-\d{2}-\d{2})',item,re.S)        date=date1[0]        # comment为短评           comment1=re.findall(ur'<br>短评\s(.*?)\s*</div>',item,re.S)        if len(comment1)==0:            comment=[]        else:            comment=comment1[0].decode("utf-8")                # tag为标签        tag1=re.findall(ur'<br>标签\s(.*?)\s*<br>',item,re.S)        if len(tag1)==0:            tag=[]        else:            tag=tag1[0].decode("utf-8")        # 判断提取名称是否出现问题        if len(name)==0:            errorlist.append(code)        # 写入excel        i+=1        sheet1.write(i,0,code)        sheet1.write(i,1,name)        sheet1.write(i,2,mark)        sheet1.write(i,3,date)        sheet1.write(i,4,tag)        sheet1.write(i,5,comment)    print 'Page',page,'is OK'    time.sleep(1)    wbk.save(u"电影导出.xls")

0 0