爬虫豆瓣电影top250代码和收获

来源:互联网 发布:多文件上传java 编辑:程序博客网 时间:2024/06/06 03:42

# -*- coding:utf-8 -*-
"""Scrape the Douban Movie Top 250 list and save it to a text file.

The script runs on both Python 2 (as originally written) and Python 3.
All text is handled as the interpreter's native ``str`` type, so on
Python 2 the data stays as UTF-8 byte strings exactly as before.
"""
import re

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3
    from urllib.error import URLError
    from urllib.request import Request, urlopen


class MovieTop250(object):
    """Download the Douban Top 250 pages and collect one record per movie.

    Attributes:
        start: offset of the next page to fetch (the ``start`` query value).
        movielist: collected records, each a list
            ``[rank, title, alias, rating, quote]`` of strings.
    """

    BASE_URL = "https://movie.douban.com/top250?start="
    PAGE_SIZE = 25        # the offset advances by 25 per page
    TOTAL_MOVIES = 250    # offsets 0, 25, ..., 225 cover the whole list

    # Rank, main title, the next 5-letter-class span (second title or
    # "other" aliases) and the rating.
    # Original author's note: the page inspector shows a bare ``class``
    # attribute on <em>, but the pattern must spell it as class="".
    _CORE_PATTERN = re.compile(
        r'<em.*?class="">(.*?)</em>.*?'
        r'<a.*?>.*?<span class="title">(.*?)</span>.*?'
        r'<span class="\w{5}">(.*?)</span>.*?'
        r'<span class="rating_num".*?>(.*?)</span>',
        re.S)
    # The one-line quote is matched separately because it is optional.
    _QUOTE_PATTERN = re.compile(r'<span class="inq">(.*?)</span>', re.S)

    def __init__(self):
        self.start = 0
        self.movielist = []

    def getPage(self):
        """Fetch the list page at offset ``self.start``.

        Returns:
            The page HTML as a native ``str``, or ``None`` when the request
            fails (the failure reason is printed).
        """
        headers = {"User-Agent": "Mozilla/5.0(Windows NT 6.1;WOW64)"}
        url = self.BASE_URL + str(self.start)
        try:
            response = urlopen(Request(url=url, headers=headers))
            try:
                page = response.read()
            finally:
                # Python 2 responses are not context managers.
                response.close()
        except URLError as e:
            print(getattr(e, "reason", e))
            return None
        # Floor division keeps this an int on Python 3; +1 makes it 1-based.
        pageNum = str(self.start // self.PAGE_SIZE + 1)
        print("正在抓取第" + pageNum + "页")
        # Python 3 returns bytes; Python 2 already returns a native str.
        if not isinstance(page, str):
            page = page.decode("utf-8", "replace")
        return page

    @classmethod
    def _parsePage(cls, page):
        """Extract ``[rank, title, alias, rating, quote]`` records from HTML.

        The quote is searched only between the end of one entry's rating and
        the start of the next entry, so an entry without a quote yields ``""``
        instead of borrowing the next entry's quote and hiding that entry.
        """
        matches = list(cls._CORE_PATTERN.finditer(page))
        movies = []
        for index, match in enumerate(matches):
            if index + 1 < len(matches):
                segment_end = matches[index + 1].start()
            else:
                segment_end = len(page)
            quote = cls._QUOTE_PATTERN.search(page, match.end(), segment_end)
            rank, title, alias, rating = match.groups()
            # lstrip takes a character SET: it removes leading spaces and
            # slashes, not the literal substring ' / '.
            movies.append([rank, title, alias.lstrip(" / "), rating,
                           quote.group(1) if quote else ""])
        return movies

    def getMovie(self):
        """Fetch every remaining page and append its movies to ``movielist``.

        Pages that fail to download are skipped rather than crashing the run.
        """
        while self.start < self.TOTAL_MOVIES:
            page = self.getPage()
            if page is not None:
                self.movielist.extend(self._parsePage(page))
            self.start += self.PAGE_SIZE

    def writeTXT(self, path="doubanmovietop250.txt"):
        """Write ``movielist`` to *path* as UTF-8 text with CRLF line ends.

        Args:
            path: output file; defaults to the original fixed file name.
        """
        labels = ("电影排名: ", "电影名称: ", "原名别名: ", "累计评分: ", "简评: ")
        chunks = []
        for movie in self.movielist:
            for label, value in zip(labels, movie):
                chunks.append(label + value + "\r\n")
            chunks.append("\r\n")  # blank line between movies
        text = "".join(chunks)
        # Native str is already bytes on Python 2; encode on Python 3.
        if not isinstance(text, bytes):
            text = text.encode("utf-8")
        # Binary mode keeps '\r\n' exact (text mode on Windows would emit
        # '\r\r\n').
        with open(path, "wb") as movietop250:
            movietop250.write(text)


if __name__ == "__main__":
    # Run both steps explicitly: scrape first, then write the file.
    doubanspider = MovieTop250()
    doubanspider.getMovie()
    doubanspider.writeTXT()
收获:

1.把功能封装成类的方法之后,必须分别调用相关的两个方法,程序才会执行;也可以在一个方法内部用self.getMovie()这样的写法来调用另一个方法




原创粉丝点击