Python抓取几大票房统计系统数据之猫眼电影

来源:互联网 发布:淘宝司马zm17 编辑:程序博客网 时间:2024/05/18 01:00
# coding=utf-8
# Scrape per-film figures from the Maoyan box-office page.
#
# The script downloads the page, collects the film names found in <b> tags
# and the values found in <i class="cs gsBlur"> tags, then groups the values
# into fixed-width rows and tags each row with a film name.
#
# Runs on both Python 2 and Python 3: every print call takes exactly one
# argument, so it behaves the same as the Python 2 print statement.
import re
from contextlib import closing

try:  # Python 2
    import urllib2
except ImportError:  # Python 3 exposes the same urlopen() in urllib.request
    import urllib.request as urllib2

try:
    import chardet
except ImportError:  # detection is optional, see Maoyandianying_01._to_utf8
    chardet = None


class Maoyandianying_01(object):
    """Collect film names and per-film value rows from one page of markup.

    Attributes (all per-instance lists):
        dataT_fileName: film names, de-duplicated, in page order.
        dataT_h2: contents of every ``<i class="cs gsBlur">`` tag, in order.
        dx: indexes into ``dataT_h2`` that are multiples of ``ROW_WIDTH``.
        df: rows built by :meth:`sendtodb`; ``ROW_WIDTH`` values followed by
            the film name when one is available.
    """

    # Number of <i class="cs gsBlur"> values that make up one film's row.
    ROW_WIDTH = 5

    # A <b> element whose content does not start with the value tag.  The
    # lookahead rejects exactly that tag; a character class would instead
    # reject every name beginning with one of the tag's letters.
    _NAME_RE = re.compile(r'<b>(?!<i class="cs gsBlur">)(.+?)</b>')
    # Non-greedy so several value tags on a single line are all captured.
    _CELL_RE = re.compile(r'<i class="cs gsBlur">(.*?)</i>')

    def __init__(self):
        # Created per instance: class-level lists would be shared by every
        # scraper object and accumulate each other's data.
        self.dataT_h2 = []
        self.dx = []
        self.dataT_fileName = []
        self.df = []

    @staticmethod
    def _to_utf8(raw):
        """Return the downloaded bytes *raw* re-encoded as UTF-8 bytes.

        The encoding is detected with ``chardet`` when it is installed,
        otherwise by attempting a strict UTF-8 decode.  Anything that is not
        recognised as UTF-8 is treated as gb2312, ignoring undecodable bytes.
        """
        encoding = None
        if chardet is not None:
            encoding = chardet.detect(raw)['encoding']
        else:
            try:
                raw.decode('utf-8')
            except UnicodeDecodeError:
                pass
            else:
                encoding = 'utf-8'
        print(encoding)
        # Case-insensitive, and a BOM-prefixed page is still UTF-8.
        if encoding and encoding.lower() in ('utf-8', 'utf-8-sig'):
            print('xx')
            return raw
        return raw.decode('gb2312', 'ignore').encode('utf-8')

    @staticmethod
    def _native(markup):
        """Return *markup* as the interpreter's native ``str`` type.

        On Python 2 UTF-8 byte strings are already ``str`` and pass through
        untouched; on Python 3 bytes are decoded so the ``str`` regular
        expressions above can be applied to them.
        """
        if isinstance(markup, bytes) and not isinstance(markup, str):
            return markup.decode('utf-8', 'ignore')
        return markup

    def downAndparse(self, url):
        """Download *url* and feed the page to :meth:`parseB` and :meth:`parseI`.

        Raises whatever ``urlopen`` raises when the request fails.
        """
        # closing() guarantees the connection is released on both Python 2
        # (where the response is not a context manager) and Python 3.
        with closing(urllib2.urlopen(url)) as conn:
            raw = conn.read()
        response = self._to_utf8(raw)
        self.parseB(response)
        self.parseI(response)

    def parseB(self, response):
        """Append the film names found in *response* to ``dataT_fileName``.

        Names are the contents of ``<b>`` elements that do not start with a
        ``<i class="cs gsBlur">`` tag.  Duplicates within one call are
        dropped while keeping first-seen page order, because
        :meth:`sendtodb` pairs names with rows by position.
        """
        seen = set()
        for name in self._NAME_RE.findall(self._native(response)):
            if name not in seen:
                seen.add(name)
                self.dataT_fileName.append(name)

    def parseI(self, response):
        """Append every ``<i class="cs gsBlur">`` value to ``dataT_h2``.

        ``dx`` is then recomputed from the full list, so calling this method
        more than once never leaves duplicate boundaries behind.
        """
        self.dataT_h2.extend(self._CELL_RE.findall(self._native(response)))
        self.dx[:] = range(self.ROW_WIDTH, len(self.dataT_h2), self.ROW_WIDTH)

    def sendtodb(self):
        """Rebuild ``df`` from the parsed values and names.

        Despite the name nothing is written to a database: the method prints
        progress information and fills ``self.df``.  The first parsed value
        is skipped (NOTE(review): presumably a page-level figure that belongs
        to no film -- confirm against the page), the remaining values are cut
        into complete rows of ``ROW_WIDTH`` and a trailing partial row is
        dropped.  Each row gets the film name at the same position appended;
        rows beyond the last known name are kept without one.
        """
        for name in self.dataT_fileName:
            print(name)
        print('------')
        print('=========')
        print(self.dx)
        print(len(self.dataT_h2))

        width = self.ROW_WIDTH
        cells = self.dataT_h2[1:]
        rows = [cells[start:start + width]
                for start in range(0, len(cells) - width + 1, width)]

        print(len(self.dataT_fileName))
        print(len(rows))
        for row, name in zip(rows, self.dataT_fileName):
            row.append(name)
        # Replace instead of extend so a second call cannot duplicate rows.
        self.df[:] = rows


if __name__ == '__main__':
    url = 'https://piaofang.maoyan.com/?ver=normal'
    maoyan = Maoyandianying_01()
    maoyan.downAndparse(url)
    maoyan.sendtodb()
    print(len(maoyan.df))
    for i in maoyan.df:
        print(i)






原创粉丝点击