python抓取豌豆荚app数据信息

来源：互联网 | 发布：玻璃杯知乎 | 编辑：程序博客网 | 时间：2024/05/16 06:32
一个哥们参加 App 大赛，我帮他写了这个抓取程序。但是抓取到的数据好像挺少的，不知道是怎么回事，先贴上来，大家一起研究研究吧。
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 26 10:50:20 2014

@author: lifeix

Collect app records from wandoujia.com: the first batch is parsed out of the
HTML listing page by ``Spyder``; the following batches are read from the
JSONP feed by ``loadMore``.  Every record is a dict with the keys
``app_name``, ``app_size``, ``app_icon_url``, ``app_data_install``,
``app_desc`` and ``app_download_url``.
"""
from contextlib import closing
import traceback

try:  # Python 2 module names, as used by the original script
    from urllib2 import urlopen
    from HTMLParser import HTMLParser
except ImportError:  # Python 3 equivalents
    from urllib.request import urlopen
    from html.parser import HTMLParser

try:
    import simplejson as json
except ImportError:  # fall back to the standard library decoder
    import json


class Spyder(HTMLParser):
    """HTML parser that extracts one record per ``<li class="card">`` entry.

    The parser is a flag-driven state machine: each ``is*`` attribute marks
    which part of an app card the parser is currently inside.  A record is
    finished, appended to ``self.data`` and reset when the ``href`` of the
    install button is seen.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.data = []  # completed app records
        self.temp = {}  # record of the app currently being parsed
        self.isStartTag = False  # entry div of the app list (not read by any method here)
        self.singleLi = False  # inside an app's <li class="card"> element
        self.isAppDescFlag = False  # inside the app's <div class="app-desc">
        self.isMeta = False  # inside <div class="meta">
        self.isInstallCount = False  # <span class="install-count"> was seen
        self.isComment = False  # next text chunk is the app description
        self.isInstallBtn = False  # current <a> is the install button
        self.isDot = False  # <span class="dot"> was seen after the install count
        self.isIconUrl = False  # inside <div class="icon-wrap">, waiting for <img>
        self.icon = False  # icon URL already stored for the current app
        self.recordInstallCount = False  # install count already stored for the current app

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on every opening tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.
        """
        if tag == 'li':
            for key, value in attrs:
                if key == 'class' and value == 'card':
                    self.singleLi = True
                    break
        elif self.singleLi:
            if not self.icon and not self.isIconUrl and tag == 'div':
                for key, value in attrs:
                    if key == 'class' and value == 'icon-wrap':
                        self.isIconUrl = True
                        break
            elif self.isIconUrl and tag == 'img':
                for key, value in attrs:
                    if key == 'src':
                        self.temp['app_icon_url'] = value
                        self.isIconUrl = False
                        self.icon = True
                        break
            if tag == 'div':
                for key, value in attrs:
                    if value == 'app-desc' and key == 'class':
                        # Leave the card header, enter the description part.
                        self.singleLi = False
                        self.isAppDescFlag = True
                        break
        elif self.isAppDescFlag:
            if not self.isMeta and tag == 'a':
                for key, value in attrs:
                    if key == 'title':
                        self.temp['app_name'] = value
                        break
            elif tag == 'div':
                for key, value in attrs:
                    if not self.isMeta and key == 'class' and value == 'meta':
                        self.isMeta = True
                        break
                    elif not self.isComment and key == 'class' and value == 'comment':
                        self.isComment = True
            elif tag == 'span' and self.isMeta:
                for key, value in attrs:
                    if key == 'class':
                        if not self.isInstallCount and value == 'install-count':
                            self.isInstallCount = True
                        elif not self.isDot and self.isInstallCount and value == 'dot':
                            self.isDot = True
                    elif key == 'title' and self.isInstallCount and self.isDot:
                        # The titled span after install count + dot holds the size.
                        self.isInstallCount = False
                        self.isDot = False
                        self.temp['app_size'] = value
            if self.isMeta and tag == 'a':
                if not self.isInstallBtn:
                    for key, value in attrs:
                        # `value` is None for a valueless attribute; guard the strip.
                        if key == 'class' and (value or '').strip() == 'install-btn':
                            self.isInstallBtn = True
                            break
                if self.isInstallBtn:
                    for key, value in attrs:
                        if key == 'href':
                            # The download link completes the record: store it
                            # and reset the per-app state for the next card.
                            self.temp['app_download_url'] = value
                            self.isAppDescFlag = False
                            self.isInstallBtn = False
                            self.isMeta = False
                            self.data.append(self.temp)
                            self.temp = {}
                            self.recordInstallCount = False
                            self.icon = False
                            break

    def handle_data(self, data):
        """Store the install count or the description, depending on state."""
        if not self.recordInstallCount and self.isInstallCount:
            self.recordInstallCount = True
            # Keep only the part before the first space of the text.
            self.temp['app_data_install'] = data.split(' ')[0]
        elif self.isComment:
            self.temp['app_desc'] = data
            self.isComment = False

    def getResult(self):
        """Return the list of records collected so far."""
        return self.data


allData = []  # holds all app records (HTML page first, then the feed pages)
maxData = 12  # amount `start` is advanced by after each feed request
moreUrl = 'http://apps.wandoujia.com/api/v1/feeds?max=12&start=%d&opt_fields=data.app.tags.*,data.app.editorComment,data.app.likesCount,data.app.reason,data.app.ad,data.app.title,data.app.packageName,data.app.apks.size,data.app.icons.px68,data.app.apks.superior,data.app.installedCountStr,data.app.snippet,data.app.apks.versionCode&callback=jsonp1'


def _decode(raw):
    """Return *raw* as text, decoding byte strings as UTF-8."""
    if isinstance(raw, bytes):
        return raw.decode('utf-8', 'replace')
    return raw


def _fetch(url):
    """Download *url* and return its body as text, closing the connection."""
    with closing(urlopen(url)) as response:
        return _decode(response.read())


def _strip_jsonp(text):
    """Return the JSON payload wrapped in a ``callback(...)`` JSONP response.

    Text that already starts like a JSON document, or that has no enclosing
    parentheses, is returned stripped of surrounding whitespace.
    """
    body = text.strip()
    if body[:1] in ('{', '['):
        return body
    opening = body.find('(')
    ending = body.rfind(')')
    if opening == -1 or ending < opening:
        return body
    return body[opening + 1:ending]


def _record_from_feed_item(item):
    """Build one app record from a feed entry.

    Raises KeyError, IndexError or TypeError when the entry lacks a field.
    """
    app = item['app']
    return {'app_name': app['title'],
            'app_size': app['apks'][0]['size'],
            'app_icon_url': app['icons']['px68'],
            'app_data_install': app['installedCountStr'].split(' ')[0],
            'app_desc': app['editorComment'],
            'app_download_url': 'http://apps.wandoujia.com/apps/%s/download' % app['packageName']}


def loadMore(fetch=None):
    """Append the records of every feed page to the global ``allData``.

    Paging starts at offset 24 and advances by ``maxData`` until a page has
    no entries or a request/decoding error occurs.  A single malformed entry
    is reported and skipped instead of ending the whole crawl.

    fetch -- optional callable taking a URL and returning the response body
             (text or bytes); defaults to a real HTTP request.
    """
    if fetch is None:
        fetch = _fetch
    start = 24
    while True:
        try:
            payload = _strip_jsonp(_decode(fetch(moreUrl % start)))
            feedItems = json.loads(payload)['data']
            print('%d %d %d' % (len(feedItems), start, len(allData)))
        except Exception as e:
            # Boundary handler: network or decoding trouble ends the crawl.
            print(e)
            print(traceback.format_exc())
            break
        if not feedItems:
            break
        for item in feedItems:
            try:
                allData.append(_record_from_feed_item(item))
            except (KeyError, IndexError, TypeError) as e:
                print('skipping malformed app record: %r' % (e,))
        start = start + maxData
    print('loadmore is finished........')


if __name__ == '__main__':
    content = _fetch("http://www.wandoujia.com/apps")
    spyder = Spyder()
    spyder.feed(content)
    allData.extend(spyder.getResult())
    loadMore()
    print(len(allData))
0 0