python抓取豌豆荚app数据信息
来源:互联网 发布:玻璃杯 知乎 编辑:程序博客网 时间:2024/05/16 06:32
一个哥们参加 app 大赛，我帮他写了这个抓取程序，但抓取到的数据好像挺少，不知道是怎么回事，先贴上来，大家一起研究研究吧。
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 26 10:50:20 2014

@author: lifeix

Scrape app records from wandoujia.com.

The landing page (HTML) is parsed with ``Spyder``; further records are read
from the JSONP feed addressed by ``moreUrl``.  Every record is a dict with
the keys ``app_name``, ``app_size``, ``app_icon_url``, ``app_data_install``,
``app_desc`` and ``app_download_url`` (keys whose markup is missing on the
HTML page are simply absent).
"""
import contextlib
import traceback

try:  # Python 2 module names
    import urllib2
    from HTMLParser import HTMLParser
except ImportError:  # Python 3 module names
    import urllib.request as urllib2
    from html.parser import HTMLParser

try:
    import simplejson as json
except ImportError:  # simplejson is optional; the stdlib module has the same API
    import json


class Spyder(HTMLParser):
    """HTML parser that collects one record per ``<li class="card">`` element.

    The parser is a small state machine driven by boolean flags:
    card ``<li>`` -> icon ``<div class="icon-wrap">``/``<img>`` ->
    ``<div class="app-desc">`` -> ``<div class="meta">`` (install count, size)
    -> ``<div class="comment">`` -> ``<a class="install-btn">``.
    A record is appended to ``self.data`` when the install button's ``href``
    is seen; all per-record flags are reset at that point.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.data = []                   # all completed app records
        self.temp = {}                   # record of the app being parsed
        self.isStartTag = False          # entry div of the app list (never read here)
        self.singleLi = False            # inside a card <li>, before app-desc
        self.isAppDescFlag = False       # inside the app-desc <div>
        self.isMeta = False              # the meta <div> has been seen
        self.isInstallCount = False      # the install-count <span> has been seen
        self.isComment = False           # next text node is the app description
        self.isInstallBtn = False        # the install button <a> has been seen
        self.isDot = False               # the "dot" separator <span> has been seen
        self.isIconUrl = False           # inside icon-wrap, waiting for <img>
        self.icon = False                # icon URL already stored for this card
        self.recordInstallCount = False  # install count already stored for this card

    @staticmethod
    def _hasClass(attrs, name):
        """Return True if *attrs* holds a ``class`` attribute equal to *name*."""
        return any(key == 'class' and value == name for key, value in attrs)

    def handle_starttag(self, tag, attrs):
        """Advance the state machine for one start tag."""
        if tag == 'li':
            if self._hasClass(attrs, 'card'):
                self.singleLi = True
        elif self.singleLi:
            self._handleCardTag(tag, attrs)
        elif self.isAppDescFlag:
            self._handleAppDescTag(tag, attrs)

    def _handleCardTag(self, tag, attrs):
        """Handle a tag inside a card but before its app-desc div."""
        if tag == 'div' and not self.icon and not self.isIconUrl:
            if self._hasClass(attrs, 'icon-wrap'):
                self.isIconUrl = True
        elif tag == 'img' and self.isIconUrl:
            for key, value in attrs:
                if key == 'src':
                    self.temp['app_icon_url'] = value
                    self.isIconUrl = False
                    self.icon = True
                    break
        if tag == 'div' and self._hasClass(attrs, 'app-desc'):
            self.singleLi = False
            self.isAppDescFlag = True

    def _handleAppDescTag(self, tag, attrs):
        """Handle a tag inside the app-desc div (name, meta, comment, button)."""
        if tag == 'a' and not self.isMeta:
            for key, value in attrs:
                if key == 'title':
                    self.temp['app_name'] = value
                    break
        elif tag == 'div':
            for key, value in attrs:
                if key != 'class':
                    continue
                if not self.isMeta and value == 'meta':
                    self.isMeta = True
                    break
                elif not self.isComment and value == 'comment':
                    self.isComment = True
        elif tag == 'span' and self.isMeta:
            for key, value in attrs:
                if key == 'class':
                    if not self.isInstallCount and value == 'install-count':
                        self.isInstallCount = True
                    elif (not self.isDot and self.isInstallCount
                          and value == 'dot'):
                        self.isDot = True
                elif key == 'title' and self.isInstallCount and self.isDot:
                    # The first titled span after the dot carries the size.
                    self.isInstallCount = False
                    self.isDot = False
                    self.temp['app_size'] = value

        if tag == 'a' and self.isMeta:
            if not self.isInstallBtn:
                # A valueless attribute (<a class>) is reported as None.
                if any(key == 'class' and (value or '').strip() == 'install-btn'
                       for key, value in attrs):
                    self.isInstallBtn = True
            if self.isInstallBtn:
                for key, value in attrs:
                    if key == 'href':
                        self.temp['app_download_url'] = value
                        self._finishRecord()
                        break

    def _finishRecord(self):
        """Store the current record and reset the per-record state."""
        self.isAppDescFlag = False
        self.isInstallBtn = False
        self.isMeta = False
        self.data.append(self.temp)
        self.temp = {}
        self.recordInstallCount = False
        self.icon = False

    def handle_data(self, data):
        """Capture the install count or the description from a text node."""
        if not self.recordInstallCount and self.isInstallCount:
            self.recordInstallCount = True
            # Keep only the part before the first space.
            self.temp['app_data_install'] = data.split(' ')[0]
        elif self.isComment:
            self.temp['app_desc'] = data
            self.isComment = False

    def getResult(self):
        """Return the list of records collected so far."""
        return self.data


allData = []  # every app record collected by this script
maxData = 12  # page size of the feed; also the step between two pages
requestTimeout = 30  # seconds; without a timeout a stalled server hangs the run
moreUrl = 'http://apps.wandoujia.com/api/v1/feeds?max=12&start=%d&opt_fields=data.app.tags.*,data.app.editorComment,data.app.likesCount,data.app.reason,data.app.ad,data.app.title,data.app.packageName,data.app.apks.size,data.app.icons.px68,data.app.apks.superior,data.app.installedCountStr,data.app.snippet,data.app.apks.versionCode&callback=jsonp1'


def _fetchText(url):
    """Download *url* and return its body as text, closing the connection."""
    with contextlib.closing(
            urllib2.urlopen(url, timeout=requestTimeout)) as response:
        # NOTE(review): assumes the server answers in UTF-8 - confirm.
        return response.read().decode('utf-8', 'replace')


def stripJsonp(payload):
    """Return the JSON text inside a ``callback(...);`` JSONP wrapper.

    Text that already starts with ``{`` or ``[`` is returned unchanged (only
    surrounding whitespace is removed).  The wrapper is located by its
    parentheses instead of fixed offsets, so trailing whitespace or a
    different callback name does not corrupt the JSON.
    """
    text = payload.strip()
    if text[:1] in ('{', '['):
        return text
    opening = text.find('(')
    closingParen = text.rfind(')')
    if opening == -1 or closingParen < opening:
        return text
    return text[opening + 1:closingParen]


def parseFeedPage(payload):
    """Parse one feed response into ``(entryCount, records)``.

    ``entryCount`` is the number of entries under the response's ``data``
    key; ``records`` holds one dict per well-formed entry.  An entry lacking
    an expected field is reported and skipped rather than raising, so one
    bad entry cannot abort the whole crawl.

    Raises ValueError (invalid JSON) or KeyError (no ``data`` key).
    """
    entries = json.loads(stripJsonp(payload))['data']
    records = []
    for entry in entries:
        try:
            app = entry['app']
            packageName = app['packageName']
            records.append({
                'app_name': app['title'],
                'app_size': app['apks'][0]['size'],
                'app_icon_url': app['icons']['px68'],
                'app_data_install': app['installedCountStr'].split(' ')[0],
                'app_desc': app['editorComment'],
                'app_download_url':
                    'http://apps.wandoujia.com/apps/%s/download' % packageName,
            })
        except (KeyError, IndexError, TypeError, AttributeError) as e:
            print('skipping malformed feed entry: %r' % (e,))
    return len(entries), records


def loadMore():
    """Append feed records to ``allData`` until the feed returns an empty page.

    Paging starts at offset 24 and advances by ``maxData``.
    NOTE(review): 24 presumably skips the entries already present on the HTML
    page - confirm.  Any download or decoding error is printed with its
    traceback and ends the loop (best effort).
    """
    start = 24
    while True:
        try:
            entryCount, records = parseFeedPage(_fetchText(moreUrl % start))
        except Exception as e:  # boundary of the crawl: report and stop
            print(e)
            print(traceback.format_exc())
            break
        print('%d %d %d' % (entryCount, start, len(allData)))
        if entryCount < 1:
            break
        allData.extend(records)
        start += maxData
    print('loadmore is finished........')


def main():
    """Scrape the landing page, then the feed, and print the record count."""
    spyder = Spyder()
    spyder.feed(_fetchText("http://www.wandoujia.com/apps"))
    allData.extend(spyder.getResult())
    loadMore()
    print(len(allData))


if __name__ == '__main__':
    main()
0 0
- python抓取豌豆荚app数据信息
- Python爬虫:抓取手机APP数据
- Python抓取360手机市场APP信息并做简单分析
- Python爬虫:抓取手机APP的传输数据
- Python爬虫:抓取手机APP的传输数据
- [Python]网页信息抓取
- python 页面信息抓取
- Scrapy+MongoDB爬取并存储豌豆荚App数据
- pyspider抓取数据信息
- python抓取数据例子
- python抓取数据步骤
- Python Scrapy抓取数据
- python数据抓取
- python 抓取网页数据
- Python, 数据抓取
- python 抓取搜房网数据
- python抓取动态数据
- python 抓取数据相关
- orcale 中常用函数
- 日期格式
- C++学习笔记(2)
- ubuntu sublime安装及配置
- 面试求职中需要了解的Java多线程知识
- python抓取豌豆荚app数据信息
- Laravel cheatsheet
- 阿里实习二面
- 建造者模式
- jQuery中ajax异步请求应用
- MonkeyRunner常用类方法
- 同步与异步--阻塞与非阻塞型I/O
- 本地无法SSH,远程可以SSH
- Maven的安装与运行及MyEclipse8.6中的设置