#!/usr/env python#-*- coding: utf-8 -*-import requestsimport os,sys import MySQLdbimport timefrom BeautifulSoup import BeautifulSoupimport renum=0dataresult=[]def main():try:conn=MySQLdb.connect(host='localhost',user='root',passwd='123456',db='addressbookdb',charset="utf8")conn.query("set names utf8")except Exception,e:print esys.exit()cursor=conn.cursor() category=['PERSONALIZATION','TRANSPORTATION','SPORTS','HEALTH_AND_FITNESS','APP_WALLPAPER','COMICS','MEDICAL','BUSINESS','BOOKS_AND_REFERENCE','WEATHER','ENTERTAINMENT','MEDIA_AND_VIDEO','APP_WIDGETS','TOOLS','PHOTOGRAPHY','PRODUCTIVITY','EDUCATION','NEWS_AND_MAGAZINES','TRAVEL_AND_LOCAL','LIFESTYLE','SOCIAL','FINANCE','SHOPPING','LIBRARIES_AND_DEMO','COMMUNICATION','MUSIC_AND_AUDIO','GAME']for k in range(0,27): print kt="https://play.google.com/store/apps/category/"+category[k] try: html=requests.get(t) preresult=html.content soup=BeautifulSoup(preresult) result=soup.prettify("utf-8") except: time.sleep(30) pass pattern=re.compile('<a class="title" href="(.+?)" title') dataresult=re.findall(pattern,result) for i in dataresult: url="https://play.google.com"+i try: html=requests.get(url) preresult=html.content soup=BeautifulSoup(preresult) result=soup.prettify("utf-8") except: time.sleep(30) pass #名称 pattern=re.compile('<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>') data0=re.findall(pattern,result) for items in data0: print items #制造商 pattern=re.compile('itemprop="name">([\s\S]*?)</a>') data1=re.findall(pattern,result) make=data1[0].split("\n") print make[8]#版本 pattern=re.compile('itemprop="softwareVersion">([\s\S]*?)</div>') data2=re.findall(pattern,result) print data2[0] #更新时间 pattern=re.compile('itemprop="datePublished">([\s\S]*?)</div>') data3=re.findall(pattern,result) print data3[0] #文件大小 pattern=re.compile('itemprop="fileSize">([\s\S]*?)</div>') data4=re.findall(pattern,result) print data4[0] #支持固件import sys pattern=re.compile('itemprop="operatingSystems">([\s\S]*?)</div>') data5=re.findall(pattern,result) print data5[0] #说明 pattern=re.compile('itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>') data6=re.findall(pattern,result) sql="insert into googlemarket(name,developer,version,pubtime,filesize,support,classify,introduction) values(%s,%s,%s,%s,%s,%s,%s,%s)" for items in data6: values=(data0[0],make[8],data2[0],data3[0],data4[0],data5[0],category[k],re.sub('[<br /> <p> </p>]',' ',items)) print sql %values print category[k] try: cursor.execute(sql,values) conn.commit() except: pass pattern=re.compile('<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />') data=re.findall(pattern,result) global num for j in data: print j try: temp=requests.get(j[1:-2],) except: time.sleep(30) pass f=file("googlemarket/"+str(num),"w+") num=num+1 print num f.write(temp.content)if __name__=="__main__": main()