python84

来源:互联网 发布:java工作流视频教程 编辑:程序博客网 时间:2024/06/06 15:54

anzhi84.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl the "hot" app listing of www.anzhi.com.

The script works in two phases:

1. Walk the paginated listing and collect the detail-page link of every app.
2. Fetch each detail page, extract the app's metadata with regular
   expressions, insert it into the ``anzhi`` MySQL table and save the app
   icon under ``anzhi/<n>``.

The page body is handled as a byte string (``response.content``) and matched
against plain ``str`` patterns, i.e. the script targets Python 2, like the
``MySQLdb`` driver it was written against.  Only syntax that is also valid on
Python 3 is used.
"""
import requests
import os
import sys
import time
import MySQLdb
import re

# Number of icon files written so far; also the file name of the next icon.
num = 0
# Detail-page links collected from the listing pages (de-duplicated).
dataresult = []

# Listing pages list_1_<FIRST_PAGE>_hot.html .. list_1_<LAST_PAGE>_hot.html.
FIRST_PAGE = 1
LAST_PAGE = 7772
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30
# Seconds to pause after a failed request before moving on.
RETRY_DELAY = 30

# One column per placeholder.  The original statement fused the last two
# columns into "classifyintroduction", giving 7 columns for 8 values, so
# every insert failed.
INSERT_SQL = ("insert into anzhi(name,version,developer,pubtime,filesize,"
              "support,classify,introduction) "
              "values(%s,%s,%s,%s,%s,%s,%s,%s)")

_LINK_RE = re.compile('<span class="app_name"><a href="(.+?)">')
# (label, pattern) pairs in the column order of INSERT_SQL.
_FIELD_PATTERNS = (
    ("name", re.compile(r'<div class="detail_line">[\s\S]*?<h3>(.+?)</h3>')),
    ("version", re.compile('<span class="app_detail_version">(.+?)</span>')),
    ("developer", re.compile('开发者:(.+?)</span>')),
    ("pubtime", re.compile('发布时间:(.+?)</li>')),
    ("filesize", re.compile('文件大小:(.+?)</span></li>')),
    ("support", re.compile('系统支持:(.+?)</li>')),
    ("classify", re.compile('所属类别:(.+?)</li>')),
)
_INTRO_RE = re.compile(r'<div class="app_detail_infor">([\s\S]*?)</div>')
# The original pattern ended in a bare lazy group, which can only ever
# capture a single character; capture the quoted attribute value instead.
_ICON_RE = re.compile(r'<div class="detail_icon">[\s\S]*?<img src="([^"]+)"')


def _fetch(url):
    """Return the raw response body of *url*, or None if the request failed.

    On failure the error is printed and the crawler pauses for RETRY_DELAY
    seconds before the caller moves on to the next URL.
    """
    try:
        return requests.get(url, timeout=REQUEST_TIMEOUT).content
    except requests.exceptions.RequestException as e:
        print(e)
        time.sleep(RETRY_DELAY)
        return None


def _collect_links(known):
    """Return the de-duplicated detail-page links of all listing pages.

    *known* is an iterable of links collected earlier; they are kept in the
    result.  A set is used so de-duplication is not redone for every page.
    """
    links = set(known)
    for k in range(FIRST_PAGE, LAST_PAGE + 1):
        url = "http://www.anzhi.com/list_1_" + str(k) + "_hot.html"
        print(url)
        page = _fetch(url)
        if page is None:
            continue
        links.update(_LINK_RE.findall(page))
        print(len(links))
    return list(links)


def _parse_fields(page):
    """Return the metadata values of a detail page in column order.

    Returns None (after reporting which field is missing) when any pattern
    does not match, instead of raising IndexError and aborting the crawl.
    """
    fields = []
    for label, pattern in _FIELD_PATTERNS:
        match = pattern.search(page)
        if match is None:
            print("missing field: " + label)
            return None
        print(match.group(1))
        fields.append(match.group(1))
    return fields


def _store(conn, cursor, fields, page):
    """Insert one row per introduction block found in *page*."""
    for intro in _INTRO_RE.findall(page):
        print(intro.replace('<br />', ' '))
        values = tuple(fields) + (intro.replace('</p> <br />', ' '),)
        try:
            cursor.execute(INSERT_SQL, values)
            conn.commit()
        except MySQLdb.Error as e:
            # Best effort: report the failed row and keep crawling.
            print(e)


def _save_icon(page):
    """Download the app icon referenced by *page* into ``anzhi/<num>``.

    Nothing is written (and ``num`` is not advanced) when the page has no
    icon or the download fails.
    """
    global num
    match = _ICON_RE.search(page)
    if match is None:
        return
    print(match.group(1))
    image = _fetch(match.group(1))
    if image is None:
        return
    if not os.path.isdir("anzhi"):
        os.makedirs("anzhi")
    # Binary mode: the payload is image data, not text.
    with open("anzhi/" + str(num), "wb") as f:
        f.write(image)
    num = num + 1
    print(num)


def main():
    """Run the crawl: collect links, then store metadata and icons.

    Exits with status 1 when the database connection cannot be opened.
    """
    global dataresult
    try:
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='addressbookdb', charset="utf8")
        conn.query("set names utf8")
    except MySQLdb.Error as e:
        print(e)
        sys.exit(1)
    cursor = conn.cursor()
    try:
        dataresult = _collect_links(dataresult)
        with open("anzhi.txt", "a+") as f:
            f.write(str(len(dataresult)))
        print(len(dataresult))
        for i in dataresult:
            print(i)
            page = _fetch('http://www.anzhi.com/' + i)
            if page is None:
                # Do not parse a stale page from the previous iteration.
                continue
            fields = _parse_fields(page)
            if fields is None:
                continue
            _store(conn, cursor, fields, page)
            _save_icon(page)
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    main()


原创粉丝点击