Python 3 scraping notes: regular expressions and URLs

The full script is below: a small Spider class that downloads a page, collects every regex match, and can save matched images to disk or write records into MySQL, followed by the Zhihu example that drives it.

#coding=utf8
__author__ = 'Administrator'
import os
import re
import urllib.request
import pymysql


class Spider:
    # store the target url, the regex pattern and the save directory
    def __init__(self, url, retext, path):
        self.url = url
        self.path = path
        self.retext = retext

    def mkdir(self, path):
        # create the save directory if it does not exist yet
        isExists = os.path.exists(path)
        if not isExists:
            os.makedirs(path)
        return path

    def getData(self):
        # download the page and return every match of the regex
        url = urllib.request.Request(self.url)
        html = urllib.request.urlopen(url).read()
        html = html.decode('utf-8', 'ignore')
        imgRe = re.compile(self.retext)
        data = imgRe.findall(html)
        return data

    def saveImg(self, imgurl, imgname):
        # download one image and write it into the save directory
        path = self.path
        try:
            img = urllib.request.urlopen(imgurl)
        except Exception as e:
            print(e)
        else:
            img = img.read()
            f = open("./%s/%s.jpg" % (path, imgname), 'wb')
            f.write(img)
            f.close()

    def saveMysql(self, title, url, catogary, content):
        # write one record into the pic table (the connection details are masked in the original post)
        try:
            con = pymysql.connect(host='qdm***w.com', user='q****46', passwd='*******',
                                  db='qd*****db', port=3306, charset='utf8')
            cur = con.cursor()
            # the original had only three placeholders for four values, which raises a formatting error
            insert = "insert into pic(title,url,catogary,content) values ('%s','%s','%s','%s')" \
                     % (title, url, catogary, content)
            cur.execute(insert)
            con.commit()   # commit, otherwise the insert is lost when the connection closes
            cur.close()    # close the cursor
            con.close()    # release the database connection
        except Exception as e:
            print("Exception occurred: %s" % e)

    def getContent(self):
        # run the spider: collect the matches and write them into url.txt in the save directory
        path = self.mkdir(self.path)
        data = self.getData()
        fp = open('./%s/url.txt' % (path), 'w+')
        x = 0
        for d in data:
            print(d)
            fp.write(d)
            if len(d) < 80:
                # self.saveImg(d, x)
                x += 1
        fp.close()


url = "http://www.zhihu.com/question/29649162"
url2 = 'http://image.baidu.com/activity/starfans/4093640704%201415350495?&albumtype=1'
retext = r'http://.*?\.jpg|http://.*?\.png'          # image urls ending in .jpg or .png
retext2 = r'<h2 class="zm-item-title.*?>(.*?)</h2>'  # question titles on the Zhihu page

spider = Spider(url, retext2, "赵丽颖")
spider.getContent()
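To see what the two patterns actually return, here is a minimal, self-contained check with re.findall; the sample_html snippet is invented for illustration and is not taken from the real Zhihu page.

import re

sample_html = (
    '<h2 class="zm-item-title"><a href="/question/29649162">Sample question title</a></h2>'
    '<img src="http://example.com/pics/a.jpg">'
    '<img src="http://example.com/pics/b.png">'
)

img_pattern = r'http://.*?\.jpg|http://.*?\.png'            # non-greedy, so each match stops at the first .jpg/.png
title_pattern = r'<h2 class="zm-item-title.*?>(.*?)</h2>'   # the group captures everything inside the <h2>

print(re.findall(img_pattern, sample_html))
# ['http://example.com/pics/a.jpg', 'http://example.com/pics/b.png']
print(re.findall(title_pattern, sample_html))
# ['<a href="/question/29649162">Sample question title</a>']

Note that the title pattern captures the inner <a> markup as well, which is why the strings written to url.txt may still contain HTML; a stricter group would be needed to strip it.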
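getData relies on urllib.request with its default headers, and some servers refuse the stock Python-urllib User-Agent. A sketch of a fetch helper that sends a browser-like header is below; fetch_html and the header string are illustrative additions, not part of the original script.

import urllib.request

def fetch_html(url):
    # some servers reject the default 'Python-urllib/3.x' User-Agent, so send a browser-like one
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    )
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8', 'ignore')

Swapping this in for the first three lines of getData keeps the rest of the class unchanged.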
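saveMysql builds the INSERT statement with string formatting, which breaks as soon as a title or content value contains a quote. Below is a sketch of the same insert using pymysql's parameter binding; save_record is a hypothetical standalone helper, and the connection values are placeholders since the real credentials are masked in the post.

import pymysql

def save_record(title, url, catogary, content):
    # placeholder credentials -- replace with your own connection details
    con = pymysql.connect(host='localhost', user='user', passwd='password',
                          db='testdb', port=3306, charset='utf8')
    try:
        with con.cursor() as cur:
            # the driver fills in the %s placeholders and escapes quotes inside the values
            cur.execute(
                "insert into pic(title, url, catogary, content) values (%s, %s, %s, %s)",
                (title, url, catogary, content)
            )
        con.commit()  # without commit() the row is rolled back when the connection closes
    except Exception as e:
        print("Exception occurred: %s" % e)
    finally:
        con.close()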
