Python爬虫学习记录(1)——百度贴吧图片下载

来源:互联网 发布:Windows切换目录 编辑:程序博客网 时间:2024/06/04 18:16
#!/usr/bin/python
# coding=utf-8
# Baidu Tieba image downloader: walks the pages of one topic and saves every
# image URL found on each page into a local "topic_<id>" directory.
import glob
import os
import re
from urllib.request import urlopen
from urllib.request import urlretrieve

# Pattern for the image URLs embedded in a topic page. Compiled once at import
# time instead of on every getImg() call.
_IMG_URL_RE = re.compile(r'http:\/\/imgsrc.baidu.com\/forum\/.{70,100}jpg')

# Directory the original script dumped every fetched page into for debugging.
# It only exists on the original author's machine, so the dump is skipped
# (rather than crashing) when the directory is absent.
_DEBUG_DUMP_DIR = "/usr/lxp/python_test/getImg_Python"


def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The bytes returned by reading the whole response.
    """
    # The context manager closes the connection even if read() raises.
    with urlopen(url) as page:
        return page.read()


def _to_text(html):
    """Return *html* as ``str``, decoding bytes as UTF-8 (bad bytes replaced)."""
    if isinstance(html, (bytes, bytearray)):
        # Decode instead of str(html): str() on bytes yields the "b'...'" repr
        # with escape sequences, not the page text.
        return bytes(html).decode("utf-8", errors="replace")
    return str(html)


def _dump_page(text, page_num):
    """Write *text* to ``out_<page_num>`` in the debug directory if it exists."""
    if not os.path.isdir(_DEBUG_DUMP_DIR):
        return
    dump_path = os.path.join(_DEBUG_DUMP_DIR, "out_" + str(page_num))
    with open(dump_path, "w", encoding="utf-8") as dump_file:
        dump_file.write(text)


def getImg(html, id, page_num):
    """Download every image URL found in *html* into the current directory.

    Files are named ``topic_<id>_<page_num>_<index>.jpg`` where ``index`` is
    the zero-based position of the URL on the page.

    Args:
        html: Page content, either bytes (as returned by ``getHtml``) or str.
        id: Topic id string used in the file names. The name shadows the
            ``id`` builtin but is kept so existing keyword callers still work.
        page_num: Page number used in the file names and the debug dump name.

    Returns:
        The list of image URLs matched on the page, in page order.
    """
    text = _to_text(html)
    _dump_page(text, page_num)
    imglist = _IMG_URL_RE.findall(text)
    for index, imgurl in enumerate(imglist):
        save_name = 'topic_' + id + '_' + str(page_num) + '_%s.jpg' % index
        urlretrieve(imgurl, save_name)
        # Report only after the download has actually completed.
        print('download' + save_name + ' sucessfully from ' + imgurl)
    return imglist


def _move_images(topic_id, target_dir):
    """Move ``topic_<topic_id>*.jpg`` files from the cwd into *target_dir*."""
    # glob.escape keeps wildcard characters in topic_id from being expanded.
    pattern = 'topic_' + glob.escape(topic_id) + '*.jpg'
    for path in glob.glob(pattern):
        if os.path.isfile(path):
            # os.replace overwrites an existing destination, like `mv` does.
            os.replace(path, os.path.join(target_dir, os.path.basename(path)))


def getAllImg(topic_id):
    """Download the images of a topic page by page into ``topic_<topic_id>``.

    Pages are requested with increasing ``pn`` until a page has the same byte
    length as the previous one, which is treated as "no more pages"
    (presumably the site repeats the last page past the end; verify).

    Args:
        topic_id: Topic id string, inserted into the page URL and used for the
            output directory and file names.

    Returns:
        The page number at which the loop stopped.
    """
    target_dir = 'topic_' + topic_id
    # Filesystem calls instead of os.system('mkdir ...') / os.system('mv ...'):
    # no shell is involved, so topic_id cannot inject commands, and it is
    # portable beyond POSIX shells.
    os.makedirs(target_dir, exist_ok=True)
    page_num = 1
    html_len = 0
    while True:
        html = getHtml("http://tieba.baidu.com/p/" + topic_id + '?see_lz=1&pn=' + str(page_num))
        print(str(html_len) + ' ' + str(len(html)))
        if html_len == len(html):
            break
        getImg(html, topic_id, page_num)
        _move_images(topic_id, target_dir)
        html_len = len(html)
        page_num = page_num + 1
    return page_num


if __name__ == "__main__":
    # Prompt only when run as a script, so importing the module does not
    # block on input() or start a crawl.
    topic_id = input("topic id:")
    getAllImg(topic_id)

</pre><pre name="code" class="python">
1 0
原创粉丝点击