Python爬取百度贴吧图片脚本

来源：互联网　发布：mac的鼠标触摸板　编辑：程序博客网　时间：2024/05/18 22:17

新手一枚，以下是爬取百度贴吧指定帖子中图片的脚本。因为脚本主要依赖解析 HTML 代码，一旦百度修改页面前端代码，脚本就会失效，权当爬虫入门练习吧，后续还会尝试更多的爬虫。

#!/usr/bin/env python
# coding=utf-8
"""Download the images posted in a Baidu Tieba thread.

The scraper recognises thread images purely by their URL prefix, so it
stops finding anything as soon as Tieba changes its image URLs or markup.
"""
import contextlib
import os

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve

# Prefix shared by the images this script collects from a thread page.
IMG_URL_PREFIX = 'http://imgsrc.baidu.com/forum/w%3D580/sign='


def getHtml(url):
    """Return the raw response body of *url*.

    The response object is always closed, even if reading fails.
    """
    # contextlib.closing works for both the Python 2 and Python 3 response
    # objects; only the latter is a context manager on its own.
    with contextlib.closing(urlopen(url)) as page:
        return page.read()


def _extractImgUrl(src):
    """Return the thread-image URL contained in *src*, or None.

    The URL runs from IMG_URL_PREFIX up to and including the first
    following '.jpg'.  None is returned when the prefix or the '.jpg'
    suffix is missing, so callers never receive an empty or partial URL.
    """
    start = src.find(IMG_URL_PREFIX)
    if start == -1:
        return None
    end = src.find('.jpg', start)
    if end == -1:
        return None
    return src[start:end + len('.jpg')]


def _pageImgUrls(html):
    """Return the thread-image URLs of every <img> tag in *html*, in order."""
    # Imported here so the module can be imported (e.g. to use getHtml)
    # on machines where BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    # Naming the parser keeps the result independent of which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    urls = []
    for tag in soup.find_all('img'):
        # Read the src attribute directly instead of string-splitting the
        # rendered tag list, which broke on URLs containing commas.
        url = _extractImgUrl(tag.get('src') or '')
        if url is not None:
            urls.append(url)
    return urls


def getImg(baseUrl='http://tieba.baidu.com/p/4657665666',
           imgPath='F:/craw_tieba/', lastPage=113):
    """Download every image of a Tieba thread into *imgPath*.

    Pages 1..lastPage of the thread are fetched first and their image URLs
    collected; the images are then saved as 1.jpg, 2.jpg, ... in the order
    they were found.

    Args:
        baseUrl: URL of the thread, without the ``?pn=`` page parameter.
        imgPath: Directory that receives the images; created if missing.
        lastPage: Number of the last thread page to crawl (inclusive).
    """
    if not os.path.exists(imgPath):
        os.makedirs(imgPath)

    imgList = []
    for pg in range(1, lastPage + 1):
        url = '%s?pn=%s' % (baseUrl, pg)
        # Two spaces: identical to the output of the former `print a, b`.
        print('Craw:  %s' % url)
        imgList.extend(_pageImgUrls(getHtml(url)))

    for index, img in enumerate(imgList, 1):
        # Build the target from imgPath so the directory created above and
        # the download location can never drift apart.
        urlretrieve(img, os.path.join(imgPath, '%s.jpg' % index))
    print('Craw tieba finish!')


if __name__ == '__main__':
    getImg()

0 0