Python 遍历网页代码抓取文字和图片

来源:互联网 发布:仿虎牙直播网站源码 编辑:程序博客网 时间:2024/05/12 08:12

Python 遍历网页代码抓取文字和图片

通过 Python 的几个工具 requests、BeautifulSoup、json、multiprocessing.Pool,暴力遍历 URL 抓取内容

  • 获取全量的文字和图片链接
  • 获取图片

获取全量的文字和图片链接

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Crawl a numbered range of pages and dump their text and image links.

Every page URL is ``url_root`` followed by a page number.  For each page the
script records a slice of the title, the ``description`` meta tag and the
``src`` of the second ``<img>`` element, then writes all records to
``data.txt`` as a JSON document of the form ``{"data": [...]}``.
"""
import json
import time
from multiprocessing import Pool

import bs4
import requests

# The real site address is intentionally not published; fill it in before use.
url_root = 'xxxx'

# Seconds to wait for a page before giving up, so that one stalled
# connection cannot block a worker forever.
REQUEST_TIMEOUT = 10


def geturl(num):
    """Return the page URL for page number *num*."""
    return url_root + str(num)


def geturls(num, start=14):
    """Return the URLs of pages ``start`` .. ``num - 1`` as a list.

    ``start`` defaults to 14, the first page number the original script
    crawled.
    """
    return [geturl(page) for page in range(start, num)]


def getdata(url):
    """Fetch *url* and extract its index, description and image link.

    Returns a dict that may contain the keys ``index``, ``content`` and
    ``img``; a key is omitted when the page does not provide that piece of
    data.  When the page cannot be fetched (network error or a non-200
    status) the placeholder ``{"noValue": "noValue"}`` is returned instead,
    so a single bad page never aborts the whole crawl.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return {"noValue": "noValue"}
    if response.status_code != 200:
        return {"noValue": "noValue"}

    soup = bs4.BeautifulSoup(response.text, "html.parser")
    datalist = {}

    # Index: characters 4..7 of the page title.
    # NOTE(review): the slice offsets are specific to the target site's
    # title layout -- confirm against a real page.
    title = soup.title.string if soup.title is not None else None
    if title:
        datalist['index'] = title[4:8]

    # Content: the description meta tag (the last one wins if repeated).
    for meta in soup.select('meta'):
        if meta.get('name') == 'description':
            content = meta.get('content')
            if content is not None:
                datalist['content'] = content

    # Image: the second <img> on the page, when there is one.
    images = soup.find_all('img')
    if len(images) > 1 and images[1].get('src'):
        datalist['img'] = images[1]['src']

    return datalist


if __name__ == '__main__':
    urls = geturls(1314)
    # The context manager releases the worker processes even if map() raises.
    with Pool(processes=10) as pool:
        start = time.time()
        datalist = pool.map(getdata, urls)
        end = time.time()
    print('use:%.2f s' % (end - start))

    # Values are kept as text (not UTF-8 bytes) so json can serialize them;
    # ensure_ascii=False emits non-ASCII characters verbatim, hence the
    # explicit file encoding.
    jsondata = json.dumps({'data': datalist}, ensure_ascii=False)
    with open('data.txt', 'w', encoding='utf-8') as outfile:
        outfile.write(jsondata)

获取图片

解析第一个脚本得到的格式化内容(data.txt),通过 Pool 并发地去下载图片

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Download every image listed in ``data.txt`` into the ``image/`` directory.

``data.txt`` is expected to hold a JSON document of the form
``{"data": [{..., "img": "<url>"}, ...]}``; records without an ``img`` key
are ignored.
"""
import json
import os
import posixpath
import sys
from multiprocessing import Pool
from urllib.parse import urlparse

import requests

# Directory the images are saved into (created on demand).
IMAGE_DIR = 'image'
# Seconds to wait on the network before giving up on one image.
REQUEST_TIMEOUT = 10
# Number of bytes written to disk per chunk while streaming a download.
CHUNK_SIZE = 64 * 1024


def downImge(imgurl):
    """Download *imgurl* into ``image/`` under the last segment of its path.

    The image is skipped when the URL has no file name, when the server does
    not answer with status 200, or when the request fails; request failures
    are reported on stderr so that one bad URL does not abort the whole
    batch.  Returns ``None``.
    """
    # Take the name from the URL path only, so a query string such as
    # "?size=large" does not end up in the file name.
    file_name = posixpath.basename(urlparse(imgurl).path)
    if not file_name:
        return

    # exist_ok makes this safe when several workers create it at once.
    os.makedirs(IMAGE_DIR, exist_ok=True)
    try:
        # stream=True defers the body download; the with-block guarantees
        # the connection is released afterwards.
        with requests.get(imgurl, stream=True,
                          timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                return
            # Image data is binary: it must be written in 'wb' mode.
            with open(os.path.join(IMAGE_DIR, file_name), 'wb') as f:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)
    except requests.RequestException as err:
        print('skip %s: %s' % (imgurl, err), file=sys.stderr)


if __name__ == "__main__":
    with open('data.txt', 'r', encoding='utf-8') as f:
        datalist = json.load(f)

    # Pages that could not be scraped carry no 'img' key.
    imglist = [item['img'] for item in datalist['data'] if 'img' in item]

    # The context manager releases the worker processes even if map() raises.
    with Pool(10) as pool:
        pool.map(downImge, imglist)

0 0
原创粉丝点击