Hand-written Python crawlers


1. Image crawlers

(1) Scraping JD phone images

import re
import urllib.request
import urllib.error

def craw(url, page):
    # Fetch the listing page and treat it as a plain string
    html1 = urllib.request.urlopen(url).read()
    html1 = str(html1)
    # Narrow the search down to the product-list block
    pat1 = '<div id="plist".+? <div class="page clearfix">'
    result1 = re.compile(pat1).findall(html1)
    result1 = result1[0]
    # Pull the lazy-loaded image URLs out of that block
    pat2 = r'<img width="220" height="220" data-img="1" data-lazy-img="//(.+?\.jpg)">'
    imagelist = re.compile(pat2).findall(result1)
    x = 1
    for imageurl in imagelist:
        imagename = "E:/picture/" + str(page) + str(x) + ".jpg"
        imageurl = "http://" + imageurl
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            # Report the failure; the index is advanced once below either way
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        x += 1
        print(x)

for i in range(1, 79):
    url = "http://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
    craw(url, i)
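The script above hands each image URL to urllib.request.urlretrieve. If the requests library (already used in the next example) is preferred, a minimal download helper could look like the sketch below; save_image and its timeout parameter are illustrative names, not part of the original code.

import requests

def save_image(imageurl, imagename, timeout=10):
    # Fetch one image with an explicit timeout and write it to disk in chunks.
    # imageurl/imagename are expected to follow the same naming scheme as craw() above.
    resp = requests.get(imageurl, timeout=timeout, stream=True)
    resp.raise_for_status()
    with open(imagename, "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)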

(2) Scraping girl photos from zjito.com (reference example)

## http://www.zjito.com/
# Girl-photo crawler: bs4 + re + gevent concurrent downloads
import requests
from bs4 import BeautifulSoup
import urllib.request
import gevent
from gevent import Greenlet
import socket
import random

def cbk(a, b, c):
    '''Progress callback for urlretrieve.
    @a: number of blocks downloaded so far
    @b: block size
    @c: total size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)

def photo_download(photo_thread, index_number, photo_number, number):
    while number < 3564:
        try:
            i = 0
            number = number + 1
            # Target page of the crawl
            url = 'http://www.zjito.com/dqfl/' + dict[i] + '/' + str(index_number) + '.shtml?idx=1'
            headers = {'user-agent': 'my-app/0.0.1'}
            # Fetch the target page
            r = requests.get(url, headers=headers)
            print(r.status_code)
            print(url)
            while r.status_code == 404:
                # On 404, try the next category until one responds
                i = i + 1
                url = 'http://www.zjito.com/dqfl/' + dict[i] + '/' + str(index_number) + '.shtml?idx=1'
                r = requests.get(url, headers=headers)
                print(url)
            else:
                # Parse the returned page with BeautifulSoup
                soup = BeautifulSoup(r.text, 'html.parser')
                # print(soup.prettify())  # uncomment to inspect the page markup
                for link in soup.find_all(class_="div-num"):
                    # Print the image address
                    print(link.get('data-src'))
                    # Set a download timeout
                    socket.setdefaulttimeout(3.0)
                    photo_number = photo_number + 1
                    # Download the image and report progress via cbk
                    urllib.request.urlretrieve(link.get('data-src'),
                                               file + '/' + str(photo_thread) + '_' + str(photo_number) + '.jpg',
                                               cbk)
                    gevent.sleep(random.randint(0, 2) * 0.001)
        except Exception as e:
            # Skip a problematic page
            index_number = index_number + 1
        index_number = index_number + 1

if __name__ == '__main__':
    # Photo categories
    dict = ['zgnd', 'tw', 'xg', 'rb', 'hg', 'mlxy', 'tg', 'om', 'hx', ]
    # Worker IDs used in the saved file names
    photo_thread = [1, 2]
    # Downloaded-image counter
    photo_number = -1
    # Page counter: minimum 530273, maximum 544527
    # index_number = 530273
    # Directory where images are saved
    file = '../photo/'
    # Spawn a Greenlet that runs photo_download with the given arguments
    thread1 = Greenlet.spawn(photo_download, photo_thread[0], 530273, photo_number, 0)
    thread2 = gevent.spawn(photo_download, photo_thread[1], 533836, photo_number, 0)
    # The two greenlets split the 530273-537400 page range roughly in half:
    # 537400 - 530273 = 7127, 7127 / 2 ≈ 3564, so the second greenlet starts at 533836
    threads = [thread1, thread2]
    # Block until all greenlets have finished
    gevent.joinall(threads)
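The zjito script leans on gevent's spawn/joinall pattern. For readers new to gevent, here is a minimal, self-contained sketch of that pattern; the fetch function and the example.com/example.org URLs are placeholders, not part of the original script. Note that monkey.patch_all() is what makes the blocking urlopen calls cooperative.

import gevent
from gevent import monkey
monkey.patch_all()  # make blocking socket calls yield to other greenlets

import urllib.request

def fetch(name, url):
    # Each greenlet fetches one URL; urlopen yields while waiting on the network.
    data = urllib.request.urlopen(url).read()
    print(name, len(data))

jobs = [
    gevent.spawn(fetch, "job1", "http://www.example.com/"),
    gevent.spawn(fetch, "job2", "http://www.example.org/"),
]
gevent.joinall(jobs)  # block until every greenlet has finished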

2. Link crawler

(1) Crawling CSDN links

import re
import urllib.request

def getlink(url):
    # Pretend to be a browser
    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url)
    data = str(file.read())
    # Regular expression that matches the links we want
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    link = re.compile(pat).findall(data)
    # Remove duplicate entries
    link = list(set(link))
    return link

# Page to crawl
url = "http://blog.csdn.net/"
# Collect every link address contained in that page
linklist = getlink(url)
# Print each link that was found
for link in linklist:
    print(link[0])
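The regular expression above only captures absolute http(s) URLs, so relative href values on the page are skipped. One possible extension is sketched below; the getalllinks helper is hypothetical, not from the original, and simply collects every href attribute and resolves relative paths with urllib.parse.urljoin.

import re
import urllib.parse
import urllib.request

def getalllinks(url):
    # Pull every href attribute and resolve relative paths against the page URL
    data = urllib.request.urlopen(url).read().decode("utf-8", errors="ignore")
    hrefs = re.findall(r'href="([^"]+)"', data)
    return list({urllib.parse.urljoin(url, h) for h in hrefs})

for link in getalllinks("http://blog.csdn.net/"):
    print(link)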


3. Qiushibaike (joke site) crawler

import urllib.request
import re

def getcontent(url, page):
    # Pretend to be a browser
    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8")
    # Regular expression that extracts the user names
    userpat = 'target="_blank" title="(.*?)">'
    # Regular expression that extracts the joke content
    contentpat = '<div class="content">(.*?)</div>'
    # Find all users
    userlist = re.compile(userpat, re.S).findall(data)
    # Find all joke contents
    contentlist = re.compile(contentpat, re.S).findall(data)
    x = 1
    # Assign each piece of content to a variable named content1, content2, ...
    for content in contentlist:
        content = content.replace("\n", "")
        # Build the variable name as a string
        name = "content" + str(x)
        # Use exec() to assign the value to that string-named variable
        exec(name + '=content')
        x += 1
    y = 1
    # Walk the user list and print the content that belongs to each user
    for user in userlist:
        name = "content" + str(y)
        print("User " + str(page) + str(y) + " is: " + user)
        print("Content:")
        exec("print(" + name + ")")
        print("\n")
        y += 1

# Fetch the jokes page by page
for i in range(1, 30):
    url = "http://www.qiushibaike.com/8hr/page/" + str(i)
    getcontent(url, i)
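The exec() calls above create variables named content1, content2, ... at run time just so each user can be matched with a joke. If the only goal is that pairing, a simpler sketch pairs the two lists with zip(); the sample userlist/contentlist/page values below are stand-ins, since the real data comes from the regexes inside getcontent.

# Minimal sketch: pair users with contents via zip() instead of exec()-built names
userlist = ["userA", "userB"]            # stands in for the regex results above
contentlist = ["joke one\n", "joke two\n"]
page = 1

for y, (user, content) in enumerate(zip(userlist, contentlist), start=1):
    content = content.replace("\n", "")
    print("User " + str(page) + str(y) + " is: " + user)
    print("Content:")
    print(content)
    print()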



