网络爬虫的简易实现(1)

来源:互联网 发布:鬼吹灯和盗墓笔记 知乎 编辑:程序博客网 时间:2024/05/16 05:41

这个爬虫主要实现对http://pic.yesky.com这个网站图片的爬取。

# -*- coding: utf-8 -*-
"""Simple image crawler: fetch http://pic.yesky.com and save every
absolute-URL <img> it finds to a local directory (Python 2, urllib2)."""
import urllib
import urllib2
import time

from bs4 import BeautifulSoup

BASE_URL = "http://pic.yesky.com"
SAVE_DIR = "/home/lxt/Desktop/pach/4399Pic"

# Spoofed request headers so the site serves us a normal browser page.
SEND_HEADERS = {
    # FIX: the Host header must be the bare host name; the original code
    # used the full URL including the "http://" scheme, which is invalid.
    "Host": "pic.yesky.com",
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/50.0.2661.102 "
                   "UBrowser/6.1.3228.1 Safari/537.36"),
    "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/webp,*/*;q=0.8"),
    "Connection": "keep-alive",
}


def fetch_html(url, headers=None):
    """Fetch *url* with the spoofed headers and return the raw HTML bytes.

    :param url: page to download.
    :param headers: optional header dict; defaults to SEND_HEADERS.
    :raises urllib2.URLError: on network failure.
    """
    req = urllib2.Request(url, headers=headers or SEND_HEADERS)
    return urllib2.urlopen(req).read()


def _extension_of(src):
    """Return the extension of an image URL, dot included (e.g. '.jpg').

    Falls back to '.jpg' when the URL contains no dot (the original code
    would silently reuse the previous iteration's extension instead).
    """
    dot = src.rfind('.')
    return src[dot:dot + 4] if dot != -1 else '.jpg'


def download_images(html, save_dir=SAVE_DIR, delay=1):
    """Parse *html* and download every absolute-URL <img> into *save_dir*.

    :param html: HTML bytes/string to scan for <img> tags.
    :param save_dir: target directory (must already exist).
    :param delay: seconds to sleep between downloads (politeness).
    :returns: number of images saved.
    """
    # Explicit parser: BeautifulSoup(html) alone warns and picks whatever
    # parser happens to be installed, making results environment-dependent.
    soup = BeautifulSoup(html, "html.parser")
    count = 0
    for img in soup.select("img"):
        src = img.get('src')  # .get: some <img> tags have no src attribute
        # Only absolute http(s) URLs are downloadable; the original tested
        # src.find("htt") != -1, which matches "htt" anywhere in the string.
        if not src or not src.startswith("http"):
            continue
        count += 1
        ext = _extension_of(src)
        # Save as 1.jpg, 2.png, ... in the target directory.
        urllib.urlretrieve(src, filename="%s/%d%s" % (save_dir, count, ext))
        time.sleep(delay)
    return count


if __name__ == "__main__":
    download_images(fetch_html(BASE_URL))