爬虫笔记（9/27）------常用爬虫（图片，链接，内容）

来源：互联网　发布：linux查询端口命令　编辑：程序博客网　时间：2024/06/06 14:13

1.图片爬虫

1）建立爬取图片的自定义函数

2）通过for循环将该分类下的所有网页都爬取一遍

import os
import re
import urllib.error
import urllib.request

# Directory the notes originally saved the pictures into.
DEFAULT_SAVE_DIR = "C:/Users/alibaba/Desktop/jupyter/code/picturecrawler/jdphoto/"

# Raw strings keep the regex escapes (e.g. ``\.``) out of the string-literal
# escape machinery.  re.S lets ``.+?`` span the line breaks of the decoded page.
PRODUCT_LIST_RE = re.compile(
    r'<div id="plist".+?<div class="page clearfix">', re.S
)
IMAGE_RE = re.compile(
    r'<img width="220" height="220" data-img="1" data-lazy-img="//(.+?\.jpg)">'
)


def craw(url, page, save_dir=DEFAULT_SAVE_DIR):
    """Download every product picture found on one listing page.

    Args:
        url: Address of the listing page to fetch.
        page: Page number; used as the first part of each saved file name.
        save_dir: Directory the pictures are written to.  It is created on
            demand when at least one picture is found.

    Returns:
        The number of pictures that were downloaded successfully.  A page
        without the product-list container yields 0 instead of raising.

    Raises:
        urllib.error.URLError: If the listing page itself cannot be fetched.
    """
    with urllib.request.urlopen(url) as response:
        # The markup being matched is plain ASCII, so undecodable bytes can be
        # replaced without affecting the patterns.
        html = response.read().decode("utf-8", errors="replace")

    product_list = PRODUCT_LIST_RE.search(html)
    if product_list is None:
        return 0

    image_paths = IMAGE_RE.findall(product_list.group(0))
    if image_paths:
        os.makedirs(save_dir, exist_ok=True)

    saved = 0
    for index, image_path in enumerate(image_paths, start=1):
        # The separator keeps page 1 / image 11 distinct from page 11 / image 1.
        imagename = os.path.join(save_dir, "{}_{}.jpg".format(page, index))
        imageurl = "http://" + image_path
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            # Report the failure and carry on with the next picture.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue
        saved += 1
    return saved


if __name__ == "__main__":
    # Crawl every listing page of the category.
    for i in range(1, 79):
        url = "http://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
        try:
            craw(url, i)
        except urllib.error.URLError as e:
            # One unreachable page should not abort the remaining pages.
            print(e)

2.链接爬取

1）确定好爬取的入口链接

2）构建正则表达式

3）模拟浏览器对网页进行爬取

4）提取需要的网页链接(re.compile(pat).findall(data))

5）过滤重复的网页(list(set(link)))

import re
import urllib.request

# Raw string so that ``\s``, ``\.`` and ``\w`` reach the regex engine untouched.
# The pattern has two groups, so findall() yields (full_link, inner_group) tuples.
LINK_RE = re.compile(r'(https?://[^\s)";]+\.(\w|)*)')


def getlink(url):
    """Collect the distinct links that appear in the page at *url*.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of tuples as produced by ``re.findall`` for the two-group link
        pattern; element 0 of each tuple is the full link.  Duplicates are
        removed and the first-seen order is kept.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Pretend to be a browser.
    headers = ("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally so urlopen() sends the header.
    urllib.request.install_opener(opener)

    with urllib.request.urlopen(url) as response:
        # Decode the body instead of taking str() of the bytes: the bytes repr
        # turns line breaks into literal "\n" text that would stick to links.
        charset = response.headers.get_content_charset() or "utf-8"
        data = response.read().decode(charset, errors="replace")

    # dict.fromkeys() drops duplicates while preserving order.
    return list(dict.fromkeys(LINK_RE.findall(data)))


if __name__ == "__main__":
    # Entry page to crawl.
    url = "http://blog.csdn.net/"
    # Fetch the links contained in that page.
    linklist = getlink(url)
    # Print every collected link.
    for link in linklist:
        print(link[0])

3.内容爬虫

1）看网址规律，构造网址变量，for循环实现多页内容爬取

2）构造函数getcontent，用于提取用户和内容：

·模拟成浏览器

·正则表达式

·匹配内容，爬虫

·for循环输出内容

3）for循环网页，每次调用getcontent函数

import re
import urllib.error
import urllib.request

# Pattern that extracts the user names.
USER_RE = re.compile(r'target="_blank" title="(.*?)">', re.S)
# Pattern that extracts the joke bodies.
CONTENT_RE = re.compile(r'<div class="content">(.*?)</div>', re.S)


def getcontent(url, page):
    """Fetch one page and print each user together with that user's content.

    Users and contents are paired by position.  When the two lists differ in
    length the surplus entries are ignored rather than raising.

    Args:
        url: Address of the page to fetch.
        page: Page number; printed as a prefix of each user's index.

    Returns:
        A list of ``(user, content)`` tuples in page order, with line breaks
        removed from the content.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Pretend to be a browser.
    headers = ("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally so urlopen() sends the header.
    urllib.request.install_opener(opener)

    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')

    userlist = USER_RE.findall(data)
    contentlist = [content.replace("\n", "") for content in CONTENT_RE.findall(data)]

    # Pair the lists directly.  Creating numbered variables through exec()
    # relies on locals() keeping exec-created names between calls, which is
    # not guaranteed inside a function.
    pairs = list(zip(userlist, contentlist))
    for index, (user, content) in enumerate(pairs, start=1):
        print("用户"+str(page)+str(index)+"是"+user)
        print("内容是：")
        print(content)
        print("\n")
    return pairs


if __name__ == "__main__":
    # Fetch the jokes page by page.
    for i in range(1, 30):
        url = "http://www.qiushibaike.com/"+str(i)
        try:
            getcontent(url, i)
        except urllib.error.URLError as e:
            # One unreachable page should not abort the remaining pages.
            print(e)

4.微信爬虫

1）自定义函数：

·用代理服务器爬内容

·获取多个页面的文章链接

·爬取标题和内容

2）异常处理

import re
import time
import urllib.error
import urllib.parse
import urllib.request

# Pretend to be a browser.
headers = ("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener globally.
urllib.request.install_opener(opener)

# Article links collected so far: one inner list per crawled result page.
listurl = []

# Pattern for the article links on a search-result page.
LISTURL_RE = re.compile(r'<div class="txt-box">.*?(http://.*?)"', re.S)
# Pattern for an article's title.
TITLE_RE = re.compile(r"<title>(.*?)</title>")
# Pattern for an article's body.
ARTICLE_RE = re.compile(r'id="js_content">(.*?)id="js_sg_bar"', re.S)

# Text stored when a title or body cannot be found.
NOT_FOUND = "此次没有获取到"


def use_proxy(proxy_addr, url):
    """Fetch *url* through an HTTP proxy and return the body as text.

    Args:
        proxy_addr: Proxy address in ``host:port`` form, used for http URLs.
        url: Address to fetch.

    Returns:
        The UTF-8 decoded body, or None when the request fails.  Failures are
        printed and followed by a pause (10 s for URLError, 1 s otherwise).
    """
    try:
        proxy = urllib.request.ProxyHandler({'http':proxy_addr})
        proxy_opener = urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
        # Keep the browser header; a freshly built opener would not send it.
        proxy_opener.addheaders = [headers]
        urllib.request.install_opener(proxy_opener)
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)
        # Back off for 10 seconds after a URLError.
        time.sleep(10)
    except Exception as e:
        print("exception:"+str(e))
        # Back off for 1 second after any other error.
        time.sleep(1)
    return None


def getlisturl(key,pagestart,pageend,proxy):
    """Collect the article links of result pages *pagestart*..*pageend*.

    Args:
        key: Search keyword.
        pagestart: First result page (inclusive).
        pageend: Last result page (inclusive).
        proxy: Proxy address handed to use_proxy().

    Returns:
        The module-level ``listurl`` list, extended by one inner list of links
        per requested page.  A page that could not be fetched contributes an
        empty list, so positions still correspond to pages.
    """
    # Percent-encode the keyword only; "&page=" has to stay literal so the
    # server sees it as a separate query parameter.
    keycode = urllib.parse.quote(key)
    for page in range(pagestart,pageend+1):
        url = "http://weixin.sogou.com/weixin?type=2&query="+keycode+"&page="+str(page)
        # Go through the proxy to avoid the crawler's own IP being blocked.
        data1 = use_proxy(proxy,url)
        if data1 is None:
            # use_proxy() has already reported the failure.
            listurl.append([])
            continue
        listurl.append(LISTURL_RE.findall(data1))
    print("共获取到"+str(len(listurl))+"页")  # handy while debugging
    return listurl


def getcontent(listurl,proxy,outfile="自己保存的位置/1.html"):
    """Download every article in *listurl* and save them into one HTML file.

    Args:
        listurl: Two-level list of article links; the first index is the
            result page, the second the article within that page.
        proxy: Proxy address handed to use_proxy().
        outfile: Path of the HTML file to write; it is overwritten.

    Articles that cannot be fetched are skipped.  A missing title or body is
    replaced by a placeholder text.
    """
    # Opening part of the generated HTML document.
    html1='''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>微信文章页面</title></head><body>'''
    # Closing part of the generated HTML document.
    html2 = '''</body></html>'''
    with open(outfile,"wb") as fh:
        fh.write(html1.encode("utf-8"))
        for i, page_links in enumerate(listurl):
            for j, link in enumerate(page_links):
                # The scraped links carry HTML-escaped ampersands.
                url = link.replace("&amp;","&")
                data = use_proxy(proxy,url)
                if data is None:
                    # use_proxy() has already reported the failure.
                    continue
                title = TITLE_RE.findall(data)
                content = ARTICLE_RE.findall(data)
                thistitle = title[0] if title else NOT_FOUND
                thiscontent = content[0] if content else NOT_FOUND
                dataall = "<p>标题为："+thistitle+"</p><p>内容为："+thiscontent+"</p><br>"
                fh.write(dataall.encode('utf-8'))
                print("第"+str(i)+"个网页第"+str(j)+"次处理")  # handy while debugging
        fh.write(html2.encode("utf-8"))


if __name__ == "__main__":
    # Search keyword.
    key =  "物联网"
    # Proxy server; it may have stopped working and need replacing.
    proxy = "119.6.136.122:80"
    # A second proxy could be used for getcontent(); not enabled here.
    proxy2 = ""
    # First result page.
    pagestart=1
    # Last result page.
    pageend=2
    listurl = getlisturl(key,pagestart,pageend,proxy)
    getcontent(listurl,proxy)

阅读全文

0 0