Web Crawlers --- 1. Introduction


1. Check the robots.txt file

Some websites define a robots.txt file so that crawlers know what restrictions apply when crawling the site.

Enter http://www.csdn.net/robots.txt in a browser to view the contents of robots.txt:

User-agent: *
Disallow: /scripts
Disallow: /public
Disallow: /css/
Disallow: /images/
Disallow: /content/
Disallow: /ui/
Disallow: /js/
Disallow: /scripts/
Disallow: /article_preview.html*
Sitemap: http://www.csdn.net/article/sitemap.txt

This robots.txt says that no user agent may crawl the paths listed in the Disallow rules; crawling them anyway may, for example, get your IP blocked for a few minutes.

The last line defines the Sitemap file.
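To fetch the same file from code rather than a browser, a minimal sketch using urllib (with the same CSDN URL used above) could look like this:

# Minimal sketch: fetch and print a site's robots.txt with urllib.
from urllib import request

robots_url = "http://www.csdn.net/robots.txt"
with request.urlopen(robots_url) as resp:
    print(resp.read().decode("utf-8"))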


2. Check the Sitemap file

Enter the sitemap address above in a browser to view the site map; it is just a list of URLs. Note that this file may be incomplete or not kept up to date.
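A rough way to collect those URLs from code, assuming the sitemap really is a plain-text list of URLs as it appeared in the browser (get_sitemap_urls is just an illustrative helper name):

# Sketch: download the sitemap and keep every line that looks like a URL.
from urllib import request

def get_sitemap_urls(sitemap_url):
    data = request.urlopen(sitemap_url).read().decode("utf-8")
    return [line.strip() for line in data.splitlines() if line.strip().startswith("http")]

urls = get_sitemap_urls("http://www.csdn.net/article/sitemap.txt")
print(len(urls), urls[:5])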


3. Identify the technology a website uses

A tool for checking what kind of technology a website is built with: builtwith.

Installation:

pip install builtwith


Check the technology behind a few sites:

>>> import builtwith
>>> builtwith.parse("http://www.csdn.net/")
{'programming-languages': ['Lua'], 'web-servers': ['OpenResty', 'Nginx'], 'javascript-frameworks': ['jQuery']}
>>> builtwith.parse("http://news.baidu.com/")
{'javascript-frameworks': ['RequireJS', 'jQuery UI', 'jQuery'], 'javascript-graphics': ['D3'], 'web-servers': ['Apache']}
>>> builtwith.parse("https://www.baidu.com/")
{}

4. Find the owner of a website

The WHOIS tool

Installation:

pip install python-whois

>>> import whois
>>> print(whois.whois("csdn.net"))
{
  "status": "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
  "registrar": "NETWORK SOLUTIONS, LLC.",
  "address": "B3-2-1 ZHaowei Industry Park",
  "emails": [
    "abuse@web.com",
    "Jiangtao@CSDN.NET"
  ],
  "state": "Beijing",
  "whois_server": "whois.networksolutions.com",
  "domain_name": "CSDN.NET",
  "org": "Beijing Chuangxin Lezhi Co.ltd",
  "updated_date": [
    "2017-03-10 00:00:00",
    "2017-03-10 00:53:11"
  ],
  "country": "CN",
  "city": "Beijng",
  "name": "Beijing Chuangxin Lezhi Co.ltd",
  "expiration_date": [
    "2020-03-11 00:00:00",
    "2020-03-11 04:00:00"
  ],
  "zipcode": "100016",
  "creation_date": [
    "1999-03-11 00:00:00",
    "1999-03-11 05:00:00"
  ],
  "dnssec": "Unsigned",
  "name_servers": [
    "NS3.DNSV3.COM",
    "NS4.DNSV3.COM"
  ],
  "referral_url": "http://networksolutions.com"
}
>>>

5. Download a web page

# coding=utf-8
import urllib
from urllib import request

def download(url):
    print("Downloading: ", url)
    try:
        html = request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
    return html

url1 = "https://www.baidu1111.com/"   # non-existent domain, triggers the error branch
print(download(url1))
url2 = "https://www.baidu.com/"
print(download(url2))

Retrying downloads

5xx errors are server-side problems, so the download should be retried; by default we retry twice.

# coding=utf-8
import urllib
from urllib import request

def download(url, num_retries=2):
    print("Downloading: ", url)
    try:
        html = request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries-1)
    return html

print(download("http://httpstat.us/500"))


Setting the user agent

# coding=utf-8
import urllib
from urllib import request

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

print(download("http://httpstat.us/500"))


6. Link crawler

# coding=utf-8
import urllib
from urllib import request
import re

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

### print(download("http://httpstat.us/500"))

def link_crawler(seed_url, link_regex):
    '''Crawl links whose URL matches the regex'''
    crawl_queue = [seed_url]
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            print("link", link)
            if re.search(link_regex, link):  # re.match only matches from the start, so it would return None here
                crawl_queue.append(link)
                print("append", link)

def get_links(html):
    '''Return the list of links found in the page'''
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html.decode('utf-8'))

link_crawler('http://example.webscraping.com', '/index')

# 1. TypeError: cannot use a string pattern on a bytes-like object
#    Fix: in Python 3, add html.decode('utf-8') as done above.

Converting to absolute URLs

# coding=utf-8
import urllib
from urllib import request
import re

def download(url, user_agent='wswp', num_retries=2):
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries-1)
    return html

### print(download("http://httpstat.us/500"))

from urllib import parse

def link_crawler(seed_url, link_regex):
    '''Crawl links whose URL matches the regex'''
    crawl_queue = [seed_url]
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):  # re.match only matches from the start, so it would return None here
                link = parse.urljoin(seed_url, link)
                crawl_queue.append(link)
                print("append", link)

def get_links(html):
    '''Return the list of links found in the page'''
    if html is None:
        return []
    else:
        webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        return webpage_regex.findall(html.decode('utf-8'))

link_crawler('http://example.webscraping.com', '/index')

# 1. TypeError: cannot use a string pattern on a bytes-like object
#    Fix: in Python 3, add html.decode('utf-8') as done above.
Use urllib.parse.urljoin(seed_url, link) to build the absolute URL.
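A couple of interpreter calls illustrating what urljoin produces (the paths follow the example.webscraping.com link format used above):

>>> from urllib import parse
>>> parse.urljoin('http://example.webscraping.com', '/index/1')
'http://example.webscraping.com/index/1'
>>> parse.urljoin('http://example.webscraping.com/index/1', '/view/Afghanistan-1')
'http://example.webscraping.com/view/Afghanistan-1'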

De-duplication

def link_crawler(seed_url, link_regex):
    '''Crawl links whose URL matches the regex'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        print(crawl_queue)
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.search(link_regex, link):  # re.match only matches from the start, so it would return None here
                link = parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

Avoiding URLs disallowed by robots.txt

import urllib.parse
from urllib import robotparser

parser = robotparser.RobotFileParser()
url = "http://www.csdn.net/"
parser.set_url(urllib.parse.urljoin(url, "robots.txt"))
parser.read()

PATHS = {
    '/',
    '/scripts/',
    'content/',
    '/js',
    '/lock',
    }

for path in PATHS:
    url1 = urllib.parse.urljoin(url, path)
    print("%6s: %s" % (parser.can_fetch('PyMOTW', url1), url1))


robotparser parses the robots.txt file, and the parser.can_fetch() method tells us whether the given User-agent is allowed to fetch a particular URL.
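Combining this with the link crawler from section 6, a sketch that checks robots.txt before each download (it assumes the download() and get_links() helpers defined earlier; get_robots is our own helper name):

# Sketch: skip URLs that robots.txt disallows for our user agent.
import re
from urllib import parse, robotparser

def get_robots(seed_url):
    rp = robotparser.RobotFileParser()
    rp.set_url(parse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp

def link_crawler(seed_url, link_regex, user_agent='wswp'):
    rp = get_robots(seed_url)
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            print("Blocked by robots.txt:", url)
            continue
        html = download(url, user_agent=user_agent)
        for link in get_links(html):
            if re.search(link_regex, link):
                link = parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)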


Setting a proxy

This snippet has not been verified; verify it when it is actually needed.

import urllib.request
import urllib.parse

url = "https://www.baidu.com"
headers = {'User-agent': "user_agent"}
reqst = urllib.request.Request(url, headers=headers)

proxy = ''   # proxy address goes here, e.g. "host:port"
opener = urllib.request.build_opener()
proxy_names = {urllib.parse.urlparse(url).scheme: proxy}
opener.add_handler(urllib.request.ProxyHandler(proxy_names))
response = opener.open(reqst)

Download throttling

import datetime
import time
import urllib.parse

class Throttle:
    '''Enforce a minimum delay between downloads to the same domain'''
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}   # timestamp of the last access to each domain

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            # .seconds is an attribute of timedelta, not a method
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()

Usage:

throttle = Throttle(delay)
......
throttle.wait(url)
html = download(url, headers, proxy=proxy, num_retries=num_retries)

Some of the attributes returned by urllib.parse.urlparse(url):

>>> from urllib.parse import urlparse
>>> o = urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
>>> o
ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
>>> o.port
80
>>> o.geturl()
'http://www.cwi.nl:80/%7Eguido/Python.html'
>>> o.netloc
'www.cwi.nl:80'
>>> o.hostname
'www.cwi.nl'
Timing with datetime:

>>> import datetime
>>> datetime.datetime
<class 'datetime.datetime'>
>>> datetime.datetime.now
<built-in method now of type object at 0x0000000065C3C3E0>
>>> datetime.datetime.now()
datetime.datetime(2017, 6, 30, 16, 12, 16, 559991)
>>> last_accessed = datetime.datetime.now()
>>> datetime.datetime.now() - last_accessed
datetime.timedelta(0, 15, 470000)
>>> (datetime.datetime.now() - last_accessed).seconds
39

Limiting crawl depth

To avoid crawling without end, set a maximum depth.

seen = {seed_url: 0}   # the seed starts at depth 0
...
depth = seen[url]
if depth != max_depth:
    for link in links:
        if link not in seen:
            seen[link] = depth + 1
            crawl_queue.append(link)
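Folded into the de-duplicated crawler from earlier, a depth limit might look like the sketch below (again assuming the download() and get_links() helpers defined above; max_depth is the new parameter):

# Sketch: record each URL's depth in `seen` and stop expanding past max_depth.
import re
from urllib import parse

def link_crawler(seed_url, link_regex, max_depth=2):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}   # url -> depth at which it was discovered
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                if re.search(link_regex, link):
                    link = parse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)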


