python: Crawler Series - 02


Continuing from the previous post, python: Crawler Series - 01, this post implements the idea sketched there: starting from a single entry URL, crawl every link on that page, and then every link inside those linked pages.

  • Based on Python 3

Topics covered: HTTP requests / data encoding / string handling / iteration / recursion

Known issues:

  • The implementation is not very polished: the crawled links are not saved locally, only printed to the console.
  • Some links cannot be accessed and come back with 403 or other error status codes.
  • Some pages are not utf-8 encoded, so decoding them as utf-8 fails (a possible fix is sketched after this list).
  • All of these exceptions are caught, and each request uses a timeout of timeout=10s.
  • Consequently, the crawl does not collect every link, only every link this crawler can actually reach.
  • Starting from a single entry there are a great many links, and the sub-links in turn contain many more, so after three hours of local testing the crawl still had not finished. There was also no way to verify whether pages were crawled more than once (an iterative rewrite that addresses this is sketched after url_collections.py below).
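The utf-8 failures could be reduced by honoring the charset the server declares instead of always assuming utf-8. Below is a minimal sketch of that idea; it is not part of the original crawler, the gbk fallback is my own guess for Chinese sites, and read_html is a hypothetical helper name:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib import request


def read_html(url, timeout=10):
    """Fetch a page and decode it with the charset declared in the
    response headers, falling back to utf-8 and then gbk (assumption)."""
    resp = request.urlopen(url, timeout=timeout)
    raw = resp.read()
    # the response headers expose the declared charset, if any
    charset = resp.headers.get_content_charset() or 'utf-8'
    for enc in (charset, 'utf-8', 'gbk'):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    # last resort: keep the page, replacing undecodable bytes
    return raw.decode('utf-8', errors='replace')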

  • The crawler implementation is fairly simple; the fetched data is not saved locally. It consists of just three files:

    1. common_var.py
    2. http_file.py
    3. url_collections.py
  • Among these, url_collections.py is the crawler's entry file; the crawl is also launched from there.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file   : common_var.py
# @author : cat
# @date   : 2017/6/25.

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}

if __name__ == '__main__':
    pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file   : http_file.py
# @author : cat
# @date   : 2017/6/24.

from urllib import request
import ssl
import re

from web.common_var import headers

# rough URL validator: scheme, domain/localhost/IP, optional port, optional path
regex = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

csdn = 'http://www.csdn.com'


def get_urls(url_in=csdn, key="href="):
    """
    Crawl all URLs contained in the page at the entry URL.
    :param url_in: entry URL
    :param key: 'href='
    :return: set of URLs
    """
    url_sets = set()
    ssl_context = ssl._create_unverified_context()
    req = request.Request(url_in, headers=headers)
    resp_bytes = ""
    try:
        # timeout: 10s
        resp_bytes = request.urlopen(req, timeout=10, context=ssl_context)
        if resp_bytes.getcode() != 200:
            # return an empty set rather than None so callers can union it
            return url_sets
        for line in resp_bytes:
            line_html = ""
            try:
                line_html = line.decode('utf-8')
            except UnicodeDecodeError:
                print("can not decode utf8:   ", line_html)
            if key in line_html:
                # take the text between the quotes that follow 'href='
                index = line_html.index(key)
                split = line_html[index + len(key):].replace('"', "#").split('#')
                sub_url = split[1] if len(split) > 1 else None
                match = False
                try:
                    match = regex.search(sub_url)
                except TypeError:
                    print(sub_url, type(sub_url))
                if match:
                    url_sets.add(match.group())
    except Exception:
        print("urlopen error: ", resp_bytes)
    return url_sets


if __name__ == '__main__':
    print(list(get_urls("http://news.baidu.com/?tn=news")))
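Extracting href attributes by splitting each line on quote characters is fragile: tags can span multiple lines and attributes may use single quotes. As an alternative to the string splitting above, here is a sketch using the standard-library html.parser (HrefCollector is a name I made up; it is not part of the crawler):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from html.parser import HTMLParser


class HrefCollector(HTMLParser):
    """Collect the href attribute of every <a> tag fed to the parser."""

    def __init__(self):
        super().__init__()
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.add(value)


# usage: feed the whole decoded page at once instead of line by line
# collector = HrefCollector()
# collector.feed(html_text)
# print(collector.hrefs)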
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file   : url_collections.py
# @author : cat
# @date   : 2017/6/25.

from web.http_file import get_urls
from web.http_file import csdn


class UrlCollection:
    def __init__(self, spidered_urls=None, collected_urls=None):
        # avoid mutable default arguments: a shared default set would
        # leak state between instances
        self.spidered_urls = spidered_urls if spidered_urls is not None else set()
        self.collected_urls = collected_urls if collected_urls is not None else set()

    def collect(self, url_in):
        # only crawl URLs we have not visited yet
        if url_in not in self.spidered_urls:
            urls = get_urls(url_in)
            self.spidered_urls.add(url_in)
            self.collected_urls = self.collected_urls | urls
            print(url_in, len(self.collected_urls))
            for u in urls:
                self.collect(u)


if __name__ == '__main__':
    spider = UrlCollection()
    spider.collect(csdn)
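Because collect() recurses once per discovered link, a deep chain of pages can hit Python's default recursion limit (about 1000 frames). An iterative breadth-first variant with an explicit queue avoids that and makes duplicate detection easy to verify. A sketch, reusing get_urls from above (collect_iterative and max_pages are my own names, not from the original code):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import deque

from web.http_file import get_urls


def collect_iterative(entry_url, max_pages=100):
    """Breadth-first crawl: an explicit queue replaces the recursion, and
    the visited set guarantees each URL is fetched at most once."""
    visited = set()
    collected = set()
    queue = deque([entry_url])
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        urls = get_urls(url)
        collected |= urls
        queue.extend(u for u in urls if u not in visited)
        print(url, len(collected))
    return collected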

The output looks like this (the crawl was not finished):

http://www.csdn.com 307
http://g.csdn.net/5272869 381
http://hardware.csdn.net/themes/zone/hardware/css/01mod-nav.css 381
http://huiyi.csdn.net/activity/product/goods_list?project_id=1628 392
can not decode utf8:
http://csdnimg.cn/public/favicon.ico 392
···  # much more output follows; too long to copy here in full

The original motivation for this exercise was simply to get some hands-on practice with Python.