python: Crawler Series - 02


Continuing from the previous post, python: Crawler Series - 01, this post implements the idea sketched there: starting from a single entry URL, crawl every link on that page, and then every link inside those linked pages.

  • Based on Python 3

Topics covered: HTTP requests / data encoding / string handling / iteration / recursion

Known issues:

  • The implementation is not very polished: the crawled links are not saved locally, only printed to the console.
  • Some links cannot be accessed and come back with 403 or other error status codes.
  • Some pages are not utf-8 encoded, so decoding them as utf-8 fails (a possible fix is sketched after this list).
  • All of these exceptions are caught, and each request uses a timeout of timeout=10s.
  • Consequently, the crawl does not collect every link, only every link this crawler can actually reach.
  • Starting from a single entry there are a great many links, and the sub-links in turn contain many more, so after three hours of local testing the crawl still had not finished. There was also no way to verify whether pages were crawled more than once (an iterative rewrite that addresses this is sketched after url_collections.py below).
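The utf-8 failures could be reduced by honoring the charset the server declares instead of always assuming utf-8. Below is a minimal sketch of that idea; it is not part of the original crawler, the gbk fallback is my own guess for Chinese sites, and read_html is a hypothetical helper name:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib import request


def read_html(url, timeout=10):
    """Fetch a page and decode it with the charset declared in the
    response headers, falling back to utf-8 and then gbk (assumption)."""
    resp = request.urlopen(url, timeout=timeout)
    raw = resp.read()
    # the response headers expose the declared charset, if any
    charset = resp.headers.get_content_charset() or 'utf-8'
    for enc in (charset, 'utf-8', 'gbk'):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    # last resort: keep the page, replacing undecodable bytes
    return raw.decode('utf-8', errors='replace')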

  • The crawler implementation is fairly simple; the fetched data is not saved locally. It consists of just three files:

    1. common_var.py
    2. http_file.py
    3. url_collections.py
  • Among these, url_collections.py is the crawler's entry file; the crawl is also launched from there.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file   : common_var.py
# @author : cat
# @date   : 2017/6/25.

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}

if __name__ == '__main__':
    pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file   : http_file.py
# @author : cat
# @date   : 2017/6/24.

from urllib import request
import ssl
import re

from web.common_var import headers

# rough URL validator: scheme, domain/localhost/IP, optional port, optional path
regex = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)

csdn = 'http://www.csdn.com'


def get_urls(url_in=csdn, key="href="):
    """
    Crawl all URLs contained in the page at the entry URL.
    :param url_in: entry URL
    :param key: 'href='
    :return: set of URLs
    """
    url_sets = set()
    ssl_context = ssl._create_unverified_context()
    req = request.Request(url_in, headers=headers)
    resp_bytes = ""
    try:
        # timeout: 10s
        resp_bytes = request.urlopen(req, timeout=10, context=ssl_context)
        if resp_bytes.getcode() != 200:
            # return an empty set rather than None so callers can union it
            return url_sets
        for line in resp_bytes:
            line_html = ""
            try:
                line_html = line.decode('utf-8')
            except UnicodeDecodeError:
                print("can not decode utf8:   ", line_html)
            if key in line_html:
                # take the text between the quotes that follow 'href='
                index = line_html.index(key)
                split = line_html[index + len(key):].replace('"', "#").split('#')
                sub_url = split[1] if len(split) > 1 else None
                match = False
                try:
                    match = regex.search(sub_url)
                except TypeError:
                    print(sub_url, type(sub_url))
                if match:
                    url_sets.add(match.group())
    except Exception:
        print("urlopen error: ", resp_bytes)
    return url_sets


if __name__ == '__main__':
    print(list(get_urls("http://news.baidu.com/?tn=news")))
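Extracting href attributes by splitting each line on quote characters is fragile: tags can span multiple lines and attributes may use single quotes. As an alternative to the string splitting above, here is a sketch using the standard-library html.parser (HrefCollector is a name I made up; it is not part of the crawler):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from html.parser import HTMLParser


class HrefCollector(HTMLParser):
    """Collect the href attribute of every <a> tag fed to the parser."""

    def __init__(self):
        super().__init__()
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.add(value)


# usage: feed the whole decoded page at once instead of line by line
# collector = HrefCollector()
# collector.feed(html_text)
# print(collector.hrefs)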
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file   : url_collections.py
# @author : cat
# @date   : 2017/6/25.

from web.http_file import get_urls
from web.http_file import csdn


class UrlCollection:
    def __init__(self, spidered_urls=None, collected_urls=None):
        # avoid mutable default arguments: a shared default set would
        # leak state between instances
        self.spidered_urls = spidered_urls if spidered_urls is not None else set()
        self.collected_urls = collected_urls if collected_urls is not None else set()

    def collect(self, url_in):
        # only crawl URLs we have not visited yet
        if url_in not in self.spidered_urls:
            urls = get_urls(url_in)
            self.spidered_urls.add(url_in)
            self.collected_urls = self.collected_urls | urls
            print(url_in, len(self.collected_urls))
            for u in urls:
                self.collect(u)


if __name__ == '__main__':
    spider = UrlCollection()
    spider.collect(csdn)
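Because collect() recurses once per discovered link, a deep chain of pages can hit Python's default recursion limit (about 1000 frames). An iterative breadth-first variant with an explicit queue avoids that and makes duplicate detection easy to verify. A sketch, reusing get_urls from above (collect_iterative and max_pages are my own names, not from the original code):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import deque

from web.http_file import get_urls


def collect_iterative(entry_url, max_pages=100):
    """Breadth-first crawl: an explicit queue replaces the recursion, and
    the visited set guarantees each URL is fetched at most once."""
    visited = set()
    collected = set()
    queue = deque([entry_url])
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        urls = get_urls(url)
        collected |= urls
        queue.extend(u for u in urls if u not in visited)
        print(url, len(collected))
    return collected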

The output looks like this (the crawl was not finished):

http://www.csdn.com 307
http://g.csdn.net/5272869 381
http://hardware.csdn.net/themes/zone/hardware/css/01mod-nav.css 381
http://huiyi.csdn.net/activity/product/goods_list?project_id=1628 392
can not decode utf8:
http://csdnimg.cn/public/favicon.ico 392
···  # much more output follows; too long to copy here in full

The original motivation for this exercise was simply to get some hands-on practice with Python.