
来源:互联网 发布:jsp和php 编辑:程序博客网 时间:2024/05/29 09:39

  • 下载缓存
  • 1为链接爬虫添加缓存支持
  • 2磁盘缓存
    • 1用磁盘缓存的实现
    • 2缓存测试
    • 3节省磁盘空间
    • 4清理过期数据
    • 5用磁盘缓存的缺点
  • 3数据库缓存
    • 1NoSQL是什么
    • 2安装MongoDB
    • 3MongoDB概述
    • 4MongoDB缓存实现
    • 5压缩存储
    • 6缓存测试
    • 7MongoDB缓存完整代码




  • 我们将downloader重构为一个类,这样参数只需在构造方法中设置一次,就能在后续下载中多次复用;同时在下载URL之前先进行缓存检查,并把限速功能移到函数内部。
  • Downloader类的`__call__`特殊方法实现了下载前先检查缓存:如果该URL已有缓存,则再检查之前的下载是否遇到了服务端错误;如果都没有问题,表明缓存结果可用,否则需要正常下载该URL,并把结果存入缓存。
  • download方法的返回值中添加了HTTP状态码,以便在缓存中存储错误码并进行校验。如果不需要限速或缓存,可以直接调用该方法,这样就不会经过`__call__`方法。
class Downloader:
    """Callable downloader that consults an optional cache before fetching.

    delay: passed to Throttle to rate-limit real downloads
    user_agent: value sent in the User-agent request header
    proxies: optional list of proxies; one is picked at random per download
    num_retries: retry budget handed to download(); when > 0, cached
        results with a 5xx status code are ignored and re-downloaded
    cache: optional dict-like object mapping url -> {'html': ..., 'code': ...};
        None disables caching
    """

    def __init__(self, delay=5, user_agent='Wu_Being', proxies=None, num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        """Return the HTML for url, using the cache when a usable entry exists."""
        result = None
        # Compare against None rather than relying on truthiness: an empty
        # cache that defines __len__ (e.g. a plain dict) is falsy and would
        # otherwise never be read from or written to.
        if self.cache is not None:
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error so ignore result from cache and re-download
                    result = None
        if result is None:
            # result was not loaded from cache so still need to download;
            # throttling only happens here, never on a cache hit
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
            if self.cache is not None:
                # save result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        """Fetch url and return {'html': ..., 'code': ...} (body elided in this excerpt)."""
        print('Downloading: %s' % url)
        ...
        return {'html': html, 'code': code}


class Throttle:
    """Rate limiter used by Downloader (implementation elided in this excerpt)."""

    def __init__(self, delay):
        ...

    def wait(self, url):
        ...


from downloader import Downloaderdef link_crawler(... cache=None):    crawl_queue = [seed_url]    seen = {seed_url: 0}    # track how many URL's have been downloaded    num_urls = 0    rp = get_robots(seed_url)    #cache.clear()          ###############################    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, cache=cache)    while crawl_queue:        url = crawl_queue.pop()        depth = seen[url]        # check url passes robots.txt restrictions        if rp.can_fetch(user_agent, url):            html = D(url)               ###def __call__(self, url):            links = []    ...def normalize(seed_url, link):    ...def same_domain(url1, url2):    ...def get_robots(url):    ...def get_links(html):    ..."""if __name__ == '__main__':    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')"""



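| 操作系统 | 文件系统 | 非法文件名字符 | 文件名最大长度 |
| --- | --- | --- | --- |
| Linux | Ext3/Ext4 | `/` 和 `\0` | 255个字节 |
| OS X | HFS Plus | `:` 和 `\0` | 255个UTF-16编码单元 |
| Windows | NTFS | `\`、`/`、`?`、`:`、`*`、`"`、`>`、`<` 和 `\|` | 255个字符 |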


>>> import re>>> url="http://example.webscraping.com/default/view/australia-1">>> re.sub('[^/0-9a-zA-Z\-,.;_ ]','_',url)'http_//example.webscraping.com/default/view/australia-1'


>>> filename=re.sub('[^/0-9a-zA-Z\-,.;_ ]','_',url)>>> filename='/'.join(segment[:255] for segment in filename.split('/'))>>> print filenamehttp_//example.webscraping.com/default/view/australia-1>>> print '#'.join(segment[:5] for segment in filename.split('/'))http_##examp#defau#view#austr>>> 

- http://example.webscraping.com/index/
- http://example.webscraping.com/index/1


>>> import urlparse>>> components=urlparse.urlsplit('http://exmaple.scraping.com/index/')>>> print componentsSplitResult(scheme='http', netloc='exmaple.scraping.com', path='/index/', query='', fragment='')>>> print components.path/index/>>> path=components.path>>> if not path:...     path='/index.html'... elif path.endswith('/'):...     path+='index.html'... >>> filename=components.netloc+path+components.query>>> filename'exmaple.scraping.com/index/index.html'>>> 



from link_crawler import link_crawlerclass DiskCache:    def __init__(self, cache_dir='cache', ...):        """        cache_dir: the root level folder for the cache        """        self.cache_dir = cache_dir    ...    def url_to_path(self, url):        """Create file system path for this URL        """        components = urlparse.urlsplit(url)        # when empty path set to /index.html        path = components.path        if not path:            path = '/index.html'        elif path.endswith('/'):            path += 'index.html'        filename = components.netloc + path + components.query        # replace invalid characters        filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename)        # restrict maximum number of characters        filename = '/'.join(segment[:255] for segment in filename.split('/'))        return os.path.join(self.cache_dir, filename) #拼接当前目录和文件名为完整目录    def __getitem__(self, url):        ...    def __setitem__(self, url, result):        ...    def __delitem__(self, url):        ...    def has_expired(self, timestamp):        ...    def clear(self):    ...if __name__ == '__main__':    link_crawler('http://example.webscraping.com/', '/(index|view)', cache=DiskCache())


import pickle


class DiskCache:
    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        ...

    def url_to_path(self, url):
        ...

    def __getitem__(self, url):
        ...

    def __setitem__(self, url, result):
        """Pickle result and write it to the file mapped to this url
        """
        target = self.url_to_path(url)
        parent = os.path.dirname(target)
        # create the directory hierarchy on first use
        if not os.path.exists(parent):
            os.makedirs(parent)
        payload = pickle.dumps(result)
        with open(target, 'wb') as fp:
            fp.write(payload)


import pickle


class DiskCache:
    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        ...

    def url_to_path(self, url):
        ...

    def __getitem__(self, url):
        """Read back the pickled result stored for this URL

        Raises KeyError when no file exists for the URL.
        """
        source = self.url_to_path(url)
        if not os.path.exists(source):
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')
        with open(source, 'rb') as fp:
            # NOTE: pickle must only be used on files this cache wrote itself
            return pickle.loads(fp.read())

    def __setitem__(self, url, result):
        ...




wu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ time python 2disk_cache_Nozip127.py Downloading:    0m58.710suser    0m0.684ssys 0m0.120swu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ time python 2disk_cache_Nozip127.py real    0m0.221suser    0m0.204ssys 0m0.012s





return pickle.loads(zlib.decompress(fp.read()))

压缩所有网页之后,缓存占用大小2.8 MB下降到821.2 KB,耗时略有增加。

wu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ time python 2disk_cache.py Downloading:    1m0.011suser    0m0.800ssys 0m0.104swu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ wu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ time python 2disk_cache.py real    0m0.252suser    0m0.228ssys 0m0.020swu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ 



from datetime import datetime, timedelta


class DiskCache:
    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        """
        cache_dir: the root level folder for the cache
        expires: timedelta of amount of time before a cache entry is considered expired
        compress: whether to compress data in the cache
        """
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        """Return the cached result for this URL

        Raises KeyError when there is no entry or the entry has expired.
        """
        source = self.url_to_path(url)
        if not os.path.exists(source):
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')
        with open(source, 'rb') as fp:
            raw = fp.read()
            if self.compress:
                raw = zlib.decompress(raw)
            # entries are stored as a (result, timestamp) pair
            result, saved_at = pickle.loads(raw)
            if self.has_expired(saved_at):
                raise KeyError(url + ' has expired')
            return result

    def __setitem__(self, url, result):
        """Store result for this url together with the current UTC time
        """
        target = self.url_to_path(url)
        parent = os.path.dirname(target)
        if not os.path.exists(parent):
            os.makedirs(parent)
        payload = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            payload = zlib.compress(payload)
        with open(target, 'wb') as fp:
            fp.write(payload)
    ...

    def has_expired(self, timestamp):
        """Return True when timestamp is older than the expiry period
        """
        deadline = timestamp + self.expires
        return datetime.utcnow() > deadline


    """    Dictionary interface that stores cached     values in the file system rather than in memory.    The file path is formed from an md5 hash of the key.    """>>> from disk_cache import DiskCache>>> cache=DiskCache()>>> url='http://www.baidu.com'>>> result={'html':'<html>...','code':200}>>> cache[url]=result>>> cache[url]{'code': 200, 'html': '<html>...'}>>> cache[url]['html']==result['html']True>>> >>> from datetime import timedelta>>> cache2=DiskCache(expires=timedelta(seconds=5))>>> url2='https://www.baidu.sss'>>> result2={'html':'<html>..ss.','code':500}>>> cache2[url2]=result2>>> cache2[url2]{'code': 200, 'html': '<html>...'}>>> cache2[url2]{'code': 200, 'html': '<html>...'}>>> cache2[url2]{'code': 200, 'html': '<html>...'}>>> cache2[url2]{'code': 200, 'html': '<html>...'}>>> cache2[url2]Traceback (most recent call last):  File "<stdin>", line 1, in <module>  File "disk_cache.py", line 57, in __getitem__    raise KeyError(url + ' has expired')KeyError: 'http://www.baidu.com has expired'>>> cache2.clear()


- 有些URL会被映射为相同的文件名。比如URL:.../count.asp?a+b,.../count.asp?a*b
- URL被截断为255个字符后,得到的文件名也可能相同,因为URL的长度可以超过2000个字符。

- 每个卷和每个目录下的文件数量是有限制的。FAT32文件系统每个目录的最大文件数65535,但可以分割到不同目录下。
- 文件系统可存储的文件总数也是有限的。ext4分区目前支持略多于1500万个文件,而一个大型网站往往拥有超过1亿个网页。





NoSQL全称为Not Only SQL,是一种相对较新的数据库设计方式。传统的关系模型使用的是固定模式,并将数据分割到各个表中。然而,对于大数据集的情况,数据量太大使其难以存放在单一服务器中,此时就需要扩展到多台服务器。不过,关系模型对于这种扩展的支持并不够好,因为在查询多个表时,数据可能在不同的服务器中。相反,NoSQL数据库通常是无模式的,从设计之初就考虑了跨服务器无缝分片的问题。在NoSQL中,有多种方式可以实现该目标,分别是:

- 列数据存储(如HBase);
- 键值对存储(如Redis);
- 图形数据库(如Neo4j);
- 面向文档的数据库(如MongoDB)。


MongoDB可以从https://www.mongodb.org/downloads 下载。然后安装其Python封装库:

pip install pymongo


wu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ mongod -dbpath MongoD2017-01-17T21:20:46.224+0800 [initandlisten] MongoDB starting : pid=1978 port=27017 dbpath=MongoD 64-bit host=ubuntukylin642017-01-17T21:20:46.224+0800 [initandlisten] db version v2.6.102017-01-17T21:20:46.224+0800 [initandlisten] git version: nogitversion2017-01-17T21:20:46.225+0800 [initandlisten] OpenSSL version: OpenSSL 1.0.2g  1 Mar 20162017-01-17T21:20:46.225+0800 [initandlisten] build info: Linux lgw01-12 3.19.0-25-generic #26~14.04.1-Ubuntu SMP Fri Jul 24 21:16:20 UTC 2015 x86_64 BOOST_LIB_VERSION=1_582017-01-17T21:20:46.225+0800 [initandlisten] allocator: tcmalloc2017-01-17T21:20:46.225+0800 [initandlisten] options: { storage: { dbPath: "MongoD" } }2017-01-17T21:20:46.269+0800 [initandlisten] journal dir=MongoD/journal2017-01-17T21:20:46.270+0800 [initandlisten] recover : no journal files present, no recovery needed2017-01-17T21:20:49.126+0800 [initandlisten] preallocateIsFaster=true 33.722017-01-17T21:20:51.932+0800 [initandlisten] preallocateIsFaster=true 32.72017-01-17T21:20:55.729+0800 [initandlisten] preallocateIsFaster=true 32.362017-01-17T21:20:55.730+0800 [initandlisten] preallocateIsFaster check took 9.459 secs2017-01-17T21:20:55.730+0800 [initandlisten] preallocating a journal file MongoD/journal/prealloc.02017-01-17T21:20:58.042+0800 [initandlisten]        File Preallocator Progress: 608174080/1073741824    56%2017-01-17T21:21:03.290+0800 [initandlisten]        File Preallocator Progress: 744488960/1073741824    69%2017-01-17T21:21:08.043+0800 [initandlisten]        File Preallocator Progress: 954204160/1073741824    88%2017-01-17T21:21:18.347+0800 [initandlisten] preallocating a journal file MongoD/journal/prealloc.12017-01-17T21:21:21.166+0800 [initandlisten]        File Preallocator Progress: 639631360/1073741824    59%2017-01-17T21:21:26.328+0800 [initandlisten]        File Preallocator Progress: 754974720/1073741824    70%...


>>> from pymongo import MongoClient>>> client=MongoClient('localhost',27017)



>>> from pymongo import MongoClient>>> client=MongoClient('localhost',27017)>>> url='http://www.baidu.com/view/China-47'>>> html='...<html>...'>>> db=client.cache>>> db.webpage.insert({'url':url,'html':html})ObjectId('587e2cb26b00c10b956e0be9')>>> db.webpage.find_one({'url':url}){u'url': u'http://www.baidu.com/view/China-47', u'_id': ObjectId('587e2cb26b00c10b956e0be9'), u'html': u'...<html>...'}>>> db.webpage.find({'url':url})<pymongo.cursor.Cursor object at 0x7fcde0ca60d0>>>> db.webpage.find({'url':url}).count()1


>>> db.webpage.insert({'url':url,'html':html})ObjectId('587e2d546b00c10b956e0bea')>>> db.webpage.find({'url':url}).count()2>>> db.webpage.find_one({'url':url}){u'url': u'http://www.baidu.com/view/China-47', u'_id': ObjectId('587e2cb26b00c10b956e0be9'), u'html': u'...<html>...'}


>>> >>> new_html='<...>...'>>> db.webpage.update({'_id':url},{'$set':{'html':new_html}},upsert=True){'updatedExisting': True, u'nModified': 1, u'ok': 1, u'n': 1}>>> db.webpage.find_one({'_id':url}){u'_id': u'http://www.baidu.com/view/China-47', u'html': u'<...>...'}>>> db.webpage.find({'_id':url}).count()1>>> db.webpage.update({'_id':url},{'$set':{'html':new_html}},upsert=True){'updatedExisting': True, u'nModified': 0, u'ok': 1, u'n': 1}>>> db.webpage.find({'_id':url}).count()1>>> 




import pickle
from datetime import datetime, timedelta
from pymongo import MongoClient


class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        client: mongo database client
        expires: timedelta of amount of time before a cache entry is considered expired
        """
        if client is None:
            # no client given, so connect to mongodb at the default localhost port
            client = MongoClient('localhost', 27017)
        self.client = client
        # cached webpages go into the 'webpage' collection of the 'cache'
        # database (a collection is the equivalent of a relational table)
        self.db = self.client.cache
        # index on 'timestamp' created with expireAfterSeconds
        ttl_seconds = expires.total_seconds()
        self.db.webpage.create_index('timestamp', expireAfterSeconds=ttl_seconds)

    def __getitem__(self, url):
        """Return the result stored for this URL, or raise KeyError
        """
        record = self.db.webpage.find_one({'_id': url})
        if not record:
            raise KeyError(url + ' does not exist')
        return record['result']

    def __setitem__(self, url, result):
        """Insert or overwrite the result stored for this URL
        """
        fields = {'result': result, 'timestamp': datetime.utcnow()}
        # upsert keeps exactly one document per url (the url is the _id)
        self.db.webpage.update({'_id': url}, {'$set': fields}, upsert=True)


>>> from mongo_cache import MongoCache>>> from datetime import timedelta>>> cache=MongoCache(expires=timedelta())>>> result={'html':'.....'}>>> cache[url]=result>>> cache[url]{'html': '.....'}>>> cache[url]{'html': '.....'}>>> import time>>> import time;time.sleep(60)>>> cache[url]Traceback (most recent call last):  File "<stdin>", line 1, in <module>  File "mongo_cache.py", line 62, in __getitem__    raise KeyError(url + ' does not exist')KeyError: 'http://www.baidu.com/view/China-47 does not exist'>>> 


import pickle
import zlib
from bson.binary import Binary


class MongoCache:
    def __getitem__(self, url):
        """Return the decompressed, unpickled result for this URL, or raise KeyError
        """
        record = self.db.webpage.find_one({'_id': url})
        if not record:
            raise KeyError(url + ' does not exist')
        # results are stored pickled and zlib-compressed (earlier version
        # returned record['result'] directly)
        packed = record['result']
        return pickle.loads(zlib.decompress(packed))

    def __setitem__(self, url, result):
        """Pickle, compress and store the result for this URL
        """
        packed = Binary(zlib.compress(pickle.dumps(result)))
        fields = {'result': packed, 'timestamp': datetime.utcnow()}
        self.db.webpage.update({'_id': url}, {'$set': fields}, upsert=True)


wu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ time python 3mongo_cache.py Downloading:    0m59.239suser    0m1.164ssys 0m0.108swu_being@ubuntukylin64:~/GitHub/WebScrapingWithPython/3.下载缓存$ time python 3mongo_cache.py real    0m0.695suser    0m0.408ssys 0m0.044s



try:
    import cPickle as pickle
except ImportError:
    import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary
from link_crawler import link_crawler


class MongoCache:
    """
    Wrapper around MongoDB to cache downloads

    >>> cache = MongoCache()
    >>> cache.clear()
    >>> url = 'http://example.webscraping.com'
    >>> result = {'html': '...'}
    >>> cache[url] = result
    >>> cache[url]['html'] == result['html']
    True
    >>> cache = MongoCache(expires=timedelta())
    >>> cache[url] = result
    >>> # every 60 seconds is purged http://docs.mongodb.org/manual/core/index-ttl/
    >>> import time; time.sleep(60)
    >>> cache[url] 
    Traceback (most recent call last):
     ...
    KeyError: 'http://example.webscraping.com does not exist'
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        client: mongo database client
        expires: timedelta of amount of time before a cache entry is considered expired
        """
        if client is None:
            # no client given, so connect to mongodb at the default localhost port
            client = MongoClient('localhost', 27017)
        self.client = client
        # cached webpages go into the 'webpage' collection of the 'cache'
        # database (a collection is the equivalent of a relational table)
        self.db = self.client.cache
        # The indexed field was called 'timestamp' in the earlier listings.
        # NOTE(review): presumably renamed so that a fresh index with the new
        # expiry gets created - confirm.
        ttl_seconds = expires.total_seconds()
        self.db.webpage.create_index('timestamp100s', expireAfterSeconds=ttl_seconds)

    def __contains__(self, url):
        """Return True when a result can be loaded for this URL"""
        try:
            self[url]
        except KeyError:
            return False
        return True

    def __getitem__(self, url):
        """Return the decompressed, unpickled result for this URL, or raise KeyError
        """
        record = self.db.webpage.find_one({'_id': url})
        if not record:
            raise KeyError(url + ' does not exist')
        # NOTE: pickle must only be used on data this cache wrote itself
        packed = record['result']
        return pickle.loads(zlib.decompress(packed))

    def __setitem__(self, url, result):
        """Pickle, compress and store the result for this URL
        """
        packed = Binary(zlib.compress(pickle.dumps(result)))
        # must use the same field name as the index created in __init__
        fields = {'result': packed, 'timestamp100s': datetime.utcnow()}
        self.db.webpage.update({'_id': url}, {'$set': fields}, upsert=True)

    def clear(self):
        """Drop the whole webpage collection"""
        self.db.webpage.drop()
        print('drop() successful')


if __name__ == '__main__':
    #link_crawler('http://example.webscraping.com/', '/(index|view)', cache=MongoCache())
    #link_crawler('', '/places/default/(index|view)/', cache=MongoCache())
    link_crawler('', '/places/default/(index|view)/', cache=MongoCache(expires=timedelta(seconds=100)))

Wu_Being 博客声明:本人博客欢迎转载,请标明博客原文和原链接!谢谢!

Wu_Being 吴兵博客接受赞助费二维码


0 0