多进程爬虫
来源:互联网 发布:瞻博网络中国裁员 编辑:程序博客网 时间:2024/06/05 06:06
1. mongoqueue.py
from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MongoQueue:
    """URL queue stored in MongoDB so that several processes can share it.

    Every URL is one document in ``cache.crawl_queue`` whose ``_id`` is the
    URL itself and whose ``status`` is one of the three states below.
    """

    # possible states of a queued URL
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        """
        client: MongoDB client to use; a default ``MongoClient()`` is created
            when None
        timeout: seconds after which a URL still marked PROCESSING is treated
            as stalled and released by ``repair()``
        """
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __nonzero__(self):
        """Return True while at least one URL is not yet COMPLETE."""
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return bool(record)

    # Python 3 consults __bool__ instead of __nonzero__ for truth testing
    __bool__ = __nonzero__

    def push(self, url):
        """Add ``url`` to the queue unless it has been queued before."""
        try:
            # The URL must be stored as `_id`: that is the uniquely indexed
            # field, so a second insert raises DuplicateKeyError, and it is
            # the field pop() hands back to the caller.
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # this URL is already in the queue

    def pop(self):
        """Claim one OUTSTANDING URL, mark it PROCESSING and return it.

        Raises KeyError when no OUTSTANDING URL is available; before raising,
        stalled URLs are released so a later call can pick them up.
        """
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING,
                             'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        self.repair()
        raise KeyError()

    def complete(self, url):
        """Mark ``url`` as COMPLETE."""
        self.db.crawl_queue.update(
            {'_id': url}, {'$set': {'status': self.COMPLETE}}
        )

    def repair(self):
        """Release one stalled URL back to OUTSTANDING.

        A URL counts as stalled when it is not COMPLETE and its timestamp is
        older than ``self.timeout`` seconds.
        """
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print('Released %s' % record['_id'])


# 2. process_crawler.py
# -*- coding: utf-8 -*-
import time
import threading
from downloader import Downloader
import urlparse
import robotparser
import csv
import re
import lxml.html  # `import lxml` alone does not load the lxml.html submodule
from mongoqueue import MongoQueue

SLEEP_TIME = 1
DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60


def process_crawler(seed_url, cache=None, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                    proxies=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                    sleep_time=SLEEP_TIME, max_threads=10, scrape_callback=None):
    """Crawl from ``seed_url`` with a pool of threads fed by a MongoQueue.

    cache, delay, user_agent, proxies, num_retries, timeout: passed straight
        through to ``Downloader``
    sleep_time: seconds the supervising loop sleeps between checks
    max_threads: maximum number of worker threads alive at once
    scrape_callback: optional callable ``(url, html)`` returning an iterable
        of links (or None); every returned link is normalized and queued
    """
    # The threaded version kept an in-memory list plus a `seen` set here; the
    # multi-process version uses the MongoDB-backed queue instead, which also
    # takes care of de-duplicating URLs.
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Worker loop: download queued URLs until none is OUTSTANDING."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # MongoQueue.pop() raises KeyError when nothing is outstanding
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # rebuild the list instead of removing entries while iterating over it
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue:
            # pass the function itself; calling it here would run the whole
            # crawl in the main thread and hand Thread a target of None
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # let the main thread exit on Ctrl-C
            thread.start()
            threads.append(thread)
        time.sleep(sleep_time)


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))  # absolute link
    rp.read()
    return rp


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    # remove hash to avoid duplicates; urldefrag(url) splits the url into a
    # 2-tuple of (url without the fragment, the fragment that was removed)
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)  # absolute link


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    # urlparse splits a URL into a 6-tuple of components; compare the netloc
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage;
    # re.compile() turns the pattern string into a Pattern object that is
    # then used to search the text
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


class ScrapeCallback:
    """Callback that writes one CSV row per page whose URL contains 'view'."""

    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent',
                       'tld', 'currency_code', 'currency_name', 'phone',
                       'postal_code_format', 'postal_code_regex', 'languages')
        # header row
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        """Extract every field in ``self.fields`` from ``html`` and write the row."""
        if re.search('view', url):
            tree = lxml.html.fromstring(html)
            row = [
                tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content()
                for field in self.fields
            ]
            self.writer.writerow(row)
# 3. multiprocessing.py
# NOTE(review): a script named multiprocessing.py can shadow the standard
# library module of the same name when run from its own directory; consider
# saving this file under a different name.
import multiprocessing

from threaded_crawler import threaded_crawler


def process_link_crawler(args, **kwargs):
    """Run ``threaded_crawler`` in one process per CPU and wait for them all.

    args: single positional argument handed to ``threaded_crawler``
    kwargs: keyword arguments forwarded to ``threaded_crawler`` unchanged
    """
    num_cpus = multiprocessing.cpu_count()
    print('Starting {} processes'.format(num_cpus))
    processes = []
    for _ in range(num_cpus):
        # the class is `Process`; `multiprocessing.process` is a submodule
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()


# 4. process_test.py
import sys

from process_crawler import process_crawler
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    """Clear the cache and crawl from the Alexa callback's seed URL.

    max_threads: maximum number of worker threads passed to process_crawler
    """
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                    cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    # usage: time python process_test.py [max_threads]
    # Read the thread count from the command line instead of overwriting
    # sys.argv; fall back to the previously hard-coded value of 5.
    max_threads = int(sys.argv[1]) if len(sys.argv) > 1 else 5
    main(max_threads)
阅读全文
0 0
- 多进程爬虫
- 多进程爬虫
- python爬虫-多进程
- python 多进程爬虫案例
- 爬虫进程
- python爬虫:编写多进程爬虫学习笔记
- 【WebScraping】并行下载_多线程爬虫&多进程爬虫
- 爬虫实战4—多线程与多进程爬虫
- 多进程 多线程 异步 爬虫(1)
- 多进程 多线程 异步 爬虫(2)
- python爬虫学习多进程下载图片
- Python爬虫学习笔记--多进程用法
- python爬虫——多进程multiprocessing
- python scrapy多进程新闻爬虫
- Python多进程协程爬虫----1
- 爬虫知识点(多进程,多线程 多协程)
- 【爬虫学习】多进程,多线程处理
- 多进程+多线程打造高效率爬虫
- if和与、或的组合判断
- 变量+标量变量+变量命名规则+变量命名规则
- 存储结构的定义及其分类
- ASP.NET页面输出XML(C#)
- .NET 在线生成XML文档,并提供下载
- 多进程爬虫
- 专注于windows系统安全 与 asp.net开发
- .Net读取xlsx文件Excel2007
- 【LeetCode】48. Rotate Image
- HBase1.2.0 windows单机版安装配置
- CodeForces
- 观察者模式--众多通知,一步到位(行为模式06)
- 纯CSS美化表格单元格背景效果,效果很帅
- UBUNTU系统镜像定制