AkkaCrawler Translation (Part 1)
With a basic understanding of Akka and Lucene in place, we can look at a simple example project to see how Akka can be applied to a web crawler. This section gives a brief introduction to the basic search-engine components used in the project.
The code is adapted from https://github.com/fhopf/akka-crawler-example
Classes used
perf4j's StopWatch is a performance-monitoring tool. It has several subclasses that write their measurements to different targets; the one used here is LoggingStopWatch, which writes to the console. Calling start and stop records how long a block of code takes to run. When a single StopWatch is used to time several code sections separately, the lap method is typically used (it calls stop and then immediately calls start again). A simple example can be found at http://www.blogjava.net/yangpingyu/archive/2012/04/15/374217.html. The MatchAllDocsQuery used below is a Lucene query that matches every document (roughly the equivalent of a "select *"). The rest of this post walks through the example's code.
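A minimal sketch of the typical perf4j usage described above (the tag names "phase.download" and "phase.index" are made up for illustration):

import org.perf4j.LoggingStopWatch;
import org.perf4j.StopWatch;

public class StopWatchDemo {
    public static void main(String[] args) throws InterruptedException {
        // The constructor starts the watch; stop(tag) logs the elapsed time to System.err.
        StopWatch watch = new LoggingStopWatch();
        Thread.sleep(100);            // first code section being measured
        watch.lap("phase.download");  // logs the first section, then restarts the watch
        Thread.sleep(200);            // second code section being measured
        watch.stop("phase.index");    // logs the second section
    }
}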
package de.fhopf.akka;

import org.apache.lucene.index.IndexWriter;

/**
 * Created by admin on 2017/3/14.
 */
public interface Execution {
    public void downloadAndIndex(String path, IndexWriter writer);
}
package de.fhopf.akka;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.perf4j.LoggingStopWatch;
import org.perf4j.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Created by admin on 2017/3/14.
 */
public class Executor {

    private final Execution execution;
    private final Logger logger = LoggerFactory.getLogger(Executor.class);

    public Executor(Execution execution) {
        this.execution = execution;
    }

    public void execute(String path) {
        IndexWriter writer = null;
        IndexSearcher searcher = null;
        try {
            File indexDir = new File(System.getProperty("java.io.tmpdir"), "akka-index");
            writer = openWriter(indexDir);
            StopWatch stopWatch = new LoggingStopWatch();
            execution.downloadAndIndex(path, writer);
            stopWatch.stop(execution.getClass().getSimpleName());
            searcher = openSearcher(indexDir);
            TopDocs result = searcher.search(new MatchAllDocsQuery(), 100);
            logger.info("Found {} results", result.totalHits);
            for (ScoreDoc scoreDoc : result.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                logger.debug(doc.get("id"));
            }
            searcher.close();
        } catch (Exception ex) {
            logger.error(ex.getMessage(), ex);
            if (writer != null) {
                try {
                    writer.rollback();
                } catch (IOException ex1) {
                    logger.error(ex1.getMessage(), ex1);
                }
            }
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (CorruptIndexException ex) {
                    logger.error(ex.getMessage(), ex);
                } catch (IOException ex) {
                    logger.error(ex.getMessage(), ex);
                }
            }
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException ex) {
                    logger.error(ex.getMessage(), ex);
                }
            }
        }
    }

    private IndexWriter openWriter(File indexDir) throws CorruptIndexException, LockObtainFailedException, IOException {
        Directory dir = FSDirectory.open(indexDir);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        return new IndexWriter(dir, config);
    }

    private IndexSearcher openSearcher(File indexDir) throws CorruptIndexException, IOException {
        Directory dir = FSDirectory.open(indexDir);
        IndexReader reader = IndexReader.open(dir);
        return new IndexSearcher(reader);
    }
}
The basic job of Executor is to create the search-engine index in the system temp directory, run the given Execution, match all documents with a MatchAllDocsQuery (up to 100 of them), and record how long downloadAndIndex took.
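To see how the two classes fit together, here is a hypothetical no-op Execution of my own (not part of the original project): nothing is downloaded, so the stopwatch only measures an empty call and the search afterwards finds zero documents.

import org.apache.lucene.index.IndexWriter;

public class NoOpExecution implements Execution {
    @Override
    public void downloadAndIndex(String path, IndexWriter writer) {
        // intentionally empty: exercises only the timing and search plumbing
    }

    public static void main(String[] args) {
        new Executor(new NoOpExecution()).execute("http://www.example.com/");
    }
}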
package de.fhopf.akka;

/**
 * Created by admin on 2017/3/14.
 */
public interface PageRetriever {
    PageContent fetchPageContent(String url);
}
package de.fhopf.akka;

import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Created by admin on 2017/3/14.
 */
public class HtmlParserPageRetriever implements PageRetriever {

    private final String baseUrl;
    private static final Logger logger = LoggerFactory.getLogger(HtmlParserPageRetriever.class);

    public HtmlParserPageRetriever(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    public PageContent fetchPageContent(String url) {
        logger.debug("Fetching {}", url);
        try {
            Parser parser = new Parser(url);
            PageContentVisitor visitor = new PageContentVisitor(baseUrl, url);
            parser.visitAllNodesWith(visitor);
            return visitor.getContent();
        } catch (ParserException ex) {
            throw new IllegalStateException(ex);
        }
    }

    private static class PageContentVisitor extends NodeVisitor {

        private List<String> linksToVisit = new ArrayList<String>();
        private String content;
        private String title;
        private final String baseUrl;
        private final String currentUrl;

        public PageContentVisitor(String baseUrl, String currentUrl) {
            super(true);
            this.baseUrl = baseUrl;
            this.currentUrl = currentUrl;
        }

        @Override
        public void visitTag(Tag tag) {
            if (tag instanceof LinkTag) {
                LinkTag linkTag = (LinkTag) tag;
                if (linkTag.getLink().startsWith(baseUrl) && isProbablyHtml(linkTag.getLink())) {
                    logger.debug("Using link pointing to {}", linkTag.getLink());
                    linksToVisit.add(linkTag.getLink());
                } else {
                    logger.debug("Skipping link pointing to {}", linkTag.getLink());
                }
            } else if (tag instanceof TitleTag) {
                TitleTag titleTag = (TitleTag) tag;
                title = titleTag.getTitle();
            } else if (tag instanceof BodyTag) {
                BodyTag bodyTag = (BodyTag) tag;
                content = bodyTag.toPlainTextString();
            }
        }

        public PageContent getContent() {
            return new PageContent(currentUrl, linksToVisit, title, content);
        }

        private boolean isProbablyHtml(String link) {
            return link.endsWith(".html") || link.endsWith("/");
        }
    }
}
HtmlParserPageRetriever parses HTML and walks its nodes. Concretely, a visitor traverses all HTML nodes, stores the body text as the content, and collects the candidate links to visit next into a List. A hypothetical one-off call is sketched below.
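A minimal usage sketch (both URLs are placeholders; a real run would fetch the page over the network):

import de.fhopf.akka.HtmlParserPageRetriever;
import de.fhopf.akka.PageContent;
import de.fhopf.akka.PageRetriever;

public class RetrieverDemo {
    public static void main(String[] args) {
        // the base URL restricts which links are kept for further crawling
        PageRetriever retriever = new HtmlParserPageRetriever("http://www.example.com");
        PageContent content = retriever.fetchPageContent("http://www.example.com/");
        System.out.println(content.getTitle());          // the page's <title>
        System.out.println(content.getLinksToFollow());  // same-site links found in it
    }
}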
Content classes used
package de.fhopf.akka;

import java.util.List;

/**
 * Created by admin on 2017/3/14.
 */
public class PageContent {

    private final List<String> linksToFollow;
    private final String title;
    private final String content;
    private final String path;

    public PageContent(String path, List<String> linksToFollow, String title, String content) {
        this.path = path;
        this.linksToFollow = linksToFollow;
        this.title = title;
        this.content = content;
    }

    public String getPath() {
        return path;
    }

    public List<String> getLinksToFollow() {
        return linksToFollow;
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }

    @Override
    public String toString() {
        return "PageContent{title=" + title + ", content=" + content + ", linksToFollow=" + linksToFollow + "}";
    }
}
package de.fhopf.akka;

/**
 * Created by admin on 2017/3/14.
 */
public interface Indexer {
    void commit();
    void index(PageContent pageContent);
    void close();
}
package de.fhopf.akka;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;

/**
 * Created by admin on 2017/3/14.
 */
public class IndexerImpl implements Indexer {

    private final IndexWriter indexWriter;

    public IndexerImpl(IndexWriter indexWriter) {
        this.indexWriter = indexWriter;
    }

    public void index(PageContent pageContent) {
        try {
            indexWriter.addDocument(toDocument(pageContent));
        } catch (CorruptIndexException ex) {
            throw new IllegalStateException(ex);
        } catch (IOException ex) {
            throw new IllegalStateException(ex);
        }
    }

    private Document toDocument(PageContent content) {
        Document doc = new Document();
        doc.add(new Field("id", content.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("title", content.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("content", content.getContent(), Field.Store.NO, Field.Index.ANALYZED));
        return doc;
    }

    public void commit() {
        try {
            indexWriter.commit();
        } catch (CorruptIndexException ex) {
            throw new IllegalStateException(ex);
        } catch (IOException ex) {
            throw new IllegalStateException(ex);
        }
    }

    public void close() {
        try {
            indexWriter.close(true);
        } catch (CorruptIndexException ex) {
            throw new IllegalStateException(ex);
        } catch (IOException ex) {
            throw new IllegalStateException(ex);
        }
    }
}
IndexerImpl first converts the content into a Document and then writes it to the IndexWriter to build the index. Note the options on the Document's Fields. Store.NO means the field can be searched but its content is not stored in the index (which is understandable: a search engine "deconstructs" the text through tokenization and similar processing, so the original pre-processing text may not be needed). Index.NOT_ANALYZED means the field is not analyzed (tokenization, stop-word removal and the other "deconstruction" steps all happen during analysis). Among the search-related IndexWriter methods there is a commit method that makes the preceding additions and deletions take effect immediately (i.e. become visible to searches). The same issue comes up in Elasticsearch: normally you would not force newly added data to become visible immediately, but if you want that in Elasticsearch you can call refresh.
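A minimal sketch of what Store.NO means in practice, using the same Lucene 3.5 API as above but with an in-memory RAMDirectory and made-up field values: the content field can be matched by a query, yet reading it back from the stored document returns null.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class StoreNoDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
        Document doc = new Document();
        doc.add(new Field("content", "some akka crawler text",
                Field.Store.NO, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.commit();  // without this, a newly opened reader would not see the document
        writer.close();

        IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir));
        TopDocs hits = searcher.search(new TermQuery(new Term("content", "akka")), 1);
        System.out.println(hits.totalHits);  // 1: the analyzed field is searchable
        System.out.println(searcher.doc(hits.scoreDocs[0].doc).get("content"));  // null: not stored
        searcher.close();
    }
}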
package de.fhopf.akka;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/**
 * Created by admin on 2017/3/14.
 */
public class VisitedPageStore {

    private Set<String> pagesToVisit = new HashSet<>();
    private Set<String> allPages = new HashSet<>();
    private Set<String> inProgress = new HashSet<>();

    public void add(String page) {
        if (!allPages.contains(page)) {
            pagesToVisit.add(page);
            allPages.add(page);
        }
    }

    public void addAll(Collection<String> pages) {
        for (String page : pages) {
            add(page);
        }
    }

    public void finished(String page) {
        inProgress.remove(page);
    }

    public String getNext() {
        if (pagesToVisit.isEmpty()) {
            return null;
        } else {
            String next = pagesToVisit.iterator().next();
            pagesToVisit.remove(next);
            inProgress.add(next);
            return next;
        }
    }

    public Collection<String> getNextBatch() {
        Set<String> pages = new HashSet<>();
        pages.addAll(pagesToVisit);
        pagesToVisit.clear();
        inProgress.addAll(pages);
        return pages;
    }

    public boolean isFinished() {
        return pagesToVisit.isEmpty() && inProgress.isEmpty();
    }

    @Override
    public String toString() {
        return String.format("inProgress: %1$3s, allPages: %2$3s", inProgress.size(), allPages.size());
    }
}
VisitedPageStore keeps track of the pages that are still to be visited and those already visited. The code is straightforward; a short usage walkthrough follows below.
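A quick walkthrough of the bookkeeping (the URLs are placeholders):

import java.util.Arrays;

import de.fhopf.akka.VisitedPageStore;

public class PageStoreDemo {
    public static void main(String[] args) {
        VisitedPageStore store = new VisitedPageStore();
        store.add("http://www.example.com/");
        String page = store.getNext();           // moves the page into inProgress
        store.addAll(Arrays.asList(
                "http://www.example.com/a.html",
                "http://www.example.com/"));     // duplicate: filtered out via allPages
        store.finished(page);                    // first page done, a.html still pending
        System.out.println(store.isFinished());  // false
    }
}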
The follow-up posts will continue with the Akka-based crawler.
Below is the crawler driver in its single-threaded, blocking form, which simply treats the whole URL queue as one collection (this part does not involve Akka). One annoying pitfall: if an exception is thrown while the crawl is running, the whole run terminates (a common case is an encoding exception while walking the nodes; I actually hit this in testing). A sketch of a more tolerant loop follows the code below.
package de.fhopf.akka.sequential;

import de.fhopf.akka.*;
import de.fhopf.akka.Execution;
import de.fhopf.akka.Executor;
import org.apache.lucene.index.IndexWriter;

/**
 * Indexes pages sequentially.
 * @author Florian Hopf, http://www.florian-hopf.de
 */
public class SequentialExecution implements Execution {

    @Override
    public void downloadAndIndex(String path, IndexWriter writer) {
        VisitedPageStore pageStore = new VisitedPageStore();
        pageStore.add(path);
        Indexer indexer = new IndexerImpl(writer);
        PageRetriever retriever = new HtmlParserPageRetriever(path);
        String page;
        while ((page = pageStore.getNext()) != null) {
            PageContent pageContent = retriever.fetchPageContent(page);
            pageStore.addAll(pageContent.getLinksToFollow());
            indexer.index(pageContent);
            pageStore.finished(page);
        }
        indexer.commit();
    }

    public static void main(String[] args) {
        SequentialExecution execution = new SequentialExecution();
        Executor exec = new Executor(execution);
        exec.execute("http://www.example.com/");
    }
}
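One possible way to keep the sequential crawl going when a single page fails. This is my own modification, not part of the original example: catch the per-page exception and skip that page instead of aborting the whole run.

import de.fhopf.akka.*;
import org.apache.lucene.index.IndexWriter;

public class TolerantSequentialExecution implements Execution {

    @Override
    public void downloadAndIndex(String path, IndexWriter writer) {
        VisitedPageStore pageStore = new VisitedPageStore();
        pageStore.add(path);
        Indexer indexer = new IndexerImpl(writer);
        PageRetriever retriever = new HtmlParserPageRetriever(path);
        String page;
        while ((page = pageStore.getNext()) != null) {
            try {
                PageContent pageContent = retriever.fetchPageContent(page);
                pageStore.addAll(pageContent.getLinksToFollow());
                indexer.index(pageContent);
            } catch (RuntimeException ex) {
                // e.g. a ParserException wrapped in IllegalStateException on a badly
                // encoded page: skip this page and move on instead of aborting
            } finally {
                pageStore.finished(page);  // always mark done so isFinished() stays accurate
            }
        }
        indexer.commit();
    }
}

The actor-based version follows, starting with its message and actor classes.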
package de.fhopf.akka.actor;

/**
 * Created by admin on 2017/3/17.
 */
public class IndexedMessage {

    public final String path;

    public IndexedMessage(String path) {
        this.path = path;
    }
}
package de.fhopf.akka.actor;

import akka.actor.UntypedActor;
import de.fhopf.akka.Indexer;
import de.fhopf.akka.PageContent;

/**
 * Created by admin on 2017/3/17.
 */
public class IndexingActor extends UntypedActor {

    public static final Object COMMIT_MESSAGE = new Object();
    public static final Object COMMITTED_MESSAGE = new Object();

    private final Indexer indexer;

    public IndexingActor(Indexer indexer) {
        this.indexer = indexer;
    }

    @Override
    public void onReceive(Object o) throws Exception {
        if (o instanceof PageContent) {
            PageContent content = (PageContent) o;
            indexer.index(content);
            getSender().tell(new IndexedMessage(content.getPath()), getSelf());
        } else if (COMMIT_MESSAGE == o) {
            indexer.commit();
            getSender().tell(COMMITTED_MESSAGE, getSelf());
        } else {
            unhandled(o);
        }
    }
}
package de.fhopf.akka.actor;

import akka.actor.UntypedActor;
import de.fhopf.akka.PageContent;
import de.fhopf.akka.PageRetriever;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;

/**
 * Created by admin on 2017/3/17.
 */
public class PageParsingActor extends UntypedActor {

    private final PageRetriever pageRetriever;
    private final Logger logger = LoggerFactory.getLogger(PageParsingActor.class);

    public PageParsingActor(PageRetriever pageRetriever) {
        this.pageRetriever = pageRetriever;
    }

    @Override
    public void onReceive(Object o) throws Exception {
        if (o instanceof String) {
            PageContent content = pageRetriever.fetchPageContent((String) o);
            getSender().tell(content, getSelf());
        } else {
            unhandled(o);
        }
    }

    @Override
    public void preRestart(Throwable reason, Option<Object> message) throws Exception {
        logger.info("Restarting PageParsingActor because of {}", reason.getClass());
        super.preRestart(reason, message);
    }
}
package de.fhopf.akka.actor;

import akka.actor.ActorRef;
import akka.actor.UntypedActor;
import de.fhopf.akka.PageContent;
import de.fhopf.akka.VisitedPageStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Created by admin on 2017/3/17.
 */
public abstract class Master extends UntypedActor {

    private final Logger logger = LoggerFactory.getLogger(Master.class);
    private final VisitedPageStore visitedPageStore = new VisitedPageStore();

    @Override
    public void onReceive(Object message) throws Exception {
        if (message instanceof String) {
            String start = (String) message;
            visitedPageStore.add(start);
            getParser().tell(visitedPageStore.getNext(), getSelf());
        } else if (message instanceof PageContent) {
            PageContent content = (PageContent) message;
            getIndexer().tell(content, getSelf());
            visitedPageStore.addAll(content.getLinksToFollow());
            logger.info(visitedPageStore.toString());
            if (visitedPageStore.isFinished()) {
                getIndexer().tell(IndexingActor.COMMIT_MESSAGE, getSelf());
            } else {
                for (String page : visitedPageStore.getNextBatch()) {
                    getParser().tell(page, getSelf());
                }
            }
        } else if (message instanceof IndexedMessage) {
            IndexedMessage indexedMessage = (IndexedMessage) message;
            visitedPageStore.finished(indexedMessage.path);
            if (visitedPageStore.isFinished()) {
                getIndexer().tell(IndexingActor.COMMIT_MESSAGE, getSelf());
            }
        } else if (message == IndexingActor.COMMITTED_MESSAGE) {
            logger.info("Shutting down, finished");
            getContext().system().shutdown();
        }
    }

    protected abstract ActorRef getIndexer();

    protected abstract ActorRef getParser();
}
package de.fhopf.akka.actor;

import akka.actor.ActorRef;
import akka.actor.Props;
import de.fhopf.akka.IndexerImpl;
import de.fhopf.akka.PageRetriever;
import org.apache.lucene.index.IndexWriter;

/**
 * Created by admin on 2017/3/17.
 */
public class SimpleActorMaster extends Master {

    private final ActorRef indexer;
    private final ActorRef parser;

    public SimpleActorMaster(final PageRetriever pageRetriever, final IndexWriter indexWriter) {
        this.indexer = getContext().actorOf(Props.create(IndexingActor.class, new IndexerImpl(indexWriter)));
        this.parser = getContext().actorOf(Props.create(PageParsingActor.class, pageRetriever));
    }

    @Override
    protected ActorRef getParser() {
        return parser;
    }

    @Override
    protected ActorRef getIndexer() {
        return indexer;
    }
}
package de.fhopf.akka.actor;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import de.fhopf.akka.Execution;
import de.fhopf.akka.Executor;
import de.fhopf.akka.HtmlParserPageRetriever;
import org.apache.lucene.index.IndexWriter;

/**
 * Created by admin on 2017/3/17.
 */
public class SimpleActorExecution implements Execution {

    @Override
    public void downloadAndIndex(final String path, final IndexWriter writer) {
        ActorSystem actorSystem = ActorSystem.create();
        ActorRef master = actorSystem.actorOf(
                Props.create(SimpleActorMaster.class, new HtmlParserPageRetriever(path), writer));
        master.tell(path, actorSystem.guardian());
        actorSystem.awaitTermination();
    }

    public static void main(String[] args) {
        SimpleActorExecution execution = new SimpleActorExecution();
        Executor exec = new Executor(execution);
        exec.execute("http://www.example.com/");
    }
}
This version decomposes the whole sequential process into a PageParsingActor responsible for parsing pages, an IndexingActor responsible for writing to the search index, and a Master that assigns the work, achieving a non-blocking separation of tasks. It is driven through SimpleActorExecution.

Implementing parallelism in Akka is very simple. The version below uses a Router to distribute the work directly across 20 routees; for a short explanation of routers see https://rockeyhoo.gitbooks.io/akka_essentials/content/ppt/lu_you_qi.html. Its efficiency compared to the single-threaded version is striking.
package de.fhopf.akka.actor.parallel;

import akka.actor.ActorRef;
import akka.actor.Props;
import akka.routing.RoundRobinRouter;
import de.fhopf.akka.Indexer;
import de.fhopf.akka.PageRetriever;
import de.fhopf.akka.actor.IndexingActor;
import de.fhopf.akka.actor.Master;
import de.fhopf.akka.actor.PageParsingActor;

/**
 * Created by admin on 2017/3/17.
 */
public class ParallelMaster extends Master {

    private final ActorRef parser;
    private final ActorRef indexingActor;

    public ParallelMaster(final Indexer indexer, final PageRetriever pageRetriever) {
        parser = getContext().actorOf(
                Props.create(PageParsingActor.class, pageRetriever).withRouter(new RoundRobinRouter(20)));
        indexingActor = getContext().actorOf(Props.create(IndexingActor.class, indexer));
    }

    @Override
    protected ActorRef getIndexer() {
        return indexingActor;
    }

    @Override
    protected ActorRef getParser() {
        return parser;
    }
}

package de.fhopf.akka.actor.parallel;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import de.fhopf.akka.Execution;
import de.fhopf.akka.Executor;
import de.fhopf.akka.HtmlParserPageRetriever;
import de.fhopf.akka.IndexerImpl;
import org.apache.lucene.index.IndexWriter;

/**
 * Created by admin on 2017/3/17.
 */
public class FetchInParallelExecution implements Execution {

    @Override
    public void downloadAndIndex(final String path, final IndexWriter writer) {
        ActorSystem actorSystem = ActorSystem.create();
        ActorRef master = actorSystem.actorOf(
                Props.create(ParallelMaster.class, new IndexerImpl(writer), new HtmlParserPageRetriever(path)));
        master.tell(path, actorSystem.guardian());
        actorSystem.awaitTermination();
    }

    public static void main(String[] args) {
        FetchInParallelExecution execution = new FetchInParallelExecution();
        Executor exec = new Executor(execution);
        exec.execute("http://www.example.com/");
    }
}
The examples above show how easy and efficient it is to implement parallelism with Akka.
For a simple application of this in machine learning, see the follow-up article "Akka 在Bagging投票算法中的简单应用" (a simple application of Akka in a Bagging voting algorithm).