AkkaCrawler Translation (Part 1)


With a basic grounding in Akka and Lucene in place, we can look at a simple example project to see how Akka can be applied in a crawler.

This section briefly introduces the project's basic search-engine components.


The code is taken from

https://github.com/fhopf/akka-crawler-example


Classes used

perf4j's StopWatch is a profiling tool used here for performance monitoring. It has several subclasses that report measurements in different forms; a commonly used one is LoggingStopWatch, which writes to the log. Its start and stop methods record how long a block of code takes to run. When a single StopWatch is used to monitor several code sections separately, the lap method is the usual choice (it calls stop and then immediately calls start again). A simple tutorial can be found at http://www.blogjava.net/yangpingyu/archive/2012/04/15/374217.html

The MatchAllDocsQuery used here is a Lucene query that matches every document (roughly a SELECT * in SQL terms). The example's code is explained below, right after a minimal StopWatch sketch.
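A minimal sketch of the lap pattern just described; the class name, tags, and sleeps are illustrative only and not taken from the project:

```java
import org.perf4j.LoggingStopWatch;
import org.perf4j.StopWatch;

public class StopWatchDemo {
    public static void main(String[] args) throws InterruptedException {
        // A LoggingStopWatch starts timing when constructed and logs on stop/lap.
        StopWatch watch = new LoggingStopWatch();
        Thread.sleep(100);          // stands in for the first code section
        watch.lap("section.one");   // logs the first section, then restarts the watch
        Thread.sleep(200);          // stands in for the second code section
        watch.stop("section.two");  // logs the second section
    }
}
```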

```java
package de.fhopf.akka;

import org.apache.lucene.index.IndexWriter;

/** Created by admin on 2017/3/14. */
public interface Execution {
    public void downloadAndIndex(String path, IndexWriter writer);
}
```


```java
package de.fhopf.akka;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.perf4j.LoggingStopWatch;
import org.perf4j.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Created by admin on 2017/3/14. */
public class Executor {

    private final Execution execution;
    private final Logger logger = LoggerFactory.getLogger(Executor.class);

    public Executor(Execution execution) {
        this.execution = execution;
    }

    public void execute(String path) {
        IndexWriter writer = null;
        IndexSearcher searcher = null;
        try {
            File indexDir = new File(System.getProperty("java.io.tmpdir"), "akka-index");
            writer = openWriter(indexDir);
            StopWatch stopWatch = new LoggingStopWatch();
            execution.downloadAndIndex(path, writer);
            stopWatch.stop(execution.getClass().getSimpleName());
            searcher = openSearcher(indexDir);
            TopDocs result = searcher.search(new MatchAllDocsQuery(), 100);
            logger.info("Found {} results", result.totalHits);
            for (ScoreDoc scoreDoc : result.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                logger.debug(doc.get("id"));
            }
            searcher.close();
        } catch (Exception ex) {
            logger.error(ex.getMessage(), ex);
            if (writer != null) {
                try {
                    writer.rollback();
                } catch (IOException ex1) {
                    logger.error(ex1.getMessage(), ex1);
                }
            }
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (CorruptIndexException ex) {
                    logger.error(ex.getMessage(), ex);
                } catch (IOException ex) {
                    logger.error(ex.getMessage(), ex);
                }
            }
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException ex) {
                    logger.error(ex.getMessage(), ex);
                }
            }
        }
    }

    private IndexWriter openWriter(File indexDir)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        Directory dir = FSDirectory.open(indexDir);
        IndexWriterConfig config =
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        return new IndexWriter(dir, config);
    }

    private IndexSearcher openSearcher(File indexDir) throws CorruptIndexException, IOException {
        Directory dir = FSDirectory.open(indexDir);
        IndexReader reader = IndexReader.open(dir);
        return new IndexSearcher(reader);
    }
}
```

Executor's basic job is to create the search index in the system temp directory, run the given Execution, match all documents with a MatchAllDocsQuery (returning at most 100 hits), and record via the stop watch how long downloadAndIndex took.
```java
package de.fhopf.akka;

/** Created by admin on 2017/3/14. */
public interface PageRetriever {
    PageContent fetchPageContent(String url);
}
```

```java
package de.fhopf.akka;

import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Created by admin on 2017/3/14. */
public class HtmlParserPageRetriever implements PageRetriever {

    private final String baseUrl;
    private static final Logger logger = LoggerFactory.getLogger(HtmlParserPageRetriever.class);

    public HtmlParserPageRetriever(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    public PageContent fetchPageContent(String url) {
        logger.debug("Fetching {}", url);
        try {
            Parser parser = new Parser(url);
            PageContentVisitor visitor = new PageContentVisitor(baseUrl, url);
            parser.visitAllNodesWith(visitor);
            return visitor.getContent();
        } catch (ParserException ex) {
            throw new IllegalStateException(ex);
        }
    }

    private static class PageContentVisitor extends NodeVisitor {

        private List<String> linksToVisit = new ArrayList<String>();
        private String content;
        private String title;
        private final String baseUrl;
        private final String currentUrl;

        public PageContentVisitor(String baseUrl, String currentUrl) {
            super(true);
            this.baseUrl = baseUrl;
            this.currentUrl = currentUrl;
        }

        @Override
        public void visitTag(Tag tag) {
            if (tag instanceof LinkTag) {
                LinkTag linkTag = (LinkTag) tag;
                if (linkTag.getLink().startsWith(baseUrl) && isProbablyHtml(linkTag.getLink())) {
                    logger.debug("Using link pointing to {}", linkTag.getLink());
                    linksToVisit.add(linkTag.getLink());
                } else {
                    logger.debug("Skipping link pointing to {}", linkTag.getLink());
                }
            } else if (tag instanceof TitleTag) {
                TitleTag titleTag = (TitleTag) tag;
                title = titleTag.getTitle();
            } else if (tag instanceof BodyTag) {
                BodyTag bodyTag = (BodyTag) tag;
                content = bodyTag.toPlainTextString();
            }
        }

        public PageContent getContent() {
            return new PageContent(currentUrl, linksToVisit, title, content);
        }

        private boolean isProbablyHtml(String link) {
            return link.endsWith(".html") || link.endsWith("/");
        }
    }
}
```


HtmlParserPageRetriever parses the HTML and traverses its nodes. Concretely, a visitor walks every HTML node, stores the body as the content and the title tag's text as the title, and collects the links that may be visited later into a List.
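A hypothetical usage sketch of the retriever (the URLs are placeholders, not from the project):

```java
package de.fhopf.akka;

public class RetrieverDemo {
    public static void main(String[] args) {
        // Fetch one page and inspect what the visitor extracted.
        PageRetriever retriever = new HtmlParserPageRetriever("http://www.example.com/");
        PageContent content = retriever.fetchPageContent("http://www.example.com/");
        System.out.println("title: " + content.getTitle());
        System.out.println("links: " + content.getLinksToFollow());
    }
}
```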

Content classes used

```java
package de.fhopf.akka;

import java.util.List;

/** Created by admin on 2017/3/14. */
public class PageContent {

    private final List<String> linksToFollow;
    private final String title;
    private final String content;
    private final String path;

    public PageContent(String path, List<String> linksToFollow, String title, String content) {
        this.path = path;
        this.linksToFollow = linksToFollow;
        this.title = title;
        this.content = content;
    }

    public String getPath() {
        return path;
    }

    public List<String> getLinksToFollow() {
        return linksToFollow;
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }

    @Override
    public String toString() {
        return "PageContent{title=" + title + ", content=" + content
                + ", linksToFollow=" + linksToFollow + "}";
    }
}
```

```java
package de.fhopf.akka;

/** Created by admin on 2017/3/14. */
public interface Indexer {

    void commit();

    void index(PageContent pageContent);

    void close();
}
```


```java
package de.fhopf.akka;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;

/** Created by admin on 2017/3/14. */
public class IndexerImpl implements Indexer {

    private final IndexWriter indexWriter;

    public IndexerImpl(IndexWriter indexWriter) {
        this.indexWriter = indexWriter;
    }

    public void index(PageContent pageContent) {
        try {
            indexWriter.addDocument(toDocument(pageContent));
        } catch (CorruptIndexException ex) {
            throw new IllegalStateException(ex);
        } catch (IOException ex) {
            throw new IllegalStateException(ex);
        }
    }

    private Document toDocument(PageContent content) {
        Document doc = new Document();
        doc.add(new Field("id", content.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("title", content.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("content", content.getContent(), Field.Store.NO, Field.Index.ANALYZED));
        return doc;
    }

    public void commit() {
        try {
            indexWriter.commit();
        } catch (CorruptIndexException ex) {
            throw new IllegalStateException(ex);
        } catch (IOException ex) {
            throw new IllegalStateException(ex);
        }
    }

    public void close() {
        try {
            indexWriter.close(true);
        } catch (CorruptIndexException ex) {
            throw new IllegalStateException(ex);
        } catch (IOException ex) {
            throw new IllegalStateException(ex);
        }
    }
}
```

IndexerImpl first converts the PageContent into a Lucene Document and then writes it to the IndexWriter to build the index. A Document field takes several options, for example:

- Store.NO: the field can be searched, but its content is not stored in the index. (This makes sense: for ordinary text the engine tokenizes and otherwise "deconstructs" the input, and the original, unprocessed text may no longer be needed.)
- Index.NOT_ANALYZED: the field is not analyzed (tokenization, stop-word removal, and similar "deconstruction" steps all happen during analysis).

Among IndexWriter's methods there is commit, which makes the preceding add and delete operations take effect immediately (i.e. become visible to searches). The same issue comes up in Elasticsearch: normally one does not manually force newly added data to become visible at once, but when immediate visibility is needed, Elasticsearch offers the refresh call.
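A minimal sketch of commit visibility in Lucene 3.x; it uses an in-memory RAMDirectory and a throwaway document, neither of which is part of the project:

```java
package de.fhopf.akka;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class CommitVisibilityDemo {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));

        Document doc = new Document();
        doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
        writer.addDocument(doc);
        writer.commit(); // without this, a reader opened now would not see the document

        IndexReader reader = IndexReader.open(dir);
        System.out.println("visible docs: " + reader.numDocs()); // prints 1
        reader.close();
        writer.close();
    }
}
```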

```java
package de.fhopf.akka;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/** Created by admin on 2017/3/14. */
public class VisitedPageStore {

    private Set<String> pagesToVisit = new HashSet<>();
    private Set<String> allPages = new HashSet<>();
    private Set<String> inProgress = new HashSet<>();

    public void add(String page) {
        if (!allPages.contains(page)) {
            pagesToVisit.add(page);
            allPages.add(page);
        }
    }

    public void addAll(Collection<String> pages) {
        for (String page : pages) {
            add(page);
        }
    }

    public void finished(String page) {
        inProgress.remove(page);
    }

    public String getNext() {
        if (pagesToVisit.isEmpty()) {
            return null;
        } else {
            String next = pagesToVisit.iterator().next();
            pagesToVisit.remove(next);
            inProgress.add(next);
            return next;
        }
    }

    public Collection<String> getNextBatch() {
        Set<String> pages = new HashSet<>();
        pages.addAll(pagesToVisit);
        pagesToVisit.clear();
        inProgress.addAll(pages);
        return pages;
    }

    public boolean isFinished() {
        return pagesToVisit.isEmpty() && inProgress.isEmpty();
    }

    @Override
    public String toString() {
        return String.format("inProgress: %1$3s, allPages: %2$3s", inProgress.size(), allPages.size());
    }
}
```

VisitedPageStore stores the pages still to be visited, the ones currently being processed, and all pages seen so far; the code is straightforward.
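A hypothetical walk-through of the store's life cycle (the URLs are placeholders):

```java
package de.fhopf.akka;

public class VisitedPageStoreDemo {
    public static void main(String[] args) {
        VisitedPageStore store = new VisitedPageStore();
        store.add("http://www.example.com/");         // seed URL

        String page = store.getNext();                // moves the seed into inProgress
        store.add("http://www.example.com/a.html");   // newly discovered link
        store.add("http://www.example.com/");         // already known, so a no-op
        store.finished(page);                         // the seed page is done

        System.out.println(store.isFinished());       // false: a.html is still pending
    }
}
```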

The crawler's Akka-based implementations are covered next.


Below is the single-threaded, blocking version of the crawler, which simply treats the whole URL queue as one collection (this part does not involve Akka yet).

One annoying pitfall: when an exception is thrown during the run, the whole crawl terminates (a common case is an encoding exception while traversing nodes, which did occur in testing; a hardening sketch follows the code below).

```java
package de.fhopf.akka.sequential;

import de.fhopf.akka.*;
import de.fhopf.akka.Execution;
import de.fhopf.akka.Executor;
import org.apache.lucene.index.IndexWriter;

/**
 * Indexes pages sequentially.
 * @author Florian Hopf, http://www.florian-hopf.de
 */
public class SequentialExecution implements Execution {

    @Override
    public void downloadAndIndex(String path, IndexWriter writer) {
        VisitedPageStore pageStore = new VisitedPageStore();
        pageStore.add(path);

        Indexer indexer = new IndexerImpl(writer);
        PageRetriever retriever = new HtmlParserPageRetriever(path);

        String page;
        while ((page = pageStore.getNext()) != null) {
            PageContent pageContent = retriever.fetchPageContent(page);
            pageStore.addAll(pageContent.getLinksToFollow());
            indexer.index(pageContent);
            pageStore.finished(page);
        }

        indexer.commit();
    }

    public static void main(String[] args) {
        SequentialExecution execution = new SequentialExecution();
        Executor exec = new Executor(execution);
        exec.execute("http://www.example.com/");
    }
}
```
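As noted above, a single bad page aborts the whole sequential crawl. One way to keep it alive, shown here as a sketch and not part of the original project, is to catch per-page exceptions inside the loop of downloadAndIndex:

```java
// Sketch: replace the while loop in SequentialExecution.downloadAndIndex with a
// per-page try/catch so one failing page does not abort the whole crawl.
String page;
while ((page = pageStore.getNext()) != null) {
    try {
        PageContent pageContent = retriever.fetchPageContent(page);
        pageStore.addAll(pageContent.getLinksToFollow());
        indexer.index(pageContent);
    } catch (RuntimeException ex) {
        // e.g. the IllegalStateException that HtmlParserPageRetriever wraps around a ParserException
        System.err.println("Skipping " + page + ": " + ex.getMessage());
    } finally {
        pageStore.finished(page); // always mark the page done so isFinished() can become true
    }
}
indexer.commit();
```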


Below is the non-parallel Akka version, which splits the work across actors.

```java
package de.fhopf.akka.actor;

/** Created by admin on 2017/3/17. */
public class IndexedMessage {

    public final String path;

    public IndexedMessage(String path) {
        this.path = path;
    }
}
```


```java
package de.fhopf.akka.actor;

import akka.actor.UntypedActor;
import de.fhopf.akka.Indexer;
import de.fhopf.akka.PageContent;

/** Created by admin on 2017/3/17. */
public class IndexingActor extends UntypedActor {

    public static final Object COMMIT_MESSAGE = new Object();
    public static final Object COMMITTED_MESSAGE = new Object();

    private final Indexer indexer;

    public IndexingActor(Indexer indexer) {
        this.indexer = indexer;
    }

    @Override
    public void onReceive(Object o) throws Exception {
        if (o instanceof PageContent) {
            PageContent content = (PageContent) o;
            indexer.index(content);
            getSender().tell(new IndexedMessage(content.getPath()), getSelf());
        } else if (COMMIT_MESSAGE == o) {
            indexer.commit();
            getSender().tell(COMMITTED_MESSAGE, getSelf());
        } else {
            unhandled(o);
        }
    }
}
```

```java
package de.fhopf.akka.actor;

import akka.actor.UntypedActor;
import de.fhopf.akka.PageContent;
import de.fhopf.akka.PageRetriever;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;

/** Created by admin on 2017/3/17. */
public class PageParsingActor extends UntypedActor {

    private final PageRetriever pageRetriever;
    private final Logger logger = LoggerFactory.getLogger(PageParsingActor.class);

    public PageParsingActor(PageRetriever pageRetriever) {
        this.pageRetriever = pageRetriever;
    }

    @Override
    public void onReceive(Object o) throws Exception {
        if (o instanceof String) {
            PageContent content = pageRetriever.fetchPageContent((String) o);
            getSender().tell(content, getSelf());
        } else {
            unhandled(o);
        }
    }

    @Override
    public void preRestart(Throwable reason, Option<Object> message) throws Exception {
        logger.info("Restarting PageParsingActor because of {}", reason.getClass());
        super.preRestart(reason, message);
    }
}
```

```java
package de.fhopf.akka.actor;

import akka.actor.ActorRef;
import akka.actor.UntypedActor;
import de.fhopf.akka.PageContent;
import de.fhopf.akka.VisitedPageStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Created by admin on 2017/3/17. */
public abstract class Master extends UntypedActor {

    private final Logger logger = LoggerFactory.getLogger(Master.class);
    private final VisitedPageStore visitedPageStore = new VisitedPageStore();

    @Override
    public void onReceive(Object message) throws Exception {
        if (message instanceof String) {
            String start = (String) message;
            visitedPageStore.add(start);
            getParser().tell(visitedPageStore.getNext(), getSelf());
        } else if (message instanceof PageContent) {
            PageContent content = (PageContent) message;
            getIndexer().tell(content, getSelf());
            visitedPageStore.addAll(content.getLinksToFollow());
            logger.info(visitedPageStore.toString());
            if (visitedPageStore.isFinished()) {
                getIndexer().tell(IndexingActor.COMMIT_MESSAGE, getSelf());
            } else {
                for (String page : visitedPageStore.getNextBatch()) {
                    getParser().tell(page, getSelf());
                }
            }
        } else if (message instanceof IndexedMessage) {
            IndexedMessage indexedMessage = (IndexedMessage) message;
            visitedPageStore.finished(indexedMessage.path);
            if (visitedPageStore.isFinished()) {
                getIndexer().tell(IndexingActor.COMMIT_MESSAGE, getSelf());
            }
        } else if (message == IndexingActor.COMMITTED_MESSAGE) {
            logger.info("Shutting down, finished");
            getContext().system().shutdown();
        }
    }

    protected abstract ActorRef getIndexer();

    protected abstract ActorRef getParser();
}
```

```java
package de.fhopf.akka.actor;

import akka.actor.ActorRef;
import akka.actor.Props;
import de.fhopf.akka.IndexerImpl;
import de.fhopf.akka.PageRetriever;
import org.apache.lucene.index.IndexWriter;

/** Created by admin on 2017/3/17. */
public class SimpleActorMaster extends Master {

    private final ActorRef indexer;
    private final ActorRef parser;

    public SimpleActorMaster(final PageRetriever pageRetriever, final IndexWriter indexWriter) {
        this.indexer = getContext().actorOf(Props.create(IndexingActor.class, new IndexerImpl(indexWriter)));
        this.parser = getContext().actorOf(Props.create(PageParsingActor.class, pageRetriever));
    }

    @Override
    protected ActorRef getParser() {
        return parser;
    }

    @Override
    protected ActorRef getIndexer() {
        return indexer;
    }
}
```

```java
package de.fhopf.akka.actor;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import de.fhopf.akka.Execution;
import de.fhopf.akka.Executor;
import de.fhopf.akka.HtmlParserPageRetriever;
import org.apache.lucene.index.IndexWriter;

/** Created by admin on 2017/3/17. */
public class SimpleActorExecution implements Execution {

    @Override
    public void downloadAndIndex(final String path, final IndexWriter writer) {
        ActorSystem actorSystem = ActorSystem.create();
        ActorRef master = actorSystem.actorOf(
                Props.create(SimpleActorMaster.class, new HtmlParserPageRetriever(path), writer));
        master.tell(path, actorSystem.guardian());
        actorSystem.awaitTermination();
    }

    public static void main(String[] args) {
        SimpleActorExecution execution = new SimpleActorExecution();
        Executor exec = new Executor(execution);
        exec.execute("http://www.example.com/");
    }
}
```

This version decomposes the former sequential logic into a PageParsingActor responsible for parsing pages, an IndexingActor responsible for writing to the search index, and a Master that distributes the work, achieving a non-blocking separation of tasks. It is driven via SimpleActorExecution.

Implementing parallelism in Akka is then very simple: the version below uses a Router to dispatch the work directly to 20 routees. For a short explanation of routers see https://rockeyhoo.gitbooks.io/akka_essentials/content/ppt/lu_you_qi.html. Compared with the single-threaded version, the speed-up of this implementation is striking.

```java
package de.fhopf.akka.actor.parallel;

import akka.actor.ActorRef;
import akka.actor.Props;
import akka.routing.RoundRobinRouter;
import de.fhopf.akka.Indexer;
import de.fhopf.akka.PageRetriever;
import de.fhopf.akka.actor.IndexingActor;
import de.fhopf.akka.actor.Master;
import de.fhopf.akka.actor.PageParsingActor;

/** Created by admin on 2017/3/17. */
public class ParallelMaster extends Master {

    private final ActorRef parser;
    private final ActorRef indexingActor;

    public ParallelMaster(final Indexer indexer, final PageRetriever pageRetriever) {
        parser = getContext().actorOf(
                Props.create(PageParsingActor.class, pageRetriever).withRouter(new RoundRobinRouter(20)));
        indexingActor = getContext().actorOf(Props.create(IndexingActor.class, indexer));
    }

    @Override
    protected ActorRef getIndexer() {
        return indexingActor;
    }

    @Override
    protected ActorRef getParser() {
        return parser;
    }
}
```
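The routee count (20 here) is hard-coded. Since page fetching is I/O-bound, a count well above the number of CPU cores can still pay off; a hypothetical variant of the constructor above that takes the count as a parameter (not part of the original project) could look like this:

```java
// Hypothetical variant of ParallelMaster's constructor: the routee count becomes
// a parameter instead of the hard-coded 20. Everything else stays as above.
public ParallelMaster(final Indexer indexer, final PageRetriever pageRetriever, final int routees) {
    parser = getContext().actorOf(
            Props.create(PageParsingActor.class, pageRetriever)
                 .withRouter(new RoundRobinRouter(routees)));
    indexingActor = getContext().actorOf(Props.create(IndexingActor.class, indexer));
}
```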

```java
package de.fhopf.akka.actor.parallel;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import de.fhopf.akka.Execution;
import de.fhopf.akka.Executor;
import de.fhopf.akka.HtmlParserPageRetriever;
import de.fhopf.akka.IndexerImpl;
import org.apache.lucene.index.IndexWriter;

/** Created by admin on 2017/3/17. */
public class FetchInParallelExecution implements Execution {

    @Override
    public void downloadAndIndex(final String path, final IndexWriter writer) {
        ActorSystem actorSystem = ActorSystem.create();
        ActorRef master = actorSystem.actorOf(
                Props.create(ParallelMaster.class, new IndexerImpl(writer), new HtmlParserPageRetriever(path)));
        master.tell(path, actorSystem.guardian());
        actorSystem.awaitTermination();
    }

    public static void main(String[] args) {
        FetchInParallelExecution execution = new FetchInParallelExecution();
        Executor exec = new Executor(execution);
        exec.execute("http://www.example.com/");
    }
}
```

The examples above show how simple and efficient it is to implement parallelism with Akka.
For a simple application of this in machine learning, see the follow-up post "Akka 在Bagging投票算法中的简单应用" (applying Akka to a Bagging voting algorithm).
