Implementing a Lucene-based MapReduce indexing program
I previously built a MapReduce indexing program on top of SolrJ, but its performance was not satisfactory. Since Solr's underlying engine is Lucene, I rewrote the program directly against the matching Lucene version, and with the same analyzer the throughput improved by a factor of 30 to 40. Experiments confirmed that the indexes it produces are recognized by SolrCloud. The job uses only the map phase; the files it generates then go through one final local merge to form the finished index.
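Because the job is map-only, the driver only needs to configure the mapper, pass the cluster settings through the job configuration, and set zero reducers. The following is a minimal, hedged sketch of such a driver: the property names ZK_HOST, APP_ID, HDFS_FLOADER and INDEX_PATH come from the mapper code below, while the class names and argument order are assumptions of mine, not the original driver.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class IndexJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("ZK_HOST", args[0]);       // SolrCloud ZooKeeper address
        conf.set("APP_ID", args[1]);        // target collection / application id
        conf.set("HDFS_FLOADER", args[2]);  // fs.defaultFS of the HDFS that holds the index
        conf.set("INDEX_PATH", args[3]);    // HDFS directory for the per-shard indexes

        Job job = Job.getInstance(conf, "lucene-mr-index");
        job.setJarByClass(IndexJobDriver.class);
        job.setMapperClass(TestMapper.class);             // the mapper shown below
        job.setNumReduceTasks(0);                         // map-only: no reduce phase
        job.setOutputFormatClass(NullOutputFormat.class); // the mapper writes Lucene segments itself
        FileInputFormat.addInputPath(job, new Path(args[4]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}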
Below is part of the main program code:
public static class TestMapper extends Mapper<LongWritable, Text, Text, Text> {
    private static final Logger LOG = Logger.getLogger(TestMapper.class);
    RunInfo runInfo = null;

    /**
     * Preparation work:
     * 1. fetch the SolrCloud DocCollection, so we know which shard each document belongs to;
     * 2. initialize one list per shard to hold its documents.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        CreateIndexWriter createIndexWriter = new CreateIndexWriter();
        try {
            runInfo = createIndexWriter.create(context.getConfiguration().get("ZK_HOST"),
                    context.getConfiguration().get("APP_ID"), true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Iterate over the input, compute the shard each document belongs to,
     * and put the document into the corresponding list.
     */
    @Override
    public void map(LongWritable key, Text columns, Context context)
            throws IOException, InterruptedException {
        InputData inputData = null;
        try {
            inputData = DocumentUtil.parseLog(columns);
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        if (inputData != null) {
            String id = inputData.getId().toString();
            int sliceHash = DocumentUtil.sliceHash(id);
            Slice slice = DocumentUtil.hashToSlice(sliceHash, runInfo.getDocCollection());
            String shardName = slice.getName(); // shard1, shard2, ...
            Document doc = null;
            try {
                doc = DocumentUtil.getDocument(inputData);
            } catch (Exception e) {
                e.printStackTrace();
            }
            runInfo.getShardDocs().get(shardName).add(doc);
        }
    }

    /**
     * Flush each shard's list of documents and write them to the HDFS directory.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            AddDoc addDoc = new AddDoc(runInfo,
                    context.getConfiguration().get("HDFS_FLOADER"),
                    context.getConfiguration().get("INDEX_PATH"));
            addDoc.write(runInfo.getShardDocs().size());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
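For SolrCloud to recognize the offline index, every document must land in the shard that SolrCloud's own router would pick for it. DocumentUtil.sliceHash and hashToSlice are not shown in this post; the following is only a sketch of what they could look like, assuming plain (non-composite) ids and reusing Solr's murmur3 hash so the offline routing matches the cluster's.

import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.util.Hash;

public class DocumentUtilSketch {

    // Same hash SolrCloud's CompositeIdRouter applies to a simple id (no "!" routing key).
    public static int sliceHash(String id) {
        return Hash.murmurhash3_x86_32(id, 0, id.length(), 0);
    }

    // Find the slice whose hash range covers the computed hash.
    public static Slice hashToSlice(int hash, DocCollection collection) {
        for (Slice slice : collection.getActiveSlices()) {
            if (slice.getRange() != null && slice.getRange().includes(hash)) {
                return slice;
            }
        }
        throw new IllegalStateException("No slice covers hash " + hash);
    }
}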
To further improve indexing performance, addDoc.write uses multiple threads to write several shards concurrently. The code is as follows:
public void write(int num) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(num);
    ArrayList<Future<Object>> get = new ArrayList<>();
    String random = getRandom();
    for (int i = 0; i < num; i++) {
        Log.info("indexMrLog shard" + i + ":start write");
        Log.info("indexMrLog shard" + i + " num is:"
                + runInfo.getShardDocs().get("shard" + (i + 1)).size());
        addCallable callable = new addCallable(
                runInfo.getShardDocs().get("shard" + (i + 1)), (i + 1), random);
        get.add(pool.submit(callable));
    }
    for (Future<Object> future : get) {
        try {
            future.get();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

class addCallable implements Callable<Object> {
    ArrayList<Document> documents;
    int num;
    String code;

    addCallable(ArrayList<Document> documents, int num, String code) {
        this.num = num;
        this.documents = documents;
        this.code = code;
    }

    @Override
    public Object call() throws Exception {
        try {
            IndexWriter indexWriter = getIndexWriter(HDFS_FOLADER, INDEX_PATH, num, code);
            runInfo.getIndexWriters().add(indexWriter);
            indexWriter.addDocuments(documents);
            indexWriter.commit();
            LOG.info("index writer:" + code + " has committed size " + documents.size());
            // indexWriter.close();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }
}
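The getIndexWriter helper called in addCallable is not shown in the post either. Here is a hedged sketch of it, assuming each map task writes its portion of shardN into its own sub-directory INDEX_PATH/shardN/<code>/ (that layout is my inference from the combine step below, not the original code), and using a placeholder analyzer; in practice the analyzer must match the Solr schema, since the whole point is to use "the same analyzer" as the collection.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.solr.store.hdfs.HdfsDirectory;

public class IndexWriterFactory {
    // Opens an IndexWriter on INDEX_PATH/shard<shardNum>/<code>/ in HDFS (assumed layout).
    public static IndexWriter getIndexWriter(String hdfsFolder, String indexPath,
            int shardNum, String code) throws IOException {
        Configuration hdfsConf = new Configuration();
        hdfsConf.set("fs.defaultFS", hdfsFolder);             // e.g. hdfs://nameservice1 (assumption)
        Path shardDir = new Path(indexPath + "/shard" + shardNum + "/" + code);
        HdfsDirectory directory = new HdfsDirectory(shardDir, hdfsConf);
        IndexWriterConfig iwc = new IndexWriterConfig(new KeywordAnalyzer()); // placeholder analyzer
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);   // each map task creates a fresh sub-index
        iwc.setUseCompoundFile(false);
        return new IndexWriter(directory, iwc);
    }
}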
After the MapReduce job finishes, a final program merges all the folders under each shard once to produce the final index files. The code is as follows:
public void startCombine(String HDFS_FLOADER, String INDEX_PATH, int num) throws Exception {
    this.HDFS_FLOADER = HDFS_FLOADER;
    this.INDEX_PATH = INDEX_PATH;
    Configuration hdfsconf = new Configuration();
    hdfsconf.set("fs.hdfs.impl.disable.cache", "false");
    hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
    FileSystem fs = FileSystem.get(hdfsconf);
    Path path = new Path(INDEX_PATH); // index directory
    FileStatus[] files = fs.listStatus(path);
    BlockingQueue<Runnable> workQueue = new ArrayBlockingQueue<>(100);
    ThreadPoolExecutor pool = new ThreadPoolExecutor(40, 43, 1, TimeUnit.MINUTES, workQueue);
    ArrayList<Future<Object>> futures = new ArrayList<Future<Object>>();
    for (FileStatus file : files) {
        if (!file.getPath().getName().startsWith("shard")) {
            continue;
        }
        String dest = file.getPath().getParent().toString() + "/combine/"
                + file.getPath().getName();
        System.out.println("indexMrLog dest:" + dest);
        Callable<Object> callable = new combineCallable(file.getPath().toString(), dest, num);
        futures.add(pool.submit(callable));
    }
    for (Future<Object> future : futures) {
        try {
            LOG.info("indexMrLog: " + future.get());
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

private class combineCallable implements Callable<Object> {
    String source;
    String dest;
    int num;

    public combineCallable(String source, String dest, int num) {
        this.source = source;
        this.dest = dest;
        this.num = num;
    }

    @Override
    public Object call() {
        try {
            Configuration hdfsconf = new Configuration();
            hdfsconf.set("fs.hdfs.impl.disable.cache", "true");
            hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
            FileSystem destFS = FileSystem.get(hdfsconf);
            Path path = new Path(dest);
            destFS.mkdirs(path);
            HdfsDirectory d = new HdfsDirectory(path, hdfsconf);
            IndexWriterConfig conf = new IndexWriterConfig(new KeywordAnalyzer());
            conf.setUseCompoundFile(false);
            conf.setRAMBufferSizeMB(25000);
            conf.setMaxBufferedDocs(5000000);
            conf.setCommitOnClose(false);
            LogMergePolicy logMergePolicy = new LogDocMergePolicy();
            logMergePolicy.setMergeFactor(num);
            logMergePolicy.setMaxMergeDocs(5000000);
            conf.setMergePolicy(logMergePolicy);
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter indexWriter = new IndexWriter(d, conf);
            indexWriters.add(indexWriter);
            System.out.println("source:" + source);
            FileStatus[] files = destFS.listStatus(new Path(source));
            // collect only the sub-index directories; skipping plain files must not leave
            // null slots, so use a list instead of a fixed-size array
            ArrayList<HdfsDirectory> hdfsDirectories = new ArrayList<>();
            for (FileStatus file : files) {
                try {
                    if (!file.isDirectory()) {
                        continue;
                    }
                    String pathString = file.getPath().toString();
                    System.out.println("add index file:" + pathString);
                    Configuration tempHdfsconf = new Configuration(hdfsconf);
                    tempHdfsconf.set("fs.hdfs.impl.disable.cache", "true");
                    hdfsDirectories.add(new HdfsDirectory(new Path(pathString), tempHdfsconf));
                    FileSystem fSystem = FileSystem.get(tempHdfsconf);
                    System.out.println(pathString + "/write.lock");
                    // remove stale lock files left behind by the map tasks before merging
                    fSystem.delete(new Path(pathString + "/write.lock"), true);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            indexWriter.addIndexes(hdfsDirectories.toArray(new HdfsDirectory[0]));
            indexWriter.forceMerge(num);
            indexWriter.commit();
            indexWriter.close();
            return source + " success";
        } catch (Exception e) {
            e.printStackTrace();
            return source + " failed";
        }
    }
}
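For completeness, a hedged usage sketch of the combine step. IndexCombiner is a hypothetical name for the class that holds startCombine; only the startCombine(HDFS_FLOADER, INDEX_PATH, num) signature comes from the code above. Note that num is used both as the LogDocMergePolicy merge factor and as the forceMerge target, so it has to be at least 2.

public class CombineMain {
    public static void main(String[] args) throws Exception {
        IndexCombiner combiner = new IndexCombiner();    // hypothetical wrapper class
        // merge every INDEX_PATH/shardN/* sub-index into INDEX_PATH/combine/shardN
        combiner.startCombine(
                "hdfs://nameservice1",   // HDFS_FLOADER: fs.defaultFS of the index cluster (assumption)
                "/user/solr/mr-index",   // INDEX_PATH: directory holding shard1, shard2, ... (assumption)
                2);                      // num: merge factor and forceMerge target, must be >= 2
    }
}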