Implementing a Lucene-based MapReduce indexing program
I previously built a MapReduce indexing program on top of SolrJ, but its performance was not satisfactory. Since Solr's underlying engine is Lucene, I rewrote the program directly against the matching Lucene version, and with the same analyzer the throughput improved by a factor of 30 to 40. Experiments confirmed that the indexes it produces are recognized by SolrCloud. The job uses only the map phase; the files it generates then go through one final local merge to form the finished index.
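Because the job is map-only, the driver only needs to configure the mapper, pass the cluster settings through the job configuration, and set zero reducers. The following is a minimal, hedged sketch of such a driver: the property names ZK_HOST, APP_ID, HDFS_FLOADER and INDEX_PATH come from the mapper code below, while the class names and argument order are assumptions of mine, not the original driver.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class IndexJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("ZK_HOST", args[0]);       // SolrCloud ZooKeeper address
        conf.set("APP_ID", args[1]);        // target collection / application id
        conf.set("HDFS_FLOADER", args[2]);  // fs.defaultFS of the HDFS that holds the index
        conf.set("INDEX_PATH", args[3]);    // HDFS directory for the per-shard indexes

        Job job = Job.getInstance(conf, "lucene-mr-index");
        job.setJarByClass(IndexJobDriver.class);
        job.setMapperClass(TestMapper.class);             // the mapper shown below
        job.setNumReduceTasks(0);                         // map-only: no reduce phase
        job.setOutputFormatClass(NullOutputFormat.class); // the mapper writes Lucene segments itself
        FileInputFormat.addInputPath(job, new Path(args[4]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}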
Below is part of the main program code:
public static class TestMapper extends Mapper<LongWritable, Text, Text, Text> {
    private static final Logger LOG = Logger.getLogger(TestMapper.class);
    RunInfo runInfo = null;

    /**
     * Preparation work:
     * 1. fetch the SolrCloud DocCollection, so we know which shard each document belongs to;
     * 2. initialize one list per shard to hold its documents.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        CreateIndexWriter createIndexWriter = new CreateIndexWriter();
        try {
            runInfo = createIndexWriter.create(context.getConfiguration().get("ZK_HOST"),
                    context.getConfiguration().get("APP_ID"), true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Iterate over the input, compute the shard each document belongs to,
     * and put the document into the corresponding list.
     */
    @Override
    public void map(LongWritable key, Text columns, Context context)
            throws IOException, InterruptedException {
        InputData inputData = null;
        try {
            inputData = DocumentUtil.parseLog(columns);
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        if (inputData != null) {
            String id = inputData.getId().toString();
            int sliceHash = DocumentUtil.sliceHash(id);
            Slice slice = DocumentUtil.hashToSlice(sliceHash, runInfo.getDocCollection());
            String shardName = slice.getName(); // shard1, shard2, ...
            Document doc = null;
            try {
                doc = DocumentUtil.getDocument(inputData);
            } catch (Exception e) {
                e.printStackTrace();
            }
            runInfo.getShardDocs().get(shardName).add(doc);
        }
    }

    /**
     * Flush each shard's list of documents and write them to the HDFS directory.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        try {
            AddDoc addDoc = new AddDoc(runInfo,
                    context.getConfiguration().get("HDFS_FLOADER"),
                    context.getConfiguration().get("INDEX_PATH"));
            addDoc.write(runInfo.getShardDocs().size());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
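For SolrCloud to recognize the offline index, every document must land in the shard that SolrCloud's own router would pick for it. DocumentUtil.sliceHash and hashToSlice are not shown in this post; the following is only a sketch of what they could look like, assuming plain (non-composite) ids and reusing Solr's murmur3 hash so the offline routing matches the cluster's.

import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.util.Hash;

public class DocumentUtilSketch {

    // Same hash SolrCloud's CompositeIdRouter applies to a simple id (no "!" routing key).
    public static int sliceHash(String id) {
        return Hash.murmurhash3_x86_32(id, 0, id.length(), 0);
    }

    // Find the slice whose hash range covers the computed hash.
    public static Slice hashToSlice(int hash, DocCollection collection) {
        for (Slice slice : collection.getActiveSlices()) {
            if (slice.getRange() != null && slice.getRange().includes(hash)) {
                return slice;
            }
        }
        throw new IllegalStateException("No slice covers hash " + hash);
    }
}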
To further improve indexing performance, addDoc.write uses multiple threads to write several shards concurrently. The code is as follows:
public void write(int num) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(num);
    ArrayList<Future<Object>> get = new ArrayList<>();
    String random = getRandom();
    for (int i = 0; i < num; i++) {
        Log.info("indexMrLog shard" + i + ":start write");
        Log.info("indexMrLog shard" + i + " num is:"
                + runInfo.getShardDocs().get("shard" + (i + 1)).size());
        addCallable callable = new addCallable(
                runInfo.getShardDocs().get("shard" + (i + 1)), (i + 1), random);
        get.add(pool.submit(callable));
    }
    for (Future<Object> future : get) {
        try {
            future.get();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

class addCallable implements Callable<Object> {
    ArrayList<Document> documents;
    int num;
    String code;

    addCallable(ArrayList<Document> documents, int num, String code) {
        this.num = num;
        this.documents = documents;
        this.code = code;
    }

    @Override
    public Object call() throws Exception {
        try {
            IndexWriter indexWriter = getIndexWriter(HDFS_FOLADER, INDEX_PATH, num, code);
            runInfo.getIndexWriters().add(indexWriter);
            indexWriter.addDocuments(documents);
            indexWriter.commit();
            LOG.info("index writer:" + code + " has committed size " + documents.size());
            // indexWriter.close();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }
}
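The getIndexWriter helper called in addCallable is not shown in the post either. Here is a hedged sketch of it, assuming each map task writes its portion of shardN into its own sub-directory INDEX_PATH/shardN/<code>/ (that layout is my inference from the combine step below, not the original code), and using a placeholder analyzer; in practice the analyzer must match the Solr schema, since the whole point is to use "the same analyzer" as the collection.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.solr.store.hdfs.HdfsDirectory;

public class IndexWriterFactory {
    // Opens an IndexWriter on INDEX_PATH/shard<shardNum>/<code>/ in HDFS (assumed layout).
    public static IndexWriter getIndexWriter(String hdfsFolder, String indexPath,
            int shardNum, String code) throws IOException {
        Configuration hdfsConf = new Configuration();
        hdfsConf.set("fs.defaultFS", hdfsFolder);             // e.g. hdfs://nameservice1 (assumption)
        Path shardDir = new Path(indexPath + "/shard" + shardNum + "/" + code);
        HdfsDirectory directory = new HdfsDirectory(shardDir, hdfsConf);
        IndexWriterConfig iwc = new IndexWriterConfig(new KeywordAnalyzer()); // placeholder analyzer
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);   // each map task creates a fresh sub-index
        iwc.setUseCompoundFile(false);
        return new IndexWriter(directory, iwc);
    }
}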
After the MapReduce job finishes, a final program merges all the folders under each shard once to produce the final index files. The code is as follows:
public void startCombine(String HDFS_FLOADER, String INDEX_PATH, int num) throws Exception {
    this.HDFS_FLOADER = HDFS_FLOADER;
    this.INDEX_PATH = INDEX_PATH;
    Configuration hdfsconf = new Configuration();
    hdfsconf.set("fs.hdfs.impl.disable.cache", "false");
    hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
    FileSystem fs = FileSystem.get(hdfsconf);
    Path path = new Path(INDEX_PATH); // index directory
    FileStatus[] files = fs.listStatus(path);
    BlockingQueue<Runnable> workQueue = new ArrayBlockingQueue<>(100);
    ThreadPoolExecutor pool = new ThreadPoolExecutor(40, 43, 1, TimeUnit.MINUTES, workQueue);
    ArrayList<Future<Object>> futures = new ArrayList<Future<Object>>();
    for (FileStatus file : files) {
        if (!file.getPath().getName().startsWith("shard")) {
            continue;
        }
        String dest = file.getPath().getParent().toString() + "/combine/"
                + file.getPath().getName();
        System.out.println("indexMrLog dest:" + dest);
        Callable<Object> callable = new combineCallable(file.getPath().toString(), dest, num);
        futures.add(pool.submit(callable));
    }
    for (Future<Object> future : futures) {
        try {
            LOG.info("indexMrLog: " + future.get());
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ExecutionException e) {
            e.printStackTrace();
        }
    }
    pool.shutdown();
}

private class combineCallable implements Callable<Object> {
    String source;
    String dest;
    int num;

    public combineCallable(String source, String dest, int num) {
        this.source = source;
        this.dest = dest;
        this.num = num;
    }

    @Override
    public Object call() {
        try {
            Configuration hdfsconf = new Configuration();
            hdfsconf.set("fs.hdfs.impl.disable.cache", "true");
            hdfsconf.set("fs.defaultFS", HDFS_FLOADER); // HDFS root
            FileSystem destFS = FileSystem.get(hdfsconf);
            Path path = new Path(dest);
            destFS.mkdirs(path);
            HdfsDirectory d = new HdfsDirectory(path, hdfsconf);
            IndexWriterConfig conf = new IndexWriterConfig(new KeywordAnalyzer());
            conf.setUseCompoundFile(false);
            conf.setRAMBufferSizeMB(25000);
            conf.setMaxBufferedDocs(5000000);
            conf.setCommitOnClose(false);
            LogMergePolicy logMergePolicy = new LogDocMergePolicy();
            logMergePolicy.setMergeFactor(num);
            logMergePolicy.setMaxMergeDocs(5000000);
            conf.setMergePolicy(logMergePolicy);
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter indexWriter = new IndexWriter(d, conf);
            indexWriters.add(indexWriter);
            System.out.println("source:" + source);
            FileStatus[] files = destFS.listStatus(new Path(source));
            // collect only the sub-index directories; skipping plain files must not leave
            // null slots, so use a list instead of a fixed-size array
            ArrayList<HdfsDirectory> hdfsDirectories = new ArrayList<>();
            for (FileStatus file : files) {
                try {
                    if (!file.isDirectory()) {
                        continue;
                    }
                    String pathString = file.getPath().toString();
                    System.out.println("add index file:" + pathString);
                    Configuration tempHdfsconf = new Configuration(hdfsconf);
                    tempHdfsconf.set("fs.hdfs.impl.disable.cache", "true");
                    hdfsDirectories.add(new HdfsDirectory(new Path(pathString), tempHdfsconf));
                    FileSystem fSystem = FileSystem.get(tempHdfsconf);
                    System.out.println(pathString + "/write.lock");
                    // remove stale lock files left behind by the map tasks before merging
                    fSystem.delete(new Path(pathString + "/write.lock"), true);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            indexWriter.addIndexes(hdfsDirectories.toArray(new HdfsDirectory[0]));
            indexWriter.forceMerge(num);
            indexWriter.commit();
            indexWriter.close();
            return source + " success";
        } catch (Exception e) {
            e.printStackTrace();
            return source + " failed";
        }
    }
}
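For completeness, a hedged usage sketch of the combine step. IndexCombiner is a hypothetical name for the class that holds startCombine; only the startCombine(HDFS_FLOADER, INDEX_PATH, num) signature comes from the code above. Note that num is used both as the LogDocMergePolicy merge factor and as the forceMerge target, so it has to be at least 2.

public class CombineMain {
    public static void main(String[] args) throws Exception {
        IndexCombiner combiner = new IndexCombiner();    // hypothetical wrapper class
        // merge every INDEX_PATH/shardN/* sub-index into INDEX_PATH/combine/shardN
        combiner.startCombine(
                "hdfs://nameservice1",   // HDFS_FLOADER: fs.defaultFS of the index cluster (assumption)
                "/user/solr/mr-index",   // INDEX_PATH: directory holding shard1, shard2, ... (assumption)
                2);                      // num: merge factor and forceMerge target, must be >= 2
    }
}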