利用lucene对整个数据库建立索引

来源:互联网 发布:全文期刊数据库 编辑:程序博客网 时间:2024/05/17 23:29

利用lucene对整个数据库建立索引 

导言:
如果要对整个数据库做精确查询或模糊查询,我们怎么才可以做到?还是通过SQL查询吗?答案是否定的。因为,通过SQL对整个数据库做精确查询或模糊查询,速度将非常的慢;
lucene解决了这个问题。通过对表或者文本文件预先建立索引,可以很快的实现全文检索。
思路:
1、通过SQL得到所有表名的集合---->2、遍历所有的表,分别为每个表的每个记录建立索引;同时添加表的中文名以及表的说明的索引---->按Writer\analyzer\document\field的循序建索引。
IndexerDB类:

package com.model;import java.io.File;import java.sql.ResultSet;import java.sql.ResultSetMetaData;import java.util.Date;import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;public class IndexerDB { // 保存索引文件的地方 private static String INDEX_DIR = "F:\\MyLuceneDB2\\LuceneFileIndexDir"; private String index_dir; private File file = new File(INDEX_DIR); // 将要搜索TXT文件的地方 private String data_dir; private String DATA_DIR = "F:\\Lucene"; private String entityName; // private ApiInfo apiInfo; // private ApiInfoQuery apiInfoQuery = new ApiInfoQuery(); // private ApiInfoService apiInfoService; // private List<ApiIndicator> apiIndicators = new ArrayList<ApiIndicator>(); // private ApiIndicator apiIndicator; private StringBuilder newsb = new StringBuilder(); private ResultSet rs = null; private ResultSet tempRs = null; private ResultSetMetaData rsmd = null; // private SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer( // Version.LUCENE_30, true); private Document doc; private String tableName; public static void main(String[] args) throws Exception { IndexerDB indexDB = new IndexerDB(); File file = new File(INDEX_DIR); // 1 indexDB.execute(file); } // 执行对数据库的索引化 // @param file // @return // @throws Exception // public String execute(File file) throws Exception { long start = new Date().getTime(); // 2 int numIndexed = getAllTableNameFromDBAndIndexing(file); long end = new Date().getTime(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); return "success"; } // 查询所有的表,并遍历所有的表; // 调用getDataFromTable(tableName)获取表的记录; // 调用indexData(writer, rs, tableName)对单张表做索引 public int getAllTableNameFromDBAndIndexing(File file) throws Exception { ResultSet rs = null; String sql1 = null; String sql2 = null; sql1 = "select [name] from [sysobjects] where [type] = 'u' order by [name]"; sql2 = "show tables"; SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer( Version.LUCENE_36, true); IndexWriter writer = new IndexWriter(FSDirectory.open(file), analyzer, true, IndexWriter.MaxFieldLength.LIMITED); try { rs = JDBCUtil.execute(sql1); } catch (Exception e) { rs = JDBCUtil.execute(sql2); } while (rs.next()) { tableName = rs.getString(1); System.out.println("tableName: " + rs.getString(1)); // ResultSetMetaData rsmd = rs.getMetaData(); ResultSet tempRs = getDataFromTable(tableName); System.out.println("already get data of table"); indexData(writer, tempRs, tableName); } int numIndexed = writer.numDocs(); writer.optimize(); writer.close(); return numIndexed; } // 取得表的数据 public ResultSet getDataFromTable(String tableName) throws Exception { String sql = "select * from " + tableName; System.out.println("" + sql); // String sql = "select * from " + "HK_STKCODE"; return JDBCUtil.execute(sql); } // 对特定表的记录,采用特定writer,索引 // 该方法是lucene的indexer的关键方法 // writer---->document---->field private void indexData(IndexWriter writer, ResultSet rs, String tableName) throws Exception { if (rs == null) { return; } // 取得实体名 // System.out.println("tableName: " + tableName); entityName = getEntityName(tableName); System.out.println(" .entityName: " + entityName); while (rs.next()) { doc = new Document(); rsmd = rs.getMetaData(); int colsNum = rsmd.getColumnCount(); // System.out.println("colsNum: "+colsNum); for (int i = 1; i < colsNum + 1; i++) { String columnName = rsmd.getColumnName(i); // System.out.println("columnName: "+columnName); // System.out.println(" rs.getString(i): "+ rs.getString(i)); // 1、对该条记录的第一个字段进行索引 doc.add(new Field(columnName, (rs.getString(i) == null ? "" : rs.getString(i)), Field.Store.YES, Field.Index.ANALYZED)); } // 2、在此处添加,表的说明(description);来至于apiinfo;根据表对应的实体名,找到表对应的apiinfo实体 // apiInfoQuery.setEntityname(entityName); // apiInfo = apiInfoService.gainApiInfoByEntityName(entityName); // if (apiInfo != null) { // doc.add(new Field("apiDesc", apiInfo.toString(), Field.Store.YES, // Field.Index.ANALYZED)); // } // 3、在此处添加,表的中英文字段: // 来自于apiindicator;根据表对应的实体名,去APIINFO中取找apiid,再去apiindicator中找apiindicator的实体 // List<ApiIndicator> apiIndicators = apiInfoService // .gainApiIndicatorsByApiId(apiInfo.getId()); // // 输出指标和描述结合为字符串 // for (int i = 0; i < apiIndicators.size(); i++) { // apiIndicator = apiIndicators.get(i); // newsb.append(apiIndicator.getCnname() + " ") // .append(apiIndicator.getEnname() + " ") // .append(apiIndicator.getDescription() + ""); // newsb.append("\n"); // } // System.out.println("newsb.toString(): " + newsb.toString()); // doc.add(new Field("outputFields", newsb.toString(), // Field.Store.YES, Field.Index.ANALYZED)); // 4、文件名 doc.add(new Field("apiName", entityName, Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); } } // 将表名通过字符串处理成为实体名 public String getEntityName(String tableName) { // tableName = tableName.substring(0, tableName.indexOf(".txt")); StringBuilder sb = new StringBuilder(); // 将"_"替换掉,如果有的话,以便处理成与API对应的实体一样的字符串。如:PUB_SEITGC.txt,RSH_RSHRPT_INDFO.txt if (tableName.indexOf("_") != -1) { String[] subStrings = tableName.split("_"); for (int i = 0; i < subStrings.length; i++) { sb.append(subStrings[i]); } return sb.toString(); } else { return tableName; } } public String getIndex_dir() { return index_dir; } public void setIndex_dir(String index_dir) { this.index_dir = index_dir; } public String getData_dir() { return data_dir; } public void setData_dir(String data_dir) { this.data_dir = data_dir; } // public ApiInfoService getApiInfoService() { // return apiInfoService; // } // // public void setApiInfoService(ApiInfoService apiInfoService) { // this.apiInfoService = apiInfoService; // }}


JDBCUtil类:

// 数据库查询类//JDBCUtil用于取得连接;JDBCUtil.execute(sql)用于执行SQL,并返回resultset.package com.model;import java.sql.Connection;import java.sql.DriverManager;import java.sql.ResultSet;import java.sql.SQLException;import java.sql.Statement;public class JDBCUtil { public static Connection conn = null; // 建立连接 static void getConntion() { try { String driver_class = "com.mysql.jdbc.Driver"; // ?useUnicode=true&characterEncoding=utf-8 String connection_url = "jdbc:mysql://localhost:3306/sy001?useUnicode=true&characterEncoding=UTF-8"; String user_name = "root"; String db_password = "0882115303"; Class.forName(driver_class); conn = DriverManager.getConnection(connection_url, user_name, db_password); conn.setAutoCommit(false); } catch (Exception e) { e.printStackTrace(); } } // 查询后,得到数据 public static ResultSet execute(String sql) throws Exception { // 取得连接 getConntion(); // 写SQL // 得到一个statment对象 Statement stmt = conn.createStatement(); // 得到一个结果集 return stmt.executeQuery(sql); } static void close() { if (conn != null) { try { conn.close(); } catch (SQLException e) { e.printStackTrace(); } } } public static Connection getConn() { return conn; }}