第二节 Elasticsearch加入中文分词器IK

来源:互联网 发布:华三交换机端口镜像 编辑:程序博客网 时间:2024/05/18 17:58
一、简介
Elasticsearch 内置的分词器是 standard,对英文分词还好,但对中文的支持就比较弱,所以需要额外引入一个中文分词器。目前比较流行的中文分词器有:IKAnalyzer、MMSeg4j、Paoding 等等。此次引入的是 IKAnalyzer。

二、下载和安装
1、IK下载地址:https://github.com/medcl/elasticsearch-analysis-ik ,需要根据 Elasticsearch 的版本号找到相应的 IK 版本。或者从
https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.3.0/elasticsearch-analysis-ik-5.3.0.zip 直接下载对应的版本。
2、解压后进入 /elasticsearch-analysis-ik-5.3.0 目录,在此目录下执行 mvn clean package 进行打包(此过程需要从 maven 上下载相关的 jar,所以时间会有点长)。
3、打完包后找到elasticsearch-analysis-ik-5.3.0\target\releases\elasticsearch-analysis-ik-5.3.0.zip并将此文件copy到ES_HOME/plugins/ik(如果没有可新建)。然后解压即可。
至此,安装即告完成。注意:网上有很多教程还要求修改 elasticsearch.yml,那是 5.X 之前版本的配置方式;在 5.X 之后,分词器的配置不再放在全局配置文件中,而是在索引的 settings 和 mappings 中进行配置。

三、添加自己的分词
随着业务的发展可能会有新的词语出现,搜索时也需要支持这些新词,这时就需要在 IK 配置中扩展词库。
1、找到elasticsearch-5.3.0\plugins\ik\config\IKAnalyzer.cfg.xml内容如下:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!-- Users can configure their own extension dictionaries here (semicolon-separated list of files) -->
<entry key="ext_dict">custom/mydict.dic;custom/mydict1.dic;custom/single_word_low_freq.dic</entry>
<!-- Users can configure their own extension stop-word dictionaries here -->
<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
<!-- Users can configure a remote extension dictionary here -->
<!-- <entry key="remote_ext_dict">words_location</entry> -->
<!-- Users can configure a remote extension stop-word dictionary here -->
<!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>
注释写的很清楚:可以在 custom/mydict.dic 中添加新词,一个词一行。注意:保存时一定要保存为 UTF-8 格式,否则不生效。也可以使用远程扩展字典,返回的可以是一个页面,也可以是一个 txt 文档,但要保证输出的内容是 UTF-8 格式。ik 会读取响应中的两个头部属性 Last-Modified 和 ETag,只要其中一个有变化,就会触发更新;ik 每分钟会获取一次。

四、测试
GET方式:http://localhost:9200/es1/_analyze?pretty&analyzer=ik_max_word&text="中华人民共和国"
可以查看到分词的结果。
analyzer 参数有两种分词方式:
ik_max_word:会将文本做最细粒度的拆分;尽可能多的拆分出词语
ik_smart:会做最粗粒度的拆分;已被分出的词语将不会再次被其它词语占有

URL 使用方式可以参考:https://github.com/medcl/elasticsearch-analysis-ik 下的 Quick Example

java API方式:
package com.els.util;

import com.els.common.Const;
import com.google.common.collect.Maps;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.admin.indices.get.GetIndexRequest;
import org.elasticsearch.action.admin.indices.get.GetIndexResponse;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.bulk.byscroll.BulkByScrollResponse;
import org.elasticsearch.action.delete.DeleteRequestBuilder;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.get.MultiGetItemResponse;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.get.MultiGetResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.MultiSearchResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.action.update.UpdateRequestBuilder;
import org.elasticsearch.action.update.UpdateResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.reindex.DeleteByQueryAction;
import org.elasticsearch.index.reindex.DeleteByQueryRequestBuilder;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Static helper methods demonstrating the Elasticsearch transport-client API:
 * index creation with IK-analyzed mappings, indexing, updating, deleting and searching.
 *
 * <p>All operations use the client most recently created by {@link #getClient()};
 * call that method successfully before invoking any other method of this class.
 * The shared client field is not synchronized.
 *
 * Created by xiongps on 2017/8/11.
 */
public class ElsUtil2 {

    private static final Logger logger = LoggerFactory.getLogger(ElsUtil2.class);

    /** Client most recently created by {@link #getClient()}; {@code null} until then. */
    private static TransportClient client;

    /**
     * Creates a new transport client for cluster {@code myApplication} at
     * {@code localhost:9300} and stores it as the client used by the other methods.
     *
     * <p>Every call builds a new client; a previously stored client is replaced
     * without being closed, so callers should normally invoke this once.
     *
     * @return the new client, or {@code null} if the host name could not be resolved
     */
    public static TransportClient getClient() {
        Settings settings = Settings.builder()
                .put("cluster.name", "myApplication").build();
        try {
            // Resolve the address before constructing the client so that a
            // resolution failure cannot leave an unclosed client behind.
            InetAddress host = InetAddress.getByName("localhost");
            TransportClient created = new PreBuiltTransportClient(settings)
                    .addTransportAddress(new InetSocketTransportAddress(host, 9300));
            ElsUtil2.client = created;
            return created;
        } catch (UnknownHostException e) {
            logger.error("Unable to resolve Elasticsearch host", e);
        }
        return null;
    }

    /** Closes the stored client, if one has been created. */
    public static void closeClient() {
        if (client != null) {
            client.close();
        }
    }

    /**
     * Creates an index with 5 shards, 1 replica and the {@code test_type} mapping
     * returned by {@link #getMappings(String)}.
     *
     * <p>Before creating it, the method logs (at debug level) the names of all
     * existing indices and probes for an index named {@code es2}, purely as a
     * demonstration of the index-admin API.
     *
     * @param indices name of the index to create
     * @throws IOException declared for callers; not thrown by the visible code paths
     */
    public static void createIndex(String indices) throws IOException {
        // To create an index with default settings instead:
        //   client.admin().indices().prepareCreate("es1").get();

        // List the names of all existing indices.
        String[] indexNames = client.admin().indices().prepareGetIndex().get().getIndices();
        for (String a : indexNames) {
            logger.debug("a:" + a);
        }

        try {
            // Fetch information about one specific index. Note: an exception is
            // thrown when the given index name does not exist.
            GetIndexResponse getIndexResponse =
                    client.admin().indices().getIndex(new GetIndexRequest().indices("es2")).actionGet();
            logger.debug("getIndexResponse:" + getIndexResponse.indices().length);
        } catch (IndexNotFoundException e) {
            logger.debug("getIndexResponse IndexNotFoundException:未找到es2");
        }

        // Settings: number of shards and replicas.
        Map<String, Object> settingsBuilder = new HashMap<>();
        settingsBuilder.put("number_of_shards", "5");
        settingsBuilder.put("number_of_replicas", "1");

        // Mappings.
        XContentBuilder mappingsBuilder = getMappings("test_type");

        // Create the index.
        client.admin().indices().prepareCreate(indices)
                .setSettings(settingsBuilder)
                .addMapping("test_type", mappingsBuilder).get();
    }

    /**
     * Updates the settings of an existing index.
     *
     * @param indices         name of the index to update
     * @param settingsBuilder settings to apply
     */
    public static void updateSettings(String indices, Map<String, Object> settingsBuilder) {
        client.admin().indices()
                .updateSettings(
                        new UpdateSettingsRequest()
                                .indices(indices)
                                .settings(settingsBuilder)).actionGet();
    }

    /**
     * Updates the mapping of a type in an existing index.
     *
     * @param indices         name of the index
     * @param type            mapping type to update
     * @param mappingsBuilder mapping definition
     */
    public static void putMappings(String indices, String type, XContentBuilder mappingsBuilder) {
        client.admin().indices().putMapping(
                new PutMappingRequest()
                        .indices(indices).type(type)
                        .source(mappingsBuilder)).actionGet();
    }

    /**
     * Indexes each map as one document, asynchronously. The outcome of every
     * request is only logged; the method returns without waiting for completion.
     *
     * @param indices     name of the target index
     * @param type        target type
     * @param dataMapList documents to index
     * @param idFieldName key whose value (converted with {@code String.valueOf})
     *                    becomes the document id
     */
    public static void addDatas(String indices, String type, List<Map<String, Object>> dataMapList, String idFieldName) {
        for (Map<String, Object> dataMap : dataMapList) {
            // Synchronous alternative:
            //   IndexResponse response = client.prepareIndex(indices, type)
            //           .setSource(dataMap).setId(String.valueOf(dataMap.get(idFieldName)))
            //           .execute().actionGet();

            // Asynchronous variant.
            client.prepareIndex(indices, type)
                    .setSource(dataMap).setId(String.valueOf(dataMap.get(idFieldName)))
                    .execute(new ActionListener<IndexResponse>() {
                        @Override
                        public void onResponse(IndexResponse indexResponse) {
                            logger.debug("addDatas IndexResponse:{}", indexResponse);
                        }

                        @Override
                        public void onFailure(Exception e) {
                            logger.error("addDatas Exception IndexResponse:{}", e);
                        }
                    });
        }
    }

    /**
     * Partially updates a document by id using an {@link UpdateRequest}.
     *
     * @param indices name of the index
     * @param type    document type
     * @param id      document id
     * @param updMap  fields to merge into the document
     * @param version expected internal version, or {@code null} to skip the version check
     */
    public static void updateDataById(String indices, String type, String id, Map<String, Object> updMap, Long version) {
        UpdateRequest updateRequest = new UpdateRequest();
        updateRequest.index(indices);
        updateRequest.type(type);
        updateRequest.id(id);
        updateRequest.doc(updMap);
        if (version != null) {
            updateRequest.version(version);
            updateRequest.versionType(VersionType.INTERNAL);
        }
        client.update(updateRequest).actionGet();
    }

    /**
     * Sets one field of a document to a string value through a scripted update
     * built with the {@code prepareUpdate} API.
     *
     * <p>The value is handed to the script as the parameter {@code value} instead
     * of being concatenated into the script source, so values containing quotes or
     * script syntax are stored verbatim and cannot alter the script.
     *
     * @param indices name of the index
     * @param type    document type
     * @param id      document id
     * @param field   field path under {@code ctx._source}; it is inserted into the
     *                script source and must therefore come from trusted code
     * @param value   new value of the field
     * @param version expected internal version, or {@code null} to skip the version check
     */
    public static void updateDataAsPrepareById(String indices, String type, String id, String field, String value, Long version) {
        Script script = new Script(
                ScriptType.INLINE,
                Script.DEFAULT_SCRIPT_LANG,
                "ctx._source." + field + " = params.value",
                Collections.<String, Object>singletonMap("value", value));

        UpdateRequestBuilder request = client.prepareUpdate(indices, type, id);
        // Guard against null: setVersion takes a primitive long, so unboxing a
        // null Long would throw a NullPointerException.
        if (version != null) {
            request.setVersion(version).setVersionType(VersionType.INTERNAL);
        }
        // A plain document merge is possible with setDoc(map) instead of a script.
        request.setScript(script).get();
    }

    /**
     * Deletes a document by id, optionally guarded by an external version.
     *
     * <p>Version semantics (for index and delete operations):
     * {@code VersionType.INTERNAL} allows the operation only when the given version
     * equals the current one; {@code VersionType.EXTERNAL} allows it only when the
     * given version is greater than the current one (update does not support
     * the external type).
     *
     * @param indices name of the index
     * @param type    document type
     * @param id      document id
     * @param version external version to check against, or {@code null} to delete
     *                without a version check
     */
    public static void delete(String indices, String type, String id, Long version) {
        DeleteRequestBuilder request = client.prepareDelete(indices, type, id);
        // Guard against null: setVersion takes a primitive long.
        if (version != null) {
            request.setVersion(version)
                    .setVersionType(VersionType.EXTERNAL);
        }
        request.get();
    }

    /**
     * Asynchronously deletes all documents of a type that match a {@code match}
     * query. The number of deleted documents, or the failure, is only logged.
     *
     * @param indices name of the index
     * @param type    document type
     * @param matchNm field name used in the match query
     * @param text    text to match
     */
    public static void deleteByQuery(String indices, String type, String matchNm, String text) {
        DeleteByQueryRequestBuilder deleteByQueryRequestBuilder =
                DeleteByQueryAction.INSTANCE.newRequestBuilder(client);
        // Open question from the original author: why does the builder offer
        // source("es") as a shortcut for the indices but nothing comparable for types?
        deleteByQueryRequestBuilder.source().setIndices(indices).setTypes(type);

        // Synchronous alternative:
        //   BulkByScrollResponse response =
        //           deleteByQueryRequestBuilder.filter(QueryBuilders.matchQuery("name", "java")).get();
        //   long deleted = response.getDeleted();

        // Asynchronous delete.
        deleteByQueryRequestBuilder.filter(QueryBuilders.matchQuery(matchNm, text))
                .execute(new ActionListener<BulkByScrollResponse>() {
                    @Override
                    public void onResponse(BulkByScrollResponse bulkByScrollResponse) {
                        long deleted = bulkByScrollResponse.getDeleted();
                        logger.info("异步delete:{}", deleted);
                    }

                    @Override
                    public void onFailure(Exception e) {
                        logger.info("异步delete");
                        logger.error("异步删除错误:{}", e);
                    }
                });
    }

    /**
     * Runs a dis-max query (query-string plus match) with highlighting on all
     * fields and logs the first five hits.
     *
     * @param indices name of the index
     * @param type    document type
     * @param matchNm field name used in the match query
     * @param text    text to search for
     */
    public static void highlightQuery(String indices, String type, String matchNm, String text) {
        // Highlight every field; a single field could be selected with
        // field("name") and a dedicated highlightQuery(...) instead.
        HighlightBuilder highlightBuilder = new HighlightBuilder()
                .field("*").requireFieldMatch(false)
                .preTags(Const.HIGHLIGHT_PRE_TAGS)
                .postTags(Const.HIGHLIGHT_POST_TAGS);

        SearchResponse searchResponse = client.prepareSearch().setIndices(indices).setTypes(type)
                .setQuery(QueryBuilders.disMaxQuery()
                        .add(QueryBuilders.queryStringQuery(text))
                        .add(QueryBuilders.matchQuery(matchNm, text)))
                .highlighter(highlightBuilder)
                .setFrom(0).setSize(5) // paging
                .execute().actionGet();
        logger.info("查询的结果:{}", searchResponse);
    }

    /**
     * Logs the number of documents matching a {@code match} query without
     * fetching any hits.
     *
     * @param indices name of the index
     * @param type    document type
     * @param matchNm field name used in the match query
     * @param text    text to match
     */
    public static void countQuery(String indices, String type, String matchNm, String text) {
        SearchResponse searchResponse = client.prepareSearch().setIndices(indices).setTypes(type)
                .setQuery(QueryBuilders.disMaxQuery()
                        .add(QueryBuilders.matchQuery(matchNm, text)))
                .setSize(0) // only the total is needed, no documents
                .execute().actionGet();
        logger.info("查询的结果:{}", searchResponse.getHits().getTotalHits());
        logger.info("查询的结果:{}", searchResponse);
    }

    /**
     * Builds the mapping definition for a type. Only {@code test_type} is known:
     * it maps {@code id} as a long and {@code name}/{@code desc} as analyzed
     * strings indexed with {@code ik_max_word} and searched with {@code ik_smart}.
     *
     * @param type mapping type name
     * @return the builder (left empty for any type other than {@code test_type}),
     *         or {@code null} if building the JSON failed
     */
    public static XContentBuilder getMappings(String type) {
        try {
            XContentBuilder mappingsBuilder = XContentFactory.jsonBuilder();
            if ("test_type".equals(type)) {
                // NOTE(review): "stu" is not a documented value for "dynamic"
                // (true / false / strict) -- possibly meant "strict"; and
                // "string" / "analyzed" / "not_analyzed" / "yes" are legacy
                // pre-5.x mapping values. Left unchanged here; confirm intent
                // before altering the mapping.
                mappingsBuilder.startObject()
                        .field("dynamic", "stu")
                        .startObject("properties")
                        .startObject("id").field("type", "long").field("store", "yes").field("index", "not_analyzed")
                        .endObject()
                        .startObject("name").field("type", "string").field("index", "analyzed").field("analyzer", "ik_max_word").field("search_analyzer", "ik_smart")
                        .endObject()
                        .startObject("desc").field("type", "string").field("index", "analyzed").field("analyzer", "ik_max_word").field("search_analyzer", "ik_smart")
                        .endObject()
                        .endObject()
                        .endObject();
            }
            return mappingsBuilder;
        } catch (IOException e) {
            logger.error("Failed to build mappings for type " + type, e);
        }
        return null;
    }

    /**
     * Returns two sample documents (ids {@code "6"} and {@code "7"}) with
     * {@code id}, {@code name} and {@code desc} entries for testing.
     *
     * @return a new mutable list of sample documents
     */
    public static List<Map<String, Object>> getDataList() {
        List<Map<String, Object>> list = new ArrayList<>();

        Map<String, Object> map = Maps.newHashMap();
        map.put("id", "6");
        map.put("name", "java陈港生");
        map.put("desc", "港生中华人民共和国产生10个分片,蓝瘦");
        list.add(map);

        map = Maps.newHashMap();
        map.put("id", "7");
        map.put("name", "IK港生测试");
        map.put("desc", "一个从分片,那么就有5个从分片陈港生,那么默认配置会");
        list.add(map);

        return list;
    }
}



原创粉丝点击