elasticsearch6.0 中文分词
来源:互联网 发布:java中多态的理解 编辑:程序博客网 时间:2024/06/08 09:48
软件版本:
elasticsearch-analysis-ik 6.0.0
https://github.com/medcl/elasticsearch-analysis-ik
elasticsearch 6.0.0
https://github.com/elastic/elasticsearch
elasticsearch-head 0.9
https://github.com/mobz/elasticsearch-head
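按照 GitHub 上 elasticsearch-analysis-ik 的说明操作:
https://github.com/medcl/elasticsearch-analysis-ik
(命令略有改动:Elasticsearch 6.0 要求请求带上 Content-Type 头,否则可能报错,因此下面的 curl 命令都加了 -H 'Content-Type:application/json')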
1. create an index
curl -XPUT http://localhost:9200/index
2.create a mapping
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/_mapping -d'
{
"properties": {
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word"
}
}
}'
3.index some docs
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/1 -d'
{"content":"美国留给伊拉克的是个烂摊子吗"}
'
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/2 -d'
{"content":"公安部:各地校车将享最高路权"}
'
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/3 -d'
{"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
'
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/4 -d'
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
'
4.query with highlighting
curl -H 'Content-Type:application/json' 'http://localhost:9200/index/fulltext/_search?pretty=true' -d'
{
"query" : { "match" : { "content" : "中国" }},
"highlight" : {
"pre_tags" : ["<tag1>", "<tag2>"],
"post_tags" : ["</tag1>", "</tag2>"],
"fields" : {
"content" : {}
}
}
}
'
5. 默认分词
不指定分词器时,默认分词会把中文按单个字切分:
[root@host-172-16-1-143 ~]# curl -H 'Content-Type:application/json' 'http://localhost:9200/index/_analyze?pretty=true' -d '{"text":"中华人民共和国"}'
{
"tokens" : [
{
"token" : "中",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "华",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "人",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "民",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "共",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "和",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
},
{
"token" : "国",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 6
}
]
}
.
6. ik 分词
1) ik_max_word
curl -H 'Content-Type:application/json' 'http://localhost:9200/index/_analyze?pretty=true' -d '
{
"text":"中华人民共和国",
"analyzer" : "ik_max_word"
}'
{
"tokens" : [
{
"token" : "中华人民共和国",
"start_offset" : 0,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "中华人民",
"start_offset" : 0,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "中华",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "华人",
"start_offset" : 1,
"end_offset" : 3,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "人民共和国",
"start_offset" : 2,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "人民",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "共和国",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 6
},
{
"token" : "共和",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 7
},
{
"token" : "国",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 8
}
]
}
2) ik_smart
curl -H 'Content-Type:application/json' 'http://localhost:9200/index/_analyze?pretty=true' -d '
{
"text":"中华人民共和国",
"analyzer" : "ik_smart"
}'
{
"tokens" : [
{
"token" : "中华人民共和国",
"start_offset" : 0,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 0
}
]
}
7. elasticsearch-head 操作
1) 默认分词
http://192.168.10.81:9200/
index/_analyze?pretty=true
POST
{
"text": "中华人民共和国"
}
2) ik 分词
http://192.168.10.81:9200/
index/_analyze?pretty=true
POST
{
"text": "中华人民共和国",
"analyzer": "ik_max_word"
}
阅读全文
0 0
- elasticsearch6.0 中文分词
- ElasticSearch6.0配置IK分词器
- ElasticSearch6.0配置HanLP分词器
- Elasticsearch6.0
- solr7.0 中文分词
- solr7.0 中文分词+拼音分词
- Lucene中文分词2.4.0
- Elasticsearch6.0及其head插件安装
- 中文分词
- 中文分词
- 中文分词
- 中文分词
- 中文分词
- 中文分词
- 中文分词
- 中文分词
- 中文分词
- 中文分词
- View 事件体系(三)
- Java反射(思维导图)
- C++中的inline函数
- LeetCode 37. Sudoku Solver
- poj3278--Catch That Cow(BFS+裁剪记录)
- elasticsearch6.0 中文分词
- Java 定时任务的实现
- python批量提交s3-被搁置
- 复制文件夹
- 引导页高亮控件的实现
- AliOS Things KV组件的写平衡特性
- grunt中的autoprefixer,自动一键补充css3兼容前缀
- 数据结构之查找(三)——有序表查找
- HTTP概述