Elasticsearch+python学习
来源:互联网 发布:2016中国中小企业数据 编辑:程序博客网 时间:2024/05/22 01:46
爬虫中建立 module 文件夹,用于存放 Elasticsearch 基本数据操作命令(建表)
from datetime import datetime

from elasticsearch_dsl import (
    DocType,
    Date,
    Nested,
    Boolean,
    analyzer,
    InnerObjectWrapper,
    Completion,
    Keyword,
    Text,
    Integer,
)
from elasticsearch_dsl.connections import connections

# Create the connection to the Elasticsearch server. This step is essential:
# it has to happen before the document class below is initialised or saved.
connections.create_connection(hosts=["localhost"])


class LagouType(DocType):
    """Document class describing one crawled job posting.

    The class inherits from ``DocType`` and declares the data type of every
    field. The field types (text, keyword, date, ...) are the ones imported
    from ``elasticsearch_dsl`` at the top of the file.
    """

    # Text fields below are all configured with the "ik_max_word" analyzer;
    # Keyword fields are declared without an analyzer.
    job_name = Text(analyzer="ik_max_word")
    company = Text(analyzer="ik_max_word")
    url = Keyword()
    job_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    city = Keyword()
    experience = Text(analyzer="ik_max_word")
    education = Text(analyzer="ik_max_word")
    job_type = Keyword()
    label = Text(analyzer="ik_max_word")
    job_benefit = Text(analyzer="ik_max_word")
    job_description = Text(analyzer="ik_max_word")
    addr = Text(analyzer="ik_max_word")
    # NOTE(review): publish_time is mapped as analysed text, not Date --
    # presumably the site does not give a parseable date; confirm.
    publish_time = Text(analyzer="ik_max_word")
    crawl_time = Date()

    # The nested class must be named Meta; it supplies the index name and
    # the document type ("table") this class is bound to.
    class Meta:
        index = "lagou"
        doc_type = "job"


if __name__ == "__main__":
    # Calling init() creates the mappings for LagouType.
    LagouType.init()
在pipeline中定制与Elasticsearch连接
1.直接写在pipeline中,但是爬取的item不一定存入elasticsearch中或某数据库中,并且值内容不一,容易混乱,配置性低
# Written in the pipeline module
class Elasticsearch_pipeline(object):
    """Item pipeline that copies each item into a LagouType document."""

    def __init__(self):
        pass

    def process_item(self, item, spider):
        """Save ``item`` as a ``LagouType`` document and return the item.

        Every listed field is read from ``item`` by key and assigned to the
        attribute of the same name on a new ``LagouType`` instance, which is
        then saved. ``spider`` is accepted but not used.
        """
        document = LagouType()
        # Same fields, in the same order, as the one-by-one assignments.
        copied_fields = (
            'job_name',
            'company',
            'url',
            'job_id',
            'salary',
            'city',
            'experience',
            'education',
            'job_type',
            'label',
            'job_benefit',
            'job_description',
            'addr',
            'publish_time',
            'crawl_time',
        )
        for field_name in copied_fields:
            setattr(document, field_name, item[field_name])
        document.save()
        return item
2.在item中定制save_to_elasticsearch接口,并在pipeline中调用item方法,增强item的可配置性
# --- Item method -----------------------------------------------------------
# NOTE(review): this takes ``self`` and reads ``self[...]``, so it is meant
# to live inside the item class; the class header is not part of this snippet.
def save_to_elasticsearch(self):
    """Copy this item's fields into a new ``LagouType`` document and save it.

    Each listed field is read from the item by key and assigned to the
    attribute of the same name on the document.
    """
    # Instantiate the document class (this is an instance, not a subclass).
    lagou = LagouType()
    for field_name in (
        'job_name',
        'company',
        'url',
        'job_id',
        'salary',
        'city',
        'experience',
        'education',
        'job_type',
        'label',
        'job_benefit',
        'job_description',
        'addr',
        'publish_time',
        'crawl_time',
    ):
        setattr(lagou, field_name, self[field_name])
    lagou.save()


# --- Pipeline --------------------------------------------------------------
class Elasticsearch_pipeline(object):
    """Item pipeline that delegates persistence to the item itself."""

    def __init__(self):
        pass

    def process_item(self, item, spider):
        """Call ``item.save_to_elasticsearch()`` and return the item.

        ``spider`` is accepted but not used.
        """
        item.save_to_elasticsearch()
        return item


# --- Settings: enable the item pipeline ------------------------------------
# (The original snippet repeated this assignment twice; once is enough.)
ITEM_PIPELINES = {
    'lagou_spider.pipelines.Elasticsearch_pipeline': 1
}
记录Elasticsearch与Python的各种查询操作(基本与kibana中的elasticsearch操作相同可以照搬)
重点:先导入 “from elasticsearch import Elasticsearch”
Elasticsearch的查询API接口:
client = Elasticsearch()
response = client.search(select sentence……)
class Elasticsearch_Option:
    """Example queries against the "lagou" index, one method per query type.

    Every method sends one search request and returns the response object
    handed back by ``client.search``. Methods from ``match_phrase_option``
    onwards additionally print the ``_source`` of each hit.

    Note 1: mind upper/lower case. According to the original author, the
    Elasticsearch analyzer lowercases every term during analysis.
    """

    def __init__(self):
        # One client per instance, reused by every query method.
        self.client = Elasticsearch()

    def _search(self, body):
        """Run ``body`` against the "lagou" index and return the response."""
        return self.client.search(index="lagou", body=body)

    @staticmethod
    def _print_sources(response):
        """Print the ``_source`` of every hit in ``response``."""
        # Pay attention to the structure of the response: the documents are
        # found under response['hits']['hits'][i]['_source'].
        for hit in response['hits']['hits']:
            print(hit['_source'])

    def match_option(self):
        """match: the given value is analyzed; documents matching the
        resulting terms are retrieved."""
        return self._search({
            "query": {
                "match": {
                    'title': 'C++后端工程师'
                }
            }
        })

    def term_option(self):
        """term: the given value is NOT analyzed."""
        return self._search({
            "query": {
                "term": {
                    'salary_min': '2000000'
                }
            }
        })

    def terms_option(self):
        """terms: accepts a list; documents matching any listed value are
        retrieved."""
        return self._search({
            "query": {
                "terms": {
                    # Be very careful about upper/lower case here.
                    'title': ['python', 'java', 'c++']
                }
            }
        })

    def from_size_option(self):
        """Usage of "from" and "size"."""
        return self._search({
            "query": {
                "match": {
                    'title': '工程师'
                }
            },
            "from": 0,
            "size": 4
        })

    def match_all_option(self):
        """match_all query."""
        return self._search({
            "query": {
                "match_all": {}
            }
        })

    def match_phrase_option(self):
        """match_phrase: phrase query. Prints each hit's source."""
        response = self._search({
            "query": {
                "match_phrase": {
                    "title": 'python研发工程师'
                }
            }
        })
        self._print_sources(response)
        return response

    def multi_match_option(self):
        """multi_match: one query condition searched across several fields.
        Prints each hit's source."""
        response = self._search({
            "query": {
                "multi_match": {
                    "query": "深圳",
                    # A document is returned when any of these fields
                    # contains the keyword given in "query".
                    "fields": ['title', 'city']
                }
            }
        })
        self._print_sources(response)
        return response

    def sort_option(self):
        """Sorting. Prints each hit's source."""
        response = self._search({
            "query": {
                "match_all": {}
            },
            "sort": {
                # Under "sort", first name the field to sort by.
                "comment": {
                    "order": "asc"
                }
            }
        })
        self._print_sources(response)
        return response

    def range_option(self):
        """Range query. Prints each hit's source.

        gte: greater than or equal; gt: greater than; lte: less than or
        equal; lt: less than; boost: the weight of the clause.
        """
        response = self._search({
            "query": {
                "range": {
                    # Under "range" comes the field whose range is limited.
                    "comment": {
                        "gt": 15,
                        "lt": 20
                    }
                }
            }
        })
        self._print_sources(response)
        return response

    def wildcard_option(self):
        """wildcard: pattern (fuzzy) query. Prints each hit's source."""
        response = self._search({
            "query": {
                "wildcard": {
                    # Under "wildcard" comes the field to match against.
                    "title": {
                        "value": "pyth*n"  # "*" is the wildcard character
                    }
                }
            }
        })
        self._print_sources(response)
        return response

    def bool_option(self):
        """bool query. Prints each hit's source.

        filter:   filters on a field without taking part in scoring.
        must:     every condition in the array must hold ("and").
        should:   one or more conditions in the array hold ("or").
        must_not: none of the conditions in the array may hold ("not").
        """
        response = self._search({
            "query": {
                "bool": {
                    "must": [{
                        "match_all": {}
                    }],
                    "filter": {
                        "term": {
                            "title": '工程师'
                        }
                    },
                    "must_not": [{
                        "match": {
                            "comment": 16
                        }
                    }],
                    "should": [{
                        "match": {
                            "title": 'c'
                        }
                    }]
                }
            }
        })
        self._print_sources(response)
        return response
阅读全文
0 0
- Elasticsearch+python学习
- elasticsearch学习
- Elasticsearch学习
- ElasticSearch学习
- ElasticSearch学习
- ElasticSearch学习
- elasticsearch 学习
- Elasticsearch学习
- elasticsearch学习
- Elasticsearch学习
- ElasticSearch学习
- elasticsearch 学习
- ElasticSearch学习
- ElasticSearch学习
- ElasticSearch学习
- ElasticSearch学习
- python 访问 elasticsearch
- Python Elasticsearch api
- AngularJS的自定义模板
- 常规的排序算法
- Linux线程的使用策略
- Elementary Math Gym
- Android 项目路径过长 引起error Error:too long on Windows, keep below 240 characters :
- Elasticsearch+python学习
- linux下的系统管理及系统安全命令
- 架构 理论 设计原则 分布式 总结
- QT学习过程中重难点总结
- Ubuntu16.04 常用软件集锦[超实惠]
- 【GarsiaWachs算法】bzoj3229: [Sdoi2008]石子合并
- 如何准备阿里的社招
- leetcode题解之Isomorphic Strings
- 中科院谭铁牛爱徒研发出《碟中谍5》中的步态识别技术,不看脸50米内在人群中认出你