Learning Elasticsearch with Python


In the crawler project, create a module folder to hold the basic Elasticsearch data-definition code (the equivalent of creating a table, i.e. defining the index mapping).

from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections

# Create the connection to the Elasticsearch server; this step is required before defining documents
connections.create_connection(hosts=["localhost"])

# Define the document class by subclassing DocType and declare the data type of each field.
# The required field types (Text, Keyword, Integer, Boolean, etc.) are imported from elasticsearch_dsl above.
class LagouType(DocType):
    job_name = Text(analyzer="ik_max_word")
    company = Text(analyzer="ik_max_word")
    url = Keyword()
    job_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    city = Keyword()
    experience = Text(analyzer="ik_max_word")
    education = Text(analyzer="ik_max_word")
    job_type = Keyword()
    label = Text(analyzer="ik_max_word")
    job_benefit = Text(analyzer="ik_max_word")
    job_description = Text(analyzer="ik_max_word")
    addr = Text(analyzer="ik_max_word")
    publish_time = Text(analyzer="ik_max_word")
    crawl_time = Date()

    # Bind the document class to an index and a doc_type (the "table").
    # The inner class must be named Meta; it carries the index and type values.
    class Meta:
        index = "lagou"
        doc_type = "job"

if __name__ == "__main__":
    # Call init() to create the index mapping
    LagouType.init()
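As a quick sanity check, the mapping can be exercised directly with elasticsearch_dsl. The sketch below is not part of the original article: the module path models.es_types is assumed, and importing LagouType also runs the create_connection() call at the top of that module.

from datetime import datetime
from models.es_types import LagouType  # hypothetical module path for the class defined above

# save one document; elasticsearch_dsl serializes each field according to the mapping
job = LagouType()
job.job_name = "python爬虫工程师"
job.company = "某公司"
job.url = "https://www.lagou.com/jobs/1.html"
job.job_id = "1"
job.crawl_time = datetime.now()
job.save()

# run a match query against the analyzed job_name field
results = LagouType.search().query("match", job_name="python").execute()
for hit in results:
    print(hit.job_name, hit.company)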

Setting up the Elasticsearch connection in the pipeline

1. Write the code directly in the pipeline. However, a crawled item does not necessarily go into Elasticsearch (or into any particular database), and field contents vary from item to item, so this approach easily becomes messy and is hard to configure.

# Written directly in the pipeline
class Elasticsearch_pipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        lagou = LagouType()
        lagou.job_name = item['job_name']
        lagou.company = item['company']
        lagou.url = item['url']
        lagou.job_id = item['job_id']
        lagou.salary = item['salary']
        lagou.city = item['city']
        lagou.experience = item['experience']
        lagou.education = item['education']
        lagou.job_type = item['job_type']
        lagou.label = item['label']
        lagou.job_benefit = item['job_benefit']
        lagou.job_description = item['job_description']
        lagou.addr = item['addr']
        lagou.publish_time = item['publish_time']
        lagou.crawl_time = item['crawl_time']
        lagou.save()
        return item

2. Define a save_to_elasticsearch method on the item and call it from the pipeline; this keeps the pipeline generic and makes the item more configurable.

# Method defined on the item
def save_to_elasticsearch(self):
    # instantiate the LagouType document class
    lagou = LagouType()
    lagou.job_name = self['job_name']
    lagou.company = self['company']
    lagou.url = self['url']
    lagou.job_id = self['job_id']
    lagou.salary = self['salary']
    lagou.city = self['city']
    lagou.experience = self['experience']
    lagou.education = self['education']
    lagou.job_type = self['job_type']
    lagou.label = self['label']
    lagou.job_benefit = self['job_benefit']
    lagou.job_description = self['job_description']
    lagou.addr = self['addr']
    lagou.publish_time = self['publish_time']
    lagou.crawl_time = self['crawl_time']
    lagou.save()

# Called from the pipeline
class Elasticsearch_pipeline(object):
    def __init__(self):
        pass

    # call the item's own method (item.save_to_elasticsearch()) inside process_item
    def process_item(self, item, spider):
        item.save_to_elasticsearch()
        return item

# Enable the pipeline in settings.py
ITEM_PIPELINES = {
    'lagou_spider.pipelines.Elasticsearch_pipeline': 1
}
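For context, a minimal sketch of the Scrapy item class that the save_to_elasticsearch method would live on. The article does not show the item definition, so the class name LagouJobItem and the import path are assumptions; the loop is a compact variant of the explicit field-by-field assignments shown above.

import scrapy
from models.es_types import LagouType  # hypothetical module path for LagouType


class LagouJobItem(scrapy.Item):
    job_name = scrapy.Field()
    company = scrapy.Field()
    url = scrapy.Field()
    job_id = scrapy.Field()
    salary = scrapy.Field()
    city = scrapy.Field()
    experience = scrapy.Field()
    education = scrapy.Field()
    job_type = scrapy.Field()
    label = scrapy.Field()
    job_benefit = scrapy.Field()
    job_description = scrapy.Field()
    addr = scrapy.Field()
    publish_time = scrapy.Field()
    crawl_time = scrapy.Field()

    def save_to_elasticsearch(self):
        lagou = LagouType()
        # copy every field defined on the item onto the document
        for name in self.fields:
            setattr(lagou, name, self.get(name))
        lagou.save()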

Notes on the various Elasticsearch query operations from Python (they are essentially the same as the Elasticsearch queries used in Kibana and can be copied over directly).

The key import: "from elasticsearch import Elasticsearch"

The Elasticsearch search API:

client = Elasticsearch()

response = client.search(...)   # pass the index name and the query body; examples below

class Elasticsearch_Option:
    def __init__(self):
        pass

    # Note 1: watch the case of search terms. When analyzing text, Elasticsearch's analyzer automatically lowercases every token.

    # match: the value passed to match is analyzed (tokenized); any document matching the resulting tokens is returned
    def match_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "match": {
                        "title": "C++后端工程师"
                    }
                }
            }
        )

    # term: the value passed to term is NOT analyzed
    def term_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "term": {
                        "salary_min": "2000000"
                    }
                }
            }
        )

    # terms: accepts a list; any document matching any value in the list is returned
    def terms_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "terms": {
                        "title": ["python", "java", "c++"]  # watch the case
                    }
                }
            }
        )

    # from and size: pagination
    def from_size_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "match": {
                        "title": "工程师"
                    }
                },
                "from": 0,
                "size": 4
            }
        )

    # match_all: return every document
    def match_all_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "match_all": {}
                }
            }
        )

    # match_phrase: phrase query
    def match_phrase_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "match_phrase": {
                        "title": "python研发工程师"
                    }
                }
            }
        )
        for i in response['hits']['hits']:
            print(i['_source'])

    # multi_match: one query condition searched against several fields
    def multi_match_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "multi_match": {
                        "query": "深圳",
                        "fields": ["title", "city"]  # a document matches if any listed field contains the query keywords
                    }
                }
            }
        )
        # pay close attention to the structure of the response
        for i in response['hits']['hits']:
            print(i['_source'])

    # sort: sorting
    def sort_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "match_all": {}
                },
                "sort": {
                    "comment": {  # under sort, first name the field to sort on
                        "order": "asc"
                    }
                }
            }
        )
        # pay close attention to the structure of the response
        for i in response['hits']['hits']:
            print(i['_source'])

    # range query. gte: greater than or equal; gt: greater than; lte: less than or equal; lt: less than; boost: weight
    def range_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "range": {
                        "comment": {  # under range, name the field the range applies to
                            "gt": 15,
                            "lt": 20
                        }
                    }
                }
            }
        )
        # pay close attention to the structure of the response
        for i in response['hits']['hits']:
            print(i['_source'])

    # wildcard: fuzzy/wildcard query
    def wildcard_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "wildcard": {
                        "title": {  # under wildcard, name the field to match against
                            "value": "pyth*n"  # "*" is the wildcard
                        }
                    }
                }
            }
        )
        # pay close attention to the structure of the response
        for i in response['hits']['hits']:
            print(i['_source'])

    # bool query
    # filter: filters on a field without contributing to scoring; documents outside the filter are excluded
    # must: every condition in the list must match ("AND")
    # should: at least one condition in the list must match ("OR")
    # must_not: none of the conditions in the list may match ("NOT")
    def bool_option(self):
        client = Elasticsearch()
        response = client.search(
            index="lagou",
            body={
                "query": {
                    "bool": {
                        "must": [{
                            "match_all": {}
                        }],
                        "filter": {
                            "term": {
                                "title": "工程师"
                            }
                        },
                        "must_not": [{
                            "match": {
                                "comment": 16
                            }
                        }],
                        "should": [{
                            "match": {
                                "title": "c"
                            }
                        }]
                    }
                }
            }
        )
        # pay close attention to the structure of the response
        for i in response['hits']['hits']:
            print(i['_source'])
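Since several of the examples above ask you to look closely at the response structure, here is a minimal sketch of walking the dict returned by client.search(). The field name "title" simply follows the examples above and depends on your own mapping.

from elasticsearch import Elasticsearch

client = Elasticsearch()
response = client.search(
    index="lagou",
    body={"query": {"match": {"title": "工程师"}}}
)

print(response['took'])             # time taken in milliseconds
print(response['hits']['total'])    # total hit count (an object with 'value' on ES 7+, a plain number on older versions)
for hit in response['hits']['hits']:
    print(hit['_id'], hit['_score'])  # document id and relevance score
    print(hit['_source'])             # the stored document body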