【Python】【笔记】Python 控制ElasticSearch

Yiwoqu · 发表于 2021-5-18 17:02

[Python] 纯文本查看 复制代码

#####################创建索引####################################
from elasticsearch import Elasticsearch
from tqdm import tqdm #进度条
from elasticsearch import helpers
import codecs

# Create the Elasticsearch low-level client (the author notes that Python
# appears to offer only a low-level client).
# The main difference between the high-level and low-level clients is in
# querying: the high-level client queries through high-level API calls, while
# the low-level client uses the regular query form.
# NOTE(review): no hosts are passed, so the client library's defaults apply
# (presumably localhost:9200 - confirm for the installed version).
es = Elasticsearch()

# 确保不会重复创建
def deleteInices(my_index):
    """Delete ``my_index`` if it already exists, so it is not created twice.

    Uses the module-level ``es`` client and prints a notice before deleting.

    Args:
        my_index: name of the index to remove.
    """
    # The index name is passed by keyword: this works on 7.x clients and is
    # required by newer elasticsearch-py releases, which reject positional use.
    if es.indices.exists(index=my_index):
        print("删除之前存在的")
        es.indices.delete(index=my_index)


def createIndex(my_index):
    """Create ``my_index`` with the mapping used by this script.

    The mapping declares an integer ``my_id`` field and a text ``my_word``
    field that uses the ``ik_smart`` analyzer for both indexing and search.
    HTTP 400 responses from the create call are ignored via ``ignore=400``.

    Args:
        my_index: name of the index to create.
    """
    # Author's note: from 7.x on the document type defaults to "_doc", so the
    # mapping is declared without a type level.
    word_field = {
        "type": "text",
        "analyzer": "ik_smart",
        "search_analyzer": "ik_smart",
    }
    index_body = {
        "mappings": {
            "properties": {
                "my_id": {"type": "integer"},
                "my_word": word_field,
            }
        }
    }
    es.indices.create(index=my_index, ignore=400, body=index_body)
    print("创建index成功！")





#####################插入数据####################################

def getAllWords(path="para_and_test_split.txt"):
    """Read a UTF-8 text file and return its lines as ``(index, text)`` pairs.

    Each line is stripped of leading/trailing whitespace; blank lines are kept
    as empty strings so indices always match the 0-based line position.

    Args:
        path: file to read.

    Returns:
        A list of ``(line_index, stripped_line)`` tuples in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [(line_no, line.strip()) for line_no, line in enumerate(handle)]

def insertData(words, my_index, my_doc, one_bulk):
    """Bulk-index ``words`` into ``my_index`` in batches of ``one_bulk``.

    Each ``(id, text)`` pair becomes one bulk action whose ``_source`` holds
    ``my_id`` and ``my_word``. Batches are sent with ``helpers.bulk`` because,
    per the author's note, indexing one document at a time is too slow. A tqdm
    progress bar tracks how many documents have been sent.

    Args:
        words: sequence of ``(id, text)`` pairs; must support ``len()``.
        my_index: target index name, written to each action's ``_index``.
        my_doc: value written to each action's ``_type``.
        one_bulk: number of documents collected before a batch is sent.
    """
    print("共需要插入%d条..."%len(words))
    pbar = tqdm(total=len(words))

    batch = []
    try:
        for doc_id, word in words:
            # Send the pending batch once it is full, then start a new one
            # with the current document.
            if len(batch) >= one_bulk:
                helpers.bulk(es, batch)
                # Advance by what was actually sent, not by the nominal size.
                pbar.update(len(batch))
                batch = []
            batch.append(
                {
                    "_index": my_index,
                    "_type": my_doc,
                    "_source": {"my_id": doc_id, "my_word": word},
                }
            )

        if batch:
            # The last batch is usually not full but still has to be sent and
            # counted, otherwise the progress bar never reaches its total.
            helpers.bulk(es, batch)
            pbar.update(len(batch))
            print('done2')
    finally:
        # Close the bar even if a bulk request raises.
        pbar.close()
    print("插入数据完成!")



#####################检索数据####################################

def keywordSearch(keywords1, my_index):
    """Run a ``match`` query on the ``my_word`` field and print every hit.

    Args:
        keywords1: text to match against ``my_word``.
        my_index: index to search.
    """
    # Full-text lookup on my_word (served by the inverted index).
    match_clause = {"my_word": keywords1}
    query_body = {"query": {"match": match_clause}}

    # Direct search through the client.
    response = es.search(index=my_index, body=query_body)
    hits = response["hits"]["hits"]
    for hit in hits:
        print(hit)
        # print(hit["_source"]['my_word'])
    # total = response["hits"]["total"]  # total number of matches
    # print("共查询到%d条数据"%total)

    # Alternative kept from the original: scroll through all results with
    # helpers.scan, which returns a generator.
    # es_result = helpers.scan(
    #     client=es,
    #     query=query_body,
    #     scroll='10m',
    #     index=my_index,
    #     timeout='10m'
    # )
    # es_result = [item for item in es_result]  # materialise the generator
    # # print(es_result)  # can be printed directly for inspection
    # search_res = []
    # for item in es_result:
    #     tmp = item['_source']
    #     search_res.append((tmp['my_id'], tmp['my_word']))
    #     print(tmp['my_word'])
    # print("共查询到%d条数据" % len(es_result))

def mainCreateIndex():
    """Drop ``word2vec_index`` if it exists, then create it again."""
    index_name = "word2vec_index"
    deleteInices(index_name)
    createIndex(index_name)

def mainInsert():
    """Load the word file and bulk-insert it into ``word2vec_index``."""
    index_name = "word2vec_index"
    doc_type = "_doc"
    loaded_words = getAllWords(path="para_and_test_split.txt")
    # 5000 documents are collected per bulk batch.
    insertData(loaded_words, index_name, doc_type, one_bulk=5000)

def mainSearch():
    """Search ``word2vec_index`` for a fixed sample phrase and print the hits."""
    index_name = "word2vec_index"
    sample_query = "特朗普的支持率"
    keywordSearch(sample_query, index_name)

if __name__ == "__main__":
    # Index creation and data insertion are left disabled here; uncomment
    # them as needed.
    # mainCreateIndex()
    # mainInsert()
    mainSearch()

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 【Python】【笔记】Python 控制ElasticSearch