# ZebraScienceBackend/search.py at master · Andy1621/ZebraScienceBackend · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from pymongo import MongoClient

# Number of documents fetched from MongoDB by one mongo2es call (find().limit()).
ONCE = 1000
# Index of the current mongo2es round; multiplied by ONCE to get the skip offset.
SKIPNUM = 0
# Positions (running document numbers) at which a bulk insert into ES failed.
ERROR_ELE = []
# Number of documents sent to Elasticsearch per bulk request.
INSERT_NUM = 100
# Starting round index.
START = 0

class zebrasearch():
    """Copy documents from a MongoDB collection into an Elasticsearch index.

    ``connect_es`` and ``connect_mongo`` must be called first: they set
    ``self.es`` and ``self.client``, which the other methods rely on.
    """

    def connect_es(self, host, port):
        """Create an Elasticsearch client for ``host``:``port`` on ``self.es``.

        The client is created with a 3600 second timeout.
        """
        self.es = Elasticsearch([{u'host': host, u'port': port}], timeout=3600)

    def connect_mongo(self, host, port):
        """Create a MongoDB client for ``host``:``port`` on ``self.client``."""
        self.client = MongoClient(host, port)

    def mongo2es(self, db, collection, index, types):
        """Insert one page of a MongoDB collection into Elasticsearch.

        Reads up to ``ONCE`` documents starting at offset ``SKIPNUM * ONCE``
        (both are module-level globals read at call time), strips the MongoDB
        ``_id`` field, and sends the documents to Elasticsearch in bulk
        requests of ``INSERT_NUM`` documents each.

        A bulk request that fails is not retried: its batch is dropped and the
        running number of the last document in that batch is appended to the
        module-level ``ERROR_ELE`` list.

        Args:
            db: Name of the MongoDB database.
            collection: Name of the collection inside ``db``.
            index: Target Elasticsearch index.
            types: Target Elasticsearch document type (``_type``).
        """
        source = self.client[db][collection]
        offset = SKIPNUM * ONCE
        count = 0
        actions = []
        # NOTE(review): find() has no explicit sort, so skip/limit paging
        # relies on the server returning a stable order -- confirm.
        for item in source.find().skip(offset).limit(ONCE):
            item = dict(item)
            # Drop the MongoDB id; tolerate documents that do not carry one.
            item.pop('_id', None)
            actions.append({
                "_index": index,
                "_type": types,
                "_source": item,
            })
            count += 1
            print('第' + str(offset + count) + '篇论文已加入列表')
            if len(actions) >= INSERT_NUM:
                print("截止到" + str(offset + count) + "篇论文正在准备插入")
                self._bulk_insert(actions, offset + count)
        # Flush the trailing partial batch, if any; skip the call entirely
        # when the last full batch already emptied the list.
        if actions:
            self._bulk_insert(actions, offset + count)

    def _bulk_insert(self, actions, position):
        """Send ``actions`` in one bulk request, then empty the list.

        On failure ``position`` is appended to ``ERROR_ELE`` instead of
        raising, so one bad batch does not abort the whole run.
        """
        try:
            helpers.bulk(client=self.es, actions=actions)
        except Exception:
            # Only Exception is caught so Ctrl-C / SystemExit still propagate.
            ERROR_ELE.append(position)
        finally:
            actions.clear()

    def cleartypes(self, index, types):
        """Delete every document of type ``types`` from index ``index``.

        Issues a delete-by-query with a ``match_all`` query.
        """
        query = {'query': {'match_all': {}}}
        self.es.delete_by_query(index=index, body=query, doc_type=types)


if __name__ == '__main__':
    # Script entry point: connect to both stores, then copy the 'paper'
    # collection into 'paper_index' one page (ONCE documents) per round.
    # The instance gets its own name so the ``zebrasearch`` class is not
    # rebound to an instance.
    searcher = zebrasearch()
    searcher.connect_es(u'139.199.96.196', 9200)
    searcher.connect_mongo('139.199.96.196', 27017)
    # Alternative invocations, kept for reference:
    # searcher.mongo2es('Business', 'mechanism', 'business', 'user')
    # print(searcher.es.search(index='business', doc_type='scisource'))
    # searcher.cleartypes('busscisource', 'scisource')

    # Experts: insert 10 per bulk request, fetch 100 per round.
    # Papers: insert 100 per bulk request, fetch 1000 per round.
    START = 300
    # mongo2es reads the module-level SKIPNUM to compute its skip offset,
    # so it is advanced in step with the round number below.
    SKIPNUM = START
    END = START + 376
    for i in range(START, END):
        print("第" + str(i) + "轮")
        searcher.mongo2es('Business', 'paper', 'paper_index', '_doc')
        SKIPNUM += 1

    # Running positions of the batches that failed to insert.
    print(ERROR_ELE)