-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsearch.py
More file actions
86 lines (76 loc) · 2.77 KB
/
search.py
File metadata and controls
86 lines (76 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from pymongo import MongoClient
ONCE = 1000 # number of documents fetched by the find() in one mongo2es call
SKIPNUM = 0 # index of the current mongo2es call; mongo2es skips SKIPNUM * ONCE documents
ERROR_ELE = [] # sequence numbers of the documents at which an insert into ES failed
INSERT_NUM = 100 # number of documents sent in one bulk insert
START = 0 # starting index
class zebrasearch:
    """Copy documents from MongoDB into Elasticsearch and clear ES types.

    ``connect_es`` and ``connect_mongo`` must be called first; they store the
    clients on ``self.es`` and ``self.client``, which the other methods use.
    """

    def connect_es(self, host, port):
        """Connect to Elasticsearch at ``host``:``port``; the client is kept on ``self.es``."""
        self.es = Elasticsearch([{u'host': host, u'port': port}], timeout=3600)

    def connect_mongo(self, host, port):
        """Connect to MongoDB at ``host``:``port``; the client is kept on ``self.client``."""
        self.client = MongoClient(host, port)

    def _bulk_insert(self, actions):
        """Send one batch of bulk actions to Elasticsearch."""
        helpers.bulk(client=self.es, actions=actions)

    def _flush_batch(self, actions, position, errors):
        """Insert one batch; on failure record ``position`` in ``errors`` instead of raising."""
        try:
            self._bulk_insert(actions)
        except Exception as exc:
            # Best-effort import: remember where the failed batch ended and
            # carry on with the next one. Only ``Exception`` is caught so
            # that Ctrl-C / SystemExit still stop the run.
            errors.append(position)
            print('bulk insert failed at document ' + str(position) + ': ' + repr(exc))

    def mongo2es(self, db, collection, index, types, *,
                 skipnum=None, once=None, insert_num=None, errors=None):
        """Copy one page of a MongoDB collection into an Elasticsearch index.

        Reads ``once`` documents from ``db``.``collection`` starting at offset
        ``skipnum * once``, drops each document's ``_id`` and bulk-inserts the
        rest into ``index`` / ``types`` in batches of ``insert_num``.

        Args:
            db: Name of the MongoDB database.
            collection: Name of the MongoDB collection.
            index: Target Elasticsearch index.
            types: Target Elasticsearch document type.
            skipnum: Page number; defaults to the module-level ``SKIPNUM``.
            once: Page size; defaults to the module-level ``ONCE``.
            insert_num: Bulk batch size; defaults to the module-level
                ``INSERT_NUM``.
            errors: List that receives the sequence number of the last
                document of every batch that failed to insert; defaults to
                the module-level ``ERROR_ELE``.
        """
        # The module globals are looked up at call time (not as default
        # values) because the driver script rebinds SKIPNUM between calls.
        skipnum = SKIPNUM if skipnum is None else skipnum
        once = ONCE if once is None else once
        insert_num = INSERT_NUM if insert_num is None else insert_num
        errors = ERROR_ELE if errors is None else errors

        offset = skipnum * once
        cursor = self.client[db][collection].find().skip(offset).limit(once)

        actions = []
        count = 0
        for item in cursor:
            source = dict(item)
            # The MongoDB _id is not copied into the ES document source.
            source.pop('_id', None)
            actions.append({
                "_index": index,
                "_type": types,
                "_source": source,
            })
            count += 1
            print('第' + str(offset + count) + '篇论文已加入列表')
            if len(actions) >= insert_num:
                print("截止到" + str(offset + count) + "篇论文正在准备插入")
                self._flush_batch(actions, offset + count, errors)
                actions = []

        # Flush the trailing partial batch, if any. Nothing is sent when the
        # page size is an exact multiple of the batch size.
        if actions:
            self._flush_batch(actions, offset + count, errors)

    def cleartypes(self, index, types):
        """Delete every document of type ``types`` from the ES index ``index``."""
        query = {'query': {'match_all': {}}}
        self.es.delete_by_query(index=index, body=query, doc_type=types)
if __name__ == '__main__':
    # Driver: copy pages START .. END-1 of Business.paper into the ES index
    # 'paper_index', then print the positions at which inserts failed.
    # The instance is named `searcher` so that it does not rebind (shadow)
    # the `zebrasearch` class name.
    host = '139.199.96.196'
    searcher = zebrasearch()
    searcher.connect_es(host, 9200)
    searcher.connect_mongo(host, 27017)

    # Other one-off operations, kept for reference:
    # searcher.mongo2es('Business', 'mechanism', 'business', 'user')
    # print(searcher.es.search(index='business', doc_type='scisource'))
    # searcher.cleartypes('busscisource', 'scisource')

    # Experts: insert 10 per batch, fetch 100 per round.
    # Papers: insert 100 per batch, fetch 1000 per round.
    START = 300
    END = START + 376
    # mongo2es reads the module-level SKIPNUM to decide which page to fetch,
    # so it is advanced here once per round.
    SKIPNUM = START
    for i in range(START, END):
        print("第" + str(i) + "轮")
        searcher.mongo2es('Business', 'paper', 'paper_index', '_doc')
        SKIPNUM += 1
    print(ERROR_ELE)