-
Notifications
You must be signed in to change notification settings - Fork 21
/
cnvd_fofa_gather.py
224 lines (182 loc) · 7.63 KB
/
cnvd_fofa_gather.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import re
import base64
import requests
from urllib.parse import quote
import time
# Suppress urllib3's InsecureRequestWarning — every request below uses verify=False.
requests.packages.urllib3.disable_warnings()
# Shared HTTP headers: mimic an AJAX request coming from a desktop Chrome browser.
headers = {
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.9 Safari/537.36',
}
def get_proxy():
    """Fetch one proxy entry from the local proxy-pool service, parsed as JSON."""
    response = requests.get("http://127.0.0.1:5010/get/")
    return response.json()
def delete_proxy(proxy):
    """Ask the local proxy-pool service to drop *proxy* from its pool."""
    delete_url = "http://127.0.0.1:5010/delete/?proxy={}".format(proxy)
    requests.get(delete_url)
def getHtml(urls):
    """GET *urls*: first through a pool proxy (up to 3 tries), then via the local IP.

    Returns the requests.Response on success, or None when every attempt fails.
    Fixes vs. the original:
      - the 1s inter-retry delay used to sit after `return None` (unreachable);
        it now actually runs between proxy retries;
      - delete_proxy() used to receive the proxies *dict*, so the pool's delete
        endpoint got a dict repr and the dead proxy was never removed — it now
        gets the raw "ip:port" string, once, after retries are exhausted.
    """
    print("---》》使用代理IP《《---")
    retry_count = 3
    raw_proxy = get_proxy().get("proxy")
    proxies = {
        "http": "http://{}".format(raw_proxy),
        "https": "https://{}".format(raw_proxy),
    }
    while retry_count > 0:
        try:
            # Fetch through the pool proxy.
            return requests.get(urls, headers=headers, proxies=proxies,
                                verify=False, timeout=10)
        except Exception:
            print('proxy_err:', retry_count)
            retry_count -= 1
            # Small delay between retries so we don't hammer the target.
            time.sleep(1)
    # The proxy failed all retries: remove it from the pool (raw address string,
    # not the proxies dict), then fall back to the local IP for resilience.
    delete_proxy(raw_proxy)
    print('---》》使用本地IP《《---')
    try:
        return requests.get(urls, headers=headers, verify=False)
    except Exception as u:
        print('lo_err:', u)
        return None
def fofa_search(kjgs, gs):
    """Query FOFA's classic stats endpoint for the company keyword *kjgs*.

    Scrapes the independent-IP total and the top-ranked site title out of the
    raw response text; when the result crosses the significance thresholds
    below, appends a summary line for the full company name *gs* to company.txt.
    NOTE(review): all parsing here is tied to the exact text layout of the
    classic.fofa.so result_stats response — verify against a live response
    before changing any of the offsets or patterns.
    """
    url = 'https://classic.fofa.so//search//result_stats?qbase64='
    search = '"' + kjgs + '"'
    search_data_bs = str(base64.b64encode(search.encode("utf-8")), "utf-8")
    search_data_url = quote(search_data_bs)  # URL-encode the base64 query
    urls = url + search_data_url
    # print(urls)
    result = getHtml(urls)
    # Keep re-requesting until an HTTP 200 response is obtained
    # (getHtml may return None; '空' means "empty").
    while 1:
        if type(result) == type(None):
            print('空')
            result = getHtml(urls)
        if result.status_code == 200:
            # print(200)
            break
        else:
            print(result.status_code)
            # time.sleep(1)
            # proxy = get_proxy().get("proxy")
            result = getHtml(urls)
    results = result.content.decode('utf-8')
    # print(results)
    # Extract the independent-IP total.
    # print(results)
    # (1) Slice off everything after the first "ip_count = " marker.
    ip_count_start = re.search('ip_count = ', results).span()[1]
    ip_data = results[ip_count_start:]
    # (2) The first number after "ip_count = " is the independent-IP count.
    ip_count = re.match(r'\d+', ip_data).group()  # group() returns the matched text
    print(kjgs + "--->独立总数IP:" + ip_count)
    if int(ip_count) > 20:
        # Extract the first (top-ranked) site title from the part of the
        # response that precedes the ip_count marker.
        title_data1 = results[:ip_count_start]
        title_data2_start = re.search('网站标题排名', title_data1).span()[1]
        title_data2 = title_data1[title_data2_start:]
        # print(title_data2)
        # print(title_start)
        title_data3 = re.search('<a.*">', title_data2).group()
        title_data3_start = re.search('<a.*">', title_data2).group()[0]  # NOTE(review): unused
        # print(title_data3)
        # Matching a literal backslash-escaped tag needs three backslashes:
        # Python unescapes once, the regex engine unescapes again.
        title_data4_start = re.search('<\\\/a>', title_data3).span()[0]
        title_data5 = title_data3[:title_data4_start]
        title_data5_start = re.search('>', title_data5).span()[1]
        title = title_data3[title_data5_start:title_data4_start]
        # Extract the site count displayed next to that first title.
        num_flag1 = re.search('<span >', title_data3).span()[1]
        num_flag2 = re.search('<\\\/span>', title_data3).span()[0]
        num = title_data3[num_flag1:num_flag2]
        if int(num) > 15 or int(ip_count) >= 300:
            print("------>>标题榜首:" + title + "---->>对应条数:" + num)
            with open(r'company.txt', 'a+') as f:
                f.write(kjgs + '/' + gs + "-->独立总数IP:" + ip_count + "->标题榜首:" + title + "->标题对应数:" + num)
                f.write('\n')
                f.write('\n')
                f.close()  # redundant: the with-statement already closes f
if __name__ == '__main__':
# fofa_search("科技")
# 打开公司列表,获取公司名称
print("开始收集--------")
for f in open('gs.txt', 'rb'):
gs = str(f, "utf-8")
gs = gs.strip()
# 获取科技前面的字段
try:
if re.search(r'科技', gs):
start = re.search(r'科技', gs).span()[1]
kj = gs[:start]
# 去除括号内容
if '(' in kj:
start = re.search(r'\(', kj).span()[0]
end = re.search(r'\)', kj).span()[1]
kj_last = kj.replace(kj[start:end], '')
fofa_search(kj_last, gs)
# q.put(kj_last)
else:
fofa_search(kj, gs)
# q.put(kj)
elif re.search(r'技术', gs):
start = re.search(r'技术', gs).span()[1]
kj = gs[:start]
if '(' in kj:
start = re.search(r'\(', kj).span()[0]
end = re.search(r'\)', kj).span()[1]
kj_last = kj.replace(kj[start:end], '')
fofa_search(kj_last, gs)
# q.put(kj_last)
else:
fofa_search(kj, gs)
# q.put(kj)
elif re.search(r'软件', gs):
start = re.search(r'软件', gs).span()[1]
kj = gs[:start]
if '(' in kj:
start = re.search(r'\(', kj).span()[0]
end = re.search(r'\)', kj).span()[1]
kj_last = kj.replace(kj[start:end], '')
fofa_search(kj_last, gs)
# q.put(kj_last)
else:
fofa_search(kj, gs)
# q.put(kj)
elif re.search(r'股份', gs):
start = re.search(r'股份', gs).span()[0]
kj = gs[:start]
if '(' in kj:
start = re.search(r'\(', kj).span()[0]
end = re.search(r'\)', kj).span()[1]
kj_last = kj.replace(kj[start:end], '')
fofa_search(kj_last, gs)
# q.put(kj_last)
else:
fofa_search(kj, gs)
# q.put(kj)
elif re.search(r'有限', gs):
start = re.search(r'有限', gs).span()[0]
kj = gs[:start]
if '(' in kj:
start = re.search(r'\(', kj).span()[0]
end = re.search(r'\)', kj).span()[1]
kj_last = kj.replace(kj[start:end], '')
fofa_search(kj_last, gs)
# q.put(kj_last)
else:
fofa_search(kj, gs)
# q.put(kj)
else:
if '(' in gs:
start = re.search(r'\(', gs).span()[0]
end = re.search(r'\)', gs).span()[1]
gs_last = gs.replace(gs[start:end], '')
fofa_search(gs_last, gs)
# q.put(kj_last)
else:
kj = gs
# print(kj,gs)
fofa_search(kj, gs)
# q.put(kj)
except Exception as u:
print('main_err:', u)