# cache.py
import argparse
import requests
import os
import json
import time
import pickle

from config import SUB

with open('sender.sess', 'br') as f:
    sess = pickle.load(f)


# TODO: update the headers
def headers():
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'cookie': f'SUB={SUB}',
    }
    return header


def strftime(ttint):
    """Convert a time.time() timestamp into a human-readable string."""
    fmt = '%Y%m%d-%H:%M:%S'
    value = time.localtime(int(ttint))
    dt = time.strftime(fmt, value)
    return dt
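
# An illustrative use of strftime (not part of the original flow); the
# exact output depends on the local timezone:
#
#   strftime(time.time())  # e.g. '20190601-12:30:05'
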
def log(*args, **kwargs):
    """
    Append a line to log.txt, prefixed with the current time.
    """
    fmt = '%Y%m%d-%H:%M:%S'
    value = time.localtime(int(time.time()))
    dt = time.strftime(fmt, value)
    with open('log.txt', 'a', encoding='utf-8') as f:
        print(dt, *args, file=f, **kwargs)
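
# A minimal sketch of what a log call produces (illustrative only):
#
#   log('hello')  # appends a line like '20190601-12:30:05 hello' to log.txt
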
class Model(object):
    """Prints the instance attributes in its repr."""

    def __repr__(self):
        name = self.__class__.__name__
        properties = (f'{k}=({v})' for k, v in self.__dict__.items())
        string = '\n '.join(properties)
        s = f'\n<{name} \n {string}>'
        return s


class Weibo(Model):
    def __init__(self):
        self.ct = ''
        self.device = ''
        self.text = ''
        # repost count
        self.forward = ''
        # comment count
        self.comment_c = ''
        self.comment_ct = ''
        self.comment = ''
        self.ans_ct = ''
        self.ans_comment = ''
        self.like = ''
        self.photo = ''
        # TODO: content of reposted weibos
        self.expand = ''
        # Sina's ids
        self.id = ''
        self.mid = ''

    def dict_attr(self):
        """
        Convert the attributes and their values into a dict,
        dropping every key whose value is empty.
        """
        # `!=` (and `is not None`) is the correct check here; comparing
        # to '' or 0 with `is` tests identity, not value.
        return {name: value
                for name, value in vars(self).items()
                if value != ''
                and value is not None
                and value != 0
                }
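
# A minimal usage sketch of dict_attr (illustrative, not part of the
# original flow): attributes still at their empty defaults are dropped.
#
#   w = Weibo()
#   w.text = 'hello'
#   w.dict_attr()  # -> {'text': 'hello'}
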
def write_cached(strs, path):
    """Write data to the cache."""
    with open(path, 'w', encoding='utf-8') as f:
        r = f.write(strs)
    log(f"{path}\nwritten to cached")
    return r


def read_cached(path):
    """Read JSON data back from the cache."""
    with open(path, 'rb') as f:
        r = json.load(f)
    log(f"{path}\nread from cached")
    return r
def weibo_path(url):
    """Cache path for a downloaded weibo page."""
    folder = 'cached'
    filename = url.split('=')[-1] + '.txt'
    path = os.path.join(folder, filename)
    return folder, path
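
# Illustrative example (the container id is made up): for a url ending in
# 'page=3', url.split('=')[-1] is '3', so on POSIX:
#
#   weibo_path('...?containerid=123&page=3')  # -> ('cached', 'cached/3.txt')
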
def comment_path(url):
    """
    Cache path for downloaded comments.
    folder: cached/comment
    path: cached/comment/<mid>.txt
    """
    folder_f = 'cached'
    folder_c = 'comment'
    folder = os.path.join(folder_f, folder_c)
    # url.split('id=')[-2] lands on the 'mid=...' parameter, so the file
    # is named after the weibo's mid.
    filename = url.split('id=')[-2].split('&')[0] + '.txt'
    path = os.path.join(folder, filename)
    return folder, path
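
# Illustrative example with made-up ids: for
# '...hotflow?id=4321&mid=4321&max_id_type=0', split('id=')[-2] is
# '4321&max_', so on POSIX:
#
#   comment_path(url)  # -> ('cached/comment', 'cached/comment/4321.txt')
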
def cached_page(url, folder, path):
    """
    Download the JSON data from the weibo ajax API.
    :url: weibo ajax api
    :return: the dict produced by json.loads()
    """
    # Create the cache folder if it does not exist yet
    if not os.path.exists(folder):
        os.makedirs(folder)
    # If already downloaded, read straight from the cache
    # (the data is time-sensitive)
    if os.path.exists(path):
        r = read_cached(path)
        return r
    # Otherwise connect to the site and fetch the data
    else:
        r = requests.get(url, headers=headers())
        time.sleep(3)
        log('every connection', r)
        log(f"requested data from url, pausing 3 seconds,\n{url}")
        # If the response contains the sentinel below, do not save it:
        # when the weibo data runs out, the response contains that string;
        # when the comment data runs out, the JSON has only one key.
        # The sentinel is the unicode-escaped form of Weibo's
        # "nothing here yet" message.
        s = bytes('这里还没有内容', encoding='unicode_escape')
        if s not in r.content and r.status_code != 400 and len(r.content) >= 10:
            # Load the bytes as JSON, then dump them back out with
            # indentation so the cache file is easy to read; the dump
            # step can be dropped once the data is being extracted.
            # Known failure modes seen while debugging:
            # - 'utf-8' codec can't decode byte 0xd0
            # - json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
            #   1. the comments are gone and the response is an HTML
            #      document instead of JSON (<Response [400]>)
            #   2. the cookie has expired
            log('before writing cached', r.status_code)
            js = json.loads(r.content)
            j = json.dumps(dict(js), indent=4, ensure_ascii=False)
            # Write the data to the cache
            write_cached(j, path)
            return js
        else:
            print('Download finished, or the content could not be parsed')
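
# A minimal usage sketch (the container id is made up): the first call
# fetches the page and caches it, later calls read from the cache.
#
#   url = 'https://m.weibo.cn/api/container/getIndex?containerid=123&page=1'
#   folder, path = weibo_path(url)
#   data = cached_page(url, folder, path)  # dict, or None when exhausted
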
def who_say(data):
    """Show the commenter's remark name if one is set, else the screen name."""
    whoSay = data['user']['screen_name']
    if 'remark' in data['user']:
        whoSay = data['user']['remark']
    return whoSay
def answer_comment_clean(ans_comment):
    """
    Take the raw comment data (HTML)
    and filter out a clean comment string.
    """
    ans_comment1 = ans_comment.split('<a')[0]
    ans_comment2 = ans_comment.split('>')[1].split('<')[0]
    ans_comment3 = ans_comment.split('/a>')[-1]
    ans_comment32 = ans_comment3.split('<')[0]
    # Pull out the embedded link, if any
    ans_comment4 = ''
    if ' src="' in ans_comment3:
        ans_comment4 = ans_comment3.split(' src="')[1].split('"')[0]
    ans_comment = ans_comment1 + ans_comment2 + ans_comment32 + ans_comment4
    return ans_comment
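
# A worked example with a made-up reply (illustrative only): the pieces
# kept are the text before <a>, the anchor text, the text after </a>, and
# any embedded src link.
#
#   raw = '回复<a href="/n/user">@user</a>:好的<span><img src="https://h5.sinaimg.cn/face.png"></span>'
#   answer_comment_clean(raw)  # -> '回复@user:好的https://h5.sinaimg.cn/face.png'
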
def answer_comments(data):
    """
    Replies to a comment, i.e. the back-and-forth thread.
    Returns a list shaped like [{time: say}].
    """
    # .get avoids a KeyError when the reply list is missing entirely
    if data.get('comments'):
        n = len(data['comments'])
        ans_cts = []
        ans_comments = []
        for i in range(n):
            ans_ct = data['comments'][i]['created_at']
            ans_comment_dirty = data['comments'][i]['text']
            # Filter out the clean reply text
            ans_comment = answer_comment_clean(ans_comment_dirty)
            ans_cts.append(ans_ct)
            ans_comments.append({ans_ct: ans_comment})
    else:
        ans_comments = None
        ans_cts = None
    # Also return ans_cts as an attribute, used to index the comments
    return ans_cts, ans_comments
def get_comment(w_id, w_mid):
    """
    Fetch the comments:
    the reposted content and the comment text
    (the id is part of the url; the comments live in a separate JSON).
    """
    url = f'https://m.weibo.cn/comments/hotflow?id={w_id}&mid={w_mid}&max_id_type=0'
    print("url with a comment_count", url)
    # TODO: detect whether the account is blocked and comments are unreachable
    # Download the comment data into cached/comment
    folder, path = comment_path(url)
    jsons = cached_page(url, folder, path)
    # There is a comment count but no comments
    if jsons is None or len(jsons) <= 1:
        log(f'the comments were deleted,\n{url}')
        return 'the comments were deleted'
    data = jsons['data']['data'][0]
    ct = data['created_at']
    whoSay = who_say(data)
    comment = data['text']
    # Times and texts of the reply thread
    ans_cts, ans_comments = answer_comments(data)
    return [(ct, whoSay, comment), ans_cts, ans_comments]
def split_to_link(marker, weibo_dict):
    """
    Split out the link string after a marker such as href=".*?" or src=".*?".
    """
    texts = weibo_dict['text'].split('<', 1)
    text2 = texts[1].split(marker, 1)[1].split('"')[0]
    link = text2
    return link
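
# Illustrative example with a made-up text value:
#
#   d = {'text': 'hi<a href="https://t.cn/abc">link</a>'}
#   split_to_link(' href="', d)  # -> 'https://t.cn/abc'
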
def weibo_text_clean(w, weibo_dict):
    """
    :param w: a Weibo object
    :param weibo_dict: the dict dug out of the JSON, describing one weibo
    :return: w.text, purged of the HTML in the content
    """
    # Strip the four spaces trailing the text
    if '    ' in weibo_dict['text']:
        text = weibo_dict['text'].split('    ', 1)[0] + weibo_dict['text'].split('    ')[1]
    else:
        text = weibo_dict['text']
    # If links are present, pull them out
    if ' src="' in weibo_dict['text'] and ' href="' in weibo_dict['text']:
        log('src and href in weibotext', weibo_dict['text'])
        texts = weibo_dict['text'].split('<a', 1)
        text1 = texts[0]
        link1 = split_to_link(' src="', weibo_dict)
        link2 = split_to_link(' href="', weibo_dict)
        text = text1 + link1 + link2
    elif ' src="' in weibo_dict['text']:
        texts = weibo_dict['text'].split('<a', 1)
        text1 = texts[0]
        link = split_to_link(' src="', weibo_dict)
        text = text1 + link
    elif ' href="' in weibo_dict['text']:
        texts = weibo_dict['text'].split('<a', 1)
        text1 = texts[0]
        link = split_to_link(' href="', weibo_dict)
        text = text1 + link
    w.text = text
    return w.text
def set_weibo_attr(weibo_dict):
    """Set the attributes of a single weibo."""
    w = Weibo()
    w.ct = weibo_dict['created_at']
    # TODO: text beyond 140 characters
    w.text = weibo_text_clean(w, weibo_dict)
    if 'original_pic' in weibo_dict:
        w.photo = weibo_dict['original_pic']
    # w.id and w.mid are str, w.comment_c is int
    w.id = weibo_dict['id']
    w.mid = weibo_dict['mid']
    w.comment_c = weibo_dict['comments_count']
    # If the comment count is non-zero (the comments may have been
    # deleted); fetching comments is currently disabled, hence `if False`:
    # if w.comment_c != 0:
    if False:
        # Fetch the list of comments
        comments = get_comment(w.id, w.mid)
        if 'the comments were deleted' not in comments:
            w.comment_ct = comments[0][0]
            # user + ':' + what they said
            w.comment = comments[0][1] + ':' + comments[0][2]
            w.ans_comment = comments[2]
        else:
            w.comment = 'the comments were deleted'
    w.like = weibo_dict['attitudes_count']
    w.forward = weibo_dict['reposts_count']
    w.device = weibo_dict['source']
    return w
def weibo_list(url):
    """
    Read the data for one url, walk every mblog,
    build an object for each, and return them as a list.
    """
    folder, path = weibo_path(url)
    j = cached_page(url, folder, path)
    # Usually ten mblogs make one page of cards,
    # i.e. ten weibos per JSON file.
    # Stop when there is no content (cached_page returned None)
    if j is None:
        return
    mblogs = j['data']['cards']
    lens = len(mblogs)
    # lists holds the ten weibo objects of this page
    lists = []
    for i in range(lens):
        mblog = j['data']['cards'][i]['mblog']
        m = set_weibo_attr(mblog)
        lists.append(m)
    return lists
def weibo_list_dict(url):
    """
    The dict for one url:
    {
        id: {'ct': 2017, 'text': ".*?"},
    }
    """
    lists = weibo_list(url)
    if lists is None:
        return
    # Walk every object and build the dict
    dicts = {}
    for w in lists:
        d = w.dict_attr()
        # Are id and mid the same thing?
        if d['id'] == d['mid']:
            del d['mid']
        else:
            log(f"found a magical mid,\n{url}")
        # Add it to the dict
        dicts[d['id']] = d
    return dicts
def write_weibo_json(url):
    """
    Append to weibo.txt.
    """
    d = weibo_list_dict(url)
    if d is None:
        return
    with open('weibo.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(d, indent=4, ensure_ascii=False))
    log(f"written to weibo.txt,\n{url}")
    return d
def auto(func, url_root):
    """
    Run automatically until there is no more data.
    """
    b, n = 1, 1
    while b is not None:
        # TODO: appending to the same url string would give 1, 12, 123,
        # 1234 and skip pages, so rebuild the url from url_root each time
        print(f"Page {n} ...")
        url = url_root + str(n)
        b = func(url)
        n += 1
        log('auto function url', n)
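
# A minimal usage sketch (illustrative container id): page numbers
# 1, 2, 3, ... are appended to url_root until func returns None.
#
#   root = 'https://m.weibo.cn/api/container/getIndex?containerid=123&page='
#   auto(write_weibo_json, root)
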
def parse_args():
    parser = argparse.ArgumentParser(description='Fetch weibo!')
    parser.add_argument(
        '-cid', dest='cid', type=str,
        help='Container ID'
    )
    args = parser.parse_args()
    return args
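
# Example invocation (the container id is made up):
#
#   python cache.py -cid 1076031234567890
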
def main():
    # Delete weibo.txt and log.txt before starting (both are appended to);
    # check each file separately so a missing log.txt does not crash
    if os.path.exists('weibo.txt'):
        os.remove('weibo.txt')
    if os.path.exists('log.txt'):
        os.remove('log.txt')
    # The ajax API of m.weibo.cn.
    # Drop the page number: the url must end with 'page='.
    # TODO: edit the url here
    args = parse_args()
    cid = args.cid
    url = f'https://m.weibo.cn/api/container/getIndex?containerid={cid}&page='
    auto(write_weibo_json, url)
    # write_weibo_json(url)
    # weibo_list(url)
    # weibo_list_dict(url)


if __name__ == '__main__':
    main()