forked from Ehco1996/Python-crawler
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
670 additions
and
105 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import json | ||
import os | ||
|
||
BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | ||
|
||
|
||
def parse_activites(file_path):
    """Parse one downloaded user-activity JSON file into flat records.

    Only three activity verbs are kept: ``ANSWER_VOTE_UP`` and
    ``ANSWER_CREATE`` (the target is an answer that embeds its question)
    and ``QUESTION_FOLLOW`` (the target is the question itself, so the
    answer fields are left as empty strings). Any other verb is skipped.

    Args:
        file_path: Path of a JSON file whose top-level object holds the
            activities under the ``data`` key.

    Returns:
        list: One dict per kept activity with the keys ``question_id``,
        ``question_name``, ``question_api_url``, ``answer_id``,
        ``answer_api_url`` and ``answer_content``. Empty when the file has
        no ``data`` entry.

    Raises:
        KeyError: If a kept activity lacks one of the expected fields.
    """
    # Read as UTF-8 explicitly: the payload contains non-ASCII text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file_path, encoding='utf-8') as f:
        # A missing or null "data" entry means "no activities", not a crash.
        data = json.load(f).get('data') or []

    res = []
    for action in data:
        verb = action['verb']
        if verb in ('ANSWER_VOTE_UP', 'ANSWER_CREATE'):
            # Up-voting or writing an answer: target is the answer.
            answer = action['target']
            question = answer['question']
            answer_id = answer['id']
            answer_api_url = answer['url']
            answer_content = answer['excerpt']
        elif verb == 'QUESTION_FOLLOW':
            # Following a question: target is the question, no answer.
            question = action['target']
            answer_id = ''
            answer_api_url = ''
            answer_content = ''
        else:
            # "target" is deliberately not touched for verbs we ignore.
            continue

        res.append({
            'question_id': question['id'],
            'question_name': question['title'],
            'question_api_url': question['url'],
            'answer_id': answer_id,
            'answer_api_url': answer_api_url,
            'answer_content': answer_content,
        })
    return res
|
||
|
||
# Parse every downloaded activity file under <BASE_DIR>/data and print each
# field of each parsed record.
data_dir = os.path.join(BASE_DIR, 'data')
for file in os.listdir(data_dir):
    if not file.endswith('.json'):
        # Ignore stray entries (editor/OS metadata files, sub-directories)
        # that would otherwise make the JSON parser fail.
        continue
    file_abs_path = os.path.join(data_dir, file)
    res = parse_activites(file_abs_path)
    for data in res:
        for k, v in data.items():
            print(k, v)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import json | ||
import time | ||
import os | ||
|
||
from client import ZhihuClient | ||
|
||
BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | ||
|
||
|
||
def download_activs_json(s, url, count=1):
    """Download an activity feed page by page into ``<BASE_DIR>/data``.

    Each page is fetched with ``s.get(url).json()`` and written to
    ``data/<count>.json``. The next page URL is taken from
    ``paging.next`` until ``paging.is_end`` is true.

    Pages are followed in a loop rather than by recursion, so a long feed
    cannot exhaust the interpreter's recursion limit.

    Args:
        s: Session-like object exposing ``get(url)`` whose result has a
            ``json()`` method.
        url: URL of the first page to download.
        count: Number used for the first output file name; incremented by
            one for every following page.

    Raises:
        KeyError: If a response lacks the ``paging`` information.
    """
    data_dir = os.path.join(BASE_DIR, 'data')
    # Make sure the output directory exists before the first write.
    os.makedirs(data_dir, exist_ok=True)

    while True:
        res = s.get(url).json()
        out_path = os.path.join(data_dir, '{}.json'.format(count))
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # must be written as UTF-8 regardless of the platform default.
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(res, ensure_ascii=False))
        print('正在下载第{}份动态'.format(count))
        count += 1

        if res['paging']['is_end']:
            break
        url = res['paging']['next']
        # Pause between requests; there is nothing to wait for after the
        # final page.
        time.sleep(3)

    print('所有动态下载完毕')
|
||
|
||
# Log in to Zhihu and obtain a session.
# NOTE(review): both credentials are empty strings here — presumably meant
# to be filled in before running; confirm.
s = ZhihuClient('', '').get_session()
# Add the authorization header (value is left empty in this file).
s.headers.update({'authorization': ''})
# URL of the first activity page to download.
start_url = (
    'https://www.zhihu.com/api/v4/members/Ehcostuff/activities'
    '?limit=8&after_id=1518305424&desktop=True'
)
download_activs_json(s, start_url)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import shutil | ||
|
||
import requests | ||
|
||
|
||
def get_image(url, path, timeout=30):
    """Stream the resource at ``url`` into the file ``path``.

    Args:
        url: Address of the image to download.
        path: Destination file path; opened in binary write mode.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout the request could block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved as if it were the image.
    """
    # The context manager releases the connection even if copying fails.
    with requests.get(url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        # The raw stream is not decoded by default; without this a gzip- or
        # deflate-encoded body would be written to disk still compressed.
        res.raw.decode_content = True
        # Open the destination only after the status check so a failed
        # request does not leave an empty file behind.
        with open(path, 'wb') as f:
            shutil.copyfileobj(res.raw, f)
|
||
|
||
def save_html(text, name):
    """Write ``text`` to the file ``name`` as UTF-8.

    Args:
        text: The HTML (or any other string) to save.
        name: Destination file path; an existing file is overwritten.
    """
    # Encode explicitly: the platform default encoding may be unable to
    # represent non-ASCII characters and would raise UnicodeEncodeError.
    with open(name, 'w', encoding='utf-8') as f:
        f.write(text)
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.