Skip to content

Commit

Permalink
it之家,热门评论抓取
Browse files Browse the repository at this point in the history
  • Loading branch information
Ehco1996 committed Aug 25, 2017
1 parent d0980ee commit 4ff8f0a
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 0 deletions.
7 changes: 7 additions & 0 deletions ithome/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
'''
Configuration for the MongoDB connection.
'''

# Host handed to MongoClient when the pipeline module connects.
MONGO_URL = 'localhost'
# Name of the database that holds the scraped data.
MONGO_DB = 'ithome'
# Name of the collection the hot comments are written to.
MONGO_TABLE = 'hotcomment_network'
18 changes: 18 additions & 0 deletions ithome/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
'''
Data pipeline:
persist scraped records to MongoDB.
'''

from pymongo import MongoClient
from config import *

# Module-level client and database handle, created once at import time and
# used by save_to_mongo below.
client = MongoClient(MONGO_URL, connect=True)
db = client[MONGO_DB]

def save_to_mongo(result):
    """Store scraped data in the configured MongoDB collection.

    Args:
        result: A dict describing one record, or a list of such dicts.

    Returns:
        True when the driver reports at least one inserted id, else False.
    """
    collection = db[MONGO_TABLE]
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one()/insert_many() are the supported replacements.
    if isinstance(result, list):
        stored = bool(collection.insert_many(result).inserted_ids)
    else:
        stored = collection.insert_one(result).inserted_id is not None

    if stored:
        print('存储成功', result)
        return True
    return False


158 changes: 158 additions & 0 deletions ithome/spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
'''
it 之家热评抓取
'''


import requests
from bs4 import BeautifulSoup
# 导入数据库存储方法
from pipeline import save_to_mongo


def _split_location_and_time(text):
    """Return the (location, time) fields of a comment's info text."""
    fields = (
        text.replace('IT之家', '')
        .replace('网友', ' ')
        .replace('\xa0', '')
        .split(' ')
    )
    return fields[0], fields[2]


def _parse_comment_entry(comment):
    """Turn one ``<li class="entry">`` element into a comment dict."""
    content = comment.find('p').text
    name = comment.find('strong', class_='nick').get_text()
    spans = comment.find('div', class_='info rmp').find_all('span')

    if len(spans) > 1:
        # The first span carries the phone "tail" (vendor and model); the
        # second one carries location and time.
        phone_fields = spans[0].text.split(' ')
        phone_com, phone_model = phone_fields[0], phone_fields[1]
        loc, comment_time = _split_location_and_time(spans[1].text)
    else:
        # No phone tail was filled in: only location and time are present.
        phone_com = '暂无'
        phone_model = '暂无'
        loc, comment_time = _split_location_and_time(spans[0].text)

    return {
        'name': name,
        'content': content,
        'phone_com': phone_com,
        'phone_model': phone_model,
        'loc': loc,
        'time': comment_time,
    }


def parse_hot_comment(newsid):
    """Fetch and parse the hot comments of one ITHome news article.

    Args:
        newsid: Id of the news article, sent as the ``newsID`` form field.

    Returns:
        A list of dicts with the keys ``name``, ``content``, ``phone_com``,
        ``phone_model``, ``loc`` and ``time`` (empty when nothing could be
        parsed), or ``None`` when the HTTP request fails.
    """
    payload = {
        'newsID': newsid,
        'type': 'hotcomment',
    }
    try:
        r = requests.post(
            'https://dyn.ithome.com/ithome/getajaxdata.aspx',
            data=payload,
            # Without a timeout a stalled connection blocks the caller forever.
            timeout=10,
        )
        r.raise_for_status()
    except requests.RequestException:
        # Network or HTTP failure: report "no result" to the caller.
        return None

    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')

    info_list = []
    for comment in soup.find_all('li', class_='entry'):
        try:
            info_list.append(_parse_comment_entry(comment))
        except (AttributeError, IndexError):
            # This entry lacks an expected tag or field; skip it instead of
            # discarding every other comment of the article.
            continue
    return info_list


def parse_news_id(categoryid, page_start, page_count=10):
    """Yield the ids of the news articles listed under one category.

    Args:
        categoryid: Category id sent as the ``categoryid`` form field.
        page_start: Number of the first listing page to request.
        page_count: How many consecutive listing pages to request.

    Yields:
        The news id of every listed article: the last path segment of the
        article link with ``.htm`` removed.
    """
    # Original author's note: the listing reaches back to about December 2014.
    # The range end is exclusive, so exactly ``page_count`` pages are fetched
    # and batches started at 1, 11, 21, ... do not overlap.
    for page in range(page_start, page_start + page_count):
        payload = {
            'categoryid': categoryid,
            'type': 'pccategorypage',
            'page': str(page),
        }
        try:
            r = requests.post(
                'http://it.ithome.com/ithome/getajaxdata.aspx',
                data=payload,
                # Without a timeout a stalled connection blocks forever.
                timeout=10,
            )
        except requests.RequestException:
            # Stop generating ids once a listing page cannot be fetched.
            return

        soup = BeautifulSoup(r.text, 'lxml')
        for news in soup.find_all('a', class_='list_thumbnail'):
            href = news.get('href')
            if href:
                yield href.split('/')[-1].replace('.htm', '')


import time
# 写了一个检测函数运行时间的装饰器


def clock(func):
    """Decorator that prints the wall-clock duration of each call to *func*.

    The wrapper forwards positional and keyword arguments, prints one line
    of the form ``[<seconds>s] name(args) -> result`` and returns the value
    produced by *func*.

    Args:
        func: The callable to time.

    Returns:
        A wrapper carrying the name and docstring of *func*.
    """
    import functools

    # functools.wraps copies __name__/__qualname__/__doc__ onto the wrapper,
    # so a decorated module-level function keeps its own identity.
    @functools.wraps(func)
    def clocked(*args, **kwargs):
        t0 = time.perf_counter()
        result = func(*args, **kwargs)
        timepassed = time.perf_counter() - t0

        arg_parts = [repr(arg) for arg in args]
        arg_parts.extend(
            '{}={!r}'.format(key, value) for key, value in kwargs.items())

        print('[{:.8f}s] {}({}) -> {}'.format(
            timepassed, func.__name__, ', '.join(arg_parts), result))
        # Hand the wrapped function's result back to the caller; without this
        # every decorated function would return None.
        return result

    return clocked


#@clock
def main(page_start):
    """Scrape and store the hot comments of the Apple news category.

    Args:
        page_start: First listing page handed to ``parse_news_id``.
    """
    # Category id of the Apple news section.
    apple_category_id = '32'

    # parse_news_id is a generator, so article ids are consumed lazily.
    for news_id in parse_news_id(apple_category_id, page_start):
        comments = parse_hot_comment(news_id)
        if not comments:
            print('没有抓取到热评,一般是文章太过久远')
            continue
        for item in comments:
            save_to_mongo(item)


if __name__ == '__main__':
    # Single-process alternative: call main(1) directly instead.

    # Multi-process mode: every start page is passed to main() as its own
    # task in the process pool.
    from multiprocessing import Pool

    start_pages = list(range(1, 31, 10))
    workers = Pool()
    workers.map(main, start_pages)
    workers.close()
    workers.join()



'''
开启多进程之前 ,抓取一页新闻的所有热评所花费的时间
[8.45930967s] main() -> None
抓取10页:
[112.86940903s] main(61) -> None
开启后:
不能使用装饰器测时间了
AttributeError: Can't pickle local object 'clock.<locals>.clocked'
改为第三方秒表计时:
1~40:
1:56.54
可以看到 速度快了三倍!
'''

0 comments on commit 4ff8f0a

Please sign in to comment.