Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
moonfighting committed Jan 14, 2016
1 parent 29ea41c commit 59e4ab7
Showing 1 changed file with 12 additions and 30 deletions.
42 changes: 12 additions & 30 deletions Tmall/Tmall_comment_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,23 @@
import urllib
import urllib2
import json
from bs4 import BeautifulSoup
import time
import random
import os
import re
import threading
import Queue
import cookielib
import config

headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

#base_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=521893618096&sellerId=263726286&currentPage=%d'
base_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=%s&sellerId=%s&currentPage=%d'

huawei = 'https://huawei.tmall.com/category-1022967656.htm'
def get_Tmall_comment(itemId, sellerId, save_dir):
def get_Tmall_comment(itemId, sellerId, save_dir): #itemId 是商品的id, sellerId是卖家的id
if not os.path.exists(save_dir):
os.mkdir(save_dir)
file_name = os.path.join(save_dir, sellerId + '_' + itemId)
file_name = os.path.join(save_dir, sellerId + '_' + itemId)
fp = open(file_name, 'w')
comment_set = set()
for i in range(1, 100):
for i in range(1, 100): # 最多只能爬取99页评论
page_url = base_url % (itemId, sellerId, i)
req = urllib2.Request(url = page_url, headers = headers)
try:
Expand All @@ -32,8 +27,8 @@ def get_Tmall_comment(itemId, sellerId, save_dir):
print 'page:',i,e
continue
try:
myjson = re.findall('\"rateList\":(\[.*?\])\,\"tags\"',page_data)[0]
page_dict = json.loads(myjson)
myjson = re.findall('\"rateList\":(\[.*?\])\,\"tags\"',page_data)[0] #从json数据中匹配评论的部分
page_dict = json.loads(myjson)
except Exception, e:
print 'page:',i,e
continue
Expand All @@ -46,26 +41,13 @@ def get_Tmall_comment(itemId, sellerId, save_dir):
comment_set.add(comment)
fp.write(comment.encode('utf-8')+'\n')
fp.close()
print len(comment_set)


def get_huawei_phone_list(url):
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
resp = opener.open(url)
page = resp.read().decode('gbk')
print page.encode('utf-8')


if __name__ == '__main__':
phone_ids = config.phone_ids

for brand in phone_ids.keys():
shops = phone_ids[brand]
for seller in shops.keys():
pids = shops[seller]
for pid in pids:
print brand, seller, pid
get_Tmall_comment(pid, seller, brand)
item_id = '521875831541'
seller_id = '1917047079'
save_dir = '.';

get_Tmall_comment(item_id, seller_id, save_dir)



0 comments on commit 59e4ab7

Please sign in to comment.