Commit

s4_02
AI悦创 committed Jan 30, 2023
1 parent e34a608 commit 6edd415
Showing 5 changed files with 195 additions and 0 deletions.
Binary file modified .DS_Store
Binary file not shown.
11 changes: 11 additions & 0 deletions s4_02/README.md
@@ -0,0 +1,11 @@
# ScrapeDynamic1

Spider for https://dynamic1.scrape.center/

## Ajax + JSON Storage

[spider.py](./spider.py)

## Ajax + MongoDB Storage

[spider2.py](./spider2.py)
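
Both spiders call the site's Ajax API directly instead of rendering the page. A minimal sketch of one listing request (the `results`, `id`, and `name` fields are the ones the spiders read; `limit`/`offset` mirror the spiders' constants):

```python
import requests

# First page of the listing API: 10 items starting at offset 0.
url = 'https://dynamic1.scrape.center/api/movie/?limit=10&offset=0'
data = requests.get(url).json()
for item in data['results']:
    print(item['id'], item['name'])
```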
124 changes: 124 additions & 0 deletions s4_02/spider.py
@@ -0,0 +1,124 @@
import requests
import logging
import json
from os import makedirs

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

# Listing and detail endpoints of the site's Ajax API.
INDEX_URL = 'https://dynamic1.scrape.center/api/movie/?limit={limit}&offset={offset}'
DETAIL_URL = 'https://dynamic1.scrape.center/api/movie/{id}'
LIMIT = 10        # items per listing page
TOTAL_PAGE = 10   # number of listing pages to crawl
RESULTS_DIR = 'results'
makedirs(RESULTS_DIR, exist_ok=True)


def scrape_api(url):
    """Fetch a JSON API endpoint; return the decoded payload, or None on failure."""
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        logging.error('get invalid status code %s while scraping %s',
                      response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    # Convert the 1-based page number into the limit/offset pair the API expects.
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)


def scrape_detail(id):
    url = DETAIL_URL.format(id=id)
    return scrape_api(url)


def save_data(data):
    # One JSON file per movie, named after the movie title.
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            continue  # skip listing pages that failed to download
        for item in index_data.get('results'):
            id = item.get('id')
            detail_data = scrape_detail(id)
            logging.info('detail data %s', detail_data)
            save_data(detail_data)


if __name__ == '__main__':
    main()
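
A quick way to sanity-check a run is to load the files that save_data wrote back out of the results directory; a minimal sketch, assuming the spider above has already populated results/:

```python
import json
from glob import glob

# Print the title stored in every scraped JSON file.
for path in glob('results/*.json'):
    with open(path, encoding='utf-8') as f:
        movie = json.load(f)
    print(movie.get('name'))
```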
60 changes: 60 additions & 0 deletions s4_02/spider2.py
@@ -0,0 +1,60 @@
import requests
import logging
import pymongo

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

# Listing and detail endpoints of the site's Ajax API.
INDEX_URL = 'https://dynamic1.scrape.center/api/movie/?limit={limit}&offset={offset}'
DETAIL_URL = 'https://dynamic1.scrape.center/api/movie/{id}'
LIMIT = 10        # items per listing page
TOTAL_PAGE = 10   # number of listing pages to crawl

MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'movies'

client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]


def scrape_api(url):
    """Fetch a JSON API endpoint; return the decoded payload, or None on failure."""
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        logging.error('get invalid status code %s while scraping %s',
                      response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    # Convert the 1-based page number into the limit/offset pair the API expects.
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)


def scrape_detail(id):
    url = DETAIL_URL.format(id=id)
    return scrape_api(url)


def save_data(data):
    # Upsert keyed on the movie name: update the document if it already exists,
    # insert it otherwise, so repeated runs do not create duplicates.
    collection.update_one({
        'name': data.get('name')
    }, {
        '$set': data
    }, upsert=True)


def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            continue  # skip listing pages that failed to download
        for item in index_data.get('results'):
            id = item.get('id')
            detail_data = scrape_detail(id)
            logging.info('detail data %s', detail_data)
            save_data(detail_data)
            logging.info('data saved successfully')


if __name__ == '__main__':
    main()
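
Once spider2.py finishes, the upserted documents can be inspected straight from MongoDB. A minimal sketch against the same connection settings; count_documents and find_one are standard pymongo calls:

```python
import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['movies']['movies']

# How many movies were upserted, and what one stored document's name looks like.
print(collection.count_documents({}))
print(collection.find_one({}, {'_id': 0, 'name': 1}))
```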
Binary file added s4_06/.DS_Store
Binary file not shown.
