-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
AI悦创
committed
Jan 30, 2023
1 parent
e34a608
commit 6edd415
Showing
5 changed files
with
195 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# ScrapeDynamic1

Spider for https://dynamic1.scrape.center/

## Ajax + JSON storage

See [spider.py](./spider.py)

## Ajax + MongoDB storage

See [spider2.py](./spider2.py)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
# import requests | ||
# import logging | ||
# | ||
# logging.basicConfig(level=logging.INFO, | ||
# format='%(asctime)s-%(levelname)s:%(message)s') | ||
# | ||
# INDEX_URL = 'https://dynamic1.scrape.center/api/movie/?limit={limit}&offset={offset}' | ||
# | ||
# | ||
# def scrape_api(url): | ||
# logging.info('scraping %s...', url) | ||
# try: | ||
# response = requests.get(url) | ||
# if response.status_code == 200: | ||
# return response.json() | ||
# logging.error('get invalid status code %s while scraping %s', response.status_code, url) | ||
# except requests.RequestException: | ||
# logging.error('error occurred while scraping %s', url, exc_info=True) | ||
# | ||
# | ||
# LIMIT = 10 | ||
# | ||
# | ||
# def scrape_index(page): | ||
# url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1)) | ||
# return scrape_api(url) | ||
# | ||
# | ||
# DETAIL_URL = 'https://dynamic1.scrape.center/api/movie/{id}' | ||
# | ||
# | ||
# def scrape_detail(id): | ||
# url = DETAIL_URL.format(id=id) | ||
# return scrape_api(url) | ||
# | ||
# | ||
# TOTAL_PAGE = 10 | ||
# | ||
# # def main(): | ||
# # for page in range(1, TOTAL_PAGE + 1): | ||
# # index_data = scrape_index(page) | ||
# # for item in index_data.get('results'): | ||
# # id = item.get('id') | ||
# # detail_data = scrape_detail(id) | ||
# # logging.info('detail data %s', detail_data) | ||
# | ||
# | ||
# import json | ||
# from os import makedirs | ||
# from os.path import exists | ||
# | ||
# RESULTS_DIR = 'results' | ||
# exists(RESULTS_DIR) or makedirs(RESULTS_DIR) | ||
# | ||
# | ||
# def save_data(data): | ||
# name = data.get('name') | ||
# data_path = f'{RESULTS_DIR}/{name}.json' | ||
# json.dump(data, open(data_path, 'w', encoding='utf-8'), ensure_ascii=False, indent=2) | ||
# | ||
# | ||
# def main(): | ||
# for page in range(1, TOTAL_PAGE + 1): | ||
# index_data = scrape_index(page) | ||
# for item in index_data.get('results'): | ||
# id = item.get('id') | ||
# detail_data = scrape_detail(id) | ||
# logging.info('detail data %s', detail_data) | ||
# save_data(detail_data) | ||
# | ||
# | ||
# if __name__ == '__main__': | ||
# main() | ||
import requests
import logging
import json
from os import makedirs
from os.path import exists

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

# Endpoints of the Ajax backend behind https://dynamic1.scrape.center/.
# NOTE: the originals began with a stray space (' https://...'), which only
# worked because requests strips leading whitespace; removed here.
INDEX_URL = 'https://dynamic1.scrape.center/api/movie/?limit={limit}&offset={offset}'
DETAIL_URL = 'https://dynamic1.scrape.center/api/movie/{id}'
LIMIT = 10        # page size used by the listing API
TOTAL_PAGE = 10   # number of listing pages to crawl
RESULTS_DIR = 'results'
# exist_ok avoids the check-then-create race of `exists(...) or makedirs(...)`.
makedirs(RESULTS_DIR, exist_ok=True)
def scrape_api(url):
    """Fetch *url* and return its decoded JSON payload.

    Falls through to an implicit None when the response is not HTTP 200
    or a request-level error occurs; both cases are logged.
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code != 200:
            logging.error('get invalid status code %s while scraping %s',
                          response.status_code, url)
            return None
        return response.json()
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
def scrape_index(page):
    """Return the JSON listing for 1-based *page* of the index API."""
    offset = (page - 1) * LIMIT
    return scrape_api(INDEX_URL.format(limit=LIMIT, offset=offset))
def scrape_detail(id):
    """Return the JSON detail payload for the movie with this *id*."""
    return scrape_api(DETAIL_URL.format(id=id))
def save_data(data):
    """Write one movie dict to results/<name>.json (UTF-8, pretty-printed).

    The original passed a bare open() into json.dump and never closed the
    handle; a `with` block closes it deterministically.
    """
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
def main():
    """Crawl every index page, then fetch, log and save each movie detail."""
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            # scrape_api returns None on HTTP/network failure; skip the page
            # instead of crashing on None.get('results').
            continue
        for item in index_data.get('results', []):
            id = item.get('id')
            detail_data = scrape_detail(id)
            logging.info('detail data %s', detail_data)
            # Guard: a failed detail fetch yields None, which save_data
            # cannot serialize under a meaningful name.
            if detail_data:
                save_data(detail_data)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import logging

import pymongo
import requests

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://dynamic1.scrape.center/api/movie/?limit={limit}&offset={offset}'

MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'movies'

# The original imported pymongo mid-file and hard-coded 'movies' twice,
# ignoring the two constants above; use the constants so changing the
# configuration takes effect everywhere.
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]
def scrape_api(url):
    """Fetch *url* and return its decoded JSON payload.

    Falls through to an implicit None when the response is not HTTP 200
    or a request-level error occurs; both cases are logged.
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code != 200:
            logging.error('get invalid status code %s while scraping %s',
                          response.status_code, url)
            return None
        return response.json()
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
LIMIT = 10  # page size used by the listing API


def scrape_index(page):
    """Return the JSON listing for 1-based *page* of the index API."""
    offset = (page - 1) * LIMIT
    return scrape_api(INDEX_URL.format(limit=LIMIT, offset=offset))
DETAIL_URL = 'https://dynamic1.scrape.center/api/movie/{id}'


def scrape_detail(id):
    """Return the JSON detail payload for the movie with this *id*."""
    return scrape_api(DETAIL_URL.format(id=id))
TOTAL_PAGE = 10  # number of listing pages to crawl


def save_data(data):
    """Upsert one movie document into MongoDB, keyed by its name."""
    query = {'name': data.get('name')}
    update = {'$set': data}
    collection.update_one(query, update, upsert=True)
def main():
    """Crawl all index pages and upsert every movie detail into MongoDB."""
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            # scrape_api returns None on HTTP/network failure; skip the page
            # instead of crashing on None.get('results').
            continue
        for item in index_data.get('results', []):
            id = item.get('id')
            detail_data = scrape_detail(id)
            logging.info('detail data %s', detail_data)
            # Guard: a failed detail fetch yields None; don't upsert it.
            if detail_data:
                save_data(detail_data)
                logging.info('data saved successfully')


if __name__ == '__main__':
    main()
Binary file not shown.