-
Notifications
You must be signed in to change notification settings - Fork 108
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
2、完善了readme
- Loading branch information
zhang@laptop
committed
Feb 5, 2020
1 parent
b7aaffe
commit 7fc8fbf
Showing
10 changed files
with
242 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
from selenium import webdriver | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support.ui import WebDriverWait | ||
from selenium.webdriver.support import expected_conditions as EC | ||
from selenium.webdriver.common.action_chains import ActionChains | ||
from selenium.common.exceptions import NoSuchElementException | ||
|
||
import base64 | ||
import time | ||
import sys | ||
import os | ||
import shutil | ||
from tqdm import trange | ||
from img2pdf import conpdf | ||
from PIL import Image | ||
|
||
|
||
def download(url):
    """Capture every page of a previewable book118 document and build a PDF.

    The function opens ``url`` in Chrome, expands the full preview if the
    "agree_full" button is present, navigates into the ``layer_view_iframe``
    viewer, screenshots the ``ppt`` element once per page and finally hands
    the collected PNG files to ``conpdf``.

    Args:
        url: Address of the document preview page.

    Side effects:
        Creates ``temp/<title>/<n>.png`` for every page and asks ``conpdf``
        to write ``output/<title>.pdf``. ``<title>`` is the page title with
        characters that are illegal in file names replaced by ``_``.
    """
    option = webdriver.ChromeOptions()
    # option.add_argument('headless')
    option.add_argument('log-level=3')
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    driver = webdriver.Chrome(
        executable_path='.//chromedriver', options=option)

    title = "output"
    # Everything that talks to the browser sits inside try/finally so the
    # Chrome process is always shut down, even when a step fails.
    try:
        try:
            driver.set_page_load_timeout(15)
            driver.get(url)
            title = driver.title
        except Exception:
            # Best effort: a slow page load must not abort the download, but
            # Ctrl-C (KeyboardInterrupt) is no longer swallowed.
            print("Timeout - start download anyway.")

        print(f'原创力: 《{title}》')
        time.sleep(5)

        # The title becomes a directory / file name, so strip characters
        # that file systems reject (and path separators in particular).
        safe_title = ''.join(
            '_' if ch in '\\/:*?"<>|' else ch for ch in title).strip()
        if not safe_title:
            safe_title = 'output'

        try:
            # Expand the full document ("show all" button).
            elem_cont_button = driver.find_element(By.ID, "agree_full")
            driver.execute_script(
                "arguments[0].scrollIntoView(true);", elem_cont_button)
            actions = ActionChains(driver)
            actions.move_to_element(elem_cont_button).perform()
            time.sleep(0.5)
            elem_cont_button.click()
        except NoSuchElementException:
            # No expand button on this page: nothing to do.
            pass

        # The actual viewer lives in an iframe; load it as the top document.
        frame = driver.find_element(By.ID, 'layer_view_iframe')
        src = frame.get_attribute('src')
        print(src)

        driver.get(src)
        time.sleep(5)

        # Start from an empty working directory for this document.
        if os.path.exists(f'./temp/{safe_title}'):
            shutil.rmtree(f'./temp/{safe_title}')
        os.makedirs(f'./temp/{safe_title}')

        capture_path = f'temp/{safe_title}/capture.png'
        pageCount = int(driver.find_element(
            By.ID, 'PageCount').get_attribute('innerHTML'))
        for i in trange(pageCount):
            driver.save_screenshot(capture_path)
            page = driver.find_element(By.ID, 'ppt')

            left = page.location['x']
            top = page.location['y']
            right = left + page.size['width']
            # The bottom 35 px are cut off -- presumably a viewer toolbar;
            # TODO confirm against the live page.
            bottom = top + page.size['height'] - 35

            im = Image.open(capture_path)
            im = im.crop((left, top, right, bottom))  # crop to the element
            im.save(f'temp/{safe_title}/{i}.png')  # per-page screenshot
            if i < pageCount - 1:
                # Only advance while there is a next page to capture.
                driver.find_element(By.ID, 'pageNext').click()
                time.sleep(1)  # give the next page time to render
        # With zero pages no screenshot was ever written.
        if os.path.exists(capture_path):
            os.remove(capture_path)
    finally:
        driver.quit()
    print('下载完毕,正在转码')
    # Make sure the target directory for the PDF exists.
    os.makedirs('output', exist_ok=True)
    conpdf(f'output/{safe_title}.pdf', f'temp/{safe_title}', '.png')
|
||
|
||
if __name__ == '__main__':
    # Script entry point: download one sample document.
    sample_url = "https://max.book118.com/html/2019/1002/8052020057002053.shtm"
    download(sample_url)
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
from reportlab.pdfgen import canvas | ||
from reportlab.platypus import Image | ||
from reportlab.lib.pagesizes import A4, landscape | ||
from reportlab.lib.utils import ImageReader | ||
import PIL.Image | ||
import PIL.ExifTags | ||
import os | ||
import re | ||
import time | ||
|
||
|
||
class ImgToPdf():
    """Collect image files from a folder tree and lay them out in a PDF."""

    # File-name pattern accepted by img_search (matched case-insensitively).
    _IMAGE_RE = re.compile(r'[\w]+\.(jpg|png|jpeg)$', re.IGNORECASE)
    # Extensions accepted by img_search1 (compared in lower case).
    _IMAGE_EXTENSIONS = ('jpg', 'png', 'jpeg')

    @staticmethod
    def _natural_key(name):
        """Return a sort key that orders embedded numbers numerically.

        ``re.split`` with a capturing group alternates text / digit runs, so
        odd positions are always digit strings and keys stay comparable.
        """
        parts = re.split(r'(\d+)', name)
        return [int(part) if idx % 2 else part.lower()
                for idx, part in enumerate(parts)]

    def img_search(self, mypath, filenames):
        """Recursively append image paths under ``mypath`` to ``filenames``.

        Files are matched with a case-insensitive jpg/png/jpeg pattern.
        Directory entries are visited in natural order ("2.png" before
        "10.png") because ``os.listdir`` returns them in arbitrary order and
        the list order later becomes the PDF page order.

        Args:
            mypath: Directory to scan.
            filenames: List that receives the matching paths (mutated).
        """
        for entry in sorted(os.listdir(mypath), key=self._natural_key):
            path = os.path.join(mypath, entry)
            if os.path.isfile(path):
                if self._IMAGE_RE.search(path):
                    filenames.append(path)
            elif os.path.isdir(path):
                self.img_search(path, filenames)

    def img_search1(self, mypath, filenames):
        """Variant of ``img_search`` that compares the file extension.

        The extension check is case-insensitive and accepts the same
        jpg/png/jpeg set as ``img_search``.

        Args:
            mypath: Directory to scan.
            filenames: List that receives the matching paths (mutated).
        """
        for entry in sorted(os.listdir(mypath), key=self._natural_key):
            path = os.path.join(mypath, entry)
            if os.path.isfile(path):
                extension = os.path.splitext(path)[1][1:].lower()
                if extension in self._IMAGE_EXTENSIONS:
                    filenames.append(path)
            elif os.path.isdir(path):
                self.img_search1(path, filenames)

    def rotate_img_to_proper(self, image):
        """Return ``image`` rotated according to its EXIF orientation tag.

        Images without EXIF data, without an orientation entry, or whose
        EXIF block cannot be read are returned unchanged (best effort).

        Args:
            image: An opened PIL image.

        Returns:
            The upright image (possibly the same object that was passed in).
        """
        read_exif = getattr(image, '_getexif', None)  # only present in JPEGs
        if read_exif is None:
            return image
        try:
            exif = read_exif()  # returns None if there is no EXIF data
        except Exception:
            # A damaged EXIF block must not stop the conversion.
            return image
        if not exif:
            return image

        # Look up the numeric tag id whose name is "Orientation".
        orientation_tag = next(
            (tag for tag, tag_name in PIL.ExifTags.TAGS.items()
             if tag_name == 'Orientation'),
            None)
        orientation = dict(exif.items()).get(orientation_tag)

        # The transpose constants live on PIL.Image; the bare name ``Image``
        # in this module is reportlab's flowable and has no ROTATE_* members.
        if orientation == 3:
            image = image.transpose(PIL.Image.ROTATE_180)
        elif orientation == 6:
            image = image.transpose(PIL.Image.ROTATE_270)
        elif orientation == 8:
            image = image.rotate(90, expand=True)
        return image

    def pmain(self, src_folder, title, direction='|'):
        """Write every image found under ``src_folder`` to one PDF.

        Each image becomes one A4 page, scaled to the full page width and
        aligned with the top edge. Images that cannot be processed are
        reported and skipped.

        Args:
            src_folder: Folder to scan for images; when ``None`` the folder
                is asked for interactively.
            title: Base name of the PDF written to the ``文件库`` folder.
            direction: ``'-'`` selects landscape pages, any other value
                (default ``'|'``) selects portrait pages.
        """
        output_file_name = '文件库//'+str(title)+'.pdf'
        # The canvas cannot save into a directory that does not exist.
        output_dir = os.path.dirname(output_file_name)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        imgDoc = canvas.Canvas(output_file_name)
        # Page orientation: '-' is landscape, anything else is portrait.
        if direction == '-':
            imgDoc.setPageSize(landscape(A4))
            document_width, document_height = landscape(A4)
        else:
            imgDoc.setPageSize(A4)
            document_width, document_height = A4
        if src_folder is None:
            mypath = input('Input the image folder please:')
        else:
            mypath = src_folder
        filenames = []
        # time.clock() was removed in Python 3.8; perf_counter replaces it.
        start = time.perf_counter()
        self.img_search(mypath, filenames)
        end = time.perf_counter()
        print('find file cost time: ', end-start,
              'find files: ', len(filenames))

        for image in filenames:
            try:
                # The context manager closes the underlying file once the
                # page has been drawn.
                with PIL.Image.open(image) as opened:
                    image_file = self.rotate_img_to_proper(opened)

                    image_width, image_height = image_file.size
                    if not (image_width > 0 and image_height > 0):
                        raise ValueError('image has an empty dimension')
                    image_aspect = image_height/float(image_width)
                    # Scale to the full page width, keeping the aspect ratio.
                    print_width = document_width
                    print_height = document_width*image_aspect
                    imgDoc.drawImage(ImageReader(image_file),
                                     document_width-print_width,
                                     document_height-print_height,
                                     width=print_width,
                                     height=print_height,
                                     preserveAspectRatio=True)
                    # Tell reportlab that the current page is finished.
                    imgDoc.showPage()
            except Exception as e:
                # Skip the broken image but keep building the document.
                print('error:', e, image)
        imgDoc.save()
        print('Done')
|
||
|
||
if __name__ == "__main__":
    # Nothing to run when this module is executed directly.
    ...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,11 @@ | ||
# 多种文档下载器 | ||
# 多种文档下载器 | ||
本工具适用于下载豆丁、道客巴巴、淘豆网、原创力、新浪爱问网站的可以预览的文档。只要可以预览,就可以下载。下载下来是图片格式,然后会通过reportlab库,将图片转换成PDF。 | ||
|
||
其中,由于新浪爱问网站用的都是svg格式的文件,将其转换成图片需要调用svg2png库。因此,需要先安装nodejs,再利用npm安装svg2png,然后才能正常使用。 | ||
|
||
## 使用方法 | ||
``` | ||
1. 安装nodejs | ||
2. npm install -g svg2png | ||
3. python docDownloader.py | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters