Skip to content

Commit

Permalink
1、修复了多个网页
Browse files Browse the repository at this point in the history
2、完善了readme
  • Loading branch information
zhang@laptop committed Feb 5, 2020
1 parent b7aaffe commit 7fc8fbf
Show file tree
Hide file tree
Showing 10 changed files with 242 additions and 27 deletions.
43 changes: 25 additions & 18 deletions book118.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,34 +26,39 @@ def getHTML(url, byte=False):


class Book118:
def __init__(self, pid, title):
def __init__(self, pid, title, url):
ssl._create_default_https_context = ssl._create_unverified_context
self.pid = str(pid)
self.title = '原创力 ' + title.replace('-文档在线预览', '')
self.url = url
self.pdfInfo = {}
self.domain = ''
self.index = -1
self.total = 0
self.imgList = []

def getPDF(self):
print(self.title)
# 获取需要的信息
self.__getPdfInfo()
# 获得所有图片的地址
img = self.pdfInfo.get('Img')
imgUrl = img if img != None else ""
print('解析地址')
while self.index != self.total:
self.__getNextPage(
self.imgList[-1]
if len(self.imgList) != 0 else imgUrl)
self.pbar.close()
# 下载图片
self.__getIMG()
# 生成pdf
print('下载完毕,正在转码')
conpdf(f'output/{self.title}.pdf', f'temp/{self.title}/', '.jpg')
try:
print(self.title)
# 获取需要的信息
self.__getPdfInfo()
# 获得所有图片的地址
img = self.pdfInfo.get('Img')
imgUrl = img if img != None else ""
print('解析地址')
while self.index != self.total:
self.__getNextPage(
self.imgList[-1]
if len(self.imgList) != 0 else imgUrl)
self.pbar.close()
# 下载图片
self.__getIMG()
# 生成pdf
print('下载完毕,正在转码')
conpdf(f'output/{self.title}.pdf', f'temp/{self.title}/', '.jpg')
except Exception:
import book118_PPT
book118_PPT.download(self.url)

def __getPdfInfo(self):
url = makeURL('https://max.book118.com/index.php?',
Expand All @@ -71,6 +76,8 @@ def __getPdfInfo(self):
r'<input type="hidden" id="(.*?)" value="(.*?)".*?/>', rawHTML)
for lst in res:
self.pdfInfo[lst[0]] = lst[1]
for info in self.pdfInfo.items():
print(info)

def __getNextPage(self, imgUrl):
url = makeURL('https://' + self.domain + '.book118.com/PW/GetPage/?', {
Expand Down
83 changes: 83 additions & 0 deletions book118_PPT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

import base64
import time
import sys
import os
import shutil
from tqdm import trange
from img2pdf import conpdf
from PIL import Image


def download(url):
    """Download a book118 PPT-style document by screenshotting each slide
    in Chrome (via Selenium) and converting the captures into a PDF.

    :param url: URL of the book118 document page.

    Side effects: creates ``./temp/<title>/`` with one PNG per slide and
    writes ``output/<title>.pdf``. Requires Chrome plus a local
    ``chromedriver`` binary.
    """
    option = webdriver.ChromeOptions()
    # option.add_argument('headless')
    option.add_argument('log-level=3')  # suppress Chrome console noise
    driver = webdriver.Chrome(
        executable_path='.//chromedriver', chrome_options=option)

    title = "output"
    try:
        driver.set_page_load_timeout(15)
        driver.get(url)
        title = driver.title
    except Exception:  # was a bare except; page is usually usable on timeout
        print("Timeout - start download anyway.")
    # Strip '/' so the title is a valid file/directory name
    # (same normalization this project applies in douding.py).
    title = title.replace('/', '.')

    print(f'原创力: 《{title}》')
    time.sleep(5)

    try:
        # Click the "expand full document" button when present.
        elem_cont_button = driver.find_element_by_id("agree_full")
        driver.execute_script(
            "arguments[0].scrollIntoView(true);", elem_cont_button)
        actions = ActionChains(driver)
        actions.move_to_element(elem_cont_button).perform()
        time.sleep(0.5)
        elem_cont_button.click()
    except NoSuchElementException:
        pass  # document already fully expanded

    # The slide viewer lives in an iframe; navigate to its URL directly.
    frame = driver.find_element_by_id('layer_view_iframe')
    src = frame.get_attribute('src')
    print(src)

    driver.get(src)
    time.sleep(5)

    # Start from a fresh temp directory for this document.
    if os.path.exists(f'./temp/{title}'):
        shutil.rmtree(f'./temp/{title}')
    os.makedirs(f'./temp/{title}')

    pageCount = int(driver.find_element_by_id(
        'PageCount').get_attribute('innerHTML'))
    for i in trange(pageCount):
        driver.save_screenshot(f'temp/{title}/capture.png')
        page = driver.find_element_by_id('ppt')

        # Crop the slide element out of the full-window screenshot;
        # the -35 trims the toolbar overlapping the bottom edge.
        left = page.location['x']
        top = page.location['y']
        right = left + page.size['width']
        bottom = top + page.size['height'] - 35

        im = Image.open(f'temp/{title}/capture.png')
        im = im.crop((left, top, right, bottom))
        im.save(f'temp/{title}/{i}.png')
        driver.find_element_by_id('pageNext').click()
        time.sleep(1)  # give the next slide time to render
    os.remove(f'./temp/{title}/capture.png')
    driver.quit()
    print('下载完毕,正在转码')
    conpdf(f'output/{title}.pdf', f'temp/{title}', '.png')


if __name__ == '__main__':
    # Manual smoke test: downloads one sample document.
    # Requires Chrome and a local chromedriver binary.
    download("https://max.book118.com/html/2019/1002/8052020057002053.shtm")
Binary file modified chromedriver.exe
Binary file not shown.
4 changes: 3 additions & 1 deletion doc88.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ def download(url):

# Check loading status
while(len(driver.find_element_by_id(pagepb_id).get_attribute('innerHTML')) != 0):
time.sleep(0.5)
time.sleep(1)
# print(driver.find_element_by_id(
# pagepb_id).get_attribute('innerHTML'))

js_cmd = "var canvas = document.getElementById('{}');".format(canvas_id) + \
"return canvas.toDataURL();"
Expand Down
2 changes: 1 addition & 1 deletion docDownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def main():
html = BeautifulSoup(urllib.request.urlopen(
url).read(), features='lxml')
title = html.title.string[:-4]
Book118(url.split('/')[-1].split('.')[0], title).getPDF()
Book118(url.split('/')[-1].split('.')[0], title, url).getPDF()
elif 'taodocs' in url:
# 淘豆网
import taodocs
Expand Down
13 changes: 8 additions & 5 deletions douding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import shutil
import os
from img2pdf import conpdf
import requests


def download(url):
Expand All @@ -13,7 +14,7 @@ def download(url):
pages = int(text[pos + 8: pos + 12].split(',')[0])
id = url.split('.')[-2].split('-')[-1]
html = BeautifulSoup(text, features='lxml')
title = html.title.string
title = html.title.string.replace('/', '.')
print(f'豆丁:《{title}》')

if os.path.exists(f'./temp/{title}'):
Expand All @@ -22,13 +23,15 @@ def download(url):

for i in trange(pages):
url = f"http://211.147.220.164/index.jsp?file={id}&width=1600&pageno={i + 1}"
response = urllib.request.urlopen(url)
res = response.read()
res = requests.get(url)
res = res.text
# response = urllib.request.urlopen(url)
# res = response.read()
with open(f'./temp/{title}/{i+1}.jpg', 'wb') as f:
f.write(res)
f.write(res.encode())
print('下载完毕,正在转码')
conpdf(f'output/{title}.pdf', f'./temp/{title}', '.jpg', True)


if __name__ == "__main__":
download("https://www.docin.com/p-96519470.html")
download("https://jz.docin.com/p-1995868152.html")
106 changes: 106 additions & 0 deletions img.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from reportlab.pdfgen import canvas
from reportlab.platypus import Image
from reportlab.lib.pagesizes import A4, landscape
from reportlab.lib.utils import ImageReader
import PIL.Image
import PIL.ExifTags
import os
import re
import time


class ImgToPdf():
    """Collect image files from a directory tree and lay them out one per
    page in an A4 PDF via reportlab.

    NOTE(review): at module level `Image` is reportlab.platypus.Image while
    `PIL.Image` is the Pillow module -- the two must not be confused.
    """

    def img_search(self, mypath, filenames):
        """Recursively append to *filenames* every file under *mypath*
        whose extension is jpg/png/jpeg (case-insensitive)."""
        for entry in os.listdir(mypath):
            path = os.path.join(mypath, entry)
            if os.path.isfile(path):
                expression = r'[\w]+\.(jpg|png|jpeg)$'
                if re.search(expression, path, re.IGNORECASE):
                    filenames.append(path)
            elif os.path.isdir(path):
                self.img_search(path, filenames)

    def img_search1(self, mypath, filenames):
        """Extension-list variant of img_search.

        NOTE(review): matches only the exact suffixes 'jpg', 'png' and
        upper-case 'JPEG'; kept as-is to preserve existing behavior.
        """
        for entry in os.listdir(mypath):
            path = os.path.join(mypath, entry)
            if os.path.isfile(path):
                parts = path.split('.')
                if parts[-1] in ['jpg', 'png', 'JPEG']:
                    filenames.append(path)
            elif os.path.isdir(path):
                self.img_search1(path, filenames)

    def rotate_img_to_proper(self, image):
        """Return *image* rotated upright per its EXIF Orientation tag;
        the image is returned unchanged when no EXIF data is present.

        Bug fix: the original called ``Image.ROTATE_180``/``ROTATE_270``
        on reportlab's Image flowable (which has no such constants), so
        rotation always raised and was silently eaten by a bare except.
        Use PIL's ``rotate()`` (counter-clockwise degrees) instead.
        Also removes the unnecessary ``global orientation``.
        """
        try:
            if hasattr(image, '_getexif'):  # only present in JPEGs
                orientation_tag = None
                for tag in PIL.ExifTags.TAGS.keys():
                    if PIL.ExifTags.TAGS[tag] == 'Orientation':
                        orientation_tag = tag
                        break
                e = image._getexif()  # returns None if no EXIF data
                if e is not None and orientation_tag is not None:
                    orientation = dict(e.items()).get(orientation_tag)
                    if orientation == 3:
                        image = image.rotate(180, expand=True)
                    elif orientation == 6:
                        # EXIF 6 = stored rotated 90 CW; undo with 270 CCW.
                        image = image.rotate(270, expand=True)
                    elif orientation == 8:
                        image = image.rotate(90, expand=True)
        except Exception:  # malformed EXIF -- fall through unrotated
            pass
        return image

    def pmain(self, src_folder, title):
        """Render every image found under *src_folder* into
        '文件库//<title>.pdf', one image per page, scaled to page width.

        :param src_folder: directory to scan; when None the user is
            prompted for a path interactively.
        :param title: output PDF base name.
        """
        output_file_name = '文件库//'+str(title)+'.pdf'

        imgDoc = canvas.Canvas(output_file_name)
        # Page orientation: '-' selects landscape A4, anything else portrait.
        direction = '|'
        if direction == '-':
            imgDoc.setPageSize(landscape(A4))
            document_width, document_height = landscape(A4)
        else:
            imgDoc.setPageSize(A4)
            document_width, document_height = A4
        if src_folder is None:
            mypath = input('Input the image folder please:')
        else:
            mypath = src_folder
        filenames = []
        # Bug fix: time.clock() was removed in Python 3.8; use
        # perf_counter() for the elapsed-time measurement.
        start = time.perf_counter()
        self.img_search(mypath, filenames)
        end = time.perf_counter()
        print('find file cost time: ', end-start,
              'find files: ', len(filenames))

        for image in filenames:
            try:
                image_file = PIL.Image.open(image)
                image_file = self.rotate_img_to_proper(image_file)

                image_width, image_height = image_file.size
                if not (image_width > 0 and image_height > 0):
                    # Caught by the handler below, same as any decode error.
                    raise ValueError(f'bad image size: {image}')
                image_aspect = image_height/float(image_width)
                # Scale to full page width, preserving aspect ratio.
                print_width = document_width
                print_height = document_width*image_aspect
                imgDoc.drawImage(ImageReader(image_file), document_width-print_width,
                                 document_height-print_height, width=print_width,
                                 height=print_height, preserveAspectRatio=True)
                imgDoc.showPage()  # start a new page for the next image
            except Exception as e:
                # Best-effort: skip unreadable images, keep building the PDF.
                print('error:', e, image)
        imgDoc.save()
        print('Done')


if __name__ == '__main__':
    # No CLI entry point; this module is imported for the ImgToPdf class.
    pass
1 change: 1 addition & 0 deletions ishare.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def download(url):
html = requests.get(imgUrl).content
with open(f'./temp/{title}/{title}.svg', 'wb') as svgFile:
svgFile.write(html)
svgFile.flush()
os.system(
f'svg2png "./temp/{title}/{title}.svg" -o "./temp/{title}/{title}.png" -w 1500')
im = Image.open(f'./temp/{title}/{title}.png')
Expand Down
12 changes: 11 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
@@ -1 +1,11 @@
# 多种文档下载器
# 多种文档下载器
本工具适用于下载豆丁、道客巴巴、淘豆网、原创力、新浪爱问网站的可以预览的文档。只要可以预览,就可以下载。下载下来是图片格式,然后会通过reportlab库,将图片转换成PDF。

其中,由于新浪爱问网站用的都是svg格式的文件,将其转换成图片需要调用svg2png库。因此,需要先安装nodejs,再利用npm安装svg2png,然后才能正常使用。

## 使用方法
```
1. 安装nodejs
2. npm install -g svg2png
3. python docDownloader.py
```
5 changes: 4 additions & 1 deletion taodocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, JavascriptException
from selenium.common.exceptions import NoSuchElementException, JavascriptException, StaleElementReferenceException

import base64
import time
Expand Down Expand Up @@ -48,8 +48,11 @@ def download(url):
actions.move_to_element(elem_cont_button).perform()
time.sleep(0.5)
driver.execute_script("arguments[0].click();", elem_cont_button)
# break
except NoSuchElementException:
break
except StaleElementReferenceException:
break
except JavascriptException:
continue
# 获取页数
Expand Down

0 comments on commit 7fc8fbf

Please sign in to comment.