Skip to content

Commit

Permalink
1、修复了多个网页
Browse files Browse the repository at this point in the history
2、完善了readme
  • Loading branch information
zhang@laptop committed Feb 5, 2020
1 parent b7aaffe commit 7fc8fbf
Show file tree
Hide file tree
Showing 10 changed files with 242 additions and 27 deletions.
43 changes: 25 additions & 18 deletions book118.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,34 +26,39 @@ def getHTML(url, byte=False):


class Book118:
def __init__(self, pid, title):
def __init__(self, pid, title, url):
ssl._create_default_https_context = ssl._create_unverified_context
self.pid = str(pid)
self.title = '原创力 ' + title.replace('-文档在线预览', '')
self.url = url
self.pdfInfo = {}
self.domain = ''
self.index = -1
self.total = 0
self.imgList = []

def getPDF(self):
print(self.title)
# 获取需要的信息
self.__getPdfInfo()
# 获得所有图片的地址
img = self.pdfInfo.get('Img')
imgUrl = img if img != None else ""
print('解析地址')
while self.index != self.total:
self.__getNextPage(
self.imgList[-1]
if len(self.imgList) != 0 else imgUrl)
self.pbar.close()
# 下载图片
self.__getIMG()
# 生成pdf
print('下载完毕,正在转码')
conpdf(f'output/{self.title}.pdf', f'temp/{self.title}/', '.jpg')
try:
print(self.title)
# 获取需要的信息
self.__getPdfInfo()
# 获得所有图片的地址
img = self.pdfInfo.get('Img')
imgUrl = img if img != None else ""
print('解析地址')
while self.index != self.total:
self.__getNextPage(
self.imgList[-1]
if len(self.imgList) != 0 else imgUrl)
self.pbar.close()
# 下载图片
self.__getIMG()
# 生成pdf
print('下载完毕,正在转码')
conpdf(f'output/{self.title}.pdf', f'temp/{self.title}/', '.jpg')
except Exception:
import book118_PPT
book118_PPT.download(self.url)

def __getPdfInfo(self):
url = makeURL('https://max.book118.com/index.php?',
Expand All @@ -71,6 +76,8 @@ def __getPdfInfo(self):
r'<input type="hidden" id="(.*?)" value="(.*?)".*?/>', rawHTML)
for lst in res:
self.pdfInfo[lst[0]] = lst[1]
for info in self.pdfInfo.items():
print(info)

def __getNextPage(self, imgUrl):
url = makeURL('https://' + self.domain + '.book118.com/PW/GetPage/?', {
Expand Down
83 changes: 83 additions & 0 deletions book118_PPT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

import base64
import time
import sys
import os
import shutil
from tqdm import trange
from img2pdf import conpdf
from PIL import Image


def download(url):
    """Download a book118 PPT-style document by screenshotting each slide
    in Chrome (via Selenium) and converting the captures into a PDF.

    :param url: URL of the book118 document page.

    Side effects: creates ``./temp/<title>/`` with one PNG per slide and
    writes ``output/<title>.pdf``. Requires Chrome plus a local
    ``chromedriver`` binary.
    """
    option = webdriver.ChromeOptions()
    # option.add_argument('headless')
    option.add_argument('log-level=3')  # suppress Chrome console noise
    driver = webdriver.Chrome(
        executable_path='.//chromedriver', chrome_options=option)

    title = "output"
    try:
        driver.set_page_load_timeout(15)
        driver.get(url)
        title = driver.title
    except Exception:  # was a bare except; page is usually usable on timeout
        print("Timeout - start download anyway.")
    # Strip '/' so the title is a valid file/directory name
    # (same normalization this project applies in douding.py).
    title = title.replace('/', '.')

    print(f'原创力: 《{title}》')
    time.sleep(5)

    try:
        # Click the "expand full document" button when present.
        elem_cont_button = driver.find_element_by_id("agree_full")
        driver.execute_script(
            "arguments[0].scrollIntoView(true);", elem_cont_button)
        actions = ActionChains(driver)
        actions.move_to_element(elem_cont_button).perform()
        time.sleep(0.5)
        elem_cont_button.click()
    except NoSuchElementException:
        pass  # document already fully expanded

    # The slide viewer lives in an iframe; navigate to its URL directly.
    frame = driver.find_element_by_id('layer_view_iframe')
    src = frame.get_attribute('src')
    print(src)

    driver.get(src)
    time.sleep(5)

    # Start from a fresh temp directory for this document.
    if os.path.exists(f'./temp/{title}'):
        shutil.rmtree(f'./temp/{title}')
    os.makedirs(f'./temp/{title}')

    pageCount = int(driver.find_element_by_id(
        'PageCount').get_attribute('innerHTML'))
    for i in trange(pageCount):
        driver.save_screenshot(f'temp/{title}/capture.png')
        page = driver.find_element_by_id('ppt')

        # Crop the slide element out of the full-window screenshot;
        # the -35 trims the toolbar overlapping the bottom edge.
        left = page.location['x']
        top = page.location['y']
        right = left + page.size['width']
        bottom = top + page.size['height'] - 35

        im = Image.open(f'temp/{title}/capture.png')
        im = im.crop((left, top, right, bottom))
        im.save(f'temp/{title}/{i}.png')
        driver.find_element_by_id('pageNext').click()
        time.sleep(1)  # give the next slide time to render
    os.remove(f'./temp/{title}/capture.png')
    driver.quit()
    print('下载完毕,正在转码')
    conpdf(f'output/{title}.pdf', f'temp/{title}', '.png')


if __name__ == '__main__':
    # Manual smoke test: downloads one sample document.
    # Requires Chrome and a local chromedriver binary.
    download("https://max.book118.com/html/2019/1002/8052020057002053.shtm")
Binary file modified chromedriver.exe
Binary file not shown.
4 changes: 3 additions & 1 deletion doc88.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ def download(url):

# Check loading status
while(len(driver.find_element_by_id(pagepb_id).get_attribute('innerHTML')) != 0):
time.sleep(0.5)
time.sleep(1)
# print(driver.find_element_by_id(
# pagepb_id).get_attribute('innerHTML'))

js_cmd = "var canvas = document.getElementById('{}');".format(canvas_id) + \
"return canvas.toDataURL();"
Expand Down
2 changes: 1 addition & 1 deletion docDownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def main():
html = BeautifulSoup(urllib.request.urlopen(
url).read(), features='lxml')
title = html.title.string[:-4]
Book118(url.split('/')[-1].split('.')[0], title).getPDF()
Book118(url.split('/')[-1].split('.')[0], title, url).getPDF()
elif 'taodocs' in url:
# 淘豆网
import taodocs
Expand Down
13 changes: 8 additions & 5 deletions douding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import shutil
import os
from img2pdf import conpdf
import requests


def download(url):
Expand All @@ -13,7 +14,7 @@ def download(url):
pages = int(text[pos + 8: pos + 12].split(',')[0])
id = url.split('.')[-2].split('-')[-1]
html = BeautifulSoup(text, features='lxml')
title = html.title.string
title = html.title.string.replace('/', '.')
print(f'豆丁:《{title}》')

if os.path.exists(f'./temp/{title}'):
Expand All @@ -22,13 +23,15 @@ def download(url):

for i in trange(pages):
url = f"http://211.147.220.164/index.jsp?file={id}&width=1600&pageno={i + 1}"
response = urllib.request.urlopen(url)
res = response.read()
res = requests.get(url)
res = res.text
# response = urllib.request.urlopen(url)
# res = response.read()
with open(f'./temp/{title}/{i+1}.jpg', 'wb') as f:
f.write(res)
f.write(res.encode())
print('下载完毕,正在转码')
conpdf(f'output/{title}.pdf', f'./temp/{title}', '.jpg', True)


if __name__ == "__main__":
download("https://www.docin.com/p-96519470.html")
download("https://jz.docin.com/p-1995868152.html")
106 changes: 106 additions & 0 deletions img.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from reportlab.pdfgen import canvas
from reportlab.platypus import Image
from reportlab.lib.pagesizes import A4, landscape
from reportlab.lib.utils import ImageReader
import PIL.Image
import PIL.ExifTags
import os
import re
import time


class ImgToPdf():
    """Collect image files from a directory tree and lay them out one per
    page in an A4 PDF via reportlab.

    NOTE(review): at module level `Image` is reportlab.platypus.Image while
    `PIL.Image` is the Pillow module -- the two must not be confused.
    """

    def img_search(self, mypath, filenames):
        """Recursively append to *filenames* every file under *mypath*
        whose extension is jpg/png/jpeg (case-insensitive)."""
        for entry in os.listdir(mypath):
            path = os.path.join(mypath, entry)
            if os.path.isfile(path):
                expression = r'[\w]+\.(jpg|png|jpeg)$'
                if re.search(expression, path, re.IGNORECASE):
                    filenames.append(path)
            elif os.path.isdir(path):
                self.img_search(path, filenames)

    def img_search1(self, mypath, filenames):
        """Extension-list variant of img_search.

        NOTE(review): matches only the exact suffixes 'jpg', 'png' and
        upper-case 'JPEG'; kept as-is to preserve existing behavior.
        """
        for entry in os.listdir(mypath):
            path = os.path.join(mypath, entry)
            if os.path.isfile(path):
                parts = path.split('.')
                if parts[-1] in ['jpg', 'png', 'JPEG']:
                    filenames.append(path)
            elif os.path.isdir(path):
                self.img_search1(path, filenames)

    def rotate_img_to_proper(self, image):
        """Return *image* rotated upright per its EXIF Orientation tag;
        the image is returned unchanged when no EXIF data is present.

        Bug fix: the original called ``Image.ROTATE_180``/``ROTATE_270``
        on reportlab's Image flowable (which has no such constants), so
        rotation always raised and was silently eaten by a bare except.
        Use PIL's ``rotate()`` (counter-clockwise degrees) instead.
        Also removes the unnecessary ``global orientation``.
        """
        try:
            if hasattr(image, '_getexif'):  # only present in JPEGs
                orientation_tag = None
                for tag in PIL.ExifTags.TAGS.keys():
                    if PIL.ExifTags.TAGS[tag] == 'Orientation':
                        orientation_tag = tag
                        break
                e = image._getexif()  # returns None if no EXIF data
                if e is not None and orientation_tag is not None:
                    orientation = dict(e.items()).get(orientation_tag)
                    if orientation == 3:
                        image = image.rotate(180, expand=True)
                    elif orientation == 6:
                        # EXIF 6 = stored rotated 90 CW; undo with 270 CCW.
                        image = image.rotate(270, expand=True)
                    elif orientation == 8:
                        image = image.rotate(90, expand=True)
        except Exception:  # malformed EXIF -- fall through unrotated
            pass
        return image

    def pmain(self, src_folder, title):
        """Render every image found under *src_folder* into
        '文件库//<title>.pdf', one image per page, scaled to page width.

        :param src_folder: directory to scan; when None the user is
            prompted for a path interactively.
        :param title: output PDF base name.
        """
        output_file_name = '文件库//'+str(title)+'.pdf'

        imgDoc = canvas.Canvas(output_file_name)
        # Page orientation: '-' selects landscape A4, anything else portrait.
        direction = '|'
        if direction == '-':
            imgDoc.setPageSize(landscape(A4))
            document_width, document_height = landscape(A4)
        else:
            imgDoc.setPageSize(A4)
            document_width, document_height = A4
        if src_folder is None:
            mypath = input('Input the image folder please:')
        else:
            mypath = src_folder
        filenames = []
        # Bug fix: time.clock() was removed in Python 3.8; use
        # perf_counter() for the elapsed-time measurement.
        start = time.perf_counter()
        self.img_search(mypath, filenames)
        end = time.perf_counter()
        print('find file cost time: ', end-start,
              'find files: ', len(filenames))

        for image in filenames:
            try:
                image_file = PIL.Image.open(image)
                image_file = self.rotate_img_to_proper(image_file)

                image_width, image_height = image_file.size
                if not (image_width > 0 and image_height > 0):
                    # Caught by the handler below, same as any decode error.
                    raise ValueError(f'bad image size: {image}')
                image_aspect = image_height/float(image_width)
                # Scale to full page width, preserving aspect ratio.
                print_width = document_width
                print_height = document_width*image_aspect
                imgDoc.drawImage(ImageReader(image_file), document_width-print_width,
                                 document_height-print_height, width=print_width,
                                 height=print_height, preserveAspectRatio=True)
                imgDoc.showPage()  # start a new page for the next image
            except Exception as e:
                # Best-effort: skip unreadable images, keep building the PDF.
                print('error:', e, image)
        imgDoc.save()
        print('Done')


if __name__ == '__main__':
    # No CLI entry point; this module is imported for the ImgToPdf class.
    pass
1 change: 1 addition & 0 deletions ishare.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def download(url):
html = requests.get(imgUrl).content
with open(f'./temp/{title}/{title}.svg', 'wb') as svgFile:
svgFile.write(html)
svgFile.flush()
os.system(
f'svg2png "./temp/{title}/{title}.svg" -o "./temp/{title}/{title}.png" -w 1500')
im = Image.open(f'./temp/{title}/{title}.png')
Expand Down
12 changes: 11 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
@@ -1 +1,11 @@
# 多种文档下载器
# 多种文档下载器
本工具适用于下载豆丁、道客巴巴、淘豆网、原创力、新浪爱问网站的可以预览的文档。只要可以预览,就可以下载。下载下来是图片格式,然后会通过reportlab库,将图片转换成PDF。

其中,由于新浪爱问网站用的都是svg格式的文件,将其转换成图片需要调用svg2png库。因此,需要先安装nodejs,再利用npm安装svg2png,然后才能正常使用。

## 使用方法
```
1. 安装nodejs
2. npm install -g svg2png
3. python docDownloader.py
```
5 changes: 4 additions & 1 deletion taodocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, JavascriptException
from selenium.common.exceptions import NoSuchElementException, JavascriptException, StaleElementReferenceException

import base64
import time
Expand Down Expand Up @@ -48,8 +48,11 @@ def download(url):
actions.move_to_element(elem_cont_button).perform()
time.sleep(0.5)
driver.execute_script("arguments[0].click();", elem_cont_button)
# break
except NoSuchElementException:
break
except StaleElementReferenceException:
break
except JavascriptException:
continue
# 获取页数
Expand Down

0 comments on commit 7fc8fbf

Please sign in to comment.