A working app with a few bugs sprinkled in. Configuration saved externally

2022-04-17 21:58:58 +02:00
commit 0a6dde8c78
26 changed files with 1742 additions and 0 deletions
--- a/app/utils/download/init.py
+++ b/app/utils/download/init.py
--- a/app/utils/download/browser.py
+++ b/app/utils/download/browser.py
@@ -0,0 +1,158 @@
+import time
+import datetime
+import logging
+import os
+import base64
+import requests
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+import configuration
+
+# "DOWNLOADS" section of the externally parsed configuration. This module reads
+# the keys "browser_profile_path", "default_download_path" and
+# "blacklisted_href_domains" from it.
+config = configuration.parsed["DOWNLOADS"]
+
+
+
+class PDFDownloader:
+    """Saves a given url. Fills the object it got as a parameter"""
+    logger = logging.getLogger(__name__)
+    # status-variable for restarting:
+    running = False
+    
+    def start(self):
+        options=Options()
+        options.profile = config["browser_profile_path"]
+        # TODO: Get headless mode interactively
+        options.add_argument('--headless')
+        # options.add_argument("--disable-infobars")
+        # options.set_preference("javascript.enabled", False)
+        # options.add_argument("--disable-popup-blocking")
+        # Print to pdf
+        options.set_preference("print_printer", "Mozilla Save to PDF")
+        options.set_preference("print.always_print_silent", True)
+        options.set_preference("print.show_print_progress", False)
+        options.set_preference('print.save_as_pdf.links.enabled', True)
+        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
+        # Save existing pdf
+        options.set_preference("browser.download.folderList", 2)
+        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
+        # options.set_preference("pdfjs.disabled", True)
+        options.set_preference("browser.download.dir", config["default_download_path"])
+
+        self.logger.info("Now Starting gecko driver")
+        self.driver = webdriver.Firefox(options=options)
+        
+        residues = os.listdir(config["default_download_path"])
+        for res in residues:
+            os.remove(os.path.join(config["default_download_path"], res))
+
+        self.running = True
+
+    def autostart(self):
+        if not self.running:
+            self.start() # relaunch the dl util    
+
+    def finish(self):
+        self.driver.quit()
+        self.running = False
+
+
+    def download(self, article_object):
+        sleep_time = 1
+        self.autostart()
+        url = article_object.article_url
+
+        # arbitrary bug fixes:
+        if "focus.de" in url or "bloomberg.com" in url:
+            url = url.replace("https://", "https://outline.com/")
+            sleep_time += 5
+        try:
+            self.driver.get(url)
+        except Exception as e:
+            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
+            self.finish()
+            return article_object # without changes
+        
+        time.sleep(sleep_time)
+        # leave the page time to do any funky business
+
+        # in the mean time, get a page title if required
+        if article_object.is_title_bad:
+            article_object.title = self.driver.title.replace(".pdf","")
+            # will be propagated to dst as well
+
+        fname = article_object.fname_template
+        dst = os.path.join(article_object.save_path, fname)
+        if os.path.exists(dst):
+            fname = make_path_unique(fname)
+            dst = os.path.join(article_object.save_path, fname)
+
+
+        if url[-4:] == ".pdf":
+            # according to the browser preferences, calling the url will open pdfjs.
+            # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
+            success = self.get_exisiting_pdf(url, dst)
+        else:
+            success = self.get_new_pdf(dst)
+
+
+        if success:
+            article_object.file_name = fname
+            article_object.set_references = self.get_references()
+        else:
+            article_object.file_name = ""
+        
+        return article_object # this change is saved later manually
+
+
+    def get_exisiting_pdf(self, url, dst):
+        try:
+            r = requests.get(url)
+            bytes = r.content
+        except:
+            return False
+        return self.get_new_pdf(dst, other_bytes=bytes)
+
+
+    def get_new_pdf(self, dst, other_bytes=None):
+        os.makedirs(os.path.dirname(dst), exist_ok=True)
+
+        if other_bytes is None:
+            try:
+                result = self.driver.print_page()
+                bytes = base64.b64decode(result, validate=True)
+            except:
+                self.logger.error("Failed, probably because the driver went extinct.")
+                return False
+        else:
+            bytes = other_bytes
+
+        try:
+            with open(dst, "wb+") as f:
+                f.write(bytes)
+            return True
+        except Exception as e:
+            self.logger.error(f"Failed, because of FS-operation: {e}")
+            return False
+        
+
+    def get_references(self):
+        try:
+            hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
+        except:
+            hrefs = []
+        # TODO TEST THIS
+        hrefs = [h for h in hrefs \
+            if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
+            ] # filter a tiny bit at least
+        return hrefs
+
+
+
+
+
+
+
+def make_path_unique(path):
+    fname, ending = os.path.splitext(path)
+    fname += datetime.datetime.now().strftime("%d-%H%M%S")
+    return fname + ending
--- a/app/utils/download/runner.py
+++ b/app/utils/download/runner.py
--- a/app/utils/download/youtube.py
+++ b/app/utils/download/youtube.py
@@ -0,0 +1,33 @@
+import logging
+import os
+from pytube import YouTube
+
+# Module-level logger named after this module; used by save_video below.
+logger = logging.getLogger(__name__)
+
+
+def save_video(article_object):
+    """Saves video accoring to url and save path"""
+    url = article_object.article_url
+    logger.info("Saving new video")
+    try:
+        yt = YouTube(url)
+        streams = yt.streams.filter(progressive=True).order_by('resolution')
+    except Exception as e:
+        article_object.file_name = "ERROR: {}".format(e)
+        return article_object
+
+    if streams: # if it's not empty
+        vid = streams[-1]
+        article_object.source_name = "youtube.com"
+        article_object.title = yt.title
+        file_path = os.path.join(article_object.save_path, article_object.fname_template)
+        try:
+            vid.download(file_path)
+            article_object.file_name = article_object.fname_template
+        except Exception as e:
+            logger.error(f"Youtube download crashed: {e}")
+            article_object.file_name = "Error while downloading"
+    else:
+        article_object.file_name = "No streams available"
+    
+    return article_object