new component - upload to NAS
news_fetch/app/utils_worker/__init__.py (new file, 0 lines)
news_fetch/app/utils_worker/compress/runner.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import os
import subprocess
import logging
from pathlib import Path

import configuration

logger = logging.getLogger(__name__)
config = configuration.parsed["DOWNLOADS"]

shrink_sizes = []


def shrink_pdf(article):
    article_loc = Path(article.save_path) / article.file_name
    initial_size = article_loc.stat().st_size
    compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"

    if article_loc.suffix != ".pdf":  # Path.suffix includes the leading dot
        return article  # it probably was a youtube video

    c = subprocess.run(
        [
            "gs",
            "-sDEVICE=pdfwrite",
            "-dPDFSETTINGS=/screen",
            "-dNOPAUSE",
            "-dBATCH",
            f"-sOutputFile={compressed_tmp}",
            f"{article_loc}"
        ],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    if c.returncode == 0:
        try:
            os.replace(compressed_tmp, article_loc)
        except OSError as e:
            logger.error(f"Compression ran but the file could not be moved back: {e}")

        final_size = article_loc.stat().st_size
        shrink_sizes.append(initial_size - final_size)
        logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
    else:
        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")

    return article
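A minimal sketch of exercising shrink_pdf in isolation. The SimpleNamespace stub and the /tmp path are assumptions (the real caller passes the article model), and ghostscript (gs) must be on the PATH:

from types import SimpleNamespace

# hypothetical stand-in for the real article model
article = SimpleNamespace(save_path="/tmp/articles", file_name="example.pdf")
article = shrink_pdf(article)  # replaces the PDF in place with the /screen-quality version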
news_fetch/app/utils_worker/download/__init__.py (new file, 0 lines)
news_fetch/app/utils_worker/download/browser.py (new file, 172 lines)
@@ -0,0 +1,172 @@
import time
import datetime
import logging
import os
import base64
import json

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

import configuration

config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])


class PDFDownloader:
    """Saves a given url. Fills the object it got as a parameter"""
    logger = logging.getLogger(__name__)
    # status variable for restarting:
    running = False

    def start(self):
        self.finish()  # clear up

        options = webdriver.FirefoxOptions()
        options.profile = config["browser_profile_path"]
        # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4, but that doesn't work

        if os.getenv("HEADLESS", "false") == "true":
            options.add_argument('--headless')
        else:
            self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")

        options.set_preference('print.save_as_pdf.links.enabled', True)
        # just save if the filetype is pdf already -- does not work!

        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
        options.set_preference("browser.download.folderList", 2)
        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
        # options.set_preference("pdfjs.disabled", True)
        options.set_preference("browser.download.dir", config["default_download_path"])

        self.logger.info("Starting gecko driver")
        # self.driver = webdriver.Firefox(
        #     options = options,
        #     service = webdriver.firefox.service.Service(
        #         log_path = f'{config["local_storage_path"]}/geckodriver.log'
        #     ))
        self.driver = webdriver.Remote(
            command_executor='http://geckodriver:4444',
            options=options,
            # can't set log path...
        )

        residues = os.listdir(config["default_download_path"])
        for res in residues:
            os.remove(os.path.join(config["default_download_path"], res))

        self.running = True

    def autostart(self):
        if not self.running:
            self.start()  # relaunch the dl util

    def finish(self):
        if self.running:
            self.logger.info("Exiting gecko driver")
            try:
                self.driver.quit()
                time.sleep(10)
            except Exception:
                self.logger.critical("Connection to the driver broke off")
            self.running = False
        else:
            self.logger.info("Gecko driver not yet running")

    def download(self, article_object):
        sleep_time = 2
        self.autostart()
        url = article_object.article_url

        try:
            self.driver.get(url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
            return article_object  # without changes

        time.sleep(sleep_time)
        # leave the page time to do any funky business

        # in the meantime, get a page title if required
        if article_object.is_title_bad:
            article_object.title = self.driver.title.replace(".pdf", "")
            # will be propagated to the saved file (dst) as well

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
        if os.path.exists(dst):
            fname = make_path_unique(fname)
            dst = os.path.join(article_object.save_path, fname)

        if url.endswith(".pdf"):
            # according to the browser preferences, calling the url will open pdfjs.
            # If not handled separately, printing would require the ctrl+p route, whose setup is janky to say the least
            success = self.get_existing_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

        if success:
            article_object.file_name = fname
            article_object.set_references(self.get_references())
        else:
            article_object.file_name = ""

        return article_object  # this change is saved later by the external caller

    def get_existing_pdf(self, url, dst):
        """Fetches an url that already points to a pdf and delegates the write to get_new_pdf"""
        try:
            r = requests.get(url)
            content = r.content
        except requests.exceptions.RequestException:
            return False
        return self.get_new_pdf(dst, other_bytes=content)

    def get_new_pdf(self, dst, other_bytes=None):
        os.makedirs(os.path.dirname(dst), exist_ok=True)

        if other_bytes is None:
            try:
                result = self.driver.print_page()
                pdf_bytes = base64.b64decode(result, validate=True)
            except Exception:
                self.logger.error("Failed, probably because the driver went extinct.")
                return False
        else:
            pdf_bytes = other_bytes

        try:
            with open(dst, "wb+") as f:
                f.write(pdf_bytes)
            return True
        except Exception as e:
            self.logger.error(f"Failed because of FS-operation: {e}")
            return False

    def get_references(self):
        try:
            # selenium 4 locator style (find_elements_by_xpath was removed)
            hrefs = [e.get_attribute("href") for e in self.driver.find_elements(By.XPATH, "//a[@href]")]
        except Exception:
            hrefs = []
        len_old = len(hrefs)
        hrefs = [h for h in hrefs
                 if not any(domain in h for domain in blacklisted)
                 ]  # filter a tiny bit at least
        self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})")
        return hrefs


def make_path_unique(path):
    fname, ending = os.path.splitext(path)
    fname += datetime.datetime.now().strftime("%d-%H%M%S")
    return fname + ending
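A hedged usage sketch: the stub below only mimics the attributes and methods this class actually touches (article_url, is_title_bad, fname_template, save_path, set_references) and stands in for the real article model:

from types import SimpleNamespace

article = SimpleNamespace(article_url="https://example.com/some-story",
                          is_title_bad=False,
                          fname_template="some-story.pdf",
                          save_path="/tmp/articles",
                          set_references=lambda refs: None)  # no-op stand-in
downloader = PDFDownloader()
article = downloader.download(article)  # article.file_name is set on success, "" on failure
downloader.finish()                     # shut the remote driver down when done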
news_fetch/app/utils_worker/download/runner.py (new file, 0 lines)
news_fetch/app/utils_worker/download/youtube.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from __future__ import unicode_literals
import youtube_dl
import os
import logging

logger = logging.getLogger(__name__)


class MyLogger(object):
    """Swallows youtube_dl's debug/warning chatter, forwards only errors"""
    def debug(self, msg): pass
    def warning(self, msg): pass
    def error(self, msg):
        logger.error(msg)


class YouTubeDownloader:
    def __init__(self) -> None:
        pass

    def post_download_hook(self, ret_code):
        # print(ret_code)
        if ret_code['status'] == 'finished':
            file_loc = ret_code["filename"]
            fname = os.path.basename(file_loc)
            self.article_object.file_name = fname

    def save_video(self, article_object):
        """Saves video according to url and save path"""
        self.article_object = article_object
        url = article_object.article_url
        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
        ydl_opts = {
            'format': 'best[height<=720]',
            'outtmpl': f"{file_path}.%(ext)s",  # basically the filename from the object, but with a custom extension depending on the download
            'logger': MyLogger(),
            'progress_hooks': [self.post_download_hook],
            'updatetime': False
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            # article file name is updated in self.post_download_hook
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""

        return article_object
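A small usage sketch under the same stub assumption as before; note this goes through the network via youtube_dl:

from types import SimpleNamespace

article = SimpleNamespace(article_url="https://youtu.be/dQw4w9WgXcQ",
                          save_path="/tmp/videos", fname_template="some-video")
article = YouTubeDownloader().save_video(article)
print(article.file_name)  # filled by the progress hook, "" if the download crashed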
news_fetch/app/utils_worker/fetch/runner.py (new file, 62 lines)
@@ -0,0 +1,62 @@
from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date
import datetime
import logging

# quieter logs from the libraries involved
for lib in ('newspaper', 'urllib', 'urllib3.poolmanager', 'htmldate', 'charset_normalizer'):
    logging.getLogger(lib).setLevel(logging.ERROR)
logger = logging.getLogger("fetch")


def get_description(article_object):
    url = article_object.article_url
    website = urlparse(url).netloc
    article_object.source_name = website

    try:
        # htmldate returns ISO-formatted dates, i.e. '%Y-%m-%d'
        article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
    except Exception:  # other file types
        article_object.pub_date = datetime.datetime(year=1900, month=1, day=1)

    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except Exception:
        news_article = object()  # bare fallback; every attribute access below then raises AttributeError

    try:
        article_object.title = news_article.title
    except AttributeError:
        article_object.title = "Error while running fetch"

    try:
        if article_object.summary:
            article_object.summary = news_article.summary
        elif news_article.text:
            ind = min(500, len(news_article.text))
            article_object.summary = news_article.text[:ind] + "..."
        else:
            article_object.summary = ""
    except AttributeError:
        article_object.summary = ""

    try:
        article_object.language = news_article.meta_lang
    except AttributeError:
        article_object.language = ""

    try:
        article_object.set_authors(news_article.authors)
    except AttributeError:
        pass  # list would have been empty anyway

    try:
        article_object.set_keywords(news_article.keywords)
    except AttributeError:
        pass  # list would have been empty anyway

    return article_object
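A quick sketch of what get_description fills in, run against a stub whose setter methods are assumed no-ops:

from types import SimpleNamespace

article = SimpleNamespace(article_url="https://example.com/some-story", summary="",
                          set_authors=lambda authors: None,
                          set_keywords=lambda keywords: None)
article = get_description(article)
print(article.source_name, article.pub_date.date(), article.title)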
news_fetch/app/utils_worker/upload/runner.py (new file, 20 lines)
@@ -0,0 +1,20 @@
from waybackpy import WaybackMachineSaveAPI  # upload to archive.org
import logging
logger = logging.getLogger(__name__)


def upload_to_archive(article_object):
    """Uploads the article url to archive.org and sets the archived url on the article object"""
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"  # needed?
    url = article_object.article_url
    try:
        wayback = WaybackMachineSaveAPI(url, user_agent)
        archive_url = wayback.save()
        # logger.info(f"{url} uploaded to archive successfully")
        article_object.archive_url = archive_url
    except Exception as e:
        article_object.archive_url = "Error while uploading: {}".format(e)
        logger.error(f"Error while generating archive url: {e}")

    return article_object
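For reference, the underlying waybackpy call in isolation; this hits archive.org for real and is subject to the same throttling the upload worker sleeps around:

from waybackpy import WaybackMachineSaveAPI

saver = WaybackMachineSaveAPI("https://example.com", user_agent="Mozilla/5.0")
print(saver.save())  # the snapshot url, e.g. https://web.archive.org/web/<timestamp>/...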
news_fetch/app/utils_worker/worker_template.py (new file, 41 lines)
@@ -0,0 +1,41 @@
from threading import Thread
import time
import logging


class TemplateWorker(Thread):
    """Parent class for every worker of the article-download pipeline. They should all run in parallel, hence the Thread subclassing"""
    logger = logging.getLogger(__name__)

    def __init__(self, *args, **kwargs) -> None:
        target = self._queue_processor  # will be executed on Worker.start()
        group = kwargs.get("group", None)
        name = kwargs.get("name", None)

        super().__init__(group=group, target=target, name=name)
        self._article_queue = []
        self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")

    def process(self, article_watcher):
        self._article_queue.append(article_watcher)

    def _queue_processor(self):
        """This method is launched by thread.run() and idles while self._article_queue is empty. When an external caller appends to the queue it jumps into action"""
        while True:  # polling loop; a queue.Queue with a blocking get() would avoid the busy wait (see the sketch below)
            if len(self._article_queue) == 0:
                time.sleep(5)
            else:
                article_watcher = self._article_queue.pop(0)
                self.logger.info(f"{self.__class__.__name__} now processing from queue (length: {len(self._article_queue)}) - {article_watcher.article}")
                self._handle_article(article_watcher)

    def _handle_article(self, article_watcher, action=None):
        if action is None:
            self.logger.error("Call of _handle_article() without an overriding action. This should not occur in prod")
        else:
            article = article_watcher.article
            article = action(article)  # action updates the article object but does not save the change
            article.save()
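A minimal sketch of the blocking-queue alternative mentioned in the comment above; the class name and print stand-in are illustrative, not part of the codebase:

import queue
from threading import Thread

class BlockingWorker(Thread):
    """Same idea as TemplateWorker, but get() blocks until work arrives instead of polling"""
    def __init__(self):
        super().__init__(daemon=True)
        self._article_queue = queue.Queue()

    def process(self, article_watcher):
        self._article_queue.put(article_watcher)

    def run(self):
        while True:
            article_watcher = self._article_queue.get()  # sleeps until an item is available
            print("processing", article_watcher)  # stand-in for _handle_article()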
news_fetch/app/utils_worker/workers.py (new file, 66 lines)
@@ -0,0 +1,66 @@
from .worker_template import TemplateWorker
from .download.browser import PDFDownloader
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf

import time
import logging
logger = logging.getLogger(__name__)


class DownloadWorker(TemplateWorker):
    def __init__(self) -> None:
        self.dl_runner = PDFDownloader().download
        self.yt_runner = YouTubeDownloader().save_video
        super().__init__()

    def _handle_article(self, article_watcher):
        article = article_watcher.article
        u = article.article_url

        if "youtu.be/" in u or "youtube.com/" in u:
            action = self.yt_runner
        else:
            action = self.dl_runner

        super()._handle_article(article_watcher, action)
        article_watcher.download_completed = True


class FetchWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = get_description  # a plain function
        super()._handle_article(article_watcher, action)
        article_watcher.fetch_completed = True


class UploadWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        def action(*args, **kwargs):
            time.sleep(10)  # uploads to archive are throttled to 15/minute; 5 s still triggers a blacklisting
            return run_upload(*args, **kwargs)

        super()._handle_article(article_watcher, action)
        article_watcher.upload_completed = True


class CompressWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = shrink_pdf
        super()._handle_article(article_watcher, action)
        article_watcher.compression_completed = True
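A hedged sketch of how the four workers might be wired up by the external caller that the comments refer to; the watcher object and the orchestration are assumptions:

workers = [DownloadWorker(), FetchWorker(), UploadWorker(), CompressWorker()]
for w in workers:
    w.start()  # each thread now idles on its own queue

# the orchestrator would then hand an article watcher to each stage, e.g.:
# workers[0].process(article_watcher)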