Many bug fixes

2022-04-20 16:49:55 +02:00
parent 0d76bcbb98
commit 8f3ea25662
16 changed files with 223 additions and 118 deletions
--- a/app/utils_worker/compress/runner.py
+++ b/app/utils_worker/compress/runner.py
@@ -9,6 +9,9 @@ shrink_sizes = []

 def shrink_pdf(article):
    initial_size = os.path.getsize(article.save_path + article.file_name)
+    if article.file_name[-4:] != ".pdf":
+        return article # it probably was a youtube video
+        
    c = subprocess.run(
        ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'],
        stdout=subprocess.PIPE,
--- a/app/utils_worker/download/browser.py
+++ b/app/utils_worker/download/browser.py
@@ -7,10 +7,10 @@ import requests
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 import configuration
+import json

 config = configuration.parsed["DOWNLOADS"]
-
-
+blacklisted = json.loads(config["blacklisted_href_domains"])

 class PDFDownloader:
    """Saves a given url. Fills the object it got as a parameter"""
@@ -61,10 +61,6 @@ class PDFDownloader:
        self.autostart()
        url = article_object.article_url

-        # arbitrary bug fixes:
-        if "focus.de" in url or "bloomberg.com" in url:
-            url = url.replace("https://", "https://outline.com/")
-            sleep_time += 5
        try:
            self.driver.get(url)
        except Exception as e:
@@ -97,7 +93,7 @@ class PDFDownloader:

        if success:
            article_object.file_name = fname
-            article_object.set_references = self.get_references()
+            article_object.set_references(self.get_references())
        else:
            article_object.file_name = ""
        
@@ -140,10 +136,12 @@ class PDFDownloader:
            hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
        except:
            hrefs = []
-        # TODO TEST THIS
+        old = hrefs
        hrefs = [h for h in hrefs \
-            if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
+            if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
            ] # filter a tiny bit at least
+        diff = set(old) ^ set(hrefs)
+        self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before:{len(old)}, after: {len(hrefs)})")
        return hrefs


--- a/app/utils_worker/download/youtube.py
+++ b/app/utils_worker/download/youtube.py
@@ -1,33 +1,65 @@
-import logging
+from __future__ import unicode_literals
+import youtube_dl
 import os
-from pytube import YouTube
+import logging

 logger = logging.getLogger(__name__)


-def save_video(article_object):
-    """Saves video accoring to url and save path"""
-    url = article_object.article_url
-    logger.info("Saving new video")
-    try:
-        yt = YouTube(url)
-        streams = yt.streams.filter(progressive=True).order_by('resolution')
-    except Exception as e:
-        article_object.file_name = "ERROR: {}".format(e)
-        return article_object
+class MyLogger(object):
+    def debug(self, msg): pass
+    def warning(self, msg): pass
+    def error(self, msg):
+        logger.error(msg)

-    if streams: # if it's not empty
-        vid = streams[-1]
-        article_object.source_name = "youtube.com"
-        article_object.title = yt.title
+
+
+class YouTubeDownloader:
+    def __init__(self) -> None:
+        pass
+
+
+    def post_download_hook(self, ret_code):
+        # print(ret_code)
+        if ret_code['status'] == 'finished':
+            file_loc = ret_code["filename"]
+            fname = os.path.basename(file_loc)
+            self.article_object.file_name = fname
+
+
+    def save_video(self, article_object):
+        """Saves video according to url and save path"""
+        self.article_object = article_object
+        url = article_object.article_url
+        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
+        ydl_opts = {
+            'format': 'best[height<=720]',
+            'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
+            'logger': MyLogger(),
+            'progress_hooks': [self.post_download_hook],
+            'updatetime': False
+        }
        try:
-            vid.download(file_path)
-            article_object.file_name = article_object.fname_template
+            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+                # article file name is updated in self.post_download_hook
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
-            article_object.file_name = "Error while downloading"
-    else:
-        article_object.file_name = "No streams available"
-    
-    return article_object
+            article_object.file_name = ""
+
+        return article_object
+
+
+
+# class DummyArticle:
+#     article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html"
+#     save_path = "/app/file_storage/"
+#     fname_template = "www.youtube.com -- Test"
+#     file_name = ""
+
+# m = DummyArticle()
+# t = YouTubeDownloader()
+# t.save_video(m)
+
+# print(m.file_name)
--- a/app/utils_worker/fetch/runner.py
+++ b/app/utils_worker/fetch/runner.py
@@ -37,24 +37,28 @@ def get_description(article_object):
    except:
        news_article = fallback

-
    if news_article.title:
        title = news_article.title
    else:
        title = fallback.title

-
    if news_article.summary:
        summary = news_article.summary
    elif news_article.text:
        ind = min(500, len(news_article.text))
        summary = news_article.text[:ind] + "..."
    else:
-        summary = fallback.summary        
+        summary = fallback.summary
+
+    if news_article.meta_lang:
+        lang = news_article.meta_lang
+    else:
+        lang = ""

    article_object.title = title
    article_object.summary = summary
+    article_object.language = lang
    article_object.set_authors(news_article.authors)
    article_object.set_keywords(news_article.keywords)
-    
+
    return article_object
--- a/app/utils_worker/upload/runner.py
+++ b/app/utils_worker/upload/runner.py
@@ -9,10 +9,10 @@ def upload_to_archive(article_object):
    try:
        wayback = WaybackMachineSaveAPI(url, user_agent)
        archive_url = wayback.save()
-        logger.info(f"{url} uploaded to archive successfully")
+        # logger.info(f"{url} uploaded to archive successfully")
        article_object.archive_url = archive_url
    except Exception as e:
        article_object.archive_url = "Error while uploading: {}".format(e)
-        logger.error(f"Error while generating new url: {e}")
+        logger.error(f"Error while generating archive url: {e}")

    return article_object
--- a/app/utils_worker/worker_template.py
+++ b/app/utils_worker/worker_template.py
@@ -1,7 +1,6 @@
 from threading import Thread
 import time
 import logging
-# logger = logging.getLogger(__name__)


 class TemplateWorker(Thread):
@@ -34,7 +33,6 @@ class TemplateWorker(Thread):
                

    def _handle_article(self, article_watcher, action=None):
-        # TODO Overload in children classes
        if action is None:
            self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
        else:
--- a/app/utils_worker/workers.py
+++ b/app/utils_worker/workers.py
@@ -1,6 +1,6 @@
 from .worker_template import TemplateWorker
 from .download.browser import PDFDownloader
-from .download.youtube import save_video
+from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
 from .compress.runner import shrink_pdf
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 class DownloadWorker(TemplateWorker):
    def __init__(self) -> None:
        self.dl_runner = PDFDownloader().download
-        self.yt_runner = save_video
+        self.yt_runner = YouTubeDownloader().save_video
        super().__init__()

    def _handle_article(self, article_watcher):