Many bug fixes

2022-04-20 16:49:55 +02:00
parent 0d76bcbb98
commit 8f3ea25662
16 changed files with 223 additions and 118 deletions
--- a/app/utils_worker/download/browser.py
+++ b/app/utils_worker/download/browser.py
@@ -7,10 +7,10 @@ import requests
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 import configuration
+import json

 config = configuration.parsed["DOWNLOADS"]
-
-
+blacklisted = json.loads(config["blacklisted_href_domains"])

 class PDFDownloader:
    """Saves a given url. Fills the object it got as a parameter"""
@@ -61,10 +61,6 @@ class PDFDownloader:
        self.autostart()
        url = article_object.article_url

-        # arbitrary bug fixes:
-        if "focus.de" in url or "bloomberg.com" in url:
-            url = url.replace("https://", "https://outline.com/")
-            sleep_time += 5
        try:
            self.driver.get(url)
        except Exception as e:
@@ -97,7 +93,7 @@ class PDFDownloader:

        if success:
            article_object.file_name = fname
-            article_object.set_references = self.get_references()
+            article_object.set_references(self.get_references())
        else:
            article_object.file_name = ""
        
@@ -140,10 +136,12 @@ class PDFDownloader:
            hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
        except:
            hrefs = []
-        # TODO TEST THIS
+        old = hrefs
        hrefs = [h for h in hrefs \
-            if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
+            if not sum([(domain in h) for domain in blacklisted]) # sum() counts the blacklisted domains found in h; keep h only when that count is 0
            ] # filter a tiny bit at least
+        diff = set(old) ^ set(hrefs)
+        self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before:{len(old)}, after: {len(hrefs)})")
        return hrefs


--- a/app/utils_worker/download/youtube.py
+++ b/app/utils_worker/download/youtube.py
@@ -1,33 +1,65 @@
-import logging
+from __future__ import unicode_literals
+import youtube_dl
 import os
-from pytube import YouTube
+import logging

 logger = logging.getLogger(__name__)


-def save_video(article_object):
-    """Saves video accoring to url and save path"""
-    url = article_object.article_url
-    logger.info("Saving new video")
-    try:
-        yt = YouTube(url)
-        streams = yt.streams.filter(progressive=True).order_by('resolution')
-    except Exception as e:
-        article_object.file_name = "ERROR: {}".format(e)
-        return article_object
+class MyLogger(object):
+    def debug(self, msg): pass
+    def warning(self, msg): pass
+    def error(self, msg):
+        logger.error(msg)

-    if streams: # if it's not empty
-        vid = streams[-1]
-        article_object.source_name = "youtube.com"
-        article_object.title = yt.title
+
+
+class YouTubeDownloader:
+    def __init__(self) -> None:
+        pass
+
+
+    def post_download_hook(self, ret_code):
+        # print(ret_code)
+        if ret_code['status'] == 'finished':
+            file_loc = ret_code["filename"]
+            fname = os.path.basename(file_loc)
+            self.article_object.file_name = fname
+
+
+    def save_video(self, article_object):
+        """Saves video according to url and save path"""
+        self.article_object = article_object
+        url = article_object.article_url
+        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
+        ydl_opts = {
+            'format': 'best[height<=720]',
+            'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
+            'logger': MyLogger(),
+            'progress_hooks': [self.post_download_hook],
+            'updatetime': False
+        }
        try:
-            vid.download(file_path)
-            article_object.file_name = article_object.fname_template
+            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+                # article file name is updated in self.post_download_hook
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
-            article_object.file_name = "Error while downloading"
-    else:
-        article_object.file_name = "No streams available"
-    
-    return article_object
+            article_object.file_name = ""
+
+        return article_object
+
+
+
+# class DummyArticle:
+#     article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html"
+#     save_path = "/app/file_storage/"
+#     fname_template = "www.youtube.com -- Test"
+#     file_name = ""
+
+# m = DummyArticle()
+# t = YouTubeDownloader()
+# t.save_video(m)
+
+# print(m.file_name)