Bug fixes, makefile for launch

2022-12-09 11:19:45 +01:00
parent 24b3bc3b51
commit 647944d23c
25 changed files with 321 additions and 300 deletions
--- a/news_fetch/configuration.py
+++ b/news_fetch/configuration.py
@@ -1,9 +1,7 @@
-import os
-import configparser
-import logging
 import time
-# import shutil
-# from datetime import datetime
+import os
+import logging
+import yaml
 from peewee import SqliteDatabase, PostgresqlDatabase
 from rich.logging import RichHandler

@@ -19,22 +17,21 @@ logger = logging.getLogger(__name__)


 # load config file containing constants and secrets
-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)


 # DEBUG MODE:
 if os.getenv("DEBUG", "false") == "true":
    logger.warning("Found 'DEBUG=true', setting up dummy databases")
    
-    main_config["SLACK"]["archive_id"] = main_config["SLACK"]["debug_id"]
-    main_config["MAIL"]["recipient"] = main_config["MAIL"]["sender"]
-    main_config["DOWNLOADS"]["local_storage_path"] = main_config["DOWNLOADS"]["debug_storage_path"]
+    config["slack"]["archive_id"] = config["slack"]["debug_id"]
+    config["mail"]["recipient"] = config["mail"]["sender"]
+    config["downloads"]["local_storage_path"] = config["downloads"]["debug_storage_path"]

    download_db = SqliteDatabase(
-        main_config["DATABASE"]["download_db_debug"],
+        config["database"]["debug_db"],
        pragmas = {'journal_mode': 'wal'} # mutliple threads can read at once
    )

@@ -43,9 +40,9 @@ else:
    logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
    
    time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
-    cred = db_config["DATABASE"]
+    cred = config["database"]
    download_db = PostgresqlDatabase(
-        cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+        cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
    )
    # TODO Reimplement backup/printout
    # logger.info("Backing up databases")
--- a/news_fetch/requirements.txt
+++ b/news_fetch/requirements.txt
@@ -10,3 +10,4 @@ markdown
 rich
 psycopg2
 unidecode
+pyyaml
--- a/news_fetch/utils_mail/runner.py
+++ b/news_fetch/utils_mail/runner.py
@@ -7,16 +7,20 @@ import logging
 import configuration

 logger = logging.getLogger(__name__)
-config = configuration.main_config["MAIL"]
+mail_config = configuration.config["mail"]

 def send(article_model):
    mail = MIMEMultipart()
    mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
-    mail['From'] = config["sender"]
-    mail['To'] = config["recipient"]
-
-    msg, files = article_model.mail_info() # this is html
+    mail['From'] = mail_config["sender"]
+    mail['To'] = mail_config["recipient"]

+    try:
+        msg, files = article_model.mail_info() # this is html
+    except: # Raised by model if article has no associated file
+        logger.info("Skipping mail sending")
+        return
+        
    content = MIMEText(msg, "html")
    mail.attach(content)

@@ -29,14 +33,14 @@ def send(article_model):

    try:
        try:
-            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+            smtp = smtplib.SMTP(mail_config["smtp_server"], mail_config["port"])
        except ConnectionRefusedError:
            logger.error("Server refused connection. Is this an error on your side?")
            return False

        smtp.starttls()
-        smtp.login(config["uname"], config["password"])
-        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
+        smtp.login(mail_config["uname"], mail_config["password"])
+        smtp.sendmail(mail_config["sender"], mail_config["recipient"], mail.as_string())
        smtp.quit()
        logger.info("Mail successfully sent.")
    except smtplib.SMTPException as e:
--- a/news_fetch/utils_slack/runner.py
+++ b/news_fetch/utils_slack/runner.py
@@ -7,7 +7,7 @@ import re
 import time

 import configuration
-config = configuration.main_config["SLACK"]
+slack_config = configuration.config["slack"]
 models = configuration.models

 class MessageIsUnwanted(Exception):
@@ -61,7 +61,7 @@ class Message:

    @property
    def is_by_human(self):
-        return self.user.user_id != config["bot_id"]
+        return self.user.user_id != slack_config["bot_id"]

    
    @property
@@ -87,7 +87,7 @@ class BotApp(App):

    def say_substitute(self, *args, **kwargs):
        self.client.chat_postMessage(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
            text=" - ".join(args),
            **kwargs
        )
@@ -101,7 +101,7 @@ class BotApp(App):
            last_ts = presaved.slack_ts_full

        result = self.client.conversations_history(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
            oldest=last_ts
        )

@@ -116,7 +116,7 @@ class BotApp(App):
        while refetch: # we have not actually fetched them all
            try:
                result = self.client.conversations_history(
-                    channel = config["archive_id"],
+                    channel = slack_config["archive_id"],
                    cursor = result["response_metadata"]["next_cursor"],
                    oldest = last_ts
                ) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
@@ -126,8 +126,8 @@ class BotApp(App):
                for m in new_messages:
                    return_messages.append(Message(m))
            except SlackApiError: # Most likely a rate-limit
-                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
-                time.sleep(config["api_wait_time"])
+                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(slack_config["api_wait_time"]))
+                time.sleep(slack_config["api_wait_time"])
                refetch = True
        
        self.logger.info(f"Fetched {len(return_messages)} new channel messages.")
@@ -181,7 +181,7 @@ class BotRunner():

    """Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
    def __init__(self, callback, *args, **kwargs) -> None:
-        self.bot_worker = BotApp(callback, token=config["auth_token"])
+        self.bot_worker = BotApp(callback, token=slack_config["auth_token"])

        @self.bot_worker.event(event="message", matchers=[is_message_in_archiving])
        def handle_incoming_message(message, say):
@@ -195,7 +195,7 @@ class BotRunner():
        def handle_all_other_reactions(event, say):
            self.logger.log("Ignoring slack event that isn't a message")

-        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
+        self.handler = SocketModeHandler(self.bot_worker, slack_config["app_token"])


    def start(self):
@@ -215,5 +215,5 @@ class BotRunner():


 def is_message_in_archiving(message) -> bool:
-    return message["channel"] == config["archive_id"]
+    return message["channel"] == slack_config["archive_id"]

--- a/news_fetch/utils_storage/models.py
+++ b/news_fetch/utils_storage/models.py
@@ -8,8 +8,7 @@ import configuration
 import datetime

 from . import helpers
-config = configuration.main_config["DOWNLOADS"]
-slack_config = configuration.main_config["SLACK"]
+downloads_config = configuration.config["downloads"]
 FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB


@@ -34,7 +33,8 @@ class ArticleDownload(DownloadBaseModel):
    def is_title_bad(self):  # add incrementally
        return "PUR-Abo" in self.title \
            or "Redirecting" in self.title \
-            or "Error while running fetch" in self.title
+            or "Error while running fetch" in self.title \
+            or self.title == ""

    summary = TextField(default = '')
    source_name = CharField(default = '')
@@ -44,14 +44,14 @@ class ArticleDownload(DownloadBaseModel):
    file_name = TextField(default = '')
    @property
    def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
    @property
    def fname_nas(self, file_name=""):
        if self.download_date:
            if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
            else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
        else:
            return None
    @property
@@ -102,18 +102,22 @@ class ArticleDownload(DownloadBaseModel):
        answer_files = []
        # displays the summary in a blockquote

-        status = self.file_status
-        if status == 1: # file_name was empty
-            return None # there has been an error do not send any message
-        elif status == 2: # no file found at specified location
-            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
-        elif status == 3: # file found but deemed too big
-            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
-            answer_text += f"*{self.title}*\n{summary}\n{location}"
-        else: # everything nominal
+        try:
+            self.ensure_file_present()
            answer_text += f"*{self.title}*\n{summary}"
            answer_files.append(self.save_path + self.file_name)

+        except Exception as e:
+            msg = e.args[0]
+            logger.error(f"Article {self} has file-issues: {msg}")
+            if "file too big" in msg:
+                location = f"File too big to send directly. Location on NAS:\n`{self.fname_nas}`"
+                answer_text += f"*{self.title}*\n{summary}\n{location}"
+                
+            else: # file not found, or filename not set
+                raise e
+                # reraise the exception, so that the caller can handle it
+
        # then the related files
        if self.related:
            rel_text = "Related files on NAS:"
@@ -144,19 +148,14 @@ class ArticleDownload(DownloadBaseModel):
                related_file_name = r
            )
    
-    @property
-    def file_status(self):
-        """0 = file exists, 1 = no file name!, 2 = file does not exit,3 = file exists but is too large"""
+    def ensure_file_present(self):
        if not self.file_name:
-            logger.error(f"Article {self} has no filename!")
-            return 2
+            raise Exception("no filename")
        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
-            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return 2
+            raise Exception("file not found")
        if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
-            logger.warning(f"Article {self} has a file that exceeds the file size limit.")
-            return 3
+            raise Exception("file too big")



--- a/news_fetch/utils_worker/download/browser.py
+++ b/news_fetch/utils_worker/download/browser.py
@@ -11,7 +11,7 @@ from selenium import webdriver

 import configuration

-config = configuration.main_config["DOWNLOADS"]
+download_config = configuration.config["downloads"]

 def driver_running(f):
    def wrapper(*args, **kwargs):
@@ -66,74 +66,88 @@ class PDFDownloader:

    @driver_running
    def download(self, article_object):
-        sleep_time = int(config["browser_print_delay"])
        url = article_object.article_url

+
+        if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
+            self.logger.info("Downloading existing pdf")
+            success = self.get_exisiting_pdf(article_object)
+            # get a page title if required
+            if article_object.is_title_bad:
+                article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
+                # will be propagated to the saved file (dst) as well
+        else:
+            success = self.get_new_pdf(article_object)
+
+        if not success:
+            self.logger.error("Download failed")
+        # TODO: need to reset the file name to empty?
+        return article_object # changes to this are saved later by the external caller
+
+
+    def get_exisiting_pdf(self, article_object):
+        # get a better page title if required
+        if article_object.is_title_bad:
+            article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
        try:
-            self.driver.get(url)
+            r = requests.get(article_object.article_url)
+            bytes = r.content
+        except:
+            return False
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_new_pdf(self, article_object):
+        sleep_time = int(download_config["browser_print_delay"])
+
+        try:
+            self.driver.get(article_object.article_url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
-            return article_object  # without changes
+            return False
        
        time.sleep(sleep_time)
        # leave the page time to do any funky business

-        # in the mean time, get a page title if required
        if article_object.is_title_bad:
-            article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
-            # will be propagated to the saved file (dst) as well
+            article_object.title = self.driver.title

+        try:
+            result = self.driver.print_page()
+            bytes = base64.b64decode(result, validate=True)
+        except:
+            self.logger.error("Failed, probably because the driver went extinct.")
+            return False
+
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_file_destination(self, article_object):
        fname = article_object.fname_template
        fname = ensure_unique(article_object.save_path, fname)
        dst = os.path.join(article_object.save_path, fname)
+        return dst, fname


-        if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
-            success = self.get_exisiting_pdf(url, dst)
-        else:
-            success = self.get_new_pdf(dst)
-
-        if success:
-            article_object.file_name = fname
-        else:
-            article_object.file_name = ""
-        
-        return article_object # this change is saved later by the external caller
-
-
-    def get_exisiting_pdf(self, url, dst):
-        try:
-            r = requests.get(url)
-            bytes = r.content
-        except:
-            return False
-        return self.get_new_pdf(dst, other_bytes=bytes)
-
-
-    def get_new_pdf(self, dst, other_bytes=None):
+    def write_pdf(self, content, article_object):
+        dst, fname = self.get_file_destination(article_object)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
-
-        if other_bytes is None:
-            try:
-                result = self.driver.print_page()
-                bytes = base64.b64decode(result, validate=True)
-            except:
-                self.logger.error("Failed, probably because the driver went extinct.")
-                return False
-        else:
-            bytes = other_bytes
-
+        
        try:
            with open(dst, "wb+") as f:
-                f.write(bytes)
+                f.write(content)
+            
+            article_object.file_name = fname
            return True
        except Exception as e:
            self.logger.error(f"Failed, because of FS-operation: {e}")
            return False


-    def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
+
+        
+    def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
        reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
        os.mkdir(reduced_profile_path)
        # copy needed directories
--- a/news_fetch/utils_worker/download/youtube.py
+++ b/news_fetch/utils_worker/download/youtube.py
@@ -1,10 +1,11 @@
 import youtube_dl
 import os
 import logging
+import configuration

+download_config = configuration.config["downloads"]
 logger = logging.getLogger(__name__)

-
 class MyLogger(object):
    def debug(self, msg): pass
    def warning(self, msg): pass
@@ -19,7 +20,6 @@ class YouTubeDownloader:


    def post_download_hook(self, ret_code):
-        # print(ret_code)
        if ret_code['status'] == 'finished':
            file_loc = ret_code["filename"]
            fname = os.path.basename(file_loc)
@@ -35,9 +35,11 @@ class YouTubeDownloader:
        ydl_opts = {
            'format': 'best[height<=720]',
            'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
-            'logger': MyLogger(),
+            'logger': MyLogger(), # suppress verbosity
            'progress_hooks': [self.post_download_hook],
-            'updatetime': False
+            'updatetime': False,
+            # File is also used by firefox so make sure to not write to it!
+            # youtube dl apparently does not support cookies.sqlite and the documentation is not clear on how to use cookies.txt
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@@ -46,5 +48,9 @@ class YouTubeDownloader:
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""
+            logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv")
+            logger.info(f"Logging youtube errors seperately to {logfile}")
+            with open(logfile, "a+") as f:
+                f.write(f"{url}\n")

        return article_object