bug fixes

2022-10-06 15:55:30 +02:00
parent cca902b1f2
commit 5eab6c55da
12 changed files with 62 additions and 159 deletions
--- a/news_fetch/Dockerfile
+++ b/news_fetch/Dockerfile
@@ -2,10 +2,6 @@ FROM python:latest

 ENV TZ Europe/Zurich

-RUN apt-get update && apt-get install -y ghostscript
-# for compression of pdfs
-
-
 RUN mkdir -p /app/auto_news

 COPY requirements.txt /app/requirements.txt
--- a/news_fetch/runner.py
+++ b/news_fetch/runner.py
@@ -11,7 +11,7 @@ from collections import OrderedDict

 from utils_mail import runner as MailRunner
 from utils_slack import runner as SlackRunner
-from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
+from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker


 class ArticleWatcher:
@@ -160,10 +160,10 @@ if __name__ == "__main__":
        try:
            slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
            # All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
-            # fetch -> download -> compress -> complete
+            # fetch -> download (-> compress) -> complete
            # This is reflected in the following list of workers:
            workers_in = [
-                OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "CompressWorker": CompressWorker(), "NotifyRunner": "out"}),
+                OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
                OrderedDict({"UploadWorker": UploadWorker()})
            ]
            # The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict
--- a/news_fetch/utils_mail/runner.py
+++ b/news_fetch/utils_mail/runner.py
@@ -15,14 +15,11 @@ def send(article_model):
    mail['From'] = config["sender"]
    mail['To'] = config["recipient"]

-    msgs = article_model.mail_info # this is html
-    msg = [m["reply_text"] for m in msgs]
-    msg = "\n".join(msg)
+    msg, files = article_model.mail_info() # this is html

    content = MIMEText(msg, "html")
    mail.attach(content)

-    files = [m["file_path"] for m in msgs if m["file_path"]]
    for path in files:
        with open(path, 'rb') as file:
            part = MIMEApplication(file.read(), "pdf")
@@ -31,7 +28,12 @@ def send(article_model):
        mail.attach(part)

    try:
-        smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+        try:
+            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+        except ConnectionRefusedError:
+            logger.error("Server refused connection. Is this an error on your side?")
+            return False
+
        smtp.starttls()
        smtp.login(config["uname"], config["password"])
        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
--- a/news_fetch/utils_slack/runner.py
+++ b/news_fetch/utils_slack/runner.py
@@ -154,37 +154,10 @@ class BotApp(App):


    def respond_channel_message(self, article, say=None):
-        if say is None:
-            say = self.say_substitute
-        answers = article.slack_info
        if article.slack_ts == 0:
            self.logger.error(f"{article} has no slack_ts")
        else:
-            self.logger.info("Skipping slack reply because it is broken")
-            for a in []:
-            # for a in answers:
-                if a["file_path"]:
-                    try:
-                        self.client.files_upload(
-                            channels = config["archive_id"],
-                            initial_comment = f"{a['reply_text']}",
-                            file = a["file_path"],
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = True
-                    except SlackApiError as e: # upload resulted in an error
-                        say(
-                            "File {} could not be uploaded.".format(a),
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = False
-                        self.logger.error(f"File upload failed: {e}")
-                else: # anticipated that there is no file!
-                    say(
-                        f"{a['reply_text']}",
-                        thread_ts = article.slack_ts_full
-                    )
-                    # status = True
+            self.logger.info("Skipping slack reply.")
        

    def startup_status(self):
--- a/news_fetch/utils_storage/models.py
+++ b/news_fetch/utils_storage/models.py
@@ -10,6 +10,8 @@ import datetime
 from . import helpers
 config = configuration.main_config["DOWNLOADS"]
 slack_config = configuration.main_config["SLACK"]
+FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
+

 # set the nature of the db at runtime
 download_db = DatabaseProxy()
@@ -94,50 +96,34 @@ class ArticleDownload(DownloadBaseModel):
            desc = f"{self.article_url}"
        return f"ART [{desc}]"

-    @property
-    def slack_info(self):
-        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
-        content = "\n>" + "\n>".join(self.summary.split("\n"))
-        file_status, msg = self.file_status()
-        if not file_status:
-            return [msg]
-        
-        # everything alright: generate real content
-        # first the base file
-        if self.file_name[-4:] == ".pdf":
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}",
-                "file_path" : self.save_path + self.file_name 
-            }]
-        else: # don't upload if the file is too big!
-            location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
-                "file_path" : None 
-            }]
+    def mail_info(self):
+        summary = "\n> " + "\n> ".join(self.summary.split("\n"))
+        answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
+        answer_files = []
+        # displays the summary in a blockquote
+
+        status = self.file_status
+        if status == 1: # file_name was empty
+            return None # there has been an error do not send any message
+        elif status == 2: # no file found at specified location
+            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
+        elif status == 3: # file found but deemed too big
+            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
+            answer_text += f"*{self.title}*\n{summary}\n{location}"
+        else: # everything nominal
+            answer_text += f"*{self.title}*\n{summary}"
+            answer_files.append(self.save_path + self.file_name)

        # then the related files
-        rel_text = ""
-        for r in self.related:
-            fname = r.related_file_name
-            lentry = "\n• `{}` ".format(self.fname_nas(fname))
-            if fname[-4:] == ".pdf": # this is a manageable file, directly upload
-                f_ret = self.save_path + fname
-                answer.append({"reply_text":"", "file_path" : f_ret})
-            else: # not pdf <=> too large. Don't upload but mention its existence
-                lentry += "(not uploaded to slack, but the file will be on the NAS)"
-                
-            rel_text += lentry
-
-        if rel_text:
-            rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
+        if self.related:
+            rel_text = "Related files on NAS:"
+            for r in self.related:
+                fname = r.related_file_name
+                rel_text += f"\n• `{self.fname_nas(fname)}` "
+            
+            answer_text += "\n\n" + rel_text
        
-        return answer
-
-    @property
-    def mail_info(self):
-        base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
-        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
+        return markdown.markdown(answer_text), answer_files


    def set_authors(self, authors):
@@ -157,18 +143,21 @@ class ArticleDownload(DownloadBaseModel):
                article = self,
                related_file_name = r
            )
-
+    
+    @property
    def file_status(self):
+        """0 = file exists, 1 = no file name!, 2 = file does not exit,3 = file exists but is too large"""
        if not self.file_name:
            logger.error(f"Article {self} has no filename!")
-            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
-        
+            return 2
        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
+            return 2
+        if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
+            logger.warning(f"Article {self} has a file that exceeds the file size limit.")
+            return 3

-        return True, {}


 class ArticleAuthor(DownloadBaseModel):
--- a/news_fetch/utils_worker/compress/runner.py
+++ b/news_fetch/utils_worker/compress/runner.py
@@ -1,47 +0,0 @@
-import os
-import subprocess
-from pathlib import Path
-
-import logging
-logger = logging.getLogger(__name__)
-import configuration
-config = configuration.main_config["DOWNLOADS"]
-
-shrink_sizes = []
-
-def shrink_pdf(article):
-    article_loc = Path(article.save_path) / article.file_name
-    initial_size = article_loc.stat().st_size
-    compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
-
-    if article_loc.suffix != "pdf":
-        return article # it probably was a youtube video
-        
-    c = subprocess.run(
-        [
-            "gs",
-            "-sDEVICE=pdfwrite",
-            "-dPDFSETTINGS=/screen",
-            "-dNOPAUSE",
-            "-dBATCH",
-            f"-sOutputFile={compressed_tmp}", 
-            f"{article_loc}"
-        ],
-        stdout=subprocess.PIPE, stderr=subprocess.PIPE
-    )
-
-    if c.returncode == 0:
-        try:
-            os.replace(compressed_tmp, article_loc)
-        except OSError as e:
-            logger.error(f"Compression ran but I could not copy back the file {e}")
-
-        final_size = article_loc.stat().st_size
-        shrink_sizes.append(initial_size - final_size)
-        logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
-
-
-    else:
-        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
-    
-    return article
--- a/news_fetch/utils_worker/download/browser.py
+++ b/news_fetch/utils_worker/download/browser.py
@@ -85,10 +85,8 @@ class PDFDownloader:
            # will be propagated to the saved file (dst) as well

        fname = article_object.fname_template
+        fname = ensure_unique(article_object.save_path, fname)
        dst = os.path.join(article_object.save_path, fname)
-        if os.path.exists(dst):
-            fname = make_path_unique(fname)
-            dst = os.path.join(article_object.save_path, fname)


        if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
@@ -137,7 +135,6 @@ class PDFDownloader:

    def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
        reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
-        print(reduced_profile_path, full_profile_path)
        os.mkdir(reduced_profile_path)
        # copy needed directories
        dirs = ["extensions", "storage"]
@@ -150,13 +147,20 @@ class PDFDownloader:
            shutil.copy(full_profile_path / f, reduced_profile_path)
        
        folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
-        self.logger.info(f"Generated temporary profile with size {folder_size} MB")
+        self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
        return reduced_profile_path




-def make_path_unique(path):
-    fname, ending = os.path.splitext(path)
-    fname += datetime.datetime.now().strftime("%d-%H%M%S")
-    return fname + ending
+def ensure_unique(path, fname):
+    fbase, ending = os.path.splitext(fname)
+
+    exists = os.path.exists(os.path.join(path, fname))
+    i = 1
+    while exists:
+        fname = fbase + f" -- fetch {i}" + ending
+        i += 1
+        exists = os.path.exists(os.path.join(path, fname))
+    
+    return fname
--- a/news_fetch/utils_worker/workers.py
+++ b/news_fetch/utils_worker/workers.py
@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
 from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
-from .compress.runner import shrink_pdf
+

 import time
 import logging
@@ -53,14 +53,3 @@ class UploadWorker(TemplateWorker):

        super()._handle_article(article_watcher, action)
        # article_watcher.upload_completed = True
-
-
-
-class CompressWorker(TemplateWorker):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _handle_article(self, article_watcher):
-        action = shrink_pdf
-        super()._handle_article(article_watcher, action)
-        # article_watcher.compression_completed = True