bug fixes

Remy Moll 2022-10-06 15:55:30 +02:00
parent cca902b1f2
commit 5eab6c55da
12 changed files with 62 additions and 159 deletions

.gitignore
View File

@@ -1,5 +1,4 @@
 .dev/
-.vscode/
 *.pyc
 *.log
 __pycache__/
@@ -23,7 +22,6 @@ dist-ssr
 # Editor directories and files
 .vscode/*
-!.vscode/extensions.json
 .idea
 .DS_Store
 *.suo

View File

@@ -34,7 +34,7 @@ Two additional 'modes' are `build` and `down`. Build rebuilds the container, whi
 In essence a command is simply a service from docker-compose, which is run in an interactive environment. As such all services defined in `docker-compose.yaml` can be called as commands. Only two of them will be of real use:
 
-`news_fetch` does the majority of the actions mentioned above. By default, that is without any options, it runs a metadata-fetch, download, compression, and upload to archive.org. The upload is usually the slowest step, which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload` which only starts the upload for the concerned articles, as a catch-up if you will.
+`news_fetch` does the majority of the actions mentioned above. By default, that is without any options, it runs a metadata-fetch, download, and upload to archive.org. The upload is usually the slowest step, which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload` which only starts the upload for the concerned articles, as a catch-up if you will.
 
 Example usage:

View File

@@ -25,7 +25,6 @@ def fetch():
         "worker_download" : runner.DownloadWorker(),
         "worker_fetch" : runner.FetchWorker(),
         "worker_upload" : runner.UploadWorker(),
-        "worker_compress" : runner.CompressWorker(),
     }
 
     coordinator.add_workers(**kwargs)
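The workers are handed over as keyword arguments. A minimal sketch of a coordinator compatible with this call; only the `add_workers(**kwargs)` shape comes from this file, the rest is an assumption:

```python
# Hypothetical sketch: stores each worker under the keyword it was passed as,
# e.g. worker_download=runner.DownloadWorker().
class Coordinator:
    def __init__(self):
        self.workers = {}

    def add_workers(self, **kwargs):
        self.workers.update(kwargs)

    def start_all(self):
        # assumes workers are thread-like objects exposing start()
        for name, worker in self.workers.items():
            worker.start()
```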

View File

@@ -28,4 +28,4 @@ default_download_path: /app/containerdata/tmp
 remote_storage_path: /helbing_support/Files RM/Archiving
 browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
 # please keep this exact name
-browser_print_delay: 5
+browser_print_delay: 3
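`browser_print_delay` now waits 3 seconds instead of 5 before printing a page to PDF. A hypothetical sketch of how such a setting might be consumed; the flat config layout and the surrounding downloader code are assumptions:

```python
# Hypothetical consumer of browser_print_delay: sleep so the page finishes
# rendering before it is printed to PDF.
import time
import yaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

time.sleep(config["browser_print_delay"])  # 3 s after this commit
# ... trigger the browser's print-to-pdf here ...
```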

View File

@@ -2,10 +2,6 @@ FROM python:latest
 ENV TZ Europe/Zurich
 
-RUN apt-get update && apt-get install -y ghostscript
-# for compression of pdfs
 
 RUN mkdir -p /app/auto_news
 COPY requirements.txt /app/requirements.txt

View File

@@ -11,7 +11,7 @@ from collections import OrderedDict
 from utils_mail import runner as MailRunner
 from utils_slack import runner as SlackRunner
-from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
+from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker
 
 class ArticleWatcher:
@@ -160,10 +160,10 @@ if __name__ == "__main__":
     try:
         slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
         # All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
-        # fetch -> download -> compress -> complete
+        # fetch -> download (-> compress) -> complete
         # This is reflected in the following list of workers:
         workers_in = [
-            OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "CompressWorker": CompressWorker(), "NotifyRunner": "out"}),
+            OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
             OrderedDict({"UploadWorker": UploadWorker()})
         ]
         # The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict
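The comment describes two independent pipelines whose stages run strictly in order. A rough sketch of that scheduling idea, assuming each worker exposes some per-article entry point (`process` below is a placeholder name):

```python
from collections import OrderedDict
from threading import Thread

def run_pipeline(stages: OrderedDict, article) -> None:
    # Stages run in order within one pipeline; "out" marks the hand-off to
    # the notifier (mirroring the "NotifyRunner": "out" entry above).
    for name, worker in stages.items():
        if worker == "out":
            print(f"{name}: article ready for notification")
        else:
            worker.process(article)  # placeholder per-worker entry point

def dispatch(pipelines: list, article) -> None:
    # The dicts are independent, so the first stage of each pipeline can
    # start at the same time: one thread per pipeline.
    for stages in pipelines:
        Thread(target=run_pipeline, args=(stages, article)).start()
```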

View File

@@ -15,14 +15,11 @@ def send(article_model):
     mail['From'] = config["sender"]
     mail['To'] = config["recipient"]
 
-    msgs = article_model.mail_info # this is html
-    msg = [m["reply_text"] for m in msgs]
-    msg = "\n".join(msg)
+    msg, files = article_model.mail_info() # this is html
 
     content = MIMEText(msg, "html")
     mail.attach(content)
 
-    files = [m["file_path"] for m in msgs if m["file_path"]]
     for path in files:
         with open(path, 'rb') as file:
             part = MIMEApplication(file.read(), "pdf")
@@ -31,7 +28,12 @@ def send(article_model):
             mail.attach(part)
 
     try:
-        smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+        try:
+            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+        except ConnectionRefusedError:
+            logger.error("Server refused connection. Is this an error on your side?")
+            return False
         smtp.starttls()
         smtp.login(config["uname"], config["password"])
         smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
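`mail_info()` now returns a `(html_body, attachment_paths)` tuple, or `None` when no file name was recorded. Since `send()` unpacks the result directly, a caller-side guard is worth considering; a sketch of one such hypothetical helper, not part of this commit:

```python
import logging

logger = logging.getLogger(__name__)

def mail_payload(article_model):
    """Hypothetical guard: returns (html_body, attachment_paths) or None.
    mail_info() yields None when file_status == 1 (no file name), and the
    bare unpacking in send() would raise TypeError in that case."""
    info = article_model.mail_info()
    if info is None:
        logger.warning("Skipping mail: article has no saved file.")
        return None
    msg, files = info
    return msg, files
```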

View File

@@ -154,37 +154,10 @@ class BotApp(App):
     def respond_channel_message(self, article, say=None):
-        if say is None:
-            say = self.say_substitute
-        answers = article.slack_info
         if article.slack_ts == 0:
             self.logger.error(f"{article} has no slack_ts")
         else:
-            self.logger.info("Skipping slack reply because it is broken")
-            for a in []:
-            # for a in answers:
-                if a["file_path"]:
-                    try:
-                        self.client.files_upload(
-                            channels = config["archive_id"],
-                            initial_comment = f"{a['reply_text']}",
-                            file = a["file_path"],
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = True
-                    except SlackApiError as e: # upload resulted in an error
-                        say(
-                            "File {} could not be uploaded.".format(a),
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = False
-                        self.logger.error(f"File upload failed: {e}")
-                else: # anticipated that there is no file!
-                    say(
-                        f"{a['reply_text']}",
-                        thread_ts = article.slack_ts_full
-                    )
-                    # status = True
+            self.logger.info("Skipping slack reply.")
 
     def startup_status(self):

View File

@@ -10,6 +10,8 @@ import datetime
 from . import helpers
 config = configuration.main_config["DOWNLOADS"]
 slack_config = configuration.main_config["SLACK"]
+
+FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
 
 # set the nature of the db at runtime
 download_db = DatabaseProxy()
@@ -94,50 +96,34 @@ class ArticleDownload(DownloadBaseModel):
         desc = f"{self.article_url}"
         return f"ART [{desc}]"
 
-    @property
-    def slack_info(self):
-        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
-        content = "\n>" + "\n>".join(self.summary.split("\n"))
-        file_status, msg = self.file_status()
-        if not file_status:
-            return [msg]
-
-        # everything alright: generate real content
-        # first the base file
-        if self.file_name[-4:] == ".pdf":
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}",
-                "file_path" : self.save_path + self.file_name
-            }]
-        else: # don't upload if the file is too big!
-            location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
-                "file_path" : None
-            }]
+    def mail_info(self):
+        summary = "\n> " + "\n> ".join(self.summary.split("\n")) # displays the summary in a blockquote
+        answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
+        answer_files = []
+
+        status = self.file_status
+        if status == 1: # file_name was empty
+            return None # there has been an error, do not send any message
+        elif status == 2: # no file found at the specified location
+            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
+        elif status == 3: # file found but deemed too big
+            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
+            answer_text += f"*{self.title}*\n{summary}\n{location}"
+        else: # everything nominal
+            answer_text += f"*{self.title}*\n{summary}"
+            answer_files.append(self.save_path + self.file_name)
 
         # then the related files
-        rel_text = ""
-        for r in self.related:
-            fname = r.related_file_name
-            lentry = "\n• `{}` ".format(self.fname_nas(fname))
-            if fname[-4:] == ".pdf": # this is a manageable file, directly upload
-                f_ret = self.save_path + fname
-                answer.append({"reply_text":"", "file_path" : f_ret})
-            else: # not pdf <=> too large. Don't upload but mention its existence
-                lentry += "(not uploaded to slack, but the file will be on the NAS)"
-            rel_text += lentry
-
-        if rel_text:
-            rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
-
-        return answer
-
-    @property
-    def mail_info(self):
-        base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
-        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
+        if self.related:
+            rel_text = "Related files on NAS:"
+            for r in self.related:
+                fname = r.related_file_name
+                rel_text += f"\n• `{self.fname_nas(fname)}` "
+
+            answer_text += "\n\n" + rel_text
+
+        return markdown.markdown(answer_text), answer_files
 
     def set_authors(self, authors):
@@ -157,18 +143,21 @@ class ArticleDownload(DownloadBaseModel):
             article = self,
             related_file_name = r
         )
 
+    @property
     def file_status(self):
+        """0 = file exists, 1 = no file name, 2 = file does not exist, 3 = file exists but is too large"""
         if not self.file_name:
             logger.error(f"Article {self} has no filename!")
-            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
+            return 1
 
         file_path_abs = self.save_path + self.file_name
         if not os.path.exists(file_path_abs):
             logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
+            return 2
 
-        return True, {}
+        if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
+            logger.warning(f"Article {self} has a file that is not a pdf or exceeds the size limit.")
+            return 3
+
+        return 0
 
 class ArticleAuthor(DownloadBaseModel):
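The numeric status codes replace the old `(bool, dict)` tuple. A small hypothetical helper summarising their meanings, taken from the docstring above (`file_status` is now a property, so it is read without parentheses):

```python
def describe_status(article) -> str:
    """Hypothetical helper; the code meanings come from the file_status
    docstring above."""
    return {
        0: "file exists and will be attached",
        1: "no file name recorded, mail_info() returns None",
        2: "file name set, but nothing exists at that path",
        3: "not a pdf, or larger than FILE_SIZE_THRESHOLD (15 MB)",
    }[article.file_status]
```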

View File

@ -1,47 +0,0 @@
import os
import subprocess
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
import configuration
config = configuration.main_config["DOWNLOADS"]
shrink_sizes = []
def shrink_pdf(article):
article_loc = Path(article.save_path) / article.file_name
initial_size = article_loc.stat().st_size
compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
if article_loc.suffix != "pdf":
return article # it probably was a youtube video
c = subprocess.run(
[
"gs",
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/screen",
"-dNOPAUSE",
"-dBATCH",
f"-sOutputFile={compressed_tmp}",
f"{article_loc}"
],
stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if c.returncode == 0:
try:
os.replace(compressed_tmp, article_loc)
except OSError as e:
logger.error(f"Compression ran but I could not copy back the file {e}")
final_size = article_loc.stat().st_size
shrink_sizes.append(initial_size - final_size)
logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
else:
logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
return article
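For reference, the removed helper shelled out to Ghostscript. Note that `article_loc.suffix != "pdf"` could never match, since `Path.suffix` includes the leading dot, so the early return for non-PDF files never fired. A standalone, corrected sketch of the same invocation:

```python
import subprocess
from pathlib import Path

def compress_pdf(src: Path, tmp: Path) -> bool:
    """Standalone sketch of the Ghostscript call the removed module made.
    Path.suffix includes the dot, hence the ".pdf" comparison."""
    if src.suffix != ".pdf":
        return False  # e.g. a downloaded video, nothing to compress
    result = subprocess.run(
        ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen",
         "-dNOPAUSE", "-dBATCH", f"-sOutputFile={tmp}", str(src)],
        capture_output=True,
    )
    if result.returncode != 0:
        return False
    src.write_bytes(tmp.read_bytes())  # copy the compressed file back
    return True
```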

View File

@@ -85,10 +85,8 @@ class PDFDownloader:
         # will be propagated to the saved file (dst) as well
         fname = article_object.fname_template
+        fname = ensure_unique(article_object.save_path, fname)
         dst = os.path.join(article_object.save_path, fname)
-        if os.path.exists(dst):
-            fname = make_path_unique(fname)
-            dst = os.path.join(article_object.save_path, fname)
 
         if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
@@ -137,7 +135,6 @@ class PDFDownloader:
     def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
-        print(reduced_profile_path, full_profile_path)
         os.mkdir(reduced_profile_path)
         # copy needed directories
         dirs = ["extensions", "storage"]
@@ -150,13 +147,20 @@ class PDFDownloader:
             shutil.copy(full_profile_path / f, reduced_profile_path)
 
         folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
-        self.logger.info(f"Generated temporary profile with size {folder_size} MB")
+        self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
         return reduced_profile_path
 
-def make_path_unique(path):
-    fname, ending = os.path.splitext(path)
-    fname += datetime.datetime.now().strftime("%d-%H%M%S")
-    return fname + ending
+def ensure_unique(path, fname):
+    fbase, ending = os.path.splitext(fname)
+
+    exists = os.path.exists(os.path.join(path, fname))
+    i = 1
+    while exists:
+        fname = fbase + f" -- fetch {i}" + ending
+        i += 1
+        exists = os.path.exists(os.path.join(path, fname))
+
+    return fname
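`ensure_unique` replaces the timestamp-based `make_path_unique`: it probes the target directory and bumps a counter until the name is free, so repeated fetches get predictable, sortable names instead of timestamp suffixes. A quick illustration with hypothetical file names:

```python
# Hypothetical directory state: "article.pdf" and "article -- fetch 1.pdf"
# already exist in /downloads, so the next fetch is renamed rather than
# overwriting either of them.
fname = ensure_unique("/downloads", "article.pdf")
print(fname)  # -> "article -- fetch 2.pdf"
```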

View File

@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
 from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
-from .compress.runner import shrink_pdf
 
 import time
 import logging
@@ -53,14 +53,3 @@ class UploadWorker(TemplateWorker):
         super()._handle_article(article_watcher, action)
         # article_watcher.upload_completed = True
-
-class CompressWorker(TemplateWorker):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _handle_article(self, article_watcher):
-        action = shrink_pdf
-        super()._handle_article(article_watcher, action)
-        # article_watcher.compression_completed = True
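`CompressWorker` followed the same template pattern as the remaining workers: the subclass picks an `action` and delegates to `TemplateWorker._handle_article`. The base class is not shown in this diff, so the following is only a sketch of the shape its subclasses imply, with all internals assumed:

```python
import logging
from queue import Queue
from threading import Thread

class TemplateWorker(Thread):
    """Assumed shape of the base class: a threaded queue whose subclasses
    supply the per-article action (fetch, download, upload, ...)."""
    def __init__(self) -> None:
        super().__init__(daemon=True)
        self.queue = Queue()
        self.logger = logging.getLogger(self.__class__.__name__)

    def _handle_article(self, article_watcher, action=None):
        if action is None:
            raise NotImplementedError("subclasses must supply an action")
        action(article_watcher)  # signature of the action is assumed

    def run(self) -> None:
        while True:
            watcher = self.queue.get()
            self._handle_article(watcher)  # subclass override picks the action
```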