From 5eab6c55dadb5b92181891c1b63b0f9cb3fa4f7d Mon Sep 17 00:00:00 2001
From: Remy Moll
Date: Thu, 6 Oct 2022 15:55:30 +0200
Subject: [PATCH] bug fixes

---
 .gitignore                                  |  2 -
 README.md                                   |  2 +-
 misc/gather_media_files.py                  |  1 -
 misc/sample_config/news_fetch.config.ini    |  2 +-
 news_fetch/Dockerfile                       |  4 -
 news_fetch/runner.py                        |  6 +-
 news_fetch/utils_mail/runner.py             | 12 +--
 news_fetch/utils_slack/runner.py            | 29 +-------
 news_fetch/utils_storage/models.py          | 81 +++++++++------------
 news_fetch/utils_worker/compress/runner.py  | 47 ------------
 news_fetch/utils_worker/download/browser.py | 22 +++---
 news_fetch/utils_worker/workers.py          | 13 +---
 12 files changed, 62 insertions(+), 159 deletions(-)
 delete mode 100644 news_fetch/utils_worker/compress/runner.py

diff --git a/.gitignore b/.gitignore
index fcf509c..6be81a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
 .dev/
-.vscode/
 *.pyc
 *.log
 __pycache__/
@@ -23,7 +22,6 @@ dist-ssr
 
 # Editor directories and files
 .vscode/*
-!.vscode/extensions.json
 .idea
 .DS_Store
 *.suo

diff --git a/README.md b/README.md
index a0cb871..fdbba49 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ Two additional 'modes' are `build` and `down`. Build rebuilds the container, whi
 
 In essence a command is simply a service from docker-compose, which is run in an interactive environment. As such all services defined in `docker-compose.yaml` can be called as commands. Only two of them will be of real use:
 
-`news_fetch` does the majority of the actions mentionned above. By default, that is without any options, it runs a metadata-fetch, download, compression, and upload to archive.org. The upload is usually the slowest which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload` which only starts the upload for the concerned articles, as a catch-up if you will.
+`news_fetch` does the majority of the actions mentioned above. By default, that is, without any options, it runs a metadata-fetch, download, and upload to archive.org. The upload is usually the slowest step, which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload`, which only starts the upload for the articles concerned, as a catch-up if you will.
 
 Example usage:
diff --git a/misc/gather_media_files.py b/misc/gather_media_files.py
index f950243..a9a2c68 100644
--- a/misc/gather_media_files.py
+++ b/misc/gather_media_files.py
@@ -25,7 +25,6 @@ def fetch():
         "worker_download" : runner.DownloadWorker(),
         "worker_fetch" : runner.FetchWorker(),
         "worker_upload" : runner.UploadWorker(),
-        "worker_compress" : runner.CompressWorker(),
     }
 
     coordinator.add_workers(**kwargs)

diff --git a/misc/sample_config/news_fetch.config.ini b/misc/sample_config/news_fetch.config.ini
index e16f36d..a220818 100644
--- a/misc/sample_config/news_fetch.config.ini
+++ b/misc/sample_config/news_fetch.config.ini
@@ -28,4 +28,4 @@ default_download_path: /app/containerdata/tmp
 remote_storage_path: /helbing_support/Files RM/Archiving
 browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
 # please keep this exact name
-browser_print_delay: 5
+browser_print_delay: 3

diff --git a/news_fetch/Dockerfile b/news_fetch/Dockerfile
index c9e6da6..2f4bb4e 100644
--- a/news_fetch/Dockerfile
+++ b/news_fetch/Dockerfile
@@ -2,10 +2,6 @@ FROM python:latest
 
 ENV TZ Europe/Zurich
 
-RUN apt-get update && apt-get install -y ghostscript
-# for compression of pdfs
-
-
 RUN mkdir -p /app/auto_news
 
 COPY requirements.txt /app/requirements.txt

diff --git a/news_fetch/runner.py b/news_fetch/runner.py
index 7e17cb4..92a9fdf 100644
--- a/news_fetch/runner.py
+++ b/news_fetch/runner.py
@@ -11,7 +11,7 @@ from collections import OrderedDict
 
 from utils_mail import runner as MailRunner
 from utils_slack import runner as SlackRunner
-from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
+from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker
 
 
 class ArticleWatcher:
@@ -160,10 +160,10 @@ if __name__ == "__main__":
     try:
         slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
         # All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
-        # fetch -> download -> compress -> complete
+        # fetch -> download -> complete
         # This is reflected in the following list of workers:
         workers_in = [
-            OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "CompressWorker": CompressWorker(), "NotifyRunner": "out"}),
+            OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
             OrderedDict({"UploadWorker": UploadWorker()})
         ]
         # The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict

diff --git a/news_fetch/utils_mail/runner.py b/news_fetch/utils_mail/runner.py
index 25be18e..4ada4f3 100644
--- a/news_fetch/utils_mail/runner.py
+++ b/news_fetch/utils_mail/runner.py
@@ -15,14 +15,14 @@ def send(article_model):
     mail['From'] = config["sender"]
     mail['To'] = config["recipient"]
 
-    msgs = article_model.mail_info # this is html
-    msg = [m["reply_text"] for m in msgs]
-    msg = "\n".join(msg)
+    mail_info = article_model.mail_info() # html body and a list of file paths
+    if mail_info is None: # an error occurred upstream, there is nothing to send
+        return False
+    msg, files = mail_info
 
     content = MIMEText(msg, "html")
     mail.attach(content)
 
-    files = [m["file_path"] for m in msgs if m["file_path"]]
     for path in files:
         with open(path, 'rb') as file:
             part = MIMEApplication(file.read(), "pdf")
@@ -31,7 +31,12 @@ def send(article_model):
     mail.attach(part)
 
     try:
-        smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+        try:
+            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+        except ConnectionRefusedError:
+            logger.error("Server refused connection. Is this an error on your side?")
+            return False
+
         smtp.starttls()
         smtp.login(config["uname"], config["password"])
         smtp.sendmail(config["sender"], config["recipient"], mail.as_string())

diff --git a/news_fetch/utils_slack/runner.py b/news_fetch/utils_slack/runner.py
index c838f07..9b41a4b 100644
--- a/news_fetch/utils_slack/runner.py
+++ b/news_fetch/utils_slack/runner.py
@@ -154,37 +154,10 @@ class BotApp(App):
 
 
     def respond_channel_message(self, article, say=None):
-        if say is None:
-            say = self.say_substitute
-        answers = article.slack_info
         if article.slack_ts == 0:
             self.logger.error(f"{article} has no slack_ts")
         else:
-            self.logger.info("Skipping slack reply because it is broken")
-            for a in []:
-            # for a in answers:
-                if a["file_path"]:
-                    try:
-                        self.client.files_upload(
-                            channels = config["archive_id"],
-                            initial_comment = f"{a['reply_text']}",
-                            file = a["file_path"],
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = True
-                    except SlackApiError as e: # upload resulted in an error
-                        say(
-                            "File {} could not be uploaded.".format(a),
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = False
-                        self.logger.error(f"File upload failed: {e}")
-                else: # anticipated that there is no file!
-                    say(
-                        f"{a['reply_text']}",
-                        thread_ts = article.slack_ts_full
-                    )
-                    # status = True
+            self.logger.info("Skipping slack reply.")
 
 
     def startup_status(self):
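Note on the send() hunk above: smtplib.SMTP(host, port) already opens the TCP connection inside the constructor, so an unreachable mail server raises ConnectionRefusedError before starttls() or login() ever run; that is why the guard wraps only the constructor. A minimal sketch of the same pattern in isolation (the function name and the timeout are illustrative, not taken from this repo):

    import logging
    import smtplib

    logger = logging.getLogger(__name__)

    def open_smtp(server: str, port: int):
        # Returns a connected SMTP client, or None if the server is unreachable.
        try:
            return smtplib.SMTP(server, port, timeout=10)
        except ConnectionRefusedError:
            logger.error("Server refused connection. Is this an error on your side?")
            return None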
diff --git a/news_fetch/utils_storage/models.py b/news_fetch/utils_storage/models.py
index bfa05aa..f7ae8b9 100644
--- a/news_fetch/utils_storage/models.py
+++ b/news_fetch/utils_storage/models.py
@@ -10,6 +10,8 @@ import datetime
 from . import helpers
 config = configuration.main_config["DOWNLOADS"]
 slack_config = configuration.main_config["SLACK"]
+FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
+
 
 # set the nature of the db at runtime
 download_db = DatabaseProxy()
@@ -94,50 +96,34 @@ class ArticleDownload(DownloadBaseModel):
             desc = f"{self.article_url}"
         return f"ART [{desc}]"
 
-    @property
-    def slack_info(self):
-        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
-        content = "\n>" + "\n>".join(self.summary.split("\n"))
-        file_status, msg = self.file_status()
-        if not file_status:
-            return [msg]
-
-        # everything alright: generate real content
-        # first the base file
-        if self.file_name[-4:] == ".pdf":
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}",
-                "file_path" : self.save_path + self.file_name
-            }]
-        else: # don't upload if the file is too big!
-            location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
-                "file_path" : None
-            }]
+    def mail_info(self):
+        summary = "\n> " + "\n> ".join(self.summary.split("\n")) # display the summary as a blockquote
+        answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
+        answer_files = []
+
+        status = self.file_status
+        if status == 1: # no file name -> the download failed entirely
+            return None # there has been an error, do not send any message
+        elif status == 2: # no file found at the specified location
+            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
+        elif status == 3: # file found but not a pdf, or too big
+            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
+            answer_text += f"*{self.title}*\n{summary}\n{location}"
+        else: # everything nominal
+            answer_text += f"*{self.title}*\n{summary}"
+            answer_files.append(self.save_path + self.file_name)
 
         # then the related files
-        rel_text = ""
-        for r in self.related:
-            fname = r.related_file_name
-            lentry = "\n• `{}` ".format(self.fname_nas(fname))
-            if fname[-4:] == ".pdf": # this is a manageable file, directly upload
-                f_ret = self.save_path + fname
-                answer.append({"reply_text":"", "file_path" : f_ret})
-            else: # not pdf <=> too large. Don't upload but mention its existence
-                lentry += "(not uploaded to slack, but the file will be on the NAS)"
-
-            rel_text += lentry
-
-        if rel_text:
-            rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
+        if self.related:
+            rel_text = "Related files on NAS:"
+            for r in self.related:
+                fname = r.related_file_name
+                rel_text += f"\n• `{self.fname_nas(fname)}` "
+
+            answer_text += "\n\n" + rel_text
 
-        return answer
-
-    @property
-    def mail_info(self):
-        base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
-        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
+        return markdown.markdown(answer_text), answer_files
 
 
     def set_authors(self, authors):
@@ -157,18 +143,22 @@ class ArticleDownload(DownloadBaseModel):
                 article = self,
                 related_file_name = r
             )
-
+
+    @property
     def file_status(self):
+        """0 = file exists, 1 = no file name, 2 = file does not exist, 3 = file is not a pdf or is too large"""
         if not self.file_name:
             logger.error(f"Article {self} has no filename!")
-            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
-
+            return 1
         file_path_abs = self.save_path + self.file_name
         if not os.path.exists(file_path_abs):
             logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
+            return 2
+        if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
+            logger.warning(f"Article {self} has a file that is not a pdf or exceeds the size limit.")
+            return 3
 
-        return True, {}
+        return 0
 
 
 class ArticleAuthor(DownloadBaseModel):
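The new integer status codes replace the old (bool, dict) return and are consumed by mail_info() above: 1 aborts the mail entirely, 2 and 3 produce a text-only notice, 0 attaches the pdf. For reference, a standalone re-implementation of the same decision ladder, runnable outside the ORM (the function signature is ours; the threshold and the checks mirror the diff):

    import os

    FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB, same constant as in models.py

    def file_status(save_path: str, file_name: str) -> int:
        # 0 = file exists, 1 = no file name, 2 = file does not exist,
        # 3 = file is not a pdf or is too large
        if not file_name:
            return 1
        file_path_abs = save_path + file_name
        if not os.path.exists(file_path_abs):
            return 2
        if os.path.splitext(file_path_abs)[1] != ".pdf" \
                or os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD:
            return 3
        return 0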
diff --git a/news_fetch/utils_worker/compress/runner.py b/news_fetch/utils_worker/compress/runner.py
deleted file mode 100644
index fe7cfb6..0000000
--- a/news_fetch/utils_worker/compress/runner.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import os
-import subprocess
-from pathlib import Path
-
-import logging
-logger = logging.getLogger(__name__)
-import configuration
-config = configuration.main_config["DOWNLOADS"]
-
-shrink_sizes = []
-
-def shrink_pdf(article):
-    article_loc = Path(article.save_path) / article.file_name
-    initial_size = article_loc.stat().st_size
-    compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
-
-    if article_loc.suffix != "pdf":
-        return article # it probably was a youtube video
-
-    c = subprocess.run(
-        [
-            "gs",
-            "-sDEVICE=pdfwrite",
-            "-dPDFSETTINGS=/screen",
-            "-dNOPAUSE",
-            "-dBATCH",
-            f"-sOutputFile={compressed_tmp}",
-            f"{article_loc}"
-        ],
-        stdout=subprocess.PIPE, stderr=subprocess.PIPE
-    )
-
-    if c.returncode == 0:
-        try:
-            os.replace(compressed_tmp, article_loc)
-        except OSError as e:
-            logger.error(f"Compression ran but I could not copy back the file {e}")
-
-        final_size = article_loc.stat().st_size
-        shrink_sizes.append(initial_size - final_size)
-        logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
-
-
-    else:
-        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
-
-    return article

diff --git a/news_fetch/utils_worker/download/browser.py b/news_fetch/utils_worker/download/browser.py
index 284085f..092f3d7 100644
--- a/news_fetch/utils_worker/download/browser.py
+++ b/news_fetch/utils_worker/download/browser.py
@@ -85,10 +85,8 @@ class PDFDownloader:
         # will be propagated to the saved file (dst) as well
 
         fname = article_object.fname_template
+        fname = ensure_unique(article_object.save_path, fname)
         dst = os.path.join(article_object.save_path, fname)
-        if os.path.exists(dst):
-            fname = make_path_unique(fname)
-            dst = os.path.join(article_object.save_path, fname)
 
         if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
@@ -137,7 +135,6 @@ class PDFDownloader:
 
     def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
-        print(reduced_profile_path, full_profile_path)
         os.mkdir(reduced_profile_path)
         # copy needed directories
         dirs = ["extensions", "storage"]
@@ -150,13 +147,20 @@ class PDFDownloader:
             shutil.copy(full_profile_path / f, reduced_profile_path)
 
         folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
-        self.logger.info(f"Generated temporary profile with size {folder_size} MB")
+        self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
         return reduced_profile_path
 
 
-def make_path_unique(path):
-    fname, ending = os.path.splitext(path)
-    fname += datetime.datetime.now().strftime("%d-%H%M%S")
-    return fname + ending
\ No newline at end of file
+def ensure_unique(path, fname):
+    fbase, ending = os.path.splitext(fname)
+
+    exists = os.path.exists(os.path.join(path, fname))
+    i = 1
+    while exists:
+        fname = fbase + f" -- fetch {i}" + ending
+        i += 1
+        exists = os.path.exists(os.path.join(path, fname))
+
+    return fname

diff --git a/news_fetch/utils_worker/workers.py b/news_fetch/utils_worker/workers.py
index e445ce3..812f820 100644
--- a/news_fetch/utils_worker/workers.py
+++ b/news_fetch/utils_worker/workers.py
@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
 from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
-from .compress.runner import shrink_pdf
+
 
 import time
 import logging
@@ -53,14 +53,3 @@ class UploadWorker(TemplateWorker):
 
         super()._handle_article(article_watcher, action)
         # article_watcher.upload_completed = True
-
-
-
-class CompressWorker(TemplateWorker):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _handle_article(self, article_watcher):
-        action = shrink_pdf
-        super()._handle_article(article_watcher, action)
-        # article_watcher.compression_completed = True
\ No newline at end of file
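Behaviour of the ensure_unique helper added in browser.py, shown against a throwaway directory (the helper body is copied from the diff; the demo around it is illustrative):

    import os
    import tempfile

    def ensure_unique(path, fname):
        fbase, ending = os.path.splitext(fname)
        exists = os.path.exists(os.path.join(path, fname))
        i = 1
        while exists:
            fname = fbase + f" -- fetch {i}" + ending
            i += 1
            exists = os.path.exists(os.path.join(path, fname))
        return fname

    with tempfile.TemporaryDirectory() as d:
        print(ensure_unique(d, "article.pdf"))  # article.pdf (no clash yet)
        open(os.path.join(d, "article.pdf"), "w").close()
        print(ensure_unique(d, "article.pdf"))  # article -- fetch 1.pdf
        open(os.path.join(d, "article -- fetch 1.pdf"), "w").close()
        print(ensure_unique(d, "article.pdf"))  # article -- fetch 2.pdf

Unlike the old make_path_unique, which appended a timestamp with one-second resolution and could therefore still collide, the counter guarantees a free name and keeps repeated fetches of the same article recognisable.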