From 104b99df7e898c5c451c0d283bc5160d4c659213 Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Sat, 29 Oct 2022 17:18:41 +0200 Subject: [PATCH] single use media archiving made usable again --- manual/gather_media_files.py | 76 +++++++++++++++++------------------- manual/media_urls.txt | 0 news_fetch/runner.py | 13 ++++-- 3 files changed, 45 insertions(+), 44 deletions(-) create mode 100644 manual/media_urls.txt diff --git a/manual/gather_media_files.py b/manual/gather_media_files.py index e1d9e0b..49514a3 100644 --- a/manual/gather_media_files.py +++ b/manual/gather_media_files.py @@ -2,66 +2,60 @@ Runs the news_fetch pipeline against a manually curated list of urls and saves them locally """ import sys -sys.path.append("../app/news_fetch") +sys.path.append("../news_fetch") import runner import logging logger = logging.getLogger() -import json -from rich.console import Console -from rich.table import Table -console = Console() -logger.info("Overwriting production values for single time media-fetch") -runner.configuration.models.set_db( - runner.configuration.SqliteDatabase("../.dev/media_downloads.db") -) -runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/" +class DummyMessage: + """Required by the dispatcher""" + ts = 0 + def __init__(self, url): + self.urls = [url] def fetch(): dispatcher = runner.Dispatcher() - dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}] - dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}] + dispatcher.workers_in = [ + {"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}, + {"UploadWorker": runner.UploadWorker()} + ] + print_worker = runner.PrintWorker("Finished processing", sent = True) + dispatcher.workers_out = [{"PrintWorker": print_worker}] dispatcher.start() - with open("media_urls.json", "r") as f: - url_list = json.loads(f.read()) - logger.info(f"Found {len(url_list)} media urls") - for u in url_list: - msg_text = f"<{u}|dummy preview text>" - dispatcher.incoming_request(msg) + with open("media_urls.txt", "r") as f: + url_list = [l.replace("\n", "") for l in f.readlines()] + with open("media_urls.txt", "w") as f: + f.write("") # empty the file once it is read so that it does not get processed again + if url_list: + logger.info(f"Found {len(url_list)} media urls") + for u in url_list: + dispatcher.incoming_request(DummyMessage(u)) + else: + logger.info(f"No additional media urls found. Running the pipeline with messages from db.") + + print_worker.keep_alive() def show(): + for a in runner.models.ArticleDownload.select(): + print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}") - t = Table( - title = "ArticleDownloads", - row_styles = ["white", "bright_black"], +if __name__ == "__main__": + logger.info("Overwriting production values for single time media-fetch") + runner.configuration.models.set_db( + runner.configuration.SqliteDatabase("../.dev/media_downloads.db") ) - - entries = ["title", "article_url", "archive_url", "authors"] - - for e in entries: - t.add_column(e, justify = "right") - - sel = runner.models.ArticleDownload.select() - - for s in sel: - c = [getattr(s, e) for e in entries]# - c[-1] = str([a.author for a in c[-1]]) - print(c) - t.add_row(*c) - - - console.print(t) + runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/" - - -# fetch() -show() \ No newline at end of file + if len(sys.argv) == 1: # no additional arguments + fetch() + elif sys.argv[1] == "show": + show() \ No newline at end of file diff --git a/manual/media_urls.txt b/manual/media_urls.txt new file mode 100644 index 0000000..e69de29 diff --git a/news_fetch/runner.py b/news_fetch/runner.py index 1185a52..ec531cc 100644 --- a/news_fetch/runner.py +++ b/news_fetch/runner.py @@ -128,8 +128,14 @@ class Dispatcher(Thread): class PrintWorker: + def __init__(self, action, sent = False) -> None: + self.action = action + self.sent = sent def send(self, article): - print(f"Uploaded article {article}") + print(f"{self.action} article {article}") + if self.sent: + article.sent = True + article.save() def keep_alive(self): # keeps script running, because there is nothing else in the main thread while True: sleep(1) @@ -144,11 +150,12 @@ if __name__ == "__main__": logger.info(f"Launching upload to archive for {len(articles)} articles.") dispatcher.workers_in = [{"UploadWorker": UploadWorker()}] - dispatcher.workers_out = [{"PrintWorker": PrintWorker()}] + print_worker = PrintWorker("Uploaded") + dispatcher.workers_out = [{"PrintWorker": print_worker}] dispatcher.start() for a in articles: dispatcher.incoming_request(article=a) - PrintWorker().keep_alive() + print_worker.keep_alive() else: # launch with full action try: