"""Runs the news_fetch pipeline against a manually curated list of urls and saves them locally."""

import json
import logging
import sys

# The app package only becomes importable once its directory is on sys.path,
# so this append must happen before `import runner`.
sys.path.append("../app/news_fetch")

import runner  # noqa: E402  (deliberately imported after the path tweak above)

from rich.console import Console
from rich.table import Table

logger = logging.getLogger()
console = Console()

# Point the pipeline at a local sqlite DB and download directory so this
# one-off run never touches production storage.
logger.info("Overwriting production values for single time media-fetch")
runner.configuration.models.set_db(
    runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"


def fetch():
    """Feed every url from media_urls.json through the fetch/download pipeline.

    Reads the curated url list from ./media_urls.json and hands each entry to
    the dispatcher wrapped in a "<url|dummy preview text>" string — presumably
    the chat-message format the dispatcher parses (TODO: confirm against
    runner.Dispatcher.incoming_request).
    """
    dispatcher = runner.Dispatcher()
    dispatcher.workers_in = [
        {
            "FetchWorker": runner.FetchWorker(),
            "DownloadWorker": runner.DownloadWorker(),
        }
    ]
    dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
    dispatcher.start()

    with open("media_urls.json", "r") as f:
        url_list = json.load(f)
    logger.info(f"Found {len(url_list)} media urls")

    for u in url_list:
        msg_text = f"<{u}|dummy preview text>"
        # BUG FIX: the original dispatched the undefined name `msg`, raising
        # NameError on the first iteration; `msg_text` is what was just built.
        dispatcher.incoming_request(msg_text)


def show():
    """Render every ArticleDownload row from the local DB as a rich table."""
    t = Table(
        title="ArticleDownloads",
        row_styles=["white", "bright_black"],
    )
    entries = ["title", "article_url", "archive_url", "authors"]
    for e in entries:
        t.add_column(e, justify="right")

    for s in runner.models.ArticleDownload.select():
        c = [getattr(s, e) for e in entries]
        print(c)
        # BUG FIX: rich's Table.add_row only accepts strings/renderables, so
        # coerce each cell (notably the `authors` relation, which the old
        # commented-out `c[-1] = str([a.author for a in c[-1]])` line tried to
        # handle) to str. str() is a no-op for columns that are already text.
        t.add_row(*(str(v) for v in c))
    console.print(t)


# fetch()
show()