"""Runs the news_fetch pipeline against a manually curated list of urls and saves them locally.

Usage (run from this script's directory, since all paths are relative):
    python <script>         read media_urls.txt and run the pipeline
    python <script> show    print the downloads recorded in the local database
"""
import logging
import os
import sys

# The pipeline lives in a sibling directory; it must be on the path before importing it.
sys.path.append("../news_fetch")
import runner

logger = logging.getLogger()

# Plain-text file holding one url per line; it is emptied once it has been read.
MEDIA_URLS_FILE = "media_urls.txt"


class DummyMessage:
    """Required by the dispatcher.

    Minimal stand-in for an incoming message: it only carries a single url.
    """

    # NOTE(review): presumably a timestamp field the dispatcher reads - confirm.
    ts = 0

    def __init__(self, url):
        self.urls = [url]


def _read_and_clear_urls(path):
    """Return the non-empty, whitespace-stripped lines of *path*, then empty the file.

    Blank lines are skipped so that they are not dispatched as empty urls.
    A missing file is treated as "no urls" and is not created.
    """
    try:
        with open(path, "r") as f:
            urls = [stripped for stripped in (line.strip() for line in f) if stripped]
    except FileNotFoundError:
        logger.info("%s does not exist, treating it as empty.", path)
        return []

    # Empty the file once it is read so that it does not get processed again.
    with open(path, "w") as f:
        f.write("")
    return urls


def fetch():
    """Start the dispatcher and feed it every url listed in ``media_urls.txt``.

    When the file yields no urls, nothing is dispatched from here and the
    pipeline runs with whatever the dispatcher picks up on its own.
    """
    dispatcher = runner.Dispatcher()

    dispatcher.workers_in = [
        {"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()},
        {"UploadWorker": runner.UploadWorker()},
    ]
    print_worker = runner.PrintWorker("Finished processing", sent=True)
    dispatcher.workers_out = [{"PrintWorker": print_worker}]

    dispatcher.start()

    url_list = _read_and_clear_urls(MEDIA_URLS_FILE)
    if url_list:
        logger.info("Found %d media urls", len(url_list))
        for url in url_list:
            dispatcher.incoming_request(DummyMessage(url))
    else:
        logger.info("No additional media urls found. Running the pipeline with messages from db.")

    # NOTE(review): presumably blocks so the process stays up while the workers finish - confirm.
    print_worker.keep_alive()


def show():
    """Print url, archive url, source name and file name of every ArticleDownload row."""
    for a in runner.models.ArticleDownload.select():
        print(f"""
        URL: {a.article_url}
        ARCHIVE_URL: {a.archive_url}
        ARTICLE_SOURCE: {a.source_name}
        FILE_NAME: {a.file_name}
        """)


if __name__ == "__main__":
    logger.info("Overwriting production values for single time media-fetch")
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs("../.dev/", exist_ok=True)
    # Point the pipeline at a local sqlite database and a local download folder.
    runner.configuration.models.set_db(
        runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
    )
    runner.configuration.main_config["downloads"]["local_storage_path"] = "../.dev/"

    if len(sys.argv) == 1:  # no additional arguments
        fetch()
    elif sys.argv[1] == "show":
        show()