coss_archiving/manual/gather_media_files.py

"""
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
"""
import sys
sys.path.append("../news_fetch")
import runner
import os
import logging
logger = logging.getLogger()


class DummyMessage:
    """Required by the dispatcher"""
    ts = 0
    def __init__(self, url):
        self.urls = [url]


def fetch():
    dispatcher = runner.Dispatcher()

    dispatcher.workers_in = [
        {"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()},
        {"UploadWorker": runner.UploadWorker()}
    ]
    print_worker = runner.PrintWorker("Finished processing", sent = True)
    dispatcher.workers_out = [{"PrintWorker": print_worker}]

    dispatcher.start()


    with open("media_urls.txt", "r") as f:
        url_list = [l.replace("\n", "") for l in f.readlines()]
    with open("media_urls.txt", "w") as f:
        f.write("") # empty the file once it is read so that it does not get processed again

    if url_list:
        logger.info(f"Found {len(url_list)} media urls")
        for u in url_list:
            dispatcher.incoming_request(DummyMessage(u))
    else:
        logger.info(f"No additional media urls found. Running the pipeline with messages from db.")

    print_worker.keep_alive()


def show():
    for a in runner.models.ArticleDownload.select():
        print(f"""
        URL: {a.article_url}
        ARCHIVE_URL: {a.archive_url}
        ARTICLE_SOURCE: {a.source_name}
        FILE_NAME: {a.file_name}
        """)


if __name__ == "__main__":
    logger.info("Overwriting production values for single time media-fetch")
    if not os.path.exists("../.dev/"):
        os.mkdir("../.dev/")
    runner.configuration.models.set_db(
        runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
    )
    runner.configuration.main_config["downloads"]["local_storage_path"] = "../.dev/"


    if len(sys.argv) == 1: # no additional arguments
        fetch()
    elif sys.argv[1] == "show":
        show()