coss_archiving/manual/gather_media_files.py


"""
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
"""
import json
import logging
import sys

from rich.console import Console
from rich.table import Table

sys.path.append("../app/news_fetch")
import runner  # noqa: E402 -- must come after the sys.path tweak

logger = logging.getLogger()
console = Console()
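# Both overrides below point the pipeline at the local ../.dev/ sandbox
# instead of the production database and download directory.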
logger.info("Overriding production values for a one-off media fetch")
runner.configuration.models.set_db(
    runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
def fetch():
    dispatcher = runner.Dispatcher()
    dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
    dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
    dispatcher.start()

    with open("media_urls.json", "r") as f:
        url_list = json.loads(f.read())

    logger.info(f"Found {len(url_list)} media URLs")
    for u in url_list:
        # Wrap each URL in the "<url|preview text>" message format the dispatcher expects
        msg = f"<{u}|dummy preview text>"
        dispatcher.incoming_request(msg)
def show():
    t = Table(
        title="ArticleDownloads",
        row_styles=["white", "bright_black"],
    )
    entries = ["title", "article_url", "archive_url", "authors"]
    for e in entries:
        t.add_column(e, justify="right")

    sel = runner.models.ArticleDownload.select()
    for s in sel:
        c = [getattr(s, e) for e in entries]
        # The "authors" field is a related set; flatten it to a readable string
        c[-1] = str([a.author for a in c[-1]])
        print(c)  # raw row, useful when the rendered table truncates values
        t.add_row(*c)

    console.print(t)
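# Run fetch() once to populate the local database, then show() to inspect it.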
# fetch()
show()