coss_archiving/manual/gather_media_files.py


"""
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
"""
import json
import logging
import sys

from rich.console import Console
from rich.table import Table

sys.path.append("../app/news_fetch")
import runner  # noqa: E402 -- must come after the sys.path tweak

logger = logging.getLogger()
console = Console()
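# Both overrides below point the pipeline at the local ../.dev/ sandbox
# instead of the production database and download directory.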
logger.info("Overriding production values for a one-off media fetch")
runner.configuration.models.set_db(
    runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
def fetch():
    dispatcher = runner.Dispatcher()
    dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
    dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
    dispatcher.start()

    with open("media_urls.json", "r") as f:
        url_list = json.loads(f.read())

    logger.info(f"Found {len(url_list)} media URLs")
    for u in url_list:
        # Wrap each URL in the "<url|preview text>" message format the dispatcher expects
        msg = f"<{u}|dummy preview text>"
        dispatcher.incoming_request(msg)
def show():
    t = Table(
        title="ArticleDownloads",
        row_styles=["white", "bright_black"],
    )
    entries = ["title", "article_url", "archive_url", "authors"]
    for e in entries:
        t.add_column(e, justify="right")

    sel = runner.models.ArticleDownload.select()
    for s in sel:
        c = [getattr(s, e) for e in entries]
        # The "authors" field is a related set; flatten it to a readable string
        c[-1] = str([a.author for a in c[-1]])
        print(c)  # raw row, useful when the rendered table truncates values
        t.add_row(*c)

    console.print(t)
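# Run fetch() once to populate the local database, then show() to inspect it.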
# fetch()
show()