single use media archiving made usable again

Remy Moll 2022-10-29 17:18:41 +02:00
parent e6bfe811d0
commit 104b99df7e
3 changed files with 45 additions and 44 deletions


@@ -2,66 +2,60 @@
 Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
 """
 import sys
-sys.path.append("../app/news_fetch")
+sys.path.append("../news_fetch")
 import runner
 
 import logging
 logger = logging.getLogger()
 
-import json
-from rich.console import Console
-from rich.table import Table
-console = Console()
 
-logger.info("Overwriting production values for single time media-fetch")
-runner.configuration.models.set_db(
-    runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
-)
-runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
+class DummyMessage:
+    """Required by the dispatcher"""
+    ts = 0
+    def __init__(self, url):
+        self.urls = [url]
 
 
 def fetch():
     dispatcher = runner.Dispatcher()
 
-    dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
-    dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
+    dispatcher.workers_in = [
+        {"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()},
+        {"UploadWorker": runner.UploadWorker()}
+    ]
+    print_worker = runner.PrintWorker("Finished processing", sent = True)
+    dispatcher.workers_out = [{"PrintWorker": print_worker}]
 
     dispatcher.start()
 
-    with open("media_urls.json", "r") as f:
-        url_list = json.loads(f.read())
+    with open("media_urls.txt", "r") as f:
+        url_list = [l.replace("\n", "") for l in f.readlines()]
+    with open("media_urls.txt", "w") as f:
+        f.write("") # empty the file once it is read so that it does not get processed again
 
-    logger.info(f"Found {len(url_list)} media urls")
-    for u in url_list:
-        msg_text = f"<{u}|dummy preview text>"
-        dispatcher.incoming_request(msg)
+    if url_list:
+        logger.info(f"Found {len(url_list)} media urls")
+        for u in url_list:
+            dispatcher.incoming_request(DummyMessage(u))
+    else:
+        logger.info(f"No additional media urls found. Running the pipeline with messages from db.")
+
+    print_worker.keep_alive()
 
 
 def show():
-    t = Table(
-        title = "ArticleDownloads",
-        row_styles = ["white", "bright_black"],
-    )
-    entries = ["title", "article_url", "archive_url", "authors"]
-    for e in entries:
-        t.add_column(e, justify = "right")
-    sel = runner.models.ArticleDownload.select()
-    for s in sel:
-        c = [getattr(s, e) for e in entries]#
-        c[-1] = str([a.author for a in c[-1]])
-        print(c)
-        t.add_row(*c)
-    console.print(t)
-
-# fetch()
-show()
+    for a in runner.models.ArticleDownload.select():
+        print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}")
+
+
+if __name__ == "__main__":
+    logger.info("Overwriting production values for single time media-fetch")
+    runner.configuration.models.set_db(
+        runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
+    )
+    runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
+
+    if len(sys.argv) == 1: # no additional arguments
+        fetch()
+    elif sys.argv[1] == "show":
+        show()
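For orientation, a minimal driving sketch of the reworked helper, assuming it lives in manual/ next to the new media_urls.txt and is run from the repository root. The script's own file name is not shown in this view, so fetch_media.py below is a placeholder, and the URLs are made up.

from pathlib import Path
import subprocess

# One URL per line; the script empties the file after reading it,
# so media_urls.txt behaves like a single-use queue.
Path("manual/media_urls.txt").write_text(
    "https://example.com/some-talk\n"        # hypothetical URLs
    "https://example.com/another-clip\n"
)

# No extra argument -> fetch(): each queued URL is wrapped in a DummyMessage
# and handed to the dispatcher; the process keeps running while the pipeline
# works, so start it without waiting for it to exit.
subprocess.Popen(["python3", "fetch_media.py"], cwd="manual")  # placeholder script name

# "show" -> show(): print URL, archive URL and file name of every stored article.
subprocess.run(["python3", "fetch_media.py", "show"], cwd="manual", check=True)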

manual/media_urls.txt (new file)


@@ -128,8 +128,14 @@ class Dispatcher(Thread):
 
 class PrintWorker:
+    def __init__(self, action, sent = False) -> None:
+        self.action = action
+        self.sent = sent
+
     def send(self, article):
-        print(f"Uploaded article {article}")
+        print(f"{self.action} article {article}")
+        if self.sent:
+            article.sent = True
+            article.save()
 
     def keep_alive(self): # keeps script running, because there is nothing else in the main thread
         while True: sleep(1)
@@ -144,11 +150,12 @@ if __name__ == "__main__":
         logger.info(f"Launching upload to archive for {len(articles)} articles.")
         dispatcher.workers_in = [{"UploadWorker": UploadWorker()}]
-        dispatcher.workers_out = [{"PrintWorker": PrintWorker()}]
+        print_worker = PrintWorker("Uploaded")
+        dispatcher.workers_out = [{"PrintWorker": print_worker}]
 
         dispatcher.start()
         for a in articles:
             dispatcher.incoming_request(article=a)
-        PrintWorker().keep_alive()
+        print_worker.keep_alive()
 
     else: # launch with full action
         try:
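As a sanity check on the PrintWorker changes above, a small sketch of the new contract, assuming PrintWorker from this module is in scope. The real argument is an ArticleDownload row; FakeArticle here is purely illustrative.

class FakeArticle:
    """Stand-in for an ArticleDownload row (illustrative only)."""
    sent = False
    def save(self):
        print("article saved")
    def __repr__(self):
        return "<FakeArticle>"

# action only controls the printed message
PrintWorker("Uploaded").send(FakeArticle())            # prints "Uploaded article <FakeArticle>"

# sent=True additionally flags the article and persists it
worker = PrintWorker("Finished processing", sent = True)
article = FakeArticle()
worker.send(article)                                    # prints, then sets article.sent = True and calls save()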