Compare commits
No commits in common. "191d0084516a787ed97a3eb773ce8c9ba1a0a014" and "6301a62de8de8683a7e3d397e73efb042b7e95b2" have entirely different histories.
191d008451
...
6301a62de8
@ -2,60 +2,66 @@
|
|||||||
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
|
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
|
||||||
"""
|
"""
|
||||||
import sys
|
import sys
|
||||||
sys.path.append("../news_fetch")
|
sys.path.append("../app/news_fetch")
|
||||||
import runner
|
import runner
|
||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
import json
|
||||||
|
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
console = Console()
|
||||||
|
|
||||||
class DummyMessage:
|
logger.info("Overwriting production values for single time media-fetch")
|
||||||
"""Required by the dispatcher"""
|
runner.configuration.models.set_db(
|
||||||
ts = 0
|
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
|
||||||
def __init__(self, url):
|
)
|
||||||
self.urls = [url]
|
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
|
||||||
|
|
||||||
|
|
||||||
def fetch():
|
def fetch():
|
||||||
dispatcher = runner.Dispatcher()
|
dispatcher = runner.Dispatcher()
|
||||||
|
|
||||||
dispatcher.workers_in = [
|
dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
|
||||||
{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()},
|
dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
|
||||||
{"UploadWorker": runner.UploadWorker()}
|
|
||||||
]
|
|
||||||
print_worker = runner.PrintWorker("Finished processing", sent = True)
|
|
||||||
dispatcher.workers_out = [{"PrintWorker": print_worker}]
|
|
||||||
|
|
||||||
dispatcher.start()
|
dispatcher.start()
|
||||||
|
|
||||||
|
with open("media_urls.json", "r") as f:
|
||||||
|
url_list = json.loads(f.read())
|
||||||
|
|
||||||
with open("media_urls.txt", "r") as f:
|
logger.info(f"Found {len(url_list)} media urls")
|
||||||
url_list = [l.replace("\n", "") for l in f.readlines()]
|
for u in url_list:
|
||||||
with open("media_urls.txt", "w") as f:
|
msg_text = f"<{u}|dummy preview text>"
|
||||||
f.write("") # empty the file once it is read so that it does not get processed again
|
dispatcher.incoming_request(msg)
|
||||||
|
|
||||||
if url_list:
|
|
||||||
logger.info(f"Found {len(url_list)} media urls")
|
|
||||||
for u in url_list:
|
|
||||||
dispatcher.incoming_request(DummyMessage(u))
|
|
||||||
else:
|
|
||||||
logger.info(f"No additional media urls found. Running the pipeline with messages from db.")
|
|
||||||
|
|
||||||
print_worker.keep_alive()
|
|
||||||
|
|
||||||
|
|
||||||
def show():
|
def show():
|
||||||
for a in runner.models.ArticleDownload.select():
|
|
||||||
print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
t = Table(
|
||||||
logger.info("Overwriting production values for single time media-fetch")
|
title = "ArticleDownloads",
|
||||||
runner.configuration.models.set_db(
|
row_styles = ["white", "bright_black"],
|
||||||
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
|
|
||||||
)
|
)
|
||||||
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
|
|
||||||
|
entries = ["title", "article_url", "archive_url", "authors"]
|
||||||
|
|
||||||
|
for e in entries:
|
||||||
|
t.add_column(e, justify = "right")
|
||||||
|
|
||||||
|
sel = runner.models.ArticleDownload.select()
|
||||||
|
|
||||||
|
for s in sel:
|
||||||
|
c = [getattr(s, e) for e in entries]#
|
||||||
|
c[-1] = str([a.author for a in c[-1]])
|
||||||
|
print(c)
|
||||||
|
t.add_row(*c)
|
||||||
|
|
||||||
|
|
||||||
|
console.print(t)
|
||||||
|
|
||||||
|
|
||||||
if len(sys.argv) == 1: # no additional arguments
|
|
||||||
fetch()
|
|
||||||
elif sys.argv[1] == "show":
|
# fetch()
|
||||||
show()
|
show()
|
@ -128,14 +128,8 @@ class Dispatcher(Thread):
|
|||||||
|
|
||||||
|
|
||||||
class PrintWorker:
|
class PrintWorker:
|
||||||
def __init__(self, action, sent = False) -> None:
|
|
||||||
self.action = action
|
|
||||||
self.sent = sent
|
|
||||||
def send(self, article):
|
def send(self, article):
|
||||||
print(f"{self.action} article {article}")
|
print(f"Uploaded article {article}")
|
||||||
if self.sent:
|
|
||||||
article.sent = True
|
|
||||||
article.save()
|
|
||||||
def keep_alive(self): # keeps script running, because there is nothing else in the main thread
|
def keep_alive(self): # keeps script running, because there is nothing else in the main thread
|
||||||
while True: sleep(1)
|
while True: sleep(1)
|
||||||
|
|
||||||
@ -150,12 +144,11 @@ if __name__ == "__main__":
|
|||||||
logger.info(f"Launching upload to archive for {len(articles)} articles.")
|
logger.info(f"Launching upload to archive for {len(articles)} articles.")
|
||||||
|
|
||||||
dispatcher.workers_in = [{"UploadWorker": UploadWorker()}]
|
dispatcher.workers_in = [{"UploadWorker": UploadWorker()}]
|
||||||
print_worker = PrintWorker("Uploaded")
|
dispatcher.workers_out = [{"PrintWorker": PrintWorker()}]
|
||||||
dispatcher.workers_out = [{"PrintWorker": print_worker}]
|
|
||||||
dispatcher.start()
|
dispatcher.start()
|
||||||
for a in articles:
|
for a in articles:
|
||||||
dispatcher.incoming_request(article=a)
|
dispatcher.incoming_request(article=a)
|
||||||
print_worker.keep_alive()
|
PrintWorker().keep_alive()
|
||||||
|
|
||||||
else: # launch with full action
|
else: # launch with full action
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user