70 lines
2.0 KiB
Python
70 lines
2.0 KiB
Python
"""
|
|
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
|
|
"""
|
|
import logging
import os
import sys

# The runner module is imported from a sibling directory, so that directory
# has to be on sys.path before the import below. The path is relative to the
# current working directory.
sys.path.append("../news_fetch")

import runner  # noqa: E402  (must come after the sys.path tweak above)

# getLogger() without a name returns the root logger.
logger = logging.getLogger()
|
|
|
|
|
|
class DummyMessage:
    """Minimal message stand-in, required by the dispatcher.

    Each instance wraps exactly one URL, exposed as the one-element list
    ``urls``.
    """

    # Constant class-level value shared by every instance
    # (presumably a timestamp placeholder -- confirm against the dispatcher).
    ts: int = 0

    def __init__(self, url) -> None:
        """Store *url* as the single entry of ``self.urls``."""
        self.urls = [url]
|
|
|
|
|
|
def fetch():
    """Run the pipeline for the URLs queued in ``media_urls.txt``.

    Builds a ``runner.Dispatcher``, assigns its inbound and outbound
    workers, starts it, and submits one ``DummyMessage`` per URL found in
    ``media_urls.txt`` (one URL per line, path relative to the current
    working directory). The file is emptied as soon as it has been read so
    the same URLs are not processed again on a later run.

    Blank or whitespace-only lines are skipped, and a missing
    ``media_urls.txt`` is treated like an empty one.
    """
    dispatcher = runner.Dispatcher()

    # Inbound workers are supplied as two groups: fetch + download first,
    # upload second (presumably handled in that order -- confirm in runner).
    dispatcher.workers_in = [
        {"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()},
        {"UploadWorker": runner.UploadWorker()},
    ]
    print_worker = runner.PrintWorker("Finished processing", sent=True)
    dispatcher.workers_out = [{"PrintWorker": print_worker}]

    dispatcher.start()

    url_list = _read_and_clear_urls("media_urls.txt")

    if url_list:
        logger.info("Found %d media urls", len(url_list))
        for url in url_list:
            dispatcher.incoming_request(DummyMessage(url))
    else:
        logger.info("No additional media urls found. Running the pipeline with messages from db.")

    # NOTE(review): presumably blocks until the queued work has been
    # reported by the print worker -- confirm against runner.PrintWorker.
    print_worker.keep_alive()


def _read_and_clear_urls(path):
    """Return the non-blank, stripped lines of *path*, then empty the file.

    Returns an empty list (and does not create the file) when *path* does
    not exist.
    """
    try:
        with open(path, "r") as f:
            stripped = [line.strip() for line in f]
    except FileNotFoundError:
        return []

    # Opening in "w" mode truncates the file, so the URLs that were just
    # read are not picked up again by the next run.
    with open(path, "w"):
        pass

    # Drop empty entries so blank lines never become empty-URL requests.
    return [url for url in stripped if url]
|
|
|
|
|
|
def show():
    """Print a short summary of every ``ArticleDownload`` record.

    For each row returned by ``runner.models.ArticleDownload.select()`` the
    article URL, archive URL, source name and file name are printed, one
    field per line, with a blank line before and after each record.
    """
    for article in runner.models.ArticleDownload.select():
        summary_lines = [
            "",
            f"URL: {article.article_url}",
            f"ARCHIVE_URL: {article.archive_url}",
            f"ARTICLE_SOURCE: {article.source_name}",
            f"FILE_NAME: {article.file_name}",
            "",
        ]
        print("\n".join(summary_lines))
|
|
|
|
|
|
if __name__ == "__main__":
    # Point the database and the download storage at a local ../.dev/
    # directory instead of the configured production values.
    logger.info("Overwriting production values for single time media-fetch")

    # exist_ok=True replaces the separate existence check, so there is no
    # window between checking for the directory and creating it.
    os.makedirs("../.dev/", exist_ok=True)

    runner.configuration.models.set_db(
        runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
    )
    runner.configuration.main_config["downloads"]["local_storage_path"] = "../.dev/"

    if len(sys.argv) == 1:  # no additional arguments
        fetch()
    elif sys.argv[1] == "show":
        show()
    else:
        # Fail loudly on an unrecognised argument instead of doing nothing.
        sys.exit(
            f"Unknown command {sys.argv[1]!r}: "
            "run without arguments to fetch, or with 'show' to list downloads."
        )