Compare commits

2 commits: 6301a62de8 ... 191d008451

| Author | SHA1 | Date |
|---|---|---|
|  | 191d008451 |  |
|  | 104b99df7e |  |
```diff
@@ -2,16 +2,52 @@
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
"""
import sys
sys.path.append("../app/news_fetch")
sys.path.append("../news_fetch")
import runner
import logging
logger = logging.getLogger()
import json

from rich.console import Console
from rich.table import Table
console = Console()

class DummyMessage:
    """Required by the dispatcher"""
    ts = 0
    def __init__(self, url):
        self.urls = [url]


def fetch():
    dispatcher = runner.Dispatcher()

    dispatcher.workers_in = [
        {"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()},
        {"UploadWorker": runner.UploadWorker()}
    ]
    print_worker = runner.PrintWorker("Finished processing", sent = True)
    dispatcher.workers_out = [{"PrintWorker": print_worker}]

    dispatcher.start()


    with open("media_urls.txt", "r") as f:
        url_list = [l.replace("\n", "") for l in f.readlines()]
    with open("media_urls.txt", "w") as f:
        f.write("") # empty the file once it is read so that it does not get processed again

    if url_list:
        logger.info(f"Found {len(url_list)} media urls")
        for u in url_list:
            dispatcher.incoming_request(DummyMessage(u))
    else:
        logger.info(f"No additional media urls found. Running the pipeline with messages from db.")

    print_worker.keep_alive()


def show():
    for a in runner.models.ArticleDownload.select():
        print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}")

if __name__ == "__main__":
    logger.info("Overwriting production values for single time media-fetch")
    runner.configuration.models.set_db(
        runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
@@ -19,49 +55,7 @@ runner.configuration.models.set_db(
    runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"


def fetch():
    dispatcher = runner.Dispatcher()

    dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
    dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]

    dispatcher.start()

    with open("media_urls.json", "r") as f:
        url_list = json.loads(f.read())

    logger.info(f"Found {len(url_list)} media urls")
    for u in url_list:
        msg_text = f"<{u}|dummy preview text>"
        dispatcher.incoming_request(msg)



def show():

    t = Table(
        title = "ArticleDownloads",
        row_styles = ["white", "bright_black"],
    )

    entries = ["title", "article_url", "archive_url", "authors"]

    for e in entries:
        t.add_column(e, justify = "right")

    sel = runner.models.ArticleDownload.select()

    for s in sel:
        c = [getattr(s, e) for e in entries]#
        c[-1] = str([a.author for a in c[-1]])
        print(c)
        t.add_row(*c)


    console.print(t)




# fetch()
    if len(sys.argv) == 1: # no additional arguments
        fetch()
    elif sys.argv[1] == "show":
        show()
```
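The rewritten fetch() treats media_urls.txt as a consume-once queue: every line is read, then the file is immediately truncated so the same URLs are not dispatched again on a later run. A minimal sketch of that pattern under the same assumptions (the helper name drain_url_queue and the __main__ loop are illustrative only, not part of the change):

```python
# Minimal sketch of the read-then-truncate queue used by the new fetch() above.
# The file name media_urls.txt comes from the diff; drain_url_queue is a
# hypothetical helper written only for illustration, and the file is assumed
# to exist (an empty manual/media_urls.txt is committed in this compare).
def drain_url_queue(path="media_urls.txt"):
    with open(path, "r") as f:
        urls = [line.strip() for line in f if line.strip()]
    with open(path, "w") as f:
        f.write("")  # truncate, so queued URLs are consumed exactly once
    return urls

if __name__ == "__main__":
    for url in drain_url_queue():
        print("would dispatch:", url)  # fetch() wraps each URL in a DummyMessage instead
```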
manual/media_urls.txt: new file, empty (0 lines)
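manual/media_urls.txt is committed empty and acts as the producer side of that queue: the docstring above describes a "manually curated list of urls", so URLs are expected to be appended one per line before the script runs. A hedged sketch of that step (the example URLs are placeholders; fetch() itself opens the file as media_urls.txt because it runs from inside manual/):

```python
# Hypothetical usage sketch: queue a couple of URLs for the next manual run.
# The path and the one-URL-per-line convention follow the fetch() code above;
# the URLs themselves are placeholders.
urls_to_fetch = [
    "https://example.com/article-1",
    "https://example.com/article-2",
]
with open("manual/media_urls.txt", "a") as f:
    for url in urls_to_fetch:
        f.write(url + "\n")  # fetch() strips the newline and wraps each URL in a DummyMessage
```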
```diff
@@ -128,8 +128,14 @@ class Dispatcher(Thread):


class PrintWorker:
    def __init__(self, action, sent = False) -> None:
        self.action = action
        self.sent = sent
    def send(self, article):
        print(f"Uploaded article {article}")
        print(f"{self.action} article {article}")
        if self.sent:
            article.sent = True
            article.save()
    def keep_alive(self): # keeps script running, because there is nothing else in the main thread
        while True: sleep(1)

@@ -144,11 +150,12 @@ if __name__ == "__main__":
        logger.info(f"Launching upload to archive for {len(articles)} articles.")

        dispatcher.workers_in = [{"UploadWorker": UploadWorker()}]
        dispatcher.workers_out = [{"PrintWorker": PrintWorker()}]
        print_worker = PrintWorker("Uploaded")
        dispatcher.workers_out = [{"PrintWorker": print_worker}]
        dispatcher.start()
        for a in articles:
            dispatcher.incoming_request(article=a)
        PrintWorker().keep_alive()
        print_worker.keep_alive()

    else: # launch with full action
        try:
```
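PrintWorker is now parameterised: the action label replaces the hard-coded "Uploaded" message, and sent=True lets the manual fetch pipeline flag an article as sent once it leaves the dispatcher. A minimal sketch of the two call sites this enables, assuming the runner API shown in this compare (the Article stub is invented so the example is self-contained):

```python
# Minimal sketch, assuming the runner module from this repo is importable the same
# way the manual script does it; Article is a stand-in invented here so the example
# runs on its own (the real pipeline passes article objects such as
# runner.models.ArticleDownload rows).
import sys
sys.path.append("../news_fetch")  # path taken from the first diff in this compare
import runner

class Article:
    sent = False
    def save(self):
        print("saved, sent =", self.sent)

upload_worker = runner.PrintWorker("Uploaded")                         # upload path: label only
manual_worker = runner.PrintWorker("Finished processing", sent=True)   # manual fetch: also flags the article

a = Article()
upload_worker.send(a)   # prints "Uploaded article <Article ...>", leaves a.sent untouched
manual_worker.send(a)   # prints "Finished processing article <Article ...>", sets a.sent = True and calls save()
```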