Compare commits
	
		
			2 Commits
		
	
	
		
			6301a62de8
			...
			191d008451
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 191d008451 | |||
| 104b99df7e | 
| @@ -2,66 +2,60 @@ | |||||||
| Runs the news_fetch pipeline against a manually curated list of urls and saves them locally | Runs the news_fetch pipeline against a manually curated list of urls and saves them locally | ||||||
| """ | """ | ||||||
| import sys | import sys | ||||||
| sys.path.append("../app/news_fetch") | sys.path.append("../news_fetch") | ||||||
| import runner | import runner | ||||||
| import logging | import logging | ||||||
| logger = logging.getLogger() | logger = logging.getLogger() | ||||||
| import json |  | ||||||
|  |  | ||||||
| from rich.console import Console |  | ||||||
| from rich.table import Table |  | ||||||
| console = Console() |  | ||||||
|  |  | ||||||
| logger.info("Overwriting production values for single time media-fetch") | class DummyMessage: | ||||||
| runner.configuration.models.set_db( |     """Required by the dispatcher""" | ||||||
|     runner.configuration.SqliteDatabase("../.dev/media_downloads.db") |     ts = 0 | ||||||
| ) |     def __init__(self, url): | ||||||
| runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/" |         self.urls = [url] | ||||||
|  |  | ||||||
|  |  | ||||||
| def fetch(): | def fetch(): | ||||||
|     dispatcher = runner.Dispatcher() |     dispatcher = runner.Dispatcher() | ||||||
|  |  | ||||||
|     dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}] |     dispatcher.workers_in = [ | ||||||
|     dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}] |         {"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}, | ||||||
|  |         {"UploadWorker": runner.UploadWorker()} | ||||||
|  |     ] | ||||||
|  |     print_worker = runner.PrintWorker("Finished processing", sent = True) | ||||||
|  |     dispatcher.workers_out = [{"PrintWorker": print_worker}] | ||||||
|  |  | ||||||
|     dispatcher.start() |     dispatcher.start() | ||||||
|  |  | ||||||
|     with open("media_urls.json", "r") as f: |  | ||||||
|         url_list = json.loads(f.read())  |  | ||||||
|  |  | ||||||
|     logger.info(f"Found {len(url_list)} media urls") |     with open("media_urls.txt", "r") as f: | ||||||
|     for u in url_list: |         url_list = [l.replace("\n", "") for l in f.readlines()] | ||||||
|         msg_text = f"<{u}|dummy preview text>" |     with open("media_urls.txt", "w") as f: | ||||||
|         dispatcher.incoming_request(msg) |         f.write("") # empty the file once it is read so that it does not get processed again | ||||||
|  |  | ||||||
|  |     if url_list: | ||||||
|  |         logger.info(f"Found {len(url_list)} media urls") | ||||||
|  |         for u in url_list: | ||||||
|  |             dispatcher.incoming_request(DummyMessage(u)) | ||||||
|  |     else: | ||||||
|  |         logger.info(f"No additional media urls found. Running the pipeline with messages from db.") | ||||||
|  |  | ||||||
|  |     print_worker.keep_alive() | ||||||
|  |  | ||||||
|  |  | ||||||
| def show(): | def show(): | ||||||
|  |     for a in runner.models.ArticleDownload.select(): | ||||||
|  |         print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}") | ||||||
|  |  | ||||||
|     t = Table( | if __name__ == "__main__": | ||||||
|         title = "ArticleDownloads", |     logger.info("Overwriting production values for single time media-fetch") | ||||||
|         row_styles = ["white", "bright_black"], |     runner.configuration.models.set_db( | ||||||
|  |         runner.configuration.SqliteDatabase("../.dev/media_downloads.db") | ||||||
|     ) |     ) | ||||||
|      |     runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/" | ||||||
|     entries = ["title", "article_url", "archive_url", "authors"] |  | ||||||
|  |  | ||||||
|     for e in entries: |  | ||||||
|         t.add_column(e, justify = "right") |  | ||||||
|  |  | ||||||
|     sel = runner.models.ArticleDownload.select() |  | ||||||
|  |  | ||||||
|     for s in sel: |  | ||||||
|         c = [getattr(s, e) for e in entries]# |  | ||||||
|         c[-1] = str([a.author for a in c[-1]]) |  | ||||||
|         print(c) |  | ||||||
|         t.add_row(*c) |  | ||||||
|  |  | ||||||
|      |  | ||||||
|     console.print(t) |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     if len(sys.argv) == 1: # no additional arguments | ||||||
|  |         fetch() | ||||||
| # fetch() |     elif sys.argv[1] == "show": | ||||||
| show() |         show() | ||||||
							
								
								
									
										0
									
								
								manual/media_urls.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								manual/media_urls.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -128,8 +128,14 @@ class Dispatcher(Thread): | |||||||
|  |  | ||||||
|  |  | ||||||
| class PrintWorker: | class PrintWorker: | ||||||
|  |     def __init__(self, action, sent = False) -> None: | ||||||
|  |         self.action = action | ||||||
|  |         self.sent = sent | ||||||
|     def send(self, article): |     def send(self, article): | ||||||
|         print(f"Uploaded article {article}") |         print(f"{self.action} article {article}") | ||||||
|  |         if self.sent: | ||||||
|  |             article.sent = True | ||||||
|  |             article.save() | ||||||
|     def keep_alive(self): # keeps script running, because there is nothing else in the main thread |     def keep_alive(self): # keeps script running, because there is nothing else in the main thread | ||||||
|         while True: sleep(1) |         while True: sleep(1) | ||||||
|  |  | ||||||
| @@ -144,11 +150,12 @@ if __name__ == "__main__": | |||||||
|         logger.info(f"Launching upload to archive for {len(articles)} articles.") |         logger.info(f"Launching upload to archive for {len(articles)} articles.") | ||||||
|  |  | ||||||
|         dispatcher.workers_in = [{"UploadWorker": UploadWorker()}] |         dispatcher.workers_in = [{"UploadWorker": UploadWorker()}] | ||||||
|         dispatcher.workers_out = [{"PrintWorker": PrintWorker()}] |         print_worker = PrintWorker("Uploaded") | ||||||
|  |         dispatcher.workers_out = [{"PrintWorker": print_worker}] | ||||||
|         dispatcher.start() |         dispatcher.start() | ||||||
|         for a in articles: |         for a in articles: | ||||||
|             dispatcher.incoming_request(article=a) |             dispatcher.incoming_request(article=a) | ||||||
|         PrintWorker().keep_alive() |         print_worker.keep_alive() | ||||||
|  |  | ||||||
|     else: # launch with full action |     else: # launch with full action | ||||||
|         try: |         try: | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user