minor corrections
This commit is contained in:
		| @@ -1,9 +0,0 @@ | ||||
| peewee | ||||
| selenium | ||||
| youtube-dl | ||||
| waybackpy | ||||
| slack_bolt # relies on slack_sdk | ||||
| newspaper3k | ||||
| htmldate | ||||
| markdown | ||||
| rich | ||||
| @@ -31,7 +31,7 @@ class ArticleWatcher: | ||||
|         # first step: gather metadata | ||||
|         if self.fetch and self.upload: | ||||
|             self.fetch.process(self) # this will call the update_status method | ||||
|             self.upload.process(self) # idependdent from the rest | ||||
|             self.upload.process(self) # independent from the rest | ||||
|         else: # the full kwargs were not provided, only do a manual run | ||||
|             # overwrite update_status() because calls from the workers will result in errors | ||||
|             self.update_status = lambda completed: logger.info(f"Completed action {completed}") | ||||
|   | ||||
| @@ -28,7 +28,7 @@ class PDFDownloader: | ||||
|         if os.getenv("HEADLESS", "false") == "true": | ||||
|             options.add_argument('--headless') | ||||
|         else: | ||||
|             self.logger.warning("Opening browser GUI because of 'HEADLESS=true'") | ||||
|             self.logger.warning("Opening browser GUI because of 'HEADLESS=false'") | ||||
|  | ||||
|         options.set_preference('print.save_as_pdf.links.enabled', True) | ||||
|         # Just save if the filetype is pdf already, does not work! | ||||
| @@ -46,7 +46,7 @@ class PDFDownloader: | ||||
|         #         log_path = f'{config["local_storage_path"]}/geckodriver.log' | ||||
|         # )) | ||||
|         self.driver = webdriver.Remote( | ||||
|             command_executor = 'http://localhost:4444', | ||||
|             command_executor = 'http://geckodriver:4444', | ||||
|             options = options, | ||||
|             # can't set log path... | ||||
|         ) | ||||
| @@ -64,13 +64,17 @@ class PDFDownloader: | ||||
|     def finish(self): | ||||
|         if self.running: | ||||
|             self.logger.info("Exiting gecko driver") | ||||
|             self.driver.quit() | ||||
|             try: | ||||
|                 self.driver.quit() | ||||
|                 time.sleep(10) | ||||
|             except: | ||||
|                 self.logger.critical("Connection to the driver broke off") | ||||
|             self.running = False | ||||
|         else: | ||||
|             self.logger.info("Gecko driver not yet running") | ||||
|  | ||||
|     def download(self, article_object): | ||||
|         sleep_time = 1 | ||||
|         sleep_time = 2 | ||||
|         self.autostart() | ||||
|         url = article_object.article_url | ||||
|  | ||||
| @@ -87,7 +91,7 @@ class PDFDownloader: | ||||
|         # in the mean time, get a page title if required | ||||
|         if article_object.is_title_bad: | ||||
|             article_object.title = self.driver.title.replace(".pdf", "") | ||||
|             # will be propagated to dst as well | ||||
|             # will be propagated to the saved file (dst) as well | ||||
|  | ||||
|         fname = article_object.fname_template | ||||
|         dst = os.path.join(article_object.save_path, fname) | ||||
| @@ -110,7 +114,7 @@ class PDFDownloader: | ||||
|         else: | ||||
|             article_object.file_name = "" | ||||
|          | ||||
|         return article_object  # this change is saved later manually | ||||
|         return article_object  # this change is saved later by the external caller | ||||
|  | ||||
|  | ||||
|     def get_exisiting_pdf(self, url, dst): | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| import time | ||||
| from waybackpy import WaybackMachineSaveAPI # upload to archive.org | ||||
| import logging | ||||
| logger = logging.getLogger(__name__) | ||||
| @@ -11,6 +12,8 @@ def upload_to_archive(article_object): | ||||
|         archive_url = wayback.save() | ||||
|         # logger.info(f"{url} uploaded to archive successfully") | ||||
|         article_object.archive_url = archive_url | ||||
|         # time.sleep(4) # Archive Uploads rate limited to 15/minute | ||||
|  | ||||
|     except Exception as e: | ||||
|         article_object.archive_url = "Error while uploading: {}".format(e) | ||||
|         logger.error(f"Error while generating archive url: {e}") | ||||
|   | ||||
| @@ -43,11 +43,15 @@ class FetchWorker(TemplateWorker): | ||||
| class UploadWorker(TemplateWorker): | ||||
|     def __init__(self) -> None: | ||||
|         super().__init__() | ||||
|      | ||||
|  | ||||
|  | ||||
|     def _handle_article(self, article_watcher): | ||||
|         action = run_upload # function | ||||
|         def action(*args, **kwargs): | ||||
|             run_upload(*args, **kwargs) | ||||
|             time.sleep(5) # uploads to archive are throttled to 15/minute | ||||
|  | ||||
|         super()._handle_article(article_watcher, action) | ||||
|         time.sleep(4) # Archive Uploads rate limited to 15/minute | ||||
|         article_watcher.upload_completed = True | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Remy Moll
					Remy Moll