minor corrections

2022-06-23 15:05:59 +02:00
parent ac9e988af3
commit 12a7de91ed
13 changed files with 218 additions and 32 deletions
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -1,9 +0,0 @@
-peewee
-selenium
-youtube-dl
-waybackpy
-slack_bolt # relies on slack_sdk
-newspaper3k
-htmldate
-markdown
-rich
--- a/app/runner.py
+++ b/app/runner.py
@@ -31,7 +31,7 @@ class ArticleWatcher:
        # first step: gather metadata
        if self.fetch and self.upload:
            self.fetch.process(self) # this will call the update_status method
-            self.upload.process(self) # idependdent from the rest
+            self.upload.process(self) # independent from the rest
        else: # the full kwargs were not provided, only do a manual run
            # overwrite update_status() because calls from the workers will result in erros
            self.update_status = lambda completed: logger.info(f"Completed action {completed}")
--- a/app/utils_worker/download/browser.py
+++ b/app/utils_worker/download/browser.py
@@ -28,7 +28,7 @@ class PDFDownloader:
        if os.getenv("HEADLESS", "false") == "true":
            options.add_argument('--headless')
        else:
-            self.logger.warning("Opening browser GUI because of 'HEADLESS=true'")
+            self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")

        options.set_preference('print.save_as_pdf.links.enabled', True)
        # Just save if the filetype is pdf already, does not work!
@@ -46,7 +46,7 @@ class PDFDownloader:
        #         log_path = f'{config["local_storage_path"]}/geckodriver.log'
        # ))
        self.driver = webdriver.Remote(
-            command_executor = 'http://localhost:4444',
+            command_executor = 'http://geckodriver:4444',
            options = options,
            # can't set log path...
        )
@@ -64,13 +64,17 @@ class PDFDownloader:
    def finish(self):
        if self.running:
            self.logger.info("Exiting gecko driver")
-            self.driver.quit()
+            try:
+                self.driver.quit()
+                time.sleep(10)
+            except:
+                self.logger.critical("Connection to the driver broke off")
            self.running = False
        else:
            self.logger.info("Gecko driver not yet running")

    def download(self, article_object):
-        sleep_time = 1
+        sleep_time = 2
        self.autostart()
        url = article_object.article_url

@@ -87,7 +91,7 @@ class PDFDownloader:
        # in the mean time, get a page title if required
        if article_object.is_title_bad:
            article_object.title = self.driver.title.replace(".pdf", "")
-            # will be propagated to dst as well
+            # will be propagated to the saved file (dst) as well

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
@@ -110,7 +114,7 @@ class PDFDownloader:
        else:
            article_object.file_name = ""
        
-        return article_object  # this change is saved later manually
+        return article_object  # this change is saved later by the external caller


    def get_exisiting_pdf(self, url, dst):
--- a/app/utils_worker/upload/runner.py
+++ b/app/utils_worker/upload/runner.py
@@ -1,3 +1,4 @@
+import time
 from waybackpy import WaybackMachineSaveAPI # upload to archive.org
 import logging
 logger = logging.getLogger(__name__)
@@ -11,6 +12,8 @@ def upload_to_archive(article_object):
        archive_url = wayback.save()
        # logger.info(f"{url} uploaded to archive successfully")
        article_object.archive_url = archive_url
+        # time.sleep(4) # Archive Uploads rate limited to 15/minute
+
    except Exception as e:
        article_object.archive_url = "Error while uploading: {}".format(e)
        logger.error(f"Error while generating archive url: {e}")
--- a/app/utils_worker/workers.py
+++ b/app/utils_worker/workers.py
@@ -43,11 +43,15 @@ class FetchWorker(TemplateWorker):
 class UploadWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()
+    
+

    def _handle_article(self, article_watcher):
-        action = run_upload # function
+        def action(*args, **kwargs):
+            run_upload(*args, **kwargs)
+            time.sleep(5) # uploads to archive are throttled to 15/minute
+
        super()._handle_article(article_watcher, action)
-        time.sleep(4) # Archive Uploads rate limited to 15/minute
        article_watcher.upload_completed = True