minor corrections

This commit is contained in:
Remy Moll
2022-06-23 15:05:59 +02:00
parent ac9e988af3
commit 12a7de91ed
13 changed files with 218 additions and 32 deletions

View File

@@ -1,9 +0,0 @@
peewee
selenium
youtube-dl
waybackpy
slack_bolt # relies on slack_sdk
newspaper3k
htmldate
markdown
rich

View File

@@ -31,7 +31,7 @@ class ArticleWatcher:
# first step: gather metadata
if self.fetch and self.upload:
self.fetch.process(self) # this will call the update_status method
self.upload.process(self) # idependdent from the rest
self.upload.process(self) # independent from the rest
else: # the full kwargs were not provided, only do a manual run
# overwrite update_status() because calls from the workers will result in errors
self.update_status = lambda completed: logger.info(f"Completed action {completed}")

View File

@@ -28,7 +28,7 @@ class PDFDownloader:
if os.getenv("HEADLESS", "false") == "true":
options.add_argument('--headless')
else:
self.logger.warning("Opening browser GUI because of 'HEADLESS=true'")
self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already, does not work!
@@ -46,7 +46,7 @@ class PDFDownloader:
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
# ))
self.driver = webdriver.Remote(
command_executor = 'http://localhost:4444',
command_executor = 'http://geckodriver:4444',
options = options,
# can't set log path...
)
@@ -64,13 +64,17 @@ class PDFDownloader:
def finish(self):
if self.running:
self.logger.info("Exiting gecko driver")
self.driver.quit()
try:
self.driver.quit()
time.sleep(10)
except:
self.logger.critical("Connection to the driver broke off")
self.running = False
else:
self.logger.info("Gecko driver not yet running")
def download(self, article_object):
sleep_time = 1
sleep_time = 2
self.autostart()
url = article_object.article_url
@@ -87,7 +91,7 @@ class PDFDownloader:
# in the meantime, get a page title if required
if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf", "")
# will be propagated to dst as well
# will be propagated to the saved file (dst) as well
fname = article_object.fname_template
dst = os.path.join(article_object.save_path, fname)
@@ -110,7 +114,7 @@ class PDFDownloader:
else:
article_object.file_name = ""
return article_object # this change is saved later manually
return article_object # this change is saved later by the external caller
def get_exisiting_pdf(self, url, dst):

View File

@@ -1,3 +1,4 @@
import time
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import logging
logger = logging.getLogger(__name__)
@@ -11,6 +12,8 @@ def upload_to_archive(article_object):
archive_url = wayback.save()
# logger.info(f"{url} uploaded to archive successfully")
article_object.archive_url = archive_url
# time.sleep(4) # Archive Uploads rate limited to 15/minute
except Exception as e:
article_object.archive_url = "Error while uploading: {}".format(e)
logger.error(f"Error while generating archive url: {e}")

View File

@@ -43,11 +43,15 @@ class FetchWorker(TemplateWorker):
class UploadWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = run_upload # function
def action(*args, **kwargs):
run_upload(*args, **kwargs)
time.sleep(5) # uploads to archive are throttled to 15/minute
super()._handle_article(article_watcher, action)
time.sleep(4) # Archive Uploads rate limited to 15/minute
article_watcher.upload_completed = True