minor corrections
This commit is contained in:
@@ -1,9 +0,0 @@
|
||||
peewee
|
||||
selenium
|
||||
youtube-dl
|
||||
waybackpy
|
||||
slack_bolt # relies on slack_sdk
|
||||
newspaper3k
|
||||
htmldate
|
||||
markdown
|
||||
rich
|
@@ -31,7 +31,7 @@ class ArticleWatcher:
|
||||
# first step: gather metadata
|
||||
if self.fetch and self.upload:
|
||||
self.fetch.process(self) # this will call the update_status method
|
||||
self.upload.process(self) # idependdent from the rest
|
||||
self.upload.process(self) # independent from the rest
|
||||
else: # the full kwargs were not provided, only do a manual run
|
||||
# overwrite update_status() because calls from the workers will result in errors
|
||||
self.update_status = lambda completed: logger.info(f"Completed action {completed}")
|
||||
|
@@ -28,7 +28,7 @@ class PDFDownloader:
|
||||
if os.getenv("HEADLESS", "false") == "true":
|
||||
options.add_argument('--headless')
|
||||
else:
|
||||
self.logger.warning("Opening browser GUI because of 'HEADLESS=true'")
|
||||
self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
|
||||
|
||||
options.set_preference('print.save_as_pdf.links.enabled', True)
|
||||
# Just save if the filetype is pdf already, does not work!
|
||||
@@ -46,7 +46,7 @@ class PDFDownloader:
|
||||
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
|
||||
# ))
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor = 'http://localhost:4444',
|
||||
command_executor = 'http://geckodriver:4444',
|
||||
options = options,
|
||||
# can't set log path...
|
||||
)
|
||||
@@ -64,13 +64,17 @@ class PDFDownloader:
|
||||
def finish(self):
|
||||
if self.running:
|
||||
self.logger.info("Exiting gecko driver")
|
||||
self.driver.quit()
|
||||
try:
|
||||
self.driver.quit()
|
||||
time.sleep(10)
|
||||
except:
|
||||
self.logger.critical("Connection to the driver broke off")
|
||||
self.running = False
|
||||
else:
|
||||
self.logger.info("Gecko driver not yet running")
|
||||
|
||||
def download(self, article_object):
|
||||
sleep_time = 1
|
||||
sleep_time = 2
|
||||
self.autostart()
|
||||
url = article_object.article_url
|
||||
|
||||
@@ -87,7 +91,7 @@ class PDFDownloader:
|
||||
# in the meantime, get a page title if required
|
||||
if article_object.is_title_bad:
|
||||
article_object.title = self.driver.title.replace(".pdf", "")
|
||||
# will be propagated to dst as well
|
||||
# will be propagated to the saved file (dst) as well
|
||||
|
||||
fname = article_object.fname_template
|
||||
dst = os.path.join(article_object.save_path, fname)
|
||||
@@ -110,7 +114,7 @@ class PDFDownloader:
|
||||
else:
|
||||
article_object.file_name = ""
|
||||
|
||||
return article_object # this change is saved later manually
|
||||
return article_object # this change is saved later by the external caller
|
||||
|
||||
|
||||
def get_exisiting_pdf(self, url, dst):
|
||||
|
@@ -1,3 +1,4 @@
|
||||
import time
|
||||
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -11,6 +12,8 @@ def upload_to_archive(article_object):
|
||||
archive_url = wayback.save()
|
||||
# logger.info(f"{url} uploaded to archive successfully")
|
||||
article_object.archive_url = archive_url
|
||||
# time.sleep(4) # Archive Uploads rate limited to 15/minute
|
||||
|
||||
except Exception as e:
|
||||
article_object.archive_url = "Error while uploading: {}".format(e)
|
||||
logger.error(f"Error while generating archive url: {e}")
|
||||
|
@@ -43,11 +43,15 @@ class FetchWorker(TemplateWorker):
|
||||
class UploadWorker(TemplateWorker):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
|
||||
|
||||
def _handle_article(self, article_watcher):
|
||||
action = run_upload # function
|
||||
def action(*args, **kwargs):
|
||||
run_upload(*args, **kwargs)
|
||||
time.sleep(5) # uploads to archive are throttled to 15/minute
|
||||
|
||||
super()._handle_article(article_watcher, action)
|
||||
time.sleep(4) # Archive Uploads rate limited to 15/minute
|
||||
article_watcher.upload_completed = True
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user