FS updates and corrections

This commit is contained in:
Remy Moll
2022-06-15 11:14:08 +02:00
parent 54760abee4
commit 87d65fc988
14 changed files with 91 additions and 56 deletions

View File

@@ -5,13 +5,13 @@ import os
import base64
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import configuration
import json
config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__)
@@ -19,10 +19,8 @@ class PDFDownloader:
running = False
def start(self):
try:
self.finish()
except:
self.logger.info("gecko driver not yet running")
self.finish() # clear up
options = webdriver.FirefoxOptions()
options.profile = config["browser_profile_path"]
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
@@ -56,13 +54,15 @@ class PDFDownloader:
def autostart(self):
    """Launch the download utility if it is not already running.

    Calls ``self.start()`` only when ``self.running`` is falsy, so
    repeated calls do not relaunch an already running instance.
    """
    if not self.running:
        self.start()  # relaunch the dl util
def finish(self):
    """Shut down the gecko driver if it is currently running.

    When ``self.running`` is truthy, quits ``self.driver`` and resets
    ``self.running`` to ``False``. Otherwise only logs that there is
    nothing to stop.
    """
    if not self.running:
        # Nothing to stop: never touch self.driver here, so calling
        # finish() before a driver was started stays harmless.
        self.logger.info("Gecko driver not yet running")
        return

    self.logger.info("Exiting gecko driver")
    self.driver.quit()
    self.running = False
def download(self, article_object):
sleep_time = 1
@@ -74,14 +74,14 @@ class PDFDownloader:
except Exception as e:
self.logger.critical("Selenium .get(url) failed with error {}".format(e))
self.finish()
return article_object # without changes
return article_object # without changes
time.sleep(sleep_time)
# leave the page time to do any funky business
# in the mean time, get a page title if required
if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf","")
article_object.title = self.driver.title.replace(".pdf", "")
# will be propagated to dst as well
fname = article_object.fname_template
@@ -105,7 +105,7 @@ class PDFDownloader:
else:
article_object.file_name = ""
return article_object # this change is saved later manually
return article_object # this change is saved later manually
def get_exisiting_pdf(self, url, dst):