|
|
|
@@ -6,10 +6,8 @@ import base64
|
|
|
|
|
import requests
|
|
|
|
|
from selenium import webdriver
|
|
|
|
|
import configuration
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
config = configuration.main_config["DOWNLOADS"]
|
|
|
|
|
blacklisted = json.loads(config["blacklisted_href_domains"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PDFDownloader:
|
|
|
|
@@ -21,42 +19,31 @@ class PDFDownloader:
|
|
|
|
|
def start(self):
|
|
|
|
|
self.finish() # clear up
|
|
|
|
|
|
|
|
|
|
options = webdriver.FirefoxOptions()
|
|
|
|
|
options.profile = config["browser_profile_path"]
|
|
|
|
|
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
|
|
|
|
|
options = webdriver.ChromeOptions()
|
|
|
|
|
options.add_argument(f"user-data-dir={config['browser_profile_path']}")
|
|
|
|
|
options.add_argument('--headless')
|
|
|
|
|
|
|
|
|
|
if os.getenv("DEBUG", "false") == "true":
|
|
|
|
|
self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
|
|
|
|
|
else:
|
|
|
|
|
options.add_argument('--headless')
|
|
|
|
|
# if os.getenv("DEBUG", "false") == "true":
|
|
|
|
|
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
|
|
|
|
|
# else:
|
|
|
|
|
|
|
|
|
|
options.set_preference('print.save_as_pdf.links.enabled', True)
|
|
|
|
|
# Just save if the filetype is pdf already
|
|
|
|
|
# TODO: this is not working right now
|
|
|
|
|
# options.set_preference('print.save_as_pdf.links.enabled', True)
|
|
|
|
|
# # Just save if the filetype is pdf already
|
|
|
|
|
# # TODO: this is not working right now
|
|
|
|
|
|
|
|
|
|
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
|
|
|
|
|
options.set_preference("browser.download.folderList", 2)
|
|
|
|
|
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
|
|
|
|
|
# options.set_preference("pdfjs.disabled", True)
|
|
|
|
|
options.set_preference("browser.download.dir", config["default_download_path"])
|
|
|
|
|
# options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
|
|
|
|
|
# options.set_preference("browser.download.folderList", 2)
|
|
|
|
|
# # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
|
|
|
|
|
# # options.set_preference("pdfjs.disabled", True)
|
|
|
|
|
# options.set_preference("browser.download.dir", config["default_download_path"])
|
|
|
|
|
|
|
|
|
|
self.logger.info("Starting gecko driver")
|
|
|
|
|
# previously, in a single docker image:
|
|
|
|
|
# self.driver = webdriver.Firefox(
|
|
|
|
|
# options = options,
|
|
|
|
|
# service = webdriver.firefox.service.Service(
|
|
|
|
|
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
|
|
|
|
|
# ))
|
|
|
|
|
self.logger.info("Starting chrome driver")
|
|
|
|
|
self.driver = webdriver.Remote(
|
|
|
|
|
command_executor = 'http://geckodriver:4444',
|
|
|
|
|
command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
|
|
|
|
|
options = options,
|
|
|
|
|
# can't set log path...
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
residues = os.listdir(config["default_download_path"])
|
|
|
|
|
for res in residues:
|
|
|
|
|
os.remove(os.path.join(config["default_download_path"], res))
|
|
|
|
|
|
|
|
|
|
self.running = True
|
|
|
|
|
|
|
|
|
|
def autostart(self):
|
|
|
|
@@ -65,7 +52,7 @@ class PDFDownloader:
|
|
|
|
|
|
|
|
|
|
def finish(self):
|
|
|
|
|
if self.running:
|
|
|
|
|
self.logger.info("Exiting gecko driver")
|
|
|
|
|
self.logger.info("Exiting chrome driver")
|
|
|
|
|
try:
|
|
|
|
|
self.driver.quit()
|
|
|
|
|
time.sleep(10)
|
|
|
|
@@ -73,7 +60,7 @@ class PDFDownloader:
|
|
|
|
|
self.logger.critical("Connection to the driver broke off")
|
|
|
|
|
self.running = False
|
|
|
|
|
else:
|
|
|
|
|
self.logger.info("Gecko driver not yet running")
|
|
|
|
|
self.logger.info("Chrome driver not yet running")
|
|
|
|
|
|
|
|
|
|
def download(self, article_object):
|
|
|
|
|
sleep_time = 2
|
|
|
|
@@ -153,8 +140,6 @@ class PDFDownloader:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_path_unique(path):
|
|
|
|
|
fname, ending = os.path.splitext(path)
|
|
|
|
|
fname += datetime.datetime.now().strftime("%d-%H%M%S")
|
|
|
|
|