Switched from geckodriver to chrome

This commit is contained in:
2022-09-18 19:26:55 +02:00
parent 7cf7422b46
commit db161e50c8
13 changed files with 135 additions and 61 deletions

View File

@@ -6,10 +6,8 @@ import base64
import requests
from selenium import webdriver
import configuration
import json
config = configuration.main_config["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader:
@@ -21,42 +19,31 @@ class PDFDownloader:
def start(self):
self.finish() # clear up
options = webdriver.FirefoxOptions()
options.profile = config["browser_profile_path"]
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
options = webdriver.ChromeOptions()
options.add_argument(f"user-data-dir={config['browser_profile_path']}")
options.add_argument('--headless')
if os.getenv("DEBUG", "false") == "true":
self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
else:
options.add_argument('--headless')
# if os.getenv("DEBUG", "false") == "true":
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
# else:
options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already
# TODO: this is not working right now
# options.set_preference('print.save_as_pdf.links.enabled', True)
# # Just save if the filetype is pdf already
# # TODO: this is not working right now
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
options.set_preference("browser.download.folderList", 2)
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# options.set_preference("pdfjs.disabled", True)
options.set_preference("browser.download.dir", config["default_download_path"])
# options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
# options.set_preference("browser.download.folderList", 2)
# # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# # options.set_preference("pdfjs.disabled", True)
# options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting gecko driver")
# previously, in a single docker image:
# self.driver = webdriver.Firefox(
# options = options,
# service = webdriver.firefox.service.Service(
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
# ))
self.logger.info("Starting chrome driver")
self.driver = webdriver.Remote(
command_executor = 'http://geckodriver:4444',
command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
options = options,
# can't set log path...
)
residues = os.listdir(config["default_download_path"])
for res in residues:
os.remove(os.path.join(config["default_download_path"], res))
self.running = True
def autostart(self):
@@ -65,7 +52,7 @@ class PDFDownloader:
def finish(self):
if self.running:
self.logger.info("Exiting gecko driver")
self.logger.info("Exiting chrome driver")
try:
self.driver.quit()
time.sleep(10)
@@ -73,7 +60,7 @@ class PDFDownloader:
self.logger.critical("Connection to the driver broke off")
self.running = False
else:
self.logger.info("Gecko driver not yet running")
self.logger.info("Chrome driver not yet running")
def download(self, article_object):
sleep_time = 2
@@ -153,8 +140,6 @@ class PDFDownloader:
def make_path_unique(path):
fname, ending = os.path.splitext(path)
fname += datetime.datetime.now().strftime("%d-%H%M%S")

View File

@@ -1,4 +1,3 @@
from __future__ import unicode_literals
import youtube_dl
import os
import logging