Fixed browser profile bug, line breaks and exceptions in news_check
This commit is contained in:
		| @@ -1,70 +1,72 @@ | ||||
| import logging | ||||
| import time | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
|  | ||||
| import os, shutil, uuid | ||||
| from pathlib import Path | ||||
|  | ||||
| import base64 | ||||
| import requests | ||||
| from selenium import webdriver | ||||
|  | ||||
| import configuration | ||||
|  | ||||
| config = configuration.main_config["DOWNLOADS"] | ||||
|  | ||||
| def driver_running(f): | ||||
|     def wrapper(*args, **kwargs): | ||||
|         self = args[0] | ||||
|         if not self._running: | ||||
|             self.start() | ||||
|         return f(*args, **kwargs) | ||||
|     return wrapper | ||||
|  | ||||
|  | ||||
|  | ||||
| class PDFDownloader: | ||||
|     """Saves a given url. Fills the object it got as a parameter""" | ||||
|     logger = logging.getLogger(__name__) | ||||
|     # status-variable for restarting: | ||||
|     running = False | ||||
|      | ||||
|     _running = False | ||||
|  | ||||
|  | ||||
|     def start(self): | ||||
|         self.finish() # clear up | ||||
|              | ||||
|         options = webdriver.ChromeOptions() | ||||
|         options.add_argument(f"user-data-dir={config['browser_profile_path']}") | ||||
|         options.add_argument('--headless') | ||||
|         """Called externally to start the driver, but after an exception can also be called internally""" | ||||
|         if self._running: | ||||
|             self.finish() # clear up | ||||
|  | ||||
|         # if os.getenv("DEBUG", "false") == "true": | ||||
|         #     self.logger.warning("Opening browser GUI because of 'DEBUG=true'") | ||||
|         # else: | ||||
|         self.logger.info("Starting geckodriver") | ||||
|          | ||||
|         reduced_path = self.create_tmp_profile() | ||||
|         profile = webdriver.FirefoxProfile(reduced_path) | ||||
|         options = webdriver.FirefoxOptions() | ||||
|  | ||||
|         # options.set_preference('print.save_as_pdf.links.enabled', True) | ||||
|         # # Just save if the filetype is pdf already | ||||
|         # # TODO: this is not working right now | ||||
|         if os.getenv("DEBUG", "false") == "true": | ||||
|             self.logger.warning("Opening browser GUI because of 'DEBUG=true'") | ||||
|         else: | ||||
|             options.add_argument('--headless') | ||||
|  | ||||
|         # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) | ||||
|         # options.set_preference("browser.download.folderList", 2) | ||||
|         # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") | ||||
|         # # options.set_preference("pdfjs.disabled", True) | ||||
|         # options.set_preference("browser.download.dir", config["default_download_path"]) | ||||
|  | ||||
|         self.logger.info("Starting chrome driver") | ||||
|         self.driver = webdriver.Remote( | ||||
|             command_executor = 'http://chrome:4444', # the host chrome points to the chrome container | ||||
|             command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container | ||||
|             options = options, | ||||
|             # can't set log path... | ||||
|             browser_profile = profile | ||||
|         ) | ||||
|          | ||||
|         self.running = True | ||||
|         self._running = True | ||||
|  | ||||
|     def autostart(self): | ||||
|         if not self.running: | ||||
|             self.start()  # relaunch the dl util | ||||
|  | ||||
|     def finish(self): | ||||
|         if self.running: | ||||
|             self.logger.info("Exiting chrome driver") | ||||
|             try: | ||||
|                 self.driver.quit() | ||||
|                 time.sleep(10) | ||||
|             except: | ||||
|                 self.logger.critical("Connection to the driver broke off") | ||||
|             self.running = False | ||||
|         else: | ||||
|             self.logger.info("Chrome driver not yet running") | ||||
|         self.logger.info("Exiting Geckodriver") | ||||
|         try: | ||||
|             self.driver.quit() | ||||
|             time.sleep(10) | ||||
|         except: | ||||
|             self.logger.critical("Connection to the driver broke off") | ||||
|         self._running = False | ||||
|  | ||||
|  | ||||
|     @driver_running | ||||
|     def download(self, article_object): | ||||
|         sleep_time = 2 | ||||
|         self.autostart() | ||||
|         sleep_time = int(config["browser_print_delay"]) | ||||
|         url = article_object.article_url | ||||
|  | ||||
|         try: | ||||
| @@ -89,20 +91,17 @@ class PDFDownloader: | ||||
|             dst = os.path.join(article_object.save_path, fname) | ||||
|  | ||||
|  | ||||
|         if url[-4:] == ".pdf": | ||||
|             # according to the browser preferences, calling the url will open pdfjs. | ||||
|             # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least | ||||
|         if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly | ||||
|             success = self.get_exisiting_pdf(url, dst) | ||||
|         else: | ||||
|             success = self.get_new_pdf(dst) | ||||
|  | ||||
|  | ||||
|         if success: | ||||
|             article_object.file_name = fname | ||||
|         else: | ||||
|             article_object.file_name = "" | ||||
|          | ||||
|         return article_object  # this change is saved later by the external caller | ||||
|         return article_object # this change is saved later by the external caller | ||||
|  | ||||
|  | ||||
|     def get_exisiting_pdf(self, url, dst): | ||||
| @@ -134,9 +133,26 @@ class PDFDownloader: | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"Failed, because of FS-operation: {e}") | ||||
|             return False | ||||
|          | ||||
|  | ||||
|  | ||||
|     def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path: | ||||
|         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}") | ||||
|         print(reduced_profile_path, full_profile_path) | ||||
|         os.mkdir(reduced_profile_path) | ||||
|         # copy needed directories | ||||
|         dirs = ["extensions", "storage"] | ||||
|         for dir in dirs: | ||||
|             shutil.copytree(full_profile_path / dir, reduced_profile_path / dir) | ||||
|  | ||||
|         # copy needed files | ||||
|         files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"] | ||||
|         for f in files: | ||||
|             shutil.copy(full_profile_path / f, reduced_profile_path) | ||||
|          | ||||
|         folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3) | ||||
|         self.logger.info(f"Generated temporary profile with size {folder_size} MB") | ||||
|         return reduced_profile_path | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user