181 lines
5.7 KiB
Python

import logging
import time
import datetime
import os, shutil, uuid
from pathlib import Path
import base64
import requests
from selenium import webdriver
import configuration
# "downloads" section of the project configuration. This module reads the keys
# "browser_print_delay" and "browser_profile_path" from it.
download_config = configuration.config["downloads"]
def driver_running(f):
    """Decorate a method so the driver is started before the method runs.

    The first positional argument of the wrapped callable is treated as the
    instance: if its ``_running`` flag is falsy, its ``start()`` method is
    called before delegating to ``f``.

    Args:
        f: The method to wrap.

    Returns:
        A wrapper with the same name, docstring and signature metadata as
        ``f`` that forwards all arguments and returns ``f``'s result.
    """
    # Local import keeps this block self-contained; functools is stdlib.
    import functools

    @functools.wraps(f)  # preserve __name__/__doc__ of the decorated method
    def wrapper(*args, **kwargs):
        instance = args[0]
        if not instance._running:
            instance.start()
        return f(*args, **kwargs)

    return wrapper
class PDFDownloader:
    """Save the page behind an article URL as a PDF file.

    URLs ending in ``.pdf`` are fetched directly with ``requests``; every
    other URL is opened in a remote Firefox session and printed to PDF.
    The ``article_object`` handed to :meth:`download` is filled in place
    (``title`` and ``file_name``) and returned.
    """

    logger = logging.getLogger(__name__)
    _running = False  # True once start() succeeded and finish() was not called since
    _tmp_profile_path = None  # reduced profile directory created by the last start()
    _REQUEST_TIMEOUT = 60  # seconds; per socket operation for direct pdf downloads

    def start(self):
        """Start a fresh remote Firefox session.

        Called externally to start the driver, but after an exception it can
        also be called internally (via the ``driver_running`` decorator). A
        session that is still marked as running is shut down first.
        """
        if self._running:
            self.finish()  # clear up
        # A previous start() may have failed after creating its profile copy.
        self._remove_tmp_profile()
        self.logger.info("Starting geckodriver")

        reduced_path = self.create_tmp_profile()
        self._tmp_profile_path = reduced_path
        profile = webdriver.FirefoxProfile(reduced_path)
        options = webdriver.FirefoxOptions()

        if os.getenv("DEBUG", "false") == "true":
            self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
        else:
            options.add_argument('--headless')

        self.driver = webdriver.Remote(
            # the host geckodriver points to the geckodriver container
            command_executor='http://geckodriver:4444',
            options=options,
            browser_profile=profile,
        )
        self._running = True

    def finish(self):
        """Quit the driver, remove the temporary profile and mark as stopped.

        Errors while quitting (including a driver that was never created) are
        logged and otherwise ignored so that a restart is always possible.
        """
        self.logger.info("Exiting Geckodriver")
        try:
            self.driver.quit()
            # NOTE(review): presumably gives the remote side time to tear
            # down before a new session is requested - confirm.
            time.sleep(10)
        except Exception:
            # Not a bare except: Ctrl-C / SystemExit must still propagate.
            self.logger.critical("Connection to the driver broke off")
        self._remove_tmp_profile()
        self._running = False

    def _remove_tmp_profile(self):
        """Delete the profile directory tracked from the last start(), if any."""
        if self._tmp_profile_path is not None:
            shutil.rmtree(self._tmp_profile_path, ignore_errors=True)
            self._tmp_profile_path = None

    @driver_running
    def download(self, article_object):
        """Download ``article_object.article_url`` as a PDF.

        Args:
            article_object: Object providing ``article_url``, ``is_title_bad``,
                ``title``, ``fname_template`` and ``save_path``; ``title`` and
                ``file_name`` may be updated in place.

        Returns:
            The same ``article_object``; changes to it are saved later by the
            external caller.
        """
        url = article_object.article_url

        if url.endswith(".pdf"):
            # calling the usual pdf generation would not yield a nice pdf,
            # just download it directly
            self.logger.info("Downloading existing pdf")
            success = self.get_exisiting_pdf(article_object)
            # get a page title if required
            # NOTE(review): the browser never loaded this url in this branch,
            # so driver.title refers to whatever page was open before - confirm
            # this is intended.
            if article_object.is_title_bad:
                # some titles end with .pdf
                article_object.title = self.driver.title.replace(".pdf", "")
                # will be propagated to the saved file (dst) as well
        else:
            success = self.get_new_pdf(article_object)

        if not success:
            self.logger.error("Download failed")
            # TODO: need to reset the file name to empty?
        return article_object  # changes to this are saved later by the external caller

    def get_exisiting_pdf(self, article_object):
        """Fetch a URL that already points to a PDF and write it to disk.

        Returns:
            True if the file was downloaded and written, False otherwise
            (network error, timeout, HTTP error status or write failure).
        """
        # get a better page title if required
        if article_object.is_title_bad:
            article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
        try:
            response = requests.get(
                article_object.article_url, timeout=self._REQUEST_TIMEOUT
            )
            # Do not store a 4xx/5xx error page under a .pdf file name.
            response.raise_for_status()
            content = response.content
        except Exception as e:
            self.logger.error("Fetching the pdf failed: %s", e)
            return False
        return self.write_pdf(content, article_object)

    def get_new_pdf(self, article_object):
        """Open the URL in the browser, print the page to PDF and write it.

        If loading the page fails the driver is shut down via :meth:`finish`.

        Returns:
            True if the PDF was generated and written, False otherwise.
        """
        sleep_time = int(download_config["browser_print_delay"])

        try:
            self.driver.get(article_object.article_url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error %s", e)
            self.finish()
            return False

        time.sleep(sleep_time)
        # leave the page time to do any funky business

        if article_object.is_title_bad:
            article_object.title = self.driver.title

        try:
            result = self.driver.print_page()
            content = base64.b64decode(result, validate=True)
        except Exception:
            self.logger.error("Failed, probably because the driver went extinct.")
            return False

        return self.write_pdf(content, article_object)

    def get_file_destination(self, article_object):
        """Return ``(dst, fname)``: the full target path and the unique file name."""
        fname = article_object.fname_template
        fname = ensure_unique(article_object.save_path, fname)
        dst = os.path.join(article_object.save_path, fname)
        return dst, fname

    def write_pdf(self, content, article_object):
        """Write ``content`` to the article's destination file.

        On success ``article_object.file_name`` is set to the file name used.

        Returns:
            True on success, False if creating the directory or writing the
            file failed.
        """
        dst, fname = self.get_file_destination(article_object)
        try:
            # Inside the try so a failing mkdir is reported like a failing write.
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            with open(dst, "wb") as f:
                f.write(content)
        except Exception as e:
            self.logger.error(f"Failed, because of FS-operation: {e}")
            return False

        article_object.file_name = fname
        return True

    def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
        """Copy the needed parts of a Firefox profile to a new directory in /tmp.

        Args:
            full_profile_path: Profile directory to copy from.

        Returns:
            Path of the newly created reduced profile directory.
        """
        reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
        os.mkdir(reduced_profile_path)

        # copy needed directories
        for dir_name in ("extensions", "storage"):
            shutil.copytree(full_profile_path / dir_name, reduced_profile_path / dir_name)

        # copy needed files
        needed_files = (
            "extension-preferences.json",
            "addons.json",
            "addonStartup.json.lz4",
            "prefs.js",
            "extensions.json",
            "cookies.sqlite",
        )
        for file_name in needed_files:
            shutil.copy(full_profile_path / file_name, reduced_profile_path)

        total_bytes = sum(p.stat().st_size for p in reduced_profile_path.rglob('*'))
        folder_size = round(total_bytes / 1024 / 1024, 3)
        self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
        return reduced_profile_path
def ensure_unique(path, fname):
    """Return a file name based on ``fname`` that does not exist in ``path``.

    If ``fname`` is free it is returned unchanged. Otherwise `` -- fetch N``
    is inserted before the extension, with N counting up from 1 until an
    unused name is found.

    Args:
        path: Directory in which the name must be unused.
        fname: Desired file name (including extension).

    Returns:
        The first candidate name for which no file exists in ``path``.
    """
    stem, extension = os.path.splitext(fname)
    candidate = fname
    attempt = 0
    while os.path.exists(os.path.join(path, candidate)):
        attempt += 1
        candidate = f"{stem} -- fetch {attempt}{extension}"
    return candidate