Bug fixes, makefile for launch
This commit is contained in:
@@ -11,7 +11,7 @@ from selenium import webdriver
|
||||
|
||||
import configuration
|
||||
|
||||
config = configuration.main_config["DOWNLOADS"]
|
||||
download_config = configuration.config["downloads"]
|
||||
|
||||
def driver_running(f):
|
||||
def wrapper(*args, **kwargs):
|
||||
@@ -66,74 +66,88 @@ class PDFDownloader:
|
||||
|
||||
@driver_running
|
||||
def download(self, article_object):
|
||||
sleep_time = int(config["browser_print_delay"])
|
||||
url = article_object.article_url
|
||||
|
||||
|
||||
if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, so just download it directly
|
||||
self.logger.info("Downloading existing pdf")
|
||||
success = self.get_exisiting_pdf(article_object)
|
||||
# get a page title if required
|
||||
if article_object.is_title_bad:
|
||||
article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
|
||||
# will be propagated to the saved file (dst) as well
|
||||
else:
|
||||
success = self.get_new_pdf(article_object)
|
||||
|
||||
if not success:
|
||||
self.logger.error("Download failed")
|
||||
# TODO: need to reset the file name to empty?
|
||||
return article_object # changes to this are saved later by the external caller
|
||||
|
||||
|
||||
def get_exisiting_pdf(self, article_object):
|
||||
# get a better page title if required
|
||||
if article_object.is_title_bad:
|
||||
article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
|
||||
try:
|
||||
self.driver.get(url)
|
||||
r = requests.get(article_object.article_url)
|
||||
bytes = r.content
|
||||
except:
|
||||
return False
|
||||
return self.write_pdf(bytes, article_object)
|
||||
|
||||
|
||||
def get_new_pdf(self, article_object):
|
||||
sleep_time = int(download_config["browser_print_delay"])
|
||||
|
||||
try:
|
||||
self.driver.get(article_object.article_url)
|
||||
except Exception as e:
|
||||
self.logger.critical("Selenium .get(url) failed with error {}".format(e))
|
||||
self.finish()
|
||||
return article_object # without changes
|
||||
return False
|
||||
|
||||
time.sleep(sleep_time)
|
||||
# give the page time to finish loading and run any scripts before printing
|
||||
|
||||
# in the meantime, get a page title if required
|
||||
if article_object.is_title_bad:
|
||||
article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
|
||||
# will be propagated to the saved file (dst) as well
|
||||
article_object.title = self.driver.title
|
||||
|
||||
try:
|
||||
result = self.driver.print_page()
|
||||
bytes = base64.b64decode(result, validate=True)
|
||||
except:
|
||||
self.logger.error("Failed, probably because the driver went extinct.")
|
||||
return False
|
||||
|
||||
return self.write_pdf(bytes, article_object)
|
||||
|
||||
|
||||
def get_file_destination(self, article_object):
|
||||
fname = article_object.fname_template
|
||||
fname = ensure_unique(article_object.save_path, fname)
|
||||
dst = os.path.join(article_object.save_path, fname)
|
||||
return dst, fname
|
||||
|
||||
|
||||
if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, so just download it directly
|
||||
success = self.get_exisiting_pdf(url, dst)
|
||||
else:
|
||||
success = self.get_new_pdf(dst)
|
||||
|
||||
if success:
|
||||
article_object.file_name = fname
|
||||
else:
|
||||
article_object.file_name = ""
|
||||
|
||||
return article_object # this change is saved later by the external caller
|
||||
|
||||
|
||||
def get_exisiting_pdf(self, url, dst):
|
||||
try:
|
||||
r = requests.get(url)
|
||||
bytes = r.content
|
||||
except:
|
||||
return False
|
||||
return self.get_new_pdf(dst, other_bytes=bytes)
|
||||
|
||||
|
||||
def get_new_pdf(self, dst, other_bytes=None):
|
||||
def write_pdf(self, content, article_object):
|
||||
dst, fname = self.get_file_destination(article_object)
|
||||
os.makedirs(os.path.dirname(dst), exist_ok=True)
|
||||
|
||||
if other_bytes is None:
|
||||
try:
|
||||
result = self.driver.print_page()
|
||||
bytes = base64.b64decode(result, validate=True)
|
||||
except:
|
||||
self.logger.error("Failed, probably because the driver went extinct.")
|
||||
return False
|
||||
else:
|
||||
bytes = other_bytes
|
||||
|
||||
|
||||
try:
|
||||
with open(dst, "wb+") as f:
|
||||
f.write(bytes)
|
||||
f.write(content)
|
||||
|
||||
article_object.file_name = fname
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed, because of FS-operation: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
|
||||
|
||||
|
||||
def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
|
||||
reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
|
||||
os.mkdir(reduced_profile_path)
|
||||
# copy needed directories
|
||||
|
@@ -1,10 +1,11 @@
|
||||
import youtube_dl
|
||||
import os
|
||||
import logging
|
||||
import configuration
|
||||
|
||||
download_config = configuration.config["downloads"]
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MyLogger(object):
|
||||
def debug(self, msg): pass
|
||||
def warning(self, msg): pass
|
||||
@@ -19,7 +20,6 @@ class YouTubeDownloader:
|
||||
|
||||
|
||||
def post_download_hook(self, ret_code):
|
||||
# print(ret_code)
|
||||
if ret_code['status'] == 'finished':
|
||||
file_loc = ret_code["filename"]
|
||||
fname = os.path.basename(file_loc)
|
||||
@@ -35,9 +35,11 @@ class YouTubeDownloader:
|
||||
ydl_opts = {
|
||||
'format': 'best[height<=720]',
|
||||
'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
|
||||
'logger': MyLogger(),
|
||||
'logger': MyLogger(), # suppress verbosity
|
||||
'progress_hooks': [self.post_download_hook],
|
||||
'updatetime': False
|
||||
'updatetime': False,
|
||||
# File is also used by firefox so make sure to not write to it!
|
||||
# youtube-dl apparently does not support cookies.sqlite, and the documentation is not clear on how to use cookies.txt
|
||||
}
|
||||
try:
|
||||
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
|
||||
@@ -46,5 +48,9 @@ class YouTubeDownloader:
|
||||
except Exception as e:
|
||||
logger.error(f"Youtube download crashed: {e}")
|
||||
article_object.file_name = ""
|
||||
logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv")
|
||||
logger.info(f"Logging youtube errors seperately to {logfile}")
|
||||
with open(logfile, "a+") as f:
|
||||
f.write(f"{url}\n")
|
||||
|
||||
return article_object
|
||||
|
Reference in New Issue
Block a user