Bug fixes, makefile for launch
This commit is contained in:
		| @@ -11,7 +11,7 @@ from selenium import webdriver | ||||
|  | ||||
| import configuration | ||||
|  | ||||
| config = configuration.main_config["DOWNLOADS"] | ||||
| download_config = configuration.config["downloads"] | ||||
|  | ||||
| def driver_running(f): | ||||
|     def wrapper(*args, **kwargs): | ||||
| @@ -66,74 +66,88 @@ class PDFDownloader: | ||||
|  | ||||
|     @driver_running | ||||
|     def download(self, article_object): | ||||
|         sleep_time = int(config["browser_print_delay"]) | ||||
|         url = article_object.article_url | ||||
|  | ||||
|  | ||||
|         if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly | ||||
|             self.logger.info("Downloading existing pdf") | ||||
|             success = self.get_exisiting_pdf(article_object) | ||||
|             # get a page title if required | ||||
|             if article_object.is_title_bad: | ||||
|                 article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf | ||||
|                 # will be propagated to the saved file (dst) as well | ||||
|         else: | ||||
|             success = self.get_new_pdf(article_object) | ||||
|  | ||||
|         if not success: | ||||
|             self.logger.error("Download failed") | ||||
|         # TODO: need to reset the file name to empty? | ||||
|         return article_object # changes to this are saved later by the external caller | ||||
|  | ||||
|  | ||||
|     def get_exisiting_pdf(self, article_object): | ||||
|         # get a better page title if required | ||||
|         if article_object.is_title_bad: | ||||
|             article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0] | ||||
|         try: | ||||
|             self.driver.get(url) | ||||
|             r = requests.get(article_object.article_url) | ||||
|             bytes = r.content | ||||
|         except: | ||||
|             return False | ||||
|         return self.write_pdf(bytes, article_object) | ||||
|  | ||||
|  | ||||
|     def get_new_pdf(self, article_object): | ||||
|         sleep_time = int(download_config["browser_print_delay"]) | ||||
|  | ||||
|         try: | ||||
|             self.driver.get(article_object.article_url) | ||||
|         except Exception as e: | ||||
|             self.logger.critical("Selenium .get(url) failed with error {}".format(e)) | ||||
|             self.finish() | ||||
|             return article_object  # without changes | ||||
|             return False | ||||
|          | ||||
|         time.sleep(sleep_time) | ||||
|         # leave the page time to do any funky business | ||||
|  | ||||
|         # in the mean time, get a page title if required | ||||
|         if article_object.is_title_bad: | ||||
|             article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf | ||||
|             # will be propagated to the saved file (dst) as well | ||||
|             article_object.title = self.driver.title | ||||
|  | ||||
|         try: | ||||
|             result = self.driver.print_page() | ||||
|             bytes = base64.b64decode(result, validate=True) | ||||
|         except: | ||||
|             self.logger.error("Failed, probably because the driver went extinct.") | ||||
|             return False | ||||
|  | ||||
|         return self.write_pdf(bytes, article_object) | ||||
|  | ||||
|  | ||||
|     def get_file_destination(self, article_object): | ||||
|         fname = article_object.fname_template | ||||
|         fname = ensure_unique(article_object.save_path, fname) | ||||
|         dst = os.path.join(article_object.save_path, fname) | ||||
|         return dst, fname | ||||
|  | ||||
|  | ||||
|         if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly | ||||
|             success = self.get_exisiting_pdf(url, dst) | ||||
|         else: | ||||
|             success = self.get_new_pdf(dst) | ||||
|  | ||||
|         if success: | ||||
|             article_object.file_name = fname | ||||
|         else: | ||||
|             article_object.file_name = "" | ||||
|          | ||||
|         return article_object # this change is saved later by the external caller | ||||
|  | ||||
|  | ||||
|     def get_exisiting_pdf(self, url, dst): | ||||
|         try: | ||||
|             r = requests.get(url) | ||||
|             bytes = r.content | ||||
|         except: | ||||
|             return False | ||||
|         return self.get_new_pdf(dst, other_bytes=bytes) | ||||
|  | ||||
|  | ||||
|     def get_new_pdf(self, dst, other_bytes=None): | ||||
|     def write_pdf(self, content, article_object): | ||||
|         dst, fname = self.get_file_destination(article_object) | ||||
|         os.makedirs(os.path.dirname(dst), exist_ok=True) | ||||
|  | ||||
|         if other_bytes is None: | ||||
|             try: | ||||
|                 result = self.driver.print_page() | ||||
|                 bytes = base64.b64decode(result, validate=True) | ||||
|             except: | ||||
|                 self.logger.error("Failed, probably because the driver went extinct.") | ||||
|                 return False | ||||
|         else: | ||||
|             bytes = other_bytes | ||||
|  | ||||
|          | ||||
|         try: | ||||
|             with open(dst, "wb+") as f: | ||||
|                 f.write(bytes) | ||||
|                 f.write(content) | ||||
|              | ||||
|             article_object.file_name = fname | ||||
|             return True | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"Failed, because of FS-operation: {e}") | ||||
|             return False | ||||
|  | ||||
|  | ||||
|     def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path: | ||||
|  | ||||
|          | ||||
|     def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path: | ||||
|         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}") | ||||
|         os.mkdir(reduced_profile_path) | ||||
|         # copy needed directories | ||||
|   | ||||
| @@ -1,10 +1,11 @@ | ||||
| import youtube_dl | ||||
| import os | ||||
| import logging | ||||
| import configuration | ||||
|  | ||||
| download_config = configuration.config["downloads"] | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class MyLogger(object): | ||||
|     def debug(self, msg): pass | ||||
|     def warning(self, msg): pass | ||||
| @@ -19,7 +20,6 @@ class YouTubeDownloader: | ||||
|  | ||||
|  | ||||
|     def post_download_hook(self, ret_code): | ||||
|         # print(ret_code) | ||||
|         if ret_code['status'] == 'finished': | ||||
|             file_loc = ret_code["filename"] | ||||
|             fname = os.path.basename(file_loc) | ||||
| @@ -35,9 +35,11 @@ class YouTubeDownloader: | ||||
|         ydl_opts = { | ||||
|             'format': 'best[height<=720]', | ||||
|             'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download | ||||
|             'logger': MyLogger(), | ||||
|             'logger': MyLogger(), # suppress verbosity | ||||
|             'progress_hooks': [self.post_download_hook], | ||||
|             'updatetime': False | ||||
|             'updatetime': False, | ||||
|             # File is also used by firefox so make sure to not write to it! | ||||
|             # youtube dl apparently does not support cookies.sqlite and the documentation is not clear on how to use cookies.txt | ||||
|         } | ||||
|         try: | ||||
|             with youtube_dl.YoutubeDL(ydl_opts) as ydl: | ||||
| @@ -46,5 +48,9 @@ class YouTubeDownloader: | ||||
|         except Exception as e: | ||||
|             logger.error(f"Youtube download crashed: {e}") | ||||
|             article_object.file_name = "" | ||||
|             logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv") | ||||
|             logger.info(f"Logging youtube errors seperately to {logfile}") | ||||
|             with open(logfile, "a+") as f: | ||||
|                 f.write(f"{url}\n") | ||||
|  | ||||
|         return article_object | ||||
|   | ||||
		Reference in New Issue
	
	Block a user