Reduced Slack functionality, higher ease of use. Database migration WIP.
This commit is contained in:
		
							
								
								
									
										0
									
								
								news_fetch/utils_worker/download/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								news_fetch/utils_worker/download/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										161
									
								
								news_fetch/utils_worker/download/browser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										161
									
								
								news_fetch/utils_worker/download/browser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,161 @@ | ||||
| import time | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| import base64 | ||||
| import requests | ||||
| from selenium import webdriver | ||||
| import configuration | ||||
| import json | ||||
|  | ||||
| config = configuration.main_config["DOWNLOADS"] | ||||
| blacklisted = json.loads(config["blacklisted_href_domains"]) | ||||
|  | ||||
|  | ||||
class PDFDownloader:
    """Save the page behind an article's URL as a PDF file.

    ``download`` receives an article object, writes a PDF for its
    ``article_url`` into ``save_path`` and records the resulting file name on
    the object. The browser session (a remote Firefox driven through
    geckodriver) is started lazily and torn down after a failed page load.
    """

    logger = logging.getLogger(__name__)
    # Status variable for restarting: True while a driver session is assumed alive.
    running = False
    # Seconds to wait when fetching an already existing pdf directly.
    # requests applies no timeout by default, which could block the worker forever.
    request_timeout = 30

    def start(self):
        """Start a fresh remote Firefox session and empty the download directory."""
        self.finish()  # clear up a previous session, if any

        options = webdriver.FirefoxOptions()
        options.profile = config["browser_profile_path"]
        # should be options.set_preference("profile", config["browser_profile_path"])
        # as of selenium 4 but that doesn't work

        if os.getenv("DEBUG", "false") == "true":
            self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
        else:
            options.add_argument('--headless')

        options.set_preference('print.save_as_pdf.links.enabled', True)
        # Just save if the filetype is pdf already
        # TODO: this is not working right now

        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
        options.set_preference("browser.download.folderList", 2)
        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
        # options.set_preference("pdfjs.disabled", True)
        options.set_preference("browser.download.dir", config["default_download_path"])

        self.logger.info("Starting gecko driver")
        # Previously (single docker image) a local webdriver.Firefox was used with
        # a geckodriver log at f'{config["local_storage_path"]}/geckodriver.log'.
        # The remote driver does not allow setting a log path.
        self.driver = webdriver.Remote(
            command_executor='http://geckodriver:4444',
            options=options,
        )
        # Flag the session as running as soon as it exists, so that finish()
        # can still quit it if something below goes wrong.
        self.running = True

        self._clear_download_dir()

    def _clear_download_dir(self):
        """Remove leftovers from the configured download directory (best effort)."""
        download_dir = config["default_download_path"]
        try:
            residues = os.listdir(download_dir)
        except OSError as e:
            self.logger.warning(f"Could not list the download directory: {e}")
            return

        for res in residues:
            try:
                os.remove(os.path.join(download_dir, res))
            except OSError as e:
                # e.g. a sub-directory or a file that vanished in the meantime
                self.logger.warning(f"Could not remove residue '{res}': {e}")

    def autostart(self):
        """Start the driver unless it is already flagged as running."""
        if not self.running:
            self.start()  # relaunch the dl util

    def finish(self):
        """Quit the driver session if one is running and reset the status flag."""
        if not self.running:
            self.logger.info("Gecko driver not yet running")
            return

        self.logger.info("Exiting gecko driver")
        try:
            self.driver.quit()
            # NOTE(review): presumably gives the remote session time to shut
            # down before a restart - confirm whether 10 s is really needed.
            time.sleep(10)
        except Exception:
            self.logger.critical("Connection to the driver broke off")
        finally:
            self.running = False

    def download(self, article_object):
        """Save ``article_object.article_url`` as a PDF and record the file name.

        Args:
            article_object: object providing ``article_url``, ``is_title_bad``,
                ``fname_template`` and ``save_path``. ``title`` (when the title
                is flagged bad) and ``file_name`` are written to it.

        Returns:
            The same ``article_object``. ``file_name`` is the saved file's name
            on success and ``""`` when saving failed. If the page could not be
            loaded at all, the object is returned unchanged.
        """
        sleep_time = 2
        self.autostart()
        url = article_object.article_url

        try:
            self.driver.get(url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
            return article_object  # without changes

        # leave the page time to do any funky business
        time.sleep(sleep_time)

        # in the mean time, get a page title if required
        if article_object.is_title_bad:
            # some titles end with .pdf
            article_object.title = self.driver.title.replace(".pdf", "")
            # will be propagated to the saved file (dst) as well

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
        if os.path.exists(dst):
            fname = make_path_unique(fname)
            dst = os.path.join(article_object.save_path, fname)

        if url.endswith(".pdf"):
            # according to the browser preferences, calling the url will open pdfjs.
            # If not handled separately, printing would require the ctrl+p route,
            # but setup is janky to say the least
            success = self.get_exisiting_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

        article_object.file_name = fname if success else ""

        return article_object  # this change is saved later by the external caller

    def get_exisiting_pdf(self, url, dst):
        """Fetch a URL that already points to a PDF and write it to ``dst``.

        Returns:
            True if the file was fetched and written, False otherwise (request
            failure, timeout, HTTP error status or file-system error).
        """
        try:
            r = requests.get(url, timeout=self.request_timeout)
            # Do not store an HTTP error page under a .pdf name.
            r.raise_for_status()
            payload = r.content
        except Exception as e:
            self.logger.error(f"Failed to fetch existing pdf: {e}")
            return False
        return self.get_new_pdf(dst, other_bytes=payload)

    def get_new_pdf(self, dst, other_bytes=None):
        """Write a PDF to ``dst``.

        Args:
            dst: target file path; missing parent directories are created.
            other_bytes: raw file content to write. When None, the page
                currently loaded in the driver is printed to PDF instead.

        Returns:
            True if the file was written, False if printing or any file-system
            operation failed.
        """
        if other_bytes is None:
            try:
                result = self.driver.print_page()
                payload = base64.b64decode(result, validate=True)
            except Exception:
                self.logger.error("Failed, probably because the driver went extinct.")
                return False
        else:
            payload = other_bytes

        try:
            parent = os.path.dirname(dst)
            if parent:  # os.makedirs("") would raise for a bare file name
                os.makedirs(parent, exist_ok=True)
            with open(dst, "wb") as f:
                f.write(payload)
        except Exception as e:
            self.logger.error(f"Failed, because of FS-operation: {e}")
            return False
        return True
|          | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
def make_path_unique(path, timestamp=None):
    """Return *path* with a day-and-time stamp inserted before the extension.

    The stamp has the form ``%d-%H%M%S`` (day of month, then time), so two
    calls within the same second produce the same result.

    Args:
        path: file name or path; its extension (as split by
            ``os.path.splitext``) is kept at the end.
        timestamp: ``datetime`` to derive the stamp from. Defaults to the
            current local time.

    Returns:
        The stamped path as a string.
    """
    if timestamp is None:
        timestamp = datetime.datetime.now()
    stem, ending = os.path.splitext(path)
    return f"{stem}{timestamp.strftime('%d-%H%M%S')}{ending}"
							
								
								
									
										0
									
								
								news_fetch/utils_worker/download/runner.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								news_fetch/utils_worker/download/runner.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										51
									
								
								news_fetch/utils_worker/download/youtube.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										51
									
								
								news_fetch/utils_worker/download/youtube.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,51 @@ | ||||
| from __future__ import unicode_literals | ||||
| import youtube_dl | ||||
| import os | ||||
| import logging | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
class MyLogger:
    """Logger adapter handed to youtube_dl: only errors are forwarded."""

    def debug(self, msg):
        """Discard debug output."""
        return None

    def warning(self, msg):
        """Discard warnings."""
        return None

    def error(self, msg):
        """Pass error messages on to this module's logger."""
        logger.error(msg)
|  | ||||
|  | ||||
|  | ||||
class YouTubeDownloader:
    """Download the video behind an article's URL using youtube_dl."""

    def __init__(self) -> None:
        # Article currently being processed. Set by save_video and read by
        # post_download_hook; None until the first call to save_video.
        self.article_object = None

    def post_download_hook(self, ret_code):
        """Progress hook for youtube_dl: record the file name once finished.

        Args:
            ret_code: status dict passed in by youtube_dl. Only the
                ``'status'`` and ``'filename'`` entries are read.
        """
        if ret_code.get('status') != 'finished':
            return
        if self.article_object is None:
            # Hook fired without a preceding save_video call; nothing to update.
            return
        file_loc = ret_code["filename"]
        self.article_object.file_name = os.path.basename(file_loc)

    def save_video(self, article_object):
        """Save the video according to the article's url and save path.

        Args:
            article_object: object providing ``article_url``, ``save_path``
                and ``fname_template``; its ``file_name`` is written.

        Returns:
            The same ``article_object``. ``file_name`` is set by
            ``post_download_hook`` when the download finishes, or to ``""``
            if the download raised.
        """
        self.article_object = article_object
        url = article_object.article_url
        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
        ydl_opts = {
            'format': 'best[height<=720]',
            # basically the filename from the object, but with a custom
            # extension depending on the download
            'outtmpl': f"{file_path}.%(ext)s",
            'logger': MyLogger(),
            'progress_hooks': [self.post_download_hook],
            'updatetime': False,
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
                # article file name is updated in self.post_download_hook
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""

        return article_object
		Reference in New Issue
	
	Block a user