Better structure

app/utils_worker/_init__.py (new file, 0 lines)

app/utils_worker/compress/runner.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import os
import subprocess
import logging

import configuration

logger = logging.getLogger(__name__)
config = configuration.parsed["DOWNLOADS"]

shrink_sizes = []

def shrink_pdf(article):
    initial_size = os.path.getsize(article.save_path + article.file_name)
    compressed_tmp = f"{config['default_download_path']}/compressed.pdf"
    c = subprocess.run(
        ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH",
         f"-sOutputFile={compressed_tmp}",
         article.save_path + article.file_name],  # argv list: no extra shell quoting around the path
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    if c.returncode == 0:
        m = subprocess.run(
            ["mv", "-f", compressed_tmp, article.save_path + article.file_name],
            stdout=subprocess.PIPE,  # capture output so it can be logged in the error branch
            stderr=subprocess.PIPE
        )
        if m.returncode == 0:
            final_size = os.path.getsize(article.save_path + article.file_name)
            shrink_sizes.append(initial_size - final_size)
            logger.info(f"Compression worked. Avg shrinkage: {sum(shrink_sizes)/len(shrink_sizes) / 1000} kB")
            return article  # the file changed on disk, but the article object itself is unmodified
        else:
            logger.error(f"Compression ran but I could not copy back the file {m.stderr.decode()} - {m.stdout.decode()}")
    else:
        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")

    return article
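
For reference, a minimal sketch of how shrink_pdf could be called; the SimpleNamespace stand-in and the paths are assumptions, not part of this commit. Note that save_path and file_name are concatenated directly, so save_path is expected to end with a separator.

# hypothetical driver for shrink_pdf, assuming gs (Ghostscript) is installed
from types import SimpleNamespace

article = SimpleNamespace(save_path="/tmp/downloads/", file_name="paper.pdf")
article = shrink_pdf(article)  # compresses the file in place, returns the object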

app/utils_worker/download/__init__.py (new file, 0 lines)

app/utils_worker/download/browser.py (new file, 158 lines)
@@ -0,0 +1,158 @@
import time
import datetime
import logging
import os
import base64
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import configuration

config = configuration.parsed["DOWNLOADS"]


class PDFDownloader:
    """Saves a given url. Fills the object it got as a parameter"""
    logger = logging.getLogger(__name__)
    # status variable used for restarting:
    running = False

    def start(self):
        options = Options()
        options.profile = config["browser_profile_path"]
        # TODO: Get headless mode interactively
        options.add_argument('--headless')
        # options.add_argument("--disable-infobars")
        # options.set_preference("javascript.enabled", False)
        # options.add_argument("--disable-popup-blocking")
        # Print to pdf
        options.set_preference("print_printer", "Mozilla Save to PDF")
        options.set_preference("print.always_print_silent", True)
        options.set_preference("print.show_print_progress", False)
        options.set_preference('print.save_as_pdf.links.enabled', True)
        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
        # Save existing pdf
        options.set_preference("browser.download.folderList", 2)
        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
        # options.set_preference("pdfjs.disabled", True)
        options.set_preference("browser.download.dir", config["default_download_path"])

        self.logger.info("Now starting gecko driver")
        self.driver = webdriver.Firefox(options=options)

        # clear out any leftover files from a previous run
        residues = os.listdir(config["default_download_path"])
        for res in residues:
            os.remove(os.path.join(config["default_download_path"], res))

        self.running = True

    def autostart(self):
        if not self.running:
            self.start()  # relaunch the dl util

    def finish(self):
        self.driver.quit()
        self.running = False


    def download(self, article_object):
        sleep_time = 1
        self.autostart()
        url = article_object.article_url

        # arbitrary bug fixes:
        if "focus.de" in url or "bloomberg.com" in url:
            url = url.replace("https://", "https://outline.com/")
            sleep_time += 5
        try:
            self.driver.get(url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
            return article_object  # without changes

        time.sleep(sleep_time)
        # leave the page time to do any funky business

        # in the meantime, get a page title if required
        if article_object.is_title_bad:
            article_object.title = self.driver.title.replace(".pdf", "")
            # will be propagated to dst as well

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
        if os.path.exists(dst):
            fname = make_path_unique(fname)
            dst = os.path.join(article_object.save_path, fname)

        if url.endswith(".pdf"):
            # according to the browser preferences, calling the url would open pdfjs.
            # If not handled separately, printing would require the ctrl+p route, whose setup is janky to say the least
            success = self.get_exisiting_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

        if success:
            article_object.file_name = fname
            article_object.set_references(self.get_references())  # assuming set_references is a model method, like set_authors
        else:
            article_object.file_name = ""

        return article_object  # this change is saved later manually


    def get_exisiting_pdf(self, url, dst):
        # fetch the pdf directly instead of printing the pdfjs viewer
        try:
            r = requests.get(url)
            pdf_bytes = r.content
        except Exception:
            return False
        return self.get_new_pdf(dst, other_bytes=pdf_bytes)


    def get_new_pdf(self, dst, other_bytes=None):
        os.makedirs(os.path.dirname(dst), exist_ok=True)

        if other_bytes is None:
            try:
                result = self.driver.print_page()
                pdf_bytes = base64.b64decode(result, validate=True)
            except Exception:
                self.logger.error("Failed, probably because the driver went extinct.")
                return False
        else:
            pdf_bytes = other_bytes

        try:
            with open(dst, "wb+") as f:
                f.write(pdf_bytes)
            return True
        except Exception as e:
            self.logger.error(f"Failed because of FS-operation: {e}")
            return False


    def get_references(self):
        try:
            hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
        except Exception:
            hrefs = []
        # TODO TEST THIS
        hrefs = [h for h in hrefs
                 if not any(domain in h for domain in config["blacklisted_href_domains"])
                 ]  # filter a tiny bit at least: drop blacklisted domains
        return hrefs


def make_path_unique(path):
    """Appends a timestamp so a second download does not overwrite the first"""
    fname, ending = os.path.splitext(path)
    fname += datetime.datetime.now().strftime("%d-%H%M%S")
    return fname + ending
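
A possible usage sketch follows; the stand-in attributes are assumptions. The downloader only reads article_url, save_path, fname_template and is_title_bad, and calls set_references on success:

# hypothetical usage, not part of this commit
from types import SimpleNamespace

article = SimpleNamespace(
    article_url="https://example.com/story",
    save_path="/tmp/downloads/",
    fname_template="story.pdf",
    is_title_bad=False,
    file_name="",
    title="",
    set_references=lambda refs: None,  # stubbed; the real model stores them
)
downloader = PDFDownloader()
article = downloader.download(article)  # autostart() launches Firefox on first use
print(article.file_name or "download failed")
downloader.finish()  # quits the driver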

app/utils_worker/download/runner.py (new file, 0 lines)

app/utils_worker/download/youtube.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import logging
from pytube import YouTube

logger = logging.getLogger(__name__)


def save_video(article_object):
    """Saves the video according to url and save path"""
    url = article_object.article_url
    logger.info("Saving new video")
    try:
        yt = YouTube(url)
        streams = yt.streams.filter(progressive=True).order_by('resolution')
    except Exception as e:
        article_object.file_name = "ERROR: {}".format(e)
        return article_object

    if streams:  # if it's not empty
        vid = streams[-1]  # highest-resolution progressive stream
        article_object.source_name = "youtube.com"
        article_object.title = yt.title
        try:
            # pytube expects the directory and the file name as separate arguments
            vid.download(output_path=article_object.save_path, filename=article_object.fname_template)
            article_object.file_name = article_object.fname_template
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = "Error while downloading"
    else:
        article_object.file_name = "No streams available"

    return article_object
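
save_video follows the same convention as the other runners: take the article object, mutate it, hand it back. A hedged sketch with placeholder values:

# hypothetical call, not part of this commit
from types import SimpleNamespace

article = SimpleNamespace(
    article_url="https://www.youtube.com/watch?v=jNQXAC9IVRw",
    save_path="/tmp/downloads",
    fname_template="video.mp4",
    source_name="", title="", file_name="",
)
article = save_video(article)
print(article.file_name)  # fname_template on success, an error string otherwise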

app/utils_worker/fetch/runner.py (new file, 60 lines)
@@ -0,0 +1,60 @@
import datetime
import logging
from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date

for noisy in ("newspaper", "urllib", "urllib3.poolmanager", "htmldate", "charset_normalizer"):
    logging.getLogger(noisy).setLevel(logging.ERROR)  # quieter logs
logger = logging.getLogger("fetch")


class NewspaperDummy:
    """Fallback carrying the same attributes a parsed newspaper.Article would"""
    title = "Error while running fetch"
    summary = "Error while running fetch"
    text = "Error while running fetch"
    authors = []
    keywords = []


def get_description(article_object):
    url = article_object.article_url
    website = urlparse(url).netloc
    article_object.source_name = website
    try:
        # find_date returns an iso-formatted string such as "2022-05-31"
        pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
    except Exception:  # other file types
        pub_date = datetime.datetime(year=1900, month=1, day=1)
    article_object.pub_date = pub_date

    fallback = NewspaperDummy()
    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except Exception:
        news_article = fallback

    if news_article.title:
        title = news_article.title
    else:
        title = fallback.title

    if news_article.summary:
        summary = news_article.summary
    elif news_article.text:
        ind = min(500, len(news_article.text))
        summary = news_article.text[:ind] + "..."
    else:
        summary = fallback.summary

    article_object.title = title
    article_object.summary = summary
    article_object.set_authors(news_article.authors)
    article_object.set_keywords(news_article.keywords)

    return article_object
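
A short sketch of the fetch runner; set_authors and set_keywords are stubbed here because they belong to the real article model:

# hypothetical call, not part of this commit
from types import SimpleNamespace

article = SimpleNamespace(
    article_url="https://example.com/news/story",
    set_authors=lambda authors: None,    # stubbed model methods
    set_keywords=lambda keywords: None,
)
article = get_description(article)
print(article.source_name, article.pub_date.date(), article.title)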

app/utils_worker/upload/runner.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from waybackpy import WaybackMachineSaveAPI  # upload to archive.org
import logging
logger = logging.getLogger(__name__)

def upload_to_archive(article_object):
    """Uploads to archive.org and returns the article object with the archived url set"""
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"  # needed?
    url = article_object.article_url
    try:
        wayback = WaybackMachineSaveAPI(url, user_agent)
        archive_url = wayback.save()
        logger.info(f"{url} uploaded to archive successfully")
        article_object.archive_url = archive_url
    except Exception as e:
        article_object.archive_url = "Error while uploading: {}".format(e)
        logger.error(f"Error while generating archive url: {e}")

    return article_object
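
Note that upload_to_archive never raises; a failure ends up as an error string in archive_url. A minimal sketch under the same stand-in assumption as above:

# hypothetical call, not part of this commit
from types import SimpleNamespace

article = SimpleNamespace(article_url="https://example.com/news/story", archive_url="")
article = upload_to_archive(article)
print(article.archive_url)  # snapshot url, or "Error while uploading: ..."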

app/utils_worker/worker_template.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from threading import Thread
import time
import logging


class TemplateWorker(Thread):
    """Parent class for any subsequent worker of the article-download pipeline. They should all run in parallel, hence the Thread subclassing"""
    logger = logging.getLogger(__name__)

    def __init__(self, *args, **kwargs) -> None:
        target = self._queue_processor  # will be executed on Worker.start()
        group = kwargs.get("group", None)
        name = kwargs.get("name", None)

        super().__init__(group=group, target=target, name=name)
        self._article_queue = []
        self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")


    def process(self, article_watcher):
        self._article_queue.append(article_watcher)


    def _queue_processor(self):
        """This method is launched by thread.run() and idles when self._article_queue is empty. When an external caller appends to the queue it jumps into action"""
        while True:  # PLEASE tell me if I'm missing an obvious better way of doing this!
            if len(self._article_queue) == 0:
                time.sleep(5)
            else:
                article_watcher = self._article_queue.pop(0)
                self.logger.info(f"{self.__class__.__name__} is now processing article ({len(self._article_queue)} in queue)")
                self._handle_article(article_watcher)


    def _handle_article(self, article_watcher, action=None):
        # TODO: overload in child classes
        if action is None:
            self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
        else:
            article = article_watcher.article
            article = action(article)  # action updates the article object but does not save the change
            article.save()
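
The intended pattern is that child classes override _handle_article and pass their action back into this parent implementation. A minimal sketch under that assumption; NoopWorker and the watcher stub are illustrations only:

# hypothetical subclass, not part of this commit
from types import SimpleNamespace

class NoopWorker(TemplateWorker):
    def _handle_article(self, article_watcher):
        super()._handle_article(article_watcher, action=lambda article: article)

worker = NoopWorker()
worker.start()  # runs _queue_processor in its own (non-daemon, never-ending) thread
watcher = SimpleNamespace(article=SimpleNamespace(save=lambda: None))
worker.process(watcher)  # picked up within the 5-second polling interval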

app/utils_worker/workers.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from .worker_template import TemplateWorker
from .download.browser import PDFDownloader
from .download.youtube import save_video
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf

import logging
logger = logging.getLogger(__name__)

class DownloadWorker(TemplateWorker):
    def __init__(self) -> None:
        self.dl_runner = PDFDownloader().download
        self.yt_runner = save_video
        super().__init__()

    def _handle_article(self, article_watcher):
        article = article_watcher.article
        u = article.article_url

        if "youtu.be/" in u or "youtube.com/" in u:
            action = self.yt_runner
        else:
            action = self.dl_runner

        super()._handle_article(article_watcher, action)
        article_watcher.download_completed = True


class FetchWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = get_description  # function
        super()._handle_article(article_watcher, action)
        article_watcher.fetch_completed = True


class UploadWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = run_upload  # function
        super()._handle_article(article_watcher, action)
        article_watcher.upload_completed = True


class CompressWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = shrink_pdf
        super()._handle_article(article_watcher, action)
        article_watcher.compression_completed = True
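
How the callers sequence these four stages is outside this commit; a hedged sketch of a driver, with the watcher stand-in assuming the fields each worker touches (the real article model would also need the attributes its runners read):

# hypothetical pipeline driver, not part of this commit
from types import SimpleNamespace

workers = [DownloadWorker(), FetchWorker(), UploadWorker(), CompressWorker()]
for w in workers:
    w.start()  # each thread now idles on its own queue

watcher = SimpleNamespace(
    article=SimpleNamespace(article_url="https://example.com/story", save=lambda: None),
    download_completed=False, fetch_completed=False,
    upload_completed=False, compression_completed=False,
)
for w in workers:
    w.process(watcher)  # each worker pops the watcher, runs its action and saves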