Many bug fixes
@@ -9,6 +9,9 @@ shrink_sizes = []

def shrink_pdf(article):
    initial_size = os.path.getsize(article.save_path + article.file_name)
    if article.file_name[-4:] != ".pdf":
        return article # it probably was a youtube video

    c = subprocess.run(
        ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'],
        stdout=subprocess.PIPE,

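The hunk above guards the Ghostscript compression step against non-PDF files. As a standalone illustration of the same technique (not the repository's code; the function name and paths are made up), re-distilling a PDF with gs via subprocess looks roughly like this:

import os
import subprocess

def compress_pdf(src, dst):
    # sketch: shrink src into dst with Ghostscript's /screen preset, return new/old size ratio
    initial_size = os.path.getsize(src)
    # argv is passed as a list, so no shell is involved and the paths need no extra quoting
    subprocess.run(
        ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen",
         "-dNOPAUSE", "-dBATCH", f"-sOutputFile={dst}", src],
        stdout=subprocess.PIPE,
        check=True,
    )
    return os.path.getsize(dst) / initial_size
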
@@ -7,10 +7,10 @@ import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import configuration
import json

config = configuration.parsed["DOWNLOADS"]


blacklisted = json.loads(config["blacklisted_href_domains"])

class PDFDownloader:
    """Saves a given url. Fills the object it got as a parameter"""
@@ -61,10 +61,6 @@ class PDFDownloader:
        self.autostart()
        url = article_object.article_url

        # arbitrary bug fixes:
        if "focus.de" in url or "bloomberg.com" in url:
            url = url.replace("https://", "https://outline.com/")
            sleep_time += 5
        try:
            self.driver.get(url)
        except Exception as e:
@@ -97,7 +93,7 @@ class PDFDownloader:

        if success:
            article_object.file_name = fname
            article_object.set_references = self.get_references()
            article_object.set_references(self.get_references())
        else:
            article_object.file_name = ""

@@ -140,10 +136,12 @@ class PDFDownloader:
            hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
        except:
            hrefs = []
        # TODO TEST THIS
        old = hrefs
        hrefs = [h for h in hrefs \
            if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
            if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
            ] # filter a tiny bit at least
        diff = set(old) ^ set(hrefs)
        self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before:{len(old)}, after: {len(hrefs)})")
        return hrefs

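The rewritten comprehension above keeps a link only when none of the blacklisted domains occur in it; the old bool([...]) test was truthy whenever the list was non-empty, so it never filtered anything out. A minimal sketch of the same rule, with made-up blacklist values and any() in place of the sum() trick:

import json

# illustrative blacklist; in the project it comes from config["blacklisted_href_domains"]
blacklisted = json.loads('["facebook.com", "twitter.com"]')

def filter_hrefs(hrefs):
    # keep h only if no blacklisted domain appears in it (any() short-circuits, sum() does not)
    return [h for h in hrefs if not any(domain in h for domain in blacklisted)]

print(filter_hrefs(["https://example.org/a", "https://twitter.com/share"]))
# -> ['https://example.org/a']
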
@@ -1,33 +1,65 @@
import logging
from __future__ import unicode_literals
import youtube_dl
import os
from pytube import YouTube
import logging

logger = logging.getLogger(__name__)


def save_video(article_object):
    """Saves video according to url and save path"""
    url = article_object.article_url
    logger.info("Saving new video")
    try:
        yt = YouTube(url)
        streams = yt.streams.filter(progressive=True).order_by('resolution')
    except Exception as e:
        article_object.file_name = "ERROR: {}".format(e)
        return article_object
class MyLogger(object):
    def debug(self, msg): pass
    def warning(self, msg): pass
    def error(self, msg):
        logger.error(msg)

    if streams: # if it's not empty
        vid = streams[-1]
        article_object.source_name = "youtube.com"
        article_object.title = yt.title

class YouTubeDownloader:
    def __init__(self) -> None:
        pass


    def post_download_hook(self, ret_code):
        # print(ret_code)
        if ret_code['status'] == 'finished':
            file_loc = ret_code["filename"]
            fname = os.path.basename(file_loc)
            self.article_object.file_name = fname


    def save_video(self, article_object):
        """Saves video according to url and save path"""
        self.article_object = article_object
        url = article_object.article_url
        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
        ydl_opts = {
            'format': 'best[height<=720]',
            'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
            'logger': MyLogger(),
            'progress_hooks': [self.post_download_hook],
            'updatetime': False
        }
        try:
            vid.download(file_path)
            article_object.file_name = article_object.fname_template
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
                # article file name is updated in self.post_download_hook
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = "Error while downloading"
    else:
        article_object.file_name = "No streams available"

    return article_object
            article_object.file_name = ""

        return article_object


# class DummyArticle:
#     article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html"
#     save_path = "/app/file_storage/"
#     fname_template = "www.youtube.com -- Test"
#     file_name = ""

# m = DummyArticle()
# t = YouTubeDownloader()
# t.save_video(m)

# print(m.file_name)

@@ -37,24 +37,28 @@ def get_description(article_object):
    except:
        news_article = fallback


    if news_article.title:
        title = news_article.title
    else:
        title = fallback.title


    if news_article.summary:
        summary = news_article.summary
    elif news_article.text:
        ind = min(500, len(news_article.text))
        summary = news_article.text[:ind] + "..."
    else:
        summary = fallback.summary
        summary = fallback.summary

    if news_article.meta_lang:
        lang = news_article.meta_lang
    else:
        lang = ""

    article_object.title = title
    article_object.summary = summary
    article_object.language = lang
    article_object.set_authors(news_article.authors)
    article_object.set_keywords(news_article.keywords)


    return article_object

@@ -9,10 +9,10 @@ def upload_to_archive(article_object):
    try:
        wayback = WaybackMachineSaveAPI(url, user_agent)
        archive_url = wayback.save()
        logger.info(f"{url} uploaded to archive successfully")
        # logger.info(f"{url} uploaded to archive successfully")
        article_object.archive_url = archive_url
    except Exception as e:
        article_object.archive_url = "Error while uploading: {}".format(e)
        logger.error(f"Error while generating new url: {e}")
        logger.error(f"Error while generating archive url: {e}")

    return article_object
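For reference, the hunk above relies on waybackpy's save API. A self-contained sketch of that call (the URL and user agent string are placeholders, not values from the project):

from waybackpy import WaybackMachineSaveAPI

url = "https://example.org/some-article"           # placeholder
user_agent = "news-archiver/0.1 (contact: admin)"  # placeholder
wayback = WaybackMachineSaveAPI(url, user_agent)
archive_url = wayback.save()  # returns the snapshot URL on success
print(archive_url)
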
@@ -1,7 +1,6 @@
from threading import Thread
import time
import logging
# logger = logging.getLogger(__name__)


class TemplateWorker(Thread):
@@ -34,7 +33,6 @@ class TemplateWorker(Thread):


    def _handle_article(self, article_watcher, action=None):
        # TODO Overload in children classes
        if action is None:
            self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
        else:

@@ -1,6 +1,6 @@
from .worker_template import TemplateWorker
from .download.browser import PDFDownloader
from .download.youtube import save_video
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
class DownloadWorker(TemplateWorker):
    def __init__(self) -> None:
        self.dl_runner = PDFDownloader().download
        self.yt_runner = save_video
        self.yt_runner = YouTubeDownloader().save_video
        super().__init__()

    def _handle_article(self, article_watcher):

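The workers above follow the template pattern from TemplateWorker: each child wires up its runner(s) in __init__ and presumably forwards one of them to the base _handle_article() as the action argument. A rough sketch of that override pattern (class and attribute names here are illustrative, not the repository's actual implementation):

from threading import Thread
import logging

class SketchTemplateWorker(Thread):
    def __init__(self) -> None:
        super().__init__()
        self.logger = logging.getLogger(self.__class__.__name__)

    def _handle_article(self, article, action=None):
        if action is None:
            # mirrors the base-class error path shown above
            self.logger.error("Unoverloaded call of _handle_article().")
            return article
        return action(article)  # the child-supplied runner does the actual work

class SketchDownloadWorker(SketchTemplateWorker):
    def __init__(self, runner) -> None:
        # e.g. runner = PDFDownloader().download or YouTubeDownloader().save_video
        self.runner = runner
        super().__init__()

    def _handle_article(self, article):
        return super()._handle_article(article, action=self.runner)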