new component - upload to NAS
This commit is contained in:
		
							
								
								
									
										0
									
								
								news_fetch/app/utils_worker/download/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								news_fetch/app/utils_worker/download/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										172
									
								
								news_fetch/app/utils_worker/download/browser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								news_fetch/app/utils_worker/download/browser.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,172 @@
 | 
			
		||||
import time
 | 
			
		||||
import datetime
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import base64
 | 
			
		||||
import requests
 | 
			
		||||
from selenium import webdriver
 | 
			
		||||
import configuration
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
# Settings of the "DOWNLOADS" section of the parsed project configuration.
config = configuration.parsed["DOWNLOADS"]
 | 
			
		||||
# The "blacklisted_href_domains" setting holds a JSON-encoded string; it is decoded once at import time.
blacklisted = json.loads(config["blacklisted_href_domains"])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PDFDownloader:
    """Saves a given url as a PDF. Fills the article object it got as a parameter.

    A Firefox session is driven through a remote geckodriver. Regular pages
    are printed to PDF by the browser; urls that already point to a PDF are
    fetched directly with ``requests``.
    """

    logger = logging.getLogger(__name__)
    # status-variable for restarting: True while a driver session is believed to be open
    running = False
    # seconds to wait for the server when fetching an already existing PDF
    request_timeout = 30

    def start(self):
        """(Re)start the remote browser session and empty the download folder."""
        self.finish()  # clear up

        options = webdriver.FirefoxOptions()
        options.profile = config["browser_profile_path"]
        # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work

        if os.getenv("HEADLESS", "false") == "true":
            options.add_argument('--headless')
        else:
            self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")

        options.set_preference('print.save_as_pdf.links.enabled', True)
        # Just save if the filetype is pdf already, does not work!

        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
        options.set_preference("browser.download.folderList", 2)
        options.set_preference("browser.download.dir", config["default_download_path"])

        self.logger.info("Starting gecko driver")
        # A remote session is used instead of a local webdriver.Firefox;
        # with the remote driver the geckodriver log path can't be set.
        self.driver = webdriver.Remote(
            command_executor='http://geckodriver:4444',
            options=options,
        )

        # remove whatever a previous session left in the download folder
        download_dir = config["default_download_path"]
        for residue in os.listdir(download_dir):
            os.remove(os.path.join(download_dir, residue))

        self.running = True

    def autostart(self):
        """Start the driver unless it is already flagged as running."""
        if not self.running:
            self.start()  # relaunch the dl util

    def finish(self):
        """Quit the driver session (if any) and mark the downloader as stopped."""
        if not self.running:
            self.logger.info("Gecko driver not yet running")
            return

        self.logger.info("Exiting gecko driver")
        try:
            self.driver.quit()
            time.sleep(10)
        except Exception:
            # Best effort: a dead session must not stop the worker, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            self.logger.critical("Connection to the driver broke off")
        self.running = False

    def download(self, article_object):
        """Open the article url and store it as a PDF.

        Reads ``article_url``, ``is_title_bad``, ``fname_template`` and
        ``save_path`` from ``article_object``. On success ``file_name`` is set
        and the page's links are passed to ``set_references``; on failure
        ``file_name`` is set to ``""``. If the page can't be opened at all the
        object is returned without changes.

        Returns:
            The same ``article_object``; persisting it is left to the caller.
        """
        sleep_time = 2
        self.autostart()
        url = article_object.article_url

        try:
            self.driver.get(url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
            return article_object  # without changes

        # leave the page time to do any funky business
        time.sleep(sleep_time)

        # in the mean time, get a page title if required
        if article_object.is_title_bad:
            article_object.title = self.driver.title.replace(".pdf", "")
            # will be propagated to the saved file (dst) as well
            # (presumably fname_template is derived from the title - verify on the model)

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
        if os.path.exists(dst):
            fname = make_path_unique(fname)
            dst = os.path.join(article_object.save_path, fname)

        if url.lower().endswith(".pdf"):
            # according to the browser preferences, calling the url will open pdfjs.
            # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
            success = self.get_exisiting_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

        if success:
            article_object.file_name = fname
            article_object.set_references(self.get_references())
        else:
            article_object.file_name = ""

        return article_object  # this change is saved later by the external caller

    def get_exisiting_pdf(self, url, dst):
        """Fetch a url that already is a PDF and write its content to ``dst``.

        Returns:
            True if the file was written, False if the request failed (network
            error, timeout or HTTP error status) or the file couldn't be saved.
        """
        try:
            response = requests.get(url, timeout=self.request_timeout)
            # don't store an error page under a .pdf name
            response.raise_for_status()
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch existing pdf: {e}")
            return False
        return self.get_new_pdf(dst, other_bytes=response.content)

    def get_new_pdf(self, dst, other_bytes=None):
        """Write a PDF to ``dst``.

        Args:
            dst: Target file path; missing parent folders are created.
            other_bytes: Ready-made file content. When None, the page currently
                open in the browser is printed to PDF instead.

        Returns:
            True if the file was written, False otherwise.
        """
        if other_bytes is None:
            try:
                # print_page() hands back the document base64-encoded
                encoded = self.driver.print_page()
                payload = base64.b64decode(encoded, validate=True)
            except Exception:
                self.logger.error("Failed, probably because the driver went extinct.")
                return False
        else:
            payload = other_bytes

        try:
            parent = os.path.dirname(dst)
            if parent:  # makedirs("") raises for a bare file name
                os.makedirs(parent, exist_ok=True)
            with open(dst, "wb") as f:
                f.write(payload)
            return True
        except Exception as e:
            self.logger.error(f"Failed, because of FS-operation: {e}")
            return False

    def get_references(self):
        """Collect the link targets of the current page, minus blacklisted domains.

        Returns:
            A list of href strings that contain none of the ``blacklisted``
            domain strings. Empty if the links can't be read from the driver.
        """
        try:
            # "xpath" is the value of selenium's By.XPATH; the
            # find_elements_by_* helpers no longer exist in current Selenium 4.
            anchors = self.driver.find_elements("xpath", "//a[@href]")
            hrefs = [a.get_attribute("href") for a in anchors]
        except Exception as e:
            self.logger.warning(f"Could not read references from page: {e}")
            hrefs = []

        len_old = len(hrefs)
        # filter a tiny bit at least
        hrefs = [
            h for h in hrefs
            if h and not any(domain in h for domain in blacklisted)
        ]
        self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})")
        return hrefs
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def make_path_unique(path):
    """Return ``path`` with a day/time stamp inserted before its extension.

    The stamp has the form ``DD-HHMMSS`` (current local time) and is appended
    directly to the part of the path in front of the last extension.
    """
    stem, extension = os.path.splitext(path)
    stamp = datetime.datetime.now().strftime("%d-%H%M%S")
    return f"{stem}{stamp}{extension}"
 | 
			
		||||
							
								
								
									
										0
									
								
								news_fetch/app/utils_worker/download/runner.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								news_fetch/app/utils_worker/download/runner.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										51
									
								
								news_fetch/app/utils_worker/download/youtube.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										51
									
								
								news_fetch/app/utils_worker/download/youtube.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,51 @@
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
import youtube_dl
 | 
			
		||||
import os
 | 
			
		||||
import logging
 | 
			
		||||
 | 
			
		||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MyLogger(object):
    """Logger object handed to youtube_dl: only error messages are kept."""

    def debug(self, msg):
        """Discard debug messages."""
        return None

    def warning(self, msg):
        """Discard warnings."""
        return None

    def error(self, msg):
        """Pass error messages on to this module's logger."""
        logger.error(msg)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class YouTubeDownloader:
    """Downloads the video behind an article url with youtube_dl."""

    def __init__(self) -> None:
        """Nothing to prepare; the article object is attached in save_video."""

    def post_download_hook(self, ret_code):
        """Progress hook: record the downloaded file's name once it is finished.

        ``ret_code`` is the status dict passed to the hook; only entries whose
        ``status`` is ``finished`` are acted upon.
        """
        if ret_code['status'] != 'finished':
            return
        # only the bare file name is stored on the article, not the full path
        self.article_object.file_name = os.path.basename(ret_code["filename"])

    def save_video(self, article_object):
        """Download the video at the article's url into the article's save path.

        The file name is written to ``article_object.file_name`` by
        ``post_download_hook``; if the download raises, it is set to ``""``.
        Returns the same ``article_object``.
        """
        self.article_object = article_object
        video_url = article_object.article_url
        logger.info("Saving new video")
        target = os.path.join(article_object.save_path, article_object.fname_template)
        downloader_settings = {
            'format': 'best[height<=720]',
            # the filename from the object, with an extension that depends on the download
            'outtmpl': f"{target}.%(ext)s",
            'logger': MyLogger(),
            'progress_hooks': [self.post_download_hook],
            'updatetime': False,
        }

        try:
            with youtube_dl.YoutubeDL(downloader_settings) as session:
                # article file name is updated in self.post_download_hook
                session.download([video_url])
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""

        return article_object
 | 
			
		||||
		Reference in New Issue
	
	Block a user