reduced slack functionality, higher ease of use. Database migration wip
This commit is contained in:
0
news_fetch/utils_worker/download/__init__.py
Normal file
0
news_fetch/utils_worker/download/__init__.py
Normal file
161
news_fetch/utils_worker/download/browser.py
Normal file
161
news_fetch/utils_worker/download/browser.py
Normal file
@@ -0,0 +1,161 @@
|
||||
import time
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import base64
|
||||
import requests
|
||||
from selenium import webdriver
|
||||
import configuration
|
||||
import json
|
||||
|
||||
config = configuration.main_config["DOWNLOADS"]
|
||||
blacklisted = json.loads(config["blacklisted_href_domains"])
|
||||
|
||||
|
||||
class PDFDownloader:
|
||||
"""Saves a given url. Fills the object it got as a parameter"""
|
||||
logger = logging.getLogger(__name__)
|
||||
# status-variable for restarting:
|
||||
running = False
|
||||
|
||||
def start(self):
|
||||
self.finish() # clear up
|
||||
|
||||
options = webdriver.FirefoxOptions()
|
||||
options.profile = config["browser_profile_path"]
|
||||
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
|
||||
|
||||
if os.getenv("DEBUG", "false") == "true":
|
||||
self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
|
||||
else:
|
||||
options.add_argument('--headless')
|
||||
|
||||
options.set_preference('print.save_as_pdf.links.enabled', True)
|
||||
# Just save if the filetype is pdf already
|
||||
# TODO: this is not working right now
|
||||
|
||||
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
|
||||
options.set_preference("browser.download.folderList", 2)
|
||||
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
|
||||
# options.set_preference("pdfjs.disabled", True)
|
||||
options.set_preference("browser.download.dir", config["default_download_path"])
|
||||
|
||||
self.logger.info("Starting gecko driver")
|
||||
# peviously, in a single docker image:
|
||||
# self.driver = webdriver.Firefox(
|
||||
# options = options,
|
||||
# service = webdriver.firefox.service.Service(
|
||||
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
|
||||
# ))
|
||||
self.driver = webdriver.Remote(
|
||||
command_executor = 'http://geckodriver:4444',
|
||||
options = options,
|
||||
# can't set log path...
|
||||
)
|
||||
|
||||
residues = os.listdir(config["default_download_path"])
|
||||
for res in residues:
|
||||
os.remove(os.path.join(config["default_download_path"], res))
|
||||
|
||||
self.running = True
|
||||
|
||||
def autostart(self):
|
||||
if not self.running:
|
||||
self.start() # relaunch the dl util
|
||||
|
||||
def finish(self):
|
||||
if self.running:
|
||||
self.logger.info("Exiting gecko driver")
|
||||
try:
|
||||
self.driver.quit()
|
||||
time.sleep(10)
|
||||
except:
|
||||
self.logger.critical("Connection to the driver broke off")
|
||||
self.running = False
|
||||
else:
|
||||
self.logger.info("Gecko driver not yet running")
|
||||
|
||||
def download(self, article_object):
|
||||
sleep_time = 2
|
||||
self.autostart()
|
||||
url = article_object.article_url
|
||||
|
||||
try:
|
||||
self.driver.get(url)
|
||||
except Exception as e:
|
||||
self.logger.critical("Selenium .get(url) failed with error {}".format(e))
|
||||
self.finish()
|
||||
return article_object # without changes
|
||||
|
||||
time.sleep(sleep_time)
|
||||
# leave the page time to do any funky business
|
||||
|
||||
# in the mean time, get a page title if required
|
||||
if article_object.is_title_bad:
|
||||
article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
|
||||
# will be propagated to the saved file (dst) as well
|
||||
|
||||
fname = article_object.fname_template
|
||||
dst = os.path.join(article_object.save_path, fname)
|
||||
if os.path.exists(dst):
|
||||
fname = make_path_unique(fname)
|
||||
dst = os.path.join(article_object.save_path, fname)
|
||||
|
||||
|
||||
if url[-4:] == ".pdf":
|
||||
# according to the browser preferences, calling the url will open pdfjs.
|
||||
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
|
||||
success = self.get_exisiting_pdf(url, dst)
|
||||
else:
|
||||
success = self.get_new_pdf(dst)
|
||||
|
||||
|
||||
if success:
|
||||
article_object.file_name = fname
|
||||
else:
|
||||
article_object.file_name = ""
|
||||
|
||||
return article_object # this change is saved later by the external caller
|
||||
|
||||
|
||||
def get_exisiting_pdf(self, url, dst):
|
||||
try:
|
||||
r = requests.get(url)
|
||||
bytes = r.content
|
||||
except:
|
||||
return False
|
||||
return self.get_new_pdf(dst, other_bytes=bytes)
|
||||
|
||||
|
||||
def get_new_pdf(self, dst, other_bytes=None):
|
||||
os.makedirs(os.path.dirname(dst), exist_ok=True)
|
||||
|
||||
if other_bytes is None:
|
||||
try:
|
||||
result = self.driver.print_page()
|
||||
bytes = base64.b64decode(result, validate=True)
|
||||
except:
|
||||
self.logger.error("Failed, probably because the driver went extinct.")
|
||||
return False
|
||||
else:
|
||||
bytes = other_bytes
|
||||
|
||||
try:
|
||||
with open(dst, "wb+") as f:
|
||||
f.write(bytes)
|
||||
return True
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed, because of FS-operation: {e}")
|
||||
return False
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def make_path_unique(path):
|
||||
fname, ending = os.path.splitext(path)
|
||||
fname += datetime.datetime.now().strftime("%d-%H%M%S")
|
||||
return fname + ending
|
0
news_fetch/utils_worker/download/runner.py
Normal file
0
news_fetch/utils_worker/download/runner.py
Normal file
51
news_fetch/utils_worker/download/youtube.py
Normal file
51
news_fetch/utils_worker/download/youtube.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from __future__ import unicode_literals
|
||||
import youtube_dl
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MyLogger(object):
|
||||
def debug(self, msg): pass
|
||||
def warning(self, msg): pass
|
||||
def error(self, msg):
|
||||
logger.error(msg)
|
||||
|
||||
|
||||
|
||||
class YouTubeDownloader:
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
|
||||
def post_download_hook(self, ret_code):
|
||||
# print(ret_code)
|
||||
if ret_code['status'] == 'finished':
|
||||
file_loc = ret_code["filename"]
|
||||
fname = os.path.basename(file_loc)
|
||||
self.article_object.file_name = fname
|
||||
|
||||
|
||||
def save_video(self, article_object):
|
||||
"""Saves video accoring to url and save path"""
|
||||
self.article_object = article_object
|
||||
url = article_object.article_url
|
||||
logger.info("Saving new video")
|
||||
file_path = os.path.join(article_object.save_path, article_object.fname_template)
|
||||
ydl_opts = {
|
||||
'format': 'best[height<=720]',
|
||||
'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
|
||||
'logger': MyLogger(),
|
||||
'progress_hooks': [self.post_download_hook],
|
||||
'updatetime': False
|
||||
}
|
||||
try:
|
||||
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
|
||||
ydl.download([url])
|
||||
# article file name is updated in self.post_download_hook
|
||||
except Exception as e:
|
||||
logger.error(f"Youtube download crashed: {e}")
|
||||
article_object.file_name = ""
|
||||
|
||||
return article_object
|
Reference in New Issue
Block a user