Many bug fixes

This commit is contained in:
Remy Moll
2022-04-20 16:49:55 +02:00
parent 0d76bcbb98
commit 8f3ea25662
16 changed files with 223 additions and 118 deletions

View File

@@ -7,10 +7,10 @@ import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import configuration
import json
config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
@@ -61,10 +61,6 @@ class PDFDownloader:
self.autostart()
url = article_object.article_url
# arbitrary bug fixes:
if "focus.de" in url or "bloomberg.com" in url:
url = url.replace("https://", "https://outline.com/")
sleep_time += 5
try:
self.driver.get(url)
except Exception as e:
@@ -97,7 +93,7 @@ class PDFDownloader:
if success:
article_object.file_name = fname
article_object.set_references = self.get_references()
article_object.set_references(self.get_references())
else:
article_object.file_name = ""
@@ -140,10 +136,12 @@ class PDFDownloader:
hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
except:
hrefs = []
# TODO TEST THIS
old = hrefs
hrefs = [h for h in hrefs \
if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
] # filter a tiny bit at least
diff = set(old) ^ set(hrefs)
self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before:{len(old)}, after: {len(hrefs)})")
return hrefs

View File

@@ -1,33 +1,65 @@
import logging
from __future__ import unicode_literals
import youtube_dl
import os
from pytube import YouTube
import logging
logger = logging.getLogger(__name__)
def save_video(article_object):
"""Saves video according to url and save path"""
url = article_object.article_url
logger.info("Saving new video")
try:
yt = YouTube(url)
streams = yt.streams.filter(progressive=True).order_by('resolution')
except Exception as e:
article_object.file_name = "ERROR: {}".format(e)
return article_object
class MyLogger(object):
def debug(self, msg): pass
def warning(self, msg): pass
def error(self, msg):
logger.error(msg)
if streams: # if it's not empty
vid = streams[-1]
article_object.source_name = "youtube.com"
article_object.title = yt.title
class YouTubeDownloader:
def __init__(self) -> None:
pass
def post_download_hook(self, ret_code):
# print(ret_code)
if ret_code['status'] == 'finished':
file_loc = ret_code["filename"]
fname = os.path.basename(file_loc)
self.article_object.file_name = fname
def save_video(self, article_object):
"""Saves video according to url and save path"""
self.article_object = article_object
url = article_object.article_url
logger.info("Saving new video")
file_path = os.path.join(article_object.save_path, article_object.fname_template)
ydl_opts = {
'format': 'best[height<=720]',
'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
'logger': MyLogger(),
'progress_hooks': [self.post_download_hook],
'updatetime': False
}
try:
vid.download(file_path)
article_object.file_name = article_object.fname_template
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
# article file name is updated in self.post_download_hook
except Exception as e:
logger.error(f"Youtube download crashed: {e}")
article_object.file_name = "Error while downloading"
else:
article_object.file_name = "No streams available"
return article_object
article_object.file_name = ""
return article_object
# class DummyArticle:
# article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html"
# save_path = "/app/file_storage/"
# fname_template = "www.youtube.com -- Test"
# file_name = ""
# m = DummyArticle()
# t = YouTubeDownloader()
# t.save_video(m)
# print(m.file_name)