Many bug fixes
This commit is contained in:
@@ -9,6 +9,9 @@ shrink_sizes = []
|
||||
|
||||
def shrink_pdf(article):
|
||||
initial_size = os.path.getsize(article.save_path + article.file_name)
|
||||
if article.file_name[-4:] != ".pdf":
|
||||
return article # it probably was a youtube video
|
||||
|
||||
c = subprocess.run(
|
||||
["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'],
|
||||
stdout=subprocess.PIPE,
|
||||
|
@@ -7,10 +7,10 @@ import requests
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
import configuration
|
||||
import json
|
||||
|
||||
config = configuration.parsed["DOWNLOADS"]
|
||||
|
||||
|
||||
blacklisted = json.loads(config["blacklisted_href_domains"])
|
||||
|
||||
class PDFDownloader:
|
||||
"""Saves a given url. Fills the object it got as a parameter"""
|
||||
@@ -61,10 +61,6 @@ class PDFDownloader:
|
||||
self.autostart()
|
||||
url = article_object.article_url
|
||||
|
||||
# arbitrary bug fixes:
|
||||
if "focus.de" in url or "bloomberg.com" in url:
|
||||
url = url.replace("https://", "https://outline.com/")
|
||||
sleep_time += 5
|
||||
try:
|
||||
self.driver.get(url)
|
||||
except Exception as e:
|
||||
@@ -97,7 +93,7 @@ class PDFDownloader:
|
||||
|
||||
if success:
|
||||
article_object.file_name = fname
|
||||
article_object.set_references = self.get_references()
|
||||
article_object.set_references(self.get_references())
|
||||
else:
|
||||
article_object.file_name = ""
|
||||
|
||||
@@ -140,10 +136,12 @@ class PDFDownloader:
|
||||
hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
|
||||
except:
|
||||
hrefs = []
|
||||
# TODO TEST THIS
|
||||
old = hrefs
|
||||
hrefs = [h for h in hrefs \
|
||||
if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
|
||||
if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
|
||||
] # filter a tiny bit at least
|
||||
diff = set(old) ^ set(hrefs)
|
||||
self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before:{len(old)}, after: {len(hrefs)})")
|
||||
return hrefs
|
||||
|
||||
|
||||
|
@@ -1,33 +1,65 @@
|
||||
import logging
|
||||
from __future__ import unicode_literals
|
||||
import youtube_dl
|
||||
import os
|
||||
from pytube import YouTube
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def save_video(article_object):
|
||||
"""Saves video accoring to url and save path"""
|
||||
url = article_object.article_url
|
||||
logger.info("Saving new video")
|
||||
try:
|
||||
yt = YouTube(url)
|
||||
streams = yt.streams.filter(progressive=True).order_by('resolution')
|
||||
except Exception as e:
|
||||
article_object.file_name = "ERROR: {}".format(e)
|
||||
return article_object
|
||||
class MyLogger:
    """Logger adapter passed to youtube_dl via the 'logger' option.

    Debug and warning messages are dropped; error messages are forwarded
    to this module's ``logger``.
    """

    def debug(self, msg):
        """Discard a debug message."""
        return None

    def warning(self, msg):
        """Discard a warning message."""
        return None

    def error(self, msg):
        """Forward an error message to the module-level logger."""
        logger.error(msg)
|
||||
|
||||
if streams: # if it's not empty
|
||||
vid = streams[-1]
|
||||
article_object.source_name = "youtube.com"
|
||||
article_object.title = yt.title
|
||||
|
||||
|
||||
class YouTubeDownloader:
    """Download the video behind an article URL using youtube_dl.

    The final file name is only known once youtube_dl has picked a
    container format, so it is written back to the article object from
    the progress hook rather than from save_video() itself.
    """

    def __init__(self) -> None:
        # Article currently being processed; assigned in save_video() so
        # that post_download_hook() knows which object to update.
        self.article_object = None

    def post_download_hook(self, ret_code):
        """Record the downloaded file's base name on the current article.

        Registered as a youtube_dl progress hook. ``ret_code`` is the
        status dict passed by youtube_dl; only the 'finished' status is
        acted upon, all other statuses are ignored.
        """
        if ret_code['status'] == 'finished':
            file_loc = ret_code["filename"]
            self.article_object.file_name = os.path.basename(file_loc)

    def save_video(self, article_object):
        """Save the video at ``article_object.article_url``.

        The output location is built from ``article_object.save_path``
        and ``article_object.fname_template``; the extension is chosen
        by youtube_dl. On success ``file_name`` is set by
        post_download_hook(); on any failure it is set to "" (the same
        failure marker PDFDownloader uses).

        Returns the (mutated) ``article_object``.
        """
        self.article_object = article_object
        url = article_object.article_url
        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
        ydl_opts = {
            'format': 'best[height<=720]',
            # basically the filename from the object, but with a custom
            # extension depending on the download
            'outtmpl': f"{file_path}.%(ext)s",
            'logger': MyLogger(),
            'progress_hooks': [self.post_download_hook],
            'updatetime': False,
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
                # article file name is updated in self.post_download_hook
        except Exception as e:
            # Best effort: a failed download must not crash the worker.
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""

        return article_object
|
||||
|
||||
|
||||
|
||||
# class DummyArticle:
|
||||
# article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html"
|
||||
# save_path = "/app/file_storage/"
|
||||
# fname_template = "www.youtube.com -- Test"
|
||||
# file_name = ""
|
||||
|
||||
# m = DummyArticle()
|
||||
# t = YouTubeDownloader()
|
||||
# t.save_video(m)
|
||||
|
||||
# print(m.file_name)
|
||||
|
@@ -37,24 +37,28 @@ def get_description(article_object):
|
||||
except:
|
||||
news_article = fallback
|
||||
|
||||
|
||||
if news_article.title:
|
||||
title = news_article.title
|
||||
else:
|
||||
title = fallback.title
|
||||
|
||||
|
||||
if news_article.summary:
|
||||
summary = news_article.summary
|
||||
elif news_article.text:
|
||||
ind = min(500, len(news_article.text))
|
||||
summary = news_article.text[:ind] + "..."
|
||||
else:
|
||||
summary = fallback.summary
|
||||
summary = fallback.summary
|
||||
|
||||
if news_article.meta_lang:
|
||||
lang = news_article.meta_lang
|
||||
else:
|
||||
lang = ""
|
||||
|
||||
article_object.title = title
|
||||
article_object.summary = summary
|
||||
article_object.language = lang
|
||||
article_object.set_authors(news_article.authors)
|
||||
article_object.set_keywords(news_article.keywords)
|
||||
|
||||
|
||||
return article_object
|
||||
|
@@ -9,10 +9,10 @@ def upload_to_archive(article_object):
|
||||
try:
|
||||
wayback = WaybackMachineSaveAPI(url, user_agent)
|
||||
archive_url = wayback.save()
|
||||
logger.info(f"{url} uploaded to archive successfully")
|
||||
# logger.info(f"{url} uploaded to archive successfully")
|
||||
article_object.archive_url = archive_url
|
||||
except Exception as e:
|
||||
article_object.archive_url = "Error while uploading: {}".format(e)
|
||||
logger.error(f"Error while generating new url: {e}")
|
||||
logger.error(f"Error while generating archive url: {e}")
|
||||
|
||||
return article_object
|
@@ -1,7 +1,6 @@
|
||||
from threading import Thread
|
||||
import time
|
||||
import logging
|
||||
# logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TemplateWorker(Thread):
|
||||
@@ -34,7 +33,6 @@ class TemplateWorker(Thread):
|
||||
|
||||
|
||||
def _handle_article(self, article_watcher, action=None):
|
||||
# TODO Overload in children classes
|
||||
if action is None:
|
||||
self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
|
||||
else:
|
||||
|
@@ -1,6 +1,6 @@
|
||||
from .worker_template import TemplateWorker
|
||||
from .download.browser import PDFDownloader
|
||||
from .download.youtube import save_video
|
||||
from .download.youtube import YouTubeDownloader
|
||||
from .fetch.runner import get_description
|
||||
from .upload.runner import upload_to_archive as run_upload
|
||||
from .compress.runner import shrink_pdf
|
||||
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
||||
class DownloadWorker(TemplateWorker):
|
||||
def __init__(self) -> None:
    """Bind the download callables, then run the base-class initialiser.

    ``dl_runner`` is the bound ``download`` method of a PDFDownloader
    instance and ``yt_runner`` the bound ``save_video`` method of a
    YouTubeDownloader instance. Both are assigned before
    ``super().__init__()`` is called.
    """
    self.dl_runner = PDFDownloader().download
    # The former module-level save_video function was assigned here and
    # immediately overwritten; only the class-based runner is kept.
    self.yt_runner = YouTubeDownloader().save_video
    super().__init__()
|
||||
|
||||
def _handle_article(self, article_watcher):
|
||||
|
Reference in New Issue
Block a user