Reduced Slack functionality, improved ease of use. Database migration WIP

2022-09-05 16:29:19 +02:00
parent 60c9e88c7b
commit 2e65828bbb
35 changed files with 789 additions and 998 deletions

View File

@@ -0,0 +1,47 @@
# compress/runner.py (module path inferred from the worker imports further below)
import os
import subprocess
import logging
from pathlib import Path

import configuration

logger = logging.getLogger(__name__)
config = configuration.main_config["DOWNLOADS"]
shrink_sizes = []


def shrink_pdf(article):
    article_loc = Path(article.save_path) / article.file_name
    initial_size = article_loc.stat().st_size
    compressed_tmp = Path(config["default_download_path"]) / "compressed.pdf"

    if article_loc.suffix != ".pdf":  # Path.suffix includes the dot
        return article  # it probably was a youtube video, nothing to compress

    c = subprocess.run(
        [
            "gs",
            "-sDEVICE=pdfwrite",
            "-dPDFSETTINGS=/screen",
            "-dNOPAUSE",
            "-dBATCH",
            f"-sOutputFile={compressed_tmp}",
            f"{article_loc}",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if c.returncode == 0:
        try:
            os.replace(compressed_tmp, article_loc)
        except OSError as e:
            logger.error(f"Compression ran but the file could not be copied back: {e}")
        final_size = article_loc.stat().st_size
        shrink_sizes.append(initial_size - final_size)
        logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes) / len(shrink_sizes) / 1000)} KB")
    else:
        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")

    return article
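
A rough usage sketch, not part of this commit: shrink_pdf only needs an article-like object exposing save_path and file_name, plus Ghostscript ("gs") on the PATH; the paths below are placeholders.

# Hypothetical usage sketch; in the pipeline the article object is a database model.
from types import SimpleNamespace

article = SimpleNamespace(save_path="/app/file_storage", file_name="example.pdf")
article = shrink_pdf(article)  # overwrites the pdf in place if compression succeeded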

View File

@@ -0,0 +1,161 @@
# download/browser.py (module path inferred from the worker imports further below)
import time
import datetime
import logging
import os
import base64
import json

import requests
from selenium import webdriver

import configuration

config = configuration.main_config["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])


class PDFDownloader:
    """Saves a given url as a pdf. Fills in the article object it receives as a parameter."""
    logger = logging.getLogger(__name__)
    # status variable used for restarting:
    running = False

    def start(self):
        self.finish()  # clean up any previous session
        options = webdriver.FirefoxOptions()
        options.profile = config["browser_profile_path"]
        # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4, but that doesn't work

        if os.getenv("DEBUG", "false") == "true":
            self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
        else:
            options.add_argument('--headless')

        options.set_preference('print.save_as_pdf.links.enabled', True)
        # just save if the filetype is pdf already
        # TODO: this is not working right now
        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
        options.set_preference("browser.download.folderList", 2)
        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
        # options.set_preference("pdfjs.disabled", True)
        options.set_preference("browser.download.dir", config["default_download_path"])

        self.logger.info("Starting gecko driver")
        # previously, in a single docker image:
        # self.driver = webdriver.Firefox(
        #     options = options,
        #     service = webdriver.firefox.service.Service(
        #         log_path = f'{config["local_storage_path"]}/geckodriver.log'
        #     ))
        self.driver = webdriver.Remote(
            command_executor='http://geckodriver:4444',
            options=options,
            # can't set log path...
        )

        residues = os.listdir(config["default_download_path"])
        for res in residues:
            os.remove(os.path.join(config["default_download_path"], res))

        self.running = True

    def autostart(self):
        if not self.running:
            self.start()  # relaunch the download utility

    def finish(self):
        if self.running:
            self.logger.info("Exiting gecko driver")
            try:
                self.driver.quit()
                time.sleep(10)
            except Exception:
                self.logger.critical("Connection to the driver broke off")
            self.running = False
        else:
            self.logger.info("Gecko driver not yet running")

    def download(self, article_object):
        sleep_time = 2
        self.autostart()
        url = article_object.article_url

        try:
            self.driver.get(url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
            return article_object  # without changes

        time.sleep(sleep_time)
        # leave the page time to do any funky business
        # in the meantime, get a page title if required
        if article_object.is_title_bad:
            article_object.title = self.driver.title.replace(".pdf", "")  # some titles end with .pdf
            # will be propagated to the saved file (dst) as well

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
        if os.path.exists(dst):
            fname = make_path_unique(fname)
            dst = os.path.join(article_object.save_path, fname)

        if url.endswith(".pdf"):
            # according to the browser preferences, calling the url will open pdfjs.
            # If not handled separately, printing would require the ctrl+p route, but that setup is janky to say the least
            success = self.get_existing_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

        if success:
            article_object.file_name = fname
        else:
            article_object.file_name = ""

        return article_object  # this change is saved later by the external caller

    def get_existing_pdf(self, url, dst):
        """The url already points to a pdf: fetch it directly instead of printing the page."""
        try:
            r = requests.get(url)
            pdf_bytes = r.content
        except Exception:
            return False
        return self.get_new_pdf(dst, other_bytes=pdf_bytes)

    def get_new_pdf(self, dst, other_bytes=None):
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        if other_bytes is None:
            try:
                result = self.driver.print_page()
                pdf_bytes = base64.b64decode(result, validate=True)
            except Exception:
                self.logger.error("Failed, probably because the driver went extinct.")
                return False
        else:
            pdf_bytes = other_bytes

        try:
            with open(dst, "wb+") as f:
                f.write(pdf_bytes)
            return True
        except Exception as e:
            self.logger.error(f"Failed because of FS operation: {e}")
            return False


def make_path_unique(path):
    fname, ending = os.path.splitext(path)
    fname += datetime.datetime.now().strftime("%d-%H%M%S")
    return fname + ending
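
For orientation, a minimal sketch (not part of the diff) of how the downloader might be driven, assuming an article-like object with the attributes accessed above and a geckodriver container reachable at http://geckodriver:4444:

# Hypothetical usage sketch; the real article object comes from the database model.
from types import SimpleNamespace

article = SimpleNamespace(
    article_url="https://example.com/some-post",
    is_title_bad=False,
    fname_template="some-post.pdf",
    save_path="/app/file_storage",
    file_name="",
)
downloader = PDFDownloader()
article = downloader.download(article)  # sets article.file_name on success, "" on failure
downloader.finish()                     # shuts the remote browser session down again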

View File

@@ -0,0 +1,51 @@
# download/youtube.py (module path inferred from the worker imports further below)
from __future__ import unicode_literals
import os
import logging

import youtube_dl

logger = logging.getLogger(__name__)


class MyLogger(object):
    """Silences youtube_dl's debug and warning output; only errors are passed on."""
    def debug(self, msg): pass
    def warning(self, msg): pass
    def error(self, msg):
        logger.error(msg)


class YouTubeDownloader:
    def __init__(self) -> None:
        pass

    def post_download_hook(self, ret_code):
        # print(ret_code)
        if ret_code['status'] == 'finished':
            file_loc = ret_code["filename"]
            fname = os.path.basename(file_loc)
            self.article_object.file_name = fname

    def save_video(self, article_object):
        """Saves the video according to the url and save path of the article object"""
        self.article_object = article_object
        url = article_object.article_url
        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
        ydl_opts = {
            'format': 'best[height<=720]',
            'outtmpl': f"{file_path}.%(ext)s",  # basically the filename from the object, but with a custom extension depending on the download
            'logger': MyLogger(),
            'progress_hooks': [self.post_download_hook],
            'updatetime': False
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            # the article file name is updated in self.post_download_hook
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""

        return article_object

View File

@@ -0,0 +1,57 @@
# fetch/runner.py (module path inferred from the worker imports further below)
import datetime
import logging
from urllib.parse import urlparse

from newspaper import Article
from htmldate import find_date

# quieter logs from the libraries used below
logging.getLogger('newspaper').setLevel(logging.ERROR)
logging.getLogger('urllib').setLevel(logging.ERROR)
logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR)
logging.getLogger('htmldate').setLevel(logging.ERROR)
logging.getLogger('charset_normalizer').setLevel(logging.ERROR)

logger = logging.getLogger("fetch")


def get_description(article_object):
    url = article_object.article_url
    website = urlparse(url).netloc
    article_object.source_name = website

    try:
        # find_date returns the publication date as "YYYY-MM-DD"
        article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
    except Exception:  # other file types
        article_object.pub_date = datetime.datetime(year=1900, month=1, day=1)

    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except Exception:
        news_article = object()  # fallback value; the attribute lookups below fail gracefully

    try:
        article_object.title = news_article.title
    except AttributeError:
        article_object.title = "Error while running fetch"

    try:
        if news_article.summary:  # prefer the summary newspaper produced, if any
            article_object.summary = news_article.summary
        elif news_article.text:
            ind = min(500, len(news_article.text))
            article_object.summary = news_article.text[:ind] + "..."
        else:
            article_object.summary = ""
    except AttributeError:
        article_object.summary = ""

    try:
        article_object.language = news_article.meta_lang
    except AttributeError:
        article_object.language = ""

    try:
        article_object.set_authors(news_article.authors)
    except AttributeError:
        pass  # the list would have been empty anyway

    return article_object
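
A hedged sketch of calling the fetcher on a stand-in object; the real article object is a database model with a set_authors method, which is faked here:

# Hypothetical usage sketch; names on the stand-in mirror the attributes set above.
from types import SimpleNamespace

article = SimpleNamespace(article_url="https://example.com/some-post")
article.set_authors = lambda authors: setattr(article, "authors", authors)  # stand-in for the model method
article = get_description(article)
print(article.source_name, article.title, article.pub_date)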

View File

@@ -0,0 +1,20 @@
# upload/runner.py (module path inferred from the worker imports further below)
import time
import logging

from waybackpy import WaybackMachineSaveAPI  # upload to archive.org

logger = logging.getLogger(__name__)


def upload_to_archive(article_object):
    """Uploads the article url to archive.org and returns the article object with archive_url set"""
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"  # needed?
    url = article_object.article_url
    try:
        wayback = WaybackMachineSaveAPI(url, user_agent)
        archive_url = wayback.save()
        # logger.info(f"{url} uploaded to archive successfully")
        article_object.archive_url = archive_url
    except Exception as e:
        article_object.archive_url = "Error while uploading: {}".format(e)
        logger.error(f"Error while generating archive url: {e}")

    return article_object

View File

@@ -0,0 +1,41 @@
# worker_template.py (module path inferred from the worker imports below)
from threading import Thread
import time
import logging


class TemplateWorker(Thread):
    """Parent class for any subsequent worker of the article-download pipeline. They should all run in parallel, thus the Thread subclassing"""
    logger = logging.getLogger(__name__)

    def __init__(self, *args, **kwargs) -> None:
        target = self._queue_processor  # will be executed on Worker.start()
        group = kwargs.get("group", None)
        name = kwargs.get("name", None)
        super().__init__(group=group, target=target, name=name)
        self._article_queue = []
        self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")

    def process(self, article_watcher):
        self._article_queue.append(article_watcher)  # .article_model.article_url

    def _queue_processor(self):
        """This method is launched by thread.run() and idles while self._article_queue is empty. When an external caller appends to the queue it jumps into action"""
        while True:  # PLEASE tell me if I'm missing an obvious better way of doing this!
            if len(self._article_queue) == 0:
                time.sleep(5)
            else:
                article_watcher = self._article_queue.pop(0)
                self.logger.info(f"{self.__class__.__name__} now processing from queue (length: {len(self._article_queue)}) - {article_watcher.article}")
                self._handle_article(article_watcher)

    def _handle_article(self, article_watcher, action=None):
        if action is None:
            self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
        else:
            article = article_watcher.article
            article = action(article)  # action updates the article object but does not save the change
            article.save()

View File

@@ -0,0 +1,66 @@
# worker subclasses tying the runners above together
from .worker_template import TemplateWorker
from .download.browser import PDFDownloader
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf

import time
import logging

logger = logging.getLogger(__name__)


class DownloadWorker(TemplateWorker):
    def __init__(self) -> None:
        self.dl_runner = PDFDownloader().download
        self.yt_runner = YouTubeDownloader().save_video
        super().__init__()

    def _handle_article(self, article_watcher):
        article = article_watcher.article
        u = article.article_url
        if "youtu.be/" in u or "youtube.com/" in u:
            action = self.yt_runner
        else:
            action = self.dl_runner
        super()._handle_article(article_watcher, action)
        article_watcher.download_completed = True


class FetchWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = get_description  # function
        super()._handle_article(article_watcher, action)
        article_watcher.fetch_completed = True


class UploadWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        def action(*args, **kwargs):
            time.sleep(10)  # uploads to archive.org are throttled to 15/minute, and 5s still triggers a blacklisting
            return run_upload(*args, **kwargs)
        super()._handle_article(article_watcher, action)
        article_watcher.upload_completed = True


class CompressWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = shrink_pdf
        super()._handle_article(article_watcher, action)
        article_watcher.compression_completed = True
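
To make the threading pattern concrete, a hedged sketch (not part of this commit) of how a coordinator might wire the workers together. The watcher class below is a minimal stand-in; the real one lives elsewhere in the repo and presumably persists the *_completed flags and sequences compression after download.

# Hypothetical wiring sketch. Each worker is a Thread polling its own queue,
# so a coordinator starts them once and then hands watcher objects over.
class DummyWatcher:
    def __init__(self, article):
        self.article = article
        self.download_completed = False
        self.fetch_completed = False
        self.upload_completed = False
        self.compression_completed = False

workers = [DownloadWorker(), FetchWorker(), UploadWorker(), CompressWorker()]
for w in workers:
    w.start()  # launches TemplateWorker._queue_processor in its own thread

watcher = DummyWatcher(article)  # article: the database-backed article object
for w in workers:
    w.process(watcher)  # each call just appends to that worker's internal queue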