A working app with a few bugs sprinkled in. Configuration saved externally

Remy Moll 2022-04-17 21:58:58 +02:00
commit 0a6dde8c78
26 changed files with 1742 additions and 0 deletions

1
.dockerignore Normal file

@ -0,0 +1 @@
.dev/

4
.gitignore vendored Normal file

@ -0,0 +1,4 @@
.dev/
*.pyc
*.log

22
Dockerfile Normal file

@ -0,0 +1,22 @@
FROM ubuntu:latest
# UGH, timezone issues
# assumed default timezone; override at build time with --build-arg CONTAINER_TIMEZONE=<tz>
ARG CONTAINER_TIMEZONE=Europe/Zurich
RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
RUN apt-get update && apt-get install -y evince libcanberra-gtk-module && apt-get install -y xauth wget tar python3 python3-pip python3-setuptools python3-wheel python3-dev build-essential firefox ghostscript
# Download gecko (firefox) driver for selenium
RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz
RUN tar -xzf geckodriver-v0.30.0-linux64.tar.gz -O geckodriver > /usr/bin/geckodriver
RUN chmod +x /usr/bin/geckodriver
RUN rm geckodriver-v0.30.0-linux64.tar.gz
RUN echo "127.0.0.1 localhost" >> /etc/hosts
COPY requirements.txt /app/
RUN python3 -m pip install --upgrade pip && python3 -m pip install -r /app/requirements.txt
RUN mkdir -p /app/auto_news
COPY app /app/auto_news
WORKDIR /app/auto_news
ENTRYPOINT ["python3", "runner.py"]

46
README.md Normal file

@ -0,0 +1,46 @@
# Auto_news
A utility to fetch article requests from slack and generate pdfs for them, fully automatically.
## Running
### How to run - auto archiving mode
In this mode the program is launched as a docker container in headless mode. For persistence a local storage volume is required, but that's it! The same volume also holds the `config.ini` the app reads (see the sketch below).
`docker run -it -v <your storage>:/app/file_storage/ auto_news`
You can specify additional parameters:
`docker run -it -v <your storage>:/app/file_storage/ auto_news debug` runs with debug values (does not write to prod db, does not send mails)
`docker run -it -v <your storage>:/app/file_storage/ auto_news upload` catches up on past uploads to archive.
`docker run -it -v <your storage>:/app/file_storage/ -e DISPLAY=":0" --network host -v $XAUTHORITY:/root/.Xauthority auto_news check` lets you visually verify the downloaded files. Be aware that it requires these additional parameters in order to open GUIs on the host.
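The app reads its configuration from `/app/file_storage/config.ini`, i.e. from inside the mounted storage volume. Below is a minimal sketch that writes a template `config.ini`: the section and key names are the ones read by `app/configuration.py` and the util modules, while every value is only a placeholder/assumption to replace with your own.

```python
import configparser

# Template for the external config.ini (read from /app/file_storage/config.ini inside the container).
# Section and key names match what the code reads; every value below is only a placeholder.
config = configparser.ConfigParser()

config["DATABASE"] = {
    "db_path_dev": "/app/file_storage/dev/",    # placeholder
    "db_path_prod": "/app/file_storage/",       # placeholder
    "db_name": "downloads.db",                  # placeholder
}
config["DOWNLOADS"] = {
    "local_storage_path": "/app/file_storage/",            # where the pdfs end up
    "remote_storage_path": "/remote/nas/path/",            # only shown in replies (NAS location)
    "default_download_path": "/app/file_storage/tmp/",
    "browser_profile_path": "/app/file_storage/firefox_profile/",
    "blacklisted_href_domains": "facebook.com,twitter.com",  # assumed format: comma-separated
}
config["SLACK"] = {
    "archive_id": "C00000000",      # production channel
    "debug_id": "C00000001",        # debug channel
    "bot_id": "U00000000",
    "responsible_id": "U00000001",
    "auth_token": "xoxb-...",
    "app_token": "xapp-...",
    "api_wait_time": "90",          # seconds to wait after hitting a rate limit
}
config["MAIL"] = {
    "sender": "bot@example.com",
    "recipient": "archive@example.com",
    "smtp_server": "smtp.example.com",
    "port": "587",
    "uname": "bot@example.com",
    "password": "change-me",
}

with open("config.ini", "w") as f:
    config.write(f)
```

Run this once (or write the file by hand) and place the resulting `config.ini` at the top level of `<your storage>` before starting the container.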
### How to run - development mode
In this mode, a docker container is launched with an additional volume, the local code. You can test your code without the need to rebuild the image.
`docker run -it -v <your storage>:/app/file_storage/ -v <your code>:/code/ --entrypoint /bin/bash auto_news`
You are dropped into a bash shell, in which you can navigate to the `/code` directory and then test live.
% ### How to run - file checker mode
% This mode requires the most access rights. You want to access all files and open gui programs.
% `docker run -it -e DISPLAY=":0" --network host -v $XAUTHORITY:/root/.Xauthority -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code auto_news /bin/bash`
% Similarly to the development mode, you can cd into code and run your checking duties.
## Building
### Things to keep in mind
The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly clean build the docker image! This is also crucial to update the code itself.
`docker build -t auto_news --no-cache .`
where the `Dockerfile` has to be in the working directory
## Cheat-sheet Remy:
docker run -it -e LIVECODE=TRUE -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code/ auto_news /bin/bash
docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news

42
app/configuration.py Normal file

@ -0,0 +1,42 @@
import os
import sys
import configparser
import logging
from peewee import SqliteDatabase
from rich.logging import RichHandler
# first things first: logging
logging.basicConfig(
format='%(message)s',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S',
handlers=[RichHandler()]
)
logger = logging.getLogger(__name__)
# load config file containing constants and secrets
parsed = configparser.ConfigParser()
parsed.read("/app/file_storage/config.ini")
if "debug" in sys.argv:
logger.warning("Running in debugging mode because launched with argument 'debug'")
# parsed.read("/code/config.ini")
db_path = os.path.join(parsed["DATABASE"]["db_path_dev"], parsed["DATABASE"]["db_name"])
parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
else:
logger.warning("Using production values, I hope you know what you're doing...")
db_path = os.path.join(parsed["DATABASE"]["db_path_prod"], parsed["DATABASE"]["db_name"])
from utils_storage import models
# Set up the database
models.set_db(
SqliteDatabase(
db_path,
pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
)
)

9
app/launch.py Normal file

@ -0,0 +1,9 @@
import configuration
from utils_mail import runner
class Dummy:
source_name = "AS"
title = "dummy title"
mail_info = [{"reply_text": "UNFOOO", "file_path":None}]
runner.send(Dummy())

193
app/runner.py Normal file

@ -0,0 +1,193 @@
"""Main coordination of other util classes. Handles inbound and outbound calls"""
import configuration
models = configuration.models
import sys
import logging
logger = logging.getLogger(__name__)
from utils_mail import runner as mail_runner
from utils_slack import runner as slack_runner
from utils.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
class ArticleWatcher:
"""Wrapper for a newly created article object. Notifies the coordinator upon change/completition"""
def __init__(self, article, **kwargs) -> None:
self.article = article
self.completition_notifier = kwargs.get("notifier")
self.fetch = kwargs.get("worker_fetch", None)
self.download = kwargs.get("worker_download", None)
self.compress = kwargs.get("worker_compress", None)
self.upload = kwargs.get("worker_upload", None)
self.completition_notified = False
# self._download_called = self._compression_called = False
self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False
# first step: gather metadata
self.fetch.process(self) # this will call the update_status method
self.upload.process(self) # independent from the rest
def update_status(self, completed_action):
"""Checks and notifies internal completition-status.
Article download is complete iff fetch and download were successfull and compression was run
"""
# if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done
# we don't need to delete self though, because it is then automatically garbage-collected
# all_done = self._fetch_completed and self._download_completed and self._compression_completed and self._upload_completed
# if self._fetch_completed and not self._download_called:
# self._download_called = True
# self.download.process(self)
# elif self._download_completed and not self._compression_called:
# self._compression_called = True
# self.compress.process(self)
# elif self._compression_completed: # last step
# self.completition_notifier(self.article)
# # triggers action in Coordinator
# elif self._upload_completed:
# # this case occurs when upload was faster than compression
# pass
# else:
# logger.warning(f"update_status called with unusual configuration {self._fetch_completed},{self._download_completed},{self._compression_completed}")
if completed_action == "fetch":
self.download.process(self)
elif completed_action == "download":
self.compress.process(self)
elif completed_action == "compress": # last step
self.completition_notifier(self.article)
# triggers action in Coordinator
elif completed_action == "upload":
# this case occurs when upload was faster than compression
pass
else:
logger.warning(f"update_status called with unusual configuration: {completed_action}")
# ====== Attributes to be modified by the util workers
@property
def fetch_completed(self):
return self._fetch_completed
@fetch_completed.setter
def fetch_completed(self, value: bool):
self._fetch_completed = value
self.update_status("fetch")
@property
def download_completed(self):
return self._download_completed
@download_completed.setter
def download_completed(self, value: bool):
self._download_completed = value
self.update_status("download")
@property
def compression_completed(self):
return self._compression_completed
@compression_completed.setter
def compression_completed(self, value: bool):
self._compression_completed = value
self.update_status("compress")
@property
def upload_completed(self):
return self._upload_completed
@upload_completed.setter
def upload_completed(self, value: bool):
self._upload_completed = value
self.update_status("upload")
class Coordinator:
def __init__(self, **kwargs) -> None:
"""Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
pass
def add_workers(self, **kwargs):
self.worker_slack = kwargs.pop("worker_slack", None)
self.worker_mail = kwargs.pop("worker_mail", None)
# the two above won't be needed in the Watcher
self.worker_download = kwargs.get("worker_download", None)
self.worker_fetch = kwargs.get("worker_fetch", None)
self.worker_compress = kwargs.get("worker_compress", None)
self.worker_upload = kwargs.get("worker_upload", None)
self.kwargs = kwargs
for w in [self.worker_slack, self.worker_download, self.worker_fetch, self.worker_upload, self.worker_compress]:
if w is not None:
w.start()
def incoming_request(self, message):
# TODO CHECK ME!
"""This method is passed onto the slack worker. It gets triggered when a new message is received."""
url = message.urls[0] # ignore all the other ones
a, is_new = models.ArticleDownload.get_or_create(article_url=url)
message.thread.article = a
message.thread.save()
if is_new:
self.kwargs.update({"notifier" : self.article_complete_notifier})
ArticleWatcher(
a,
**self.kwargs
)
# All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
# fetch -> download -> compress -> complete
# the watcher orchestrates the procedure and notifies upon completion
# the watcher will notify once it is sufficiently populated
else: # manually trigger notification immediately
self.article_complete_notifier(a)
def manual_processing(self, url_list, target_calls):
for url in url_list:
article = models.ArticleDownload.get_or_none(article_url=url)
watcher = ArticleWatcher(article, notifier=self.article_complete_notifier)
for t in target_calls:
t.process(watcher)
def article_complete_notifier(self, article):
self.worker_slack.bot_worker.respond_channel_message(article)
self.worker_mail.send(article)
if __name__ == "__main__":
coordinator = Coordinator()
if "upload" in sys.argv:
urls = models.ArticleDownload.select(models.ArticleDownload.article_url).where(models.ArticleDownload.archive_url == "").execute()
logger.info(f"Launching upload to archive for {len(urls)} urls.")
coordinator.manual_processing(urls, [UploadWorker()])
elif "check" in sys.argv:
logger.info("Not implemented yet.")
else: # launch with full action
kwargs = {
"worker_download" : DownloadWorker(),
"worker_fetch" : FetchWorker(),
"worker_upload" : UploadWorker(),
"worker_compress" : CompressWorker(),
"worker_slack" : slack_runner.BotRunner(coordinator.incoming_request),
"worker_mail" : mail_runner,
}
coordinator.add_workers(**kwargs)
# TODO
# Resume interrupted article models

0
app/utils/__init__.py Normal file

40
app/utils/compress/runner.py Normal file

@ -0,0 +1,40 @@
import os
import subprocess
import logging
logger = logging.getLogger(__name__)
import configuration
config = configuration.parsed["DOWNLOADS"]
shrink_sizes = []
def shrink_pdf(article):
initial_size = os.path.getsize(article.save_path + article.file_name)
c = subprocess.run(
["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
if c.returncode == 0:
m = subprocess.run(
["mv", "-f", f"{config['default_download_path']}/compressed.pdf", article.save_path + article.file_name]
)
if m.returncode == 0:
final_size = os.path.getsize(article.save_path + article.file_name)
shrink_sizes.append(initial_size - final_size)
logger.info(f"Compression worked. Avg shrinkage: {sum(shrink_sizes)/len(shrink_sizes) / 1000} (kb)")
return article # even though no modifications were made
else:
logger.error(f"Compression ran but I could not copy back the file {m.stderr.decode()} - {m.stdout.decode()}")
else:
logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
return article
# gs -sDEVICE=pdfwrite -dPDFSETTINGS=/screen -dNOPAUSE -dBATCH -sOutputFile=out.pdf
# ; mv -f temp.pdf file.pdf

158
app/utils/download/browser.py Normal file

@ -0,0 +1,158 @@
import time
import datetime
import logging
import os
import base64
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import configuration
config = configuration.parsed["DOWNLOADS"]
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__)
# status-variable for restarting:
running = False
def start(self):
options=Options()
options.profile = config["browser_profile_path"]
# TODO: Get headless mode interactively
options.add_argument('--headless')
# options.add_argument("--disable-infobars")
# options.set_preference("javascript.enabled", False)
# options.add_argument("--disable-popup-blocking")
# Print to pdf
options.set_preference("print_printer", "Mozilla Save to PDF")
options.set_preference("print.always_print_silent", True)
options.set_preference("print.show_print_progress", False)
options.set_preference('print.save_as_pdf.links.enabled', True)
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
# Save existing pdf
options.set_preference("browser.download.folderList", 2)
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# options.set_preference("pdfjs.disabled", True)
options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Now Starting gecko driver")
self.driver = webdriver.Firefox(options=options)
residues = os.listdir(config["default_download_path"])
for res in residues:
os.remove(os.path.join(config["default_download_path"], res))
self.running = True
def autostart(self):
if not self.running:
self.start() # relaunch the dl util
def finish(self):
self.driver.quit()
self.running = False
def download(self, article_object):
sleep_time = 1
self.autostart()
url = article_object.article_url
# arbitrary bug fixes:
if "focus.de" in url or "bloomberg.com" in url:
url = url.replace("https://", "https://outline.com/")
sleep_time += 5
try:
self.driver.get(url)
except Exception as e:
self.logger.critical("Selenium .get(url) failed with error {}".format(e))
self.finish()
return article_object # without changes
time.sleep(sleep_time)
# leave the page time to do any funky business
# in the mean time, get a page title if required
if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf","")
# will be propagated to dst as well
fname = article_object.fname_template
dst = os.path.join(article_object.save_path, fname)
if os.path.exists(dst):
fname = make_path_unique(fname)
dst = os.path.join(article_object.save_path, fname)
if url[-4:] == ".pdf":
# according to the browser preferences, calling the url will open pdfjs.
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
success = self.get_exisiting_pdf(url, dst)
else:
success = self.get_new_pdf(dst)
if success:
article_object.file_name = fname
article_object.set_references = self.get_references()
else:
article_object.file_name = ""
return article_object # this change is saved later manually
def get_exisiting_pdf(self, url, dst):
try:
r = requests.get(url)
bytes = r.content
except:
return False
return self.get_new_pdf(dst, other_bytes=bytes)
def get_new_pdf(self, dst, other_bytes=None):
os.makedirs(os.path.dirname(dst), exist_ok=True)
if other_bytes is None:
try:
result = self.driver.print_page()
bytes = base64.b64decode(result, validate=True)
except:
self.logger.error("Failed, probably because the driver went extinct.")
return False
else:
bytes = other_bytes
try:
with open(dst, "wb+") as f:
f.write(bytes)
return True
except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}")
return False
def get_references(self):
try:
hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
except:
hrefs = []
# TODO TEST THIS
hrefs = [h for h in hrefs \
if not any(domain in h for domain in config["blacklisted_href_domains"])
] # drop blacklisted domains - filter a tiny bit at least
return hrefs
def make_path_unique(path):
fname, ending = os.path.splitext(path)
fname += datetime.datetime.now().strftime("%d-%H%M%S")
return fname + ending

33
app/utils/download/youtube.py Normal file

@ -0,0 +1,33 @@
import logging
import os
from pytube import YouTube
logger = logging.getLogger(__name__)
def save_video(article_object):
"""Saves video accoring to url and save path"""
url = article_object.article_url
logger.info("Saving new video")
try:
yt = YouTube(url)
streams = yt.streams.filter(progressive=True).order_by('resolution')
except Exception as e:
article_object.file_name = "ERROR: {}".format(e)
return article_object
if streams: # if it's not empty
vid = streams[-1]
article_object.source_name = "youtube.com"
article_object.title = yt.title
try:
# pytube takes the directory and the file name separately
vid.download(output_path=article_object.save_path, filename=article_object.fname_template)
article_object.file_name = article_object.fname_template
except Exception as e:
logger.error(f"Youtube download crashed: {e}")
article_object.file_name = "Error while downloading"
else:
article_object.file_name = "No streams available"
return article_object

60
app/utils/fetch/runner.py Normal file

@ -0,0 +1,60 @@
from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date
import datetime
import logging
logging.getLogger('newspaper').setLevel(logging.ERROR) # quieter logs
logging.getLogger('urllib').setLevel(logging.ERROR) # quieter logs
logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR) # quieter logs
logging.getLogger('htmldate').setLevel(logging.ERROR) #quieter logs
logging.getLogger('charset_normalizer').setLevel(logging.ERROR) #quieter logs
logger = logging.getLogger("fetch")
class NewspaperDummy():
title = "Error while running fetch"
summary = "Error while running fetch"
text = "Error while running fetch"
authors = []
keywords = []
def get_description(article_object):
url = article_object.article_url
website = urlparse(url).netloc
article_object.source_name = website
try:
pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
except: # other file types
pub_date = datetime.datetime(year=1900, month=1, day=1)
article_object.pub_date = pub_date
fallback = NewspaperDummy()
try:
news_article = Article(url)
news_article.download()
news_article.parse()
except:
news_article = fallback
if news_article.title:
title = news_article.title
else:
title = fallback.title
if news_article.summary:
summary = news_article.summary
elif news_article.text:
ind = min(500, len(news_article.text))
summary = news_article.text[:ind] + "..."
else:
summary = fallback.summary
article_object.title = title
article_object.summary = summary
article_object.set_authors(news_article.authors)
article_object.set_keywords(news_article.keywords)
return article_object

18
app/utils/upload/runner.py Normal file

@ -0,0 +1,18 @@
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import logging
logger = logging.getLogger(__name__)
def upload_to_archive(article_object):
"""uploads to archive.org and returns the archived url"""
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
url = article_object.article_url
try:
wayback = WaybackMachineSaveAPI(url, user_agent)
archive_url = wayback.save()
logger.info(f"{url} uploaded to archive successfully")
article_object.archive_url = archive_url
except Exception as e:
article_object.archive_url = "Error while uploading: {}".format(e)
logger.error(f"Error while generating new url: {e}")
return article_object

43
app/utils/worker_template.py Normal file

@ -0,0 +1,43 @@
from threading import Thread
import time
import logging
# logger = logging.getLogger(__name__)
class TemplateWorker(Thread):
"""Parent class for any subsequent worker of the article-download pipeline. They should all run in parallel, thus the Thread subclassing"""
logger = logging.getLogger(__name__)
def __init__(self, *args, **kwargs) -> None:
target = self._queue_processor # will be executed on Worker.start()
group = kwargs.get("group", None)
name = kwargs.get("name", None)
super().__init__(group=group, target=target, name=name)
self._article_queue = []
self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")
def process(self, article_watcher):
self._article_queue.append(article_watcher)#.article_model.article_url)
def _queue_processor(self):
"""This method is launched by thread.run() and idles when self._article_queue is empty. When an external caller appends to the queue it jumps into action"""
while True: # PLEASE tell me if I'm missing an obvious better way of doing this!
if len(self._article_queue) == 0:
time.sleep(5)
else:
article_watcher = self._article_queue.pop(0)
self.logger.info(f"{self.__class__.__name__} is now processing an article")
self._handle_article(article_watcher)
def _handle_article(self, article_watcher, action=None):
# TODO Overload in children classes
if action is None:
self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
else:
article = article_watcher.article
article = action(article) # action updates the article object but does not save the change
article.save()

60
app/utils/workers.py Normal file

@ -0,0 +1,60 @@
from .worker_template import TemplateWorker
from .download.browser import PDFDownloader
from .download.youtube import save_video
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf
import logging
logger = logging.getLogger(__name__)
class DownloadWorker(TemplateWorker):
def __init__(self) -> None:
self.dl_runner = PDFDownloader().download
self.yt_runner = save_video
super().__init__()
def _handle_article(self, article_watcher):
article = article_watcher.article
u = article.article_url
if "youtu.be/" in u or "youtube.com/" in u:
action = self.yt_runner
else:
action = self.dl_runner
super()._handle_article(article_watcher, action)
article_watcher.download_completed = True
class FetchWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = get_description # function
super()._handle_article(article_watcher, action)
article_watcher.fetch_completed = True
class UploadWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = run_upload # function
super()._handle_article(article_watcher, action)
article_watcher.upload_completed = True
class CompressWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = shrink_pdf
super()._handle_article(article_watcher, action)
article_watcher.compression_completed = True

42
app/utils_mail/runner.py Normal file

@ -0,0 +1,42 @@
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
import os
import logging
import configuration
logger = logging.getLogger(__name__)
config = configuration.parsed["MAIL"]
def send(article_model):
mail = MIMEMultipart()
mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
mail['From'] = config["sender"]
mail['To'] = config["recipient"]
msgs = article_model.mail_info # this is html
msg = [m["reply_text"] for m in msgs]
msg = "\n".join(msg)
content = MIMEText(msg, "html")
mail.attach(content)
files = [m["file_path"] for m in msgs if m["file_path"]]
for path in files:
with open(path, 'rb') as file:
part = MIMEApplication(file.read(), "pdf")
# encoders.encode_base64(part)
part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path))
mail.attach(part)
try:
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
smtp.starttls()
smtp.login(config["uname"], config["password"])
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
smtp.quit()
logger.info("Mail successfully sent.")
except Exception as e:
logger.error("Could not send mail for article {}".format(article_model))
logger.info(e)

265
app/utils_slack/message_helpers.py Normal file

@ -0,0 +1,265 @@
import logging
import configuration
import requests
import os
import time
import asyncio
import sys
from slack_sdk.errors import SlackApiError
logger = logging.getLogger(__name__)
config = configuration.parsed["SLACK"]
models = configuration.models
slack_client = "dummy"
LATEST_RECORDED_REACTION = 0
def init(client) -> None:
global slack_client, LATEST_RECORDED_REACTION
slack_client = client
# config["archive_id"] = channel_id
try:
LATEST_RECORDED_REACTION = models.Reaction.select(models.Reaction.id).order_by(models.Reaction.id)[-1].id
except IndexError: #query is actually empty, we have never fetched any messages until now
LATEST_RECORDED_REACTION = 0
# fetch all the messages we could have possibly missed
logger.info("Querying missed messages, threads and reactions. This can take some time.")
fetch_missed_channel_messages()
if "nofetch" in sys.argv:
logger.info("Omitted update of reactions and thread messages because of argument 'nofetch'.")
else: # perform these two asynchronously
async def run_async():
await asyncio.gather(fetch_missed_channel_reactions(), fetch_missed_thread_messages())
asyncio.run(run_async())
def get_past_messages():
"""Gets all messages that have not yet been handled, be it by mistake or by downtime
As the message handler mkaes no distinction between channel messages and thread messages,
we don't have to worry about them here.
"""
threaded_objects = []
for t in models.Thread.select():
if t.message_count > 1: # if only one message was written, it is the channel message
msg = t.last_message
if msg.is_by_human:
threaded_objects.append(msg)
# else don't, nothing to process
logger.info(f"Set {len(threaded_objects)} thread-messages as not yet handled.")
channel_objects = [t.initiator_message for t in models.Thread.select() if t.message_count == 1 and not t.is_fully_processed]
logger.info(f"Set {len(channel_objects)} channel-messages as not yet handled.")
reaction_objects = list(models.Reaction.select().where(models.Reaction.id > LATEST_RECORDED_REACTION))
# the ones newer than the last before the fetch
all_messages = channel_objects + threaded_objects
return all_messages, reaction_objects
def fetch_missed_channel_messages():
# latest processed message_ts is:
presaved = models.Message.select().order_by(models.Message.ts)
if not presaved:
last_ts = 0
else:
last_message = presaved[-1]
last_ts = last_message.slack_ts
result = slack_client.conversations_history(
channel=config["archive_id"],
oldest=last_ts
)
new_messages = result.get("messages", [])
# # filter the last one, it is a duplicate! (only if the db is not empty!)
# if last_ts != 0 and len(new_messages) != 0:
# new_messages.pop(-1)
new_fetches = 0
for m in new_messages:
# print(m)
message_dict_to_model(m)
new_fetches += 1
refetch = result.get("has_more", False)
while refetch: # we have not actually fetched them all
try:
result = slack_client.conversations_history(
channel = config["archive_id"],
cursor = result["response_metadata"]["next_cursor"],
oldest = last_ts
) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
refetch = result.get("has_more", False)
new_messages = result.get("messages", [])
for m in new_messages:
message_dict_to_model(m)
new_fetches += 1
except SlackApiError: # Most likely a rate-limit
logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
time.sleep(int(config["api_wait_time"])) # config values are read as strings
refetch = True
logger.info(f"Fetched {new_fetches} new channel messages.")
async def fetch_missed_thread_messages():
"""After having gotten all base-threads, we need to fetch all their replies"""
# I don't know of a better way: we need to fetch this for each and every thread (except if it is marked as permanently solved)
logger.info("Starting async fetch of thread messages...")
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
new_messages = []
for i,t in enumerate(threads):
try:
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
except SlackApiError:
logger.error("Hit rate limit while querying threaded messages, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
await asyncio.sleep(int(config["api_wait_time"])) # config values are read as strings
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
messages.pop(0) # the first message is the one posted in the channel. We already processed it!
for m in messages:
# only append *new* messages
res = message_dict_to_model(m)
if res:
new_messages.append(res)
logger.info("Fetched {} new threaded messages.".format(len(new_messages)))
async def fetch_missed_channel_reactions():
logger.info("Starting async fetch of channel reactions...")
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
for i,t in enumerate(threads):
try:
query = slack_client.reactions_get(
channel = config["archive_id"],
timestamp = t.slack_ts
)
reactions = query["message"].get("reactions", []) # default = []
except SlackApiError: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. Retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
await asyncio.sleep(int(config["api_wait_time"]))
query = slack_client.reactions_get(
channel = config["archive_id"],
timestamp = t.slack_ts
)
reactions = query["message"].get("reactions", [])
for r in reactions:
reaction_dict_to_model(r, t)
# Helpers for message conversion to db-objects
def reaction_dict_to_model(reaction, thread=None):
if thread is None:
m_ts = reaction["item"]["ts"]
message = models.Message.get(ts = float(m_ts))
thread = message.thread
if "name" in reaction.keys(): # fetched through manual api query
content = reaction["name"]
elif "reaction" in reaction.keys(): # fetched through events
content = reaction["reaction"]
else:
logger.error(f"Weird reaction received: {reaction}")
return None
r, _ = models.Reaction.get_or_create(
type = content,
message = thread.initiator_message
)
logger.info("Saved reaction [{}]".format(content))
return r
def message_dict_to_model(message):
if message["type"] == "message":
thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
uid = message.get("user", "BAD USER")
if uid == "BAD USER":
logger.critical("Message has no user?? {}".format(message))
user, _ = models.User.get_or_create(user_id = uid)
thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
m, new = models.Message.get_or_create(
user = user,
thread = thread,
ts = message["ts"],
channel_id = config["archive_id"],
text = message["text"]
)
logger.info("Saved (text) {} (new={})".format(m, new))
for f in message.get("files", []): #default: []
m.file_type = f["filetype"]
m.perma_link = f["url_private_download"]
m.save()
logger.info("Saved permalink {} to {} (possibly overwriting)".format(f["name"], m))
if new:
return m
else:
return None
else:
logger.warning("What should I do of {}".format(message))
return None
def say_substitute(*args, **kwargs):
logger.info("Now sending message through say-substitute: {}".format(" - ".join(args)))
slack_client.chat_postMessage(
channel=config["archive_id"],
text=" - ".join(args),
**kwargs
)
def save_as_related_file(url, article_object):
r = requests.get(url, headers={"Authorization": "Bearer {}".format(slack_client.token)})
saveto = article_object.save_path
ftype = url[url.rfind(".") + 1:]
fname = "{} - related no {}.{}".format(
article_object.file_name.replace(".pdf",""),
len(article_object.related) + 1,
ftype
)
with open(os.path.join(saveto, fname), "wb") as f:
f.write(r.content)
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return fname
def react_file_path_message(fname, article_object):
saveto = article_object.save_path
file_path = os.path.join(saveto, fname)
if os.path.exists(file_path):
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return True
else:
return False
def is_message_in_archiving(message) -> bool:
if isinstance(message, dict):
return message["channel"] == config["archive_id"]
else:
return message.channel_id == config["archive_id"]
def is_reaction_in_archiving(event) -> bool:
if isinstance(event, dict):
return event["item"]["channel"] == config["archive_id"]
else:
return event.message.channel_id == config["archive_id"]

190
app/utils_slack/runner.py Normal file

@ -0,0 +1,190 @@
from threading import Thread
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler
import logging
from rich.rule import Rule
import configuration
from . import message_helpers
config = configuration.parsed["SLACK"]
models = configuration.models
class BotApp(App):
logger = logging.getLogger(__name__)
def __init__(self, callback, *args, **kwargs):
super().__init__(*args, **kwargs)
# models = models
self.callback = callback
def start(self):
message_helpers.init(self.client)
missed_messages, missed_reactions = message_helpers.get_past_messages()
[self.handle_incoming_message(m) for m in missed_messages]
[self.handle_incoming_reaction(r) for r in missed_reactions]
# self.react_missed_reactions(missed_reactions)
# self.react_missed_messages(missed_messages)
self.startup_status()
def handle_incoming_reaction(self, reaction):
if isinstance(reaction, dict): #else: the reaction is already being passed as a model
# CAUTION: filter for 'changed reactions' those are nasty (usually when adding an url)
reaction = message_helpers.reaction_dict_to_model(reaction)
thread = reaction.message.thread
article_object = thread.article
if not article_object is None:
reaction = reaction.type
status = 1 if reaction == "white_check_mark" else -1
# self.logger.info(f"Applying reaction {reaction} to its root message.")
article_object.verified = status
article_object.save()
def handle_incoming_message(self, message):
"""Reacts to all messages inside channel archiving. Must then
distinguish between threaded replies and new requests
and react accordingly"""
if isinstance(message, dict): #else: the message is already being passed as a model
# CAUTION: filter for 'changed messages' those are nasty (usually when adding an url)
if message.get("subtype", "not bad") == "message_changed":
return False
message = message_helpers.message_dict_to_model(message)
# First check: belongs to thread?
is_threaded = message.thread.message_count > 1 and message != message.thread.initiator_message
if is_threaded:
self.incoming_thread_message(message)
else:
self.incoming_channel_message(message)
def incoming_thread_message(self, message):
if message.user.user_id == config["bot_id"]:
return True # ignore the files uploaded by the bot. We handled them already!
thread = message.thread
if thread.is_fully_processed:
return True
self.logger.info("Receiving thread-message")
self.respond_thread_message(message)
def incoming_channel_message(self, message):
self.logger.info("Handling message with {} url(s)".format(len(message.urls)))
if not message.urls: # no urls in a root-message => IGNORE
message.is_processed_override = True
message.save()
return
# ensure thread is still empty, this is a scenario encountered only in testing, but let's just filter it
if message.thread.message_count > 1:
self.logger.info("Discarded message because it is actually processed.")
return
if len(message.urls) > 1:
message_helpers.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts)
self.callback(message)
# for url in message.urls:
# self.callback(url, message)
# stop here!
def respond_thread_message(self, message, say=message_helpers.say_substitute):
thread = message.thread
article = thread.article
if message.perma_link: # file upload means new data
fname = message_helpers.save_as_related_file(message.perma_link, article)
say("File was saved as 'related file' under `{}`.".format(fname),
thread_ts=thread.slack_ts
)
else: # either a pointer to a new file (too large to upload), or trash
success = message_helpers.react_file_path_message(message.text, article)
if success:
say("File was saved as 'related file'", thread_ts=thread.slack_ts)
else:
self.logger.error("User replied to thread {} but the response did not contain a file/path".format(thread))
say("Cannot process response without associated file.",
thread_ts=thread.slack_ts
)
def respond_channel_message(self, article, say=message_helpers.say_substitute):
# extra={"markup": True}
# self.logger.info(Rule(url[:min(len(url), 30)]))
thread = article.slack_thread.execute()[0]
answers = article.slack_info
for a in answers:
if a["file_path"]:
try: # either, a["file_path"] does not exist, or the upload resulted in an error
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
file = a["file_path"],
thread_ts = thread.slack_ts
)
status = True
except:
say(
"File {} could not be uploaded.".format(a),
thread_ts=thread.slack_ts
)
status = False
else: # anticipated that there is no file!
say(
f"<@{config['responsible_id']}> \n {a['reply_text']}",
thread_ts=thread.slack_ts
)
status = True
# self.logger.info(Rule(f"Fully handled (success={status})"))
def startup_status(self):
threads = [t for t in models.Thread.select()]
all_threads = len(threads)
fully_processed = len([t for t in threads if t.is_fully_processed])
fully_unprocessed = len([t for t in threads if t.message_count == 1])
articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
self.logger.info(f"[bold]STATUS[/bold]: Fully processed {all_threads}/{fully_processed} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
class BotRunner(Thread):
"""Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
def __init__(self, callback, *args, **kwargs) -> None:
self.bot_worker = BotApp(callback, token=config["auth_token"])
@self.bot_worker.event(event="message", matchers=[message_helpers.is_message_in_archiving])
def handle_incoming_message(message, say):
return self.bot_worker.handle_incoming_message(message)
@self.bot_worker.event(event="reaction_added", matchers=[message_helpers.is_reaction_in_archiving])
def handle_incoming_reaction(event, say):
return self.bot_worker.handle_incoming_reaction(event)
target = self.launch
super().__init__(target=target)
def launch(self):
self.bot_worker.start()
SocketModeHandler(self.bot_worker, config["app_token"]).start()
# def respond_to_message(self, message):
# self.bot_worker.handle_incoming_message(message)

320
app/utils_storage/models.py Normal file

@ -0,0 +1,320 @@
import logging
logger = logging.getLogger(__name__)
from peewee import *
import os
import markdown
import re
import configuration
import datetime
config = configuration.parsed["DOWNLOADS"]
slack_config = configuration.parsed["SLACK"]
## Helpers
db = DatabaseProxy()
# set the nature of the db at runtime
class BaseModel(Model):
class Meta:
database = db
## == Article related models == ##
class ArticleDownload(BaseModel):
title = CharField(default='')
pub_date = DateField(default = '')
download_date = DateField(default = datetime.date.today)
source_name = CharField(default = '')
article_url = TextField(default = '', unique=True)
archive_url = TextField(default = '')
file_name = TextField(default = '')
language = CharField(default = '')
summary = TextField(default = '')
comment = TextField(default = '')
verified = IntegerField(default = False)
# authors
# keywords
# ... are added through foreignkeys
def __str__(self) -> str:
return "ART ({} -- {})".format(self.title, self.source_name)
## Useful Properties
@property
def save_path(self):
return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
def fname_nas(self, file_name=""):
if self.download_date:
if file_name:
return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), file_name)
else: # return the self. name
return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), self.file_name)
else:
return None
@property
def fname_template(self):
if self.source_name == "youtube.com":
fname = "{} -- {}".format(self.source_name, self.title)
else:
fname = "{} -- {}.pdf".format(self.source_name, self.title)
return clear_path_name(fname)
@property
def is_title_bad(self): # add incrementally
return "PUR-Abo" in self.title \
or "Redirecting" in self.title \
or "Error while running fetch" in self.title
@property
def slack_info(self):
status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
content = "\n>" + "\n>".join(self.summary.split("\n"))
file_status, msg = self.file_status()
if not file_status:
return [msg]
# everything alright: generate real content
# first the base file
if self.file_name[-4:] == ".pdf":
answer = [{ # main reply with the base pdf
"reply_text" : f"*{self.title}*\n{status}\n{content}",
"file_path" : self.save_path + self.file_name
}]
else: # don't upload if the file is too big!
location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas())
answer = [{ # main reply with the base pdf
"reply_text" : "*{}*\n{}\n{}\n{}".format(self.title, status, content, location),
"file_path" : None
}]
# then the related files
rel_text = ""
for r in self.related:
fname = r.related_file_name
lentry = "\n• `{}` ".format(self.fname_nas(fname))
if fname[-4:] == ".pdf": # this is a manageable file, directly upload
f_ret = self.save_path + fname
answer.append({"reply_text":"", "file_path" : f_ret})
else: # not pdf <=> too large. Don't upload but mention its existence
lentry += "(not uploaded to slack, but the file will be on the NAS)"
rel_text += lentry
if rel_text:
rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
return answer
@property
def mail_info(self):
base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path":None}] + self.slack_info
return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
## Helpers
def set_keywords(self, keywords):
for k in keywords:
ArticleKeyword.create(
article = self,
keyword = k
)
def set_authors(self, authors):
for a in authors:
ArticleAuthor.create(
article = self,
author = a
)
def set_references(self, references):
for r in references:
ArticleReference.create(
article = self,
reference_url = r
)
def set_related(self, related):
for r in related:
ArticleRelated.create(
article = self,
related_file_name = r
)
def file_status(self):
if not self.file_name:
logger.error("Article {} has no filename!".format(self))
return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
file_path_abs = self.save_path + self.file_name
if not os.path.exists(file_path_abs):
logger.error("Article {} has a filename, but the file does not exist at that location!".format(self))
return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
return True, {}
class ArticleKeyword(BaseModel):
# instance gets created for every one keyword -> flexible in size
article = ForeignKeyField(ArticleDownload, backref='keywords')
keyword = CharField()
class ArticleAuthor(BaseModel):
article = ForeignKeyField(ArticleDownload, backref='authors')
author = CharField()
class ArticleReference(BaseModel):
article = ForeignKeyField(ArticleDownload, backref='references')
reference_url = TextField(default = '')
class ArticleRelated(BaseModel):
article = ForeignKeyField(ArticleDownload, backref='related')
related_file_name = TextField(default = '')
## == Slack-thread related models == ##
class User(BaseModel):
user_id = CharField(default='', unique=True)
# messages
class Thread(BaseModel):
"""The threads that concern us are only created if the messages that contain urls"""
thread_ts = FloatField(default = 0)
article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
# provides, ts, user, models
# messages
@property
def slack_ts(self):
str_ts = str(self.thread_ts)
cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # slack uses 6 decimals; if there are fewer, pad with zeros below (e.g. 1234567890.1 -> "1234567890.100000")
return "{}{}".format(str_ts, cut_zeros*"0")
@property
def initiator_message(self):
return self.messages[0] # todo check if this needs sorting
@property
def message_count(self):
return self.messages.count()
@property
def last_message(self):
messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
return messages[-1]
@property
def is_fully_processed(self) -> bool:
init_message = self.initiator_message
if init_message.is_processed_override:
return True
# this override is set for instance, when no url was sent at all. Then set this thread to be ignored
reactions = init_message.reaction
if not reactions:
return False
else:
r = reactions[0].type # can and should only have one reaction
return r == "white_check_mark" \
or r == "x"
class Message(BaseModel):
ts = FloatField(unique=True) #for sorting
channel_id = CharField(default='')
user = ForeignKeyField(User, backref="messages")
text = TextField(default='')
thread = ForeignKeyField(Thread, backref="messages", default=None)
perma_link = CharField(default='')
is_processed_override = BooleanField(default=False)
# reaction
def __str__(self) -> str:
return "MSG ({} -- {})".format(self.channel_id, self.text[:min(len(self.text), 50)].replace("\n","/") + "....")
@property
def slack_ts(self):
str_ts = str(self.ts)
cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # slack uses 6 decimals; if there are fewer, pad with zeros below
return "{}{}".format(str_ts, cut_zeros * "0")
@property
def urls(self):
pattern = r"<(.*?)>"
matches = re.findall(pattern, self.text)
matches = [m for m in matches if "." in m]
new_matches = []
for m in matches:
if "." in m: # must contain a tld, right?
# further complication: slack automatically abbreviates urls in the format:
# <url|link preview>, e.g. <https://example.com|example.com>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
if "|" in m:
keep = m.split("|")[0]
else:
keep = m
new_matches.append(keep)
return new_matches
@property
def is_by_human(self):
return self.user.user_id != slack_config["bot_id"]
@property
def has_single_url(self):
return len(self.urls) == 1
class Reaction(BaseModel):
type = CharField(default = "")
message = ForeignKeyField(Message, backref="reaction")
def create_tables():
with db:
db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated, User, Message, Thread, Reaction])
def set_db(db_object):
db.initialize(db_object)
create_tables()
def clear_path_name(path):
keepcharacters = (' ','.','_', '-')
converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()
return converted
# return re.sub(r'[^\x00-\x7f]', r'_', path)
# # cleared = path.replace("\n"," ")\
# # .replace("|", "_")\
# # .replace(":", "_")\
# # .replace("?", "_")\
# # .replace("!", "_")\
# # .replace(",", "_")\
# # .replace("/", "_")\
# # .replace("\\", "_")\
# # .replace("*", "")\
# # .replace("\"", "'")\
# # .replace("<", "'")\
# # .replace(">", "'")
# # return cleared


@ -0,0 +1,23 @@
import os
import re
import json
os.chdir("/home/remy/Documents/mails2/")
regex = "(?P<url>https?://[^\s]+)"
all_files = os.listdir(".")
all_urls = []
for f in all_files:
with open(f, "r", encoding="utf8") as mail:
content = mail.readlines()
search = "".join(content)
urls = re.findall(regex, search)
all_urls += urls
print("Saved {} urls".format(len(all_urls)))
with open("mails_export.json", "w") as f:
json.dump(all_urls, f)

38
misc/hotfix_mails.py Normal file

@ -0,0 +1,38 @@
import logging
import keys
from peewee import SqliteDatabase
from persistence import article_models
from archiving_utils import runner as archive_runner
from mail_utils import runner as mail_runner
# Global logger setup:
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger("MailThread")
# Constant values...
DOWNLOADS_DB = "/app/file_storage/downloads.db"
# DB Setup:
article_models.set_db(SqliteDatabase(
DOWNLOADS_DB,
pragmas = {'journal_mode': 'wal'} # multiple threads can access at once
))
mail_worker = mail_runner.MailSender(keys.MAIL_UNAME, keys.MAIL_PASSWORD, keys.MAIL_SENDER, keys.MAIL_RECIPIENT)
dl_worker = archive_runner.ArchivingThread(article_models, mail_worker)
dl_worker.start()
# Retroactively sends a message to DIRK for messages that were archived using slack, but when the mail-reply was not yet implemented
url_list = []
for url in url_list:
dl_worker.get_or_save(url)


@ -0,0 +1,88 @@
import time
import keys
import slack_sdk
from slack_sdk.errors import SlackApiError
from peewee import SqliteDatabase
from persistence import message_models
# from bot_utils import messages
# Constant values...
MESSAGES_DB = "/app/file_storage/messages.db"
BOT_ID = "U02MR1R8UJH"
ARCHIVE_ID = "C02MM7YG1V4"
DEBUG_ID = "C02NM2H9J5Q"
client = slack_sdk.WebClient(token=keys.OAUTH_TOKEN)
message_models.set_db(SqliteDatabase(MESSAGES_DB))
def message_dict_to_model(message):
if message["type"] == "message":
thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
uid = message.get("user", "BAD USER")
user, _ = message_models.User.get_or_create(user_id = uid)
thread, _ = message_models.Thread.get_or_create(thread_ts = thread_ts)
m, new = message_models.Message.get_or_create(
user = user,
thread = thread,
ts = message["ts"],
channel_id = ARCHIVE_ID,
text = message["text"]
)
print("Saved (text) {} (new={})".format(m, new))
for f in message.get("files", []): #default: []
m.file_type = f["filetype"]
m.perma_link = f["url_private_download"]
m.save()
print("Saved permalink {} to {} (possibly overwriting)".format(f["name"], m))
if new:
return m
else:
return None
else:
print("What should I do of {}".format(message))
return None
def check_all_past_messages():
last_ts = 0
result = client.conversations_history(
channel=ARCHIVE_ID,
oldest=last_ts
)
new_messages = result.get("messages", []) # fetches 100 messages by default
new_fetches = []
for m in new_messages:
new_fetches.append(message_dict_to_model(m))
# print(result)
refetch = result.get("has_more", False)
print(f"Refetching : {refetch}")
while refetch: # we have not actually fetched them all
try:
result = client.conversations_history(
channel = ARCHIVE_ID,
cursor = result["response_metadata"]["next_cursor"],
oldest = last_ts
) # refetches in batches of 100 messages
refetch = result.get("has_more", False)
new_messages = result.get("messages", [])
for m in new_messages:
new_fetches.append(message_dict_to_model(m))
except SlackApiError: # Most likely a rate-limit
print("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(30))
time.sleep(30)
refetch = True
check_all_past_messages()

38
misc/hotfix_reactions.py Normal file

@ -0,0 +1,38 @@
from peewee import SqliteDatabase
from persistence import article_models, message_models
# Global logger setup:
# Constant values...
DOWNLOADS_DB = "../container_data/downloads.db"
MESSAGES_DB = "../container_data/messages.db"
BOT_ID = "U02MR1R8UJH"
ARCHIVE_ID = "C02MM7YG1V4"
DEBUG_ID = "C02NM2H9J5Q"
# DB Setup:
article_models.set_db(SqliteDatabase(
DOWNLOADS_DB,
pragmas = {'journal_mode': 'wal'} # multiple threads can access at once
))
message_models.set_db(SqliteDatabase(MESSAGES_DB))
for reaction in message_models.Reaction.select():
print(reaction)
thread = reaction.message.thread
articles = message_models.get_referenced_articles(thread, article_models.ArticleDownload)
for a in articles:
print(a)
reaction = reaction.type
status = 1 if reaction == "white_check_mark" else -1
print(status)
for article in articles:
article.verified = status
article.save()

9
requirements.txt Normal file

@ -0,0 +1,9 @@
peewee
selenium
pytube
waybackpy
slack_bolt # relies on slack_sdk
newspaper3k
htmldate
markdown
rich