commit 0a6dde8c7826cae921f5b45144483a1ed4ede373
Author: Remy Moll
Date:   Sun Apr 17 21:58:58 2022 +0200

    A working app with a few bugs sprinkled in. Configuration saved externally

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..b5d67a2
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.dev/
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b6c6fee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.dev/
+
+*.pyc
+*.log
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d4d0203
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM ubuntu:latest
+# Work around interactive tzdata configuration ($CONTAINER_TIMEZONE must be provided to the build)
+RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
+
+RUN apt-get update && apt-get install -y evince libcanberra-gtk-module && apt-get install -y xauth wget tar python3 python3-pip python3-setuptools python3-wheel python3-dev build-essential firefox ghostscript
+
+# Download gecko (firefox) driver for selenium
+RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz
+RUN tar -xzf geckodriver-v0.30.0-linux64.tar.gz -O geckodriver > /usr/bin/geckodriver
+RUN chmod +x /usr/bin/geckodriver
+RUN rm geckodriver-v0.30.0-linux64.tar.gz
+RUN echo "127.0.0.1 localhost" >> /etc/hosts
+
+COPY requirements.txt /app/
+RUN python3 -m pip install --upgrade pip && python3 -m pip install -r /app/requirements.txt
+
+RUN mkdir -p /app/auto_news
+COPY app /app/auto_news
+WORKDIR /app/auto_news
+
+
+ENTRYPOINT ["python3", "runner.py"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a7f23b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+# Auto_news
+
+A utility to fetch article requests from Slack and generate PDFs for them, fully automatically.
+
+
+## Running
+### How to run - auto archiving mode
+In this mode the program is launched as a docker container, in headless mode. For persistence purposes a local storage volume is required, but that's it!
+`docker run -it -v :/app/file_storage/ auto_news`
+
+You can specify additional parameters:
+`docker run -it -v :/app/file_storage/ auto_news debug` runs with debug values (does not write to the prod db, does not send mails)
+`docker run -it -v :/app/file_storage/ auto_news upload` catches up on past uploads to the archive.
+`docker run -it -v :/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check` lets you visually verify the downloaded files. Be aware that it requires additional parameters in order to open GUIs on the host.
+
+
+### How to run - development mode
+In this mode, a docker container is launched with an additional volume: the local code. You can test your code without the need to rebuild the image.
+`docker run -it -v :/app/file_storage/ -v :/code/ --entrypoint /bin/bash auto_news`
+You are dropped into a bash shell, in which you can navigate to the `/code` directory and test live.
+
+
+% ### How to run - file checker mode
+% This mode requires the most access rights. You want to access all files and open GUI programs.
+% `docker run -it -e DISPLAY=":0" --network host -v $XAUTHORITY:/root/.Xauthority -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code auto_news /bin/bash`
% Similarly to the development mode, you can cd into `/code` and run your checking duties.
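The mounted storage volume must also contain the `config.ini` that `app/configuration.py` reads from `/app/file_storage/config.ini`. A minimal sketch of its layout is given below, for orientation only: the section and key names are the ones the code looks up, every value is a placeholder, and the exact expected formats (for example `blacklisted_href_domains`) should be verified against the `utils_*` modules.

```ini
[DATABASE]
db_path_prod = /app/file_storage/
db_path_dev = /app/file_storage/debug/
db_name = downloads.db

[SLACK]
bot_id = U...
responsible_id = U...
archive_id = C...
debug_id = C...
auth_token = xoxb-...
app_token = xapp-...
api_wait_time = 90

[MAIL]
smtp_server = smtp.example.com
port = 587
uname = bot
password = ...
sender = bot@example.com
recipient = archive@example.com

[DOWNLOADS]
local_storage_path = /app/file_storage/files/
default_download_path = /app/file_storage/temp/
remote_storage_path = /remote/nas/path/
browser_profile_path = /app/file_storage/browser_profile
blacklisted_href_domains = facebook.com,twitter.com
```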
+ + + + +## Building + +### Things to keep in mind +The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly clean build the docker image! This is also crucial to update the code itself. + + +`docker build -t auto_news --no-cache .` +where the `Dockerfile` has to be in the working directory + + + +## Cheat-sheet Remy: + +docker run -it -e LIVECODE=TRUE -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code/ auto_news /bin/bash + +docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news \ No newline at end of file diff --git a/app/configuration.py b/app/configuration.py new file mode 100644 index 0000000..25bcd82 --- /dev/null +++ b/app/configuration.py @@ -0,0 +1,42 @@ +import os +import sys +import configparser +import logging +from peewee import SqliteDatabase +from rich.logging import RichHandler + +# first things first: logging +logging.basicConfig( + format='%(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S', + handlers=[RichHandler()] + ) +logger = logging.getLogger(__name__) + + +# load config file containing constants and secrets +parsed = configparser.ConfigParser() +parsed.read("/app/file_storage/config.ini") + +if "debug" in sys.argv: + logger.warning("Running in debugging mode because launched with argument 'debug'") + # parsed.read("/code/config.ini") + + db_path = os.path.join(parsed["DATABASE"]["db_path_dev"], parsed["DATABASE"]["db_name"]) + parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"] + parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"] +else: + logger.warning("Using production values, I hope you know what you're doing...") + + db_path = os.path.join(parsed["DATABASE"]["db_path_prod"], parsed["DATABASE"]["db_name"]) + +from utils_storage import models + +# Set up the database +models.set_db( + SqliteDatabase( + db_path, + pragmas = {'journal_mode': 'wal'} # mutliple threads can read at once + ) +) \ No newline at end of file diff --git a/app/launch.py b/app/launch.py new file mode 100644 index 0000000..1cbb3ef --- /dev/null +++ b/app/launch.py @@ -0,0 +1,9 @@ +import configuration + +from utils_mail import runner + +class Dummy: + source_name = "AS" + title = "dummy title" + mail_info = [{"reply_text": "UNFOOO", "file_path":None}] +runner.send(Dummy()) \ No newline at end of file diff --git a/app/runner.py b/app/runner.py new file mode 100644 index 0000000..8ca103b --- /dev/null +++ b/app/runner.py @@ -0,0 +1,193 @@ +"""Main coordination of other util classes. Handles inbound and outbound calls""" +import configuration +models = configuration.models +import sys +import logging +logger = logging.getLogger(__name__) + +from utils_mail import runner as mail_runner +from utils_slack import runner as slack_runner +from utils.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker + + +class ArticleWatcher: + """Wrapper for a newly created article object. 
Notifies the coordinator upon change/completition""" + def __init__(self, article, **kwargs) -> None: + self.article = article + + self.completition_notifier = kwargs.get("notifier") + + self.fetch = kwargs.get("worker_fetch", None) + self.download = kwargs.get("worker_download", None) + self.compress = kwargs.get("worker_compress", None) + self.upload = kwargs.get("worker_upload", None) + + self.completition_notified = False + # self._download_called = self._compression_called = False + self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False + + # first step: gather metadata + self.fetch.process(self) # this will call the update_status method + self.upload.process(self) # idependdent from the rest + + + def update_status(self, completed_action): + """Checks and notifies internal completition-status. + Article download is complete iff fetch and download were successfull and compression was run + """ + # if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done + # we don't need to delete self though, because it is then automatically garbage-collected + # all_done = self._fetch_completed and self._download_completed and self._compression_completed and self._upload_completed + # if self._fetch_completed and not self._download_called: + # self._download_called = True + # self.download.process(self) + # elif self._download_completed and not self._compression_called: + # self._compression_called = True + # self.compress.process(self) + # elif self._compression_completed: # last step + # self.completition_notifier(self.article) + # # triggers action in Coordinator + # elif self._upload_completed: + # # this case occurs when upload was faster than compression + # pass + # else: + # logger.warning(f"update_status called with unusual configuration {self._fetch_completed},{self._download_completed},{self._compression_completed}") + + if completed_action == "fetch": + self.download.process(self) + elif completed_action == "download": + self.compress.process(self) + elif completed_action == "compress": # last step + self.completition_notifier(self.article) + # triggers action in Coordinator + elif completed_action == "upload": + # this case occurs when upload was faster than compression + pass + else: + logger.warning(f"update_status called with unusual configuration: {completed_action}") + + + # ====== Attributes to be modified by the util workers + @property + def fetch_completed(self): + return self._fetch_completed + + @fetch_completed.setter + def fetch_completed(self, value: bool): + self._fetch_completed = value + self.update_status("fetch") + + @property + def download_completed(self): + return self._download_completed + + @download_completed.setter + def download_completed(self, value: bool): + self._download_completed = value + self.update_status("download") + + @property + def compression_completed(self): + return self._compression_completed + + @compression_completed.setter + def compression_completed(self, value: bool): + self._compression_completed = value + self.update_status("compress") + + @property + def upload_completed(self): + return self._upload_completed + + @upload_completed.setter + def upload_completed(self, value: bool): + self._upload_completed = value + self.update_status("upload") + + + + +class Coordinator: + def __init__(self, **kwargs) -> None: + """Launcher calls this Coordinator as the main thread to handle 
connections between the other workers (threaded).""" + pass + + def add_workers(self, **kwargs): + self.worker_slack = kwargs.pop("worker_slack", None) + self.worker_mail = kwargs.pop("worker_mail", None) + # the two above won't be needed in the Watcher + self.worker_download = kwargs.get("worker_download", None) + self.worker_fetch = kwargs.get("worker_fetch", None) + self.worker_compress = kwargs.get("worker_compress", None) + self.worker_upload = kwargs.get("worker_upload", None) + + self.kwargs = kwargs + + for w in [self.worker_slack, self.worker_download, self.worker_fetch, self.worker_upload, self.worker_compress]: + if not w is None: + w.start() + + + def incoming_request(self, message): + # TODO CHECK ME! + """This method is passed onto the slack worker. It gets triggered when a new message is received.""" + url = message.urls[0] # ignore all the other ones + a, is_new = models.ArticleDownload.get_or_create(article_url=url) + message.thread.article = a + message.thread.save() + + if is_new: + self.kwargs.update({"notifier" : self.article_complete_notifier}) + ArticleWatcher( + a, + **self.kwargs + ) + + # All workers are implemented as a threaded queue. But the individual model requires a specific processing order: + # fetch -> download -> compress -> complete + # the watcher orchestrates the procedure and notifies upon completition + # the watcher will notify once it is sufficiently populated + else: # manually trigger notification immediatly + self.article_complete_notifier(a) + + + + def manual_processing(self, url_list, target_calls): + for url in url_list: + article = models.ArticleDownload.get_or_none(article_url=url) + watcher = ArticleWatcher(article, self.article_complete_notifier) + for t in target_calls: + t.process(watcher) + + def article_complete_notifier(self, article): + self.worker_slack.bot_worker.respond_channel_message(article) + self.worker_mail.send(article) + + + +if __name__ == "__main__": + coordinator = Coordinator() + + + if "upload" in sys.argv: + urls = models.ArticleDownload.select(models.ArticleDownload.article_url).where(models.ArticleDownload.archive_url == "").execute() + logger.info(f"Launching upload to archive for {len(urls)} urls.") + coordinator.manual_processing(urls, [UploadWorker()]) + elif "check" in sys.argv: + logger.info("Not implemented yet.") + else: # launch with full action + kwargs = { + "worker_download" : DownloadWorker(), + "worker_fetch" : FetchWorker(), + "worker_upload" : UploadWorker(), + "worker_compress" : CompressWorker(), + "worker_slack" : slack_runner.BotRunner(coordinator.incoming_request), + "worker_mail" : mail_runner, + } + coordinator.add_workers(**kwargs) + + + + +# TODO +# Resume interrupted article models \ No newline at end of file diff --git a/app/utils/_init__.py b/app/utils/_init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/utils/compress/runner.py b/app/utils/compress/runner.py new file mode 100644 index 0000000..1cffd90 --- /dev/null +++ b/app/utils/compress/runner.py @@ -0,0 +1,40 @@ +import os +import subprocess +import logging +logger = logging.getLogger(__name__) +import configuration +config = configuration.parsed["DOWNLOADS"] + +shrink_sizes = [] + +def shrink_pdf(article): + initial_size = os.path.getsize(article.save_path + article.file_name) + c = subprocess.run( + ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'], + 
stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + if c.returncode == 0: + m = subprocess.run( + ["mv", "-f", f"{config['default_download_path']}/compressed.pdf", article.save_path + article.file_name] + ) + if m.returncode == 0: + final_size = os.path.getsize(article.save_path + article.file_name) + shrink_sizes.append(initial_size - final_size) + logger.info(f"Compression worked. Avg shrinkage: {sum(shrink_sizes)/len(shrink_sizes) / 1000} (kb)") + return article # even though no modifications were made + else: + logger.error(f"Compression ran but I could not copy back the file {m.stderr.decode()} - {m.stdout.decode()}") + + + else: + logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}") + + return article + + + + + +# gs -sDEVICE=pdfwrite -dPDFSETTINGS=/screen -dNOPAUSE -dBATCH -sOutputFile=out.pdf +# ; mv -f temp.pdf file.pdf \ No newline at end of file diff --git a/app/utils/download/__init__.py b/app/utils/download/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/utils/download/browser.py b/app/utils/download/browser.py new file mode 100644 index 0000000..3af0b36 --- /dev/null +++ b/app/utils/download/browser.py @@ -0,0 +1,158 @@ +import time +import datetime +import logging +import os +import base64 +import requests +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +import configuration + +config = configuration.parsed["DOWNLOADS"] + + + +class PDFDownloader: + """Saves a given url. Fills the object it got as a parameter""" + logger = logging.getLogger(__name__) + # status-variable for restarting: + running = False + + def start(self): + options=Options() + options.profile = config["browser_profile_path"] + # TODO: Get headless mode interactively + options.add_argument('--headless') + # options.add_argument("--disable-infobars") + # options.set_preference("javascript.enabled", False) + # options.add_argument("--disable-popup-blocking") + # Print to pdf + options.set_preference("print_printer", "Mozilla Save to PDF") + options.set_preference("print.always_print_silent", True) + options.set_preference("print.show_print_progress", False) + options.set_preference('print.save_as_pdf.links.enabled', True) + options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) + # Save existing pdf + options.set_preference("browser.download.folderList", 2) + # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") + # options.set_preference("pdfjs.disabled", True) + options.set_preference("browser.download.dir", config["default_download_path"]) + + self.logger.info("Now Starting gecko driver") + self.driver = webdriver.Firefox(options=options) + + residues = os.listdir(config["default_download_path"]) + for res in residues: + os.remove(os.path.join(config["default_download_path"], res)) + + self.running = True + + def autostart(self): + if not self.running: + self.start() # relaunch the dl util + + def finish(self): + self.driver.quit() + self.running = False + + + def download(self, article_object): + sleep_time = 1 + self.autostart() + url = article_object.article_url + + # arbitrary bug fixes: + if "focus.de" in url or "bloomberg.com" in url: + url = url.replace("https://", "https://outline.com/") + sleep_time += 5 + try: + self.driver.get(url) + except Exception as e: + self.logger.critical("Selenium .get(url) failed with error {}".format(e)) + self.finish() + return article_object # without changes + + time.sleep(sleep_time) + # leave the page 
time to do any funky business
+
+        # in the meantime, get a page title if required
+        if article_object.is_title_bad:
+            article_object.title = self.driver.title.replace(".pdf","")
+            # will be propagated to dst as well
+
+        fname = article_object.fname_template
+        dst = os.path.join(article_object.save_path, fname)
+        if os.path.exists(dst):
+            fname = make_path_unique(fname)
+            dst = os.path.join(article_object.save_path, fname)
+
+
+        if url[-4:] == ".pdf":
+            # according to the browser preferences, calling the url will open pdfjs.
+            # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
+            success = self.get_exisiting_pdf(url, dst)
+        else:
+            success = self.get_new_pdf(dst)
+
+
+        if success:
+            article_object.file_name = fname
+            article_object.set_references(self.get_references())
+        else:
+            article_object.file_name = ""
+
+        return article_object # this change is saved later manually
+
+
+    def get_exisiting_pdf(self, url, dst):
+        try:
+            r = requests.get(url)
+            bytes = r.content
+        except:
+            return False
+        return self.get_new_pdf(dst, other_bytes=bytes)
+
+
+    def get_new_pdf(self, dst, other_bytes=None):
+        os.makedirs(os.path.dirname(dst), exist_ok=True)
+
+        if other_bytes is None:
+            try:
+                result = self.driver.print_page()
+                bytes = base64.b64decode(result, validate=True)
+            except:
+                self.logger.error("Failed, probably because the driver went extinct.")
+                return False
+        else:
+            bytes = other_bytes
+
+        try:
+            with open(dst, "wb+") as f:
+                f.write(bytes)
+            return True
+        except Exception as e:
+            self.logger.error(f"Failed, because of FS-operation: {e}")
+            return False
+
+
+    def get_references(self):
+        try:
+            hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
+        except:
+            hrefs = []
+        # TODO TEST THIS
+        hrefs = [h for h in hrefs
+                 if not any(domain in h for domain in config["blacklisted_href_domains"])
+                 ] # drop links pointing to blacklisted domains
+        return hrefs
+
+
+
+
+
+
+
+def make_path_unique(path):
+    fname, ending = os.path.splitext(path)
+    fname += datetime.datetime.now().strftime("%d-%H%M%S")
+    return fname + ending
\ No newline at end of file
diff --git a/app/utils/download/runner.py b/app/utils/download/runner.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/utils/download/youtube.py b/app/utils/download/youtube.py
new file mode 100644
index 0000000..014e382
--- /dev/null
+++ b/app/utils/download/youtube.py
@@ -0,0 +1,33 @@
+import logging
+import os
+from pytube import YouTube
+
+logger = logging.getLogger(__name__)
+
+
+def save_video(article_object):
+    """Saves the video according to url and save path"""
+    url = article_object.article_url
+    logger.info("Saving new video")
+    try:
+        yt = YouTube(url)
+        streams = yt.streams.filter(progressive=True).order_by('resolution')
+    except Exception as e:
+        article_object.file_name = "ERROR: {}".format(e)
+        return article_object
+
+    if streams: # if it's not empty
+        vid = streams[-1]
+        article_object.source_name = "youtube.com"
+        article_object.title = yt.title
+        file_path = os.path.join(article_object.save_path, article_object.fname_template)
+        try:
+            vid.download(file_path)
+            article_object.file_name = article_object.fname_template
+        except Exception as e:
+            logger.error(f"Youtube download crashed: {e}")
+            article_object.file_name = "Error while downloading"
+    else:
+        article_object.file_name = "No streams available"
+
+    return article_object
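The download helpers above (`browser.py`, `youtube.py`) operate on a duck-typed article object rather than on the peewee model directly, which makes them easy to exercise in isolation. A minimal smoke-test sketch follows; the import path and the `DummyArticle` attributes are assumptions based on what `save_video()` reads and sets, and are not part of the repository:

```python
# Hypothetical smoke test, not part of the app.
# The real pipeline passes an ArticleDownload model (see app/utils_storage/models.py);
# this dummy only carries the attributes that save_video() actually reads or sets.
# Assumed to be run from the app/ directory so that the `utils` package is importable.
from utils.download.youtube import save_video


class DummyArticle:
    article_url = "https://www.youtube.com/watch?v=..."  # placeholder URL
    save_path = "/tmp/auto_news/"
    source_name = ""
    title = ""
    file_name = ""

    @property
    def fname_template(self):
        # mirrors ArticleDownload.fname_template for the youtube case
        return f"{self.source_name} -- {self.title}"


article = save_video(DummyArticle())
print(article.file_name)  # the saved file name on success, an error message otherwise
```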
diff --git a/app/utils/fetch/runner.py b/app/utils/fetch/runner.py
new file mode 100644
index 0000000..1fc227e
--- /dev/null
+++ b/app/utils/fetch/runner.py
@@ -0,0 +1,60 @@
+from newspaper import Article
+from urllib.parse import urlparse
+from htmldate import find_date
+import datetime
+import logging
+logging.getLogger('newspaper').setLevel(logging.ERROR) # quieter logs
+logging.getLogger('urllib').setLevel(logging.ERROR) # quieter logs
+logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR) # quieter logs
+logging.getLogger('htmldate').setLevel(logging.ERROR) # quieter logs
+logging.getLogger('charset_normalizer').setLevel(logging.ERROR) # quieter logs
+logger = logging.getLogger("fetch")
+
+
+class NewspaperDummy():
+    title = "Error while running fetch"
+    summary = "Error while running fetch"
+    text = "Error while running fetch"
+    authors = []
+    keywords = []
+
+
+def get_description(article_object):
+    url = article_object.article_url
+    website = urlparse(url).netloc
+    article_object.source_name = website
+    try:
+        pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d') # find_date returns ISO-formatted dates
+    except: # other file types
+        pub_date = datetime.datetime(year=1900, month=1, day=1)
+    article_object.pub_date = pub_date
+
+    fallback = NewspaperDummy()
+    try:
+        news_article = Article(url)
+        news_article.download()
+        news_article.parse()
+    except:
+        news_article = fallback
+
+
+    if news_article.title:
+        title = news_article.title
+    else:
+        title = fallback.title
+
+
+    if news_article.summary:
+        summary = news_article.summary
+    elif news_article.text:
+        ind = min(500, len(news_article.text))
+        summary = news_article.text[:ind] + "..."
+    else:
+        summary = fallback.summary
+
+    article_object.title = title
+    article_object.summary = summary
+    article_object.set_authors(news_article.authors)
+    article_object.set_keywords(news_article.keywords)
+
+    return article_object
diff --git a/app/utils/upload/runner.py b/app/utils/upload/runner.py
new file mode 100644
index 0000000..b8d188f
--- /dev/null
+++ b/app/utils/upload/runner.py
@@ -0,0 +1,18 @@
+from waybackpy import WaybackMachineSaveAPI # upload to archive.org
+import logging
+logger = logging.getLogger(__name__)
+
+def upload_to_archive(article_object):
+    """Uploads the article url to archive.org and stores the archived url on the object"""
+    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
+    url = article_object.article_url
+    try:
+        wayback = WaybackMachineSaveAPI(url, user_agent)
+        archive_url = wayback.save()
+        logger.info(f"{url} uploaded to archive successfully")
+        article_object.archive_url = archive_url
+    except Exception as e:
+        article_object.archive_url = "Error while uploading: {}".format(e)
+        logger.error(f"Error while generating new url: {e}")
+
+    return article_object
\ No newline at end of file
diff --git a/app/utils/worker_template.py b/app/utils/worker_template.py
new file mode 100644
index 0000000..d1b44bc
--- /dev/null
+++ b/app/utils/worker_template.py
@@ -0,0 +1,43 @@
+from threading import Thread
+import time
+import logging
+# logger = logging.getLogger(__name__)
+
+
+class TemplateWorker(Thread):
+    """Parent class for any subsequent worker of the article-download pipeline. 
They should all run in parallel, thus the Thread subclassing""" + logger = logging.getLogger(__name__) + + def __init__(self, *args, **kwargs) -> None: + target = self._queue_processor # will be executed on Worker.start() + group = kwargs.get("group", None) + name = kwargs.get("name", None) + + super().__init__(group=group, target=target, name=name) + self._article_queue = [] + self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully") + + + def process(self, article_watcher): + self._article_queue.append(article_watcher)#.article_model.article_url) + + + def _queue_processor(self): + """This method is launched by thread.run() and idles when self._article_queue is empty. When an external caller appends to the queue it jumps into action""" + while True: # PLEASE tell me if I'm missing an obvious better way of doing this! + if len(self._article_queue) == 0: + time.sleep(5) + else: + article_watcher = self._article_queue.pop(0) + self.logger.info(f"{self.__class__.__name__} is now processing an article") + self._handle_article(article_watcher) + + + def _handle_article(self, article_watcher, action=None): + # TODO Overload in children classes + if action is None: + self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod") + else: + article = article_watcher.article + article = action(article) # action updates the article object but does not save the change + article.save() diff --git a/app/utils/workers.py b/app/utils/workers.py new file mode 100644 index 0000000..f29aab0 --- /dev/null +++ b/app/utils/workers.py @@ -0,0 +1,60 @@ +from .worker_template import TemplateWorker +from .download.browser import PDFDownloader +from .download.youtube import save_video +from .fetch.runner import get_description +from .upload.runner import upload_to_archive as run_upload +from .compress.runner import shrink_pdf + +import logging +logger = logging.getLogger(__name__) + +class DownloadWorker(TemplateWorker): + def __init__(self) -> None: + self.dl_runner = PDFDownloader().download + self.yt_runner = save_video + super().__init__() + + def _handle_article(self, article_watcher): + article = article_watcher.article + u = article.article_url + + if "youtu.be/" in u or "youtube.com/" in u: + action = self.yt_runner + else: + action = self.dl_runner + + super()._handle_article(article_watcher, action) + article_watcher.download_completed = True + + + +class FetchWorker(TemplateWorker): + def __init__(self) -> None: + super().__init__() + + def _handle_article(self, article_watcher): + action = get_description # function + super()._handle_article(article_watcher, action) + article_watcher.fetch_completed = True + + + +class UploadWorker(TemplateWorker): + def __init__(self) -> None: + super().__init__() + + def _handle_article(self, article_watcher): + action = run_upload # function + super()._handle_article(article_watcher, action) + article_watcher.upload_completed = True + + + +class CompressWorker(TemplateWorker): + def __init__(self) -> None: + super().__init__() + + def _handle_article(self, article_watcher): + action = shrink_pdf + super()._handle_article(article_watcher, action) + article_watcher.compression_completed = True \ No newline at end of file diff --git a/app/utils_mail/runner.py b/app/utils_mail/runner.py new file mode 100644 index 0000000..b70f0fe --- /dev/null +++ b/app/utils_mail/runner.py @@ -0,0 +1,42 @@ +import smtplib +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from 
email.mime.application import MIMEApplication +import os +import logging +import configuration + +logger = logging.getLogger(__name__) +config = configuration.parsed["MAIL"] + +def send(article_model): + mail = MIMEMultipart() + mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title) + mail['From'] = config["sender"] + mail['To'] = config["recipient"] + + msgs = article_model.mail_info # this is html + msg = [m["reply_text"] for m in msgs] + msg = "\n".join(msg) + + content = MIMEText(msg, "html") + mail.attach(content) + + files = [m["file_path"] for m in msgs if m["file_path"]] + for path in files: + with open(path, 'rb') as file: + part = MIMEApplication(file.read(), "pdf") + # encoders.encode_base64(part) + part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path)) + mail.attach(part) + + try: + smtp = smtplib.SMTP(config["smtp_server"], config["port"]) + smtp.starttls() + smtp.login(config["uname"], config["password"]) + smtp.sendmail(config["sender"], config["recipient"], mail.as_string()) + smtp.quit() + logger.info("Mail successfully sent.") + except Exception as e: + logger.error("Could not send mail for article {}".format(article_model)) + logger.info(e) \ No newline at end of file diff --git a/app/utils_slack/message_helpers.py b/app/utils_slack/message_helpers.py new file mode 100644 index 0000000..84cbc58 --- /dev/null +++ b/app/utils_slack/message_helpers.py @@ -0,0 +1,265 @@ +import logging +import configuration +import requests +import os +import time +import asyncio +import sys +from slack_sdk.errors import SlackApiError + +logger = logging.getLogger(__name__) +config = configuration.parsed["SLACK"] +models = configuration.models +slack_client = "dummy" +LATEST_RECORDED_REACTION = 0 + + +def init(client) -> None: + global slack_client + slack_client = client + + # config["archive_id"] = channel_id + try: + LATEST_RECORDED_REACTION = models.Reaction.select(models.Reaction.id).order_by("id")[-1] + except IndexError: #query is actually empty, we have never fetched any messages until now + LATEST_RECORDED_REACTION = 0 + # fetch all te messages we could have possibly missed + + logger.info("Querying missed messages, threads and reactions. This can take some time.") + fetch_missed_channel_messages() + if "nofetch" in sys.argv: + logger.info("Omitted update of reactions and thread messages because of argument 'nofetch'.") + else: # perform these two asyncronously + async def run_async(): + await asyncio.gather(fetch_missed_channel_reactions(), fetch_missed_thread_messages()) + asyncio.run(run_async()) + + +def get_past_messages(): + """Gets all messages that have not yet been handled, be it by mistake or by downtime + As the message handler mkaes no distinction between channel messages and thread messages, + we don't have to worry about them here. 
+ """ + + threaded_objects = [] + for t in models.Thread.select(): + if t.message_count > 1: # if only one message was written, it is the channel message + msg = t.last_message + if msg.is_by_human: + threaded_objects.append(msg) + # else don't, nothing to process + logger.info(f"Set {len(threaded_objects)} thread-messages as not yet handled.") + + + channel_objects = [t.initiator_message for t in models.Thread.select() if t.message_count == 1 and not t.is_fully_processed] + logger.info(f"Set {len(channel_objects)} channel-messages as not yet handled.") + + reaction_objects = list(models.Reaction.select().where(models.Reaction.id > LATEST_RECORDED_REACTION)) + # the ones newer than the last before the fetch + + all_messages = channel_objects + threaded_objects + return all_messages, reaction_objects + + +def fetch_missed_channel_messages(): + # latest processed message_ts is: + presaved = models.Message.select().order_by(models.Message.ts) + if not presaved: + last_ts = 0 + else: + last_message = presaved[-1] + last_ts = last_message.slack_ts + + result = slack_client.conversations_history( + channel=config["archive_id"], + oldest=last_ts + ) + + new_messages = result.get("messages", []) + # # filter the last one, it is a duplicate! (only if the db is not empty!) + # if last_ts != 0 and len(new_messages) != 0: + # new_messages.pop(-1) + + new_fetches = 0 + for m in new_messages: + # print(m) + message_dict_to_model(m) + new_fetches += 1 + + refetch = result.get("has_more", False) + while refetch: # we have not actually fetched them all + try: + result = slack_client.conversations_history( + channel = config["archive_id"], + cursor = result["response_metadata"]["next_cursor"], + oldest = last_ts + ) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches + refetch = result.get("has_more", False) + + new_messages = result.get("messages", []) + for m in new_messages: + message_dict_to_model(m) + new_fetches += 1 + except SlackApiError: # Most likely a rate-limit + logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"])) + time.sleep(config["api_wait_time"]) + refetch = True + + logger.info(f"Fetched {new_fetches} new channel messages.") + + +async def fetch_missed_thread_messages(): + """After having gotten all base-threads, we need to fetch all their replies""" + # I don't know of a better way: we need to fetch this for each and every thread (except if it is marked as permanently solved) + logger.info("Starting async fetch of thread messages...") + threads = [t for t in models.Thread.select() if not t.is_fully_processed] + new_messages = [] + for i,t in enumerate(threads): + try: + messages = slack_client.conversations_replies( + channel = config["archive_id"], + ts = t.slack_ts, + oldest = t.messages[-1].slack_ts + )["messages"] + except SlackApiError: + logger.error("Hit rate limit while querying threaded messages, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads))) + await asyncio.sleep(config["api_wait_time"]) + messages = slack_client.conversations_replies( + channel = config["archive_id"], + ts = t.slack_ts, + oldest = t.messages[-1].slack_ts + )["messages"] + + messages.pop(0) # the first message is the one posted in the channel. We already processed it! 
+ + for m in messages: + # only append *new* messages + res = message_dict_to_model(m) + if res: + new_messages.append(res) + logger.info("Fetched {} new threaded messages.".format(len(new_messages))) + + +async def fetch_missed_channel_reactions(): + logger.info("Starting async fetch of channel reactions...") + threads = [t for t in models.Thread.select() if not t.is_fully_processed] + for i,t in enumerate(threads): + try: + query = slack_client.reactions_get( + channel = config["archive_id"], + timestamp = t.slack_ts + ) + reactions = query["message"].get("reactions", []) # default = [] + except SlackApiError: # probably a rate_limit: + logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads))) + await asyncio.sleep(config["api_wait_time"]) + reactions = query["message"].get("reactions", []) + + for r in reactions: + reaction_dict_to_model(r, t) + + + + +# Helpers for message conversion to db-objects +def reaction_dict_to_model(reaction, thread=None): + if thread is None: + m_ts = reaction["item"]["ts"] + message = models.Message.get(ts = float(m_ts)) + thread = message.thread + if "name" in reaction.keys(): # fetched through manual api query + content = reaction["name"] + elif "reaction" in reaction.keys(): # fetched through events + content = reaction["reaction"] + else: + logger.error(f"Weird reaction received: {reaction}") + return None + + r, _ = models.Reaction.get_or_create( + type = content, + message = thread.initiator_message + ) + logger.info("Saved reaction [{}]".format(content)) + return r + + +def message_dict_to_model(message): + if message["type"] == "message": + thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"] + uid = message.get("user", "BAD USER") + if uid == "BAD USER": + logger.critical("Message has no user?? 
{}".format(message)) + + user, _ = models.User.get_or_create(user_id = uid) + thread, _ = models.Thread.get_or_create(thread_ts = thread_ts) + m, new = models.Message.get_or_create( + user = user, + thread = thread, + ts = message["ts"], + channel_id = config["archive_id"], + text = message["text"] + ) + logger.info("Saved (text) {} (new={})".format(m, new)) + + for f in message.get("files", []): #default: [] + m.file_type = f["filetype"] + m.perma_link = f["url_private_download"] + m.save() + logger.info("Saved permalink {} to {} (possibly overwriting)".format(f["name"], m)) + if new: + return m + else: + return None + else: + logger.warning("What should I do of {}".format(message)) + return None + + +def say_substitute(*args, **kwargs): + logger.info("Now sending message through say-substitute: {}".format(" - ".join(args))) + slack_client.chat_postMessage( + channel=config["archive_id"], + text=" - ".join(args), + **kwargs + ) + + +def save_as_related_file(url, article_object): + r = requests.get(url, headers={"Authorization": "Bearer {}".format(slack_client.token)}) + saveto = article_object.save_path + ftype = url[url.rfind(".") + 1:] + fname = "{} - related no {}.{}".format( + article_object.file_name.replace(".pdf",""), + len(article_object.related) + 1, + ftype + ) + with open(os.path.join(saveto, fname), "wb") as f: + f.write(r.content) + article_object.set_related([fname]) + logger.info("Added {} to model {}".format(fname, article_object)) + return fname + + +def react_file_path_message(fname, article_object): + saveto = article_object.save_path + file_path = os.path.join(saveto, fname) + if os.path.exists(file_path): + article_object.set_related([fname]) + logger.info("Added {} to model {}".format(fname, article_object)) + return True + else: + return False + + +def is_message_in_archiving(message) -> bool: + if isinstance(message, dict): + return message["channel"] == config["archive_id"] + else: + return message.channel_id == config["archive_id"] + + +def is_reaction_in_archiving(event) -> bool: + if isinstance(event, dict): + return event["item"]["channel"] == config["archive_id"] + else: + return event.message.channel_id == config["archive_id"] diff --git a/app/utils_slack/runner.py b/app/utils_slack/runner.py new file mode 100644 index 0000000..92a88e3 --- /dev/null +++ b/app/utils_slack/runner.py @@ -0,0 +1,190 @@ +from threading import Thread +from slack_bolt import App +from slack_bolt.adapter.socket_mode import SocketModeHandler + +import logging +from rich.rule import Rule +import configuration + +from . 
import message_helpers + + +config = configuration.parsed["SLACK"] +models = configuration.models + +class BotApp(App): + logger = logging.getLogger(__name__) + + def __init__(self, callback, *args, **kwargs): + + super().__init__(*args, **kwargs) + # models = models + self.callback = callback + + def start(self): + message_helpers.init(self.client) + missed_messages, missed_reactions = message_helpers.get_past_messages() + + [self.handle_incoming_message(m) for m in missed_messages] + [self.handle_incoming_reaction(r) for r in missed_reactions] + + # self.react_missed_reactions(missed_reactions) + # self.react_missed_messages(missed_messages) + self.startup_status() + + + + def handle_incoming_reaction(self, reaction): + if isinstance(reaction, dict): #else: the reaction is already being passed as a model + # CAUTION: filter for 'changed reactions' those are nasty (usually when adding an url) + reaction = message_helpers.reaction_dict_to_model(reaction) + + thread = reaction.message.thread + article_object = thread.article + if not article_object is None: + reaction = reaction.type + status = 1 if reaction == "white_check_mark" else -1 + + # self.logger.info(f"Applying reaction {reaction} to its root message.") + article_object.verified = status + article_object.save() + + + def handle_incoming_message(self, message): + """Reacts to all messages inside channel archiving. Must then + distinguish between threaded replies and new requests + and react accordingly""" + if isinstance(message, dict): #else: the message is already being passed as a model + # CAUTION: filter for 'changed messages' those are nasty (usually when adding an url) + if message.get("subtype", "not bad") == "message_changed": + return False + message = message_helpers.message_dict_to_model(message) + + # First check: belongs to thread? + is_threaded = message.thread.message_count > 1 and message != message.thread.initiator_message + if is_threaded: + self.incoming_thread_message(message) + else: + self.incoming_channel_message(message) + + + def incoming_thread_message(self, message): + if message.user.user_id == config["bot_id"]: + return True # ignore the files uploaded by the bot. We handled them already! + + thread = message.thread + if thread.is_fully_processed: + return True + + self.logger.info("Receiving thread-message") + self.respond_thread_message(message) + + + def incoming_channel_message(self, message): + self.logger.info("Handling message with {} url(s)".format(len(message.urls))) + + if not message.urls: # no urls in a root-message => IGNORE + message.is_processed_override = True + message.save() + return + + # ensure thread is still empty, this is a scenario encountered only in testing, but let's just filter it + if message.thread.message_count > 1: + self.logger.info("Discarded message because it is actually processed.") + return + + if len(message.urls) > 1: + message_helpers.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts) + + self.callback(message) + # for url in message.urls: + # self.callback(url, message) + # stop here! 
+ + + + def respond_thread_message(self, message, say=message_helpers.say_substitute): + thread = message.thread + article = thread.article + if message.perma_link: # file upload means new data + fname = message_helpers.save_as_related_file(message.perma_link, article) + say("File was saved as 'related file' under `{}`.".format(fname), + thread_ts=thread.slack_ts + ) + else: # either a pointer to a new file (too large to upload), or trash + success = message_helpers.react_file_path_message(message.text, article) + if success: + say("File was saved as 'related file'", thread_ts=thread.slack_ts) + else: + self.logger.error("User replied to thread {} but the response did not contain a file/path".format(thread)) + say("Cannot process response without associated file.", + thread_ts=thread.slack_ts + ) + + + def respond_channel_message(self, article, say=message_helpers.say_substitute): + # extra={"markup": True} + # self.logger.info(Rule(url[:min(len(url), 30)])) + thread = article.slack_thread.execute()[0] + answers = article.slack_info + for a in answers: + if a["file_path"]: + try: # either, a["file_path"] does not exist, or the upload resulted in an error + self.client.files_upload( + channels = config["archive_id"], + initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}", + file = a["file_path"], + thread_ts = thread.slack_ts + ) + status = True + except: + say( + "File {} could not be uploaded.".format(a), + thread_ts=thread.slack_ts + ) + status = False + else: # anticipated that there is no file! + say( + f"<@{config['responsible_id']}> \n {a['reply_text']}", + thread_ts=thread.slack_ts + ) + status = True + # self.logger.info(Rule(f"Fully handled (success={status})")) + + + def startup_status(self): + threads = [t for t in models.Thread.select()] + all_threads = len(threads) + fully_processed = len([t for t in threads if t.is_fully_processed]) + fully_unprocessed = len([t for t in threads if t.message_count == 1]) + articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1)) + self.logger.info(f"[bold]STATUS[/bold]: Fully processed {all_threads}/{fully_processed} threads. {fully_unprocessed} threads have 0 replies. 
Article-objects to verify: {articles_unprocessed}", extra={"markup": True}) + + + + + +class BotRunner(Thread): + """Stupid encapsulation so that we can apply the slack decorators to the BotApp""" + def __init__(self, callback, *args, **kwargs) -> None: + self.bot_worker = BotApp(callback, token=config["auth_token"]) + + @self.bot_worker.event(event="message", matchers=[message_helpers.is_message_in_archiving]) + def handle_incoming_message(message, say): + return self.bot_worker.handle_incoming_message(message) + + @self.bot_worker.event(event="reaction_added", matchers=[message_helpers.is_reaction_in_archiving]) + def handle_incoming_reaction(event, say): + return self.bot_worker.handle_incoming_reaction(event) + + target = self.launch + super().__init__(target=target) + + + def launch(self): + self.bot_worker.start() + SocketModeHandler(self.bot_worker, config["app_token"]).start() + + + # def respond_to_message(self, message): + # self.bot_worker.handle_incoming_message(message) \ No newline at end of file diff --git a/app/utils_storage/models.py b/app/utils_storage/models.py new file mode 100644 index 0000000..3720aa8 --- /dev/null +++ b/app/utils_storage/models.py @@ -0,0 +1,320 @@ +import logging +logger = logging.getLogger(__name__) + +from peewee import * +import os +import markdown +import re +import configuration +import datetime + +config = configuration.parsed["DOWNLOADS"] +slack_config = configuration.parsed["SLACK"] + +## Helpers +db = DatabaseProxy() +# set the nature of the db at runtime +class BaseModel(Model): + class Meta: + database = db + + +## == Article related models == ## +class ArticleDownload(BaseModel): + title = CharField(default='') + pub_date = DateField(default = '') + download_date = DateField(default = datetime.date.today) + source_name = CharField(default = '') + article_url = TextField(default = '', unique=True) + archive_url = TextField(default = '') + file_name = TextField(default = '') + language = CharField(default = '') + summary = TextField(default = '') + comment = TextField(default = '') + verified = IntegerField(default = False) + # authors + # keywords + # ... are added through foreignkeys + + def __str__(self) -> str: + return "ART ({} -- {})".format(self.title, self.source_name) + + ## Useful Properties + @property + def save_path(self): + return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/" + + def fname_nas(self, file_name=""): + if self.download_date: + if file_name: + return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), file_name) + else: # return the self. 
name + return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), self.file_name) + else: + return None + + @property + def fname_template(self): + if self.source_name == "youtube.com": + fname = "{} -- {}".format(self.source_name, self.title) + else: + fname = "{} -- {}.pdf".format(self.source_name, self.title) + return clear_path_name(fname) + + @property + def is_title_bad(self): # add incrementally + return "PUR-Abo" in self.title \ + or "Redirecting" in self.title \ + or "Error while running fetch" in self.title + + @property + def slack_info(self): + status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1] + content = "\n>" + "\n>".join(self.summary.split("\n")) + file_status, msg = self.file_status() + if not file_status: + return [msg] + + # everything alright: generate real content + # first the base file + if self.file_name[-4:] == ".pdf": + answer = [{ # main reply with the base pdf + "reply_text" : f"*{self.title}*\n{status}\n{content}", + "file_path" : self.save_path + self.file_name + }] + else: # don't upload if the file is too big! + location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas()) + answer = [{ # main reply with the base pdf + "reply_text" : "*{}*\n{}\n{}\n{}".format(self.title, status, content, location), + "file_path" : None + }] + + # then the related files + rel_text = "" + for r in self.related: + fname = r.related_file_name + lentry = "\n• `{}` ".format(self.fname_nas(fname)) + if fname[-4:] == ".pdf": # this is a manageable file, directly upload + f_ret = self.save_path + fname + answer.append({"reply_text":"", "file_path" : f_ret}) + else: # not pdf <=> too large. Don't upload but mention its existence + lentry += "(not uploaded to slack, but the file will be on the NAS)" + + rel_text += lentry + + if rel_text: + rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text + + return answer + + @property + def mail_info(self): + base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path":None}] + self.slack_info + return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base] + + + ## Helpers + def set_keywords(self, keywords): + for k in keywords: + ArticleKeyword.create( + article = self, + keyword = k + ) + + def set_authors(self, authors): + for a in authors: + ArticleAuthor.create( + article = self, + author = a + ) + + def set_references(self, references): + for r in references: + ArticleReference.create( + article = self, + reference_url = r + ) + + def set_related(self, related): + for r in related: + ArticleRelated.create( + article = self, + related_file_name = r + ) + + def file_status(self): + if not self.file_name: + logger.error("Article {} has no filename!".format(self)) + return False, {"reply_text": "Download failed, no file was saved.", "file_path": None} + + file_path_abs = self.save_path + self.file_name + if not os.path.exists(file_path_abs): + logger.error("Article {} has a filename, but the file does not exist at that location!".format(self)) + return False, {"reply_text": "Can't find file. 
Either the download failed or the file was moved.", "file_path": None} + + return True, {} + + +class ArticleKeyword(BaseModel): + # instance gets created for every one keyword -> flexible in size + article = ForeignKeyField(ArticleDownload, backref='keywords') + keyword = CharField() + + +class ArticleAuthor(BaseModel): + article = ForeignKeyField(ArticleDownload, backref='authors') + author = CharField() + + +class ArticleReference(BaseModel): + article = ForeignKeyField(ArticleDownload, backref='references') + reference_url = TextField(default = '') + + +class ArticleRelated(BaseModel): + article = ForeignKeyField(ArticleDownload, backref='related') + related_file_name = TextField(default = '') + + + + +## == Slack-thread related models == ## +class User(BaseModel): + user_id = CharField(default='', unique=True) + # messages + + +class Thread(BaseModel): + """The threads that concern us are only created if the messages that contain urls""" + thread_ts = FloatField(default = 0) + article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None) + # provides, ts, user, models + # messages + + @property + def slack_ts(self): + str_ts = str(self.thread_ts) + cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem! + return "{}{}".format(str_ts, cut_zeros*"0") + + @property + def initiator_message(self): + return self.messages[0] # todo check if this needs sorting + + @property + def message_count(self): + return self.messages.count() + + @property + def last_message(self): + messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation + return messages[-1] + + @property + def is_fully_processed(self) -> bool: + init_message = self.initiator_message + if init_message.is_processed_override: + return True + # this override is set for instance, when no url was sent at all. Then set this thread to be ignored + + reactions = init_message.reaction + if not reactions: + return False + else: + r = reactions[0].type # can and should only have one reaction + return r == "white_check_mark" \ + or r == "x" + + + +class Message(BaseModel): + ts = FloatField(unique=True) #for sorting + channel_id = CharField(default='') + user = ForeignKeyField(User, backref="messages") + text = TextField(default='') + thread = ForeignKeyField(Thread, backref="messages", default=None) + perma_link = CharField(default='') + is_processed_override = BooleanField(default=False) + # reaction + + def __str__(self) -> str: + return "MSG ({} -- {})".format(self.channel_id, self.text[:min(len(self.text), 50)].replace("\n","/") + "....") + + @property + def slack_ts(self): + str_ts = str(self.ts) + cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem! + return "{}{}".format(str_ts, cut_zeros * "0") + + + @property + def urls(self): + pattern = r"<(.*?)>" + matches = re.findall(pattern, self.text) + matches = [m for m in matches if "." in m] + + new_matches = [] + for m in matches: + if "." in m: # must contain a tld, right? + # further complication: slack automatically abreviates urls in the format: + # . 
Lucky for us, "|" is a character derecommended in urls, meaning we can "safely" split for it and retain the first half + if "|" in m: + keep = m.split("|")[0] + else: + keep = m + new_matches.append(keep) + return new_matches + + @property + def is_by_human(self): + return self.user.user_id != slack_config["bot_id"] + + + @property + def has_single_url(self): + return len(self.urls) == 1 + + +class Reaction(BaseModel): + type = CharField(default = "") + message = ForeignKeyField(Message, backref="reaction") + + + + + + + + + + + + +def create_tables(): + with db: + db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated, User, Message, Thread, Reaction]) + + +def set_db(db_object): + db.initialize(db_object) + create_tables() + +def clear_path_name(path): + keepcharacters = (' ','.','_', '-') + converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip() + return converted + # return re.sub(r'[^\x00-\x7f]', r'_', path) + # # cleared = path.replace("\n"," ")\ + # # .replace("|", "_")\ + # # .replace(":", "_")\ + # # .replace("?", "_")\ + # # .replace("!", "_")\ + # # .replace(",", "_")\ + # # .replace("/", "_")\ + # # .replace("\\", "_")\ + # # .replace("*", "")\ + # # .replace("\"", "'")\ + # # .replace("<", "'")\ + # # .replace(">", "'") + # # return cleared + \ No newline at end of file diff --git a/misc/exctract_from_mail_backup.py b/misc/exctract_from_mail_backup.py new file mode 100644 index 0000000..30f94de --- /dev/null +++ b/misc/exctract_from_mail_backup.py @@ -0,0 +1,23 @@ +import os +import re +import json + +os.chdir("/home/remy/Documents/mails2/") + +regex = "(?Phttps?://[^\s]+)" + +all_files = os.listdir(".") +all_urls = [] + +for f in all_files: + with open(f, "r", encoding="utf8") as mail: + content = mail.readlines() + + search = "".join(content) + urls = re.findall(regex, search) + all_urls += urls + +print("Saved {} urls".format(len(all_urls))) + +with open("mails_export.json", "w") as f: + json.dump(all_urls, f) \ No newline at end of file diff --git a/misc/hotfix_mails.py b/misc/hotfix_mails.py new file mode 100644 index 0000000..5634af6 --- /dev/null +++ b/misc/hotfix_mails.py @@ -0,0 +1,38 @@ +import logging +import keys +from peewee import SqliteDatabase + +from persistence import article_models +from archiving_utils import runner as archive_runner +from mail_utils import runner as mail_runner + +# Global logger setup: +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger("MailThread") + + +# Constant values... 
+DOWNLOADS_DB = "/app/file_storage/downloads.db" + + +# DB Setup: +article_models.set_db(SqliteDatabase( + DOWNLOADS_DB, + pragmas = {'journal_mode': 'wal'} # mutliple threads can access at once +)) + + +mail_worker = mail_runner.MailSender(keys.MAIL_UNAME, keys.MAIL_PASSWORD, keys.MAIL_SENDER, keys.MAIL_RECIPIENT) +dl_worker = archive_runner.ArchivingThread(article_models, mail_worker) +dl_worker.start() + + + +# Retroactively sends a message to DIRK for messages that were archived using slack, but when the mail-reply was not yet implemented + + + +url_list = [] + +for url in url_list: + dl_worker.get_or_save(url) \ No newline at end of file diff --git a/misc/hotfix_missed_messages.py b/misc/hotfix_missed_messages.py new file mode 100644 index 0000000..0ad3364 --- /dev/null +++ b/misc/hotfix_missed_messages.py @@ -0,0 +1,88 @@ +import time +import keys +import slack_sdk +from slack_sdk.errors import SlackApiError +from peewee import SqliteDatabase + +from persistence import message_models +# from bot_utils import messages + + + +# Constant values... +MESSAGES_DB = "/app/file_storage/messages.db" + +BOT_ID = "U02MR1R8UJH" +ARCHIVE_ID = "C02MM7YG1V4" +DEBUG_ID = "C02NM2H9J5Q" + + + +client = slack_sdk.WebClient(token=keys.OAUTH_TOKEN) + +message_models.set_db(SqliteDatabase(MESSAGES_DB)) + + +def message_dict_to_model(message): + if message["type"] == "message": + thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"] + uid = message.get("user", "BAD USER") + user, _ = message_models.User.get_or_create(user_id = uid) + thread, _ = message_models.Thread.get_or_create(thread_ts = thread_ts) + m, new = message_models.Message.get_or_create( + user = user, + thread = thread, + ts = message["ts"], + channel_id = ARCHIVE_ID, + text = message["text"] + ) + print("Saved (text) {} (new={})".format(m, new)) + + for f in message.get("files", []): #default: [] + m.file_type = f["filetype"] + m.perma_link = f["url_private_download"] + m.save() + print("Saved permalink {} to {} (possibly overwriting)".format(f["name"], m)) + if new: + return m + else: + return None + else: + print("What should I do of {}".format(message)) + return None + + +def check_all_past_messages(): + last_ts = 0 + + result = client.conversations_history( + channel=ARCHIVE_ID, + oldest=last_ts + ) + + new_messages = result.get("messages", []) # fetches 100 messages by default + + new_fetches = [] + for m in new_messages: + new_fetches.append(message_dict_to_model(m)) + # print(result) + refetch = result.get("has_more", False) + print(f"Refetching : {refetch}") + while refetch: # we have not actually fetched them all + try: + result = client.conversations_history( + channel = ARCHIVE_ID, + cursor = result["response_metadata"]["next_cursor"], + oldest = last_ts + ) # refetches in batches of 100 messages + refetch = result.get("has_more", False) + new_messages = result.get("messages", []) + for m in new_messages: + new_fetches.append(message_dict_to_model(m)) + except SlackApiError: # Most likely a rate-limit + print("Error while fetching channel messages. 
(likely rate limit) Retrying in {} seconds...".format(30)) + time.sleep(30) + refetch = True + + +check_all_past_messages() \ No newline at end of file diff --git a/misc/hotfix_reactions.py b/misc/hotfix_reactions.py new file mode 100644 index 0000000..5e43b37 --- /dev/null +++ b/misc/hotfix_reactions.py @@ -0,0 +1,38 @@ +from peewee import SqliteDatabase + +from persistence import article_models, message_models + +# Global logger setup: + + +# Constant values... +DOWNLOADS_DB = "../container_data/downloads.db" +MESSAGES_DB = "../container_data/messages.db" + +BOT_ID = "U02MR1R8UJH" +ARCHIVE_ID = "C02MM7YG1V4" +DEBUG_ID = "C02NM2H9J5Q" + + +# DB Setup: +article_models.set_db(SqliteDatabase( + DOWNLOADS_DB, + pragmas = {'journal_mode': 'wal'} # mutliple threads can access at once +)) + +message_models.set_db(SqliteDatabase(MESSAGES_DB)) + + + +for reaction in message_models.Reaction.select(): + print(reaction) + thread = reaction.message.thread + articles = message_models.get_referenced_articles(thread, article_models.ArticleDownload) + for a in articles: + print(a) + reaction = reaction.type + status = 1 if reaction == "white_check_mark" else -1 + print(status) + for article in articles: + article.verified = status + article.save() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..50f3bca --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +peewee +selenium +pytube +waybackpy +slack_bolt # relies on slack_sdk +newspaper3k +htmldate +markdown +rich