diff --git a/.gitignore b/.gitignore index b6c6fee..39274ed 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ *.pyc *.log +__pycache__/ \ No newline at end of file diff --git a/README.md b/README.md index a7f23b2..9970347 100644 --- a/README.md +++ b/README.md @@ -6,26 +6,25 @@ A utility to fetch article requests from slack and generate pdfs for them, fully ## Running ### How to run - auto archiving mode In this mode the program is launched as a docker container, in a headless mode. For persistence purposes a local storage volume is required, but that's it! + `docker run -it -v :/app/file_storage/ auto_news` You can specify additional parameters: + `docker run -it -v :/app/file_storage/ auto_news debug` runs with debug values (does not write to prod db, does not send mails) -`docker run -it -v :/app/file_storage/ auto_news upload` catches up on past uploads to archive. -`docker run -it -v :/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check` lets you visually verify the downloaded files. Be aware that it requires additional parameters in order to open guis on the host. + +`docker run -it -v :/app/file_storage/ auto_news upload` catches up on incomplete uploads to archive. + +`docker run -it -v :/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check` lets you visually verify the downloaded files. The additional parameters are required in order to open guis on the host. ### How to run - development mode In this mode, a docker container is launched with an additional volume, the local code. You can test your code without the need to rebuild the image. + `docker run -it -v :/app/file_storage/ -v :/code/ --entry-point /bin/bash auto_news` You are droppped into a bash shell, in which you can navigate to the `/code` directory and then test live. -% ### How to run - file checker mode -% This mode requires the most access rights. You want to access all files and open gui programs. -% `docker run -it -e DISPLAY=":0" --network host -v $XAUTHORITY:/root/.Xauthority -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code auto_news /bin/bash` -% Similarly to the development mode, you can cd into code and run your checking duties. - - ## Building @@ -41,6 +40,17 @@ where the `Dockerfile` has to be in the working directory ## Cheat-sheet Remy: -docker run -it -e LIVECODE=TRUE -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code/ auto_news /bin/bash +`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news` -docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news \ No newline at end of file +`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news` + + +`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check` + + + +## Roadmap: + +[] automatically upload files to NAS +[] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites +... \ No newline at end of file diff --git a/app/runner.py b/app/runner.py index 8ca103b..82c4c72 100644 --- a/app/runner.py +++ b/app/runner.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) from utils_mail import runner as mail_runner from utils_slack import runner as slack_runner -from utils.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker +from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker class ArticleWatcher: @@ -174,7 +174,8 @@ if __name__ == "__main__": logger.info(f"Launching upload to archive for {len(urls)} urls.") coordinator.manual_processing(urls, [UploadWorker()]) elif "check" in sys.argv: - logger.info("Not implemented yet.") + from utils_check import runner as check_runner + check_runner.verify_unchecked() else: # launch with full action kwargs = { "worker_download" : DownloadWorker(), diff --git a/app/utils_check/runner.py b/app/utils_check/runner.py new file mode 100644 index 0000000..7563848 --- /dev/null +++ b/app/utils_check/runner.py @@ -0,0 +1,285 @@ +from rich.console import Console +from rich.table import Table +from rich.columns import Columns +from rich.rule import Rule +console = Console() +hline = Rule(style="white") + +import os +import subprocess +from slack_sdk import WebClient +import configuration +models = configuration.models + +u_options = { + "ENTER" : "Accept PDF as is. It gets marked as verified", + "D" : "set languange to DE and set verified", + "E" : "set languange to EN and set verified", + "O" : "set other language (prompted)", + "R" : "set related files (prompted multiple times)", + "B" : "reject and move to folder BAD", + "L" : "leave file as is, do not send reaction" +} + + +bot_client = WebClient( + token = configuration.parsed["SLACK"]["auth_token"] +) + + + + + +def file_overview(file_url: str, file_attributes: list, options: dict) -> None: + """Prints a neat overview of the current article""" + file_table = Table( + title = file_url, + row_styles = ["white", "bright_black"], + min_width = 150 + ) + + file_table.add_column("Attribute", justify = "right", no_wrap = True) + file_table.add_column("Value set by auto_news") + file_table.add_column("Status", justify = "right") + for attr in file_attributes: + file_table.add_row(attr["name"], attr["value"], attr["status"]) + + + option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()]) + option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()]) + columns = Columns([option_key, option_action]) + + console.print(file_table) + console.print("Your options:") + console.print(columns) + + +def send_reaction_to_slack_thread(article, reaction): + """Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot""" + messages = models.Message.select().where(models.Message.text.contains(article.article_url)) + # TODO rewrite this shit + if len(messages) > 5: + print("Found more than 5 messages. Aborting reactions...") + return + for m in messages: + if not m.has_single_url: + print("Found thread but won't send reaction because thread has multiple urls") + pass + else: + ts = m.slack_ts + bot_client.reactions_add( + channel=configuration.parsed["SLACK"]["archive_id"], + name=reaction, + timestamp=ts + ) + print("Sent reaction to message") + +def prompt_language(query): + not_set = True + while not_set: + uin = input("Set language (nation-code, 2 letters) ") + if len(uin) != 2: + print("Bad code, try again") + else: + not_set = False + query.language = uin + query.save() + + +def prompt_related(query): + file_list = [] + finished = False + while not finished: + uin = input("Additional file for article? Type '1' to cancel ") + if uin == "1": + query.set_related(file_list) + finished = True + else: + file_list.append(uin) + + +def prompt_new_fname(query): + uin = input("New fname? ") + old_fname = query.file_name + query.file_name = uin + query.verified = 1 + if old_fname != "": + os.remove(query.save_path + old_fname) + query.save() + + + +def reject_article(article): + article.verified = -1 + article.save() + print("Article marked as bad") + # also update the threads to not be monitored anymore + send_reaction_to_slack_thread(article, "x") + + +def unreject_article(query): + query.verified = 1 + query.save() + # os.rename(badpdf, fname) + print("File set to verified") + + +def accept_article(article, last_accepted): + article.verified = 1 + article.save() + print("Article accepted as GOOD") + + # also update the threads to not be monitored anymore + send_reaction_to_slack_thread(article, "white_check_mark") + + """linked = None + try: + thread = message_models.Thread.get(id = last_accepted.id + 1) + rel = message_models.get_referenced_articles(thread, article_models.ArticleDownload) + assert len(rel) == 1 and rel[0] == article + linked = thread + except: # if the above, naive method (just increment by one), fails, resort to brute search. + print("Bruteforcing search") + for t in message_models.Thread.select(): + rel = message_models.get_referenced_articles(t, article_models.ArticleDownload) + if len(rel) == 1 and rel[0] == article: + linked = t + break + + if linked: + linked.initiator_message.is_processed_override = 1 + linked.initiator_message.save() + print("Message overwritten to PROCESSED") + + else: + print("No matching thread found")""" + return "" # linked + + + + + + +def verify_unchecked(): + query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute() + last_linked = None + + for article in query: + console.print(hline) + core_info = [] + for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]): + entry = { + "status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]", + "value" : e if len(e) != 0 else "not set", + "name" : name + } + core_info.append(entry) + + try: + subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # supress evince gtk warnings + except Exception as e: + print(str(list((-1, e)))) + continue + + + + file_overview( + file_url = article.article_url, + file_attributes=core_info, + options = u_options + ) + + + proceed = False + while not proceed: + proceed = False + uin = input("Choice ?").lower() + if uin == "": + last_linked = accept_article(article, last_linked) # last linked accelerates the whole process + proceed = True + elif uin == "d": + article.language = "de" + article.verified = 1 + article.save() + proceed = True + elif uin == "e": + article.language = "en" + article.verified = 1 + article.save() + proceed = True + elif uin == "o": + prompt_language(article) + elif uin == "r": + prompt_related(article) + elif uin == "b": + reject_article(article) + proceed = True + elif uin == "l": + # do nothing + proceed = True + else: + print("Invalid input") + + + + +# def verify_bad(): +# b_options = { +# "ENTER":"Accept pdf as fixed", +# "B": "Keep pdf in BAD.", +# "R" : "set related files (prompted multiple times)", +# "C" : "Change the saved file-name and set as verified." +# } +# query = article_models.ArticleDownload.select().where(article_models.ArticleDownload.verified == -1).execute() + +# for q in query: +# pdf = q.file_name +# save_dir = get_save_path(q) +# fname = save_dir + "BAD/" + pdf +# try: +# subprocess.call(["xdg-open", fname]) +# except: +# print(f"[{testvar}██{testvar}] PDF moved:") +# print(fname) +# continue + +# status_pdf = f"{testvar}██{testvar}" +# if "just a moment" in pdf: +# status_pdf = f"{testvar}██{testvar}" + +# language = q.language +# status_language = f"{testvar}██{testvar}" +# if len(language) == 0: +# status_language = f"{testvar}██{testvar}" + + +# print_status_options( +# status=u_status.format( +# url = q.article_url, +# status_pdf = status_pdf, +# pdf = pdf[:80], +# status_language = status_language, +# language = language +# ), +# options = b_options) + + + +# proceed = False +# while not proceed: +# proceed = False +# uin = input("Choice? ").lower() +# if uin == "": +# unreject_article(q) +# proceed = True +# elif uin == "b": +# proceed = True +# elif uin == "r": +# prompt_related(q) +# elif uin == "c": +# prompt_new_fname(q) +# proceed = True +# else: +# print("Invalid input") + diff --git a/app/utils/_init__.py b/app/utils_worker/_init__.py similarity index 100% rename from app/utils/_init__.py rename to app/utils_worker/_init__.py diff --git a/app/utils/compress/runner.py b/app/utils_worker/compress/runner.py similarity index 92% rename from app/utils/compress/runner.py rename to app/utils_worker/compress/runner.py index 1cffd90..5f67bb9 100644 --- a/app/utils/compress/runner.py +++ b/app/utils_worker/compress/runner.py @@ -31,10 +31,3 @@ def shrink_pdf(article): logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}") return article - - - - - -# gs -sDEVICE=pdfwrite -dPDFSETTINGS=/screen -dNOPAUSE -dBATCH -sOutputFile=out.pdf -# ; mv -f temp.pdf file.pdf \ No newline at end of file diff --git a/app/utils/download/__init__.py b/app/utils_worker/download/__init__.py similarity index 100% rename from app/utils/download/__init__.py rename to app/utils_worker/download/__init__.py diff --git a/app/utils/download/browser.py b/app/utils_worker/download/browser.py similarity index 100% rename from app/utils/download/browser.py rename to app/utils_worker/download/browser.py diff --git a/app/utils/download/runner.py b/app/utils_worker/download/runner.py similarity index 100% rename from app/utils/download/runner.py rename to app/utils_worker/download/runner.py diff --git a/app/utils/download/youtube.py b/app/utils_worker/download/youtube.py similarity index 100% rename from app/utils/download/youtube.py rename to app/utils_worker/download/youtube.py diff --git a/app/utils/fetch/runner.py b/app/utils_worker/fetch/runner.py similarity index 100% rename from app/utils/fetch/runner.py rename to app/utils_worker/fetch/runner.py diff --git a/app/utils/upload/runner.py b/app/utils_worker/upload/runner.py similarity index 100% rename from app/utils/upload/runner.py rename to app/utils_worker/upload/runner.py diff --git a/app/utils/worker_template.py b/app/utils_worker/worker_template.py similarity index 96% rename from app/utils/worker_template.py rename to app/utils_worker/worker_template.py index d1b44bc..96be787 100644 --- a/app/utils/worker_template.py +++ b/app/utils_worker/worker_template.py @@ -29,7 +29,7 @@ class TemplateWorker(Thread): time.sleep(5) else: article_watcher = self._article_queue.pop(0) - self.logger.info(f"{self.__class__.__name__} is now processing an article") + self.logger.info(f"{self.__class__.__name__} is now processing article ({len(self._article_queue)} in queue)") self._handle_article(article_watcher) diff --git a/app/utils/workers.py b/app/utils_worker/workers.py similarity index 100% rename from app/utils/workers.py rename to app/utils_worker/workers.py diff --git a/misc/hotfix_mails.py b/misc/hotfix_mails.py deleted file mode 100644 index 5634af6..0000000 --- a/misc/hotfix_mails.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging -import keys -from peewee import SqliteDatabase - -from persistence import article_models -from archiving_utils import runner as archive_runner -from mail_utils import runner as mail_runner - -# Global logger setup: -logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') -logger = logging.getLogger("MailThread") - - -# Constant values... -DOWNLOADS_DB = "/app/file_storage/downloads.db" - - -# DB Setup: -article_models.set_db(SqliteDatabase( - DOWNLOADS_DB, - pragmas = {'journal_mode': 'wal'} # mutliple threads can access at once -)) - - -mail_worker = mail_runner.MailSender(keys.MAIL_UNAME, keys.MAIL_PASSWORD, keys.MAIL_SENDER, keys.MAIL_RECIPIENT) -dl_worker = archive_runner.ArchivingThread(article_models, mail_worker) -dl_worker.start() - - - -# Retroactively sends a message to DIRK for messages that were archived using slack, but when the mail-reply was not yet implemented - - - -url_list = [] - -for url in url_list: - dl_worker.get_or_save(url) \ No newline at end of file