Better structure

This commit is contained in:
Remy Moll 2022-04-17 22:32:56 +02:00
parent 0a6dde8c78
commit 0d76bcbb98
15 changed files with 310 additions and 58 deletions

1
.gitignore vendored
View File

@ -2,3 +2,4 @@
*.pyc
*.log
__pycache__/

View File

@ -6,26 +6,25 @@ A utility to fetch article requests from slack and generate pdfs for them, fully
## Running
### How to run - auto archiving mode
In this mode the program is launched as a docker container, in a headless mode. For persistence purposes a local storage volume is required, but that's it!
`docker run -it -v <your storage>:/app/file_storage/ auto_news`
You can specify additional parameters:
`docker run -it -v <your storage>:/app/file_storage/ auto_news debug` runs with debug values (does not write to prod db, does not send mails)
`docker run -it -v <your storage>:/app/file_storage/ auto_news upload` catches up on past uploads to archive.
`docker run -it -v <your storage>:/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check` lets you visually verify the downloaded files. Be aware that it requires additional parameters in order to open guis on the host.
`docker run -it -v <your storage>:/app/file_storage/ auto_news upload` catches up on incomplete uploads to archive.
`docker run -it -v <your storage>:/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check` lets you visually verify the downloaded files. The additional parameters are required in order to open guis on the host.
### How to run - development mode
In this mode, a docker container is launched with an additional volume, the local code. You can test your code without the need to rebuild the image.
`docker run -it -v <your storage>:/app/file_storage/ -v <your code>:/code/ --entrypoint /bin/bash auto_news`
You are dropped into a bash shell, in which you can navigate to the `/code` directory and then test live.
% ### How to run - file checker mode
% This mode requires the most access rights. You want to access all files and open gui programs.
% `docker run -it -e DISPLAY=":0" --network host -v $XAUTHORITY:/root/.Xauthority -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code auto_news /bin/bash`
% Similarly to the development mode, you can cd into code and run your checking duties.
## Building
@ -41,6 +40,17 @@ where the `Dockerfile` has to be in the working directory
## Cheat-sheet Remy:
docker run -it -e LIVECODE=TRUE -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/DOWNLOADS/auto_news/app:/code/ auto_news /bin/bash
`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news`
docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news
`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news`
`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check`
## Roadmap:
[] automatically upload files to NAS
[] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites
...

View File

@ -7,7 +7,7 @@ logger = logging.getLogger(__name__)
from utils_mail import runner as mail_runner
from utils_slack import runner as slack_runner
from utils.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
class ArticleWatcher:
@ -174,7 +174,8 @@ if __name__ == "__main__":
logger.info(f"Launching upload to archive for {len(urls)} urls.")
coordinator.manual_processing(urls, [UploadWorker()])
elif "check" in sys.argv:
logger.info("Not implemented yet.")
from utils_check import runner as check_runner
check_runner.verify_unchecked()
else: # launch with full action
kwargs = {
"worker_download" : DownloadWorker(),

285
app/utils_check/runner.py Normal file
View File

@ -0,0 +1,285 @@
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule
console = Console()
hline = Rule(style="white")
import os
import subprocess
from slack_sdk import WebClient
import configuration
models = configuration.models
# Menu presented to the reviewer for each unverified article: maps the key to
# press to a short description of the action. The dict's insertion order is the
# order in which the options are displayed.
u_options = {
    "ENTER": "Accept PDF as is. It gets marked as verified",
    "D": "set language to DE and set verified",
    "E": "set language to EN and set verified",
    "O": "set other language (prompted)",
    "R": "set related files (prompted multiple times)",
    "B": "reject and move to folder BAD",
    "L": "leave file as is, do not send reaction",
}
# Slack client shared by this module; authenticated with the token stored under
# SLACK / auth_token in the parsed configuration.
bot_client = WebClient(token=configuration.parsed["SLACK"]["auth_token"])
def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
    """Print a summary table for one article, followed by the available options.

    Args:
        file_url: Used as the title of the table.
        file_attributes: Dicts with the keys "name", "value" and "status";
            each one becomes a table row.
        options: Maps a key label to the description of its action.
    """
    overview = Table(
        title=file_url,
        row_styles=["white", "bright_black"],
        min_width=150,
    )
    overview.add_column("Attribute", justify="right", no_wrap=True)
    overview.add_column("Value set by auto_news")
    overview.add_column("Status", justify="right")
    for attribute in file_attributes:
        overview.add_row(attribute["name"], attribute["value"], attribute["status"])

    # Keys and descriptions are rendered as two multi-line cells side by side.
    key_lines = []
    action_lines = []
    for key, action in options.items():
        key_lines.append(f"[[bold]{key}[/bold]]")
        action_lines.append(f"[italic]{action}[/italic]")
    legend = Columns(["\n".join(key_lines), "\n".join(action_lines)])

    console.print(overview)
    console.print("Your options:")
    console.print(legend)
def send_reaction_to_slack_thread(article, reaction):
    """Add `reaction` to the slack messages whose text contains the article url.

    The reaction records the verification status on the thread; according to
    the original author's note this significantly decreases load times of the
    bot. Nothing is sent when more than 5 messages match, and messages that do
    not have a single url are skipped.
    """
    matching_messages = models.Message.select().where(
        models.Message.text.contains(article.article_url)
    )
    # TODO: rewrite this lookup
    if len(matching_messages) > 5:
        print("Found more than 5 messages. Aborting reactions...")
        return

    for message in matching_messages:
        if not message.has_single_url:
            print("Found thread but won't send reaction because thread has multiple urls")
            continue
        bot_client.reactions_add(
            channel=configuration.parsed["SLACK"]["archive_id"],
            name=reaction,
            timestamp=message.slack_ts,
        )
        print("Sent reaction to message")
def prompt_language(query):
    """Ask for a two-letter language code until a valid one is given, then store it.

    The answer is stripped of surrounding whitespace and lower-cased, and it
    must consist of exactly two letters; anything else is rejected and the
    prompt is repeated. The accepted code is assigned to ``query.language``
    and persisted with ``query.save()``.
    """
    while True:
        code = input("Set language (nation-code, 2 letters) ").strip().lower()
        # The prompt asks for letters, so length alone is not a sufficient check.
        if len(code) == 2 and code.isalpha():
            break
        print("Bad code, try again")
    query.language = code
    query.save()
def prompt_related(query):
    """Collect related file names from the user and hand them to ``query.set_related``.

    Every answer is appended to the list as typed; entering '1' ends the
    prompt and passes the collected list (possibly empty) to
    ``query.set_related``.
    """
    related_files = []
    while True:
        answer = input("Additional file for article? Type '1' to cancel ")
        if answer == "1":
            break
        related_files.append(answer)
    query.set_related(related_files)
def prompt_new_fname(query):
    """Ask for a new file name, store it on the entry and mark it as verified.

    The file previously referenced by ``query.file_name`` is removed from
    ``query.save_path``, unless no name was set before or the name did not
    change. A missing old file is reported but does not prevent the entry
    from being saved.
    """
    new_fname = input("New fname? ")
    old_fname = query.file_name
    query.file_name = new_fname
    query.verified = 1
    # Deleting when the name is unchanged would remove the very file the entry
    # now points to.
    if old_fname and old_fname != new_fname:
        try:
            # os.path.join also works when save_path has no trailing separator
            os.remove(os.path.join(query.save_path, old_fname))
        except FileNotFoundError:
            print("Old file not found, nothing removed")
    query.save()
def reject_article(article):
    """Mark the article as bad (``verified = -1``), save it and react with "x" on slack."""
    article.verified = -1
    article.save()
    print("Article marked as bad")

    # The reaction also updates the threads so that they are not monitored anymore.
    send_reaction_to_slack_thread(article, "x")
def unreject_article(query):
    """Set the entry's ``verified`` flag to 1 and save it.

    Only the database entry is updated; the file itself is not renamed
    (the rename below is disabled).
    """
    query.verified = 1
    query.save()
    # os.rename(badpdf, fname)
    print("File set to verified")
def accept_article(article, last_accepted):
    """Mark the article as verified, save it and react with a check mark on slack.

    Args:
        article: Entry to accept. Its ``verified`` flag is set to 1 and
            ``save()`` is called; its ``article_url`` is used to find the
            slack messages that receive the reaction.
        last_accepted: Unused. Kept so that existing callers keep working.

    Returns:
        Always an empty string.
    """
    article.verified = 1
    article.save()
    print("Article accepted as GOOD")
    # also update the threads to not be monitored anymore
    send_reaction_to_slack_thread(article, "white_check_mark")
    # NOTE: the former thread-linking logic (look up the thread following
    # `last_accepted`, fall back to a brute-force search, then override the
    # initiator message as processed) was disabled and only survived as a
    # no-op string literal. It has been removed; see version control history.
    return ""
def _attribute_entry(name, value):
    """Build one overview row; the status is red when the value is missing, green otherwise."""
    # Test for None / -1 before anything else: calling len() on them raises TypeError.
    missing = value is None or value == -1 or value == ""
    return {
        "status": "[red]██[/red]" if missing else "[green]██[/green]",
        "value": "not set" if missing else str(value),
        "name": name,
    }


def verify_unchecked():
    """Interactively review every ArticleDownload entry whose ``verified`` flag is 0.

    For each entry the stored file is opened in evince, an overview of its
    attributes is printed and the user is prompted until an option that
    concludes the review is chosen (see ``u_options``). Entries whose file
    cannot be opened are skipped.
    """
    query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
    last_linked = None
    for article in query:
        console.print(hline)
        core_info = [
            _attribute_entry("Save path", article.save_path),
            _attribute_entry("File name", article.file_name),
            _attribute_entry("Title", article.title),
            _attribute_entry("Language", article.language),
        ]
        try:
            # Discard the viewer's output (suppresses evince gtk warnings).
            # Pipes that are never read could block the viewer once they fill up.
            subprocess.Popen(
                ["evince", f"file://{os.path.join(article.save_path, article.file_name)}"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except (OSError, TypeError, ValueError) as e:
            # e.g. evince is not installed, or path / file name are not set
            print(f"Could not open the PDF viewer: {e}")
            continue

        file_overview(
            file_url=article.article_url,
            file_attributes=core_info,
            options=u_options,
        )

        proceed = False
        while not proceed:
            uin = input("Choice ?").lower()
            if uin == "":
                last_linked = accept_article(article, last_linked)  # last linked accelerates the whole process
                proceed = True
            elif uin in ("d", "e"):
                # NOTE(review): unlike ENTER, these options do not send a slack
                # reaction - confirm whether that is intended.
                article.language = "de" if uin == "d" else "en"
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "o":
                # language is stored, but the review of this article continues
                prompt_language(article)
            elif uin == "r":
                prompt_related(article)
            elif uin == "b":
                reject_article(article)
                proceed = True
            elif uin == "l":
                # do nothing
                proceed = True
            else:
                print("Invalid input")
# def verify_bad():
# b_options = {
# "ENTER":"Accept pdf as fixed",
# "B": "Keep pdf in BAD.",
# "R" : "set related files (prompted multiple times)",
# "C" : "Change the saved file-name and set as verified."
# }
# query = article_models.ArticleDownload.select().where(article_models.ArticleDownload.verified == -1).execute()
# for q in query:
# pdf = q.file_name
# save_dir = get_save_path(q)
# fname = save_dir + "BAD/" + pdf
# try:
# subprocess.call(["xdg-open", fname])
# except:
# print(f"[{testvar}██{testvar}] PDF moved:")
# print(fname)
# continue
# status_pdf = f"{testvar}██{testvar}"
# if "just a moment" in pdf:
# status_pdf = f"{testvar}██{testvar}"
# language = q.language
# status_language = f"{testvar}██{testvar}"
# if len(language) == 0:
# status_language = f"{testvar}██{testvar}"
# print_status_options(
# status=u_status.format(
# url = q.article_url,
# status_pdf = status_pdf,
# pdf = pdf[:80],
# status_language = status_language,
# language = language
# ),
# options = b_options)
# proceed = False
# while not proceed:
# proceed = False
# uin = input("Choice? ").lower()
# if uin == "":
# unreject_article(q)
# proceed = True
# elif uin == "b":
# proceed = True
# elif uin == "r":
# prompt_related(q)
# elif uin == "c":
# prompt_new_fname(q)
# proceed = True
# else:
# print("Invalid input")

View File

@ -31,10 +31,3 @@ def shrink_pdf(article):
logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
return article
# gs -sDEVICE=pdfwrite -dPDFSETTINGS=/screen -dNOPAUSE -dBATCH -sOutputFile=out.pdf
# ; mv -f temp.pdf file.pdf

View File

@ -29,7 +29,7 @@ class TemplateWorker(Thread):
time.sleep(5)
else:
article_watcher = self._article_queue.pop(0)
self.logger.info(f"{self.__class__.__name__} is now processing an article")
self.logger.info(f"{self.__class__.__name__} is now processing article ({len(self._article_queue)} in queue)")
self._handle_article(article_watcher)

View File

@ -1,38 +0,0 @@
import logging
import keys
from peewee import SqliteDatabase
from persistence import article_models
from archiving_utils import runner as archive_runner
from mail_utils import runner as mail_runner
# Global logger setup
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger("MailThread")

# Path of the sqlite database holding the downloads
DOWNLOADS_DB = "/app/file_storage/downloads.db"

# DB setup: WAL journal mode, so that multiple threads can access the database at once
article_models.set_db(
    SqliteDatabase(DOWNLOADS_DB, pragmas={'journal_mode': 'wal'})
)

mail_worker = mail_runner.MailSender(
    keys.MAIL_UNAME,
    keys.MAIL_PASSWORD,
    keys.MAIL_SENDER,
    keys.MAIL_RECIPIENT,
)
dl_worker = archive_runner.ArchivingThread(article_models, mail_worker)
dl_worker.start()

# Retroactively sends a message to DIRK for messages that were archived using
# slack at a time when the mail reply was not yet implemented.
url_list = []
for url in url_list:
    dl_worker.get_or_save(url)