322 lines
11 KiB
Python
322 lines
11 KiB
Python
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
from peewee import *
|
|
import os
|
|
import markdown
|
|
import re
|
|
import configuration
|
|
import datetime
|
|
|
|
## Helpers
# Database placeholders: the concrete database objects are attached at
# runtime (see set_db), so the models can be declared before the kind of
# database is known.
download_db = DatabaseProxy()
chat_db = DatabaseProxy()

# Sections of the parsed project configuration that this module reads.
slack_config = configuration.parsed["SLACK"]
config = configuration.parsed["DOWNLOADS"]
|
|
|
|
class DownloadBaseModel(Model):
    """Base class of every model that is stored in the download database."""

    class Meta:
        # Proxy object; bound to a concrete database by set_db().
        database = download_db
|
|
|
|
class ChatBaseModel(Model):
    """Base class of every model that is stored in the chat database."""

    class Meta:
        # Proxy object; bound to a concrete database by set_db().
        database = chat_db
|
|
|
|
|
|
|
|
## == Article related models == ##
|
|
class ArticleDownload(DownloadBaseModel):
    """A downloaded article, its metadata and helpers to report on it.

    Authors, keywords, references and related files live in their own tables
    and are reachable through the foreign-key backrefs ``authors``,
    ``keywords``, ``references`` and ``related``.
    """

    title = CharField(default='')
    # NOTE(review): '' is not a date; presumably it marks "unknown" - confirm.
    pub_date = DateField(default='')
    download_date = DateField(default=datetime.date.today)
    source_name = CharField(default='')
    article_url = TextField(default='', unique=True)
    archive_url = TextField(default='')
    file_name = TextField(default='')
    language = CharField(default='')
    summary = TextField(default='')
    comment = TextField(default='')
    # -1: no better version available, 0: verification pending,
    #  1: verified by a human (see slack_info).
    verified = IntegerField(default=0)
    # authors
    # keywords
    # ... are added through foreignkeys

    def __str__(self) -> str:
        return f"ART [{self.title} -- {self.source_name}]"

    ## Useful Properties
    @property
    def save_path(self):
        """Local storage directory ``<root>/<year>/<month name>/`` (trailing slash).

        Built from ``config['local_storage_path']`` and ``download_date``.
        """
        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"

    def fname_nas(self, file_name=""):
        """Return the human-readable NAS location of a file of this article.

        Args:
            file_name: name of the file to point at; when empty, the
                article's own ``file_name`` is used.

        Returns:
            A string ``"NAS: <remote root>/<year>/<month name>/<file>"``, or
            ``None`` when the article has no ``download_date``.
        """
        if not self.download_date:
            return None
        return "NAS: {}/{}/{}/{}".format(
            config["remote_storage_path"],
            self.download_date.year,
            self.download_date.strftime("%B"),
            file_name or self.file_name,
        )

    @property
    def fname_template(self):
        """Sanitized file name ``"<source> -- <title>"`` for this article.

        ``.pdf`` is appended unless the source is YouTube.
        """
        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
            fname = "{} -- {}".format(self.source_name, self.title)
        else:
            fname = "{} -- {}.pdf".format(self.source_name, self.title)
        return clear_path_name(fname)

    @property
    def is_title_bad(self):  # add incrementally
        """Whether the title contains one of the known failure markers."""
        bad_markers = ("PUR-Abo", "Redirecting", "Error while running fetch")
        return any(marker in self.title for marker in bad_markers)

    @property
    def slack_info(self):
        """Build the list of Slack replies describing this article.

        Returns:
            A list of dicts with the keys ``reply_text`` and ``file_path``.
            If the main file is missing, the list holds only the failure
            message produced by ``file_status``. Otherwise the first entry is
            the main reply and one extra entry follows per related pdf.
        """
        file_ok, failure_msg = self.file_status()
        if not file_ok:
            return [failure_msg]

        # An explicit mapping instead of list indexing with ``verified + 1``:
        # an unexpected value such as -2 would otherwise wrap around to the
        # last list entry and be reported as "Verified by human".
        status_by_flag = {
            -1: ":x: No better version available",
            0: ":gear: Verification pending",
            1: ":white_check_mark: Verified by human",
        }
        status = status_by_flag.get(self.verified)
        if status is None:
            logger.warning("Article %s has an unexpected 'verified' value: %r", self, self.verified)
            status = status_by_flag[0]

        # Prefix every summary line with ">" so that it renders as a quote.
        content = "\n>" + "\n>".join(self.summary.split("\n"))

        # everything alright: generate real content
        # first the base file
        if self.file_name.endswith(".pdf"):
            answer = [{  # main reply with the base pdf
                "reply_text": f"*{self.title}*\n{status}\n{content}",
                "file_path": self.save_path + self.file_name
            }]
        else:  # don't upload if the file is too big!
            location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas())
            answer = [{  # main reply without an attachment
                "reply_text": "*{}*\n{}\n{}\n{}".format(self.title, status, content, location),
                "file_path": None
            }]

        # then the related files
        related_entries = []
        for rel in self.related:
            fname = rel.related_file_name
            entry = "\n• `{}` ".format(self.fname_nas(fname))
            if fname.endswith(".pdf"):  # this is a manageable file, directly upload
                answer.append({"reply_text": "", "file_path": self.save_path + fname})
            else:  # not pdf <=> too large. Don't upload but mention its existence
                entry += "(not uploaded to slack, but the file will be on the NAS)"
            related_entries.append(entry)

        if related_entries:
            answer[0]["reply_text"] += "\nRelated files:\n" + "".join(related_entries)

        return answer

    @property
    def mail_info(self):
        """Same entries as ``slack_info``, preceded by a link to the article.

        Every ``reply_text`` is converted with ``markdown.markdown``.
        """
        base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path": None}] + self.slack_info
        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]

    ## Helpers
    def set_keywords(self, keywords):
        """Create one ``ArticleKeyword`` row per entry of ``keywords``."""
        for k in keywords:
            ArticleKeyword.create(
                article=self,
                keyword=k
            )

    def set_authors(self, authors):
        """Create one ``ArticleAuthor`` row per entry of ``authors``."""
        for a in authors:
            ArticleAuthor.create(
                article=self,
                author=a
            )

    def set_references(self, references):
        """Create one ``ArticleReference`` row per url in ``references``."""
        for r in references:
            ArticleReference.create(
                article=self,
                reference_url=r
            )

    def set_related(self, related):
        """Create one ``ArticleRelated`` row per file name in ``related``."""
        for r in related:
            ArticleRelated.create(
                article=self,
                related_file_name=r
            )

    def file_status(self):
        """Check that the article's main file exists in local storage.

        Returns:
            ``(True, {})`` when the file exists, otherwise ``(False, msg)``
            where ``msg`` is a reply dict (``reply_text``, ``file_path``)
            explaining the problem. Failures are also logged as errors.
        """
        if not self.file_name:
            logger.error("Article %s has no filename!", self)
            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}

        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error("Article %s has a filename, but the file does not exist at that location!", self)
            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}

        return True, {}
|
|
|
|
|
|
class ArticleKeyword(DownloadBaseModel):
    """One keyword of an article.

    A row is created per keyword, so an article can carry any number of
    them; they are reachable through the ``keywords`` backref.
    """

    article = ForeignKeyField(ArticleDownload, backref='keywords')
    keyword = CharField()
|
|
|
|
|
|
class ArticleAuthor(DownloadBaseModel):
    """One author of an article, reachable through the ``authors`` backref."""

    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()
|
|
|
|
|
|
class ArticleReference(DownloadBaseModel):
    """One url referenced by an article, reachable through the ``references`` backref."""

    article = ForeignKeyField(ArticleDownload, backref='references')
    reference_url = TextField(default='')
|
|
|
|
|
|
class ArticleRelated(DownloadBaseModel):
    """Name of one extra file belonging to an article (``related`` backref)."""

    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default='')
|
|
|
|
|
|
|
|
|
|
## == Slack-thread related models == ##
|
|
class User(ChatBaseModel):
    """A chat user, identified by a unique ``user_id``."""

    user_id = CharField(default='', unique=True)
    # The user's messages are reachable through the ``messages`` backref.
|
|
|
|
|
|
class Thread(ChatBaseModel):
    """The threads that concern us are only created if the base message contains a url"""

    thread_ts = FloatField(default=0)
    article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
    # provides, ts, user, models
    # The thread's messages are reachable through the ``messages`` backref.

    @property
    def slack_ts(self):
        """``thread_ts`` as a string with exactly six decimals.

        Fixed-point formatting replaces manual zero padding of ``str(ts)``,
        which produced a malformed value whenever the string had no decimal
        point (for instance the integer default ``0`` became ``"000000"``).
        """
        return f"{self.thread_ts:.6f}"

    @property
    def initiator_message(self):
        """The earliest message of the thread (by ``ts``), or ``None`` if empty.

        The messages are ordered explicitly, because a query without an
        ``ORDER BY`` gives no guarantee about the row order.
        """
        first = self.messages.order_by(Message.ts).first()
        if first is None:
            logger.warning(f"Thread {self} is empty. How can that be?")
        return first

    @property
    def message_count(self):
        """Number of messages that belong to this thread."""
        # logger.warning("message_count was called")
        return self.messages.count()

    @property
    def last_message(self):
        """The latest message of the thread (by ``ts``).

        Raises:
            IndexError: if the thread has no messages (should not happen,
                threads are created together with their first message).
        """
        # Newest first and limited to one row, so that only the needed
        # message is requested instead of indexing [-1] on the full result.
        newest_first = (
            Message.select()
            .where(Message.thread == self)
            .order_by(Message.ts.desc())
            .limit(1)
        )
        return newest_first[0]

    @property
    def is_fully_processed(self) -> bool:
        """Whether the thread needs no further handling.

        True if the initiator message carries the processed override, or if
        its first reaction is ``white_check_mark`` or ``x``.
        """
        init_message = self.initiator_message
        if init_message is None:
            return False

        # this override is set for instance, when no url was sent at all.
        # Then set this thread to be ignored
        if init_message.is_processed_override:
            return True

        reactions = init_message.reaction
        if not reactions:
            return False

        # can and should only have one reaction
        return reactions[0].type in ("white_check_mark", "x")
|
|
|
|
|
|
|
|
class Message(ChatBaseModel):
    """A chat message written by a ``User`` inside a ``Thread``."""

    ts = FloatField(unique=True)  # for sorting
    channel_id = CharField(default='')
    user = ForeignKeyField(User, backref="messages")
    text = TextField(default='')
    thread = ForeignKeyField(Thread, backref="messages", default=None)
    file_type = CharField(default='')
    perma_link = CharField(default='')
    is_processed_override = BooleanField(default=False)
    # The message's reactions are reachable through the ``reaction`` backref.

    def __str__(self) -> str:
        # At most the first 30 characters, newlines shown as "/".
        return "MSG [{}]".format(self.text[:30].replace('\n', '/') + '...')

    @property
    def slack_ts(self):
        """``ts`` as a string with exactly six decimals.

        Fixed-point formatting replaces manual zero padding of ``str(ts)``,
        which produced a malformed value whenever the string had no decimal
        point (for instance ``0`` became ``"000000"``).
        """
        return f"{self.ts:.6f}"

    @property
    def urls(self):
        """Urls found in the message text.

        Every ``<...>`` group that contains a "." (it must contain a tld) is
        taken as a url. Slack abbreviates urls as ``<url|link preview>``;
        since "|" is discouraged in urls, only the part before the first "|"
        is kept.
        """
        pattern = r"<(.*?)>"
        return [m.split("|")[0] for m in re.findall(pattern, self.text) if "." in m]

    @property
    def is_by_human(self):
        """Whether the author is someone other than the configured bot."""
        return self.user.user_id != slack_config["bot_id"]

    @property
    def has_single_url(self):
        """Whether the message text contains exactly one url."""
        return len(self.urls) == 1
|
|
|
|
|
|
class Reaction(ChatBaseModel):
    """A reaction attached to a message (``reaction`` backref on ``Message``)."""

    # Name of the reaction; Thread.is_fully_processed compares it against
    # "white_check_mark" and "x".
    type = CharField(default="")
    message = ForeignKeyField(Message, backref="reaction")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_tables():
    """Create the tables of the download models and of the chat models.

    Each database is used as a context manager around its own
    ``create_tables`` call.
    """
    download_models = [ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated]
    chat_models = [User, Message, Thread, Reaction]

    with download_db:
        download_db.create_tables(download_models)

    with chat_db:
        chat_db.create_tables(chat_models)
|
|
|
|
|
|
def set_db(chat_db_object, download_db_object):
    """Attach concrete databases to both proxies, then create all tables.

    Args:
        chat_db_object: database that backs the chat models.
        download_db_object: database that backs the article models.
    """
    bindings = (
        (chat_db, chat_db_object),
        (download_db, download_db_object),
    )
    for proxy, concrete_db in bindings:
        proxy.initialize(concrete_db)
    create_tables()
|
|
|
|
def clear_path_name(path):
    """Return ``path`` with every unsafe character replaced by "_".

    Alphanumeric characters, spaces, dots, underscores and dashes are kept;
    trailing whitespace of the result is stripped.
    """
    allowed_extra = {' ', '.', '_', '-'}

    def _sanitize(char):
        # Keep the character only if it is alphanumeric or explicitly allowed.
        if char.isalnum() or char in allowed_extra:
            return char
        return "_"

    return "".join(map(_sanitize, path)).rstrip()
|
|
|