"""Peewee models for downloaded articles and the Slack threads that request them."""

import datetime
import logging
import os
import re

import markdown
# Star import kept on purpose: other modules may rely on the peewee names
# re-exported from here.
from peewee import *

import configuration

logger = logging.getLogger(__name__)

config = configuration.parsed["DOWNLOADS"]
slack_config = configuration.parsed["SLACK"]

# Matches everything between "<" and ">" (non-greedy); used by Message.urls.
_SLACK_LINK_PATTERN = re.compile(r"<(.*?)>")


## Helpers
db = DatabaseProxy()  # the concrete database is injected at runtime, see set_db()


class BaseModel(Model):
    """Common base class binding every model to the shared database proxy."""

    class Meta:
        database = db


def _format_slack_ts(ts):
    """Return ``ts`` as a string, right-padded with zeros to 6 decimals.

    ``str()`` of a float drops trailing zeros, so the decimal part may be
    shorter than 6 digits. Values that already have 6 or more decimals are
    returned unchanged (a negative pad count yields an empty string).
    """
    str_ts = str(ts)
    decimals = len(str_ts) - str_ts.find(".") - 1
    return "{}{}".format(str_ts, (6 - decimals) * "0")


## == Article related models == ##
class ArticleDownload(BaseModel):
    """One downloaded article together with its metadata and file location."""

    title = CharField(default='')
    pub_date = DateField(default='')
    download_date = DateField(default=datetime.date.today)
    source_name = CharField(default='')
    article_url = TextField(default='', unique=True)
    archive_url = TextField(default='')
    file_name = TextField(default='')
    language = CharField(default='')
    summary = TextField(default='')
    comment = TextField(default='')
    # Used as ``verified + 1`` to index the three status texts in slack_info,
    # i.e. -1 / 0 / 1 select "no better version" / "pending" / "verified".
    verified = IntegerField(default=False)
    # authors
    # keywords
    # ... are added through foreign keys (see the Article* models below)

    def __str__(self) -> str:
        return "ART ({} -- {})".format(self.title, self.source_name)

    ## Useful Properties
    @property
    def save_path(self):
        """Local directory (with trailing slash) derived from the download date."""
        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"

    def fname_nas(self, file_name=""):
        """Return the "NAS: ..." location string for a file of this article.

        Args:
            file_name: name of the file to locate; when empty, the article's
                own ``file_name`` is used.

        Returns:
            The formatted location, or ``None`` if no download date is set.
        """
        if not self.download_date:
            return None
        return "NAS: {}/{}/{}/{}".format(
            config["remote_storage_path"],
            self.download_date.year,
            self.download_date.strftime("%B"),
            file_name or self.file_name,
        )

    @property
    def fname_template(self):
        """Sanitized file name built from source and title (no ".pdf" for youtube.com)."""
        if self.source_name == "youtube.com":
            fname = "{} -- {}".format(self.source_name, self.title)
        else:
            fname = "{} -- {}.pdf".format(self.source_name, self.title)
        return clear_path_name(fname)

    @property
    def is_title_bad(self):
        """True if the title contains one of the known failure markers."""
        # add incrementally
        return "PUR-Abo" in self.title \
            or "Redirecting" in self.title \
            or "Error while running fetch" in self.title

    @property
    def slack_info(self):
        """Build the list of replies (dicts with "reply_text" and "file_path").

        If the main file is missing, the list holds only the error reply from
        ``file_status``. Otherwise the first entry describes the article and
        each related pdf adds one further entry carrying only a file path.
        """
        status = [
            ":x: No better version available",
            ":gear: Verification pending",
            ":white_check_mark: Verified by human",
        ][self.verified + 1]
        # prefix every summary line with ">" so it renders as a quote
        content = "\n>" + "\n>".join(self.summary.split("\n"))

        file_status, msg = self.file_status()
        if not file_status:
            return [msg]

        # everything alright: generate real content
        # first the base file
        if self.file_name.endswith(".pdf"):
            answer = [{  # main reply with the base pdf
                "reply_text": f"*{self.title}*\n{status}\n{content}",
                "file_path": self.save_path + self.file_name,
            }]
        else:  # don't upload if the file is too big!
            location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas())
            answer = [{  # main reply without an attachment
                "reply_text": "*{}*\n{}\n{}\n{}".format(self.title, status, content, location),
                "file_path": None,
            }]

        # then the related files
        related_entries = []
        for r in self.related:
            fname = r.related_file_name
            lentry = "\n• `{}` ".format(self.fname_nas(fname))
            if fname.endswith(".pdf"):
                # this is a manageable file, directly upload
                answer.append({"reply_text": "", "file_path": self.save_path + fname})
            else:
                # not pdf <=> too large. Don't upload but mention its existence
                lentry += "(not uploaded to slack, but the file will be on the NAS)"
            related_entries.append(lentry)

        rel_text = "".join(related_entries)
        if rel_text:
            answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
        return answer

    @property
    def mail_info(self):
        """Same replies as slack_info, preceded by a link reply, rendered via markdown."""
        base = [{
            "reply_text": "[{}]({})\n".format(self.article_url, self.article_url),
            "file_path": None,
        }] + self.slack_info
        return [
            {"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]}
            for m in base
        ]

    ## Helpers
    def set_keywords(self, keywords):
        """Create one ArticleKeyword row per entry of ``keywords``."""
        for k in keywords:
            ArticleKeyword.create(
                article=self,
                keyword=k,
            )

    def set_authors(self, authors):
        """Create one ArticleAuthor row per entry of ``authors``."""
        for a in authors:
            ArticleAuthor.create(
                article=self,
                author=a,
            )

    def set_references(self, references):
        """Create one ArticleReference row per entry of ``references``."""
        for r in references:
            ArticleReference.create(
                article=self,
                reference_url=r,
            )

    def set_related(self, related):
        """Create one ArticleRelated row per entry of ``related``."""
        for r in related:
            ArticleRelated.create(
                article=self,
                related_file_name=r,
            )

    def file_status(self):
        """Check that the article's file exists under ``save_path``.

        Returns:
            ``(True, {})`` if the file exists, otherwise ``(False, reply)``
            where ``reply`` is an error reply dict with "reply_text" and
            "file_path" keys.
        """
        if not self.file_name:
            logger.error("Article %s has no filename!", self)
            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}

        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error("Article %s has a filename, but the file does not exist at that location!", self)
            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}

        return True, {}


class ArticleKeyword(BaseModel):
    # instance gets created for every one keyword -> flexible in size
    article = ForeignKeyField(ArticleDownload, backref='keywords')
    keyword = CharField()


class ArticleAuthor(BaseModel):
    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()


class ArticleReference(BaseModel):
    article = ForeignKeyField(ArticleDownload, backref='references')
    reference_url = TextField(default='')


class ArticleRelated(BaseModel):
    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default='')


## == Slack-thread related models == ##
class User(BaseModel):
    user_id = CharField(default='', unique=True)
    # messages (backref from Message.user)


class Thread(BaseModel):
    """A Slack thread; only created for messages that contain urls."""

    thread_ts = FloatField(default=0)
    article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
    # provides, ts, user, models
    # messages (backref from Message.thread)

    @property
    def slack_ts(self):
        """``thread_ts`` as a string, zero-padded to 6 decimals."""
        return _format_slack_ts(self.thread_ts)

    @property
    def initiator_message(self):
        """The earliest message of the thread.

        Ordered by ``ts`` explicitly: without ORDER BY the database does not
        guarantee which row comes first. Raises IndexError if the thread has
        no messages.
        """
        return self.messages.order_by(Message.ts)[0]

    @property
    def message_count(self):
        return self.messages.count()

    @property
    def last_message(self):
        """The latest message of the thread (highest ``ts``).

        Only one row is fetched instead of loading the whole thread. Raises
        IndexError if the thread has no messages.
        """
        newest_first = (
            Message.select()
            .where(Message.thread == self)
            .order_by(Message.ts.desc())
            .limit(1)
        )
        # can't be empty by definition/creation
        return newest_first[0]

    @property
    def is_fully_processed(self) -> bool:
        """True if the initiator message is overridden or has a final reaction."""
        init_message = self.initiator_message
        if init_message.is_processed_override:
            # this override is set for instance when no url was sent at all.
            # Then set this thread to be ignored
            return True

        reactions = init_message.reaction
        if not reactions:
            return False
        r = reactions[0].type  # can and should only have one reaction
        return r in ("white_check_mark", "x")


class Message(BaseModel):
    ts = FloatField(unique=True)  # for sorting
    channel_id = CharField(default='')
    user = ForeignKeyField(User, backref="messages")
    text = TextField(default='')
    thread = ForeignKeyField(Thread, backref="messages", default=None)
    perma_link = CharField(default='')
    is_processed_override = BooleanField(default=False)
    # reaction (backref from Reaction.message)

    def __str__(self) -> str:
        # slicing already clamps to the string length, no min() needed
        return "MSG ({} -- {})".format(self.channel_id, self.text[:50].replace("\n", "/") + "....")

    @property
    def slack_ts(self):
        """``ts`` as a string, zero-padded to 6 decimals."""
        return _format_slack_ts(self.ts)

    @property
    def urls(self):
        """Url-like contents of all ``<...>`` spans found in ``text``.

        A span must contain a "." to count (it must contain a tld, right?).
        Further complication: slack automatically abbreviates urls in the
        format ``<url|label>``. Lucky for us, "|" is a character discouraged
        in urls, meaning we can "safely" split on it and keep the first half.
        When there is no "|", ``split`` returns the span unchanged.
        """
        matches = _SLACK_LINK_PATTERN.findall(self.text)
        return [m.split("|")[0] for m in matches if "." in m]

    @property
    def is_by_human(self):
        return self.user.user_id != slack_config["bot_id"]

    @property
    def has_single_url(self):
        return len(self.urls) == 1


class Reaction(BaseModel):
    type = CharField(default="")
    message = ForeignKeyField(Message, backref="reaction")


def create_tables():
    """Create the tables of all models defined in this module."""
    with db:
        db.create_tables([
            ArticleDownload,
            ArticleKeyword,
            ArticleAuthor,
            ArticleReference,
            ArticleRelated,
            User,
            Message,
            Thread,
            Reaction,
        ])


def set_db(db_object):
    """Bind the database proxy to ``db_object`` and create all tables."""
    db.initialize(db_object)
    create_tables()


def clear_path_name(path):
    """Replace every character that is not alphanumeric or one of " ._-" with "_".

    Trailing whitespace is stripped from the result.
    """
    keepcharacters = (' ', '.', '_', '-')
    converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()
    return converted