reduced slack functionality, higher ease of use. Database migration wip
This commit is contained in:
		
							
								
								
									
										10
									
								
								news_fetch/utils_storage/helpers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								news_fetch/utils_storage/helpers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
def clear_path_name(path):
    """Return ``path`` made safe for use as a file name.

    Every character that is neither alphanumeric nor one of space, ``.``,
    ``_`` or ``-`` is replaced by ``_``. Trailing whitespace is stripped
    from the result.
    """
    allowed = (' ', '.', '_', '-')
    pieces = []
    for char in path:
        if char.isalnum() or char in allowed:
            pieces.append(char)
        else:
            pieces.append("_")
    return "".join(pieces).rstrip()
|  | ||||
def shorten_name(name, offset = 50):
    """Return ``name`` cut to at most ``offset`` characters.

    Names longer than ``offset`` are truncated and get ``"..."`` appended;
    shorter (or equally long) names are returned unchanged.
    """
    return name if len(name) <= offset else name[:offset] + "..."
							
								
								
									
										67
									
								
								news_fetch/utils_storage/migrations/migration.001.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								news_fetch/utils_storage/migrations/migration.001.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,67 @@ | ||||
| from playhouse.migrate import * | ||||
|  | ||||
|  | ||||
| """ | ||||
| This migration assumes that downloads.db kept the exact same structure as before. | ||||
| messages.db should drop the table articlemodelreference in favor of a new field article in the thread-table | ||||
| Since each thread is constrained to exactly one article this makes the most sense. | ||||
|  | ||||
| This migration assumes that messages.db gets a new field in the table thread: | ||||
| id  |   thread_ts | article_id | ||||
|  | ||||
| We now need to migrate from the table articlemodelreference and then delete it. | ||||
| """ | ||||
|  | ||||
|  | ||||
# The chat database is the only one this migration touches.
db = SqliteDatabase("/code/.dev/messages.db")
migrator = SqliteMigrator(db)

# Nullable, so thread rows that already exist stay valid once the column is added.
article_field = IntegerField(null=True)

# Schema change: thread gains the article_id column.
add_article_column = migrator.add_column('thread', 'article_id', article_field)
migrate(add_article_column)
|  | ||||
|  | ||||
|  | ||||
| # these are the old models, adapted to the migration | ||||
|  | ||||
class BaseModel(Model):
    """Base class that binds the legacy chat models to ``db``."""

    class Meta:
        database = db
|  | ||||
class User(BaseModel):
    """A chat user, identified by a unique ``user_id`` string."""

    user_id = CharField(default='', unique=True)
|  | ||||
class Thread(BaseModel):
    """A thread; the threads that concern us are only created for messages that contain urls."""

    thread_ts = FloatField(default=0)
    # column added by this migration; empty until it is back-filled
    article_id = IntegerField(null=True)
|  | ||||
|      | ||||
class Message(BaseModel):
    """A single chat message, linked to its author and to the thread it belongs to."""

    ts = FloatField(unique=True)  # for sorting
    channel_id = CharField(default='')
    user = ForeignKeyField(User, backref="messages")
    text = TextField(default='')
    thread = ForeignKeyField(Thread, backref="messages", default=None)
    file_type = CharField(default='')
    perma_link = CharField(default='')
    is_processed_override = BooleanField(default=False)
|  | ||||
|  | ||||
class ArticleModelReference(BaseModel):
    """Legacy link from a message to an article id; this is the table the migration retires."""

    message = ForeignKeyField(Message, backref='article_model_references')
    article_model_id = IntegerField(default=0)
|  | ||||
|  | ||||
|  | ||||
|  | ||||
# Copy every article reference onto the thread of the message it points at.
# The back-fill runs inside one transaction so that a failure part-way
# through does not leave only some threads updated.
with db.atomic():
    for ref in ArticleModelReference.select():
        thread = ref.message.thread
        thread.article_id = ref.article_model_id
        thread.save()

# drop_tables takes a sequence of models. "(ArticleModelReference)" is just
# the class wrapped in parentheses, not a one-element tuple, so pass a list.
db.drop_tables([ArticleModelReference])
							
								
								
									
										297
									
								
								news_fetch/utils_storage/models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										297
									
								
								news_fetch/utils_storage/models.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,297 @@ | ||||
| import logging | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
| from peewee import * | ||||
| import os | ||||
| import markdown | ||||
| import re | ||||
| import configuration | ||||
| import datetime | ||||
|  | ||||
| from . import helpers | ||||
# Settings sections used by the models, looked up once at import time.
config = configuration.main_config["DOWNLOADS"]
slack_config = configuration.main_config["SLACK"]

# Placeholder database: the concrete database object is set at runtime.
download_db = DatabaseProxy()
|  | ||||
|  | ||||
class DownloadBaseModel(Model):
    """Base class that binds the download-related models to ``download_db``."""

    class Meta:
        database = download_db
|  | ||||
|  | ||||
|  | ||||
| ## == Article related models == ## | ||||
class ArticleDownload(DownloadBaseModel):
    """A downloaded article: its url, fetched metadata, stored file and review state.

    Besides the database fields, the class offers helpers that derive storage
    paths from the download date and that build the reply payloads
    (``slack_info`` / ``mail_info``) describing the article.
    """

    # in the beginning this is all we have
    article_url = TextField(default = '', unique=True)

    # fetch then fills in the metadata
    title = CharField(default='')

    @property
    def is_title_bad(self):  # add incrementally
        """True when the title contains one of the known failure markers."""
        return "PUR-Abo" in self.title \
            or "Redirecting" in self.title \
            or "Error while running fetch" in self.title

    summary = TextField(default = '')
    source_name = CharField(default = '')
    language = CharField(default = '')

    file_name = TextField(default = '')

    @property
    def save_path(self):
        """Local directory (with trailing slash) for this article: <local root>/<year>/<month name>/."""
        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"

    def _nas_location(self, file_name=""):
        """Return the NAS location string for ``file_name``.

        Falls back to the article's own ``file_name`` when ``file_name`` is
        empty. Returns ``None`` when the article has no download date.
        """
        if not self.download_date:
            return None
        name = file_name if file_name else self.file_name
        return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{name}"

    @property
    def fname_nas(self, file_name=""):
        """NAS location string of the main file, or ``None`` without a download date.

        A property getter is never called with arguments, so ``file_name``
        always keeps its default here; use ``_nas_location`` to get the
        location of another file.
        """
        return self._nas_location(file_name)

    @property
    def fname_template(self):
        """Sanitised file name built from source and title; ``.pdf`` is appended unless the source is youtube."""
        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
            fname = f"{self.source_name} -- {self.title}"
        else:
            fname = f"{self.source_name} -- {self.title}.pdf"
        return helpers.clear_path_name(fname)

    archive_url = TextField(default = '')
    pub_date = DateField(default = '')
    download_date = DateField(default = datetime.date.today)

    slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by

    @property
    def slack_ts_full(self):
        """``slack_ts`` as a string, right-padded with zeros to 6 decimals."""
        str_ts = str(self.slack_ts)
        cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals
        return f"{str_ts}{cut_zeros * '0'}"

    sent = BooleanField(default = False)

    # NOTE: os.getenv is evaluated once, when the class body runs; it yields None if UNAME is unset
    archived_by = CharField(default = os.getenv("UNAME"))
    # need to know who saved the message because the file needs to be on their computer in order to get verified
    # verification happens in a different app, but the model has the fields here as well
    comment = TextField(default = '')
    verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad

    # authors
    # keywords
    # ... are added through foreignkeys
    # we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db


    ## Helpers specific to a single article
    def __str__(self) -> str:
        """Short label: shortened title and source when both are set, otherwise the url."""
        if self.title != '' and self.source_name != '':
            desc = f"{helpers.shorten_name(self.title)} -- {self.source_name}"
        else:
            desc = f"{self.article_url}"
        return f"ART [{desc}]"

    @property
    def slack_info(self):
        """Build the list of reply dicts (``reply_text`` / ``file_path``) for this article.

        When ``file_status`` reports a problem, the list holds only its error
        message. Otherwise the first entry describes the main file and one
        further entry is added per related pdf; related files that are not
        pdfs are only mentioned in the text of the first entry.
        """
        # verified is -1 / 0 / 1, shifted by one to index the list
        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
        content = "\n>" + "\n>".join(self.summary.split("\n"))
        file_status, msg = self.file_status()
        if not file_status:
            return [msg]

        # everything alright: generate real content
        # first the base file
        if self.file_name.endswith(".pdf"):
            answer = [{ # main reply with the base pdf
                "reply_text" : f"*{self.title}*\n{status}\n{content}",
                "file_path" : self.save_path + self.file_name
            }]
        else: # don't upload if the file is too big!
            location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
            answer = [{ # main reply without an attachment
                "reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
                "file_path" : None
            }]

        # then the related files
        rel_text = ""
        for r in self.related:
            fname = r.related_file_name
            # fname_nas is a property and cannot be called with a file name,
            # so the location of a related file comes from the helper method
            lentry = "\n• `{}` ".format(self._nas_location(fname))
            if fname.endswith(".pdf"): # this is a manageable file, directly upload
                answer.append({"reply_text": "", "file_path": self.save_path + fname})
            else: # not pdf <=> too large. Don't upload but mention its existence
                lentry += "(not uploaded to slack, but the file will be on the NAS)"

            rel_text += lentry

        if rel_text:
            answer[0]["reply_text"] += "\nRelated files:\n" + rel_text

        return answer

    @property
    def mail_info(self):
        """Same entries as ``slack_info``, preceded by a link to the article, with the text rendered through markdown."""
        base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]

    def set_authors(self, authors):
        """Create one ``ArticleAuthor`` row per entry in ``authors``."""
        for a in authors:
            ArticleAuthor.create(
                article = self,
                author = a
                )

    def set_related(self, related):
        """Create one ``ArticleRelated`` row per file name in ``related``."""
        for r in related:
            ArticleRelated.create(
                article = self,
                related_file_name = r
            )

    def file_status(self):
        """Check that the article's file exists locally.

        Returns a ``(ok, message)`` tuple: ``(True, {})`` when the file is
        present, otherwise ``False`` and a reply dict describing the problem.
        """
        if not self.file_name:
            logger.error(f"Article {self} has no filename!")
            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}

        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}

        return True, {}
|  | ||||
|  | ||||
class ArticleAuthor(DownloadBaseModel):
    """One author of an article; reachable from the article through the ``authors`` backref."""

    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()
|  | ||||
|  | ||||
class ArticleRelated(DownloadBaseModel):
    """A file related to an article (full text of a paper, audio files, etc.); reachable through the ``related`` backref."""

    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default='')
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| # class Thread(ChatBaseModel): | ||||
| #     """The threads that concern us are only created if the base message contains a url""" | ||||
| #     thread_ts = FloatField(default = 0) | ||||
| #     article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None) | ||||
| #     # provides, ts, user, models | ||||
| #     # messages | ||||
|  | ||||
| #     @property | ||||
| #     def slack_ts(self): | ||||
| #         str_ts = str(self.thread_ts) | ||||
| #         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem! | ||||
| #         return "{}{}".format(str_ts, cut_zeros*"0") | ||||
|  | ||||
| #     @property | ||||
| #     def initiator_message(self): | ||||
| #         try: | ||||
| #             return self.messages[0] # TODO check if this needs sorting | ||||
| #         except IndexError: | ||||
| #             logger.warning(f"Thread {self} is empty. How can that be?") | ||||
| #             return None | ||||
|  | ||||
| #     @property | ||||
| #     def message_count(self): | ||||
| #         # logger.warning("message_count was called") | ||||
| #         return self.messages.count() | ||||
|  | ||||
| #     @property | ||||
| #     def last_message(self): | ||||
| #         messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation | ||||
| #         return messages[-1] | ||||
|  | ||||
| #     @property | ||||
| #     def is_fully_processed(self) -> bool: | ||||
| #         init_message = self.initiator_message | ||||
| #         if init_message is None: | ||||
| #             return False | ||||
|          | ||||
| #         if init_message.is_processed_override: | ||||
| #             return True | ||||
| #         # this override is set for instance, when no url was sent at all. Then set this thread to be ignored | ||||
|          | ||||
| #         reactions = init_message.reaction | ||||
| #         if not reactions: | ||||
| #             return False | ||||
| #         else: | ||||
| #             r = reactions[0].type # can and should only have one reaction | ||||
| #             return r == "white_check_mark" \ | ||||
| #                 or r == "x" | ||||
|  | ||||
|  | ||||
|      | ||||
| # class Message(ChatBaseModel): | ||||
| #     ts = FloatField(unique=True) #for sorting | ||||
| #     channel_id = CharField(default='') | ||||
| #     user = ForeignKeyField(User, backref="messages") | ||||
| #     text = TextField(default='') | ||||
| #     thread = ForeignKeyField(Thread, backref="messages", default=None) | ||||
| #     file_type = CharField(default='') | ||||
| #     perma_link = CharField(default='') | ||||
| #     is_processed_override = BooleanField(default=False) | ||||
| #     # reaction | ||||
|  | ||||
| #     def __str__(self) -> str: | ||||
| #         return "MSG [{}]".format(shorten_name(self.text).replace('\n','/')) | ||||
|  | ||||
| #     @property | ||||
| #     def slack_ts(self): | ||||
| #         str_ts = str(self.ts) | ||||
| #         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem! | ||||
| #         return "{}{}".format(str_ts, cut_zeros * "0") | ||||
|  | ||||
|  | ||||
| #     @property | ||||
| #     def urls(self): | ||||
| #         pattern = r"<(.*?)>" | ||||
| #         matches = re.findall(pattern, self.text) | ||||
| #         matches = [m for m in matches if "." in m] | ||||
|          | ||||
| #         new_matches = [] | ||||
| #         for m in matches: | ||||
| #             if "." in m:  # must contain a tld, right? | ||||
| #                 # further complication: slack automatically abbreviates urls in the format:  | ||||
| #                 # <url|link preview>. Lucky for us, "|" is a character derecommended in urls, meaning we can "safely" split for it and retain the first half | ||||
| #                 if "|" in m: | ||||
| #                     keep = m.split("|")[0] | ||||
| #                 else: | ||||
| #                     keep = m | ||||
| #                 new_matches.append(keep) | ||||
| #         return new_matches | ||||
|      | ||||
| #     @property | ||||
| #     def is_by_human(self): | ||||
| #         return self.user.user_id != slack_config["bot_id"] | ||||
|  | ||||
|      | ||||
| #     @property | ||||
| #     def has_single_url(self): | ||||
| #         return len(self.urls) == 1 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
def set_db(download_db_object):
    """Point the ``download_db`` proxy at ``download_db_object`` and create the model tables."""
    download_db.initialize(download_db_object)
    tables = [ArticleDownload, ArticleAuthor, ArticleRelated]
    with download_db:  # create tables (does nothing if they exist already)
        download_db.create_tables(tables)
|  | ||||
|  | ||||
		Reference in New Issue
	
	Block a user