new component - upload to NAS
This commit is contained in:
		
							
								
								
									
										67
									
								
								news_fetch/app/utils_storage/migrations/migration.001.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								news_fetch/app/utils_storage/migrations/migration.001.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,67 @@ | ||||
| from playhouse.migrate import * | ||||
|  | ||||
|  | ||||
| """ | ||||
| This migration assumes that downloads.db keeps exactly the same structure as before. | ||||
| In messages.db, the table articlemodelreference is dropped in favor of a new field article (column article_id) in the thread table. | ||||
| Since each thread is constrained to exactly one article, this is the most sensible layout. | ||||
|  | ||||
| After the schema change, the table thread in messages.db has the columns: | ||||
| id  |   thread_ts | article_id | ||||
|  | ||||
| The rows of articlemodelreference are first copied into thread.article_id; afterwards the old table is deleted. | ||||
| """ | ||||
|  | ||||
|  | ||||
| # Open the chat database and create the migrator that performs the schema change. | ||||
| db = SqliteDatabase("/code/.dev/messages.db") | ||||
| migrator = SqliteMigrator(db) | ||||
|  | ||||
|  | ||||
| # Nullable, so the thread rows that already exist stay valid before they are back-filled further down. | ||||
| article_field = IntegerField(null=True) | ||||
|  | ||||
|  | ||||
| # Schema step: add the column thread.article_id. | ||||
| migrate( | ||||
|     migrator.add_column('thread', 'article_id', article_field), | ||||
| ) | ||||
|  | ||||
|  | ||||
|  | ||||
| # these are the old models, adapted to the migration | ||||
|  | ||||
| class BaseModel(Model): | ||||
|     """Base class that binds every migration model to the messages database `db`.""" | ||||
|     class Meta: | ||||
|         database = db | ||||
|  | ||||
| class User(BaseModel): | ||||
|     """Migration copy of the user table; only needed as the target of Message.user.""" | ||||
|     user_id = CharField(default='', unique=True)    | ||||
|  | ||||
| class Thread(BaseModel): | ||||
|     """The threads that concern us are only created for messages that contain urls""" | ||||
|     thread_ts = FloatField(default = 0) | ||||
|     # the column added by this migration; filled from ArticleModelReference.article_model_id | ||||
|     article_id = IntegerField(null=True) | ||||
|  | ||||
|      | ||||
| class Message(BaseModel): | ||||
|     """Migration copy of the message table; links each article reference to its thread.""" | ||||
|     ts = FloatField(unique=True) #for sorting | ||||
|     channel_id = CharField(default='') | ||||
|     user = ForeignKeyField(User, backref="messages") | ||||
|     text = TextField(default='') | ||||
|     thread = ForeignKeyField(Thread, backref="messages", default=None) | ||||
|     file_type = CharField(default='') | ||||
|     perma_link = CharField(default='') | ||||
|     is_processed_override = BooleanField(default=False) | ||||
|  | ||||
|  | ||||
| class ArticleModelReference(BaseModel): | ||||
|     """Old link between a message and an article id; this table is dropped at the end of the migration.""" | ||||
|     message = ForeignKeyField(Message, backref='article_model_references') | ||||
|     article_model_id = IntegerField(default = 0) | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| for ref in ArticleModelReference.select(): | ||||
|     ref.message.thread.article_id = ref.article_model_id | ||||
|     ref.message.thread.save() | ||||
|  | ||||
| db.drop_tables((ArticleModelReference)) | ||||
							
								
								
									
										322
									
								
								news_fetch/app/utils_storage/models.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										322
									
								
								news_fetch/app/utils_storage/models.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,322 @@ | ||||
| import logging | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
| from peewee import * | ||||
| import os | ||||
| import markdown | ||||
| import re | ||||
| import configuration | ||||
| import datetime | ||||
|  | ||||
| # Sections of the parsed project configuration that the models below read. | ||||
| config = configuration.parsed["DOWNLOADS"] | ||||
| slack_config = configuration.parsed["SLACK"] | ||||
|  | ||||
| ## Helpers | ||||
| # Placeholders for the two databases; set_db() binds them to concrete database objects. | ||||
| chat_db = DatabaseProxy() | ||||
| download_db = DatabaseProxy() | ||||
|  | ||||
| # set the nature of the db at runtime | ||||
|  | ||||
| class DownloadBaseModel(Model): | ||||
|     """Base class of all models that are stored in the download database.""" | ||||
|     class Meta: | ||||
|         database = download_db | ||||
|  | ||||
| class ChatBaseModel(Model): | ||||
|     """Base class of all models that are stored in the chat database.""" | ||||
|     class Meta: | ||||
|         database = chat_db | ||||
|  | ||||
|  | ||||
|  | ||||
| ## == Article related models == ## | ||||
| class ArticleDownload(DownloadBaseModel): | ||||
|     title = CharField(default='') | ||||
|     pub_date = DateField(default = '') | ||||
|     download_date = DateField(default = datetime.date.today) | ||||
|     source_name = CharField(default = '') | ||||
|     article_url = TextField(default = '', unique=True) | ||||
|     archive_url = TextField(default = '') | ||||
|     file_name = TextField(default = '') | ||||
|     language = CharField(default = '') | ||||
|     summary = TextField(default = '') | ||||
|     comment = TextField(default = '') | ||||
|     verified = IntegerField(default = False) | ||||
|     # authors | ||||
|     # keywords | ||||
|     # ... are added through foreignkeys | ||||
|  | ||||
|     def __str__(self) -> str: | ||||
|         return f"ART [{self.title} -- {self.source_name}]" | ||||
|  | ||||
|     ## Useful Properties | ||||
|     @property | ||||
|     def save_path(self): | ||||
|         return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/" | ||||
|  | ||||
|     def fname_nas(self, file_name=""): | ||||
|         if self.download_date: | ||||
|             if file_name: | ||||
|                 return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), file_name) | ||||
|             else: # return the self. name | ||||
|                 return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), self.file_name) | ||||
|         else: | ||||
|             return None | ||||
|  | ||||
|     @property | ||||
|     def fname_template(self): | ||||
|         if "youtube.com" in self.source_name or "youtu.be" in self.source_name: | ||||
|             fname = "{} -- {}".format(self.source_name, self.title) | ||||
|         else: | ||||
|             fname = "{} -- {}.pdf".format(self.source_name, self.title) | ||||
|         return clear_path_name(fname) | ||||
|  | ||||
|     @property | ||||
|     def is_title_bad(self):  # add incrementally | ||||
|         return "PUR-Abo" in self.title \ | ||||
|             or "Redirecting" in self.title \ | ||||
|             or "Error while running fetch" in self.title | ||||
|              | ||||
|     @property | ||||
|     def slack_info(self): | ||||
|         status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1] | ||||
|         content = "\n>" + "\n>".join(self.summary.split("\n")) | ||||
|         file_status, msg = self.file_status() | ||||
|         if not file_status: | ||||
|             return [msg] | ||||
|          | ||||
|         # everything alright: generate real content | ||||
|         # first the base file | ||||
|         if self.file_name[-4:] == ".pdf": | ||||
|             answer = [{ # main reply with the base pdf | ||||
|                 "reply_text" : f"*{self.title}*\n{status}\n{content}", | ||||
|                 "file_path" : self.save_path + self.file_name  | ||||
|             }] | ||||
|         else: # don't upload if the file is too big! | ||||
|             location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas()) | ||||
|             answer = [{ # main reply with the base pdf | ||||
|                 "reply_text" : "*{}*\n{}\n{}\n{}".format(self.title, status, content, location), | ||||
|                 "file_path" : None  | ||||
|             }] | ||||
|  | ||||
|         # then the related files | ||||
|         rel_text = "" | ||||
|         for r in self.related: | ||||
|             fname = r.related_file_name | ||||
|             lentry = "\n• `{}` ".format(self.fname_nas(fname)) | ||||
|             if fname[-4:] == ".pdf": # this is a manageable file, directly upload | ||||
|                 f_ret = self.save_path + fname | ||||
|                 answer.append({"reply_text":"", "file_path" : f_ret}) | ||||
|             else: # not pdf <=> too large. Don't upload but mention its existence | ||||
|                 lentry += "(not uploaded to slack, but the file will be on the NAS)" | ||||
|                  | ||||
|             rel_text += lentry | ||||
|  | ||||
|         if rel_text: | ||||
|             rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text | ||||
|          | ||||
|         return answer | ||||
|  | ||||
|     @property | ||||
|     def mail_info(self): | ||||
|         base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path":None}] + self.slack_info | ||||
|         return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base] | ||||
|  | ||||
|  | ||||
|     ## Helpers | ||||
|     def set_keywords(self, keywords): | ||||
|         for k in keywords: | ||||
|             ArticleKeyword.create( | ||||
|                 article = self, | ||||
|                 keyword = k | ||||
|                 ) | ||||
|  | ||||
|     def set_authors(self, authors): | ||||
|         for a in authors: | ||||
|             ArticleAuthor.create( | ||||
|                 article = self, | ||||
|                 author = a | ||||
|                 ) | ||||
|  | ||||
|     def set_references(self, references): | ||||
|         for r in references: | ||||
|             ArticleReference.create( | ||||
|                 article = self, | ||||
|                 reference_url = r | ||||
|             ) | ||||
|  | ||||
|     def set_related(self, related): | ||||
|         for r in related: | ||||
|             ArticleRelated.create( | ||||
|                 article = self, | ||||
|                 related_file_name = r | ||||
|             ) | ||||
|  | ||||
|     def file_status(self): | ||||
|         if not self.file_name: | ||||
|             logger.error("Article {} has no filename!".format(self)) | ||||
|             return False, {"reply_text": "Download failed, no file was saved.", "file_path": None} | ||||
|          | ||||
|         file_path_abs = self.save_path + self.file_name | ||||
|         if not os.path.exists(file_path_abs): | ||||
|             logger.error("Article {} has a filename, but the file does not exist at that location!".format(self)) | ||||
|             return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None} | ||||
|  | ||||
|         return True, {} | ||||
|  | ||||
|  | ||||
| class ArticleKeyword(DownloadBaseModel): | ||||
|     """A single keyword of an article, reachable as `ArticleDownload.keywords`.""" | ||||
|     # instance gets created for every one keyword -> flexible in size | ||||
|     article = ForeignKeyField(ArticleDownload, backref='keywords') | ||||
|     keyword = CharField() | ||||
|  | ||||
|  | ||||
| class ArticleAuthor(DownloadBaseModel): | ||||
|     """A single author of an article, reachable as `ArticleDownload.authors`.""" | ||||
|     article = ForeignKeyField(ArticleDownload, backref='authors') | ||||
|     author = CharField() | ||||
|  | ||||
|  | ||||
| class ArticleReference(DownloadBaseModel): | ||||
|     """A single reference url of an article, reachable as `ArticleDownload.references`.""" | ||||
|     article = ForeignKeyField(ArticleDownload, backref='references') | ||||
|     reference_url = TextField(default = '') | ||||
|  | ||||
|  | ||||
| class ArticleRelated(DownloadBaseModel): | ||||
|     """The file name of one file related to an article, reachable as `ArticleDownload.related`.""" | ||||
|     article = ForeignKeyField(ArticleDownload, backref='related') | ||||
|     related_file_name = TextField(default = '') | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| ## == Slack-thread related models == ## | ||||
| class User(ChatBaseModel): | ||||
|     """A chat user, identified by a unique `user_id`.""" | ||||
|     user_id = CharField(default='', unique=True) | ||||
|     # messages (backref from Message.user) | ||||
|  | ||||
|  | ||||
| class Thread(ChatBaseModel): | ||||
|     """The threads that concern us are only created if the base massage contains a url""" | ||||
|     thread_ts = FloatField(default = 0) | ||||
|     article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None) | ||||
|     # provides, ts, user, models | ||||
|     # messages | ||||
|  | ||||
|     @property | ||||
|     def slack_ts(self): | ||||
|         str_ts = str(self.thread_ts) | ||||
|         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem! | ||||
|         return "{}{}".format(str_ts, cut_zeros*"0") | ||||
|  | ||||
|     @property | ||||
|     def initiator_message(self): | ||||
|         try: | ||||
|             return self.messages[0] # TODO check if this needs sorting | ||||
|         except IndexError: | ||||
|             logger.warning(f"Thread {self} is empty. How can that be?") | ||||
|             return None | ||||
|  | ||||
|     @property | ||||
|     def message_count(self): | ||||
|         # logger.warning("message_count was called") | ||||
|         return self.messages.count() | ||||
|  | ||||
|     @property | ||||
|     def last_message(self): | ||||
|         messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation | ||||
|         return messages[-1] | ||||
|  | ||||
|     @property | ||||
|     def is_fully_processed(self) -> bool: | ||||
|         init_message = self.initiator_message | ||||
|         if init_message is None: | ||||
|             return False | ||||
|          | ||||
|         if init_message.is_processed_override: | ||||
|             return True | ||||
|         # this override is set for instance, when no url was sent at all. Then set this thread to be ignored | ||||
|          | ||||
|         reactions = init_message.reaction | ||||
|         if not reactions: | ||||
|             return False | ||||
|         else: | ||||
|             r = reactions[0].type # can and should only have one reaction | ||||
|             return r == "white_check_mark" \ | ||||
|                 or r == "x" | ||||
|  | ||||
|  | ||||
|      | ||||
| class Message(ChatBaseModel): | ||||
|     ts = FloatField(unique=True) #for sorting | ||||
|     channel_id = CharField(default='') | ||||
|     user = ForeignKeyField(User, backref="messages") | ||||
|     text = TextField(default='') | ||||
|     thread = ForeignKeyField(Thread, backref="messages", default=None) | ||||
|     file_type = CharField(default='') | ||||
|     perma_link = CharField(default='') | ||||
|     is_processed_override = BooleanField(default=False) | ||||
|     # reaction | ||||
|  | ||||
|     def __str__(self) -> str: | ||||
|         return "MSG [{}]".format(self.text[:min(len(self.text), 30)].replace('\n','/') + '...') | ||||
|  | ||||
|     @property | ||||
|     def slack_ts(self): | ||||
|         str_ts = str(self.ts) | ||||
|         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem! | ||||
|         return "{}{}".format(str_ts, cut_zeros * "0") | ||||
|  | ||||
|  | ||||
|     @property | ||||
|     def urls(self): | ||||
|         pattern = r"<(.*?)>" | ||||
|         matches = re.findall(pattern, self.text) | ||||
|         matches = [m for m in matches if "." in m] | ||||
|          | ||||
|         new_matches = [] | ||||
|         for m in matches: | ||||
|             if "." in m:  # must contain a tld, right? | ||||
|                 # further complication: slack automatically abreviates urls in the format:  | ||||
|                 # <url|link preview>. Lucky for us, "|" is a character derecommended in urls, meaning we can "safely" split for it and retain the first half | ||||
|                 if "|" in m: | ||||
|                     keep = m.split("|")[0] | ||||
|                 else: | ||||
|                     keep = m | ||||
|                 new_matches.append(keep) | ||||
|         return new_matches | ||||
|      | ||||
|     @property | ||||
|     def is_by_human(self): | ||||
|         return self.user.user_id != slack_config["bot_id"] | ||||
|  | ||||
|      | ||||
|     @property | ||||
|     def has_single_url(self): | ||||
|         return len(self.urls) == 1 | ||||
|  | ||||
|  | ||||
| class Reaction(ChatBaseModel): | ||||
|     """A reaction attached to a message, reachable as `Message.reaction`.""" | ||||
|     # Thread.is_fully_processed compares this value against "white_check_mark" and "x" | ||||
|     type = CharField(default = "") | ||||
|     message = ForeignKeyField(Message, backref="reaction") | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| def create_tables(): | ||||
|     with download_db: | ||||
|         download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated]) | ||||
|     with chat_db: | ||||
|         chat_db.create_tables([User, Message, Thread, Reaction]) | ||||
|  | ||||
|  | ||||
| def set_db(chat_db_object, download_db_object): | ||||
|     """Bind the two database proxies to the given database objects, then create all tables.""" | ||||
|     # the proxies have to be initialised first, create_tables() works on them | ||||
|     chat_db.initialize(chat_db_object) | ||||
|     download_db.initialize(download_db_object) | ||||
|     create_tables() | ||||
|  | ||||
| def clear_path_name(path): | ||||
|     keepcharacters = (' ','.','_', '-') | ||||
|     converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip() | ||||
|     return converted | ||||
|      | ||||
		Reference in New Issue
	
	Block a user
	 Remy Moll
					Remy Moll