reduced slack functionality, higher ease of use. Database migration wip
This commit is contained in:
10
news_fetch/utils_storage/helpers.py
Normal file
10
news_fetch/utils_storage/helpers.py
Normal file
@@ -0,0 +1,10 @@
|
||||
def clear_path_name(path):
    """Turn an arbitrary string into a name that is safe to use in a file path.

    Alphanumeric characters as well as space, '.', '_' and '-' are kept;
    every other character is replaced by '_'. Trailing whitespace is removed
    from the result.
    """
    allowed = {' ', '.', '_', '-'}
    cleaned = []
    for char in path:
        if char.isalnum() or char in allowed:
            cleaned.append(char)
        else:
            cleaned.append("_")
    return "".join(cleaned).rstrip()
|
||||
|
||||
def shorten_name(name, offset = 50):
    """Cut ``name`` down to its first ``offset`` characters for display.

    Names longer than ``offset`` are truncated and get "..." appended;
    shorter names are returned unchanged.
    """
    if len(name) <= offset:
        return name
    return name[:offset] + "..."
|
67
news_fetch/utils_storage/migrations/migration.001.py
Normal file
67
news_fetch/utils_storage/migrations/migration.001.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from playhouse.migrate import *
|
||||
|
||||
|
||||
"""
|
||||
This migration assumes that downloads.db keeps exactly the same structure as before.
In messages.db, the table articlemodelreference is dropped in favor of a new field article_id in the table thread.
Since each thread is constrained to exactly one article, this makes the most sense.

After the schema change, the table thread has the following columns:
id | thread_ts | article_id

The existing data is then migrated from the table articlemodelreference, which is deleted afterwards.
|
||||
"""
|
||||
|
||||
|
||||
# database that is migrated in place, and the migrator operating on it
db = SqliteDatabase("/code/.dev/messages.db")
migrator = SqliteMigrator(db)

# nullable: the rows already present in the table have no value for the new column yet
article_field = IntegerField(null=True)

# schema change: add the column article_id to the table thread
migrate(migrator.add_column('thread', 'article_id', article_field))
# migrator.drop_column('some_table', 'old_column'),
|
||||
|
||||
|
||||
|
||||
# these are the old models, adapted to the migration
|
||||
|
||||
class BaseModel(Model):
    """Base class that binds every model of this migration to ``db``."""

    class Meta:
        # all subclasses read from / write to the database opened above
        database = db
|
||||
|
||||
class User(BaseModel):
    """Author of messages, identified by a unique ``user_id``."""

    user_id = CharField(unique=True, default='')
|
||||
|
||||
class Thread(BaseModel):
    """The threads that concern us are only created for messages that contain urls."""

    thread_ts = FloatField(default=0)
    # column added by this migration; filled from ArticleModelReference further down
    article_id = IntegerField(null=True)
|
||||
|
||||
|
||||
class Message(BaseModel):
    """A single message; it belongs to one ``User`` and one ``Thread``."""

    ts = FloatField(unique=True)  # for sorting
    channel_id = CharField(default='')
    user = ForeignKeyField(User, backref="messages")
    text = TextField(default='')
    thread = ForeignKeyField(Thread, default=None, backref="messages")
    file_type = CharField(default='')
    perma_link = CharField(default='')
    is_processed_override = BooleanField(default=False)
|
||||
|
||||
|
||||
class ArticleModelReference(BaseModel):
    """Old link table: connects a ``Message`` to the id of an article model.

    Its content is copied onto ``Thread.article_id`` by this migration.
    """

    message = ForeignKeyField(Message, backref='article_model_references')
    article_model_id = IntegerField(default=0)
|
||||
|
||||
|
||||
|
||||
|
||||
# Data migration: copy every article reference onto the thread of the message
# it was attached to.
for ref in ArticleModelReference.select():
    # bind the thread once so that the assignment and the save
    # visibly operate on the same object
    thread = ref.message.thread
    thread.article_id = ref.article_model_id
    thread.save()

# The old table is no longer needed. drop_tables() expects an iterable of
# model classes; "(ArticleModelReference)" is not a tuple but the class itself,
# so pass an explicit list.
db.drop_tables([ArticleModelReference])
|
297
news_fetch/utils_storage/models.py
Normal file
297
news_fetch/utils_storage/models.py
Normal file
@@ -0,0 +1,297 @@
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from peewee import *
|
||||
import os
|
||||
import markdown
|
||||
import re
|
||||
import configuration
|
||||
import datetime
|
||||
|
||||
from . import helpers
|
||||
config = configuration.main_config["DOWNLOADS"]
|
||||
slack_config = configuration.main_config["SLACK"]
|
||||
|
||||
# set the nature of the db at runtime
|
||||
download_db = DatabaseProxy()
|
||||
|
||||
|
||||
class DownloadBaseModel(Model):
    """Base class of all models stored through ``download_db``."""

    class Meta:
        # download_db is a proxy; the actual database is supplied at runtime
        database = download_db
|
||||
|
||||
|
||||
|
||||
## == Article related models == ##
|
||||
class ArticleDownload(DownloadBaseModel):
    """Database record of one article and of the file(s) saved for it.

    Besides the stored columns, the class offers derived names/paths and the
    reply payloads (``slack_info``, ``mail_info``) built from the record.
    Authors and related files are stored in ``ArticleAuthor`` and
    ``ArticleRelated`` and are reachable through the backrefs ``authors`` and
    ``related``.
    """

    # in the beginning this is all we have
    article_url = TextField(default='', unique=True)

    # fetch then fills in the metadata
    title = CharField(default='')
    summary = TextField(default='')
    source_name = CharField(default='')
    language = CharField(default='')

    file_name = TextField(default='')

    archive_url = TextField(default='')
    pub_date = DateField(default='')
    download_date = DateField(default=datetime.date.today)

    slack_ts = FloatField(default=0)  # should be a fixed-length string but float is easier to sort by

    sent = BooleanField(default=False)

    # need to know who saved the message because the file needs to be on their computer in order to get verified
    archived_by = CharField(default=os.getenv("UNAME"))
    # verification happens in a different app, but the model has the fields here as well
    comment = TextField(default='')
    verified = IntegerField(default=0)  # 0 = not verified, 1 = verified, -1 = marked as bad

    # authors
    # keywords
    # ... are added through foreignkeys
    # we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db

    ## Derived names and paths
    @property
    def is_title_bad(self):  # add incrementally
        """True if the title contains one of the known markers of a failed fetch."""
        bad_markers = ("PUR-Abo", "Redirecting", "Error while running fetch")
        return any(marker in self.title for marker in bad_markers)

    @property
    def save_path(self):
        """Local directory (with trailing slash) for the files of this article.

        Built from the configured local storage path and the year / month name
        of ``download_date``.
        """
        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"

    def _nas_location(self, file_name=""):
        """Return the "NAS: ..." location string of a file of this article.

        Args:
            file_name: name of the file; when empty, ``self.file_name`` is used.

        Returns:
            The location string, or None if the article has no download date.
        """
        if not self.download_date:
            return None
        name = file_name or self.file_name
        return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{name}"

    @property
    def fname_nas(self):
        """NAS location string of the main file (None without a download date).

        This is a property and therefore cannot take a file name; locations of
        other files are built with ``_nas_location(file_name)``.
        """
        return self._nas_location()

    @property
    def fname_template(self):
        """Sanitized file name built from source name and title.

        ".pdf" is appended unless the source is youtube.
        """
        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
            fname = f"{self.source_name} -- {self.title}"
        else:
            fname = f"{self.source_name} -- {self.title}.pdf"
        return helpers.clear_path_name(fname)

    @property
    def slack_ts_full(self):
        """``slack_ts`` as a string, right-padded with zeros to 6 decimals."""
        str_ts = str(self.slack_ts)
        # number of digits after the decimal point; usually there are 6 decimals
        decimals = len(str_ts) - str_ts.find(".") - 1
        return f"{str_ts}{(6 - decimals) * '0'}"

    ## Helpers specific to a single article
    def __str__(self) -> str:
        if self.title != '' and self.source_name != '':
            desc = f"{helpers.shorten_name(self.title)} -- {self.source_name}"
        else:
            desc = f"{self.article_url}"
        return f"ART [{desc}]"

    @property
    def slack_info(self):
        """Build the replies describing this article.

        Returns:
            A list of dicts with the keys "reply_text" and "file_path". The
            first entry is the main reply; every related pdf adds one entry
            carrying only a file path. If the main file is missing, the list
            holds only the error reply from ``file_status``.
        """
        # self.verified is -1, 0 or 1; shift it by one to index the list
        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
        # prefix every line of the summary with ">"
        content = "\n>" + "\n>".join(self.summary.split("\n"))
        file_status, msg = self.file_status()
        if not file_status:
            return [msg]

        # everything alright: generate real content
        # first the base file
        if self.file_name[-4:] == ".pdf":
            answer = [{  # main reply with the base pdf
                "reply_text": f"*{self.title}*\n{status}\n{content}",
                "file_path": self.save_path + self.file_name
            }]
        else:  # don't upload if the file is too big!
            location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
            answer = [{  # main reply without a file, pointing to the NAS instead
                "reply_text": f"*{self.title}*\n{status}\n{content}\n{location}",
                "file_path": None
            }]

        # then the related files
        rel_text = ""
        for r in self.related:
            fname = r.related_file_name
            # fname_nas is a property and cannot be called with a file name,
            # so the location of a related file comes from the helper
            lentry = "\n• `{}` ".format(self._nas_location(fname))
            if fname[-4:] == ".pdf":  # this is a manageable file, directly upload
                f_ret = self.save_path + fname
                answer.append({"reply_text": "", "file_path": f_ret})
            else:  # not pdf <=> too large. Don't upload but mention its existence
                lentry += "(not uploaded to slack, but the file will be on the NAS)"

            rel_text += lentry

        if rel_text:
            answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text

        return answer

    @property
    def mail_info(self):
        """Same replies as ``slack_info``, preceded by a link to the article.

        The reply texts are converted with ``markdown.markdown``.
        """
        base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path": None}] + self.slack_info
        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]

    def set_authors(self, authors):
        """Create one ``ArticleAuthor`` row per entry of ``authors``."""
        for a in authors:
            ArticleAuthor.create(
                article=self,
                author=a
            )

    def set_related(self, related):
        """Create one ``ArticleRelated`` row per file name in ``related``."""
        for r in related:
            ArticleRelated.create(
                article=self,
                related_file_name=r
            )

    def file_status(self):
        """Check that the main file of this article exists on disk.

        Returns:
            A tuple ``(ok, reply)``: ``(True, {})`` if the file exists,
            otherwise ``(False, reply)`` where ``reply`` is an error reply
            with the keys "reply_text" and "file_path".
        """
        if not self.file_name:
            logger.error(f"Article {self} has no filename!")
            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}

        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}

        return True, {}
|
||||
|
||||
|
||||
class ArticleAuthor(DownloadBaseModel):
    """One author of an article; listed on the article via the backref ``authors``."""

    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()
|
||||
|
||||
|
||||
class ArticleRelated(DownloadBaseModel):
    """Related files, such as the full text of a paper, audio files, etc.

    Listed on the article via the backref ``related``.
    """

    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default='')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# class Thread(ChatBaseModel):
|
||||
# """The threads that concern us are only created if the base message contains a url"""
|
||||
# thread_ts = FloatField(default = 0)
|
||||
# article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
|
||||
# # provides, ts, user, models
|
||||
# # messages
|
||||
|
||||
# @property
|
||||
# def slack_ts(self):
|
||||
# str_ts = str(self.thread_ts)
|
||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem!
|
||||
# return "{}{}".format(str_ts, cut_zeros*"0")
|
||||
|
||||
# @property
|
||||
# def initiator_message(self):
|
||||
# try:
|
||||
# return self.messages[0] # TODO check if this needs sorting
|
||||
# except IndexError:
|
||||
# logger.warning(f"Thread {self} is empty. How can that be?")
|
||||
# return None
|
||||
|
||||
# @property
|
||||
# def message_count(self):
|
||||
# # logger.warning("message_count was called")
|
||||
# return self.messages.count()
|
||||
|
||||
# @property
|
||||
# def last_message(self):
|
||||
# messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
|
||||
# return messages[-1]
|
||||
|
||||
# @property
|
||||
# def is_fully_processed(self) -> bool:
|
||||
# init_message = self.initiator_message
|
||||
# if init_message is None:
|
||||
# return False
|
||||
|
||||
# if init_message.is_processed_override:
|
||||
# return True
|
||||
# # this override is set for instance, when no url was sent at all. Then set this thread to be ignored
|
||||
|
||||
# reactions = init_message.reaction
|
||||
# if not reactions:
|
||||
# return False
|
||||
# else:
|
||||
# r = reactions[0].type # can and should only have one reaction
|
||||
# return r == "white_check_mark" \
|
||||
# or r == "x"
|
||||
|
||||
|
||||
|
||||
# class Message(ChatBaseModel):
|
||||
# ts = FloatField(unique=True) #for sorting
|
||||
# channel_id = CharField(default='')
|
||||
# user = ForeignKeyField(User, backref="messages")
|
||||
# text = TextField(default='')
|
||||
# thread = ForeignKeyField(Thread, backref="messages", default=None)
|
||||
# file_type = CharField(default='')
|
||||
# perma_link = CharField(default='')
|
||||
# is_processed_override = BooleanField(default=False)
|
||||
# # reaction
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return "MSG [{}]".format(shorten_name(self.text).replace('\n','/'))
|
||||
|
||||
# @property
|
||||
# def slack_ts(self):
|
||||
# str_ts = str(self.ts)
|
||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem!
|
||||
# return "{}{}".format(str_ts, cut_zeros * "0")
|
||||
|
||||
|
||||
# @property
|
||||
# def urls(self):
|
||||
# pattern = r"<(.*?)>"
|
||||
# matches = re.findall(pattern, self.text)
|
||||
# matches = [m for m in matches if "." in m]
|
||||
|
||||
# new_matches = []
|
||||
# for m in matches:
|
||||
# if "." in m: # must contain a tld, right?
|
||||
# # further complication: slack automatically abbreviates urls in the format:
|
||||
# # <url|link preview>. Lucky for us, "|" is a character that is discouraged in urls, meaning we can "safely" split on it and keep the first half
|
||||
# if "|" in m:
|
||||
# keep = m.split("|")[0]
|
||||
# else:
|
||||
# keep = m
|
||||
# new_matches.append(keep)
|
||||
# return new_matches
|
||||
|
||||
# @property
|
||||
# def is_by_human(self):
|
||||
# return self.user.user_id != slack_config["bot_id"]
|
||||
|
||||
|
||||
# @property
|
||||
# def has_single_url(self):
|
||||
# return len(self.urls) == 1
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def set_db(download_db_object):
    """Bind the ``download_db`` proxy to ``download_db_object`` and create the article tables."""
    download_db.initialize(download_db_object)
    article_models = [ArticleDownload, ArticleAuthor, ArticleRelated]
    # create tables (does nothing if they exist already)
    with download_db:
        download_db.create_tables(article_models)
|
||||
|
||||
|
Reference in New Issue
Block a user