# coss_archiving/news_fetch/utils_storage/models.py

import logging
logger = logging.getLogger(__name__)

from peewee import *
import os
import markdown
import configuration
import datetime

from . import helpers
# "DOWNLOADS" section of the project configuration; this module reads the
# 'local_storage_path' and 'remote_storage_path' keys from it.
config = configuration.main_config["DOWNLOADS"]
# "SLACK" section of the project configuration (not referenced in the visible part of this module).
slack_config = configuration.main_config["SLACK"]
# Size limit in bytes that ArticleDownload.file_status compares files against.
FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB


# set the nature of the db at runtime
# (placeholder only: set_db() binds it to a concrete database object)
download_db = DatabaseProxy()


class DownloadBaseModel(Model):
    """Base class of the models in this module; ties them to ``download_db``."""
    class Meta:
        # download_db is a proxy, so the concrete database is only chosen
        # once set_db() has been called.
        database = download_db


## == Article related models == ##
class ArticleDownload(DownloadBaseModel):
    """One article handled by the archiver, together with its download metadata.

    Only ``article_url`` is known when a row is created; the other columns
    start out with their defaults and are filled in afterwards. Authors and
    related files are stored in ``ArticleAuthor`` / ``ArticleRelated`` and are
    reachable through the ``authors`` and ``related`` backrefs.
    """

    # in the beginning this is all we have
    article_url = TextField(default = '', unique=True)

    # fetch then fills in the metadata
    title = TextField(default='')

    @property
    def is_title_bad(self) -> bool:  # add incrementally
        """Return True if the title contains one of the known bad markers."""
        bad_markers = ("PUR-Abo", "Redirecting", "Error while running fetch")
        return any(marker in self.title for marker in bad_markers)

    summary = TextField(default = '')
    source_name = CharField(default = '')
    language = CharField(default = '')


    file_name = TextField(default = '')

    @property
    def save_path(self) -> str:
        """Local directory (with trailing slash) for this article: ``<local>/<year>/<month name>/``."""
        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"

    def _nas_path(self, file_name):
        """Return ``NAS: <remote>/<year>/<month name>/<file_name>``, or None without a download date."""
        if not self.download_date:
            return None
        month_dir = f"{self.download_date.year}/{self.download_date.strftime('%B')}"
        return f"NAS: {config['remote_storage_path']}/{month_dir}/{file_name}"

    @property
    def fname_nas(self):
        """NAS location of this article's own file (None if ``download_date`` is unset).

        This is a property and therefore cannot take arguments; use
        ``_nas_path(name)`` to build the location of any other file stored
        next to the article (e.g. a related file).
        """
        return self._nas_path(self.file_name)

    @property
    def fname_template(self) -> str:
        """Suggested file name ``<source> -- <title>``; ``.pdf`` is appended unless the source is YouTube."""
        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
            fname = f"{self.source_name} -- {self.title}"
        else:
            fname = f"{self.source_name} -- {self.title}.pdf"
        return helpers.clear_path_name(fname)


    archive_url = TextField(default = '')
    pub_date = DateField(default = datetime.date.fromtimestamp(0))
    download_date = DateField(default = datetime.date.today)

    slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by

    @property
    def slack_ts_full(self) -> str:
        """Return ``slack_ts`` as a string, right-padded with zeros to 6 decimals."""
        ts_text = str(self.slack_ts)
        # number of digits after the decimal point; usually there are 6 decimals
        decimals = len(ts_text) - ts_text.find(".") - 1
        return f"{ts_text}{(6 - decimals) * '0'}"

    sent = BooleanField(default = False)

    archived_by = CharField(default = os.getenv("UNAME"))
    # need to know who saved the message because the file needs to be on their computer in order to get verified
    # verification happens in a different app, but the model has the fields here as well
    comment = TextField(default = '')
    verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad

    # authors
    # keywords
    # ... are added through foreignkeys
    # we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db


    ## Helpers specific to a single article
    def __str__(self) -> str:
        """Short description: shortened title and source if both are set, else the url."""
        if self.title != '' and self.source_name != '':
            desc = f"{helpers.shorten_name(self.title)} -- {self.source_name}"
        else:
            desc = f"{self.article_url}"
        return f"ART [{desc}]"

    def mail_info(self):
        """Build the message describing this article.

        Returns:
            ``None`` when the article has no file name (``file_status == 1``),
            meaning no message should be sent. Otherwise a tuple
            ``(html, files)``: ``html`` is the markdown-rendered message text
            and ``files`` is the list of local file paths to attach (empty
            unless the file exists and may be sent directly).
        """
        status = self.file_status
        if status == 1: # file_name was empty
            return None # there has been an error do not send any message

        # displays the summary in a blockquote
        summary = "\n> " + "\n> ".join(self.summary.split("\n"))
        answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
        answer_files = []

        if status == 2: # no file found at specified location
            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
        elif status == 3: # file found but deemed too big
            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
            answer_text += f"*{self.title}*\n{summary}\n{location}"
        else: # everything nominal
            answer_text += f"*{self.title}*\n{summary}"
            answer_files.append(self.save_path + self.file_name)

        # then the related files
        if self.related:
            rel_text = "Related files on NAS:"
            for r in self.related:
                # fname_nas is a property and cannot be called with a name,
                # so the path of each related file is built by the helper
                rel_text += f"\n• `{self._nas_path(r.related_file_name)}` "

            answer_text += "\n\n" + rel_text

        return markdown.markdown(answer_text), answer_files


    def set_authors(self, authors):
        """Create one ``ArticleAuthor`` row per author name shorter than 100 characters."""
        for a in authors:
            if len(a) < 100: # otherwise it's a mismatched string
                ArticleAuthor.create(
                    article = self,
                    author = a
                    )

    def set_related(self, related):
        """Create one ``ArticleRelated`` row per related file name.

        Raises:
            Exception: as soon as a name longer than 255 characters is
                encountered; rows for the names before it have already been
                created at that point.
        """
        for r in related:
            if len(r) > 255:
                raise Exception("Related file name too long for POSTGRES")

            ArticleRelated.create(
                article = self,
                related_file_name = r
            )

    @property
    def file_status(self) -> int:
        """0 = file exists, 1 = no file name!, 2 = file does not exist, 3 = file exists but is too large (or is not a pdf)"""
        if not self.file_name:
            logger.error("Article %s has no filename!", self)
            return 1
        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error("Article %s has a filename, but the file does not exist at that location!", self)
            return 2
        # anything that is not a pdf is treated like an oversized file
        if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
            logger.warning("Article %s has a file that exceeds the file size limit.", self)
            return 3
        return 0


class ArticleAuthor(DownloadBaseModel):
    """One author name attached to an article."""
    # owning article; the rows are reachable from the article as `authors`
    article = ForeignKeyField(ArticleDownload, backref='authors')
    # author name (ArticleDownload.set_authors only stores names shorter than 100 characters)
    author = CharField()


class ArticleRelated(DownloadBaseModel):
    """One related file attached to an article."""
    # Related files, such as the full text of a paper, audio files, etc.
    # owning article; the rows are reachable from the article as `related`
    article = ForeignKeyField(ArticleDownload, backref='related')
    # file name only (ArticleDownload.set_related rejects names longer than 255 characters)
    related_file_name = TextField(default = '')


def set_db(download_db_object):
    """Bind the module-level ``download_db`` proxy to ``download_db_object`` and create the tables.

    Args:
        download_db_object: the concrete database object the proxy should
            forward to.
    """
    download_db.initialize(download_db_object)
    model_classes = [ArticleDownload, ArticleAuthor, ArticleRelated]
    # table creation does nothing for tables that exist already
    with download_db:
        download_db.create_tables(model_classes)