few bugs in news_fetch left, news_check wip

2022-09-06 22:15:26 +02:00
parent 2e65828bbb
commit 713406dc67
15 changed files with 537 additions and 267 deletions
--- a/news_fetch/requirements.txt
+++ b/news_fetch/requirements.txt
@@ -8,3 +8,4 @@ newspaper3k
 htmldate
 markdown
 rich
+psycopg2
--- a/news_fetch/runner.py
+++ b/news_fetch/runner.py
@@ -123,7 +123,6 @@ class Coordinator(Thread):
        unsent = models.ArticleDownload.filter(sent = False)
        # .objects.filter(sent = False)
        for a in unsent:
-            print(a)
            self.incoming_request(article=a)


@@ -170,7 +169,7 @@ class Coordinator(Thread):

        for article in articles:
            notifier = lambda article: logger.info(f"Completed manual actions for {article}")
-            ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg 
+            ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg 

    def article_complete_notifier(self, article):
        if self.worker_slack is None:
@@ -192,7 +191,7 @@ if __name__ == "__main__":


    if "upload" in sys.argv:
-        articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
+        articles = models.ArticleDownload.select().where((models.ArticleDownload.archive_url == "") | (models.ArticleDownload.archive_url == "TODO:UPLOAD")).execute() # peewee builds SQL OR from `|` with parenthesized comparisons; Python `or` would keep only the first condition
        logger.info(f"Launching upload to archive for {len(articles)} articles.")
        coordinator.manual_processing(articles, [UploadWorker()])

--- a/news_fetch/utils_storage/models.py
+++ b/news_fetch/utils_storage/models.py
@@ -4,7 +4,6 @@ logger = logging.getLogger(__name__)
 from peewee import *
 import os
 import markdown
-import re
 import configuration
 import datetime

@@ -28,7 +27,7 @@ class ArticleDownload(DownloadBaseModel):
    article_url = TextField(default = '', unique=True)
    
    # fetch then fills in the metadata
-    title = CharField(default='')
+    title = TextField(default='')
    @property
    def is_title_bad(self):  # add incrementally
        return "PUR-Abo" in self.title \
@@ -63,7 +62,7 @@ class ArticleDownload(DownloadBaseModel):

    
    archive_url = TextField(default = '')
-    pub_date = DateField(default = '')
+    pub_date = DateField(default = datetime.date.fromtimestamp(0))
    download_date = DateField(default = datetime.date.today)

    slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
@@ -143,13 +142,17 @@ class ArticleDownload(DownloadBaseModel):

    def set_authors(self, authors):
        for a in authors:
-            ArticleAuthor.create(
-                article = self,
-                author = a
-                )
+            if len(a) < 100: # otherwise it's a mismatched string
+                ArticleAuthor.create(
+                    article = self,
+                    author = a
+                    )

    def set_related(self, related):
        for r in related:
+            if len(r) > 255:
+                raise Exception("Related file name too long for POSTGRES")
+
            ArticleRelated.create(
                article = self,
                related_file_name = r
@@ -182,116 +185,7 @@ class ArticleRelated(DownloadBaseModel):



-# class Thread(ChatBaseModel):
-#     """The threads that concern us are only created if the base massage contains a url"""
-#     thread_ts = FloatField(default = 0)
-#     article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
-#     # provides, ts, user, models
-#     # messages
-
-#     @property
-#     def slack_ts(self):
-#         str_ts = str(self.thread_ts)
-#         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem!
-#         return "{}{}".format(str_ts, cut_zeros*"0")
-
-#     @property
-#     def initiator_message(self):
-#         try:
-#             return self.messages[0] # TODO check if this needs sorting
-#         except IndexError:
-#             logger.warning(f"Thread {self} is empty. How can that be?")
-#             return None
-
-#     @property
-#     def message_count(self):
-#         # logger.warning("message_count was called")
-#         return self.messages.count()
-
-#     @property
-#     def last_message(self):
-#         messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
-#         return messages[-1]
-
-#     @property
-#     def is_fully_processed(self) -> bool:
-#         init_message = self.initiator_message
-#         if init_message is None:
-#             return False
-        
-#         if init_message.is_processed_override:
-#             return True
-#         # this override is set for instance, when no url was sent at all. Then set this thread to be ignored
-        
-#         reactions = init_message.reaction
-#         if not reactions:
-#             return False
-#         else:
-#             r = reactions[0].type # can and should only have one reaction
-#             return r == "white_check_mark" \
-#                 or r == "x"
-
-
-    
-# class Message(ChatBaseModel):
-#     ts = FloatField(unique=True) #for sorting
-#     channel_id = CharField(default='')
-#     user = ForeignKeyField(User, backref="messages")
-#     text = TextField(default='')
-#     thread = ForeignKeyField(Thread, backref="messages", default=None)
-#     file_type = CharField(default='')
-#     perma_link = CharField(default='')
-#     is_processed_override = BooleanField(default=False)
-#     # reaction
-
-#     def __str__(self) -> str:
-#         return "MSG [{}]".format(shorten_name(self.text).replace('\n','/'))
-
-#     @property
-#     def slack_ts(self):
-#         str_ts = str(self.ts)
-#         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem!
-#         return "{}{}".format(str_ts, cut_zeros * "0")
-
-
-#     @property
-#     def urls(self):
-#         pattern = r"<(.*?)>"
-#         matches = re.findall(pattern, self.text)
-#         matches = [m for m in matches if "." in m]
-        
-#         new_matches = []
-#         for m in matches:
-#             if "." in m:  # must contain a tld, right?
-#                 # further complication: slack automatically abreviates urls in the format: 
-#                 # <url|link preview>. Lucky for us, "|" is a character derecommended in urls, meaning we can "safely" split for it and retain the first half
-#                 if "|" in m:
-#                     keep = m.split("|")[0]
-#                 else:
-#                     keep = m
-#                 new_matches.append(keep)
-#         return new_matches
-    
-#     @property
-#     def is_by_human(self):
-#         return self.user.user_id != slack_config["bot_id"]
-
-    
-#     @property
-#     def has_single_url(self):
-#         return len(self.urls) == 1
-
-
-
-
-
-
-
-
-
 def set_db(download_db_object):
    download_db.initialize(download_db_object)
    with download_db: # create tables (does nothing if they exist already)
        download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
-
-