A few bugs remain in news_fetch; news_check is WIP

This commit is contained in:
2022-09-06 22:15:26 +02:00
parent 2e65828bbb
commit 713406dc67
15 changed files with 537 additions and 267 deletions

View File

@@ -8,3 +8,4 @@ newspaper3k
htmldate
markdown
rich
psycopg2

View File

@@ -123,7 +123,6 @@ class Coordinator(Thread):
unsent = models.ArticleDownload.filter(sent = False)
# .objects.filter(sent = False)
for a in unsent:
print(a)
self.incoming_request(article=a)
@@ -170,7 +169,7 @@ class Coordinator(Thread):
for article in articles:
notifier = lambda article: logger.info(f"Completed manual actions for {article}")
ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
def article_complete_notifier(self, article):
if self.worker_slack is None:
@@ -192,7 +191,7 @@ if __name__ == "__main__":
if "upload" in sys.argv:
    # Select every article that still needs an archive upload: either no
    # archive URL is stored, or it holds the "TODO:UPLOAD" marker
    # (presumably a placeholder set by an earlier step -- confirm).
    # peewee builds the WHERE clause from overloaded operators, so the two
    # comparisons must be combined with `|`, each side parenthesised.
    # Python's `or` cannot be overloaded: it would evaluate to the first
    # comparison only and silently drop the "TODO:UPLOAD" condition.
    archive_url = models.ArticleDownload.archive_url
    articles = (
        models.ArticleDownload.select()
        .where((archive_url == "") | (archive_url == "TODO:UPLOAD"))
        .execute()
    )
    logger.info(f"Launching upload to archive for {len(articles)} articles.")
    coordinator.manual_processing(articles, [UploadWorker()])

View File

@@ -4,7 +4,6 @@ logger = logging.getLogger(__name__)
from peewee import *
import os
import markdown
import re
import configuration
import datetime
@@ -28,7 +27,7 @@ class ArticleDownload(DownloadBaseModel):
article_url = TextField(default = '', unique=True)
# fetch then fills in the metadata
title = CharField(default='')
title = TextField(default='')
@property
def is_title_bad(self): # add incrementally
return "PUR-Abo" in self.title \
@@ -63,7 +62,7 @@ class ArticleDownload(DownloadBaseModel):
archive_url = TextField(default = '')
pub_date = DateField(default = '')
pub_date = DateField(default = datetime.date.fromtimestamp(0))
download_date = DateField(default = datetime.date.today)
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
@@ -143,13 +142,17 @@ class ArticleDownload(DownloadBaseModel):
def set_authors(self, authors):
    """Create one ArticleAuthor row per plausible author of this article.

    Args:
        authors: iterable of author name strings.

    Entries of 100 characters or more are skipped; they are treated as
    mismatched strings rather than real author names. Each remaining entry
    is stored exactly once.
    """
    for a in authors:
        if len(a) >= 100:  # otherwise it's a mismatched string
            continue
        ArticleAuthor.create(
            article = self,
            author = a
        )
def set_related(self, related):
for r in related:
if len(r) > 255:
raise Exception("Related file name too long for POSTGRES")
ArticleRelated.create(
article = self,
related_file_name = r
@@ -182,116 +185,7 @@ class ArticleRelated(DownloadBaseModel):
# class Thread(ChatBaseModel):
# """The threads that concern us are only created if the base massage contains a url"""
# thread_ts = FloatField(default = 0)
# article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
# # provides, ts, user, models
# # messages
# @property
# def slack_ts(self):
# str_ts = str(self.thread_ts)
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, that's a problem!
# return "{}{}".format(str_ts, cut_zeros*"0")
# @property
# def initiator_message(self):
# try:
# return self.messages[0] # TODO check if this needs sorting
# except IndexError:
# logger.warning(f"Thread {self} is empty. How can that be?")
# return None
# @property
# def message_count(self):
# # logger.warning("message_count was called")
# return self.messages.count()
# @property
# def last_message(self):
# messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
# return messages[-1]
# @property
# def is_fully_processed(self) -> bool:
# init_message = self.initiator_message
# if init_message is None:
# return False
# if init_message.is_processed_override:
# return True
# # this override is set for instance, when no url was sent at all. Then set this thread to be ignored
# reactions = init_message.reaction
# if not reactions:
# return False
# else:
# r = reactions[0].type # can and should only have one reaction
# return r == "white_check_mark" \
# or r == "x"
# class Message(ChatBaseModel):
# ts = FloatField(unique=True) #for sorting
# channel_id = CharField(default='')
# user = ForeignKeyField(User, backref="messages")
# text = TextField(default='')
# thread = ForeignKeyField(Thread, backref="messages", default=None)
# file_type = CharField(default='')
# perma_link = CharField(default='')
# is_processed_override = BooleanField(default=False)
# # reaction
# def __str__(self) -> str:
# return "MSG [{}]".format(shorten_name(self.text).replace('\n','/'))
# @property
# def slack_ts(self):
# str_ts = str(self.ts)
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, that's a problem!
# return "{}{}".format(str_ts, cut_zeros * "0")
# @property
# def urls(self):
# pattern = r"<(.*?)>"
# matches = re.findall(pattern, self.text)
# matches = [m for m in matches if "." in m]
# new_matches = []
# for m in matches:
# if "." in m: # must contain a tld, right?
# # further complication: slack automatically abbreviates urls in the format:
# # <url|link preview>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
# if "|" in m:
# keep = m.split("|")[0]
# else:
# keep = m
# new_matches.append(keep)
# return new_matches
# @property
# def is_by_human(self):
# return self.user.user_id != slack_config["bot_id"]
# @property
# def has_single_url(self):
# return len(self.urls) == 1
def set_db(download_db_object):
    """Point ``download_db`` at the given database object and ensure tables exist.

    Args:
        download_db_object: the database object passed on to
            ``download_db.initialize``.
    """
    download_db.initialize(download_db_object)
    tables = [ArticleDownload, ArticleAuthor, ArticleRelated]
    # create tables (does nothing if they exist already)
    with download_db:
        download_db.create_tables(tables)