few bugs in news_fetch left, news_check wip
This commit is contained in:
@@ -8,3 +8,4 @@ newspaper3k
|
||||
htmldate
|
||||
markdown
|
||||
rich
|
||||
psycopg2
|
@@ -123,7 +123,6 @@ class Coordinator(Thread):
|
||||
unsent = models.ArticleDownload.filter(sent = False)
|
||||
# .objects.filter(sent = False)
|
||||
for a in unsent:
|
||||
print(a)
|
||||
self.incoming_request(article=a)
|
||||
|
||||
|
||||
@@ -170,7 +169,7 @@ class Coordinator(Thread):
|
||||
|
||||
for article in articles:
|
||||
notifier = lambda article: logger.info(f"Completed manual actions for {article}")
|
||||
ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
|
||||
ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
|
||||
|
||||
def article_complete_notifier(self, article):
|
||||
if self.worker_slack is None:
|
||||
@@ -192,7 +191,7 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
if "upload" in sys.argv:
|
||||
articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
|
||||
articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "" or models.ArticleDownload.archive_url == "TODO:UPLOAD").execute()
|
||||
logger.info(f"Launching upload to archive for {len(articles)} articles.")
|
||||
coordinator.manual_processing(articles, [UploadWorker()])
|
||||
|
||||
|
@@ -4,7 +4,6 @@ logger = logging.getLogger(__name__)
|
||||
from peewee import *
|
||||
import os
|
||||
import markdown
|
||||
import re
|
||||
import configuration
|
||||
import datetime
|
||||
|
||||
@@ -28,7 +27,7 @@ class ArticleDownload(DownloadBaseModel):
|
||||
article_url = TextField(default = '', unique=True)
|
||||
|
||||
# fetch then fills in the metadata
|
||||
title = CharField(default='')
|
||||
title = TextField(default='')
|
||||
@property
|
||||
def is_title_bad(self): # add incrementally
|
||||
return "PUR-Abo" in self.title \
|
||||
@@ -63,7 +62,7 @@ class ArticleDownload(DownloadBaseModel):
|
||||
|
||||
|
||||
archive_url = TextField(default = '')
|
||||
pub_date = DateField(default = '')
|
||||
pub_date = DateField(default = datetime.date.fromtimestamp(0))
|
||||
download_date = DateField(default = datetime.date.today)
|
||||
|
||||
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
||||
@@ -143,13 +142,17 @@ class ArticleDownload(DownloadBaseModel):
|
||||
|
||||
def set_authors(self, authors):
|
||||
for a in authors:
|
||||
ArticleAuthor.create(
|
||||
article = self,
|
||||
author = a
|
||||
)
|
||||
if len(a) < 100: # otherwise it's a mismatched string
|
||||
ArticleAuthor.create(
|
||||
article = self,
|
||||
author = a
|
||||
)
|
||||
|
||||
def set_related(self, related):
|
||||
for r in related:
|
||||
if len(r) > 255:
|
||||
raise Exception("Related file name too long for POSTGRES")
|
||||
|
||||
ArticleRelated.create(
|
||||
article = self,
|
||||
related_file_name = r
|
||||
@@ -182,116 +185,7 @@ class ArticleRelated(DownloadBaseModel):
|
||||
|
||||
|
||||
|
||||
# class Thread(ChatBaseModel):
|
||||
# """The threads that concern us are only created if the base message contains a url"""
|
||||
# thread_ts = FloatField(default = 0)
|
||||
# article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
|
||||
# # provides, ts, user, models
|
||||
# # messages
|
||||
|
||||
# @property
|
||||
# def slack_ts(self):
|
||||
# str_ts = str(self.thread_ts)
|
||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem!
|
||||
# return "{}{}".format(str_ts, cut_zeros*"0")
|
||||
|
||||
# @property
|
||||
# def initiator_message(self):
|
||||
# try:
|
||||
# return self.messages[0] # TODO check if this needs sorting
|
||||
# except IndexError:
|
||||
# logger.warning(f"Thread {self} is empty. How can that be?")
|
||||
# return None
|
||||
|
||||
# @property
|
||||
# def message_count(self):
|
||||
# # logger.warning("message_count was called")
|
||||
# return self.messages.count()
|
||||
|
||||
# @property
|
||||
# def last_message(self):
|
||||
# messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
|
||||
# return messages[-1]
|
||||
|
||||
# @property
|
||||
# def is_fully_processed(self) -> bool:
|
||||
# init_message = self.initiator_message
|
||||
# if init_message is None:
|
||||
# return False
|
||||
|
||||
# if init_message.is_processed_override:
|
||||
# return True
|
||||
# # this override is set for instance, when no url was sent at all. Then set this thread to be ignored
|
||||
|
||||
# reactions = init_message.reaction
|
||||
# if not reactions:
|
||||
# return False
|
||||
# else:
|
||||
# r = reactions[0].type # can and should only have one reaction
|
||||
# return r == "white_check_mark" \
|
||||
# or r == "x"
|
||||
|
||||
|
||||
|
||||
# class Message(ChatBaseModel):
|
||||
# ts = FloatField(unique=True) #for sorting
|
||||
# channel_id = CharField(default='')
|
||||
# user = ForeignKeyField(User, backref="messages")
|
||||
# text = TextField(default='')
|
||||
# thread = ForeignKeyField(Thread, backref="messages", default=None)
|
||||
# file_type = CharField(default='')
|
||||
# perma_link = CharField(default='')
|
||||
# is_processed_override = BooleanField(default=False)
|
||||
# # reaction
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return "MSG [{}]".format(shorten_name(self.text).replace('\n','/'))
|
||||
|
||||
# @property
|
||||
# def slack_ts(self):
|
||||
# str_ts = str(self.ts)
|
||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem!
|
||||
# return "{}{}".format(str_ts, cut_zeros * "0")
|
||||
|
||||
|
||||
# @property
|
||||
# def urls(self):
|
||||
# pattern = r"<(.*?)>"
|
||||
# matches = re.findall(pattern, self.text)
|
||||
# matches = [m for m in matches if "." in m]
|
||||
|
||||
# new_matches = []
|
||||
# for m in matches:
|
||||
# if "." in m: # must contain a tld, right?
|
||||
# # further complication: slack automatically abbreviates urls in the format:
|
||||
# # <url|link preview>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
|
||||
# if "|" in m:
|
||||
# keep = m.split("|")[0]
|
||||
# else:
|
||||
# keep = m
|
||||
# new_matches.append(keep)
|
||||
# return new_matches
|
||||
|
||||
# @property
|
||||
# def is_by_human(self):
|
||||
# return self.user.user_id != slack_config["bot_id"]
|
||||
|
||||
|
||||
# @property
|
||||
# def has_single_url(self):
|
||||
# return len(self.urls) == 1
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def set_db(download_db_object):
|
||||
download_db.initialize(download_db_object)
|
||||
with download_db: # create tables (does nothing if they exist already)
|
||||
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user