new component - upload to NAS
news_fetch/Dockerfile (new file, 37 lines)
@@ -0,0 +1,37 @@
FROM python:latest

ENV TZ Europe/Zurich

# RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list
# allows the installation of the latest firefox release (debian is not usually a rolling release)
RUN apt-get update && apt-get install -y \
    evince \
    # for checking
    xauth \
    # for gui
    # wget tar firefox \
    # for geckodriver
    ghostscript
    # for compression


# Download gecko (firefox) driver for selenium
# RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckodriver-v0.31.0-linux64.tar.gz
# RUN tar -x geckodriver -zf geckodriver-v0.31.0-linux64.tar.gz -O > /usr/bin/geckodriver
# RUN chmod +x /usr/bin/geckodriver
# RUN rm geckodriver-v0.31.0-linux64.tar.gz


RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
# id mapped to local user
# home directory needed for pip package installation
RUN mkdir -p /app/auto_news
RUN chown -R autonews:autonews /app
USER autonews
ENV PATH=/home/autonews/.local/bin:$PATH

COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install -r /app/requirements.txt

COPY app /app/auto_news
WORKDIR /app/auto_news
news_fetch/app/configuration.py (new file, 59 lines)
@@ -0,0 +1,59 @@
from dataclasses import dataclass
import os
import shutil
import configparser
import logging
from datetime import datetime
from peewee import SqliteDatabase
from rich.logging import RichHandler

# first things first: logging
logging.basicConfig(
    format='%(message)s',
    level=logging.INFO,
    datefmt='%H:%M:%S', # add %Y-%m-%d if needed
    handlers=[RichHandler()]
)
logger = logging.getLogger(__name__)


# load config file containing constants and secrets
parsed = configparser.ConfigParser()
parsed.read("/app/containerdata/config/news_fetch.config.ini")

if os.getenv("DEBUG", "false") == "true":
    logger.warning("Found 'DEBUG=true', setting up dummy databases")

    db_base_path = parsed["DATABASE"]["db_path_dev"]
    parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
    parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
    parsed["DOWNLOADS"]["local_storage_path"] = parsed["DATABASE"]["db_path_dev"]
else:
    logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
    db_base_path = parsed["DATABASE"]["db_path_prod"]
    logger.info("Backing up databases")
    backup_dst = parsed["DATABASE"]["db_backup"]
    today = datetime.today().strftime("%Y.%m.%d")
    shutil.copyfile(
        os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
        os.path.join(backup_dst, today + "." + parsed["DATABASE"]["chat_db_name"]),
    )
    shutil.copyfile(
        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
        os.path.join(backup_dst, today + "." + parsed["DATABASE"]["download_db_name"]),
    )


from utils_storage import models

# Set up the database
models.set_db(
    SqliteDatabase(
        os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
    ),
    SqliteDatabase(
        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
    )
)
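For reference, a minimal sketch of the sections and keys configuration.py reads from news_fetch.config.ini. The section and key names come from the code above; the values are placeholders only:

import configparser

sketch = configparser.ConfigParser()
sketch["DATABASE"] = {
    "db_path_dev": "/app/containerdata/debug/",      # placeholder paths
    "db_path_prod": "/app/containerdata/",
    "db_backup": "/app/containerdata/backups/",
    "chat_db_name": "messages.db",
    "download_db_name": "downloads.db",
}
sketch["SLACK"] = {"archive_id": "C0000000000", "debug_id": "C1111111111"}
sketch["MAIL"] = {"sender": "bot@example.com", "recipient": "you@example.com"}
sketch["DOWNLOADS"] = {"local_storage_path": "/app/containerdata/files/"}

with open("news_fetch.config.ini", "w") as f:
    sketch.write(f)

Other modules in this commit read further keys from these sections (for example SLACK auth_token, bot_id, app_token, responsible_id, api_wait_time; MAIL smtp_server, port, uname, password; DOWNLOADS remote_storage_path, default_download_path, browser_profile_path, blacklisted_href_domains), so a real config file needs those as well.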
news_fetch/app/runner.py (new file, 197 lines)
@@ -0,0 +1,197 @@
"""Main coordination of other util classes. Handles inbound and outbound calls"""
import configuration
models = configuration.models
from threading import Thread
import logging
import os
logger = logging.getLogger(__name__)

from utils_mail import runner as mail_runner
from utils_slack import runner as slack_runner
from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker


class ArticleWatcher:
    """Wrapper for a newly created article object. Notifies the coordinator upon change/completion"""
    def __init__(self, article, thread, **kwargs) -> None:
        self.article_id = article.id # in case article becomes None at any point, we can still track the article
        self.article = article
        self.thread = thread

        self.completion_notifier = kwargs.get("notifier")
        self.fetch = kwargs.get("worker_fetch", None)
        self.download = kwargs.get("worker_download", None)
        self.compress = kwargs.get("worker_compress", None)
        self.upload = kwargs.get("worker_upload", None)

        self.completion_notified = False
        # self._download_called = self._compression_called = False
        self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False

        # first step: gather metadata
        if self.fetch and self.upload:
            self.fetch.process(self) # this will call the update_status method
            self.upload.process(self) # independent from the rest
        else: # the full kwargs were not provided, only do a manual run
            # overwrite update_status() because calls from the workers would result in errors
            self.update_status = lambda completed: logger.info(f"Completed action {completed}")
            for w in kwargs.get("workers_manual"):
                w.process(self)


    def update_status(self, completed_action):
        """Checks and notifies internal completion-status.
        Article download is complete iff fetch and download were successful and compression was run
        """
        # if self.completion_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done
        if completed_action == "fetch":
            self.download.process(self)
        elif completed_action == "download":
            self.compress.process(self)
        elif completed_action == "compress": # last step
            self.completion_notifier(self.article, self.thread)
            # triggers action in Coordinator
        elif completed_action == "upload":
            # this case occurs when upload was faster than compression
            pass
        else:
            logger.warning(f"update_status called with unusual configuration: {completed_action}")


    # ====== Attributes to be modified by the util workers
    @property
    def fetch_completed(self):
        return self._fetch_completed

    @fetch_completed.setter
    def fetch_completed(self, value: bool):
        self._fetch_completed = value
        self.update_status("fetch")

    @property
    def download_completed(self):
        return self._download_completed

    @download_completed.setter
    def download_completed(self, value: bool):
        self._download_completed = value
        self.update_status("download")

    @property
    def compression_completed(self):
        return self._compression_completed

    @compression_completed.setter
    def compression_completed(self, value: bool):
        self._compression_completed = value
        self.update_status("compress")

    @property
    def upload_completed(self):
        return self._upload_completed

    @upload_completed.setter
    def upload_completed(self, value: bool):
        self._upload_completed = value
        self.update_status("upload")

    def __str__(self) -> str:
        return f"Article with id {self.article_id}"


class Coordinator(Thread):
    def __init__(self, **kwargs) -> None:
        """Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
        super().__init__(target = self.launch)

    def add_workers(self, **kwargs):
        self.worker_slack = kwargs.pop("worker_slack", None)
        self.worker_mail = kwargs.pop("worker_mail", None)
        # the two above won't be needed in the Watcher
        self.worker_download = kwargs.get("worker_download", None)
        self.worker_fetch = kwargs.get("worker_fetch", None)
        self.worker_compress = kwargs.get("worker_compress", None)
        self.worker_upload = kwargs.get("worker_upload", None)

        self.kwargs = kwargs

    def launch(self) -> None:
        for w in [self.worker_download, self.worker_fetch, self.worker_upload, self.worker_compress]:
            if w is not None:
                w.start()


    def incoming_request(self, message):
        """This method is passed on to the slack worker. It gets triggered when a new message is received."""
        url = message.urls[0] # ignore all the other ones
        article, is_new = models.ArticleDownload.get_or_create(article_url=url)
        thread = message.thread
        thread.article = article
        thread.save()
        self.kwargs.update({"notifier" : self.article_complete_notifier})

        if is_new or (article.file_name == "" and article.verified == 0):
            # check for models that were created but abandoned. This means they have missing information, most importantly no associated file
            # this overwrites previously set information, but that should not be too important
            ArticleWatcher(
                article,
                thread,
                **self.kwargs
            )

            # All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
            # fetch -> download -> compress -> complete
            # the watcher orchestrates the procedure and notifies upon completion
            # the watcher will notify once it is sufficiently populated
        else: # manually trigger notification immediately
            logger.info(f"Found existing article {article}. Now sending")
            self.article_complete_notifier(article, thread)



    def manual_processing(self, articles, workers):
        for w in workers:
            w.start()

        for article in articles:
            notifier = lambda article: print(f"Completed manual actions for {article}")
            ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # the watcher wants a thread to link the article to. TODO: handle threads as a kwarg

    def article_complete_notifier(self, article, thread):
        if self.worker_slack is None:
            logger.warning("Not sending slack notifier")
        else:
            self.worker_slack.bot_worker.respond_channel_message(thread)
        if self.worker_mail is None:
            logger.warning("Not sending mail notifier")
        else:
            self.worker_mail.send(article)



if __name__ == "__main__":
    coordinator = Coordinator()


    if os.getenv("UPLOAD", "false") == "true":
        articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
        logger.info(f"Launching upload to archive for {len(articles)} articles.")
        coordinator.manual_processing(articles, [UploadWorker()])

    elif os.getenv("CHECK", "false") == "true":
        from utils_check import runner as check_runner
        check_runner.verify_unchecked()

    else: # launch with full action
        slack_runner = slack_runner.BotRunner(coordinator.incoming_request)
        kwargs = {
            "worker_download" : DownloadWorker(),
            "worker_fetch" : FetchWorker(),
            "worker_upload" : UploadWorker(),
            "worker_compress" : CompressWorker(),
            "worker_slack" : slack_runner,
            "worker_mail" : mail_runner,
        }
        coordinator.add_workers(**kwargs)
        coordinator.start()
        slack_runner.start()
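The workers imported from utils_worker.workers are not included in this commit. Judging from how ArticleWatcher and Coordinator use them, each worker is a threaded queue exposing start() and process(watcher) and reporting back by setting the matching *_completed property. A hypothetical minimal worker following that contract (only the interface is taken from the code above):

from queue import Queue
from threading import Thread

class SketchFetchWorker(Thread):
    """Hypothetical worker sketch: accepts watchers via process() and signals
    completion by assigning watcher.fetch_completed, which triggers update_status("fetch")."""
    def __init__(self):
        super().__init__(daemon=True)
        self._queue = Queue()

    def process(self, watcher):
        self._queue.put(watcher)  # enqueue; the worker thread picks it up

    def run(self):
        while True:
            watcher = self._queue.get()
            # ... fetch metadata for watcher.article here ...
            watcher.fetch_completed = True  # property setter notifies the watcher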
news_fetch/app/utils_check/runner.py (new file, 207 lines)
@@ -0,0 +1,207 @@
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule
console = Console()
hline = Rule(style="white")

import os
import subprocess
from slack_sdk import WebClient
import configuration
models = configuration.models

u_options = {
    "ENTER" : "Accept PDF as is. It gets marked as verified",
    "D" : "set language to DE and set verified",
    "E" : "set language to EN and set verified",
    "O" : "set other language (prompted)",
    "R" : "set related files (prompted multiple times)",
    "B" : "reject and move to folder BAD",
    "L" : "leave file as is, do not send reaction"
}


bot_client = WebClient(
    token = configuration.parsed["SLACK"]["auth_token"]
)




def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
    """Prints a neat overview of the current article"""
    file_table = Table(
        title = file_url,
        row_styles = ["white", "bright_black"],
        min_width = 100
    )

    file_table.add_column("Attribute", justify = "right", no_wrap = True)
    file_table.add_column("Value set by auto_news")
    file_table.add_column("Status", justify = "right")
    for attr in file_attributes:
        file_table.add_row(attr["name"], attr["value"], attr["status"])


    option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()])
    option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()])
    columns = Columns([option_key, option_action])

    console.print(file_table)
    console.print("Your options:")
    console.print(columns)


def send_reaction_to_slack_thread(article, reaction):
    """Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
    thread = article.slack_thread
    messages = models.Message.select().where(models.Message.text.contains(article.article_url))
    # TODO rewrite this properly
    if len(messages) > 5:
        print("Found more than 5 messages. Aborting reactions...")
        return
    for m in messages:
        if not m.has_single_url:
            print("Found thread but won't send reaction because thread has multiple urls")
        else:
            ts = m.slack_ts
            bot_client.reactions_add(
                channel=configuration.parsed["SLACK"]["archive_id"],
                name=reaction,
                timestamp=ts
            )
            print("Sent reaction to message")


def prompt_language(query):
    not_set = True
    while not_set:
        uin = input("Set language (nation-code, 2 letters) ")
        if len(uin) != 2:
            print("Bad code, try again")
        else:
            not_set = False
            query.language = uin
            query.save()


def prompt_related(query):
    file_list = []
    finished = False
    while not finished:
        uin = input("Additional file for article? Type '1' to cancel ")
        if uin == "1":
            query.set_related(file_list)
            finished = True
        else:
            file_list.append(uin)


def prompt_new_fname(query):
    uin = input("New fname? ")
    old_fname = query.file_name
    query.file_name = uin
    query.verified = 1
    if old_fname != "":
        os.remove(query.save_path + old_fname)
    query.save()



def reject_article(article):
    article.verified = -1
    article.save()
    print("Article marked as bad")
    # also update the threads to not be monitored anymore
    send_reaction_to_slack_thread(article, "x")


def unreject_article(query):
    query.verified = 1
    query.save()
    # os.rename(badpdf, fname)
    print("File set to verified")


def accept_article(article, last_accepted):
    article.verified = 1
    article.save()
    print("Article accepted as GOOD")

    # also update the threads to not be monitored anymore
    send_reaction_to_slack_thread(article, "white_check_mark")

    return "" # linked






def verify_unchecked():
    query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
    last_linked = None

    for article in query:
        console.print(hline)
        core_info = []
        for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]):
            entry = {
                "status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]",
                "value" : e if len(e) != 0 else "not set",
                "name" : name
            }
            core_info.append(entry)

        try:
            # close any previously opened windows:
            # subprocess.call(["kill", "`pgrep evince`"])
            os.system("pkill evince")
            # then open a new one
            subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # suppress evince gtk warnings
        except Exception as e:
            print(e)
            continue



        file_overview(
            file_url = article.article_url,
            file_attributes = core_info,
            options = u_options
        )


        proceed = False
        while not proceed:
            uin = input("Choice? ").lower()
            if uin == "":
                last_linked = accept_article(article, last_linked) # last linked accelerates the whole process
                proceed = True
            elif uin == "d":
                article.language = "de"
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "e":
                article.language = "en"
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "o":
                prompt_language(article)
            elif uin == "r":
                prompt_related(article)
            elif uin == "b":
                reject_article(article)
                proceed = True
            elif uin == "l":
                # do nothing
                proceed = True
            else:
                print("Invalid input")
news_fetch/app/utils_mail/runner.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
import os
import logging
import configuration

logger = logging.getLogger(__name__)
config = configuration.parsed["MAIL"]

def send(article_model):
    mail = MIMEMultipart()
    mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
    mail['From'] = config["sender"]
    mail['To'] = config["recipient"]

    msgs = article_model.mail_info # this is html
    msg = [m["reply_text"] for m in msgs]
    msg = "\n".join(msg)

    content = MIMEText(msg, "html")
    mail.attach(content)

    files = [m["file_path"] for m in msgs if m["file_path"]]
    for path in files:
        with open(path, 'rb') as file:
            part = MIMEApplication(file.read(), "pdf")
            # encoders.encode_base64(part)
            part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path))
            mail.attach(part)

    try:
        smtp = smtplib.SMTP(config["smtp_server"], config["port"])
        smtp.starttls()
        smtp.login(config["uname"], config["password"])
        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
        smtp.quit()
        logger.info("Mail successfully sent.")
    except Exception as e:
        logger.error("Could not send mail for article {}".format(article_model))
        logger.info(e)
news_fetch/app/utils_slack/message_helpers.py (new file, 277 lines)
@@ -0,0 +1,277 @@
import logging
import configuration
import requests
import os
import time
from threading import Thread
from slack_sdk.errors import SlackApiError

logger = logging.getLogger(__name__)
config = configuration.parsed["SLACK"]
models = configuration.models
slack_client = "dummy"
LATEST_RECORDED_REACTION = 0


def init(client) -> None:
    global slack_client
    slack_client = client

    global LATEST_RECORDED_REACTION
    try:
        LATEST_RECORDED_REACTION = models.Reaction.select(models.Reaction.id).order_by("id")[-1]
    except IndexError: # query is actually empty, we have never fetched any messages until now
        LATEST_RECORDED_REACTION = 0

    # fetch all the messages we could have possibly missed
    logger.info("Querying missed messages, threads and reactions. This can take some time.")
    fetch_missed_channel_messages() # not threaded
    t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
    t.start()

    if os.getenv("REDUCEDFETCH", "false") == "true":
        logger.warning("Only fetching empty threads for bot messages because 'REDUCEDFETCH=true'")
        fetch_missed_thread_messages(reduced=True)
    else: # perform both asynchronously
        fetch_missed_thread_messages()



def get_unhandled_messages():
    """Gets all messages that have not yet been handled, be it by mistake or by downtime.
    As the message handler makes no distinction between channel messages and thread messages,
    we don't have to worry about them here.
    """

    threaded_objects = []
    for t in models.Thread.select():
        if t.message_count > 1: # if only one message was written, it is the channel message
            msg = t.last_message
            if msg.is_by_human:
                threaded_objects.append(msg)
        # else don't, nothing to process
    logger.info(f"Set {len(threaded_objects)} thread-messages as not yet handled.")


    channel_objects = [t.initiator_message for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
    logger.info(f"Set {len(channel_objects)} channel-messages as not yet handled.")

    reaction_objects = list(models.Reaction.select().where(models.Reaction.id > LATEST_RECORDED_REACTION))
    logger.info(f"Set {len(reaction_objects)} reactions as not yet handled.")
    # the ones newer than the last before the fetch

    all_messages = channel_objects + threaded_objects
    return all_messages, reaction_objects


def fetch_missed_channel_messages():
    # latest processed message_ts is:
    presaved = models.Message.select().order_by(models.Message.ts)
    if not presaved:
        last_ts = 0
    else:
        last_message = presaved[-1]
        last_ts = last_message.slack_ts

    result = slack_client.conversations_history(
        channel=config["archive_id"],
        oldest=last_ts
    )

    new_messages = result.get("messages", [])
    # # filter the last one, it is a duplicate! (only if the db is not empty!)
    # if last_ts != 0 and len(new_messages) != 0:
    #     new_messages.pop(-1)

    new_fetches = 0
    for m in new_messages:
        # print(m)
        message_dict_to_model(m)
        new_fetches += 1

    refetch = result.get("has_more", False)
    while refetch: # we have not actually fetched them all
        try:
            result = slack_client.conversations_history(
                channel = config["archive_id"],
                cursor = result["response_metadata"]["next_cursor"],
                oldest = last_ts
            ) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
            refetch = result.get("has_more", False)

            new_messages = result.get("messages", [])
            for m in new_messages:
                message_dict_to_model(m)
                new_fetches += 1
        except SlackApiError: # most likely a rate limit
            logger.error("Error while fetching channel messages (likely rate limit). Retrying in {} seconds...".format(config["api_wait_time"]))
            time.sleep(int(config["api_wait_time"]))
            refetch = True

    logger.info(f"Fetched {new_fetches} new channel messages.")


def fetch_missed_thread_messages(reduced=False):
    """After having gotten all base-threads, we need to fetch all their replies"""
    # I don't know of a better way: we need to fetch this for each and every thread (except if it is marked as permanently solved)
    logger.info("Starting fetch of thread messages...")
    if reduced:
        threads = [t for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
        # this only fetches completely empty threads, which might be because the bot-message was not yet saved to the db.
        # once we get all the bot-messages, the remaining empty threads will be the ones we need to process.
    else:
        threads = [t for t in models.Thread.select() if not t.is_fully_processed]
    logger.info(f"Fetching history for {len(threads)} empty threads")
    new_messages = []
    for i, t in enumerate(threads):
        try:
            messages = slack_client.conversations_replies(
                channel = config["archive_id"],
                ts = t.slack_ts,
                oldest = t.messages[-1].slack_ts
            )["messages"]
        except SlackApiError:
            logger.error("Hit rate limit while querying threaded messages, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
            time.sleep(int(config["api_wait_time"]))
            messages = slack_client.conversations_replies(
                channel = config["archive_id"],
                ts = t.slack_ts,
                oldest = t.messages[-1].slack_ts
            )["messages"]

        messages.pop(0) # the first message is the one posted in the channel. We already processed it!

        for m in messages:
            # only append *new* messages
            res = message_dict_to_model(m)
            if res:
                new_messages.append(res)
    logger.info("Fetched {} new threaded messages.".format(len(new_messages)))


def fetch_missed_channel_reactions():
    logger.info("Starting background fetch of channel reactions...")
    threads = [t for t in models.Thread.select() if not t.is_fully_processed]
    for i, t in enumerate(threads):
        try:
            query = slack_client.reactions_get(
                channel = config["archive_id"],
                timestamp = t.slack_ts
            )
            reactions = query.get("message", []).get("reactions", []) # default = []
        except SlackApiError: # probably a rate limit
            logger.error("Hit rate limit while querying reactions, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
            time.sleep(int(config["api_wait_time"]))
            query = slack_client.reactions_get(
                channel = config["archive_id"],
                timestamp = t.slack_ts
            )
            reactions = query.get("message", []).get("reactions", [])

        for r in reactions:
            reaction_dict_to_model(r, t)





# Helpers for message conversion to db-objects
def reaction_dict_to_model(reaction, thread=None):
    if thread is None:
        m_ts = reaction["item"]["ts"]
        message = models.Message.get(ts = float(m_ts))
        thread = message.thread
    if "name" in reaction.keys(): # fetched through manual api query
        content = reaction["name"]
    elif "reaction" in reaction.keys(): # fetched through events
        content = reaction["reaction"]
    else:
        logger.error(f"Weird reaction received: {reaction}")
        return None

    r, _ = models.Reaction.get_or_create(
        type = content,
        message = thread.initiator_message
    )
    logger.info("Saved reaction [{}]".format(content))
    return r


def message_dict_to_model(message):
    if message["type"] == "message":
        thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
        uid = message.get("user", "BAD USER")
        if uid == "BAD USER":
            logger.critical("Message has no user?? {}".format(message))
            return None

        user, _ = models.User.get_or_create(user_id = uid)
        thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
        m, new = models.Message.get_or_create(
            user = user,
            thread = thread,
            ts = message["ts"],
            channel_id = config["archive_id"],
            text = message["text"]
        )
        logger.info(f"Saved: {m} ({'new' if new else 'old'})")

        files = message.get("files", [])
        if len(files) >= 1:
            f = files[0] # default: []
            m.file_type = f["filetype"]
            m.perma_link = f["url_private_download"]
            m.save()
            logger.info(f"Saved {m.file_type}-file for message (id={m.id})")
        if new:
            return m
        else:
            return None
    else:
        logger.warning("What should I do with {}".format(message))
        return None


def say_substitute(*args, **kwargs):
    logger.info("Now sending message through say-substitute: {}".format(" - ".join(args)))
    slack_client.chat_postMessage(
        channel=config["archive_id"],
        text=" - ".join(args),
        **kwargs
    )


def save_as_related_file(url, article_object):
    r = requests.get(url, headers={"Authorization": "Bearer {}".format(slack_client.token)})
    saveto = article_object.save_path
    ftype = url[url.rfind(".") + 1:]
    fname = "{} - related no {}.{}".format(
        article_object.file_name.replace(".pdf", ""),
        len(article_object.related) + 1,
        ftype
    )
    with open(os.path.join(saveto, fname), "wb") as f:
        f.write(r.content)
    article_object.set_related([fname])
    logger.info("Added {} to model {}".format(fname, article_object))
    return fname


def react_file_path_message(fname, article_object):
    saveto = article_object.save_path
    file_path = os.path.join(saveto, fname)
    if os.path.exists(file_path):
        article_object.set_related([fname])
        logger.info("Added {} to model {}".format(fname, article_object))
        return True
    else:
        return False


def is_message_in_archiving(message) -> bool:
    if isinstance(message, dict):
        return message["channel"] == config["archive_id"]
    else:
        return message.channel_id == config["archive_id"]


def is_reaction_in_archiving(event) -> bool:
    if isinstance(event, dict):
        return event["item"]["channel"] == config["archive_id"]
    else:
        return event.message.channel_id == config["archive_id"]
news_fetch/app/utils_slack/runner.py (new file, 184 lines)
@@ -0,0 +1,184 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler

import logging
import configuration

from . import message_helpers


config = configuration.parsed["SLACK"]
models = configuration.models

class BotApp(App):
    logger = logging.getLogger(__name__)

    def __init__(self, callback, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.callback = callback

    def start(self):
        message_helpers.init(self.client)
        missed_messages, missed_reactions = message_helpers.get_unhandled_messages()

        [self.handle_incoming_message(m) for m in missed_messages]
        [self.handle_incoming_reaction(r) for r in missed_reactions]

        # self.react_missed_reactions(missed_reactions)
        # self.react_missed_messages(missed_messages)
        self.startup_status()



    def handle_incoming_reaction(self, reaction):
        if isinstance(reaction, dict): # else: the reaction is already being passed as a model
            # CAUTION: filter for 'changed reactions', those are nasty (usually when adding a url)
            reaction = message_helpers.reaction_dict_to_model(reaction)

        thread = reaction.message.thread
        article_object = thread.article
        if article_object is not None:
            reaction = reaction.type
            status = 1 if reaction == "white_check_mark" else -1

            # self.logger.info(f"Applying reaction {reaction} to its root message.")
            article_object.verified = status
            article_object.save()


    def handle_incoming_message(self, message):
        """Reacts to all messages inside channel archiving. Must then
        distinguish between threaded replies and new requests
        and react accordingly"""
        if isinstance(message, dict): # else: the message is already being passed as a model
            # CAUTION: filter for 'changed messages', those are nasty (usually when adding a url)
            if message.get("subtype", "not bad") == "message_changed":
                return False
            message = message_helpers.message_dict_to_model(message)

        # First check: belongs to thread?
        is_threaded = message.thread.message_count > 1 and message != message.thread.initiator_message
        if is_threaded:
            self.incoming_thread_message(message)
        else:
            self.incoming_channel_message(message)


    def incoming_thread_message(self, message):
        if message.user.user_id == config["bot_id"]:
            return True # ignore the files uploaded by the bot. We handled them already!

        thread = message.thread
        if thread.is_fully_processed:
            return True

        self.logger.info("Receiving thread-message")
        self.respond_thread_message(message)


    def incoming_channel_message(self, message):
        self.logger.info(f"Handling message {message} ({len(message.urls)} urls)")

        if not message.urls: # no urls in a root-message => IGNORE
            message.is_processed_override = True
            message.save()
            return

        # ensure thread is still empty, this is a scenario encountered only in testing, but let's just filter it
        if message.thread.message_count > 1:
            self.logger.info("Discarded message because it is actually processed.")
            return

        if len(message.urls) > 1:
            message_helpers.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts)

        self.callback(message)
        # for url in message.urls:
        #     self.callback(url, message)
        # stop here!



    def respond_thread_message(self, message, say=message_helpers.say_substitute):
        thread = message.thread
        article = thread.article
        if message.perma_link: # file upload means new data
            fname = message_helpers.save_as_related_file(message.perma_link, article)
            say("File was saved as 'related file' under `{}`.".format(fname),
                thread_ts=thread.slack_ts
            )
        else: # either a pointer to a new file (too large to upload), or trash
            success = message_helpers.react_file_path_message(message.text, article)
            if success:
                say("File was saved as 'related file'", thread_ts=thread.slack_ts)
            else:
                self.logger.error("User replied to thread {} but the response did not contain a file/path".format(thread))
                say("Cannot process response without associated file.",
                    thread_ts=thread.slack_ts
                )


    def respond_channel_message(self, thread, say=message_helpers.say_substitute):
        article = thread.article
        answers = article.slack_info
        for a in answers:
            if a["file_path"]:
                try: # either a["file_path"] does not exist, or the upload resulted in an error
                    self.client.files_upload(
                        channels = config["archive_id"],
                        initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
                        file = a["file_path"],
                        thread_ts = thread.slack_ts
                    )
                    status = True
                except:
                    say(
                        "File {} could not be uploaded.".format(a),
                        thread_ts=thread.slack_ts
                    )
                    status = False
            else: # anticipated that there is no file!
                say(
                    f"<@{config['responsible_id']}> \n {a['reply_text']}",
                    thread_ts=thread.slack_ts
                )
                status = True


    def startup_status(self):
        threads = [t for t in models.Thread.select()]
        all_threads = len(threads)
        fully_processed = len([t for t in threads if t.is_fully_processed])
        fully_unprocessed = len([t for t in threads if t.message_count == 1])
        articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})






class BotRunner():
    """Thin encapsulation so that we can apply the slack decorators to the BotApp"""
    def __init__(self, callback, *args, **kwargs) -> None:
        self.bot_worker = BotApp(callback, token=config["auth_token"])

        @self.bot_worker.event(event="message", matchers=[message_helpers.is_message_in_archiving])
        def handle_incoming_message(message, say):
            return self.bot_worker.handle_incoming_message(message)

        @self.bot_worker.event(event="reaction_added", matchers=[message_helpers.is_reaction_in_archiving])
        def handle_incoming_reaction(event, say):
            return self.bot_worker.handle_incoming_reaction(event)

        # target = self.launch
        # super().__init__(target=target)


    def start(self):
        self.bot_worker.start()
        SocketModeHandler(self.bot_worker, config["app_token"]).start()


    # def respond_to_message(self, message):
    #     self.bot_worker.handle_incoming_message(message)
news_fetch/app/utils_storage/migrations/migration.001.py (new file, 67 lines)
@@ -0,0 +1,67 @@
from playhouse.migrate import *


"""
This migration assumes that downloads.db kept the exact same structure as before.
messages.db should drop the table articlemodelreference in favor of a new field article in the thread-table.
Since each thread is constrained to exactly one article this makes the most sense.

This migration assumes that messages.db gets a new field in the table thread:
id | thread_ts | article_id

We now need to migrate from the table articlemodelreference and then delete it.
"""


db = SqliteDatabase("/code/.dev/messages.db")
migrator = SqliteMigrator(db)


article_field = IntegerField(null=True)


migrate(
    migrator.add_column('thread', 'article_id', article_field),
    # migrator.drop_column('some_table', 'old_column'),
)



# these are the old models, adapted to the migration

class BaseModel(Model):
    class Meta:
        database = db

class User(BaseModel):
    user_id = CharField(default='', unique=True)

class Thread(BaseModel):
    """The threads that concern us are only created for messages that contain urls"""
    thread_ts = FloatField(default = 0)
    article_id = IntegerField(null=True)


class Message(BaseModel):
    ts = FloatField(unique=True) # for sorting
    channel_id = CharField(default='')
    user = ForeignKeyField(User, backref="messages")
    text = TextField(default='')
    thread = ForeignKeyField(Thread, backref="messages", default=None)
    file_type = CharField(default='')
    perma_link = CharField(default='')
    is_processed_override = BooleanField(default=False)


class ArticleModelReference(BaseModel):
    message = ForeignKeyField(Message, backref='article_model_references')
    article_model_id = IntegerField(default = 0)




for ref in ArticleModelReference.select():
    ref.message.thread.article_id = ref.article_model_id
    ref.message.thread.save()

db.drop_tables((ArticleModelReference,))
news_fetch/app/utils_storage/models.py (new file, 322 lines)
@@ -0,0 +1,322 @@
import logging
logger = logging.getLogger(__name__)

from peewee import *
import os
import markdown
import re
import configuration
import datetime

config = configuration.parsed["DOWNLOADS"]
slack_config = configuration.parsed["SLACK"]

## Helpers
chat_db = DatabaseProxy()
download_db = DatabaseProxy()

# the concrete database is bound at runtime via set_db()

class DownloadBaseModel(Model):
    class Meta:
        database = download_db

class ChatBaseModel(Model):
    class Meta:
        database = chat_db



## == Article related models == ##
class ArticleDownload(DownloadBaseModel):
    title = CharField(default='')
    pub_date = DateField(default = '')
    download_date = DateField(default = datetime.date.today)
    source_name = CharField(default = '')
    article_url = TextField(default = '', unique=True)
    archive_url = TextField(default = '')
    file_name = TextField(default = '')
    language = CharField(default = '')
    summary = TextField(default = '')
    comment = TextField(default = '')
    verified = IntegerField(default = False)
    # authors
    # keywords
    # ... are added through foreign keys

    def __str__(self) -> str:
        return f"ART [{self.title} -- {self.source_name}]"

    ## Useful Properties
    @property
    def save_path(self):
        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"

    def fname_nas(self, file_name=""):
        if self.download_date:
            if file_name:
                return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), file_name)
            else: # return the object's own name
                return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), self.file_name)
        else:
            return None

    @property
    def fname_template(self):
        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
            fname = "{} -- {}".format(self.source_name, self.title)
        else:
            fname = "{} -- {}.pdf".format(self.source_name, self.title)
        return clear_path_name(fname)

    @property
    def is_title_bad(self): # add incrementally
        return "PUR-Abo" in self.title \
            or "Redirecting" in self.title \
            or "Error while running fetch" in self.title

    @property
    def slack_info(self):
        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
        content = "\n>" + "\n>".join(self.summary.split("\n"))
        file_status, msg = self.file_status()
        if not file_status:
            return [msg]

        # everything alright: generate real content
        # first the base file
        if self.file_name[-4:] == ".pdf":
            answer = [{ # main reply with the base pdf
                "reply_text" : f"*{self.title}*\n{status}\n{content}",
                "file_path" : self.save_path + self.file_name
            }]
        else: # don't upload if the file is too big!
            location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas())
            answer = [{ # main reply without the base pdf
                "reply_text" : "*{}*\n{}\n{}\n{}".format(self.title, status, content, location),
                "file_path" : None
            }]

        # then the related files
        rel_text = ""
        for r in self.related:
            fname = r.related_file_name
            lentry = "\n• `{}` ".format(self.fname_nas(fname))
            if fname[-4:] == ".pdf": # this is a manageable file, directly upload
                f_ret = self.save_path + fname
                answer.append({"reply_text": "", "file_path" : f_ret})
            else: # not pdf <=> too large. Don't upload but mention its existence
                lentry += "(not uploaded to slack, but the file will be on the NAS)"

            rel_text += lentry

        if rel_text:
            rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text

        return answer

    @property
    def mail_info(self):
        base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path": None}] + self.slack_info
        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]


    ## Helpers
    def set_keywords(self, keywords):
        for k in keywords:
            ArticleKeyword.create(
                article = self,
                keyword = k
            )

    def set_authors(self, authors):
        for a in authors:
            ArticleAuthor.create(
                article = self,
                author = a
            )

    def set_references(self, references):
        for r in references:
            ArticleReference.create(
                article = self,
                reference_url = r
            )

    def set_related(self, related):
        for r in related:
            ArticleRelated.create(
                article = self,
                related_file_name = r
            )

    def file_status(self):
        if not self.file_name:
            logger.error("Article {} has no filename!".format(self))
            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}

        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error("Article {} has a filename, but the file does not exist at that location!".format(self))
            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}

        return True, {}


class ArticleKeyword(DownloadBaseModel):
    # an instance gets created for each keyword -> flexible in size
    article = ForeignKeyField(ArticleDownload, backref='keywords')
    keyword = CharField()


class ArticleAuthor(DownloadBaseModel):
    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()


class ArticleReference(DownloadBaseModel):
    article = ForeignKeyField(ArticleDownload, backref='references')
    reference_url = TextField(default = '')


class ArticleRelated(DownloadBaseModel):
    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default = '')




## == Slack-thread related models == ##
class User(ChatBaseModel):
    user_id = CharField(default='', unique=True)
    # messages


class Thread(ChatBaseModel):
    """The threads that concern us are only created if the base message contains a url"""
    thread_ts = FloatField(default = 0)
    article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
    # provides ts, user, models
    # messages

    @property
    def slack_ts(self):
        str_ts = str(self.thread_ts)
        cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem!
        return "{}{}".format(str_ts, cut_zeros * "0")

    @property
    def initiator_message(self):
        try:
            return self.messages[0] # TODO check if this needs sorting
        except IndexError:
            logger.warning(f"Thread {self} is empty. How can that be?")
            return None

    @property
    def message_count(self):
        # logger.warning("message_count was called")
        return self.messages.count()

    @property
    def last_message(self):
        messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
        return messages[-1]

    @property
    def is_fully_processed(self) -> bool:
        init_message = self.initiator_message
        if init_message is None:
            return False

        if init_message.is_processed_override:
            return True
            # this override is set, for instance, when no url was sent at all. Then set this thread to be ignored

        reactions = init_message.reaction
        if not reactions:
            return False
        else:
            r = reactions[0].type # can and should only have one reaction
            return r == "white_check_mark" \
                or r == "x"



class Message(ChatBaseModel):
    ts = FloatField(unique=True) # for sorting
    channel_id = CharField(default='')
    user = ForeignKeyField(User, backref="messages")
    text = TextField(default='')
    thread = ForeignKeyField(Thread, backref="messages", default=None)
    file_type = CharField(default='')
    perma_link = CharField(default='')
    is_processed_override = BooleanField(default=False)
    # reaction

    def __str__(self) -> str:
        return "MSG [{}]".format(self.text[:min(len(self.text), 30)].replace('\n', '/') + '...')

    @property
    def slack_ts(self):
        str_ts = str(self.ts)
        cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem!
        return "{}{}".format(str_ts, cut_zeros * "0")


    @property
    def urls(self):
        pattern = r"<(.*?)>"
        matches = re.findall(pattern, self.text)
        matches = [m for m in matches if "." in m]

        new_matches = []
        for m in matches:
            if "." in m: # must contain a tld, right?
                # further complication: slack automatically abbreviates urls in the format:
                # <url|link preview>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
                if "|" in m:
                    keep = m.split("|")[0]
                else:
                    keep = m
                new_matches.append(keep)
        return new_matches

    @property
    def is_by_human(self):
        return self.user.user_id != slack_config["bot_id"]


    @property
    def has_single_url(self):
        return len(self.urls) == 1


class Reaction(ChatBaseModel):
    type = CharField(default = "")
    message = ForeignKeyField(Message, backref="reaction")




def create_tables():
    with download_db:
        download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated])
    with chat_db:
        chat_db.create_tables([User, Message, Thread, Reaction])


def set_db(chat_db_object, download_db_object):
    chat_db.initialize(chat_db_object)
    download_db.initialize(download_db_object)
    create_tables()

def clear_path_name(path):
    keepcharacters = (' ', '.', '_', '-')
    converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()
    return converted
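set_db() binds the two DatabaseProxy objects and creates the tables, mirroring what configuration.py does with the real database files. A minimal usage sketch with throwaway in-memory databases (assumes the app package is importable; the url and printed value are illustrative only):

from peewee import SqliteDatabase
from utils_storage import models

models.set_db(
    SqliteDatabase(":memory:"),  # chat_db
    SqliteDatabase(":memory:"),  # download_db
)
article = models.ArticleDownload.create(article_url="https://example.com/story")  # placeholder url
print(article.fname_template)  # " -- .pdf" until title and source_name are set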
news_fetch/app/utils_worker/_init__.py (new file, 0 lines)
news_fetch/app/utils_worker/compress/runner.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import os
import subprocess
from pathlib import Path

import logging
logger = logging.getLogger(__name__)
import configuration
config = configuration.parsed["DOWNLOADS"]

shrink_sizes = []


def shrink_pdf(article):
    article_loc = Path(article.save_path) / article.file_name
    initial_size = article_loc.stat().st_size
    compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"

    if article_loc.suffix != ".pdf":
        return article # it probably was a youtube video

    c = subprocess.run(
        [
            "gs",
            "-sDEVICE=pdfwrite",
            "-dPDFSETTINGS=/screen",
            "-dNOPAUSE",
            "-dBATCH",
            f"-sOutputFile={compressed_tmp}",
            f"{article_loc}"
        ],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    if c.returncode == 0:
        try:
            os.replace(compressed_tmp, article_loc)
        except OSError as e:
            logger.error(f"Compression ran but I could not copy back the file: {e}")

        final_size = article_loc.stat().st_size
        shrink_sizes.append(initial_size - final_size)
        logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")

    else:
        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")

    return article
news_fetch/app/utils_worker/download/__init__.py (new file, 0 lines)
news_fetch/app/utils_worker/download/browser.py (new file, 172 lines)
@@ -0,0 +1,172 @@
import time
import datetime
import logging
import os
import base64
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import configuration
import json

config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])


class PDFDownloader:
    """Saves a given url. Fills the object it got as a parameter"""
    logger = logging.getLogger(__name__)
    # status-variable for restarting:
    running = False

    def start(self):
        self.finish() # clear up

        options = webdriver.FirefoxOptions()
        options.profile = config["browser_profile_path"]
        # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4, but that doesn't work

        if os.getenv("HEADLESS", "false") == "true":
            options.add_argument('--headless')
        else:
            self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")

        options.set_preference('print.save_as_pdf.links.enabled', True)
        # Just save if the filetype is pdf already -- does not work!

        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
        options.set_preference("browser.download.folderList", 2)
        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
        # options.set_preference("pdfjs.disabled", True)
        options.set_preference("browser.download.dir", config["default_download_path"])

        self.logger.info("Starting gecko driver")
        # local-driver alternative:
        # self.driver = webdriver.Firefox(
        #     options = options,
        #     service = webdriver.firefox.service.Service(
        #         log_path = f'{config["local_storage_path"]}/geckodriver.log'
        # ))
        self.driver = webdriver.Remote(
            command_executor = 'http://geckodriver:4444',
            options = options,
            # can't set log path...
        )

        # clear out leftover downloads from a previous run
        residues = os.listdir(config["default_download_path"])
        for res in residues:
            os.remove(os.path.join(config["default_download_path"], res))

        self.running = True

    def autostart(self):
        if not self.running:
            self.start() # relaunch the dl util

    def finish(self):
        if self.running:
            self.logger.info("Exiting gecko driver")
            try:
                self.driver.quit()
                time.sleep(10)
            except Exception:
                self.logger.critical("Connection to the driver broke off")
            self.running = False
        else:
            self.logger.info("Gecko driver not yet running")

    def download(self, article_object):
        sleep_time = 2
        self.autostart()
        url = article_object.article_url

        try:
            self.driver.get(url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
            return article_object # without changes

        time.sleep(sleep_time)
        # leave the page time to do any funky business

        # in the meantime, get a page title if required
        if article_object.is_title_bad:
            article_object.title = self.driver.title.replace(".pdf", "")
            # will be propagated to the saved file (dst) as well

        fname = article_object.fname_template
        dst = os.path.join(article_object.save_path, fname)
        if os.path.exists(dst):
            fname = make_path_unique(fname)
            dst = os.path.join(article_object.save_path, fname)

        if url[-4:] == ".pdf":
            # according to the browser preferences, calling the url will open pdfjs.
            # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
            success = self.get_existing_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

        if success:
            article_object.file_name = fname
            article_object.set_references(self.get_references())
        else:
            article_object.file_name = ""

        return article_object # this change is saved later by the external caller

    def get_existing_pdf(self, url, dst):
        """Fetches a url that already points to a pdf and writes it to dst"""
        try:
            r = requests.get(url)
            pdf_bytes = r.content
        except Exception:
            return False
        return self.get_new_pdf(dst, other_bytes=pdf_bytes)

    def get_new_pdf(self, dst, other_bytes=None):
        """Prints the current page to a pdf at dst (or writes other_bytes if given)"""
        os.makedirs(os.path.dirname(dst), exist_ok=True)

        if other_bytes is None:
            try:
                result = self.driver.print_page()
                pdf_bytes = base64.b64decode(result, validate=True)
            except Exception:
                self.logger.error("Failed, probably because the driver went extinct.")
                return False
        else:
            pdf_bytes = other_bytes

        try:
            with open(dst, "wb+") as f:
                f.write(pdf_bytes)
            return True
        except Exception as e:
            self.logger.error(f"Failed because of FS-operation: {e}")
            return False

    def get_references(self):
        """Collects all hrefs on the page, minus the blacklisted domains"""
        try:
            # find_elements_by_xpath was removed in selenium 4
            hrefs = [e.get_attribute("href") for e in self.driver.find_elements(By.XPATH, "//a[@href]")]
        except Exception:
            hrefs = []
        len_old = len(hrefs)
        hrefs = [h for h in hrefs
            if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
            ] # filter a tiny bit at least
        self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})")
        return hrefs


def make_path_unique(path):
    fname, ending = os.path.splitext(path)
    fname += datetime.datetime.now().strftime("%d-%H%M%S")
    return fname + ending
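A hedged usage sketch of the class above, with a SimpleNamespace standing in for the peewee article model; only the attributes download() actually touches are provided, and every value is made up:

from types import SimpleNamespace

stub = SimpleNamespace(
    article_url = "https://example.com/some-report",   # made-up url
    save_path = "/app/containerdata/files/",            # made-up destination
    fname_template = "example.com -- some-report.pdf",
    is_title_bad = False,
    title = "",
    file_name = "",
    set_references = lambda refs: None,  # the real model would persist the hrefs
)

dl = PDFDownloader()
stub = dl.download(stub)   # fills stub.file_name on success, "" on failure
dl.finish()                # shut down the remote geckodriver session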
0
news_fetch/app/utils_worker/download/runner.py
Normal file
51
news_fetch/app/utils_worker/download/youtube.py
Normal file
@@ -0,0 +1,51 @@
from __future__ import unicode_literals
import youtube_dl
import os
import logging

logger = logging.getLogger(__name__)


class MyLogger(object):
    """Dummy logger that silences youtube_dl's debug/warning chatter"""
    def debug(self, msg): pass
    def warning(self, msg): pass
    def error(self, msg):
        logger.error(msg)


class YouTubeDownloader:
    def __init__(self) -> None:
        pass

    def post_download_hook(self, ret_code):
        # print(ret_code)
        if ret_code['status'] == 'finished':
            file_loc = ret_code["filename"]
            fname = os.path.basename(file_loc)
            self.article_object.file_name = fname

    def save_video(self, article_object):
        """Saves video according to url and save path"""
        self.article_object = article_object
        url = article_object.article_url
        logger.info("Saving new video")
        file_path = os.path.join(article_object.save_path, article_object.fname_template)
        ydl_opts = {
            'format': 'best[height<=720]',
            'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
            'logger': MyLogger(),
            'progress_hooks': [self.post_download_hook],
            'updatetime': False
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
                # article file name is updated in self.post_download_hook
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""

        return article_object
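An analogous hedged sketch for the video path; again a SimpleNamespace stands in for the article model and all values are invented:

from types import SimpleNamespace

stub = SimpleNamespace(
    article_url = "https://www.youtube.com/watch?v=XXXXXXXXXXX",  # placeholder video id
    save_path = "/app/containerdata/files/",
    fname_template = "youtube.com -- some-talk",
    file_name = "",
)

stub = YouTubeDownloader().save_video(stub)
print(stub.file_name)  # e.g. "youtube.com -- some-talk.mp4", or "" if the download crashed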
62
news_fetch/app/utils_worker/fetch/runner.py
Normal file
@@ -0,0 +1,62 @@
from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date
import datetime
import logging
logging.getLogger('newspaper').setLevel(logging.ERROR) # quieter logs
logging.getLogger('urllib').setLevel(logging.ERROR) # quieter logs
logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR) # quieter logs
logging.getLogger('htmldate').setLevel(logging.ERROR) # quieter logs
logging.getLogger('charset_normalizer').setLevel(logging.ERROR) # quieter logs
logger = logging.getLogger("fetch")


def get_description(article_object):
    url = article_object.article_url
    website = urlparse(url).netloc
    article_object.source_name = website

    try:
        # htmldate returns an ISO date string (YYYY-MM-DD)
        article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d')
    except: # other file types
        article_object.pub_date = datetime.datetime(year=1900, month=1, day=1)

    try:
        news_article = Article(url)
        news_article.download()
        news_article.parse()
    except:
        news_article = object() # fallback value; every attribute lookup on it raises AttributeError

    try:
        article_object.title = news_article.title
    except AttributeError:
        article_object.title = "Error while running fetch"

    try:
        if article_object.summary:
            article_object.summary = news_article.summary
        elif news_article.text:
            ind = min(500, len(news_article.text))
            article_object.summary = news_article.text[:ind] + "..."
        else:
            article_object.summary = ""
    except AttributeError:
        article_object.summary = ""

    try:
        article_object.language = news_article.meta_lang
    except AttributeError:
        article_object.language = ""

    try:
        article_object.set_authors(news_article.authors)
    except AttributeError:
        pass # list would have been empty anyway

    try:
        article_object.set_keywords(news_article.keywords)
    except AttributeError:
        pass # list would have been empty anyway

    return article_object
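A hedged sketch of calling get_description on a bare stand-in object (the real caller passes the peewee model, which also implements set_authors/set_keywords; everything below is illustrative):

from types import SimpleNamespace

stub = SimpleNamespace(
    article_url = "https://www.example.com/2022/06/some-story",  # made-up url
    source_name = "", title = "", summary = "", language = "",
    pub_date = None,
    set_authors = lambda authors: None,
    set_keywords = lambda keywords: None,
)

stub = get_description(stub)
print(stub.source_name, stub.pub_date.date(), stub.title)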
20
news_fetch/app/utils_worker/upload/runner.py
Normal file
@@ -0,0 +1,20 @@
import time
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import logging
logger = logging.getLogger(__name__)

def upload_to_archive(article_object):
    """Uploads to archive.org and returns the article object with its archive_url set"""
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
    url = article_object.article_url
    try:
        wayback = WaybackMachineSaveAPI(url, user_agent)
        archive_url = wayback.save()
        # logger.info(f"{url} uploaded to archive successfully")
        article_object.archive_url = archive_url

    except Exception as e:
        article_object.archive_url = "Error while uploading: {}".format(e)
        logger.error(f"Error while generating archive url: {e}")

    return article_object
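The underlying waybackpy call, isolated from the article model for clarity (the url is a placeholder; save() blocks until archive.org answers):

from waybackpy import WaybackMachineSaveAPI

user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
wayback = WaybackMachineSaveAPI("https://example.com/some-report", user_agent)
print(wayback.save())  # snapshot url, e.g. "https://web.archive.org/web/<timestamp>/https://example.com/some-report"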
41
news_fetch/app/utils_worker/worker_template.py
Normal file
@@ -0,0 +1,41 @@
from threading import Thread
import time
import logging


class TemplateWorker(Thread):
    """Parent class for all workers of the article-download pipeline. They should run in parallel, hence the Thread subclassing"""
    logger = logging.getLogger(__name__)

    def __init__(self, *args, **kwargs) -> None:
        target = self._queue_processor # will be executed on Worker.start()
        group = kwargs.get("group", None)
        name = kwargs.get("name", None)

        super().__init__(group=group, target=target, name=name)
        self._article_queue = []
        self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")

    def process(self, article_watcher):
        self._article_queue.append(article_watcher)

    def _queue_processor(self):
        """This method is launched by thread.run() and idles when self._article_queue is empty. When an external caller appends to the queue it jumps into action"""
        while True: # PLEASE tell me if I'm missing an obvious better way of doing this!
            if len(self._article_queue) == 0:
                time.sleep(5)
            else:
                article_watcher = self._article_queue.pop(0)
                self.logger.info(f"{self.__class__.__name__} now processing from queue (length: {len(self._article_queue)}) - {article_watcher.article}")
                self._handle_article(article_watcher)

    def _handle_article(self, article_watcher, action=None):
        if action is None:
            self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
        else:
            article = article_watcher.article
            article = action(article) # action updates the article object but does not save the change
            article.save()
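The intended extension pattern, sketched with a made-up worker (import path assumed; the action must return the article so _handle_article() can save it):

from utils_worker.worker_template import TemplateWorker  # assumed import path

class PrintWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        def action(article):
            print(article.article_url)   # stand-in for real work
            return article               # handed back so the parent can call .save()
        super()._handle_article(article_watcher, action)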
66
news_fetch/app/utils_worker/workers.py
Normal file
@@ -0,0 +1,66 @@
from .worker_template import TemplateWorker
from .download.browser import PDFDownloader
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf

import time
import logging
logger = logging.getLogger(__name__)


class DownloadWorker(TemplateWorker):
    def __init__(self) -> None:
        self.dl_runner = PDFDownloader().download
        self.yt_runner = YouTubeDownloader().save_video
        super().__init__()

    def _handle_article(self, article_watcher):
        article = article_watcher.article
        u = article.article_url

        if "youtu.be/" in u or "youtube.com/" in u:
            action = self.yt_runner
        else:
            action = self.dl_runner

        super()._handle_article(article_watcher, action)
        article_watcher.download_completed = True


class FetchWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = get_description # function
        super()._handle_article(article_watcher, action)
        article_watcher.fetch_completed = True


class UploadWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        def action(*args, **kwargs):
            time.sleep(10) # uploads to archive are throttled to 15/minute, but 5s still triggers a blacklisting
            return run_upload(*args, **kwargs)

        super()._handle_article(article_watcher, action)
        article_watcher.upload_completed = True


class CompressWorker(TemplateWorker):
    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        action = shrink_pdf
        super()._handle_article(article_watcher, action)
        article_watcher.compression_completed = True
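How a coordinator might wire these workers up, sketched under the assumption that article watchers arrive from the main module (names and import path are illustrative):

from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker, CompressWorker  # assumed path

workers = [FetchWorker(), DownloadWorker(), UploadWorker(), CompressWorker()]
for w in workers:
    w.start()   # each thread idles on its own queue (see worker_template.py)

# later, for every incoming article_watcher:
#     for w in workers:
#         w.process(article_watcher)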
10
news_fetch/requirements.txt
Normal file
@@ -0,0 +1,10 @@
peewee
selenium
# youtube-dl
git+https://github.com/ytdl-org/youtube-dl.git # non-release version with faster downloads
waybackpy
slack_bolt # relies on slack_sdk
newspaper3k
htmldate
markdown
rich