new component - upload to NAS

This commit is contained in:
Remy Moll
2022-07-23 17:21:00 +02:00
parent 79e3f54955
commit 8e46f30f07
29 changed files with 132 additions and 63 deletions

37
news_fetch/Dockerfile Normal file
View File

@@ -0,0 +1,37 @@
FROM python:latest
ENV TZ Europe/Zurich
# RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list
# allows the installation of the latest firefox-release (debian is not usually a rolling release)
RUN apt-get update && apt-get install -y \
evince \
# for checking
xauth \
#for gui
# wget tar firefox \
# for geckodriver
ghostscript
# for compression
# Download gecko (firefox) driver for selenium
# RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckodriver-v0.31.0-linux64.tar.gz
# RUN tar -x geckodriver -zf geckodriver-v0.31.0-linux64.tar.gz -O > /usr/bin/geckodriver
# RUN chmod +x /usr/bin/geckodriver
# RUN rm geckodriver-v0.31.0-linux64.tar.gz
RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
# id mapped to local user
# home directory needed for pip package installation
RUN mkdir -p /app/auto_news
RUN chown -R autonews:autonews /app
USER autonews
# note: a plain `RUN export` only affects that single build step, so set it via ENV instead
ENV PATH=/home/autonews/.local/bin:$PATH
COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install -r /app/requirements.txt
COPY app /app/auto_news
WORKDIR /app/auto_news

View File

@@ -0,0 +1,59 @@
from dataclasses import dataclass
import os
import shutil
import configparser
import logging
from datetime import datetime
from peewee import SqliteDatabase
from rich.logging import RichHandler
# first things first: logging
logging.basicConfig(
format='%(message)s',
level=logging.INFO,
datefmt='%H:%M:%S', # add %Y-%m-%d if needed
handlers=[RichHandler()]
)
logger = logging.getLogger(__name__)
# load config file containing constants and secrets
parsed = configparser.ConfigParser()
parsed.read("/app/containerdata/config/news_fetch.config.ini")
if os.getenv("DEBUG", "false") == "true":
logger.warning("Found 'DEBUG=true', setting up dummy databases")
db_base_path = parsed["DATABASE"]["db_path_dev"]
parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
parsed["DOWNLOADS"]["local_storage_path"] = parsed["DATABASE"]["db_path_dev"]
else:
logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
db_base_path = parsed["DATABASE"]["db_path_prod"]
logger.info("Backing up databases")
backup_dst = parsed["DATABASE"]["db_backup"]
today = datetime.today().strftime("%Y.%m.%d")
shutil.copyfile(
os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
os.path.join(backup_dst, today + "." + parsed["DATABASE"]["chat_db_name"]),
)
shutil.copyfile(
os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
os.path.join(backup_dst, today + "." + parsed["DATABASE"]["download_db_name"]),
)
from utils_storage import models
# Set up the database
models.set_db(
SqliteDatabase(
os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
),
SqliteDatabase(
os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
)
)
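
(The config file itself is not part of this commit. A minimal sketch of what news_fetch.config.ini could look like, inferred from the keys read in this and the following modules; every value below is a placeholder, not taken from the repository:)

[DATABASE]
db_path_prod = /app/containerdata/
db_path_dev = /app/containerdata/debug/
db_backup = /app/containerdata/backups/
chat_db_name = messages.db
download_db_name = downloads.db

[SLACK]
auth_token = xoxb-...
app_token = xapp-...
archive_id = C0XXXXXXX
debug_id = C0YYYYYYY
bot_id = U0XXXXXXX
responsible_id = U0ZZZZZZZ
api_wait_time = 90

[MAIL]
sender = bot@example.com
recipient = archive@example.com
smtp_server = smtp.example.com
port = 587
uname = bot
password = ...

[DOWNLOADS]
local_storage_path = /app/containerdata/files/
remote_storage_path = /remote/nas/news
default_download_path = /app/containerdata/tmp/
browser_profile_path = /app/containerdata/dependencies/firefox_profile
blacklisted_href_domains = ["facebook.com", "google.com"]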

197
news_fetch/app/runner.py Normal file
View File

@@ -0,0 +1,197 @@
"""Main coordination of other util classes. Handles inbound and outbound calls"""
import configuration
models = configuration.models
from threading import Thread
import logging
import os
logger = logging.getLogger(__name__)
from utils_mail import runner as mail_runner
from utils_slack import runner as slack_runner
from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
class ArticleWatcher:
"""Wrapper for a newly created article object. Notifies the coordinator upon change/completition"""
def __init__(self, article, thread, **kwargs) -> None:
self.article_id = article.id # in case article becomes None at any point, we can still track the article
self.article = article
self.thread = thread
self.completition_notifier = kwargs.get("notifier")
self.fetch = kwargs.get("worker_fetch", None)
self.download = kwargs.get("worker_download", None)
self.compress = kwargs.get("worker_compress", None)
self.upload = kwargs.get("worker_upload", None)
self.completition_notified = False
# self._download_called = self._compression_called = False
self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False
# first step: gather metadata
if self.fetch and self.upload:
self.fetch.process(self) # this will call the update_status method
self.upload.process(self) # independent from the rest
else: # the full kwargs were not provided, only do a manual run
# overwrite update_status() because calls from the workers would result in errors
self.update_status = lambda completed: logger.info(f"Completed action {completed}")
for w in kwargs.get("workers_manual"):
w.process(self)
def update_status(self, completed_action):
"""Checks and notifies internal completition-status.
Article download is complete iff fetch and download were successfull and compression was run
"""
# if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done
if completed_action == "fetch":
self.download.process(self)
elif completed_action == "download":
self.compress.process(self)
elif completed_action == "compress": # last step
self.completition_notifier(self.article, self.thread)
# triggers action in Coordinator
elif completed_action == "upload":
# this case occurs when upload was faster than compression
pass
else:
logger.warning(f"update_status called with unusual configuration: {completed_action}")
# ====== Attributes to be modified by the util workers
@property
def fetch_completed(self):
return self._fetch_completed
@fetch_completed.setter
def fetch_completed(self, value: bool):
self._fetch_completed = value
self.update_status("fetch")
@property
def download_completed(self):
return self._download_completed
@download_completed.setter
def download_completed(self, value: bool):
self._download_completed = value
self.update_status("download")
@property
def compression_completed(self):
return self._compression_completed
@compression_completed.setter
def compression_completed(self, value: bool):
self._compression_completed = value
self.update_status("compress")
@property
def upload_completed(self):
return self._upload_completed
@upload_completed.setter
def upload_completed(self, value: bool):
self._upload_completed = value
self.update_status("upload")
def __str__(self) -> str:
return f"Article with id {self.article_id}"
class Coordinator(Thread):
def __init__(self, **kwargs) -> None:
"""Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
super().__init__(target = self.launch)
def add_workers(self, **kwargs):
self.worker_slack = kwargs.pop("worker_slack", None)
self.worker_mail = kwargs.pop("worker_mail", None)
# the two above won't be needed in the Watcher
self.worker_download = kwargs.get("worker_download", None)
self.worker_fetch = kwargs.get("worker_fetch", None)
self.worker_compress = kwargs.get("worker_compress", None)
self.worker_upload = kwargs.get("worker_upload", None)
self.kwargs = kwargs
def launch(self) -> None:
for w in [self.worker_download, self.worker_fetch, self.worker_upload, self.worker_compress]:
if w is not None:
w.start()
def incoming_request(self, message):
"""This method is passed onto the slack worker. It gets triggered when a new message is received."""
url = message.urls[0] # ignore all the other ones
article, is_new = models.ArticleDownload.get_or_create(article_url=url)
thread = message.thread
thread.article = article
thread.save()
self.kwargs.update({"notifier" : self.article_complete_notifier})
if is_new or (article.file_name == "" and article.verified == 0):
# check for models that were created but then abandoned. This means they have missing information, most importantly no associated file
# this overwrites previously set information, but that should not be too important
ArticleWatcher(
article,
thread,
**self.kwargs
)
# All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
# fetch -> download -> compress -> complete
# the watcher orchestrates the procedure and notifies upon completion
# the watcher will notify once it is sufficiently populated
else: # manually trigger notification immediately
logger.info(f"Found existing article {article}. Now sending")
self.article_complete_notifier(article, thread)
def manual_processing(self, articles, workers):
for w in workers:
w.start()
for article in articles:
notifier = lambda article: print(f"Completed manual actions for {article}")
ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # the article watcher wants a thread to link the article to. TODO: handle threads as a kwarg
def article_complete_notifier(self, article, thread):
if self.worker_slack is None:
logger.warning("Not sending slack notifier")
else:
self.worker_slack.bot_worker.respond_channel_message(thread)
if self.worker_mail is None:
logger.warning("Not sending mail notifier")
else:
self.worker_mail.send(article)
if __name__ == "__main__":
coordinator = Coordinator()
if os.getenv("UPLOAD", "false") == "true":
articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
logger.info(f"Launching upload to archive for {len(articles)} articles.")
coordinator.manual_processing(articles, [UploadWorker()])
elif os.getenv("CHECK", "false") == "true":
from utils_check import runner as check_runner
check_runner.verify_unchecked()
else: # launch with full action
slack_runner = slack_runner.BotRunner(coordinator.incoming_request)
kwargs = {
"worker_download" : DownloadWorker(),
"worker_fetch" : FetchWorker(),
"worker_upload" : UploadWorker(),
"worker_compress" : CompressWorker(),
"worker_slack" : slack_runner,
"worker_mail" : mail_runner,
}
coordinator.add_workers(**kwargs)
coordinator.start()
slack_runner.start()
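
To make the wiring above easier to follow, here is a rough sketch of how a single article flows through the chain (illustration only, simplified from the classes in this file; it is not part of the commit):

# given an `article` row and its slack `thread` (names as used above):
watcher = ArticleWatcher(
    article, thread,
    notifier = coordinator.article_complete_notifier,  # called once compression is done
    worker_fetch = FetchWorker(),        # fetch_completed = True    -> triggers download
    worker_download = DownloadWorker(),  # download_completed = True -> triggers compress
    worker_compress = CompressWorker(),  # compression_completed = True -> notifier fires
    worker_upload = UploadWorker(),      # upload runs independently of the other three
)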

View File

@@ -0,0 +1,207 @@
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule
console = Console()
hline = Rule(style="white")
import os
import subprocess
from slack_sdk import WebClient
import configuration
models = configuration.models
u_options = {
"ENTER" : "Accept PDF as is. It gets marked as verified",
"D" : "set languange to DE and set verified",
"E" : "set languange to EN and set verified",
"O" : "set other language (prompted)",
"R" : "set related files (prompted multiple times)",
"B" : "reject and move to folder BAD",
"L" : "leave file as is, do not send reaction"
}
bot_client = WebClient(
token = configuration.parsed["SLACK"]["auth_token"]
)
def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
"""Prints a neat overview of the current article"""
file_table = Table(
title = file_url,
row_styles = ["white", "bright_black"],
min_width = 100
)
file_table.add_column("Attribute", justify = "right", no_wrap = True)
file_table.add_column("Value set by auto_news")
file_table.add_column("Status", justify = "right")
for attr in file_attributes:
file_table.add_row(attr["name"], attr["value"], attr["status"])
option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()])
option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()])
columns = Columns([option_key, option_action])
console.print(file_table)
console.print("Your options:")
console.print(columns)
def send_reaction_to_slack_thread(article, reaction):
"""Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
thread = article.slack_thread
messages = models.Message.select().where(models.Message.text.contains(article.article_url))
# TODO rewrite this shit
if len(messages) > 5:
print("Found more than 5 messages. Aborting reactions...")
return
for m in messages:
if not m.has_single_url:
print("Found thread but won't send reaction because thread has multiple urls")
else:
ts = m.slack_ts
bot_client.reactions_add(
channel=configuration.parsed["SLACK"]["archive_id"],
name=reaction,
timestamp=ts
)
print("Sent reaction to message")
def prompt_language(query):
not_set = True
while not_set:
uin = input("Set language (nation-code, 2 letters) ")
if len(uin) != 2:
print("Bad code, try again")
else:
not_set = False
query.language = uin
query.save()
def prompt_related(query):
file_list = []
finished = False
while not finished:
uin = input("Additional file for article? Type '1' to cancel ")
if uin == "1":
query.set_related(file_list)
finished = True
else:
file_list.append(uin)
def prompt_new_fname(query):
uin = input("New fname? ")
old_fname = query.file_name
query.file_name = uin
query.verified = 1
if old_fname != "":
os.remove(query.save_path + old_fname)
query.save()
def reject_article(article):
article.verified = -1
article.save()
print("Article marked as bad")
# also update the threads to not be monitored anymore
send_reaction_to_slack_thread(article, "x")
def unreject_article(query):
query.verified = 1
query.save()
# os.rename(badpdf, fname)
print("File set to verified")
def accept_article(article, last_accepted):
article.verified = 1
article.save()
print("Article accepted as GOOD")
# also update the threads to not be monitored anymore
send_reaction_to_slack_thread(article, "white_check_mark")
return "" # linked
def verify_unchecked():
query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
last_linked = None
for article in query:
console.print(hline)
core_info = []
for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]):
entry = {
"status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]",
"value" : e if len(e) != 0 else "not set",
"name" : name
}
core_info.append(entry)
try:
# close any previously opened windows:
# subprocess.call(["kill", "`pgrep evince`"])
os.system("pkill evince")
# then open a new one
subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# suppress evince gtk warnings
except Exception as e:
print(e)
continue
file_overview(
file_url = article.article_url,
file_attributes=core_info,
options = u_options
)
proceed = False
while not proceed:
proceed = False
uin = input("Choice ?").lower()
if uin == "":
last_linked = accept_article(article, last_linked) # last linked accelerates the whole process
proceed = True
elif uin == "d":
article.language = "de"
article.verified = 1
article.save()
proceed = True
elif uin == "e":
article.language = "en"
article.verified = 1
article.save()
proceed = True
elif uin == "o":
prompt_language(article)
elif uin == "r":
prompt_related(article)
elif uin == "b":
reject_article(article)
proceed = True
elif uin == "l":
# do nothing
proceed = True
else:
print("Invalid input")

View File

@@ -0,0 +1,42 @@
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
import os
import logging
import configuration
logger = logging.getLogger(__name__)
config = configuration.parsed["MAIL"]
def send(article_model):
mail = MIMEMultipart()
mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
mail['From'] = config["sender"]
mail['To'] = config["recipient"]
msgs = article_model.mail_info # this is html
msg = [m["reply_text"] for m in msgs]
msg = "\n".join(msg)
content = MIMEText(msg, "html")
mail.attach(content)
files = [m["file_path"] for m in msgs if m["file_path"]]
for path in files:
with open(path, 'rb') as file:
part = MIMEApplication(file.read(), "pdf")
# encoders.encode_base64(part)
part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path))
mail.attach(part)
try:
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
smtp.starttls()
smtp.login(config["uname"], config["password"])
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
smtp.quit()
logger.info("Mail successfully sent.")
except Exception as e:
logger.error("Could not send mail for article {}".format(article_model))
logger.info(e)

View File

@@ -0,0 +1,277 @@
import logging
import configuration
import requests
import os
import time
from threading import Thread
from slack_sdk.errors import SlackApiError
logger = logging.getLogger(__name__)
config = configuration.parsed["SLACK"]
models = configuration.models
slack_client = "dummy"
LATEST_RECORDED_REACTION = 0
def init(client) -> None:
global slack_client
slack_client = client
global LATEST_RECORDED_REACTION
try:
LATEST_RECORDED_REACTION = models.Reaction.select(models.Reaction.id).order_by("id")[-1]
except IndexError: #query is actually empty, we have never fetched any messages until now
LATEST_RECORDED_REACTION = 0
# fetch all the messages we could have possibly missed
logger.info("Querying missed messages, threads and reactions. This can take some time.")
fetch_missed_channel_messages() # not threaded
t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
t.start()
if os.getenv("REDUCEDFETCH", "false") == "true":
logger.warning("Only fetching empty threads for bot messages because 'REDUCEDFETCH=true'")
fetch_missed_thread_messages(reduced=True)
else: # perform both asynchronously
fetch_missed_thread_messages()
def get_unhandled_messages():
"""Gets all messages that have not yet been handled, be it by mistake or by downtime
As the message handler makes no distinction between channel messages and thread messages,
we don't have to worry about them here.
"""
threaded_objects = []
for t in models.Thread.select():
if t.message_count > 1: # if only one message was written, it is the channel message
msg = t.last_message
if msg.is_by_human:
threaded_objects.append(msg)
# else don't, nothing to process
logger.info(f"Set {len(threaded_objects)} thread-messages as not yet handled.")
channel_objects = [t.initiator_message for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
logger.info(f"Set {len(channel_objects)} channel-messages as not yet handled.")
reaction_objects = list(models.Reaction.select().where(models.Reaction.id > LATEST_RECORDED_REACTION))
logger.info(f"Set {len(reaction_objects)} reactions as not yet handled.")
# the ones newer than the last before the fetch
all_messages = channel_objects + threaded_objects
return all_messages, reaction_objects
def fetch_missed_channel_messages():
# latest processed message_ts is:
presaved = models.Message.select().order_by(models.Message.ts)
if not presaved:
last_ts = 0
else:
last_message = presaved[-1]
last_ts = last_message.slack_ts
result = slack_client.conversations_history(
channel=config["archive_id"],
oldest=last_ts
)
new_messages = result.get("messages", [])
# # filter the last one, it is a duplicate! (only if the db is not empty!)
# if last_ts != 0 and len(new_messages) != 0:
# new_messages.pop(-1)
new_fetches = 0
for m in new_messages:
# print(m)
message_dict_to_model(m)
new_fetches += 1
refetch = result.get("has_more", False)
while refetch: # we have not actually fetched them all
try:
result = slack_client.conversations_history(
channel = config["archive_id"],
cursor = result["response_metadata"]["next_cursor"],
oldest = last_ts
) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
refetch = result.get("has_more", False)
new_messages = result.get("messages", [])
for m in new_messages:
message_dict_to_model(m)
new_fetches += 1
except SlackApiError: # Most likely a rate-limit
logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
time.sleep(config["api_wait_time"])
refetch = True
logger.info(f"Fetched {new_fetches} new channel messages.")
def fetch_missed_thread_messages(reduced=False):
"""After having gotten all base-threads, we need to fetch all their replies"""
# I don't know of a better way: we need to fetch this for each and every thread (except if it is marked as permanently solved)
logger.info("Starting fetch of thread messages...")
if reduced:
threads = [t for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
# this only fetches completely empty threads, which might be because the bot-message was not yet saved to the db.
# once we got all the bot-messages the remaining empty threads will be the ones we need to process.
else:
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
logger.info(f"Fetching history for {len(threads)} empty threads")
new_messages = []
for i,t in enumerate(threads):
try:
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
except SlackApiError:
logger.error("Hit rate limit while querying threaded messages, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
messages.pop(0) # the first message is the one posted in the channel. We already processed it!
for m in messages:
# only append *new* messages
res = message_dict_to_model(m)
if res:
new_messages.append(res)
logger.info("Fetched {} new threaded messages.".format(len(new_messages)))
def fetch_missed_channel_reactions():
logger.info("Starting background fetch of channel reactions...")
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
for i,t in enumerate(threads):
try:
query = slack_client.reactions_get(
channel = config["archive_id"],
timestamp = t.slack_ts
)
reactions = query.get("message", []).get("reactions", []) # default = []
except SlackApiError: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
reactions = query.get("message", []).get("reactions", [])
for r in reactions:
reaction_dict_to_model(r, t)
# Helpers for message conversion to db-objects
def reaction_dict_to_model(reaction, thread=None):
if thread is None:
m_ts = reaction["item"]["ts"]
message = models.Message.get(ts = float(m_ts))
thread = message.thread
if "name" in reaction.keys(): # fetched through manual api query
content = reaction["name"]
elif "reaction" in reaction.keys(): # fetched through events
content = reaction["reaction"]
else:
logger.error(f"Weird reaction received: {reaction}")
return None
r, _ = models.Reaction.get_or_create(
type = content,
message = thread.initiator_message
)
logger.info("Saved reaction [{}]".format(content))
return r
def message_dict_to_model(message):
if message["type"] == "message":
thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
uid = message.get("user", "BAD USER")
if uid == "BAD USER":
logger.critical("Message has no user?? {}".format(message))
return None
user, _ = models.User.get_or_create(user_id = uid)
thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
m, new = models.Message.get_or_create(
user = user,
thread = thread,
ts = message["ts"],
channel_id = config["archive_id"],
text = message["text"]
)
logger.info(f"Saved: {m} ({'new' if new else 'old'})")
files = message.get("files", [])
if len(files) >= 1:
f = files[0] #default: []
m.file_type = f["filetype"]
m.perma_link = f["url_private_download"]
m.save()
logger.info(f"Saved {m.file_type}-file for message (id={m.id})")
if new:
return m
else:
return None
else:
logger.warning("What should I do of {}".format(message))
return None
def say_substitute(*args, **kwargs):
logger.info("Now sending message through say-substitute: {}".format(" - ".join(args)))
slack_client.chat_postMessage(
channel=config["archive_id"],
text=" - ".join(args),
**kwargs
)
def save_as_related_file(url, article_object):
r = requests.get(url, headers={"Authorization": "Bearer {}".format(slack_client.token)})
saveto = article_object.save_path
ftype = url[url.rfind(".") + 1:]
fname = "{} - related no {}.{}".format(
article_object.file_name.replace(".pdf",""),
len(article_object.related) + 1,
ftype
)
with open(os.path.join(saveto, fname), "wb") as f:
f.write(r.content)
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return fname
def react_file_path_message(fname, article_object):
saveto = article_object.save_path
file_path = os.path.join(saveto, fname)
if os.path.exists(file_path):
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return True
else:
return False
def is_message_in_archiving(message) -> bool:
if isinstance(message, dict):
return message["channel"] == config["archive_id"]
else:
return message.channel_id == config["archive_id"]
def is_reaction_in_archiving(event) -> bool:
if isinstance(event, dict):
return event["item"]["channel"] == config["archive_id"]
else:
return event.message.channel_id == config["archive_id"]

View File

@@ -0,0 +1,184 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler
import logging
import configuration
from . import message_helpers
config = configuration.parsed["SLACK"]
models = configuration.models
class BotApp(App):
logger = logging.getLogger(__name__)
def __init__(self, callback, *args, **kwargs):
super().__init__(*args, **kwargs)
self.callback = callback
def start(self):
message_helpers.init(self.client)
missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
[self.handle_incoming_message(m) for m in missed_messages]
[self.handle_incoming_reaction(r) for r in missed_reactions]
# self.react_missed_reactions(missed_reactions)
# self.react_missed_messages(missed_messages)
self.startup_status()
def handle_incoming_reaction(self, reaction):
if isinstance(reaction, dict): #else: the reaction is already being passed as a model
# CAUTION: filter out 'changed reactions'; those are nasty (usually when adding an url)
reaction = message_helpers.reaction_dict_to_model(reaction)
thread = reaction.message.thread
article_object = thread.article
if article_object is not None:
reaction = reaction.type
status = 1 if reaction == "white_check_mark" else -1
# self.logger.info(f"Applying reaction {reaction} to its root message.")
article_object.verified = status
article_object.save()
def handle_incoming_message(self, message):
"""Reacts to all messages inside channel archiving. Must then
distinguish between threaded replies and new requests
and react accordingly"""
if isinstance(message, dict): #else: the message is already being passed as a model
# CAUTION: filter out 'changed messages'; those are nasty (usually when adding an url)
if message.get("subtype", "not bad") == "message_changed":
return False
message = message_helpers.message_dict_to_model(message)
# First check: belongs to thread?
is_threaded = message.thread.message_count > 1 and message != message.thread.initiator_message
if is_threaded:
self.incoming_thread_message(message)
else:
self.incoming_channel_message(message)
def incoming_thread_message(self, message):
if message.user.user_id == config["bot_id"]:
return True # ignore the files uploaded by the bot. We handled them already!
thread = message.thread
if thread.is_fully_processed:
return True
self.logger.info("Receiving thread-message")
self.respond_thread_message(message)
def incoming_channel_message(self, message):
self.logger.info(f"Handling message {message} ({len(message.urls)} urls)")
if not message.urls: # no urls in a root-message => IGNORE
message.is_processed_override = True
message.save()
return
# ensure thread is still empty, this is a scenario encountered only in testing, but let's just filter it
if message.thread.message_count > 1:
self.logger.info("Discarded message because it is actually processed.")
return
if len(message.urls) > 1:
message_helpers.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts)
self.callback(message)
# for url in message.urls:
# self.callback(url, message)
# stop here!
def respond_thread_message(self, message, say=message_helpers.say_substitute):
thread = message.thread
article = thread.article
if message.perma_link: # file upload means new data
fname = message_helpers.save_as_related_file(message.perma_link, article)
say("File was saved as 'related file' under `{}`.".format(fname),
thread_ts=thread.slack_ts
)
else: # either a pointer to a new file (too large to upload), or trash
success = message_helpers.react_file_path_message(message.text, article)
if success:
say("File was saved as 'related file'", thread_ts=thread.slack_ts)
else:
self.logger.error("User replied to thread {} but the response did not contain a file/path".format(thread))
say("Cannot process response without associated file.",
thread_ts=thread.slack_ts
)
def respond_channel_message(self, thread, say=message_helpers.say_substitute):
article = thread.article
answers = article.slack_info
for a in answers:
if a["file_path"]:
try: # either, a["file_path"] does not exist, or the upload resulted in an error
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
file = a["file_path"],
thread_ts = thread.slack_ts
)
status = True
except:
say(
"File {} could not be uploaded.".format(a),
thread_ts=thread.slack_ts
)
status = False
else: # anticipated that there is no file!
say(
f"<@{config['responsible_id']}> \n {a['reply_text']}",
thread_ts=thread.slack_ts
)
status = True
def startup_status(self):
threads = [t for t in models.Thread.select()]
all_threads = len(threads)
fully_processed = len([t for t in threads if t.is_fully_processed])
fully_unprocessed = len([t for t in threads if t.message_count == 1])
articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
class BotRunner():
"""Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
def __init__(self, callback, *args, **kwargs) -> None:
self.bot_worker = BotApp(callback, token=config["auth_token"])
@self.bot_worker.event(event="message", matchers=[message_helpers.is_message_in_archiving])
def handle_incoming_message(message, say):
return self.bot_worker.handle_incoming_message(message)
@self.bot_worker.event(event="reaction_added", matchers=[message_helpers.is_reaction_in_archiving])
def handle_incoming_reaction(event, say):
return self.bot_worker.handle_incoming_reaction(event)
# target = self.launch
# super().__init__(target=target)
def start(self):
self.bot_worker.start()
SocketModeHandler(self.bot_worker, config["app_token"]).start()
# def respond_to_message(self, message):
# self.bot_worker.handle_incoming_message(message)

View File

@@ -0,0 +1,67 @@
from playhouse.migrate import *
"""
This migration assumes that downloads.db kept the exact same structure as before.
messages.db should drop the table articlemodelreference in favor of a new field article in the thread-table
Since each thread is constrained to exactly one article this makes the most sense.
This migration assumes that messages.db gets a new field in the table thread:
id | thread_ts | article_id
We now need to migrate from the table articlemodelreference and then delete it.
"""
db = SqliteDatabase("/code/.dev/messages.db")
migrator = SqliteMigrator(db)
article_field = IntegerField(null=True)
migrate(
migrator.add_column('thread', 'article_id', article_field),
# migrator.drop_column('some_table', 'old_column'),
)
# these are the old models, adapted to the migration
class BaseModel(Model):
class Meta:
database = db
class User(BaseModel):
user_id = CharField(default='', unique=True)
class Thread(BaseModel):
"""The threads that concern us are only created if the messages that contain urls"""
thread_ts = FloatField(default = 0)
article_id = IntegerField(null=True)
class Message(BaseModel):
ts = FloatField(unique=True) #for sorting
channel_id = CharField(default='')
user = ForeignKeyField(User, backref="messages")
text = TextField(default='')
thread = ForeignKeyField(Thread, backref="messages", default=None)
file_type = CharField(default='')
perma_link = CharField(default='')
is_processed_override = BooleanField(default=False)
class ArticleModelReference(BaseModel):
message = ForeignKeyField(Message, backref='article_model_references')
article_model_id = IntegerField(default = 0)
for ref in ArticleModelReference.select():
ref.message.thread.article_id = ref.article_model_id
ref.message.thread.save()
db.drop_tables([ArticleModelReference])
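# Schematically (editor's illustration, not part of the commit):
#   before: thread(id, thread_ts)             + articlemodelreference(id, message_id, article_model_id)
#   after:  thread(id, thread_ts, article_id)   (articlemodelreference dropped)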

View File

@@ -0,0 +1,322 @@
import logging
logger = logging.getLogger(__name__)
from peewee import *
import os
import markdown
import re
import configuration
import datetime
config = configuration.parsed["DOWNLOADS"]
slack_config = configuration.parsed["SLACK"]
## Helpers
chat_db = DatabaseProxy()
download_db = DatabaseProxy()
# set the nature of the db at runtime
class DownloadBaseModel(Model):
class Meta:
database = download_db
class ChatBaseModel(Model):
class Meta:
database = chat_db
## == Article related models == ##
class ArticleDownload(DownloadBaseModel):
title = CharField(default='')
pub_date = DateField(default = '')
download_date = DateField(default = datetime.date.today)
source_name = CharField(default = '')
article_url = TextField(default = '', unique=True)
archive_url = TextField(default = '')
file_name = TextField(default = '')
language = CharField(default = '')
summary = TextField(default = '')
comment = TextField(default = '')
verified = IntegerField(default = False)
# authors
# keywords
# ... are added through foreignkeys
def __str__(self) -> str:
return f"ART [{self.title} -- {self.source_name}]"
## Useful Properties
@property
def save_path(self):
return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
def fname_nas(self, file_name=""):
if self.download_date:
if file_name:
return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), file_name)
else: # return the self. name
return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), self.file_name)
else:
return None
@property
def fname_template(self):
if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
fname = "{} -- {}".format(self.source_name, self.title)
else:
fname = "{} -- {}.pdf".format(self.source_name, self.title)
return clear_path_name(fname)
@property
def is_title_bad(self): # add incrementally
return "PUR-Abo" in self.title \
or "Redirecting" in self.title \
or "Error while running fetch" in self.title
@property
def slack_info(self):
status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
content = "\n>" + "\n>".join(self.summary.split("\n"))
file_status, msg = self.file_status()
if not file_status:
return [msg]
# everything alright: generate real content
# first the base file
if self.file_name[-4:] == ".pdf":
answer = [{ # main reply with the base pdf
"reply_text" : f"*{self.title}*\n{status}\n{content}",
"file_path" : self.save_path + self.file_name
}]
else: # don't upload if the file is too big!
location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas())
answer = [{ # main reply with the base pdf
"reply_text" : "*{}*\n{}\n{}\n{}".format(self.title, status, content, location),
"file_path" : None
}]
# then the related files
rel_text = ""
for r in self.related:
fname = r.related_file_name
lentry = "\n• `{}` ".format(self.fname_nas(fname))
if fname[-4:] == ".pdf": # this is a manageable file, directly upload
f_ret = self.save_path + fname
answer.append({"reply_text":"", "file_path" : f_ret})
else: # not pdf <=> too large. Don't upload but mention its existence
lentry += "(not uploaded to slack, but the file will be on the NAS)"
rel_text += lentry
if rel_text:
rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
return answer
@property
def mail_info(self):
base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path":None}] + self.slack_info
return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
## Helpers
def set_keywords(self, keywords):
for k in keywords:
ArticleKeyword.create(
article = self,
keyword = k
)
def set_authors(self, authors):
for a in authors:
ArticleAuthor.create(
article = self,
author = a
)
def set_references(self, references):
for r in references:
ArticleReference.create(
article = self,
reference_url = r
)
def set_related(self, related):
for r in related:
ArticleRelated.create(
article = self,
related_file_name = r
)
def file_status(self):
if not self.file_name:
logger.error("Article {} has no filename!".format(self))
return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
file_path_abs = self.save_path + self.file_name
if not os.path.exists(file_path_abs):
logger.error("Article {} has a filename, but the file does not exist at that location!".format(self))
return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
return True, {}
class ArticleKeyword(DownloadBaseModel):
# instance gets created for every one keyword -> flexible in size
article = ForeignKeyField(ArticleDownload, backref='keywords')
keyword = CharField()
class ArticleAuthor(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='authors')
author = CharField()
class ArticleReference(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='references')
reference_url = TextField(default = '')
class ArticleRelated(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='related')
related_file_name = TextField(default = '')
## == Slack-thread related models == ##
class User(ChatBaseModel):
user_id = CharField(default='', unique=True)
# messages
class Thread(ChatBaseModel):
"""The threads that concern us are only created if the base massage contains a url"""
thread_ts = FloatField(default = 0)
article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
# provides, ts, user, models
# messages
@property
def slack_ts(self):
str_ts = str(self.thread_ts)
cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, pad with zeros
return "{}{}".format(str_ts, cut_zeros*"0")
@property
def initiator_message(self):
try:
return self.messages[0] # TODO check if this needs sorting
except IndexError:
logger.warning(f"Thread {self} is empty. How can that be?")
return None
@property
def message_count(self):
# logger.warning("message_count was called")
return self.messages.count()
@property
def last_message(self):
messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
return messages[-1]
@property
def is_fully_processed(self) -> bool:
init_message = self.initiator_message
if init_message is None:
return False
if init_message.is_processed_override:
return True
# this override is set for instance, when no url was sent at all. Then set this thread to be ignored
reactions = init_message.reaction
if not reactions:
return False
else:
r = reactions[0].type # can and should only have one reaction
return r == "white_check_mark" \
or r == "x"
class Message(ChatBaseModel):
ts = FloatField(unique=True) #for sorting
channel_id = CharField(default='')
user = ForeignKeyField(User, backref="messages")
text = TextField(default='')
thread = ForeignKeyField(Thread, backref="messages", default=None)
file_type = CharField(default='')
perma_link = CharField(default='')
is_processed_override = BooleanField(default=False)
# reaction
def __str__(self) -> str:
return "MSG [{}]".format(self.text[:min(len(self.text), 30)].replace('\n','/') + '...')
@property
def slack_ts(self):
str_ts = str(self.ts)
cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, pad with zeros
return "{}{}".format(str_ts, cut_zeros * "0")
@property
def urls(self):
pattern = r"<(.*?)>"
matches = re.findall(pattern, self.text)
matches = [m for m in matches if "." in m]
new_matches = []
for m in matches:
if "." in m: # must contain a tld, right?
# further complication: slack automatically abbreviates urls in the format:
# <url|link preview>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
if "|" in m:
keep = m.split("|")[0]
else:
keep = m
new_matches.append(keep)
return new_matches
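# Worked example (illustration, not part of the commit): for the text
#   "interesting read: <https://example.com/article|example.com/article> <@U123ABC>"
# the user mention contains no "." and is dropped, the link is split at "|",
# and urls returns ["https://example.com/article"]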
@property
def is_by_human(self):
return self.user.user_id != slack_config["bot_id"]
@property
def has_single_url(self):
return len(self.urls) == 1
class Reaction(ChatBaseModel):
type = CharField(default = "")
message = ForeignKeyField(Message, backref="reaction")
def create_tables():
with download_db:
download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated])
with chat_db:
chat_db.create_tables([User, Message, Thread, Reaction])
def set_db(chat_db_object, download_db_object):
chat_db.initialize(chat_db_object)
download_db.initialize(download_db_object)
create_tables()
def clear_path_name(path):
keepcharacters = (' ','.','_', '-')
converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()
return converted
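# Worked example (illustration, not part of the commit):
#   clear_path_name("NZZ: Analyse / Bericht.pdf") -> "NZZ_ Analyse _ Bericht.pdf"
# everything that is not alphanumeric or one of ' ', '.', '_', '-' becomes '_'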

View File

View File

@@ -0,0 +1,47 @@
import os
import subprocess
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
import configuration
config = configuration.parsed["DOWNLOADS"]
shrink_sizes = []
def shrink_pdf(article):
article_loc = Path(article.save_path) / article.file_name
initial_size = article_loc.stat().st_size
compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
if article_loc.suffix != "pdf":
return article # it probably was a youtube video
c = subprocess.run(
[
"gs",
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/screen",
"-dNOPAUSE",
"-dBATCH",
f"-sOutputFile={compressed_tmp}",
f"{article_loc}"
],
stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if c.returncode == 0:
try:
os.replace(compressed_tmp, article_loc)
except OSError as e:
logger.error(f"Compression ran but I could not copy back the file {e}")
final_size = article_loc.stat().st_size
shrink_sizes.append(initial_size - final_size)
logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
else:
logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
return article

View File

@@ -0,0 +1,172 @@
import time
import datetime
import logging
import os
import base64
import requests
from selenium import webdriver
import configuration
import json
config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__)
# status-variable for restarting:
running = False
def start(self):
self.finish() # clear up
options = webdriver.FirefoxOptions()
options.profile = config["browser_profile_path"]
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
if os.getenv("HEADLESS", "false") == "true":
options.add_argument('--headless')
else:
self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
options.set_preference('print.save_as_pdf.links.enabled', True)
# Just saving directly when the filetype is already pdf does not work!
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
options.set_preference("browser.download.folderList", 2)
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# options.set_preference("pdfjs.disabled", True)
options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting gecko driver")
# self.driver = webdriver.Firefox(
# options = options,
# service = webdriver.firefox.service.Service(
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
# ))
self.driver = webdriver.Remote(
command_executor = 'http://geckodriver:4444',
options = options,
# can't set log path...
)
residues = os.listdir(config["default_download_path"])
for res in residues:
os.remove(os.path.join(config["default_download_path"], res))
self.running = True
def autostart(self):
if not self.running:
self.start() # relaunch the dl util
def finish(self):
if self.running:
self.logger.info("Exiting gecko driver")
try:
self.driver.quit()
time.sleep(10)
except:
self.logger.critical("Connection to the driver broke off")
self.running = False
else:
self.logger.info("Gecko driver not yet running")
def download(self, article_object):
sleep_time = 2
self.autostart()
url = article_object.article_url
try:
self.driver.get(url)
except Exception as e:
self.logger.critical("Selenium .get(url) failed with error {}".format(e))
self.finish()
return article_object # without changes
time.sleep(sleep_time)
# leave the page time to do any funky business
# in the mean time, get a page title if required
if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf", "")
# will be propagated to the saved file (dst) as well
fname = article_object.fname_template
dst = os.path.join(article_object.save_path, fname)
if os.path.exists(dst):
fname = make_path_unique(fname)
dst = os.path.join(article_object.save_path, fname)
if url[-4:] == ".pdf":
# according to the browser preferences, calling the url will open pdfjs.
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
success = self.get_existing_pdf(url, dst)
else:
success = self.get_new_pdf(dst)
if success:
article_object.file_name = fname
article_object.set_references(self.get_references())
else:
article_object.file_name = ""
return article_object # this change is saved later by the external caller
def get_existing_pdf(self, url, dst):
try:
r = requests.get(url)
bytes = r.content
except:
return False
return self.get_new_pdf(dst, other_bytes=bytes)
def get_new_pdf(self, dst, other_bytes=None):
os.makedirs(os.path.dirname(dst), exist_ok=True)
if other_bytes is None:
try:
result = self.driver.print_page()
bytes = base64.b64decode(result, validate=True)
except:
self.logger.error("Failed, probably because the driver went extinct.")
return False
else:
bytes = other_bytes
try:
with open(dst, "wb+") as f:
f.write(bytes)
return True
except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}")
return False
def get_references(self):
try:
hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
except:
hrefs = []
len_old = len(hrefs)
hrefs = [h for h in hrefs \
if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
] # filter a tiny bit at least
self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})")
return hrefs
def make_path_unique(path):
fname, ending = os.path.splitext(path)
fname += datetime.datetime.now().strftime("%d-%H%M%S")
return fname + ending
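# Worked example (illustration, not part of the commit): if "NZZ -- Bericht.pdf" already
# exists at the destination, make_path_unique("NZZ -- Bericht.pdf") appends a day-time
# stamp before the extension, e.g. "NZZ -- Bericht23-172100.pdf"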

View File

@@ -0,0 +1,51 @@
from __future__ import unicode_literals
import youtube_dl
import os
import logging
logger = logging.getLogger(__name__)
class MyLogger(object):
def debug(self, msg): pass
def warning(self, msg): pass
def error(self, msg):
logger.error(msg)
class YouTubeDownloader:
def __init__(self) -> None:
pass
def post_download_hook(self, ret_code):
# print(ret_code)
if ret_code['status'] == 'finished':
file_loc = ret_code["filename"]
fname = os.path.basename(file_loc)
self.article_object.file_name = fname
def save_video(self, article_object):
"""Saves video accoring to url and save path"""
self.article_object = article_object
url = article_object.article_url
logger.info("Saving new video")
file_path = os.path.join(article_object.save_path, article_object.fname_template)
ydl_opts = {
'format': 'best[height<=720]',
'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
'logger': MyLogger(),
'progress_hooks': [self.post_download_hook],
'updatetime': False
}
try:
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
# article file name is updated in self.post_download_hook
except Exception as e:
logger.error(f"Youtube download crashed: {e}")
article_object.file_name = ""
return article_object

View File

@@ -0,0 +1,62 @@
from newspaper import Article
from urllib.parse import urlparse
from htmldate import find_date
import datetime
import logging
logging.getLogger('newspaper').setLevel(logging.ERROR) # quieter logs
logging.getLogger('urllib').setLevel(logging.ERROR) # quieter logs
logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR) # quieter logs
logging.getLogger('htmldate').setLevel(logging.ERROR) #quieter logs
logging.getLogger('charset_normalizer').setLevel(logging.ERROR) #quieter logs
logger = logging.getLogger("fetch")
def get_description(article_object):
url = article_object.article_url
website = urlparse(url).netloc
article_object.source_name = website
try:
article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d') # find_date returns YYYY-MM-DD
except: # other file types
article_object.pub_date = datetime.datetime(year=1900, month=1, day=1)
try:
news_article = Article(url)
news_article.download()
news_article.parse()
except:
news_article = object() # fallback value
try:
article_object.title = news_article.title
except AttributeError:
article_object.title = "Error while running fetch"
try:
if article_object.summary:
article_object.summary = news_article.summary
elif news_article.text:
ind = min(500, len(news_article.text))
article_object.summary = news_article.text[:ind] + "..."
else:
article_object.summary = ""
except AttributeError:
article_object.summary = ""
try:
article_object.language = news_article.meta_lang
except AttributeError:
article_object.language = ""
try:
article_object.set_authors(news_article.authors)
except AttributeError:
pass # list would have been empty anyway
try:
article_object.set_keywords(news_article.keywords)
except AttributeError:
pass # list would have been empty anyway
return article_object

View File

@@ -0,0 +1,20 @@
import time
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import logging
logger = logging.getLogger(__name__)
def upload_to_archive(article_object):
"""uploads to archive.org and returns the archived url"""
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
url = article_object.article_url
try:
wayback = WaybackMachineSaveAPI(url, user_agent)
archive_url = wayback.save()
# logger.info(f"{url} uploaded to archive successfully")
article_object.archive_url = archive_url
except Exception as e:
article_object.archive_url = "Error while uploading: {}".format(e)
logger.error(f"Error while generating archive url: {e}")
return article_object

View File

@@ -0,0 +1,41 @@
from threading import Thread
import time
import logging
class TemplateWorker(Thread):
"""Parent class for any subsequent worker of the article-download pipeline. They should all run in parallel, thus the Thread subclassing"""
logger = logging.getLogger(__name__)
def __init__(self, *args, **kwargs) -> None:
target = self._queue_processor # will be executed on Worker.start()
group = kwargs.get("group", None)
name = kwargs.get("name", None)
super().__init__(group=group, target=target, name=name)
self._article_queue = []
self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")
def process(self, article_watcher):
self._article_queue.append(article_watcher)
def _queue_processor(self):
"""This method is launched by thread.run() and idles when self._article_queue is empty. When an external caller appends to the queue it jumps into action"""
while True: # PLEASE tell me if I'm missing an obvious better way of doing this!
if len(self._article_queue) == 0:
time.sleep(5)
else:
article_watcher = self._article_queue.pop(0)
self.logger.info(f"{self.__class__.__name__} now processing from queue (length: {len(self._article_queue)}) - {article_watcher.article}")
self._handle_article(article_watcher)
def _handle_article(self, article_watcher, action=None):
if action is None:
self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
else:
article = article_watcher.article
article = action(article) # action updates the article object but does not save the change
article.save()

View File

@@ -0,0 +1,66 @@
from .worker_template import TemplateWorker
from .download.browser import PDFDownloader
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf
import time
import logging
logger = logging.getLogger(__name__)
class DownloadWorker(TemplateWorker):
def __init__(self) -> None:
self.dl_runner = PDFDownloader().download
self.yt_runner = YouTubeDownloader().save_video
super().__init__()
def _handle_article(self, article_watcher):
article = article_watcher.article
u = article.article_url
if "youtu.be/" in u or "youtube.com/" in u:
action = self.yt_runner
else:
action = self.dl_runner
super()._handle_article(article_watcher, action)
article_watcher.download_completed = True
class FetchWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = get_description # function
super()._handle_article(article_watcher, action)
article_watcher.fetch_completed = True
class UploadWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
def action(*args, **kwargs):
time.sleep(10) # uploads to archive are throttled to 15/minute, but 5s still triggers a blacklisting
return run_upload(*args, **kwargs)
super()._handle_article(article_watcher, action)
article_watcher.upload_completed = True
class CompressWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = shrink_pdf
super()._handle_article(article_watcher, action)
article_watcher.compression_completed = True

View File

@@ -0,0 +1,10 @@
peewee
selenium
# youtube-dl
git+https://github.com/ytdl-org/youtube-dl.git #non-release version with faster downloads
waybackpy
slack_bolt # relies on slack_sdk
newspaper3k
htmldate
markdown
rich