Better launch, cleaner shutdown (wip)

Remy Moll 2022-08-11 13:42:45 +02:00
parent bc5eaba519
commit 9ca4985853
14 changed files with 147 additions and 37 deletions

View File

@ -36,25 +36,32 @@ Upload mode is much simpler, it goes over the existing database and operates on
* For normal `production` mode run:
`docker compose --env-file env/production up`
`docker compose --env-file env/production run news_fetch`
* For `debug` mode, you will likely want interactivity, so you need to run:
* For `debug` mode run:
`docker compose --env-file env/debug up -d && docker compose --env-file env/debug exec news_fetch bash && docker compose --env-file env/debug down`
`docker compose --env-file env/debug run news_fetch`
which should automatically shut down the containers once you are done (`ctrl+d` to exit the container shell). If not, re-run `docker compose --env-file env/debug down` manually.
which drops you into an interactive shell (`ctrl+d` to exit).
> Note:
> The live-mounted code is now under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. Running `python runner.py` will then run the newly written code, but with the production database and storage.
* For `check` mode, some env variables are also changed and you still need interactivity; the geckodriver service is not required, however. The simplest way is to run
`docker compose --env-file env/check run news_fetch`
`docker compose --env-file env/check run --no-deps --rm news_fetch`
* Finally, `upload` mode requires no interactivity and no additional services. Simply run:
`docker compose --env-file env/upload run news_fetch`
`docker compose --env-file env/upload run --no-deps --rm news_fetch`
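The modes above differ mainly in the `DEBUG`, `CHECK` and `UPLOAD` flags set in the respective `env/` files and passed into the container. A minimal sketch of how a runner could branch on them (illustrative only, not the repository's actual entry point):

```python
import os

def flag(name: str) -> bool:
    """Interpret an environment variable such as DEBUG/CHECK/UPLOAD as a boolean."""
    return os.getenv(name, "false").lower() == "true"

if __name__ == "__main__":
    # Illustrative dispatch; the real runner.py decides this differently.
    if flag("UPLOAD"):
        print("upload mode: only push not-yet-archived urls")
    elif flag("CHECK"):
        print("check mode: review previously downloaded articles (needs X11)")
    elif flag("DEBUG"):
        print("debug mode: live code mounted under /code")
    else:
        print("production mode: full fetch/download/upload pipeline")
```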
### Stopping
Run
`docker compose --env-file env/production down`
which terminates all containers associated with the `docker-compose.yaml`.
## Building
@ -70,3 +77,9 @@ In docker compose, run
## Roadmap:
[_] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network
## Manual Sync to NAS:
I use `rsync`. Mounting the NAS locally, I navigate to the location of the local folder (notice the trailing slash). Then run
`rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`
where `<remote>` is the location where the NAS is mounted. (Options: `R` - relative paths, `a` - archive mode (recursion and preservation of most attributes), `z` - compress during transfer, `q` - quiet. The `--no-*` flags skip most of the metadata and `--log-file` keeps a log of the transfers.)

View File

@ -1,4 +1,5 @@
# docker compose --env-file env/debug up
# Usage:
# docker compose --env-file env/<mode> up
version: "3.9"
@ -15,7 +16,7 @@ services:
environment:
- DISPLAY=$DISPLAY
- TERM=xterm-256color # colored logs
- COLUMNS=160 # for wider logs
- COLUMNS=150 # for wider logs
- DEBUG=${DEBUG}
- CHECK=${CHECK}
@ -28,7 +29,7 @@ services:
geckodriver:
image: selenium/standalone-firefox:102.0.1
image: selenium/standalone-firefox:103.0
volumes:
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
- ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority

env/check vendored
View File

@ -1,7 +1,6 @@
# Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display access via xauth
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
XAUTHORITY=$XAUTHORITY
XSOCK=/tmp/.X11-unix

env/debug vendored
View File

@ -1,7 +1,6 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
CODE=./
XAUTHORITY=$XAUTHORITY

env/production vendored
View File

@ -1,8 +1,8 @@
# Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
CONTAINERS_TO_RUN=nas_sync, geckodriver
DEBUG=false
CHECK=false
UPLOAD=false

env/upload vendored
View File

@ -1,9 +1,8 @@
# Does not run any other workers and only uploads to the archive the urls that weren't previously uploaded
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
NEWS_FETCH_DEPENDS_ON="[]"
DEBUG=false
CHECK=false
UPLOAD=true

View File

@ -4,6 +4,7 @@ models = configuration.models
from threading import Thread
import logging
import os
import sys
logger = logging.getLogger(__name__)
from utils_mail import runner as mail_runner
@ -102,7 +103,7 @@ class ArticleWatcher:
class Coordinator(Thread):
def __init__(self, **kwargs) -> None:
"""Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
super().__init__(target = self.launch)
super().__init__(target = self.launch, daemon=True)
def add_workers(self, **kwargs):
self.worker_slack = kwargs.pop("worker_slack", None)
@ -192,6 +193,13 @@ if __name__ == "__main__":
"worker_slack" : slack_runner,
"worker_mail" : mail_runner,
}
coordinator.add_workers(**kwargs)
coordinator.start()
slack_runner.start()
try:
coordinator.add_workers(**kwargs)
coordinator.start()
slack_runner.start()
except KeyboardInterrupt:
logger.info("Keyboard interrupt. Stopping Slack and Coordinator")
slack_runner.stop()
print("BYE!")
# coordinator was set as a daemon thread, so it will be stopped automatically
sys.exit(0)
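In other words, the coordinator is now a daemon thread and the main thread owns the `KeyboardInterrupt`. Reduced to its bare bones (a sketch without the project's actual workers), the pattern is:

```python
import sys
import threading
import time

class Worker(threading.Thread):
    """Stand-in for the Coordinator: as a daemon thread it dies with the main thread."""
    def __init__(self):
        super().__init__(target=self.loop, daemon=True)

    def loop(self):
        while True:
            time.sleep(1)   # placeholder for the real article-processing work

if __name__ == "__main__":
    Worker().start()
    try:
        while True:         # main thread stays responsive to ctrl+c
            time.sleep(1)
    except KeyboardInterrupt:
        print("BYE!")       # daemon threads are torn down automatically
        sys.exit(0)
```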

View File

@ -55,7 +55,7 @@ def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
def send_reaction_to_slack_thread(article, reaction):
"""Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
"""Sends the verification status as a reaction to the associated slack thread."""
thread = article.slack_thread
messages = models.Message.select().where(models.Message.text.contains(article.article_url))
# TODO rewrite this shit
@ -63,9 +63,10 @@ def send_reaction_to_slack_thread(article, reaction):
print("Found more than 5 messages. Aborting reactions...")
return
for m in messages:
if not m.has_single_url:
if m.is_processed_override:
print("Message already processed. Aborting reactions...")
elif not m.has_single_url:
print("Found thread but won't send reaction because thread has multiple urls")
pass
else:
ts = m.slack_ts
bot_client.reactions_add(

View File

@ -37,6 +37,6 @@ def send(article_model):
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
smtp.quit()
logger.info("Mail successfully sent.")
except Exception as e:
except smtplib.SMTPException as e:
logger.error("Could not send mail for article {}".format(article_model))
logger.info(e)
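Catching `smtplib.SMTPException` instead of a bare `Exception` keeps programming errors visible while still logging delivery failures. A self-contained sketch of that pattern (the helper, host, sender and recipient are placeholders, not the project's config):

```python
import smtplib
from email.message import EmailMessage

def send_mail(sender: str, recipient: str, subject: str, body: str) -> bool:
    """Hypothetical helper (not part of the repo); only SMTP failures are swallowed."""
    msg = EmailMessage()
    msg["From"], msg["To"], msg["Subject"] = sender, recipient, subject
    msg.set_content(body)
    try:
        with smtplib.SMTP("localhost") as smtp:  # SMTP host is an assumption
            smtp.send_message(msg)
        return True
    except smtplib.SMTPException as e:
        print(f"Could not send mail: {e}")
        return False
```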

View File

@ -14,6 +14,7 @@ LATEST_RECORDED_REACTION = 0
def init(client) -> None:
"""Starts fetching past messages and returns the freshly launched thread"""
global slack_client
slack_client = client
@ -26,7 +27,7 @@ def init(client) -> None:
# fetch all the messages we could have possibly missed
logger.info("Querying missed messages, threads and reactions. This can take some time.")
fetch_missed_channel_messages() # not threaded
t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
t = Thread(target = fetch_missed_channel_reactions, daemon=True) # threaded, runs in background (usually takes a long time)
t.start()
if os.getenv("REDUCEDFETCH", "false") == "true":
@ -153,16 +154,23 @@ def fetch_missed_channel_reactions():
logger.info("Starting background fetch of channel reactions...")
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
for i,t in enumerate(threads):
reactions = []
try:
query = slack_client.reactions_get(
channel = config["archive_id"],
timestamp = t.slack_ts
)
reactions = query.get("message", []).get("reactions", []) # default = []
except SlackApiError: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
reactions = query.get("message", {}).get("reactions", []) # default to {} so the chained .get cannot fail
except SlackApiError as e:
if e.response.get("error", "") == "message_not_found":
m = t.initiator_message
logger.warning(f"Message (id={m.id}) not found. Skipping and saving...")
# this usually means the message is past the 1000 message limit imposed by slack. Mark it as processed in the db
m.is_processed_override = True
m.save()
else: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
for r in reactions:
reaction_dict_to_model(r, t)
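The branch added above separates a permanently missing message from a transient rate limit. As a standalone sketch on top of `slack_sdk` (`get_reactions_with_retry`, the wait time and the retry budget are made up for illustration):

```python
import time
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

def get_reactions_with_retry(client: WebClient, channel: str, ts: str,
                             wait_s: int = 30, attempts: int = 3) -> list:
    """Hypothetical helper (not part of the repo); mirrors the handling above."""
    for _ in range(attempts):
        try:
            query = client.reactions_get(channel=channel, timestamp=ts)
            return query.get("message", {}).get("reactions", [])
        except SlackApiError as e:
            if e.response.get("error", "") == "message_not_found":
                return []        # message fell out of Slack's history window; give up
            time.sleep(wait_s)   # most likely a rate limit; wait and retry
    return []
```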

View File

@ -1,5 +1,6 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler
from slack_sdk.errors import SlackApiError
import logging
import configuration
@ -18,7 +19,7 @@ class BotApp(App):
super().__init__(*args, **kwargs)
self.callback = callback
def start(self):
def pre_start(self):
message_helpers.init(self.client)
missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
@ -124,7 +125,7 @@ class BotApp(App):
answers = article.slack_info
for a in answers:
if a["file_path"]:
try: # either, a["file_path"] does not exist, or the upload resulted in an error
try: # upload resulted in an error
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
@ -132,12 +133,13 @@ class BotApp(App):
thread_ts = thread.slack_ts
)
status = True
except:
except SlackApiError as e:
say(
"File {} could not be uploaded.".format(a),
thread_ts=thread.slack_ts
)
status = False
self.logger.error(f"File upload failed: {e}")
else: # anticipated that there is no file!
say(
f"<@{config['responsible_id']}> \n {a['reply_text']}",
@ -171,14 +173,17 @@ class BotRunner():
def handle_incoming_reaction(event, say):
return self.bot_worker.handle_incoming_reaction(event)
# target = self.launch
# super().__init__(target=target)
self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
def start(self):
self.bot_worker.start()
SocketModeHandler(self.bot_worker, config["app_token"]).start()
self.bot_worker.pre_start()
self.handler.start()
def stop(self):
self.handler.close()
print("Bye handler!")
# def respond_to_message(self, message):
# self.bot_worker.handle_incoming_message(message)
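Storing the `SocketModeHandler` on the instance is what makes `stop()` possible. In isolation, and stripped of the bot logic, the lifecycle looks roughly like this (tokens are placeholders):

```python
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler

# Tokens below are placeholders; token_verification_enabled=False skips the
# auth_test call so this sketch can at least be constructed offline.
app = App(token="xoxb-placeholder", token_verification_enabled=False)
handler = SocketModeHandler(app, "xapp-placeholder")

try:
    handler.start()   # blocks and serves Socket Mode events
except KeyboardInterrupt:
    handler.close()   # cleanly tears down the websocket connection
```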

View File

@ -31,7 +31,8 @@ class PDFDownloader:
self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already, does not work!
# Just save if the filetype is pdf already
# TODO: this is not working right now
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
options.set_preference("browser.download.folderList", 2)
@ -40,6 +41,7 @@ class PDFDownloader:
options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting gecko driver")
# previously, in a single docker image:
# self.driver = webdriver.Firefox(
# options = options,
# service = webdriver.firefox.service.Service(

View File

@ -57,6 +57,6 @@ def get_description(article_object):
try:
article_object.set_keywords(news_article.keywords)
except AttributeError:
pass # list would have been empty anyway
pass # list would have been empty anyway
return article_object

View File

@ -0,0 +1,75 @@
# Usage:
# docker compose --env-file env/<mode> run <args> news_fetch && docker compose --env-file env/production down
version: "3.9"
services:
geckodriver:
image: selenium/standalone-firefox:103.0
volumes:
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
- ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority
environment:
- DISPLAY=$DISPLAY
- START_VNC=false
- START_XVFB=false
user: 1001:1001
expose: # exposed to other docker-compose services only
- "4444"
vpn:
image: wazum/openconnect-proxy:latest
env_file:
- ${CONTAINER_DATA}/config/vpn.config
cap_add:
- NET_ADMIN
volumes:
- /dev/net/tun:/dev/net/tun
# alternative to cap_add & volumes: specify privileged: true
nas_sync:
depends_on:
- vpn # used to establish a connection to the SMB server
network_mode: "service:vpn"
build: nas_sync
image: nas_sync:latest
cap_add: # capabilities needed for mounting the SMB share
- SYS_ADMIN
- DAC_READ_SEARCH
volumes:
- ${CONTAINER_DATA}/files:/sync/local_files
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
command:
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # the first entry is the target mount path
- lsyncd
- /sync/nas_sync.config
news_fetch:
build: news_fetch
image: news_fetch:latest
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync
- geckodriver
volumes:
- ${CONTAINER_DATA}:/app/containerdata # always set
- ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock} # x11 socket, needed for gui
# - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority # xauth needed for authenticating to x11
environment:
- DISPLAY=$DISPLAY # needed to let x11 apps know where to connect to
- DEBUG=${DEBUG}
- CHECK=${CHECK}
- UPLOAD=${UPLOAD}
- HEADLESS=${HEADLESS}
- REDUCEDFETCH=${REDUCEDFETCH}
entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
stdin_open: ${INTERACTIVE:-false} # docker run -i
tty: ${INTERACTIVE:-false} # docker run -t