Better launch, cleaner shutdown (wip)

Remy Moll 2022-08-11 13:42:45 +02:00
parent bc5eaba519
commit 9ca4985853
14 changed files with 147 additions and 37 deletions


@@ -36,25 +36,32 @@ Upload mode is much simpler, it goes over the exisiting database and operates on
 * For normal `production` mode run:
-`docker compose --env-file env/production up`
+`docker compose --env-file env/production run news_fetch`
-* For `debug` mode, you will likely want interactivity, so you need to run:
+* For `debug` mode run:
-`docker compose --env-file env/debug up -d && docker compose --env-file env/debug exec news_fetch bash && docker compose --env-file env/debug down`
+`docker compose --env-file env/debug run news_fetch`
-which should automatically shutdown the containers once you are done. (`ctrl+d` to exit the container shell). If not, re-run `docker compose --env-file env/debug down` manually.
+which drops you into an interactive shell (`ctrl+d` to exit the container shell).
 > Note:
 > The live-mounted code is now under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. Running `python runner.py` will now run the newly written code but, with the production database and storage.
 * For `check` mode, some env-variables are also changed and you still require interactivity. You don't need the geckodriver service however. The simplest way is to run
-`docker compose --env-file env/check run news_fetch`
+`docker compose --env-file env/check run --no-deps --rm news_fetch`
 * Finally, for `upload` mode no interactivity is required and no additional services are required. Simply run:
-`docker compose --env-file env/upload run news_fetch`
+`docker compose --env-file env/upload run --no-deps --rm news_fetch`
+### Stopping
+Run
+`docker compose --env-file env/production down`
+which terminates all containers associated with the `docker-compose.yaml`.
 ## Building
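Taken together, the updated `debug` instructions above amount to the following session. This is only a sketch built from the commands quoted in this diff; the trailing `down` mirrors the new Stopping subsection:

```sh
# drop into an interactive shell inside the news_fetch container (debug env)
docker compose --env-file env/debug run news_fetch
# ... work inside the container, exit with ctrl+d ...
# then make sure no helper containers are left running
docker compose --env-file env/debug down
```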
@ -70,3 +77,9 @@ In docker compose, run
## Roadmap: ## Roadmap:
[_] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network [_] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network
## Manual Sync to NAS:
I use `rsync`. Mounting the NAS locally, I navigate to the location of the local folder (notice the trailing slash). Then run
`rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`
where `<remote>` is the location where the NAS is mounted. (options:`R` - relative paths , `a` - archive mode (multiple actions), `z` - ??, `q` - quiet. We also don't copy most of the metadata and we keep a log of the transfers.)
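For concreteness, a hypothetical invocation of the sync command (the directory and mount point below are placeholders, not the actual layout):

```sh
# from the directory that contains the local folder (note the trailing slash)
rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log \
    coss_archiving/ "/mnt/nas"
```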


@@ -1,4 +1,5 @@
-# docker compose --env-file env/debug up
+# Usage:
+# docker compose --env-file env/<mode> up
 version: "3.9"
@@ -15,7 +16,7 @@ services:
     environment:
       - DISPLAY=$DISPLAY
       - TERM=xterm-256color # colored logs
-      - COLUMNS=160 # for wider logs
+      - COLUMNS=150 # for wider logs
       - DEBUG=${DEBUG}
       - CHECK=${CHECK}
@@ -28,7 +29,7 @@ services:
   geckodriver:
-    image: selenium/standalone-firefox:102.0.1
+    image: selenium/standalone-firefox:103.0
     volumes:
       - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
       - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority

env/check

@@ -1,7 +1,6 @@
 # Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-acces via xauth
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
 XAUTHORTIY=$XAUTHORTIY
 XSOCK=/tmp/.X11-unix

env/debug

@@ -1,7 +1,6 @@
 # Runs in a debugging mode, does not launch anything at all but starts a bash process
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
 CODE=./
 XAUTHORTIY=$XAUTHORTIY

env/production

@@ -1,8 +1,8 @@
 # Runs on the main slack channel with the full worker setup. If nothing funky has occured, reducedfetch is a speedup
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
-CONTAINERS_TO_RUN=nas_sync, geckodriver
 DEBUG=false
 CHECK=false
 UPLOAD=false

env/upload

@@ -1,9 +1,8 @@
 # Does not run any other workers and only upploads to archive the urls that weren't previously uploaded
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
-NEWS_FETCH_DEPENDS_ON="[]"
 DEBUG=false
 CHECK=false
 UPLOAD=true


@@ -4,6 +4,7 @@ models = configuration.models
 from threading import Thread
 import logging
 import os
+import sys
 logger = logging.getLogger(__name__)
 from utils_mail import runner as mail_runner
@@ -102,7 +103,7 @@ class ArticleWatcher:
 class Coordinator(Thread):
     def __init__(self, **kwargs) -> None:
         """Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
-        super().__init__(target = self.launch)
+        super().__init__(target = self.launch, daemon=True)
     def add_workers(self, **kwargs):
         self.worker_slack = kwargs.pop("worker_slack", None)
@@ -192,6 +193,13 @@ if __name__ == "__main__":
         "worker_slack" : slack_runner,
         "worker_mail" : mail_runner,
     }
+    try:
-    coordinator.add_workers(**kwargs)
+        coordinator.add_workers(**kwargs)
-    coordinator.start()
+        coordinator.start()
-    slack_runner.start()
+        slack_runner.start()
+    except KeyboardInterrupt:
+        logger.info("Keyboard interrupt. Stopping Slack and Coordinator")
+        slack_runner.stop()
+        print("BYE!")
+        # coordinator was set as a daemon thread, so it will be stopped automatically
+        sys.exit(0)


@@ -55,7 +55,7 @@ def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
 def send_reaction_to_slack_thread(article, reaction):
-    """Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
+    """Sends the verification status as a reaction to the associated slack thread."""
     thread = article.slack_thread
     messages = models.Message.select().where(models.Message.text.contains(article.article_url))
     # TODO rewrite this shit
@@ -63,9 +63,10 @@ def send_reaction_to_slack_thread(article, reaction):
         print("Found more than 5 messages. Aborting reactions...")
         return
     for m in messages:
-        if not m.has_single_url:
+        if m.is_processed_override:
+            print("Message already processed. Aborting reactions...")
+        elif not m.has_single_url:
             print("Found thread but won't send reaction because thread has multiple urls")
-            pass
         else:
             ts = m.slack_ts
             bot_client.reactions_add(


@@ -37,6 +37,6 @@ def send(article_model):
         smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
         smtp.quit()
         logger.info("Mail successfully sent.")
-    except Exception as e:
+    except smtplib.SMTPException as e:
         logger.error("Could not send mail for article {}".format(article_model))
         logger.info(e)


@@ -14,6 +14,7 @@ LATEST_RECORDED_REACTION = 0
 def init(client) -> None:
+    """Starts fetching past messages and returns the freshly launched thread"""
     global slack_client
     slack_client = client
@@ -26,7 +27,7 @@ def init(client) -> None:
     # fetch all te messages we could have possibly missed
     logger.info("Querying missed messages, threads and reactions. This can take some time.")
     fetch_missed_channel_messages() # not threaded
-    t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
+    t = Thread(target = fetch_missed_channel_reactions, daemon=True) # threaded, runs in background (usually takes a long time)
     t.start()
     if os.getenv("REDUCEDFETCH", "false") == "true":
@@ -153,16 +154,23 @@ def fetch_missed_channel_reactions():
     logger.info("Starting background fetch of channel reactions...")
     threads = [t for t in models.Thread.select() if not t.is_fully_processed]
     for i,t in enumerate(threads):
+        reactions = []
         try:
             query = slack_client.reactions_get(
                 channel = config["archive_id"],
                 timestamp = t.slack_ts
             )
             reactions = query.get("message", []).get("reactions", []) # default = []
-        except SlackApiError: # probably a rate_limit:
+        except SlackApiError as e:
+            if e.response.get("error", "") == "message_not_found":
+                m = t.initiator_message
+                logger.warning(f"Message (id={m.id}) not found. Skipping and saving...")
+                # this usually means the message is past the 1000 message limit imposed by slack. Mark it as processed in the db
+                m.is_processed_override = True
+                m.save()
+            else: # probably a rate_limit:
-            logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
+                logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
-            time.sleep(int(config["api_wait_time"]))
+                time.sleep(int(config["api_wait_time"]))
-            reactions = query.get("message", []).get("reactions", [])
         for r in reactions:
             reaction_dict_to_model(r, t)


@@ -1,5 +1,6 @@
 from slack_bolt import App
 from slack_bolt.adapter.socket_mode import SocketModeHandler
+from slack_sdk.errors import SlackApiError
 import logging
 import configuration
@@ -18,7 +19,7 @@ class BotApp(App):
         super().__init__(*args, **kwargs)
         self.callback = callback
-    def start(self):
+    def pre_start(self):
         message_helpers.init(self.client)
         missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
@@ -124,7 +125,7 @@ class BotApp(App):
         answers = article.slack_info
         for a in answers:
             if a["file_path"]:
-                try: # either, a["file_path"] does not exist, or the upload resulted in an error
+                try: # upload resulted in an error
                     self.client.files_upload(
                         channels = config["archive_id"],
                         initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
@@ -132,12 +133,13 @@ class BotApp(App):
                         thread_ts = thread.slack_ts
                     )
                     status = True
-                except:
+                except SlackApiError as e:
                     say(
                         "File {} could not be uploaded.".format(a),
                         thread_ts=thread.slack_ts
                     )
                     status = False
+                    self.logger.error(f"File upload failed: {e}")
             else: # anticipated that there is no file!
                 say(
                     f"<@{config['responsible_id']}> \n {a['reply_text']}",
@@ -171,14 +173,17 @@ class BotRunner():
         def handle_incoming_reaction(event, say):
             return self.bot_worker.handle_incoming_reaction(event)
-        # target = self.launch
+        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
-        # super().__init__(target=target)
     def start(self):
-        self.bot_worker.start()
+        self.bot_worker.pre_start()
-        SocketModeHandler(self.bot_worker, config["app_token"]).start()
+        self.handler.start()
+    def stop(self):
+        self.handler.close()
+        print("Bye handler!")
     # def respond_to_message(self, message):
     #     self.bot_worker.handle_incoming_message(message)


@@ -31,7 +31,8 @@ class PDFDownloader:
             self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
         options.set_preference('print.save_as_pdf.links.enabled', True)
-        # Just save if the filetype is pdf already, does not work!
+        # Just save if the filetype is pdf already
+        # TODO: this is not working right now
         options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
         options.set_preference("browser.download.folderList", 2)
@@ -40,6 +41,7 @@ class PDFDownloader:
         options.set_preference("browser.download.dir", config["default_download_path"])
         self.logger.info("Starting gecko driver")
+        # peviously, in a single docker image:
         # self.driver = webdriver.Firefox(
         #     options = options,
         #     service = webdriver.firefox.service.Service(


@@ -0,0 +1,75 @@
# Usage:
# docker compose --env-file env/<mode> run <args> news_fetch && docker-compose --env-file env/production down

version: "3.9"

services:

  geckodriver:
    image: selenium/standalone-firefox:103.0
    volumes:
      - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
      - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority
    environment:
      - DISPLAY=$DISPLAY
      - START_VNC=false
      - START_XVFB=false
    user: 1001:1001
    expose: # exposed to other docker-compose services only
      - "4444"

  vpn:
    image: wazum/openconnect-proxy:latest
    env_file:
      - ${CONTAINER_DATA}/config/vpn.config
    cap_add:
      - NET_ADMIN
    volumes:
      - /dev/net/tun:/dev/net/tun
    # alternative to cap_add & volumes: specify privileged: true

  nas_sync:
    depends_on:
      - vpn # used to establish a connection to the SMB server
    network_mode: "service:vpn"
    build: nas_sync
    image: nas_sync:latest
    cap_add: # capabilities needed for mounting the SMB share
      - SYS_ADMIN
      - DAC_READ_SEARCH
    volumes:
      - ${CONTAINER_DATA}/files:/sync/local_files
      - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
      - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
    command:
      - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
      - lsyncd
      - /sync/nas_sync.config

  news_fetch:
    build: news_fetch
    image: news_fetch:latest
    depends_on: # when using docker compose run news_fetch, the dependencies are started as well
      - nas_sync
      - geckodriver
    volumes:
      - ${CONTAINER_DATA}:/app/containerdata # always set
      - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
      - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock} # x11 socket, needed for gui
      # - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority # xauth needed for authenticating to x11
    environment:
      - DISPLAY=$DISPLAY # needed to let x11 apps know where to connect to
      - DEBUG=${DEBUG}
      - CHECK=${CHECK}
      - UPLOAD=${UPLOAD}
      - HEADLESS=${HEADLESS}
      - REDUCEDFETCH=${REDUCEDFETCH}
    entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
    stdin_open: ${INTERACTIVE:-false} # docker run -i
    tty: ${INTERACTIVE:-false} # docker run -t
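For reference, a concrete invocation matching the usage comment at the top of this new compose file might look like the following sketch (substituting `production` for `<mode>` and passing no extra arguments):

```sh
# run the full worker setup, then tear all associated containers down afterwards
docker compose --env-file env/production run news_fetch && \
    docker compose --env-file env/production down
```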