Better launch, cleaner shutdown (wip)

Remy Moll 2022-08-11 13:42:45 +02:00
parent bc5eaba519
commit 9ca4985853
14 changed files with 147 additions and 37 deletions


@@ -36,25 +36,32 @@ Upload mode is much simpler, it goes over the exisiting database and operates on
 * For normal `production` mode run:
-`docker compose --env-file env/production up`
+`docker compose --env-file env/production run news_fetch`
-* For `debug` mode, you will likely want interactivity, so you need to run:
+* For `debug` mode run:
-`docker compose --env-file env/debug up -d && docker compose --env-file env/debug exec news_fetch bash && docker compose --env-file env/debug down`
+`docker compose --env-file env/debug run news_fetch`
-which should automatically shutdown the containers once you are done. (`ctrl+d` to exit the container shell). If not, re-run `docker compose --env-file env/debug down` manually.
+which drops you into an interactive shell (`ctrl+d` to exit the container shell).
 > Note:
 > The live-mounted code is now under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. Running `python runner.py` will now run the newly written code but, with the production database and storage.
 * For `check` mode, some env-variables are also changed and you still require interactivity. You don't need the geckodriver service however. The simplest way is to run
-`docker compose --env-file env/check run news_fetch`
+`docker compose --env-file env/check run --no-deps --rm news_fetch`
 * Finally, for `upload` mode no interactivity is required and no additional services are required. Simply run:
-`docker compose --env-file env/upload run news_fetch`
+`docker compose --env-file env/upload run --no-deps --rm news_fetch`
+### Stopping
+Run
+`docker compose --env-file env/production down`
+which terminates all containers associated with the `docker-compose.yaml`.
 ## Building
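Taken together, the updated `debug` instructions above amount to the following session. This is only a sketch built from the commands quoted in this diff; the trailing `down` mirrors the new Stopping subsection:

```sh
# drop into an interactive shell inside the news_fetch container (debug env)
docker compose --env-file env/debug run news_fetch
# ... work inside the container, exit with ctrl+d ...
# then make sure no helper containers are left running
docker compose --env-file env/debug down
```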
@ -70,3 +77,9 @@ In docker compose, run
## Roadmap: ## Roadmap:
[_] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network [_] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network
## Manual Sync to NAS:
I use `rsync`. Mounting the NAS locally, I navigate to the location of the local folder (notice the trailing slash). Then run
`rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`
where `<remote>` is the location where the NAS is mounted. (options:`R` - relative paths , `a` - archive mode (multiple actions), `z` - ??, `q` - quiet. We also don't copy most of the metadata and we keep a log of the transfers.)
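For concreteness, a hypothetical invocation of the sync command (the directory and mount point below are placeholders, not the actual layout):

```sh
# from the directory that contains the local folder (note the trailing slash)
rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log \
    coss_archiving/ "/mnt/nas"
```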


@@ -1,4 +1,5 @@
-# docker compose --env-file env/debug up
+# Usage:
+# docker compose --env-file env/<mode> up
 version: "3.9"
@@ -15,7 +16,7 @@ services:
     environment:
       - DISPLAY=$DISPLAY
       - TERM=xterm-256color # colored logs
-      - COLUMNS=160 # for wider logs
+      - COLUMNS=150 # for wider logs
       - DEBUG=${DEBUG}
       - CHECK=${CHECK}
@@ -28,7 +29,7 @@ services:
   geckodriver:
-    image: selenium/standalone-firefox:102.0.1
+    image: selenium/standalone-firefox:103.0
     volumes:
       - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
       - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority

env/check

@@ -1,7 +1,6 @@
 # Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-acces via xauth
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
 XAUTHORTIY=$XAUTHORTIY
 XSOCK=/tmp/.X11-unix

env/debug

@@ -1,7 +1,6 @@
 # Runs in a debugging mode, does not launch anything at all but starts a bash process
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
 CODE=./
 XAUTHORTIY=$XAUTHORTIY

env/production

@@ -1,8 +1,8 @@
 # Runs on the main slack channel with the full worker setup. If nothing funky has occured, reducedfetch is a speedup
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
-CONTAINERS_TO_RUN=nas_sync, geckodriver
 DEBUG=false
 CHECK=false
 UPLOAD=false

env/upload

@@ -1,9 +1,8 @@
 # Does not run any other workers and only upploads to archive the urls that weren't previously uploaded
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
-NEWS_FETCH_DEPENDS_ON="[]"
 DEBUG=false
 CHECK=false
 UPLOAD=true


@@ -4,6 +4,7 @@ models = configuration.models
 from threading import Thread
 import logging
 import os
+import sys
 logger = logging.getLogger(__name__)
 from utils_mail import runner as mail_runner
@@ -102,7 +103,7 @@ class ArticleWatcher:
 class Coordinator(Thread):
     def __init__(self, **kwargs) -> None:
         """Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
-        super().__init__(target = self.launch)
+        super().__init__(target = self.launch, daemon=True)
     def add_workers(self, **kwargs):
         self.worker_slack = kwargs.pop("worker_slack", None)
@@ -192,6 +193,13 @@ if __name__ == "__main__":
         "worker_slack" : slack_runner,
         "worker_mail" : mail_runner,
     }
+    try:
-    coordinator.add_workers(**kwargs)
+        coordinator.add_workers(**kwargs)
-    coordinator.start()
+        coordinator.start()
-    slack_runner.start()
+        slack_runner.start()
+    except KeyboardInterrupt:
+        logger.info("Keyboard interrupt. Stopping Slack and Coordinator")
+        slack_runner.stop()
+        print("BYE!")
+        # coordinator was set as a daemon thread, so it will be stopped automatically
+        sys.exit(0)


@@ -55,7 +55,7 @@ def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
 def send_reaction_to_slack_thread(article, reaction):
-    """Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
+    """Sends the verification status as a reaction to the associated slack thread."""
     thread = article.slack_thread
     messages = models.Message.select().where(models.Message.text.contains(article.article_url))
     # TODO rewrite this shit
@@ -63,9 +63,10 @@ def send_reaction_to_slack_thread(article, reaction):
         print("Found more than 5 messages. Aborting reactions...")
         return
     for m in messages:
-        if not m.has_single_url:
+        if m.is_processed_override:
+            print("Message already processed. Aborting reactions...")
+        elif not m.has_single_url:
             print("Found thread but won't send reaction because thread has multiple urls")
-            pass
         else:
             ts = m.slack_ts
             bot_client.reactions_add(


@@ -37,6 +37,6 @@ def send(article_model):
         smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
         smtp.quit()
         logger.info("Mail successfully sent.")
-    except Exception as e:
+    except smtplib.SMTPException as e:
         logger.error("Could not send mail for article {}".format(article_model))
         logger.info(e)


@@ -14,6 +14,7 @@ LATEST_RECORDED_REACTION = 0
 def init(client) -> None:
+    """Starts fetching past messages and returns the freshly launched thread"""
     global slack_client
     slack_client = client
@@ -26,7 +27,7 @@ def init(client) -> None:
     # fetch all te messages we could have possibly missed
     logger.info("Querying missed messages, threads and reactions. This can take some time.")
     fetch_missed_channel_messages() # not threaded
-    t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
+    t = Thread(target = fetch_missed_channel_reactions, daemon=True) # threaded, runs in background (usually takes a long time)
     t.start()
     if os.getenv("REDUCEDFETCH", "false") == "true":
@@ -153,16 +154,23 @@ def fetch_missed_channel_reactions():
     logger.info("Starting background fetch of channel reactions...")
     threads = [t for t in models.Thread.select() if not t.is_fully_processed]
     for i,t in enumerate(threads):
+        reactions = []
         try:
             query = slack_client.reactions_get(
                 channel = config["archive_id"],
                 timestamp = t.slack_ts
             )
             reactions = query.get("message", []).get("reactions", []) # default = []
-        except SlackApiError: # probably a rate_limit:
+        except SlackApiError as e:
+            if e.response.get("error", "") == "message_not_found":
+                m = t.initiator_message
+                logger.warning(f"Message (id={m.id}) not found. Skipping and saving...")
+                # this usually means the message is past the 1000 message limit imposed by slack. Mark it as processed in the db
+                m.is_processed_override = True
+                m.save()
+            else: # probably a rate_limit:
-            logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
+                logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
-            time.sleep(int(config["api_wait_time"]))
+                time.sleep(int(config["api_wait_time"]))
-            reactions = query.get("message", []).get("reactions", [])
         for r in reactions:
             reaction_dict_to_model(r, t)


@@ -1,5 +1,6 @@
 from slack_bolt import App
 from slack_bolt.adapter.socket_mode import SocketModeHandler
+from slack_sdk.errors import SlackApiError
 import logging
 import configuration
@@ -18,7 +19,7 @@ class BotApp(App):
         super().__init__(*args, **kwargs)
         self.callback = callback
-    def start(self):
+    def pre_start(self):
         message_helpers.init(self.client)
         missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
@@ -124,7 +125,7 @@ class BotApp(App):
         answers = article.slack_info
         for a in answers:
             if a["file_path"]:
-                try: # either, a["file_path"] does not exist, or the upload resulted in an error
+                try: # upload resulted in an error
                     self.client.files_upload(
                         channels = config["archive_id"],
                         initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
@@ -132,12 +133,13 @@ class BotApp(App):
                         thread_ts = thread.slack_ts
                     )
                     status = True
-                except:
+                except SlackApiError as e:
                     say(
                         "File {} could not be uploaded.".format(a),
                         thread_ts=thread.slack_ts
                     )
                     status = False
+                    self.logger.error(f"File upload failed: {e}")
             else: # anticipated that there is no file!
                 say(
                     f"<@{config['responsible_id']}> \n {a['reply_text']}",
@@ -171,14 +173,17 @@ class BotRunner():
         def handle_incoming_reaction(event, say):
             return self.bot_worker.handle_incoming_reaction(event)
-        # target = self.launch
+        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
-        # super().__init__(target=target)
     def start(self):
-        self.bot_worker.start()
+        self.bot_worker.pre_start()
-        SocketModeHandler(self.bot_worker, config["app_token"]).start()
+        self.handler.start()
+    def stop(self):
+        self.handler.close()
+        print("Bye handler!")
     # def respond_to_message(self, message):
     #     self.bot_worker.handle_incoming_message(message)


@@ -31,7 +31,8 @@ class PDFDownloader:
             self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
         options.set_preference('print.save_as_pdf.links.enabled', True)
-        # Just save if the filetype is pdf already, does not work!
+        # Just save if the filetype is pdf already
+        # TODO: this is not working right now
         options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
         options.set_preference("browser.download.folderList", 2)
@@ -40,6 +41,7 @@ class PDFDownloader:
         options.set_preference("browser.download.dir", config["default_download_path"])
         self.logger.info("Starting gecko driver")
+        # peviously, in a single docker image:
         # self.driver = webdriver.Firefox(
         #     options = options,
         #     service = webdriver.firefox.service.Service(


@@ -0,0 +1,75 @@
# Usage:
# docker compose --env-file env/<mode> run <args> news_fetch && docker-compose --env-file env/production down

version: "3.9"

services:

  geckodriver:
    image: selenium/standalone-firefox:103.0
    volumes:
      - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
      - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority
    environment:
      - DISPLAY=$DISPLAY
      - START_VNC=false
      - START_XVFB=false
    user: 1001:1001
    expose: # exposed to other docker-compose services only
      - "4444"

  vpn:
    image: wazum/openconnect-proxy:latest
    env_file:
      - ${CONTAINER_DATA}/config/vpn.config
    cap_add:
      - NET_ADMIN
    volumes:
      - /dev/net/tun:/dev/net/tun
    # alternative to cap_add & volumes: specify privileged: true

  nas_sync:
    depends_on:
      - vpn # used to establish a connection to the SMB server
    network_mode: "service:vpn"
    build: nas_sync
    image: nas_sync:latest
    cap_add: # capabilities needed for mounting the SMB share
      - SYS_ADMIN
      - DAC_READ_SEARCH
    volumes:
      - ${CONTAINER_DATA}/files:/sync/local_files
      - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
      - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
    command:
      - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
      - lsyncd
      - /sync/nas_sync.config

  news_fetch:
    build: news_fetch
    image: news_fetch:latest
    depends_on: # when using docker compose run news_fetch, the dependencies are started as well
      - nas_sync
      - geckodriver
    volumes:
      - ${CONTAINER_DATA}:/app/containerdata # always set
      - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
      - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock} # x11 socket, needed for gui
      # - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority # xauth needed for authenticating to x11
    environment:
      - DISPLAY=$DISPLAY # needed to let x11 apps know where to connect to
      - DEBUG=${DEBUG}
      - CHECK=${CHECK}
      - UPLOAD=${UPLOAD}
      - HEADLESS=${HEADLESS}
      - REDUCEDFETCH=${REDUCEDFETCH}
    entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
    stdin_open: ${INTERACTIVE:-false} # docker run -i
    tty: ${INTERACTIVE:-false} # docker run -t
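For reference, a concrete invocation matching the usage comment at the top of this new compose file might look like the following sketch (substituting `production` for `<mode>` and passing no extra arguments):

```sh
# run the full worker setup, then tear all associated containers down afterwards
docker compose --env-file env/production run news_fetch && \
    docker compose --env-file env/production down
```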