Better launch, cleaner shutdown (wip)

Remy Moll 2022-08-11 13:42:45 +02:00
parent bc5eaba519
commit 9ca4985853
14 changed files with 147 additions and 37 deletions

View File

@ -36,25 +36,32 @@ Upload mode is much simpler, it goes over the existing database and operates on
* For normal `production` mode run:
`docker compose --env-file env/production up`
`docker compose --env-file env/production run news_fetch`
* For `debug` mode, you will likely want interactivity, so you need to run:
* For `debug` mode run:
`docker compose --env-file env/debug up -d && docker compose --env-file env/debug exec news_fetch bash && docker compose --env-file env/debug down`
`docker compose --env-file env/debug run news_fetch`
which should automatically shut down the containers once you are done (`ctrl+d` to exit the container shell). If not, re-run `docker compose --env-file env/debug down` manually.
which drops you into an interactive shell (`ctrl+d` to exit).
> Note:
> The live-mounted code is now under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. Running `python runner.py` will then run the newly written code, but with the production database and storage.
* For `check` mode, some env variables are also changed and you still need interactivity; the geckodriver service is not required, however. The simplest way is to run
`docker compose --env-file env/check run news_fetch`
`docker compose --env-file env/check run --no-deps --rm news_fetch`
* Finally, `upload` mode requires no interactivity and no additional services. Simply run:
`docker compose --env-file env/upload run news_fetch`
`docker compose --env-file env/upload run --no-deps --rm news_fetch`
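The modes above differ mainly in the `DEBUG`, `CHECK` and `UPLOAD` flags set in the respective `env/` files and passed into the container. A minimal sketch of how a runner could branch on them (illustrative only, not the repository's actual entry point):

```python
import os

def flag(name: str) -> bool:
    """Interpret an environment variable such as DEBUG/CHECK/UPLOAD as a boolean."""
    return os.getenv(name, "false").lower() == "true"

if __name__ == "__main__":
    # Illustrative dispatch; the real runner.py decides this differently.
    if flag("UPLOAD"):
        print("upload mode: only push not-yet-archived urls")
    elif flag("CHECK"):
        print("check mode: review previously downloaded articles (needs X11)")
    elif flag("DEBUG"):
        print("debug mode: live code mounted under /code")
    else:
        print("production mode: full fetch/download/upload pipeline")
```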
### Stopping
Run
`docker compose --env-file env/production down`
which terminates all containers associated with the `docker-compose.yaml`.
## Building
@ -70,3 +77,9 @@ In docker compose, run
## Roadmap:
[_] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network
## Manual Sync to NAS:
I use `rsync`. Mounting the NAS locally, I navigate to the location of the local folder (notice the trailing slash). Then run
`rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`
where `<remote>` is the location where the NAS is mounted. (Options: `R` - relative paths, `a` - archive mode (recursion and preservation of most attributes), `z` - compress during transfer, `q` - quiet. The `--no-*` flags skip most of the metadata and `--log-file` keeps a log of the transfers.)

View File

@ -1,4 +1,5 @@
# docker compose --env-file env/debug up
# Usage:
# docker compose --env-file env/<mode> up
version: "3.9"
@ -15,7 +16,7 @@ services:
environment:
- DISPLAY=$DISPLAY
- TERM=xterm-256color # colored logs
- COLUMNS=160 # for wider logs
- COLUMNS=150 # for wider logs
- DEBUG=${DEBUG}
- CHECK=${CHECK}
@ -28,7 +29,7 @@ services:
geckodriver:
image: selenium/standalone-firefox:102.0.1
image: selenium/standalone-firefox:103.0
volumes:
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
- ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority

env/check vendored
View File

@ -1,7 +1,6 @@
# Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display access via xauth
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
XAUTHORITY=$XAUTHORITY
XSOCK=/tmp/.X11-unix

env/debug vendored
View File

@ -1,7 +1,6 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
CODE=./
XAUTHORITY=$XAUTHORITY

env/production vendored
View File

@ -1,8 +1,8 @@
# Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
CONTAINERS_TO_RUN=nas_sync, geckodriver
DEBUG=false
CHECK=false
UPLOAD=false

env/upload vendored
View File

@ -1,9 +1,8 @@
# Does not run any other workers and only uploads to the archive the urls that weren't previously uploaded
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
NEWS_FETCH_DEPENDS_ON="[]"
DEBUG=false
CHECK=false
UPLOAD=true

View File

@ -4,6 +4,7 @@ models = configuration.models
from threading import Thread
import logging
import os
import sys
logger = logging.getLogger(__name__)
from utils_mail import runner as mail_runner
@ -102,7 +103,7 @@ class ArticleWatcher:
class Coordinator(Thread):
def __init__(self, **kwargs) -> None:
"""Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
super().__init__(target = self.launch)
super().__init__(target = self.launch, daemon=True)
def add_workers(self, **kwargs):
self.worker_slack = kwargs.pop("worker_slack", None)
@ -192,6 +193,13 @@ if __name__ == "__main__":
"worker_slack" : slack_runner,
"worker_mail" : mail_runner,
}
coordinator.add_workers(**kwargs)
coordinator.start()
slack_runner.start()
try:
coordinator.add_workers(**kwargs)
coordinator.start()
slack_runner.start()
except KeyboardInterrupt:
logger.info("Keyboard interrupt. Stopping Slack and Coordinator")
slack_runner.stop()
print("BYE!")
# coordinator was set as a daemon thread, so it will be stopped automatically
sys.exit(0)
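In other words, the coordinator is now a daemon thread and the main thread owns the `KeyboardInterrupt`. Reduced to its bare bones (a sketch without the project's actual workers), the pattern is:

```python
import sys
import threading
import time

class Worker(threading.Thread):
    """Stand-in for the Coordinator: as a daemon thread it dies with the main thread."""
    def __init__(self):
        super().__init__(target=self.loop, daemon=True)

    def loop(self):
        while True:
            time.sleep(1)   # placeholder for the real article-processing work

if __name__ == "__main__":
    Worker().start()
    try:
        while True:         # main thread stays responsive to ctrl+c
            time.sleep(1)
    except KeyboardInterrupt:
        print("BYE!")       # daemon threads are torn down automatically
        sys.exit(0)
```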

View File

@ -55,7 +55,7 @@ def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
def send_reaction_to_slack_thread(article, reaction):
"""Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
"""Sends the verification status as a reaction to the associated slack thread."""
thread = article.slack_thread
messages = models.Message.select().where(models.Message.text.contains(article.article_url))
# TODO rewrite this shit
@ -63,9 +63,10 @@ def send_reaction_to_slack_thread(article, reaction):
print("Found more than 5 messages. Aborting reactions...")
return
for m in messages:
if not m.has_single_url:
if m.is_processed_override:
print("Message already processed. Aborting reactions...")
elif not m.has_single_url:
print("Found thread but won't send reaction because thread has multiple urls")
pass
else:
ts = m.slack_ts
bot_client.reactions_add(

View File

@ -37,6 +37,6 @@ def send(article_model):
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
smtp.quit()
logger.info("Mail successfully sent.")
except Exception as e:
except smtplib.SMTPException as e:
logger.error("Could not send mail for article {}".format(article_model))
logger.info(e)
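Catching `smtplib.SMTPException` instead of a bare `Exception` keeps programming errors visible while still logging delivery failures. A self-contained sketch of that pattern (the helper, host, sender and recipient are placeholders, not the project's config):

```python
import smtplib
from email.message import EmailMessage

def send_mail(sender: str, recipient: str, subject: str, body: str) -> bool:
    """Hypothetical helper (not part of the repo); only SMTP failures are swallowed."""
    msg = EmailMessage()
    msg["From"], msg["To"], msg["Subject"] = sender, recipient, subject
    msg.set_content(body)
    try:
        with smtplib.SMTP("localhost") as smtp:  # SMTP host is an assumption
            smtp.send_message(msg)
        return True
    except smtplib.SMTPException as e:
        print(f"Could not send mail: {e}")
        return False
```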

View File

@ -14,6 +14,7 @@ LATEST_RECORDED_REACTION = 0
def init(client) -> None:
"""Starts fetching past messages and returns the freshly launched thread"""
global slack_client
slack_client = client
@ -26,7 +27,7 @@ def init(client) -> None:
# fetch all the messages we could have possibly missed
logger.info("Querying missed messages, threads and reactions. This can take some time.")
fetch_missed_channel_messages() # not threaded
t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
t = Thread(target = fetch_missed_channel_reactions, daemon=True) # threaded, runs in background (usually takes a long time)
t.start()
if os.getenv("REDUCEDFETCH", "false") == "true":
@ -153,16 +154,23 @@ def fetch_missed_channel_reactions():
logger.info("Starting background fetch of channel reactions...")
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
for i,t in enumerate(threads):
reactions = []
try:
query = slack_client.reactions_get(
channel = config["archive_id"],
timestamp = t.slack_ts
)
reactions = query.get("message", []).get("reactions", []) # default = []
except SlackApiError: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
reactions = query.get("message", {}).get("reactions", []) # default to {} so the chained .get cannot fail
except SlackApiError as e:
if e.response.get("error", "") == "message_not_found":
m = t.initiator_message
logger.warning(f"Message (id={m.id}) not found. Skipping and saving...")
# this usually means the message is past the 1000 message limit imposed by slack. Mark it as processed in the db
m.is_processed_override = True
m.save()
else: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
for r in reactions:
reaction_dict_to_model(r, t)
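The branch added above separates a permanently missing message from a transient rate limit. As a standalone sketch on top of `slack_sdk` (`get_reactions_with_retry`, the wait time and the retry budget are made up for illustration):

```python
import time
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

def get_reactions_with_retry(client: WebClient, channel: str, ts: str,
                             wait_s: int = 30, attempts: int = 3) -> list:
    """Hypothetical helper (not part of the repo); mirrors the handling above."""
    for _ in range(attempts):
        try:
            query = client.reactions_get(channel=channel, timestamp=ts)
            return query.get("message", {}).get("reactions", [])
        except SlackApiError as e:
            if e.response.get("error", "") == "message_not_found":
                return []        # message fell out of Slack's history window; give up
            time.sleep(wait_s)   # most likely a rate limit; wait and retry
    return []
```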

View File

@ -1,5 +1,6 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler
from slack_sdk.errors import SlackApiError
import logging
import configuration
@ -18,7 +19,7 @@ class BotApp(App):
super().__init__(*args, **kwargs)
self.callback = callback
def start(self):
def pre_start(self):
message_helpers.init(self.client)
missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
@ -124,7 +125,7 @@ class BotApp(App):
answers = article.slack_info
for a in answers:
if a["file_path"]:
try: # either, a["file_path"] does not exist, or the upload resulted in an error
try: # upload resulted in an error
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
@ -132,12 +133,13 @@ class BotApp(App):
thread_ts = thread.slack_ts
)
status = True
except:
except SlackApiError as e:
say(
"File {} could not be uploaded.".format(a),
thread_ts=thread.slack_ts
)
status = False
self.logger.error(f"File upload failed: {e}")
else: # anticipated that there is no file!
say(
f"<@{config['responsible_id']}> \n {a['reply_text']}",
@ -171,14 +173,17 @@ class BotRunner():
def handle_incoming_reaction(event, say):
return self.bot_worker.handle_incoming_reaction(event)
# target = self.launch
# super().__init__(target=target)
self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
def start(self):
self.bot_worker.start()
SocketModeHandler(self.bot_worker, config["app_token"]).start()
self.bot_worker.pre_start()
self.handler.start()
def stop(self):
self.handler.close()
print("Bye handler!")
# def respond_to_message(self, message):
# self.bot_worker.handle_incoming_message(message)
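Storing the `SocketModeHandler` on the instance is what makes `stop()` possible. In isolation, and stripped of the bot logic, the lifecycle looks roughly like this (tokens are placeholders):

```python
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler

# Tokens below are placeholders; token_verification_enabled=False skips the
# auth_test call so this sketch can at least be constructed offline.
app = App(token="xoxb-placeholder", token_verification_enabled=False)
handler = SocketModeHandler(app, "xapp-placeholder")

try:
    handler.start()   # blocks and serves Socket Mode events
except KeyboardInterrupt:
    handler.close()   # cleanly tears down the websocket connection
```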

View File

@ -31,7 +31,8 @@ class PDFDownloader:
self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already, does not work!
# Just save if the filetype is pdf already
# TODO: this is not working right now
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
options.set_preference("browser.download.folderList", 2)
@ -40,6 +41,7 @@ class PDFDownloader:
options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting gecko driver")
# previously, in a single docker image:
# self.driver = webdriver.Firefox(
# options = options,
# service = webdriver.firefox.service.Service(

View File

@ -57,6 +57,6 @@ def get_description(article_object):
try:
article_object.set_keywords(news_article.keywords)
except AttributeError:
pass # list would have been empty anyway
pass # list would have been empty anyway
return article_object

View File

@ -0,0 +1,75 @@
# Usage:
# docker compose --env-file env/<mode> run <args> news_fetch && docker compose --env-file env/production down
version: "3.9"
services:
geckodriver:
image: selenium/standalone-firefox:103.0
volumes:
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
- ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority
environment:
- DISPLAY=$DISPLAY
- START_VNC=false
- START_XVFB=false
user: 1001:1001
expose: # exposed to other docker-compose services only
- "4444"
vpn:
image: wazum/openconnect-proxy:latest
env_file:
- ${CONTAINER_DATA}/config/vpn.config
cap_add:
- NET_ADMIN
volumes:
- /dev/net/tun:/dev/net/tun
# alternative to cap_add & volumes: specify privileged: true
nas_sync:
depends_on:
- vpn # used to establish a connection to the SMB server
network_mode: "service:vpn"
build: nas_sync
image: nas_sync:latest
cap_add: # capabilities needed for mounting the SMB share
- SYS_ADMIN
- DAC_READ_SEARCH
volumes:
- ${CONTAINER_DATA}/files:/sync/local_files
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
command:
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # the first entry is the target mount path
- lsyncd
- /sync/nas_sync.config
news_fetch:
build: news_fetch
image: news_fetch:latest
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync
- geckodriver
volumes:
- ${CONTAINER_DATA}:/app/containerdata # always set
- ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock} # x11 socket, needed for gui
# - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority # xauth needed for authenticating to x11
environment:
- DISPLAY=$DISPLAY # needed to let x11 apps know where to connect to
- DEBUG=${DEBUG}
- CHECK=${CHECK}
- UPLOAD=${UPLOAD}
- HEADLESS=${HEADLESS}
- REDUCEDFETCH=${REDUCEDFETCH}
entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
stdin_open: ${INTERACTIVE:-false} # docker run -i
tty: ${INTERACTIVE:-false} # docker run -t