diff --git a/README.md b/README.md
index 9a84016..21edf56 100644
--- a/README.md
+++ b/README.md
@@ -36,25 +36,32 @@ Upload mode is much simpler: it goes over the existing database and operates on
 * For normal `production` mode run:
 
-  `docker compose --env-file env/production up`
+  `docker compose --env-file env/production run news_fetch`
 
-* For `debug` mode, you will likely want interactivity, so you need to run:
+* For `debug` mode run:
 
-  `docker compose --env-file env/debug up -d && docker compose --env-file env/debug exec news_fetch bash && docker compose --env-file env/debug down`
-
-  which should automatically shut down the containers once you are done (`ctrl+d` to exit the container shell). If not, re-run `docker compose --env-file env/debug down` manually.
+  `docker compose --env-file env/debug run news_fetch`
+
+  which drops you into an interactive shell (`ctrl+d` to exit the container shell).
 
 > Note:
 > The live-mounted code is now under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. Running `python runner.py` will now run the newly written code, but with the production database and storage.
 
 * For `check` mode, some env variables are also changed and you still require interactivity. You don't need the geckodriver service, however. The simplest way is to run
 
-  `docker compose --env-file env/check run news_fetch`
+  `docker compose --env-file env/check run --no-deps --rm news_fetch`
 
 * Finally, for `upload` mode no interactivity and no additional services are required. Simply run:
 
-  `docker compose --env-file env/upload run news_fetch`
+  `docker compose --env-file env/upload run --no-deps --rm news_fetch`
+
+### Stopping
+Run
+
+`docker compose --env-file env/production down`
+
+which terminates all containers associated with the `docker-compose.yaml`.
 
 ## Building
@@ -70,3 +77,9 @@ In docker compose, run
 
 ## Roadmap:
 [_] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network
+
+
+## Manual Sync to NAS:
+I use `rsync`. Mounting the NAS locally, I navigate to the location of the local folder (notice the trailing slash). Then run
+`rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local_folder>/ "<nas_mount_point>"`
+where `<nas_mount_point>` is the location where the NAS is mounted. (Options: `R` - relative paths, `a` - archive mode (recursion and metadata preservation), `z` - compress during transfer, `q` - quiet. We also skip most of the metadata via `--no-perms --no-owner --no-group`, and we keep a log of the transfers.)
\ No newline at end of file
diff --git a/docker-compose.yaml b/docker-compose.yaml
index f726737..33bcc58 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,4 +1,5 @@
-# docker compose --env-file env/debug up
+# Usage:
+# docker compose --env-file env/<mode> up
 
 version: "3.9"
 
@@ -15,7 +16,7 @@ services:
     environment:
       - DISPLAY=$DISPLAY
       - TERM=xterm-256color # colored logs
-      - COLUMNS=160 # for wider logs
+      - COLUMNS=150 # for wider logs
       - DEBUG=${DEBUG}
       - CHECK=${CHECK}
 
@@ -28,7 +29,7 @@
 
   geckodriver:
-    image: selenium/standalone-firefox:102.0.1
+    image: selenium/standalone-firefox:103.0
    volumes:
       - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
       - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority
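The run/down pairing documented above generalizes across all four modes. A typical session, sketched with the env files from this repo (`<mode>` standing in for `production`, `debug`, `check` or `upload`):

    docker compose --env-file env/<mode> run news_fetch   # launch (interactive for debug/check)
    docker compose --env-file env/<mode> down             # tear down all associated containers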
diff --git a/env/check b/env/check
index d86271c..5a95409 100644
--- a/env/check
+++ b/env/check
@@ -1,7 +1,6 @@
 # Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-access via xauth
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
 
 XAUTHORTIY=$XAUTHORTIY
 XSOCK=/tmp/.X11-unix
 
diff --git a/env/debug b/env/debug
index ce8714e..ec1170f 100644
--- a/env/debug
+++ b/env/debug
@@ -1,7 +1,6 @@
 # Runs in a debugging mode, does not launch anything at all but starts a bash process
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
 CODE=./
 
 XAUTHORTIY=$XAUTHORTIY
 
diff --git a/env/production b/env/production
index 0d67a27..97e36db 100644
--- a/env/production
+++ b/env/production
@@ -1,8 +1,8 @@
 # Runs on the main slack channel with the full worker setup. If nothing funky has occurred, REDUCEDFETCH is a speedup
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
+CONTAINERS_TO_RUN=nas_sync, geckodriver
 
 DEBUG=false
 CHECK=false
 UPLOAD=false
 
diff --git a/env/upload b/env/upload
index 03055fb..7e7c459 100644
--- a/env/upload
+++ b/env/upload
@@ -1,9 +1,8 @@
 # Does not run any other workers and only uploads to the archive the urls that weren't previously uploaded
 CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
-HOSTS_FILE=~/Bulk/COSS/Downloads/coss_archiving/dependencies/hosts
-
+NEWS_FETCH_DEPENDS_ON="[]"
 DEBUG=false
 CHECK=false
 UPLOAD=true
 
diff --git a/news_fetch/app/runner.py b/news_fetch/app/runner.py
index fdb71b9..b33fb2f 100644
--- a/news_fetch/app/runner.py
+++ b/news_fetch/app/runner.py
@@ -4,6 +4,7 @@ models = configuration.models
 from threading import Thread
 import logging
 import os
+import sys
 logger = logging.getLogger(__name__)
 
 from utils_mail import runner as mail_runner
@@ -102,7 +103,7 @@ class ArticleWatcher:
 class Coordinator(Thread):
     def __init__(self, **kwargs) -> None:
         """Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
-        super().__init__(target = self.launch)
+        super().__init__(target = self.launch, daemon=True)
 
     def add_workers(self, **kwargs):
         self.worker_slack = kwargs.pop("worker_slack", None)
@@ -192,6 +193,13 @@ if __name__ == "__main__":
         "worker_slack" : slack_runner,
         "worker_mail" : mail_runner,
     }
-    coordinator.add_workers(**kwargs)
-    coordinator.start()
-    slack_runner.start()
+    try:
+        coordinator.add_workers(**kwargs)
+        coordinator.start()
+        slack_runner.start()
+    except KeyboardInterrupt:
+        logger.info("Keyboard interrupt. Stopping Slack and Coordinator")
+        slack_runner.stop()
+        print("BYE!")
+        # coordinator was set as a daemon thread, so it will be stopped automatically
+        sys.exit(0)
\ No newline at end of file
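The interplay of `daemon=True` and the new `try/except` in `runner.py` is what allows a clean ctrl+c shutdown: only the Slack runner needs an explicit `stop()`, while the daemonized Coordinator dies with the interpreter. A minimal, self-contained sketch of that pattern (illustrative names, not repo code):

    import sys
    import time
    from threading import Thread

    worker = Thread(target=lambda: time.sleep(3600), daemon=True)  # daemon: killed on interpreter exit
    worker.start()
    try:
        while True:
            time.sleep(1)  # main thread blocks, like slack_runner.start()
    except KeyboardInterrupt:
        # no join() needed: daemon threads are terminated automatically
        sys.exit(0)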
Aborting reactions...") return for m in messages: - if not m.has_single_url: + if m.is_processed_override: + print("Message already processed. Aborting reactions...") + elif not m.has_single_url: print("Found thread but won't send reaction because thread has multiple urls") - pass else: ts = m.slack_ts bot_client.reactions_add( diff --git a/news_fetch/app/utils_mail/runner.py b/news_fetch/app/utils_mail/runner.py index b70f0fe..d7e09ba 100644 --- a/news_fetch/app/utils_mail/runner.py +++ b/news_fetch/app/utils_mail/runner.py @@ -37,6 +37,6 @@ def send(article_model): smtp.sendmail(config["sender"], config["recipient"], mail.as_string()) smtp.quit() logger.info("Mail successfully sent.") - except Exception as e: + except smtplib.SMTPException as e: logger.error("Could not send mail for article {}".format(article_model)) logger.info(e) \ No newline at end of file diff --git a/news_fetch/app/utils_slack/message_helpers.py b/news_fetch/app/utils_slack/message_helpers.py index 2c4b1f0..f0ca785 100644 --- a/news_fetch/app/utils_slack/message_helpers.py +++ b/news_fetch/app/utils_slack/message_helpers.py @@ -14,6 +14,7 @@ LATEST_RECORDED_REACTION = 0 def init(client) -> None: + """Starts fetching past messages and returns the freshly launched thread""" global slack_client slack_client = client @@ -26,7 +27,7 @@ def init(client) -> None: # fetch all te messages we could have possibly missed logger.info("Querying missed messages, threads and reactions. This can take some time.") fetch_missed_channel_messages() # not threaded - t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time) + t = Thread(target = fetch_missed_channel_reactions, daemon=True) # threaded, runs in background (usually takes a long time) t.start() if os.getenv("REDUCEDFETCH", "false") == "true": @@ -153,16 +154,23 @@ def fetch_missed_channel_reactions(): logger.info("Starting background fetch of channel reactions...") threads = [t for t in models.Thread.select() if not t.is_fully_processed] for i,t in enumerate(threads): + reactions = [] try: query = slack_client.reactions_get( channel = config["archive_id"], timestamp = t.slack_ts ) reactions = query.get("message", []).get("reactions", []) # default = [] - except SlackApiError: # probably a rate_limit: - logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads))) - time.sleep(int(config["api_wait_time"])) - reactions = query.get("message", []).get("reactions", []) + except SlackApiError as e: + if e.response.get("error", "") == "message_not_found": + m = t.initiator_message + logger.warning(f"Message (id={m.id}) not found. Skipping and saving...") + # this usually means the message is past the 1000 message limit imposed by slack. Mark it as processed in the db + m.is_processed_override = True + m.save() + else: # probably a rate_limit: + logger.error("Hit rate limit while querying reactions. 
diff --git a/news_fetch/app/utils_slack/message_helpers.py b/news_fetch/app/utils_slack/message_helpers.py
index 2c4b1f0..f0ca785 100644
--- a/news_fetch/app/utils_slack/message_helpers.py
+++ b/news_fetch/app/utils_slack/message_helpers.py
@@ -14,6 +14,7 @@ LATEST_RECORDED_REACTION = 0
 
 def init(client) -> None:
+    """Starts fetching past messages and returns the freshly launched thread"""
     global slack_client
     slack_client = client
 
@@ -26,7 +27,7 @@ def init(client) -> None:
     # fetch all the messages we could have possibly missed
     logger.info("Querying missed messages, threads and reactions. This can take some time.")
     fetch_missed_channel_messages() # not threaded
-    t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
+    t = Thread(target = fetch_missed_channel_reactions, daemon=True) # threaded, runs in background (usually takes a long time)
     t.start()
 
     if os.getenv("REDUCEDFETCH", "false") == "true":
@@ -153,16 +154,23 @@ def fetch_missed_channel_reactions():
     logger.info("Starting background fetch of channel reactions...")
     threads = [t for t in models.Thread.select() if not t.is_fully_processed]
     for i,t in enumerate(threads):
+        reactions = []
         try:
             query = slack_client.reactions_get(
                 channel = config["archive_id"],
                 timestamp = t.slack_ts
             )
             reactions = query.get("message", []).get("reactions", []) # default = []
-        except SlackApiError: # probably a rate_limit:
-            logger.error("Hit rate limit while querying reactions. Retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
-            time.sleep(int(config["api_wait_time"]))
-            reactions = query.get("message", []).get("reactions", [])
+        except SlackApiError as e:
+            if e.response.get("error", "") == "message_not_found":
+                m = t.initiator_message
+                logger.warning(f"Message (id={m.id}) not found. Skipping and saving...")
+                # this usually means the message is past the 1000-message limit imposed by slack; mark it as processed in the db
+                m.is_processed_override = True
+                m.save()
+            else: # probably a rate limit:
+                logger.error("Hit rate limit while querying reactions. Retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
+                time.sleep(int(config["api_wait_time"]))
 
         for r in reactions:
             reaction_dict_to_model(r, t)
 
diff --git a/news_fetch/app/utils_slack/runner.py b/news_fetch/app/utils_slack/runner.py
index 9be7a0c..e35117a 100644
--- a/news_fetch/app/utils_slack/runner.py
+++ b/news_fetch/app/utils_slack/runner.py
@@ -1,5 +1,6 @@
 from slack_bolt import App
 from slack_bolt.adapter.socket_mode import SocketModeHandler
+from slack_sdk.errors import SlackApiError
 import logging
 
 import configuration
@@ -18,7 +19,7 @@ class BotApp(App):
         super().__init__(*args, **kwargs)
         self.callback = callback
 
-    def start(self):
+    def pre_start(self):
         message_helpers.init(self.client)
         missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
 
@@ -124,7 +125,7 @@ class BotApp(App):
         answers = article.slack_info
         for a in answers:
             if a["file_path"]:
-                try: # either a["file_path"] does not exist, or the upload resulted in an error
+                try: # the upload may result in an error
                     self.client.files_upload(
                         channels = config["archive_id"],
                         initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
                         file = a["file_path"],
                         thread_ts = thread.slack_ts
                     )
                     status = True
-                except:
+                except SlackApiError as e:
                     say(
                         "File {} could not be uploaded.".format(a),
                         thread_ts = thread.slack_ts
                     )
                     status = False
+                    self.logger.error(f"File upload failed: {e}")
             else: # anticipated that there is no file!
                 say(
                     f"<@{config['responsible_id']}> \n {a['reply_text']}",
 
@@ -171,14 +173,17 @@ class BotRunner():
         def handle_incoming_reaction(event, say):
             return self.bot_worker.handle_incoming_reaction(event)
 
-        # target = self.launch
-        # super().__init__(target=target)
+        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
 
     def start(self):
-        self.bot_worker.start()
-        SocketModeHandler(self.bot_worker, config["app_token"]).start()
+        self.bot_worker.pre_start()
+        self.handler.start()
 
+    def stop(self):
+        self.handler.close()
+        print("Bye handler!")
+
 
     # def respond_to_message(self, message):
     #     self.bot_worker.handle_incoming_message(message)
\ No newline at end of file
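Storing the `SocketModeHandler` on `self.handler` is what enables the new `stop()`: the same object that `start()` launches can later be closed. From `runner.py`'s perspective the lifecycle then looks like this (a sketch; the `BotRunner` constructor arguments are assumptions, not shown in this diff):

    runner = BotRunner(callback)    # hypothetical construction
    try:
        runner.start()              # pre_start() replays missed messages, then the handler blocks
    except KeyboardInterrupt:
        runner.stop()               # closes the underlying SocketModeHandler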
diff --git a/news_fetch/app/utils_worker/download/browser.py b/news_fetch/app/utils_worker/download/browser.py
index 0c05f56..4c47648 100644
--- a/news_fetch/app/utils_worker/download/browser.py
+++ b/news_fetch/app/utils_worker/download/browser.py
@@ -31,7 +31,8 @@ class PDFDownloader:
             self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
 
         options.set_preference('print.save_as_pdf.links.enabled', True)
-        # Just save if the filetype is pdf already, does not work!
+        # Just save if the filetype is pdf already
+        # TODO: this is not working right now
         options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
 
         options.set_preference("browser.download.folderList", 2)
@@ -40,6 +41,7 @@ class PDFDownloader:
         options.set_preference("browser.download.dir", config["default_download_path"])
 
         self.logger.info("Starting gecko driver")
+        # previously, in a single docker image:
         # self.driver = webdriver.Firefox(
         #     options = options,
         #     service = webdriver.firefox.service.Service(
 
diff --git a/news_fetch/app/utils_worker/fetch/runner.py b/news_fetch/app/utils_worker/fetch/runner.py
index cef88ba..06b2cbe 100644
--- a/news_fetch/app/utils_worker/fetch/runner.py
+++ b/news_fetch/app/utils_worker/fetch/runner.py
@@ -57,6 +57,6 @@ def get_description(article_object):
     try:
         article_object.set_keywords(news_article.keywords)
     except AttributeError:
-        pass # list would have been empty anyway
+        pass # list would have been empty anyway
 
     return article_object
 
diff --git a/testing.docker-compose.yaml b/testing.docker-compose.yaml
new file mode 100644
index 0000000..313aa64
--- /dev/null
+++ b/testing.docker-compose.yaml
@@ -0,0 +1,75 @@
+# Usage:
+# docker compose --env-file env/<mode> run news_fetch && docker compose --env-file env/production down
+
+version: "3.9"
+
+services:
+
+  geckodriver:
+    image: selenium/standalone-firefox:103.0
+    volumes:
+      - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
+      - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority
+    environment:
+      - DISPLAY=$DISPLAY
+      - START_VNC=false
+      - START_XVFB=false
+    user: 1001:1001
+    expose: # exposed to other docker-compose services only
+      - "4444"
+
+
+  vpn:
+    image: wazum/openconnect-proxy:latest
+    env_file:
+      - ${CONTAINER_DATA}/config/vpn.config
+    cap_add:
+      - NET_ADMIN
+    volumes:
+      - /dev/net/tun:/dev/net/tun
+    # alternative to cap_add & volumes: specify privileged: true
+
+
+  nas_sync:
+    depends_on:
+      - vpn # used to establish a connection to the SMB server
+    network_mode: "service:vpn"
+    build: nas_sync
+    image: nas_sync:latest
+    cap_add: # capabilities needed for mounting the SMB share
+      - SYS_ADMIN
+      - DAC_READ_SEARCH
+    volumes:
+      - ${CONTAINER_DATA}/files:/sync/local_files
+      - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
+      - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
+    command:
+      - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
+      - lsyncd
+      - /sync/nas_sync.config
+
+
+  news_fetch:
+    build: news_fetch
+    image: news_fetch:latest
+
+    depends_on: # when using docker compose run news_fetch, the dependencies are started as well
+      - nas_sync
+      - geckodriver
+
+    volumes:
+      - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
+      - ${XSOCK-/dev/null}:${XSOCK-/tmp/sock} # x11 socket, needed for gui
+      # - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority # xauth needed for authenticating to x11
+    environment:
+      - DISPLAY=$DISPLAY # needed to let x11 apps know where to connect to
+
+      - DEBUG=${DEBUG}
+      - CHECK=${CHECK}
+      - UPLOAD=${UPLOAD}
+      - HEADLESS=${HEADLESS}
+      - REDUCEDFETCH=${REDUCEDFETCH}
+    entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
+    stdin_open: ${INTERACTIVE:-false} # docker run -i
+    tty: ${INTERACTIVE:-false} # docker run -t
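Because this file does not replace the default `docker-compose.yaml`, it has to be selected explicitly via compose's standard `-f` flag, e.g.:

    docker compose -f testing.docker-compose.yaml --env-file env/debug run news_fetch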