diff --git a/.dockerignore b/.dockerignore index b5d67a2..1726aa3 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,2 @@ -.dev/ \ No newline at end of file +.dev/ +__pycache__/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 367c40c..2be5e50 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,8 @@ FROM python:latest ENV TZ Euopre/Zurich + + RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list RUN apt-get update && apt-get install -y \ evince \ @@ -16,7 +18,6 @@ RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckod RUN tar -x geckodriver -zf geckodriver-v0.31.0-linux64.tar.gz -O > /usr/bin/geckodriver RUN chmod +x /usr/bin/geckodriver RUN rm geckodriver-v0.31.0-linux64.tar.gz -RUN echo "127.0.0.1 localhost" >> /etc/hosts RUN useradd --create-home --shell /bin/bash --uid 1001 autonews @@ -24,15 +25,12 @@ RUN useradd --create-home --shell /bin/bash --uid 1001 autonews # home directory needed for pip package installation RUN mkdir -p /app/auto_news RUN chown -R autonews:autonews /app - - USER autonews +RUN export PATH=/home/autonews/.local/bin:$PATH -COPY requirements.txt /app/ -RUN python3 -m pip install -r /app/requirements.txt COPY app /app/auto_news WORKDIR /app/auto_news - +RUN python3 -m pip install -r requirements.txt ENTRYPOINT ["python3", "runner.py"] \ No newline at end of file diff --git a/README.md b/README.md index ab427c3..f7d206f 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ A utility to fetch article requests from slack and generate pdfs for them, fully automatically. -## Running +## Running - Pure docker +> I recommend running with docker compose instead ### How to run - auto archiving mode In this mode the program is launched as a docker container, in a headless mode. For persistence purposes a local storage volume is required, but that's it! 
@@ -15,6 +16,12 @@ You can specify additional parameters: `docker run -it -v :/app/file_storage/ auto_news upload` catches up on incomplete uploads to archive. +`docker run -it -v :/app/file_storage/ auto_news reducedfetch` makes assumptions about the status of the slack chat and greatly reduces the number of api calls (faster start up). + +These parameters can be combined (mostly for testing I guess) + +Finally for manual file verification: + `docker run -it -v :/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check` lets you visually verify the downloaded files. The additional parameters are required in order to open guis on the host. @@ -24,33 +31,51 @@ In this mode, a docker container is launched with an additional volume, the loca `docker run -it -v :/app/file_storage/ -v :/code/ --entry-point /bin/bash auto_news` You are droppped into a bash shell, in which you can navigate to the `/code` directory and then test live. +### Cheat-sheet Remy: +`docker run -it -v /mnt/Data/COSS/Downloads/auto_news.container/:/app/file_storage/ auto_news` + +`docker run -it -v /mnt/Data/COSS/Downloads/auto_news.container/:/app/file_storage/ -v /mnt/Data/COSS/Development/auto_news/app:/code --entrypoint /bin/bash auto_news` + + +`docker run -it -v /mnt/Data/COSS/Downloads/auto_news.container/:/app/file_storage/ -e DISPLAY=":0" --network host -v XAUTHORITY:/root/.Xauthority auto_news check` + + +## Running - Docker compose + +I also wrote a rudimentary docker compose file which makes running much more simple. Just run + +`docker compose --env-file up` + +All relevant passthroughs and mounts are specified through the env-file, for which I configured 4 versions: production, debug (development in general), upload and check. These files will have to be adapted to your individual setup but can be reused more easily. + +> Note: +> +> The `debug` requires additional input. 
Once `docker compose up` is running, in a new session run `docker compose --env-file env/debug exec bash`. The live-mounted code is then under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. ## Building -### Things to keep in mind -The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly clean build the docker image! This is also crucial to update the code itself. +> The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly clean build the docker image! This is also crucial to update the code itself. +In docker, simply run: `docker build -t auto_news --no-cache .` + where the `Dockerfile` has to be in the working directory +In docker compose, run the usual command, but append + +`docker compose ... up --build` -## Cheat-sheet Remy: -`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ auto_news` - -`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news` - - -`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ -e DISPLAY=":0" --network host -v XAUTHORITY:/root/.Xauthority auto_news check` ## Roadmap: [ ] automatically upload files to NAS -[ ] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites + +[ ] handle paywalled sites like faz, spiegel, .. through their dedicated edu-friendly sites ... 
\ No newline at end of file diff --git a/app/configuration.py b/app/configuration.py index 0981f26..8e7c148 100644 --- a/app/configuration.py +++ b/app/configuration.py @@ -1,5 +1,4 @@ import os -import sys import configparser import logging from peewee import SqliteDatabase @@ -19,18 +18,18 @@ logger = logging.getLogger(__name__) parsed = configparser.ConfigParser() parsed.read("/app/file_storage/config.ini") -if "debug" in sys.argv: - logger.warning("Running in debugging mode because launched with argument 'debug'") - # parsed.read("/code/config.ini") +if os.getenv("DEBUG", "false") == "true": + logger.warning("Found 'DEBUG=true', setting up dummy databases") db_base_path = parsed["DATABASE"]["db_path_dev"] parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"] parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"] else: - logger.warning("Using production values, I hope you know what you're doing...") + logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...") db_base_path = parsed["DATABASE"]["db_path_prod"] + from utils_storage import models # Set up the database diff --git a/requirements.txt b/app/requirements.txt similarity index 100% rename from requirements.txt rename to app/requirements.txt diff --git a/app/runner.py b/app/runner.py index fbd5fe9..bb0db7e 100644 --- a/app/runner.py +++ b/app/runner.py @@ -1,9 +1,9 @@ """Main coordination of other util classes. 
Handles inbound and outbound calls""" import configuration models = configuration.models -import sys from threading import Thread import logging +import os logger = logging.getLogger(__name__) from utils_mail import runner as mail_runner @@ -172,12 +172,12 @@ if __name__ == "__main__": coordinator = Coordinator() - if "upload" in sys.argv: + if os.getenv("UPLOAD", "false") == "true": articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute() logger.info(f"Launching upload to archive for {len(articles)} articles.") coordinator.manual_processing(articles, [UploadWorker()]) - elif "check" in sys.argv: + elif os.getenv("CHECK", "false") == "true": from utils_check import runner as check_runner check_runner.verify_unchecked() diff --git a/app/utils_slack/message_helpers.py b/app/utils_slack/message_helpers.py index a0dbaf4..14c1d60 100644 --- a/app/utils_slack/message_helpers.py +++ b/app/utils_slack/message_helpers.py @@ -3,7 +3,6 @@ import configuration import requests import os import time -import sys from threading import Thread from slack_sdk.errors import SlackApiError @@ -30,10 +29,10 @@ def init(client) -> None: t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time) t.start() - if "reducedfetch" in sys.argv: - logger.warning("Only fetching empty threads for bot messages because of argument 'reducedfetch'") + if os.getenv("REDUCEDFETCH", "false") == "true": + logger.warning("Only fetching empty threads for bot messages because 'REDUCEDFETCH=true'") fetch_missed_thread_messages(reduced=True) - else: # perform these two asyncronously + else: # perform both asyncronously fetch_missed_thread_messages() diff --git a/app/utils_worker/download/browser.py b/app/utils_worker/download/browser.py index fd301da..830f6ae 100644 --- a/app/utils_worker/download/browser.py +++ b/app/utils_worker/download/browser.py @@ -2,7 +2,6 @@ import time import datetime import logging import os 
-import sys import base64 import requests from selenium import webdriver @@ -20,28 +19,34 @@ class PDFDownloader: running = False def start(self): - options=Options() + try: + self.finish() + except: + self.logger.info("gecko driver not yet running") + options = webdriver.FirefoxOptions() options.profile = config["browser_profile_path"] - if "notheadless" in sys.argv: - self.logger.warning("Opening browser GUI because of Argument 'notheadless'") - else: + # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work + + if os.getenv("HEADLESS", "false") == "true": options.add_argument('--headless') + else: + self.logger.warning("Opening browser GUI because of 'HEADLESS=false'") - # Print to pdf - options.set_preference("print_printer", "Mozilla Save to PDF") - options.set_preference("print.always_print_silent", True) - options.set_preference("print.show_print_progress", False) options.set_preference('print.save_as_pdf.links.enabled', True) - # Just save if the filetype is pdf already, does not work! 
+ options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) options.set_preference("browser.download.folderList", 2) # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") # options.set_preference("pdfjs.disabled", True) options.set_preference("browser.download.dir", config["default_download_path"]) - self.logger.info("Now Starting gecko driver") - self.driver = webdriver.Firefox(options=options) + self.logger.info("Starting gecko driver") + self.driver = webdriver.Firefox( + options = options, + service = webdriver.firefox.service.Service( + log_path = f'{config["local_storage_path"]}/geckodriver.log' + )) residues = os.listdir(config["default_download_path"]) for res in residues: @@ -54,6 +59,7 @@ class PDFDownloader: self.start() # relaunch the dl util def finish(self): + self.logger.info("Exiting gecko driver") self.driver.quit() self.running = False diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..8cb90fe --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,36 @@ +# docker compose --env-file env/debug up + + +version: "3.9" +services: + auto_news: + build: . 
+ volumes: + - ${CONTAINER_DATA}:/app/file_storage + - ${HOSTS_FILE}:/etc/hosts + + - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null + - ${XAUTHORITY-/dev/null}:/home/autonews/.Xauthority + network_mode: host + environment: + - DISPLAY=$DISPLAY + - DEBUG=${DEBUG} + - CHECK=${CHECK} + - UPLOAD=${UPLOAD} + - HEADLESS=${HEADLESS} + - REDUCEDFETCH=${REDUCEDFETCH} + + entrypoint: ${ENTRYPOINT:-"python3 runner.py"} # by default launch workers as defined in the Dockerfile + + # geckodriver: + # image: selenium/standalone-firefox:100.0 + # volumes: + # + # - ${CONTAINER_DATA-/dev/null}:/app/file_storage + # - ${FIREFOX_PROFILE}:/auto_news.profile + # - ${HOSTS_FILE}:/etc/hosts + # environment: + # - DISPLAY=$DISPLAY + # - START_XVFB=false + + # network_mode: host \ No newline at end of file diff --git a/env/check b/env/check new file mode 100644 index 0000000..fcd04c6 --- /dev/null +++ b/env/check @@ -0,0 +1,12 @@ +# Does not run any downloads but displays the previously downloaded but not yet checked files. 
Requires display-access via xauth + +CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container +HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts + +XAUTHORITY=$XAUTHORITY + +DEBUG=false +CHECK=true +HEADLESS=true +UPLOAD=false +REDUCEDFETCH=false \ No newline at end of file diff --git a/env/debug b/env/debug new file mode 100644 index 0000000..969b7b3 --- /dev/null +++ b/env/debug @@ -0,0 +1,15 @@ +# Runs in a debugging mode, does not launch anything at all but starts a bash process + +CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container +HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts + +CODE=./ +XAUTHORITY=$XAUTHORITY + +DEBUG=true +CHECK=false +UPLOAD=false +HEADLESS=false +REDUCEDFETCH=false + +ENTRYPOINT="sleep infinity" \ No newline at end of file diff --git a/env/production b/env/production new file mode 100644 index 0000000..d4a9a24 --- /dev/null +++ b/env/production @@ -0,0 +1,10 @@ +# Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup + +CONTAINER_DATA=/mnt/Data/Downloads/auto_news.container +HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts + +DEBUG=false +CHECK=false +UPLOAD=false +HEADLESS=true +REDUCEDFETCH=true \ No newline at end of file diff --git a/env/upload b/env/upload new file mode 100644 index 0000000..4ef11af --- /dev/null +++ b/env/upload @@ -0,0 +1,11 @@ +# Does not run any other workers and only uploads to archive the urls that weren't previously uploaded + +CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container +HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts + + +DEBUG=false +CHECK=false +UPLOAD=true +HEADLESS=true +REDUCEDFETCH=false \ No newline at end of file