From 647944d23c0357ce1cb41564d692f7f3a89e5a72 Mon Sep 17 00:00:00 2001
From: Remy Moll
Date: Fri, 9 Dec 2022 11:19:45 +0100
Subject: [PATCH] Bug fixes, makefile for launch

---
 Makefile                                    |  87 +++++++++++++++++
 README.md                                   |   2 +-
 config/README.md                            |   8 --
 config/container.yaml                       |  37 ++++++++
 config/db.config.ini                        |   7 --
 config/local.env                            |  18 ++++
 config/nas_login.config                     |   3 -
 config/nas_sync.config                      |  12 ---
 config/news_fetch.config.ini                |  31 ------
 config/vpn.config                           |   4 -
 docker-compose.yaml                         |  58 +++++++-----
 launch                                      |  70 --------------
 manual/gather_media_files.py                |  10 +-
 nas_sync/Dockerfile                         |   9 --
 nas_sync/entrypoint.sh                      |  10 --
 news_check/requirements.txt                 |   3 +-
 news_check/server/configuration.py          |  15 ++-
 news_check/server/models.py                 |   8 +-
 news_fetch/configuration.py                 |  27 +++---
 news_fetch/requirements.txt                 |   1 +
 news_fetch/utils_mail/runner.py             |  20 ++--
 news_fetch/utils_slack/runner.py            |  20 ++--
 news_fetch/utils_storage/models.py          |  47 +++++----
 news_fetch/utils_worker/download/browser.py | 100 +++++++++++---------
 news_fetch/utils_worker/download/youtube.py |  14 ++-
 25 files changed, 321 insertions(+), 300 deletions(-)
 create mode 100644 Makefile
 delete mode 100644 config/README.md
 create mode 100644 config/container.yaml
 delete mode 100644 config/db.config.ini
 create mode 100644 config/local.env
 delete mode 100644 config/nas_login.config
 delete mode 100644 config/nas_sync.config
 delete mode 100644 config/news_fetch.config.ini
 delete mode 100644 config/vpn.config
 delete mode 100644 launch
 delete mode 100644 nas_sync/Dockerfile
 delete mode 100644 nas_sync/entrypoint.sh

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1adc525
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,87 @@
+include config/local.env
+export
+
+build:
+	@echo "Building..."
+	docker compose build $(flags)
+
+
+down:
+	@echo "Stopping containers..."
+	docker compose down -t 0 --volumes
+
+
+# Variables specific to debug
+debug: export DEBUG=true
+debug: export HEADFULL=true
+debug: export ENTRYPOINT=/bin/bash
+debug: export CODE=./
+debug:
+	@echo "Running in debug mode..."
+	docker compose up -d geckodriver
+	docker compose run -it --service-ports $(target) $(flags) || true
+	make down
+
+
+production: export DEBUG=false
+production:
+	@echo "Running in production mode..."
+	docker compose run -it --service-ports $(target) $(flags) || true
+	make down
+
+
+nas_sync:
+	@echo "Syncing NAS..."
+	SYNC_FOLDER=$(folder) docker compose run -it nas_sync $(flags) || true
+	docker compose down
+	docker container prune -f
+	make down
+
+
+
+
+## Misc:
+edit_profile: export CODE=./
+edit_profile: export HEADFULL=true
+edit_profile:
+	@echo "Editing profile..."
+	docker compose up -d geckodriver
+	sleep 5
+	docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh || true
+	# runs inside the container
+	make down
+
+
+
+db_interface:
+	docker create \
+	--name pgadmin \
+	-p 8080:80 \
+	-e 'PGADMIN_DEFAULT_EMAIL=${UNAME}@test.com' \
+	-e 'PGADMIN_DEFAULT_PASSWORD=password' \
+	-e 'PGADMIN_CONFIG_ENHANCED_COOKIE_PROTECTION=True' \
+	-e 'PGADMIN_CONFIG_LOGIN_BANNER="Authorised users only!"' \
+	dpage/pgadmin4
+
+	docker start pgadmin
+
+	sleep 5
+
+	# TODO auto add the server to the list displayed in the browser
+	# docker exec pgadmin sh -c "echo ${SERVER_DATA} > /tmp/servers.json"
+	# docker exec pgadmin sh -c "/venv/bin/python setup.py --load-servers /tmp/servers.json --user remy@test.com"
+	@echo "Go to http://localhost:8080 to access the database interface"
+	@echo "Username: ${UNAME}@test.com"
+	@echo "Password: password"
+	@echo "Hit any key to stop (not ctrl+c)"
+	read STOP
+
+	docker stop pgadmin
+	docker rm pgadmin
+
+
+logs:
+	docker compose logs -f $(target) $(flags)
+
+
+	make down
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e3b6dd..930735d 100644
--- a/README.md
+++ b/README.md
@@ -124,4 +124,4 @@ I use `rsync`. Mounting the NAS locally, I navigate to the location of the local
 `rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log / ""` where `` is the location where the NAS is mounted. (options: `R` - relative paths, `a` - archive mode (multiple actions), `z` - compress, `q` - quiet. We also don't copy most of the metadata and we keep a log of the transfers.)
 
-You can also use your OS' native copy option and select *de not overwrite*. This should only copy the missing files, significantly speeding up the operation.
\ No newline at end of file
+You can also use your OS' native copy option and select *do not overwrite*. This should only copy the missing files, significantly speeding up the operation.
\ No newline at end of file
diff --git a/config/README.md b/config/README.md
deleted file mode 100644
index cefa4d9..0000000
--- a/config/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-## Configuration: example
-The files inside this directory (not the ones in `env/`) are a sample of the required configuration.
-
-Please create a copy of these files under `/config/...`.
-
-> Note:
->
-> Some of the fields are blank, please fill them in as needed.
\ No newline at end of file
diff --git a/config/container.yaml b/config/container.yaml
new file mode 100644
index 0000000..1314b1e
--- /dev/null
+++ b/config/container.yaml
@@ -0,0 +1,37 @@
+mail:
+  smtp_server: smtp.ethz.ch
+  port: 587
+  sender: "****************"
+  recipient: "****************"
+  uname: "****************"
+  password: "************"
+
+
+slack:
+  bot_id: U02MR1R8UJH
+  archive_id: C02MM7YG1V4
+  debug_id: C02NM2H9J5Q
+  api_wait_time: 90
+  auth_token: "****************"
+  app_token: "****************"
+
+
+database:
+  debug_db: /app/containerdata/debug/downloads.db
+  db_printout: /app/containerdata/backups
+  production_db_name: coss_archiving
+  production_user_name: "ca_rw"
+  production_password: "****************"
+
+  ## user_name: ca_ro
+  ## password: "****************"
+
+
+downloads:
+  local_storage_path: /app/containerdata/files
+  debug_storage_path: /app/containerdata/debug/
+  default_download_path: /app/containerdata/tmp
+  remote_storage_path: /helbing_support/Archiving-Pipeline
+  browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
+  # please keep this exact name
+  browser_print_delay: 3
diff --git a/config/db.config.ini b/config/db.config.ini
deleted file mode 100644
index 07c3b01..0000000
--- a/config/db.config.ini
+++ /dev/null
@@ -1,7 +0,0 @@
-[DATABASE]
-db_name: coss_archiving
-user_name: ****************
-password: ****************
-
-## user_name: ca_ro
-## password: #TK5cLxA^YyoxWjR6
\ No newline at end of file
diff --git a/config/local.env b/config/local.env
new file mode 100644
index 0000000..5a16a0f
--- /dev/null
+++ b/config/local.env
@@ -0,0 +1,18 @@
+CONTAINER_DATA=***********
+UNAME=***********
+U_ID=***********
+
+DB_HOST=***********
+
+
+OPENCONNECT_URL=***********
+OPENCONNECT_USER=***********
+OPENCONNECT_PASSWORD=***********
+OPENCONNECT_OPTIONS=--authgroup student-net
+
+
+NAS_HOST=***********
+NAS_PATH=/gess_coss_1/helbing_support/Archiving-Pipeline
+NAS_USERNAME=***********
+NAS_PASSWORD=***********
+# Special characters like # need to be escaped (write: \#)
\ No newline at end of file
diff --git a/config/nas_login.config b/config/nas_login.config
deleted file mode 100644
index 4d325d0..0000000
--- a/config/nas_login.config
+++ /dev/null
@@ -1,3 +0,0 @@
-user=remoll
-domain=D
-password=****************
\ No newline at end of file
diff --git a/config/nas_sync.config b/config/nas_sync.config
deleted file mode 100644
index b5f8a61..0000000
--- a/config/nas_sync.config
+++ /dev/null
@@ -1,12 +0,0 @@
-settings {
-    logfile = "/tmp/lsyncd.log",
-    statusFile = "/tmp/lsyncd.status",
-    nodaemon = true,
-}
-
-sync {
-    default.rsync,
-    source = "/sync/local_files",
-    target = "/sync/remote_files",
-    init = false,
-}
diff --git a/config/news_fetch.config.ini b/config/news_fetch.config.ini
deleted file mode 100644
index 24155c2..0000000
--- a/config/news_fetch.config.ini
+++ /dev/null
@@ -1,31 +0,0 @@
-[MAIL]
-smtp_server: smtp.ethz.ch
-port: 587
-sender: ****************
-recipient: ****************
-uname: ****************
-password: ****************
-
-
-[SLACK]
-bot_id: U02MR1R8UJH
-archive_id: C02MM7YG1V4
-debug_id: C02NM2H9J5Q
-api_wait_time: 90
-auth_token: ****************
-app_token: ****************
-
-
-[DATABASE]
-download_db_debug: /app/containerdata/debug/downloads.db
-db_printout: /app/containerdata/backups
-
-
-[DOWNLOADS]
-local_storage_path: /app/containerdata/files
-debug_storage_path: /app/containerdata/debug/
-default_download_path: /app/containerdata/tmp
-remote_storage_path: /helbing_support/Archiving-Pipeline
-browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
-# please keep this exact name
-browser_print_delay: 3
diff --git a/config/vpn.config b/config/vpn.config
deleted file mode 100644
index a412bc6..0000000
--- a/config/vpn.config
+++ /dev/null
@@ -1,4 +0,0 @@
-OPENCONNECT_URL=sslvpn.ethz.ch/student-net
-OPENCONNECT_USER=****************
-OPENCONNECT_PASSWORD=****************
-OPENCONNECT_OPTIONS=--authgroup student-net
\ No newline at end of file
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 84fa678..c2c9118 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -4,33 +4,17 @@ services:
 
   vpn: # Creates a connection behind the ETH Firewall to access NAS and Postgres
     image: wazum/openconnect-proxy:latest
-    env_file:
-      - ${CONTAINER_DATA}/config/vpn.config
+    environment:
+      - OPENCONNECT_URL=${OPENCONNECT_URL}
+      - OPENCONNECT_USER=${OPENCONNECT_USER}
+      - OPENCONNECT_PASSWORD=${OPENCONNECT_PASSWORD}
+      - OPENCONNECT_OPTIONS=${OPENCONNECT_OPTIONS}
    cap_add:
      - NET_ADMIN
    volumes:
      - /dev/net/tun:/dev/net/tun
    # alternative to cap_add & volumes: specify privileged: true
    expose: ["5432"] # exposed here because db_passthrough uses this network. See below for more details
-
-
-  nas_sync: # Syncs locally downloaded files with the NAS-share on nas22.ethz.ch/...
-    depends_on:
-      - vpn
-    network_mode: "service:vpn" # used to establish a connection to the SMB server from inside ETH network
-    build: nas_sync # local folder to build
-    image: nas_sync:latest
-    cap_add: # capabilities needed for mounting the SMB share
-      - SYS_ADMIN
-      - DAC_READ_SEARCH
-    volumes:
-      - ${CONTAINER_DATA}/files:/sync/local_files
-      - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
-      - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
-    command:
-      - nas22.ethz.ch/gess_coss_1/helbing_support/Archiving-Pipeline # first command is the target mount path
-      - lsyncd
-      - /sync/nas_sync.config
 
 
   geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
@@ -37,9 +21,8 @@ services:
     image: selenium/standalone-firefox:104.0
     environment:
       - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
      - START_XVFB=${HEADFULL-false}
      - SE_VNC_NO_PASSWORD=1
-      # - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
    volumes:
      - ${CONTAINER_DATA}/dependencies:/firefox_profile/
      - ${CODE:-/dev/null}:/code
@@ -53,7 +36,7 @@ services:
   db_passthrough: # Allows a container on the local network to connect to a service (here postgres) through the vpn
     network_mode: "service:vpn"
     image: alpine/socat:latest
-    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:id-hdb-psgr-cp48.ethz.ch:5432"]
+    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:${DB_HOST}:5432"]
     # expose: ["5432"] We would want this passthrough to expose its ports to the other containers
     # BUT since it uses the same network as the vpn-service, it can't expose ports on its own.
     # 5432 is therefore exposed under service.vpn.expose
@@ -62,14 +45,14 @@ services:
     build: news_fetch
     image: news_fetch:latest
     depends_on: # when using docker compose run news_fetch, the dependencies are started as well
-      - nas_sync
      - geckodriver
      - db_passthrough
-
    volumes:
      - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
      - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
    environment:
+      - CONFIG_FILE=/app/config.yaml
      - DEBUG=${DEBUG}
      - UNAME=${UNAME}
    user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
@@ -86,10 +69,33 @@ services:
      - db_passthrough
    volumes:
      - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
      - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
    environment:
+      - CONFIG_FILE=/app/config.yaml
      - UNAME=${UNAME}
    ports:
      - "8080:80" # 80 inside container
    entrypoint: ${ENTRYPOINT:-python app.py} # by default launch workers as defined in the Dockerfile
-    tty: true
+
+
+  nas_sync:
+    image: alpine:latest
+    volumes:
+      - ${CONTAINER_DATA}/files:/sync/local_files
+      - coss_smb_share:/sync/remote_files
+    command:
+      - /bin/sh
+      - -c
+      - |
+        apk add rsync
+        rsync -av --no-perms --no-owner --no-group --progress /sync/local_files/${SYNC_FOLDER}/ /sync/remote_files/${SYNC_FOLDER} -n
+
+
+volumes:
+  coss_smb_share:
+    driver: local
+    driver_opts:
+      type: cifs
+      o: "addr=${NAS_HOST},nounix,file_mode=0777,dir_mode=0777,domain=D,username=${NAS_USERNAME},password=${NAS_PASSWORD}"
+      device: //${NAS_HOST}${NAS_PATH}
diff --git a/launch b/launch
deleted file mode 100644
index b117a19..0000000
--- a/launch
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-set -e
-set -o ignoreeof
-
-echo "Bash script launching COSS_ARCHIVING..."
-
-
-# CHANGE ME ONCE!
-export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
-export UNAME=remy
-export U_ID=1000
-
-
-### Main use cases ###
-if [[ $1 == "debug" ]]
-then
-    export DEBUG=true
-    export HEADFULL=true
-    export CODE=./
-    export ENTRYPOINT=/bin/bash
-    # since service ports does not open ports on implicitly started containers, also start geckodriver:
-    docker compose up -d geckodriver
-
-elif [[ $1 == "production" ]]
-then
-    export DEBUG=false
-
-elif [[ $1 == "build" ]]
-then
-    export DEBUG=false
-    shift
-    docker compose build "$@"
-    exit 0
-
-
-### Manual Shutdown ###
-elif [[ $1 == "down" ]]
-then
-    docker compose down -t 0
-    exit 0
-
-
-
-### Edge cases -> for firefox ###
-elif [[ $1 == "edit_profile" ]]
-then
-    export CODE=./
-    export HEADFULL=true
-
-    docker compose up -d geckodriver
-    sleep 5
-    docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
-    docker compose down -t 0
-
-
-### Fallback ####
-else
-    echo "Please specify the execution mode (debug/production/build/edit_profile/down) as the first argument"
-    exit 1
-fi
-
-
-
-shift # consumes the variable set in $1 so that $@ only contains the remaining arguments
-
-docker compose run -it --service-ports "$@"
-
-echo "Docker run finished, shutting down containers..."
-docker compose down -t 0
-echo "Bye!"
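
The one-shot nas_sync service defined above mounts the NAS share as a CIFS named volume and copies a single subfolder with rsync inside a plain alpine container. Note that the committed rsync command ends in -n (dry run), so the Makefile's nas_sync target only previews the transfer. A usage sketch (the folder value is hypothetical):

    # preview which files would be copied for one month's downloads
    make nas_sync folder=2022/December

    # to perform the actual copy, drop the trailing -n from the rsync
    # command in docker-compose.yaml and re-run the same target
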
diff --git a/manual/gather_media_files.py b/manual/gather_media_files.py
index 246e294..538e072 100644
--- a/manual/gather_media_files.py
+++ b/manual/gather_media_files.py
@@ -46,7 +46,13 @@ def fetch():
 
 def show():
     for a in runner.models.ArticleDownload.select():
-        print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}")
+        print(f"""
+        URL: {a.article_url}
+        ARCHIVE_URL: {a.archive_url}
+        ARTICLE_SOURCE: {a.source_name}
+        FILE_NAME: {a.file_name}
+        """)
+
 
 if __name__ == "__main__":
     logger.info("Overwriting production values for single time media-fetch")
@@ -55,7 +61,7 @@ if __name__ == "__main__":
     runner.configuration.models.set_db(
         runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
     )
-    runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
+    runner.configuration.config["downloads"]["local_storage_path"] = "../.dev/"
 
     if len(sys.argv) == 1: # no additional arguments
diff --git a/nas_sync/Dockerfile b/nas_sync/Dockerfile
deleted file mode 100644
index b79bb0e..0000000
--- a/nas_sync/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM bash:latest
-# alpine with bash instead of sh
-ENV TZ=Europe/Berlin
-RUN apk add lsyncd cifs-utils rsync
-RUN mkdir -p /sync/remote_files
-COPY entrypoint.sh /sync/entrypoint.sh
-
-
-ENTRYPOINT ["bash", "/sync/entrypoint.sh"]
diff --git a/nas_sync/entrypoint.sh b/nas_sync/entrypoint.sh
deleted file mode 100644
index 2b65df4..0000000
--- a/nas_sync/entrypoint.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-set -e
-
-sleep 5 # waits for the vpn to have an established connection
-echo "Starting NAS sync"
-mount -t cifs "//$1" -o credentials=/sync/nas_login.config /sync/remote_files
-echo "Successfully mounted SAMBA remote: $1 --> /sync/remote_files"
-shift # consumes the variable set in $1 so tat $@ only contains the remaining arguments
-
-exec "$@"
diff --git a/news_check/requirements.txt b/news_check/requirements.txt
index 62abb13..3dd1f23 100644
--- a/news_check/requirements.txt
+++ b/news_check/requirements.txt
@@ -1,4 +1,5 @@
 flask
 peewee
 markdown
-psycopg2
\ No newline at end of file
+psycopg2
+pyyaml
\ No newline at end of file
diff --git a/news_check/server/configuration.py b/news_check/server/configuration.py
index f3214da..cc9ce7c 100644
--- a/news_check/server/configuration.py
+++ b/news_check/server/configuration.py
@@ -1,17 +1,16 @@
 from peewee import PostgresqlDatabase
-import configparser
 import time
+import yaml
+import os
 
-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)
 
-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")
-
-cred = db_config["DATABASE"]
+cred = config["database"]
 
 time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
 
 db = PostgresqlDatabase(
-    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+    cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
 )
 
 import models
diff --git a/news_check/server/models.py b/news_check/server/models.py
index e975d75..8d291c5 100644
--- a/news_check/server/models.py
+++ b/news_check/server/models.py
@@ -6,7 +6,7 @@ import os
 import datetime
 import configuration
 
-config = configuration.main_config["DOWNLOADS"]
+downloads_config = configuration.config["downloads"]
 
 # set the nature of the db at runtime
 download_db = DatabaseProxy()
@@ -34,14 +34,14 @@ class ArticleDownload(DownloadBaseModel):
     file_name = TextField(default = '')
     @property
     def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
     @property
     def fname_nas(self, file_name=""):
         if self.download_date:
             if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
             else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
         else:
             return None
diff --git a/news_fetch/configuration.py b/news_fetch/configuration.py
index 75a13b0..0ce9e1c 100644
--- a/news_fetch/configuration.py
+++ b/news_fetch/configuration.py
@@ -1,9 +1,7 @@
-import os
-import configparser
-import logging
 import time
-# import shutil
-# from datetime import datetime
+import os
+import logging
+import yaml
 
 from peewee import SqliteDatabase, PostgresqlDatabase
 from rich.logging import RichHandler
@@ -19,22 +17,21 @@ logger = logging.getLogger(__name__)
 
 
 # load config file containing constants and secrets
-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)
 
 
 # DEBUG MODE:
 if os.getenv("DEBUG", "false") == "true":
     logger.warning("Found 'DEBUG=true', setting up dummy databases")
 
-    main_config["SLACK"]["archive_id"] = main_config["SLACK"]["debug_id"]
-    main_config["MAIL"]["recipient"] = main_config["MAIL"]["sender"]
-    main_config["DOWNLOADS"]["local_storage_path"] = main_config["DOWNLOADS"]["debug_storage_path"]
+    config["slack"]["archive_id"] = config["slack"]["debug_id"]
+    config["mail"]["recipient"] = config["mail"]["sender"]
+    config["downloads"]["local_storage_path"] = config["downloads"]["debug_storage_path"]
 
     download_db = SqliteDatabase(
-        main_config["DATABASE"]["download_db_debug"],
+        config["database"]["debug_db"],
         pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
     )
 
@@ -43,9 +40,9 @@ else:
     logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
 
     time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
-    cred = db_config["DATABASE"]
+    cred = config["database"]
     download_db = PostgresqlDatabase(
-        cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+        cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
     )
     # TODO Reimplement backup/printout
     # logger.info("Backing up databases")
diff --git a/news_fetch/requirements.txt b/news_fetch/requirements.txt
index 29bab81..bda2ce8 100644
--- a/news_fetch/requirements.txt
+++ b/news_fetch/requirements.txt
@@ -10,3 +10,4 @@ markdown
 rich
 psycopg2
 unidecode
+pyyaml
\ No newline at end of file
diff --git a/news_fetch/utils_mail/runner.py b/news_fetch/utils_mail/runner.py
index 4ada4f3..d5e1f71 100644
--- a/news_fetch/utils_mail/runner.py
+++ b/news_fetch/utils_mail/runner.py
@@ -7,16 +7,20 @@ import logging
 import configuration
 
 logger = logging.getLogger(__name__)
-config = configuration.main_config["MAIL"]
+mail_config = configuration.config["mail"]
 
 def send(article_model):
     mail = MIMEMultipart()
     mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
-    mail['From'] = config["sender"]
-    mail['To'] = config["recipient"]
-
-    msg, files = article_model.mail_info() # this is html
+    mail['From'] = mail_config["sender"]
+    mail['To'] = mail_config["recipient"]
+    try:
+        msg, files = article_model.mail_info() # this is html
+    except: # Raised by model if article has no associated file
+        logger.info("Skipping mail sending")
+        return
+
     content = MIMEText(msg, "html")
     mail.attach(content)
 
@@ -29,14 +33,14 @@ def send(article_model):
 
     try:
         try:
-            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+            smtp = smtplib.SMTP(mail_config["smtp_server"], mail_config["port"])
         except ConnectionRefusedError:
             logger.error("Server refused connection. Is this an error on your side?")
             return False
 
         smtp.starttls()
-        smtp.login(config["uname"], config["password"])
-        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
+        smtp.login(mail_config["uname"], mail_config["password"])
+        smtp.sendmail(mail_config["sender"], mail_config["recipient"], mail.as_string())
         smtp.quit()
         logger.info("Mail successfully sent.")
     except smtplib.SMTPException as e:
diff --git a/news_fetch/utils_slack/runner.py b/news_fetch/utils_slack/runner.py
index 9b41a4b..4238810 100644
--- a/news_fetch/utils_slack/runner.py
+++ b/news_fetch/utils_slack/runner.py
@@ -7,7 +7,7 @@ import re
 import time
 import configuration
 
-config = configuration.main_config["SLACK"]
+slack_config = configuration.config["slack"]
 models = configuration.models
 
 class MessageIsUnwanted(Exception):
@@ -61,7 +61,7 @@ class Message:
 
     @property
     def is_by_human(self):
-        return self.user.user_id != config["bot_id"]
+        return self.user.user_id != slack_config["bot_id"]
 
 
     @property
@@ -87,7 +87,7 @@ class BotApp(App):
 
     def say_substitute(self, *args, **kwargs):
         self.client.chat_postMessage(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
             text=" - ".join(args),
             **kwargs
         )
@@ -101,7 +101,7 @@ class BotApp(App):
             last_ts = presaved.slack_ts_full
 
         result = self.client.conversations_history(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
             oldest=last_ts
         )
 
@@ -116,7 +116,7 @@ class BotApp(App):
         while refetch: # we have not actually fetched them all
             try:
                 result = self.client.conversations_history(
-                    channel = config["archive_id"],
+                    channel = slack_config["archive_id"],
                     cursor = result["response_metadata"]["next_cursor"],
                     oldest = last_ts
                 ) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
@@ -126,8 +126,8 @@ class BotApp(App):
                 for m in new_messages:
                     return_messages.append(Message(m))
             except SlackApiError: # Most likely a rate-limit
-                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
-                time.sleep(config["api_wait_time"])
+                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(slack_config["api_wait_time"]))
+                time.sleep(slack_config["api_wait_time"])
                 refetch = True
 
         self.logger.info(f"Fetched {len(return_messages)} new channel messages.")
@@ -181,7 +181,7 @@ class BotRunner():
     """Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
     def __init__(self, callback, *args, **kwargs) -> None:
-        self.bot_worker = BotApp(callback, token=config["auth_token"])
+        self.bot_worker = BotApp(callback, token=slack_config["auth_token"])
 
         @self.bot_worker.event(event="message", matchers=[is_message_in_archiving])
         def handle_incoming_message(message, say):
@@ -195,7 +195,7 @@ class BotRunner():
         def handle_all_other_reactions(event, say):
             self.logger.log("Ignoring slack event that isn't a message")
 
-        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
+        self.handler = SocketModeHandler(self.bot_worker, slack_config["app_token"])
 
 
     def start(self):
@@ -215,5 +215,5 @@ class BotRunner():
 
 
 def is_message_in_archiving(message) -> bool:
-    return message["channel"] == config["archive_id"]
+    return message["channel"] == slack_config["archive_id"]
diff --git a/news_fetch/utils_storage/models.py b/news_fetch/utils_storage/models.py
index f7ae8b9..928d01e 100644
--- a/news_fetch/utils_storage/models.py
+++ b/news_fetch/utils_storage/models.py
@@ -8,8 +8,7 @@ import configuration
 import datetime
 from . import helpers
 
-config = configuration.main_config["DOWNLOADS"]
-slack_config = configuration.main_config["SLACK"]
+downloads_config = configuration.config["downloads"]
 
 FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
 
@@ -34,7 +33,8 @@ class ArticleDownload(DownloadBaseModel):
     def is_title_bad(self): # add incrementally
         return "PUR-Abo" in self.title \
             or "Redirecting" in self.title \
-            or "Error while running fetch" in self.title
+            or "Error while running fetch" in self.title \
+            or self.title == ""
 
     summary = TextField(default = '')
     source_name = CharField(default = '')
@@ -44,14 +44,14 @@ class ArticleDownload(DownloadBaseModel):
     file_name = TextField(default = '')
     @property
     def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
     @property
     def fname_nas(self, file_name=""):
         if self.download_date:
             if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
             else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
         else:
             return None
     @property
@@ -102,18 +102,22 @@ class ArticleDownload(DownloadBaseModel):
         answer_files = []
         # displays the summary in a blockquote
 
-        status = self.file_status
-        if status == 1: # file_name was empty
-            return None # there has been an error do not send any message
-        elif status == 2: # no file found at specified location
-            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
-        elif status == 3: # file found but deemed too big
-            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
-            answer_text += f"*{self.title}*\n{summary}\n{location}"
-        else: # everything nominal
+        try:
+            self.ensure_file_present()
             answer_text += f"*{self.title}*\n{summary}"
             answer_files.append(self.save_path + self.file_name)
+        except Exception as e:
+            msg = e.args[0]
+            logger.error(f"Article {self} has file-issues: {msg}")
+            if "file too big" in msg:
+                location = f"File too big to send directly. Location on NAS:\n`{self.fname_nas}`"
+                answer_text += f"*{self.title}*\n{summary}\n{location}"
+
+            else: # file not found, or filename not set
+                raise e
+                # reraise the exception, so that the caller can handle it
+
         # then the related files
         if self.related:
             rel_text = "Related files on NAS:"
@@ -144,19 +148,14 @@ class ArticleDownload(DownloadBaseModel):
                 related_file_name = r
             )
 
-    @property
-    def file_status(self):
-        """0 = file exists, 1 = no file name!, 2 = file does not exit,3 = file exists but is too large"""
+    def ensure_file_present(self):
         if not self.file_name:
-            logger.error(f"Article {self} has no filename!")
-            return 2
+            raise Exception("no filename")
         file_path_abs = self.save_path + self.file_name
         if not os.path.exists(file_path_abs):
-            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return 2
+            raise Exception("file not found")
         if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
-            logger.warning(f"Article {self} has a file that exceeds the file size limit.")
-            return 3
+            raise Exception("file too big")
diff --git a/news_fetch/utils_worker/download/browser.py b/news_fetch/utils_worker/download/browser.py
index 092f3d7..d20aaec 100644
--- a/news_fetch/utils_worker/download/browser.py
+++ b/news_fetch/utils_worker/download/browser.py
@@ -11,7 +11,7 @@ from selenium import webdriver
 
 import configuration
 
-config = configuration.main_config["DOWNLOADS"]
+download_config = configuration.config["downloads"]
 
 def driver_running(f):
     def wrapper(*args, **kwargs):
@@ -66,74 +66,88 @@ class PDFDownloader:
 
     @driver_running
     def download(self, article_object):
-        sleep_time = int(config["browser_print_delay"])
         url = article_object.article_url
+
+        if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
+            self.logger.info("Downloading existing pdf")
+            success = self.get_exisiting_pdf(article_object)
+            # get a page title if required
+            if article_object.is_title_bad:
+                article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
+                # will be propagated to the saved file (dst) as well
+        else:
+            success = self.get_new_pdf(article_object)
+
+        if not success:
+            self.logger.error("Download failed")
+            # TODO: need to reset the file name to empty?
+        return article_object # changes to this are saved later by the external caller
+
+
+    def get_exisiting_pdf(self, article_object):
+        # get a better page title if required
+        if article_object.is_title_bad:
+            article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
         try:
-            self.driver.get(url)
+            r = requests.get(article_object.article_url)
+            bytes = r.content
+        except:
+            return False
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_new_pdf(self, article_object):
+        sleep_time = int(download_config["browser_print_delay"])
+
+        try:
+            self.driver.get(article_object.article_url)
         except Exception as e:
             self.logger.critical("Selenium .get(url) failed with error {}".format(e))
             self.finish()
-            return article_object # without changes
+            return False
 
         time.sleep(sleep_time) # leave the page time to do any funky business
 
-        # in the mean time, get a page title if required
         if article_object.is_title_bad:
-            article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
-            # will be propagated to the saved file (dst) as well
+            article_object.title = self.driver.title
 
+        try:
+            result = self.driver.print_page()
+            bytes = base64.b64decode(result, validate=True)
+        except:
+            self.logger.error("Failed, probably because the driver went extinct.")
+            return False
+
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_file_destination(self, article_object):
         fname = article_object.fname_template
         fname = ensure_unique(article_object.save_path, fname)
         dst = os.path.join(article_object.save_path, fname)
+        return dst, fname
 
-        if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
-            success = self.get_exisiting_pdf(url, dst)
-        else:
-            success = self.get_new_pdf(dst)
-
-        if success:
-            article_object.file_name = fname
-        else:
-            article_object.file_name = ""
-
-        return article_object # this change is saved later by the external caller
-
-
-    def get_exisiting_pdf(self, url, dst):
-        try:
-            r = requests.get(url)
-            bytes = r.content
-        except:
-            return False
-        return self.get_new_pdf(dst, other_bytes=bytes)
-
-
-    def get_new_pdf(self, dst, other_bytes=None):
+    def write_pdf(self, content, article_object):
+        dst, fname = self.get_file_destination(article_object)
         os.makedirs(os.path.dirname(dst), exist_ok=True)
-
-        if other_bytes is None:
-            try:
-                result = self.driver.print_page()
-                bytes = base64.b64decode(result, validate=True)
-            except:
-                self.logger.error("Failed, probably because the driver went extinct.")
-                return False
-        else:
-            bytes = other_bytes
-
+
         try:
             with open(dst, "wb+") as f:
-                f.write(bytes)
+                f.write(content)
+
+            article_object.file_name = fname
             return True
         except Exception as e:
             self.logger.error(f"Failed, because of FS-operation: {e}")
             return False
 
 
-    def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
+
+
+    def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
         os.mkdir(reduced_profile_path)
         # copy needed directories
diff --git a/news_fetch/utils_worker/download/youtube.py b/news_fetch/utils_worker/download/youtube.py
index a16305b..e08027d 100644
--- a/news_fetch/utils_worker/download/youtube.py
+++ b/news_fetch/utils_worker/download/youtube.py
@@ -1,10 +1,11 @@
 import youtube_dl
 import os
 import logging
+import configuration
 
+download_config = configuration.config["downloads"]
 logger = logging.getLogger(__name__)
-
 
 class MyLogger(object):
     def debug(self, msg): pass
     def warning(self, msg): pass
@@ -19,7 +20,6 @@ class YouTubeDownloader:
 
 
     def post_download_hook(self, ret_code):
-        # print(ret_code)
         if ret_code['status'] == 'finished':
             file_loc = ret_code["filename"]
             fname = os.path.basename(file_loc)
@@ -35,9 +35,11 @@ class YouTubeDownloader:
         ydl_opts = {
             'format': 'best[height<=720]',
             'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
-            'logger': MyLogger(),
+            'logger': MyLogger(), # suppress verbosity
             'progress_hooks': [self.post_download_hook],
-            'updatetime': False
+            'updatetime': False,
+            # File is also used by firefox so make sure to not write to it!
+            # youtube-dl apparently does not support cookies.sqlite and the documentation is not clear on how to use cookies.txt
         }
         try:
             with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@@ -46,5 +48,9 @@ class YouTubeDownloader:
         except Exception as e:
             logger.error(f"Youtube download crashed: {e}")
             article_object.file_name = ""
+            logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv")
+            logger.info(f"Logging youtube errors separately to {logfile}")
+            with open(logfile, "a+") as f:
+                f.write(f"{url}\n")
 
         return article_object
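
Taken together, the configuration changes in this patch collapse news_fetch.config.ini and db.config.ini into the single config/container.yaml, which docker-compose.yaml mounts at /app/config.yaml and announces through the CONFIG_FILE environment variable. A minimal standalone sketch of the resulting lookup pattern (section and key names are those of config/container.yaml above; this is an illustration, not part of the patch):

    import os
    import yaml

    # CONFIG_FILE is set by docker-compose.yaml and points at the mounted config/container.yaml
    with open(os.environ["CONFIG_FILE"], "r") as f:
        config = yaml.safe_load(f)

    # each module binds only the section it needs, mirroring e.g. utils_worker/download/browser.py
    download_config = config["downloads"]
    print(download_config["local_storage_path"])  # -> /app/containerdata/files

Since yaml.safe_load returns plain dictionaries, a missing key now raises KeyError at import time rather than configparser's lookup errors at first use, which is why the modules above bind their section once when they are imported.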