Bug fixes, makefile for launch

Makefile (new file, 87 lines)
@@ -0,0 +1,87 @@
+include config/local.env
+export
+
+build:
+	@echo "Building..."
+	docker compose build $(flags)
+
+
+down:
+	@echo "Stopping containers..."
+	docker compose down -t 0 --volumes
+
+
+# Variables specific to debug
+debug: export DEBUG=true
+debug: export HEADFULL=true
+debug: export ENTRYPOINT=/bin/bash
+debug: export CODE=./
+debug:
+	@echo "Running in debug mode..."
+	docker compose up -d geckodriver
+	docker compose run -it --service-ports $(target) $(flags) || true
+	make down
+
+
+production: export DEBUG=false
+production:
+	@echo "Running in production mode..."
+	docker compose run -it --service-ports $(target) $(flags) || true
+	make down
+
+
+nas_sync:
+	@echo "Syncing NAS..."
+	SYNC_FOLDER=$(folder) docker compose run -it nas_sync $(flags) || true
+	docker compose down
+	docker container prune -f
+	make down
+
+
+
+
+## Misc:
+edit_profile: export CODE=./
+edit_profile: export HEADFULL=true
+edit_profile:
+	@echo "Editing profile..."
+	docker compose up -d geckodriver
+	sleep 5
+	docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh || true
+	# runs inside the container
+	make down
+
+
+
+db_interface:
+	docker create \
+	--name pgadmin \
+	-p 8080:80 \
+	-e 'PGADMIN_DEFAULT_EMAIL=${UNAME}@test.com' \
+	-e 'PGADMIN_DEFAULT_PASSWORD=password' \
+	-e 'PGADMIN_CONFIG_ENHANCED_COOKIE_PROTECTION=True' \
+	-e 'PGADMIN_CONFIG_LOGIN_BANNER="Authorised users only!"' \
+	dpage/pgadmin4
+
+	docker start pgadmin
+
+	sleep 5
+
+	# TODO auto add the server to the list displayed in the browser
+	# docker exec pgadmin sh -c "echo ${SERVER_DATA} > /tmp/servers.json"
+	# docker exec pgadmin sh -c "/venv/bin/python setup.py --load-servers /tmp/servers.json --user remy@test.com"
+	@echo "Go to http://localhost:8080 to access the database interface"
+	@echo "Username: ${UNAME}@test.com"
+	@echo "Password: password"
+	@echo "Hit any key to stop (not ctrl+c)"
+	read STOP
+
+	docker stop pgadmin
+	docker rm pgadmin
+
+
+logs:
+	docker compose logs -f $(target) $(flags)
+
+
+	make down
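
The targets above replace the old `launch` script (removed later in this commit). A sketch of typical invocations, using the `target`, `flags` and `folder` variables the Makefile reads (service names come from docker-compose.yaml; the folder value is a hypothetical example):

    $ make build                          # docker compose build
    $ make debug target=news_fetch        # DEBUG=true, headful browser, bash entrypoint
    $ make production target=news_fetch   # DEBUG=false, runs `make down` on exit
    $ make nas_sync folder=2022           # sync one subfolder to the NAS
    $ make logs target=news_fetch         # follow container logs
    $ make down                           # stop containers and drop volumes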
@@ -124,4 +124,4 @@ I use `rsync`. Mounting the NAS locally, I navigate to the location of the local
 `rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`
 where `<remote>` is the location where the NAS is mounted. (options: `R` - relative paths, `a` - archive mode (bundles several preservation flags), `z` - compress data during transfer, `q` - quiet. We also skip most of the file metadata and keep a log of the transfers.)
 
-You can also use your OS' native copy option and select *de not overwrite*. This should only copy the missing files, significantly speeding up the operation.
+You can also use your OS' native copy option and select *do not overwrite*. This should only copy the missing files, significantly speeding up the operation.
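
To preview such a transfer without writing anything, rsync's standard dry-run flag can be added (a sketch; the placeholders are the same as above):

    $ rsync -Razqn --no-perms --no-owner --no-group --log-file=rsync.log <local folder>/ "<remote>"

Here `n` (`--dry-run`) records the would-be transfers in the log without touching the remote.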
@@ -1,8 +0,0 @@
-## Configuration: example
-The files inside this directory (not the ones in `env/`) are a sample of the required configuration.
-
-Please create a copy of these files under `<location of downloads>/config/...`.
-
-> Note:
->
-> Some of the fields are blank, please fill them in as needed.

config/container.yaml (new file, 37 lines)
@@ -0,0 +1,37 @@
+mail:
+  smtp_server: smtp.ethz.ch
+  port: 587
+  sender: "****************"
+  recipient: "****************"
+  uname: "****************"
+  password: "************"
+
+
+slack:
+  bot_id: U02MR1R8UJH
+  archive_id: C02MM7YG1V4
+  debug_id: C02NM2H9J5Q
+  api_wait_time: 90
+  auth_token: "****************"
+  app_token: "****************"
+
+
+database:
+  debug_db: /app/containerdata/debug/downloads.db
+  db_printout: /app/containerdata/backups
+  production_db_name: coss_archiving
+  production_user_name: "ca_rw"
+  production_password: "****************"
+
+  ## user_name: ca_ro
+  ## password: "****************"
+
+
+downloads:
+  local_storage_path: /app/containerdata/files
+  debug_storage_path: /app/containerdata/debug/
+  default_download_path: /app/containerdata/tmp
+  remote_storage_path: /helbing_support/Archiving-Pipeline
+  browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
+  # please keep this exact name
+  browser_print_delay: 3
@@ -1,7 +0,0 @@
-[DATABASE]
-db_name: coss_archiving
-user_name: ****************
-password: ****************
-
-## user_name: ca_ro
-## password: #TK5cLxA^YyoxWjR6

config/local.env (new file, 18 lines)
@@ -0,0 +1,18 @@
+CONTAINER_DATA=***********
+UNAME=***********
+U_ID=***********
+
+DB_HOST=***********
+
+
+OPENCONNECT_URL=***********
+OPENCONNECT_USER=***********
+OPENCONNECT_PASSWORD=***********
+OPENCONNECT_OPTIONS=--authgroup student-net
+
+
+NAS_HOST=***********
+NAS_PATH=/gess_coss_1/helbing_support/Archiving-Pipeline
+NAS_USERNAME=***********
+NAS_PASSWORD=***********
+# Special characters like # need to be escaped (write: \#)
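
These variables are consumed by the new Makefile through its `include config/local.env` / `export` header. To run bare `docker compose` commands without make, the file can be sourced first (a sketch, assuming the values need no extra quoting):

    $ set -a; . config/local.env; set +a
    $ docker compose config   # renders the compose file to verify the substitution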
@@ -1,3 +0,0 @@
-user=remoll
-domain=D
-password=****************
@@ -1,12 +0,0 @@
-settings {
-   logfile    = "/tmp/lsyncd.log",
-   statusFile = "/tmp/lsyncd.status",
-   nodaemon   = true,
-}
-
-sync {
-   default.rsync,
-   source = "/sync/local_files",
-   target = "/sync/remote_files",
-   init = false,
-}
@@ -1,31 +0,0 @@
-[MAIL]
-smtp_server: smtp.ethz.ch
-port: 587
-sender: ****************
-recipient: ****************
-uname: ****************
-password: ****************
-
-
-[SLACK]
-bot_id: U02MR1R8UJH
-archive_id: C02MM7YG1V4
-debug_id: C02NM2H9J5Q
-api_wait_time: 90
-auth_token: ****************
-app_token: ****************
-
-
-[DATABASE]
-download_db_debug: /app/containerdata/debug/downloads.db
-db_printout: /app/containerdata/backups
-
-
-[DOWNLOADS]
-local_storage_path: /app/containerdata/files
-debug_storage_path: /app/containerdata/debug/
-default_download_path: /app/containerdata/tmp
-remote_storage_path: /helbing_support/Archiving-Pipeline
-browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
-# please keep this exact name
-browser_print_delay: 3
@@ -1,4 +0,0 @@
-OPENCONNECT_URL=sslvpn.ethz.ch/student-net
-OPENCONNECT_USER=****************
-OPENCONNECT_PASSWORD=****************
-OPENCONNECT_OPTIONS=--authgroup student-net
@@ -4,8 +4,11 @@ services:
 
   vpn: # Creates a connection behind the ETH Firewall to access NAS and Postgres
     image: wazum/openconnect-proxy:latest
-    env_file:
-      - ${CONTAINER_DATA}/config/vpn.config
+    environment:
+      - OPENCONNECT_URL=${OPENCONNECT_URL}
+      - OPENCONNECT_USER=${OPENCONNECT_USER}
+      - OPENCONNECT_PASSWORD=${OPENCONNECT_PASSWORD}
+      - OPENCONNECT_OPTIONS=${OPENCONNECT_OPTIONS}
     cap_add:
     - NET_ADMIN
     volumes:
@@ -14,25 +17,6 @@ services:
     expose: ["5432"] # exposed here because db_passthrough uses this network. See below for more details
 
 
-  nas_sync: # Syncs locally downloaded files with the NAS-share on nas22.ethz.ch/...
-    depends_on:
-      - vpn
-    network_mode: "service:vpn" # used to establish a connection to the SMB server from inside ETH network
-    build: nas_sync # local folder to build
-    image: nas_sync:latest
-    cap_add: # capabilities needed for mounting the SMB share
-      - SYS_ADMIN
-      - DAC_READ_SEARCH
-    volumes:
-      - ${CONTAINER_DATA}/files:/sync/local_files
-      - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
-      - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
-    command:
-      - nas22.ethz.ch/gess_coss_1/helbing_support/Archiving-Pipeline # first command is the target mount path
-      - lsyncd
-      - /sync/nas_sync.config
-
 
   geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
     image: selenium/standalone-firefox:latest
     shm_size: 2gb
@@ -40,7 +24,6 @@ services:
       - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (e.g. for websites that crash)
       - START_XVFB=${HEADFULL-false}
       - SE_VNC_NO_PASSWORD=1
-      # - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
     volumes:
       - ${CONTAINER_DATA}/dependencies:/firefox_profile/
       - ${CODE:-/dev/null}:/code
@@ -53,7 +36,7 @@ services:
   db_passthrough: # Allows a container on the local network to connect to a service (here postgres) through the vpn
     network_mode: "service:vpn"
     image: alpine/socat:latest
-    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:id-hdb-psgr-cp48.ethz.ch:5432"]
+    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:${DB_HOST}:5432"]
     # expose: ["5432"] We would want this passthrough to expose its ports to the other containers
     # BUT since it uses the same network as the vpn-service, it can't expose ports on its own. 5432 is therefore exposed under service.vpn.expose
 
@@ -62,14 +45,14 @@ services:
     build: news_fetch
     image: news_fetch:latest
     depends_on: # when using docker compose run news_fetch, the dependencies are started as well
-      - nas_sync
       - geckodriver
       - db_passthrough
 
     volumes:
       - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
       - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
     environment:
+      - CONFIG_FILE=/app/config.yaml
      - DEBUG=${DEBUG}
      - UNAME=${UNAME}
     user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
@@ -86,10 +69,33 @@ services:
       - db_passthrough
     volumes:
       - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
       - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
     environment:
+      - CONFIG_FILE=/app/config.yaml
       - UNAME=${UNAME}
     ports:
       - "8080:80" # 80 inside container
     entrypoint: ${ENTRYPOINT:-python app.py} # by default launch workers as defined in the Dockerfile
-    tty: true
+
+  nas_sync:
+    image: alpine:latest
+    volumes:
+      - ${CONTAINER_DATA}/files:/sync/local_files
+      - coss_smb_share:/sync/remote_files
+    command:
+      - /bin/sh
+      - -c
+      - |
+        apk add rsync
+        rsync -av --no-perms --no-owner --no-group --progress /sync/local_files/${SYNC_FOLDER}/ /sync/remote_files/${SYNC_FOLDER} -n
+
+
+volumes:
+  coss_smb_share:
+    driver: local
+    driver_opts:
+      type: cifs
+      o: "addr=${NAS_HOST},nounix,file_mode=0777,dir_mode=0777,domain=D,username=${NAS_USERNAME},password=${NAS_PASSWORD}"
+      device: //${NAS_HOST}${NAS_PATH}
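
Note that the committed rsync command ends in `-n` (`--dry-run`), so `nas_sync` as checked in only reports what it would copy into the CIFS-backed volume. A sketch of the intended flow (the folder name is a hypothetical example):

    $ make nas_sync folder=2022    # dry run while -n is in place
    # drop the trailing -n in docker-compose.yaml to perform the actual sync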

launch (deleted, 70 lines)
@@ -1,70 +0,0 @@
-#!/bin/bash
-set -e
-set -o ignoreeof
-
-echo "Bash script launching COSS_ARCHIVING..."
-
-
-# CHANGE ME ONCE!
-export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
-export UNAME=remy
-export U_ID=1000
-
-
-### Main use cases ###
-if [[ $1 == "debug" ]]
-then
-    export DEBUG=true
-    export HEADFULL=true
-    export CODE=./
-    export ENTRYPOINT=/bin/bash
-    # since service ports does not open ports on implicitly started containers, also start geckodriver:
-    docker compose up -d geckodriver
-
-elif [[ $1 == "production" ]]
-then
-    export DEBUG=false
-
-elif [[ $1 == "build" ]]
-then
-    export DEBUG=false
-    shift
-    docker compose build "$@"
-    exit 0
-
-
-### Manual Shutdown ###
-elif [[ $1 == "down" ]]
-then
-    docker compose down -t 0
-    exit 0
-
-
-
-### Edge cases -> for firefox ###
-elif [[ $1 == "edit_profile" ]]
-then
-    export CODE=./
-    export HEADFULL=true
-
-    docker compose up -d geckodriver
-    sleep 5
-    docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
-    docker compose down -t 0
-
-
-### Fallback ####
-else
-    echo "Please specify the execution mode (debug/production/build/edit_profile/down) as the first argument"
-    exit 1
-fi
-
-
-
-shift # consumes the variable set in $1 so that $@ only contains the remaining arguments
-
-docker compose run -it --service-ports "$@"
-
-echo "Docker run finished, shutting down containers..."
-docker compose down -t 0
-echo "Bye!"
@@ -46,7 +46,13 @@ def fetch():
 
 def show():
     for a in runner.models.ArticleDownload.select():
-        print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}")
+        print(f"""
+        URL: {a.article_url}
+        ARCHIVE_URL: {a.archive_url}
+        ARTICLE_SOURCE: {a.source_name}
+        FILE_NAME: {a.file_name}
+        """)
+
 
 if __name__ == "__main__":
     logger.info("Overwriting production values for single time media-fetch")
@@ -55,7 +61,7 @@ if __name__ == "__main__":
     runner.configuration.models.set_db(
         runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
     )
-    runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
+    runner.configuration.main_config["downloads"]["local_storage_path"] = "../.dev/"
 
 
     if len(sys.argv) == 1: # no additional arguments
@@ -1,9 +0,0 @@
-FROM bash:latest
-# alpine with bash instead of sh
-ENV TZ=Europe/Berlin
-RUN apk add lsyncd cifs-utils rsync
-RUN mkdir -p /sync/remote_files
-COPY entrypoint.sh /sync/entrypoint.sh
-
-
-ENTRYPOINT ["bash", "/sync/entrypoint.sh"]
@@ -1,10 +0,0 @@
-#!/bin/bash
-set -e
-
-sleep 5 # waits for the vpn to have an established connection
-echo "Starting NAS sync"
-mount -t cifs "//$1" -o credentials=/sync/nas_login.config /sync/remote_files
-echo "Successfully mounted SAMBA remote: $1 --> /sync/remote_files"
-shift # consumes the variable set in $1 so that $@ only contains the remaining arguments
-
-exec "$@"
@@ -2,3 +2,4 @@ flask
 peewee
 markdown
 psycopg2
+pyyaml
@@ -1,17 +1,16 @@
 from peewee import PostgresqlDatabase
-import configparser
 import time
+import yaml
+import os
 
-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
-
-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")
-
-cred = db_config["DATABASE"]
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)
+
+cred = config["database"]
 time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
 db = PostgresqlDatabase(
-    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+    cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
 )
 
 import models
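
As the compose file above shows, `CONFIG_FILE` is injected through the container environment. Running the same code outside compose means supplying it by hand, e.g. (a sketch; the host-side path is an assumption, the entrypoint is the one from docker-compose.yaml):

    $ CONFIG_FILE=./config/container.yaml python app.py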
@@ -6,7 +6,7 @@ import os
 import datetime
 import configuration
 
-config = configuration.main_config["DOWNLOADS"]
+downloads_config = configuration.config["downloads"]
 
 # set the nature of the db at runtime
 download_db = DatabaseProxy()
@@ -34,14 +34,14 @@ class ArticleDownload(DownloadBaseModel):
     file_name = TextField(default = '')
     @property
     def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
     @property
     def fname_nas(self, file_name=""):
         if self.download_date:
             if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
             else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
         else:
             return None
 
@@ -1,9 +1,7 @@
-import os
-import configparser
-import logging
 import time
-# import shutil
-# from datetime import datetime
+import os
+import logging
+import yaml
 
 from peewee import SqliteDatabase, PostgresqlDatabase
 from rich.logging import RichHandler
@@ -19,22 +17,21 @@ logger = logging.getLogger(__name__)
 
 
 # load config file containing constants and secrets
-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)
 
 
 # DEBUG MODE:
 if os.getenv("DEBUG", "false") == "true":
     logger.warning("Found 'DEBUG=true', setting up dummy databases")
 
-    main_config["SLACK"]["archive_id"] = main_config["SLACK"]["debug_id"]
-    main_config["MAIL"]["recipient"] = main_config["MAIL"]["sender"]
-    main_config["DOWNLOADS"]["local_storage_path"] = main_config["DOWNLOADS"]["debug_storage_path"]
+    config["slack"]["archive_id"] = config["slack"]["debug_id"]
+    config["mail"]["recipient"] = config["mail"]["sender"]
+    config["downloads"]["local_storage_path"] = config["downloads"]["debug_storage_path"]
 
     download_db = SqliteDatabase(
-        main_config["DATABASE"]["download_db_debug"],
+        config["database"]["debug_db"],
         pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
     )
 
@@ -43,9 +40,9 @@ else:
     logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
 
     time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
-    cred = db_config["DATABASE"]
+    cred = config["database"]
     download_db = PostgresqlDatabase(
-        cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+        cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
     )
     # TODO Reimplement backup/printout
     # logger.info("Backing up databases")
@@ -10,3 +10,4 @@ markdown
 rich
 psycopg2
 unidecode
+pyyaml
@@ -7,15 +7,19 @@ import logging
 import configuration
 
 logger = logging.getLogger(__name__)
-config = configuration.main_config["MAIL"]
+mail_config = configuration.config["mail"]
 
 def send(article_model):
     mail = MIMEMultipart()
     mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
-    mail['From'] = config["sender"]
-    mail['To'] = config["recipient"]
+    mail['From'] = mail_config["sender"]
+    mail['To'] = mail_config["recipient"]
 
-    msg, files = article_model.mail_info() # this is html
+    try:
+        msg, files = article_model.mail_info() # this is html
+    except: # Raised by model if article has no associated file
+        logger.info("Skipping mail sending")
+        return
 
     content = MIMEText(msg, "html")
     mail.attach(content)
@@ -29,14 +33,14 @@ def send(article_model):
 
     try:
         try:
-            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+            smtp = smtplib.SMTP(mail_config["smtp_server"], mail_config["port"])
         except ConnectionRefusedError:
             logger.error("Server refused connection. Is this an error on your side?")
             return False
 
         smtp.starttls()
-        smtp.login(config["uname"], config["password"])
-        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
+        smtp.login(mail_config["uname"], mail_config["password"])
+        smtp.sendmail(mail_config["sender"], mail_config["recipient"], mail.as_string())
         smtp.quit()
         logger.info("Mail successfully sent.")
     except smtplib.SMTPException as e:
@@ -7,7 +7,7 @@ import re
 import time
 
 import configuration
-config = configuration.main_config["SLACK"]
+slack_config = configuration.config["slack"]
 models = configuration.models
 
 class MessageIsUnwanted(Exception):
@@ -61,7 +61,7 @@ class Message:
 
     @property
     def is_by_human(self):
-        return self.user.user_id != config["bot_id"]
+        return self.user.user_id != slack_config["bot_id"]
 
 
     @property
@@ -87,7 +87,7 @@ class BotApp(App):
 
     def say_substitute(self, *args, **kwargs):
         self.client.chat_postMessage(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
             text=" - ".join(args),
             **kwargs
         )
@@ -101,7 +101,7 @@ class BotApp(App):
             last_ts = presaved.slack_ts_full
 
         result = self.client.conversations_history(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
             oldest=last_ts
         )
 
@@ -116,7 +116,7 @@ class BotApp(App):
         while refetch: # we have not actually fetched them all
             try:
                 result = self.client.conversations_history(
-                    channel = config["archive_id"],
+                    channel = slack_config["archive_id"],
                     cursor = result["response_metadata"]["next_cursor"],
                     oldest = last_ts
                 ) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
@@ -126,8 +126,8 @@ class BotApp(App):
                 for m in new_messages:
                     return_messages.append(Message(m))
             except SlackApiError: # Most likely a rate-limit
-                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
-                time.sleep(config["api_wait_time"])
+                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(slack_config["api_wait_time"]))
+                time.sleep(slack_config["api_wait_time"])
                 refetch = True
 
         self.logger.info(f"Fetched {len(return_messages)} new channel messages.")
@@ -181,7 +181,7 @@ class BotRunner():
 
     """Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
     def __init__(self, callback, *args, **kwargs) -> None:
-        self.bot_worker = BotApp(callback, token=config["auth_token"])
+        self.bot_worker = BotApp(callback, token=slack_config["auth_token"])
 
         @self.bot_worker.event(event="message", matchers=[is_message_in_archiving])
         def handle_incoming_message(message, say):
@@ -195,7 +195,7 @@ class BotRunner():
         def handle_all_other_reactions(event, say):
             self.logger.log("Ignoring slack event that isn't a message")
 
-        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
+        self.handler = SocketModeHandler(self.bot_worker, slack_config["app_token"])
 
 
     def start(self):
@@ -215,5 +215,5 @@ class BotRunner():
 
 
 def is_message_in_archiving(message) -> bool:
-    return message["channel"] == config["archive_id"]
+    return message["channel"] == slack_config["archive_id"]
 
@@ -8,8 +8,7 @@ import configuration
 import datetime
 
 from . import helpers
-config = configuration.main_config["DOWNLOADS"]
-slack_config = configuration.main_config["SLACK"]
+downloads_config = configuration.config["downloads"]
 FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
 
 
@@ -34,7 +33,8 @@ class ArticleDownload(DownloadBaseModel):
     def is_title_bad(self):  # add incrementally
         return "PUR-Abo" in self.title \
             or "Redirecting" in self.title \
-            or "Error while running fetch" in self.title
+            or "Error while running fetch" in self.title \
+            or self.title == ""
 
     summary = TextField(default = '')
     source_name = CharField(default = '')
@@ -44,14 +44,14 @@ class ArticleDownload(DownloadBaseModel):
     file_name = TextField(default = '')
     @property
     def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
     @property
     def fname_nas(self, file_name=""):
         if self.download_date:
             if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
             else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
         else:
             return None
     @property
@@ -102,18 +102,22 @@ class ArticleDownload(DownloadBaseModel):
         answer_files = []
         # displays the summary in a blockquote
 
-        status = self.file_status
-        if status == 1: # file_name was empty
-            return None # there has been an error do not send any message
-        elif status == 2: # no file found at specified location
-            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
-        elif status == 3: # file found but deemed too big
-            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
-            answer_text += f"*{self.title}*\n{summary}\n{location}"
-        else: # everything nominal
+        try:
+            self.ensure_file_present()
             answer_text += f"*{self.title}*\n{summary}"
             answer_files.append(self.save_path + self.file_name)
 
+        except Exception as e:
+            msg = e.args[0]
+            logger.error(f"Article {self} has file-issues: {msg}")
+            if "file too big" in msg:
+                location = f"File too big to send directly. Location on NAS:\n`{self.fname_nas}`"
+                answer_text += f"*{self.title}*\n{summary}\n{location}"
+
+            else: # file not found, or filename not set
+                raise e
+                # reraise the exception, so that the caller can handle it
+
         # then the related files
         if self.related:
             rel_text = "Related files on NAS:"
@@ -144,19 +148,14 @@ class ArticleDownload(DownloadBaseModel):
                 related_file_name = r
             )
 
-    @property
-    def file_status(self):
-        """0 = file exists, 1 = no file name!, 2 = file does not exist, 3 = file exists but is too large"""
+    def ensure_file_present(self):
         if not self.file_name:
-            logger.error(f"Article {self} has no filename!")
-            return 2
+            raise Exception("no filename")
         file_path_abs = self.save_path + self.file_name
         if not os.path.exists(file_path_abs):
-            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return 2
+            raise Exception("file not found")
         if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
-            logger.warning(f"Article {self} has a file that exceeds the file size limit.")
-            return 3
+            raise Exception("file too big")
 
 
 
@@ -11,7 +11,7 @@ from selenium import webdriver
 
 import configuration
 
-config = configuration.main_config["DOWNLOADS"]
+download_config = configuration.config["downloads"]
 
 def driver_running(f):
     def wrapper(*args, **kwargs):
@@ -66,74 +66,88 @@ class PDFDownloader:
 
     @driver_running
     def download(self, article_object):
-        sleep_time = int(config["browser_print_delay"])
         url = article_object.article_url
 
-        try:
-            self.driver.get(url)
-        except Exception as e:
-            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
-            self.finish()
-            return article_object  # without changes
-
-        time.sleep(sleep_time)
-        # leave the page time to do any funky business
-
-        # in the mean time, get a page title if required
-        if article_object.is_title_bad:
-            article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
-            # will be propagated to the saved file (dst) as well
-
-        fname = article_object.fname_template
-        fname = ensure_unique(article_object.save_path, fname)
-        dst = os.path.join(article_object.save_path, fname)
-
-        if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
-            success = self.get_exisiting_pdf(url, dst)
-        else:
-            success = self.get_new_pdf(dst)
-
-        if success:
-            article_object.file_name = fname
-        else:
-            article_object.file_name = ""
-
-        return article_object # this change is saved later by the external caller
+        if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
+            self.logger.info("Downloading existing pdf")
+            success = self.get_exisiting_pdf(article_object)
+            # get a page title if required
+            if article_object.is_title_bad:
+                article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
+                # will be propagated to the saved file (dst) as well
+        else:
+            success = self.get_new_pdf(article_object)
+
+        if not success:
+            self.logger.error("Download failed")
+        # TODO: need to reset the file name to empty?
+        return article_object # changes to this are saved later by the external caller
 
 
-    def get_exisiting_pdf(self, url, dst):
+    def get_exisiting_pdf(self, article_object):
+        # get a better page title if required
+        if article_object.is_title_bad:
+            article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
         try:
-            r = requests.get(url)
+            r = requests.get(article_object.article_url)
             bytes = r.content
         except:
             return False
-        return self.get_new_pdf(dst, other_bytes=bytes)
+        return self.write_pdf(bytes, article_object)
 
 
-    def get_new_pdf(self, dst, other_bytes=None):
-        os.makedirs(os.path.dirname(dst), exist_ok=True)
-
-        if other_bytes is None:
-            try:
-                result = self.driver.print_page()
-                bytes = base64.b64decode(result, validate=True)
-            except:
-                self.logger.error("Failed, probably because the driver went extinct.")
-                return False
-        else:
-            bytes = other_bytes
+    def get_new_pdf(self, article_object):
+        sleep_time = int(download_config["browser_print_delay"])
+
+        try:
+            self.driver.get(article_object.article_url)
+        except Exception as e:
+            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
+            self.finish()
+            return False
+
+        time.sleep(sleep_time)
+        # leave the page time to do any funky business
+
+        if article_object.is_title_bad:
+            article_object.title = self.driver.title
+
+        try:
+            result = self.driver.print_page()
+            bytes = base64.b64decode(result, validate=True)
+        except:
+            self.logger.error("Failed, probably because the driver went extinct.")
+            return False
+
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_file_destination(self, article_object):
+        fname = article_object.fname_template
+        fname = ensure_unique(article_object.save_path, fname)
+        dst = os.path.join(article_object.save_path, fname)
+        return dst, fname
+
+
+    def write_pdf(self, content, article_object):
+        dst, fname = self.get_file_destination(article_object)
+        os.makedirs(os.path.dirname(dst), exist_ok=True)
 
         try:
             with open(dst, "wb+") as f:
-                f.write(bytes)
+                f.write(content)
+
+            article_object.file_name = fname
             return True
         except Exception as e:
             self.logger.error(f"Failed, because of FS-operation: {e}")
             return False
 
-    def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
+
+    def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
         os.mkdir(reduced_profile_path)
         # copy needed directories
@@ -1,10 +1,11 @@
 import youtube_dl
 import os
 import logging
+import configuration
 
+download_config = configuration.config["downloads"]
 logger = logging.getLogger(__name__)
 
 
 class MyLogger(object):
     def debug(self, msg): pass
     def warning(self, msg): pass
@@ -19,7 +20,6 @@ class YouTubeDownloader:
 
 
     def post_download_hook(self, ret_code):
-        # print(ret_code)
         if ret_code['status'] == 'finished':
             file_loc = ret_code["filename"]
             fname = os.path.basename(file_loc)
@@ -35,9 +35,11 @@ class YouTubeDownloader:
         ydl_opts = {
             'format': 'best[height<=720]',
             'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
-            'logger': MyLogger(),
+            'logger': MyLogger(), # suppress verbosity
             'progress_hooks': [self.post_download_hook],
-            'updatetime': False
+            'updatetime': False,
+            # File is also used by firefox so make sure to not write to it!
+            # youtube dl apparently does not support cookies.sqlite and the documentation is not clear on how to use cookies.txt
         }
         try:
             with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@@ -46,5 +48,9 @@ class YouTubeDownloader:
         except Exception as e:
             logger.error(f"Youtube download crashed: {e}")
             article_object.file_name = ""
+            logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv")
+            logger.info(f"Logging youtube errors separately to {logfile}")
+            with open(logfile, "a+") as f:
+                f.write(f"{url}\n")
+
         return article_object