Bug fixes, makefile for launch
Makefile | 87 (new file)
							@@ -0,0 +1,87 @@
include config/local.env
export

build:
	@echo "Building..."
	docker compose build $(flags)


down:
	@echo "Stopping containers..."
	docker compose down -t 0 --volumes


# Variables specific to debug
debug: export DEBUG=true
debug: export HEADFULL=true
debug: export ENTRYPOINT=/bin/bash
debug: export CODE=./
debug:
	@echo "Running in debug mode..."
	docker compose up -d geckodriver
	docker compose run -it --service-ports $(target) $(flags) || true
	make down


production: export DEBUG=false
production:
	@echo "Running in production mode..."
	docker compose run -it --service-ports $(target) $(flags) || true
	make down


nas_sync:
	@echo "Syncing NAS..."
	SYNC_FOLDER=$(folder) docker compose run -it nas_sync $(flags) || true
	docker compose down
	docker container prune -f
	make down




## Misc:
edit_profile: export CODE=./
edit_profile: export HEADFULL=true
edit_profile:
	@echo "Editing profile..."
	docker compose up -d geckodriver
	sleep 5
	docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh || true
	# runs inside the container
	make down



db_interface:
	docker create \
	--name pgadmin \
	-p 8080:80 \
	-e 'PGADMIN_DEFAULT_EMAIL=${UNAME}@test.com' \
	-e 'PGADMIN_DEFAULT_PASSWORD=password' \
	-e 'PGADMIN_CONFIG_ENHANCED_COOKIE_PROTECTION=True' \
	-e 'PGADMIN_CONFIG_LOGIN_BANNER="Authorised users only!"' \
	dpage/pgadmin4

	docker start pgadmin

	sleep 5

	# TODO auto add the server to the list displayed in the browser
	# docker exec pgadmin sh -c "echo ${SERVER_DATA} > /tmp/servers.json"
	# docker exec pgadmin sh -c "/venv/bin/python setup.py --load-servers /tmp/servers.json --user remy@test.com"
	@echo "Go to http://localhost:8080 to access the database interface"
	@echo "Username: ${UNAME}@test.com"
	@echo "Password: password"
	@echo "Hit any key to stop (not ctrl+c)"
	read STOP

	docker stop pgadmin
	docker rm pgadmin


logs:
	docker compose logs -f $(target) $(flags)


	make down
@@ -124,4 +124,4 @@ I use `rsync`. Mounting the NAS locally, I navigate to the location of the local
`rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`

where `<remote>` is the location where the NAS is mounted. (Options: `R` - relative paths, `a` - archive mode (bundles several options), `z` - compress during transfer, `q` - quiet. We also skip most of the metadata and keep a log of the transfers.)

-You can also use your OS' native copy option and select *de not overwrite*. This should only copy the missing files, significantly speeding up the operation.
+You can also use your OS' native copy option and select *do not overwrite*. This should only copy the missing files, significantly speeding up the operation.
@@ -1,8 +0,0 @@
## Configuration: example
The files inside this directory (not the ones in `env/`) are a sample of the required configuration.

Please create a copy of these files under `<location of downloads>/config/...`.

> Note:
>
> Some of the fields are blank, please fill them in as needed.
config/container.yaml | 37 (new file)
							@@ -0,0 +1,37 @@
mail:
  smtp_server: smtp.ethz.ch
  port: 587
  sender: "****************"
  recipient: "****************"
  uname: "****************"
  password: "************"


slack:
  bot_id: U02MR1R8UJH
  archive_id: C02MM7YG1V4
  debug_id: C02NM2H9J5Q
  api_wait_time: 90
  auth_token: "****************"
  app_token: "****************"


database:
  debug_db: /app/containerdata/debug/downloads.db
  db_printout: /app/containerdata/backups
  production_db_name: coss_archiving
  production_user_name: "ca_rw"
  production_password: "****************"

  ## user_name: ca_ro
  ## password: "****************"


downloads:
  local_storage_path: /app/containerdata/files
  debug_storage_path: /app/containerdata/debug/
  default_download_path: /app/containerdata/tmp
  remote_storage_path: /helbing_support/Archiving-Pipeline
  browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
  # please keep this exact name
  browser_print_delay: 3
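For reference, this YAML consolidates the `.ini` files that are deleted below and is read with `yaml.safe_load`. A minimal sketch of how the keys above turn into a database connection, mirroring the `configuration.py` hunks further down (the fallback path in `os.getenv` is an assumption, not repo code):

```python
import os
import yaml
from peewee import PostgresqlDatabase

# CONFIG_FILE is injected by docker-compose (see the compose diff below)
config_location = os.getenv("CONFIG_FILE", "config/container.yaml")
with open(config_location, "r") as f:
    config = yaml.safe_load(f)

# nested keys replace the old configparser [SECTION] lookups
cred = config["database"]
db = PostgresqlDatabase(
    cred["production_db_name"],
    user=cred["production_user_name"],
    password=cred["production_password"],
    host="vpn",  # reached through the vpn service's network
    port=5432,
)
```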
@@ -1,7 +0,0 @@
[DATABASE]
db_name: coss_archiving
user_name: ****************
password: ****************

## user_name: ca_ro
## password: #TK5cLxA^YyoxWjR6
config/local.env | 18 (new file)
							@@ -0,0 +1,18 @@
CONTAINER_DATA=***********
UNAME=***********
U_ID=***********

DB_HOST=***********


OPENCONNECT_URL=***********
OPENCONNECT_USER=***********
OPENCONNECT_PASSWORD=***********
OPENCONNECT_OPTIONS=--authgroup student-net


NAS_HOST=***********
NAS_PATH=/gess_coss_1/helbing_support/Archiving-Pipeline
NAS_USERNAME=***********
NAS_PASSWORD=***********
# Special characters like # need to be escaped (write: \#)
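The Makefile's `include config/local.env` plus `export` makes these entries visible to `docker compose`, which forwards a subset into the containers (see the compose diff below). Inside the apps they arrive as ordinary environment variables; a minimal sketch, assuming the container was started through the Makefile targets:

```python
import os

# forwarded by docker-compose (environment: - UNAME=${UNAME})
uname = os.environ["UNAME"]

# DEBUG is exported by the debug:/production: Makefile targets
debug = os.getenv("DEBUG", "false") == "true"
print(f"running as {uname}, debug={debug}")
```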
@@ -1,3 +0,0 @@
user=remoll
domain=D
password=****************
@@ -1,12 +0,0 @@
settings {
   logfile    = "/tmp/lsyncd.log",
   statusFile = "/tmp/lsyncd.status",
   nodaemon   = true,
}

sync {
   default.rsync,
   source = "/sync/local_files",
   target = "/sync/remote_files",
   init = false,
}
@@ -1,31 +0,0 @@
[MAIL]
smtp_server: smtp.ethz.ch
port: 587
sender: ****************
recipient: ****************
uname: ****************
password: ****************


[SLACK]
bot_id: U02MR1R8UJH
archive_id: C02MM7YG1V4
debug_id: C02NM2H9J5Q
api_wait_time: 90
auth_token: ****************
app_token: ****************


[DATABASE]
download_db_debug: /app/containerdata/debug/downloads.db
db_printout: /app/containerdata/backups


[DOWNLOADS]
local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Archiving-Pipeline
browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
# please keep this exact name
browser_print_delay: 3
@@ -1,4 +0,0 @@
OPENCONNECT_URL=sslvpn.ethz.ch/student-net
OPENCONNECT_USER=****************
OPENCONNECT_PASSWORD=****************
OPENCONNECT_OPTIONS=--authgroup student-net
@@ -4,33 +4,17 @@ services:

  vpn: # Creates a connection behind the ETH Firewall to access NAS and Postgres
    image: wazum/openconnect-proxy:latest
-    env_file:
-      - ${CONTAINER_DATA}/config/vpn.config
+    environment:
+      - OPENCONNECT_URL=${OPENCONNECT_URL}
+      - OPENCONNECT_USER=${OPENCONNECT_USER}
+      - OPENCONNECT_PASSWORD=${OPENCONNECT_PASSWORD}
+      - OPENCONNECT_OPTIONS=${OPENCONNECT_OPTIONS}
    cap_add:
    - NET_ADMIN
    volumes:
      - /dev/net/tun:/dev/net/tun
    # alternative to cap_add & volumes: specify privileged: true
    expose: ["5432"] # exposed here because db_passthrough uses this network. See below for more details


-  nas_sync: # Syncs locally downloaded files with the NAS-share on nas22.ethz.ch/...
-    depends_on:
-      - vpn
-    network_mode: "service:vpn" # used to establish a connection to the SMB server from inside ETH network
-    build: nas_sync # local folder to build
-    image: nas_sync:latest
-    cap_add: # capabilities needed for mounting the SMB share
-      - SYS_ADMIN
-      - DAC_READ_SEARCH
-    volumes:
-      - ${CONTAINER_DATA}/files:/sync/local_files
-      - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
-      - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
-    command:
-      - nas22.ethz.ch/gess_coss_1/helbing_support/Archiving-Pipeline # first command is the target mount path
-      - lsyncd
-      - /sync/nas_sync.config


  geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
@@ -40,7 +24,6 @@ services:
      - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
      - START_XVFB=${HEADFULL-false}
      - SE_VNC_NO_PASSWORD=1
-      # - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
    volumes:
      - ${CONTAINER_DATA}/dependencies:/firefox_profile/
      - ${CODE:-/dev/null}:/code
@@ -53,7 +36,7 @@ services:
  db_passthrough: # Allows a container on the local network to connect to a service (here postgres) through the vpn
    network_mode: "service:vpn"
    image: alpine/socat:latest
-    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:id-hdb-psgr-cp48.ethz.ch:5432"]
+    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:${DB_HOST}:5432"]
    # expose: ["5432"] We would want this passthrough to expose its ports to the other containers
    # BUT since it uses the same network as the vpn-service, it can't expose ports on its own. 5432 is therefore exposed under service.vpn.expose
@@ -62,14 +45,14 @@ services:
    build: news_fetch
    image: news_fetch:latest
    depends_on: # when using docker compose run news_fetch, the dependencies are started as well
-      - nas_sync
      - geckodriver
      - db_passthrough

    volumes:
      - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
      - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
    environment:
+      - CONFIG_FILE=/app/config.yaml
      - DEBUG=${DEBUG}
      - UNAME=${UNAME}
    user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
@@ -86,10 +69,33 @@ services:
      - db_passthrough
    volumes:
      - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
      - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
    environment:
+      - CONFIG_FILE=/app/config.yaml
      - UNAME=${UNAME}
    ports:
      - "8080:80" # 80 inside container
    entrypoint: ${ENTRYPOINT:-python app.py} # by default launch workers as defined in the Dockerfile
    tty: true


+  nas_sync:
+    image: alpine:latest
+    volumes:
+      - ${CONTAINER_DATA}/files:/sync/local_files
+      - coss_smb_share:/sync/remote_files
+    command:
+      - /bin/sh
+      - -c
+      - |
+        apk add rsync
+        rsync -av --no-perms --no-owner --no-group --progress /sync/local_files/${SYNC_FOLDER}/ /sync/remote_files/${SYNC_FOLDER} -n


+volumes:
+  coss_smb_share:
+    driver: local
+    driver_opts:
+      type: cifs
+      o: "addr=${NAS_HOST},nounix,file_mode=0777,dir_mode=0777,domain=D,username=${NAS_USERNAME},password=${NAS_PASSWORD}"
+      device: //${NAS_HOST}${NAS_PATH}
launch | 70 (deleted)
@@ -1,70 +0,0 @@
#!/bin/bash
set -e
set -o ignoreeof

echo "Bash script launching COSS_ARCHIVING..."


# CHANGE ME ONCE!
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy
export U_ID=1000


### Main use cases ###
if [[ $1 == "debug" ]]
then
    export DEBUG=true
    export HEADFULL=true
    export CODE=./
    export ENTRYPOINT=/bin/bash
    # since service ports does not open ports on implicitly started containers, also start geckodriver:
    docker compose up -d geckodriver

elif [[ $1 == "production" ]]
then
    export DEBUG=false

elif [[ $1 == "build" ]]
then
    export DEBUG=false
    shift
    docker compose build "$@"
    exit 0


### Manual Shutdown ###
elif [[ $1 == "down" ]]
then
    docker compose down -t 0
    exit 0



### Edge cases -> for firefox ###
elif [[ $1 == "edit_profile" ]]
then
    export CODE=./
    export HEADFULL=true

    docker compose up -d geckodriver
    sleep 5
    docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
    docker compose down -t 0


### Fallback ####
else
    echo "Please specify the execution mode (debug/production/build/edit_profile/down) as the first argument"
    exit 1
fi



shift # consumes the variable set in $1 so that $@ only contains the remaining arguments

docker compose run -it --service-ports "$@"

echo "Docker run finished, shutting down containers..."
docker compose down -t 0
echo "Bye!"
@@ -46,7 +46,13 @@ def fetch():

def show():
    for a in runner.models.ArticleDownload.select():
-        print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}")
+        print(f"""
+        URL: {a.article_url}
+        ARCHIVE_URL: {a.archive_url}
+        ARTICLE_SOURCE: {a.source_name}
+        FILE_NAME: {a.file_name}
+        """)


if __name__ == "__main__":
    logger.info("Overwriting production values for single time media-fetch")
@@ -55,7 +61,7 @@ if __name__ == "__main__":
    runner.configuration.models.set_db(
        runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
    )
-    runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
+    runner.configuration.main_config["downloads"]["local_storage_path"] = "../.dev/"


    if len(sys.argv) == 1: # no additional arguments
@@ -1,9 +0,0 @@
FROM bash:latest
# alpine with bash instead of sh
ENV TZ=Europe/Berlin
RUN apk add lsyncd cifs-utils rsync
RUN mkdir -p /sync/remote_files
COPY entrypoint.sh /sync/entrypoint.sh


ENTRYPOINT ["bash", "/sync/entrypoint.sh"]
@@ -1,10 +0,0 @@
#!/bin/bash
set -e

sleep 5 # waits for the vpn to have an established connection
echo "Starting NAS sync"
mount -t cifs "//$1" -o credentials=/sync/nas_login.config /sync/remote_files
echo "Successfully mounted SAMBA remote: $1 --> /sync/remote_files"
shift # consumes the variable set in $1 so tat $@ only contains the remaining arguments

exec "$@"
@@ -1,4 +1,5 @@
flask
peewee
markdown
-psycopg2
+psycopg2
+pyyaml
@@ -1,17 +1,16 @@
from peewee import PostgresqlDatabase
-import configparser
import time
+import yaml
+import os

-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)

-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")

-cred = db_config["DATABASE"]
+cred = config["database"]
time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
db = PostgresqlDatabase(
-    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+    cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
)

import models
@@ -6,7 +6,7 @@ import os
import datetime
import configuration

-config = configuration.main_config["DOWNLOADS"]
+downloads_config = configuration.config["downloads"]

# set the nature of the db at runtime
download_db = DatabaseProxy()
@@ -34,14 +34,14 @@ class ArticleDownload(DownloadBaseModel):
    file_name = TextField(default = '')
    @property
    def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
    @property
    def fname_nas(self, file_name=""):
        if self.download_date:
            if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
            else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
        else:
            return None

@@ -1,9 +1,7 @@
-import os
-import configparser
-import logging
-import time
-# import shutil
-# from datetime import datetime
+import os
+import logging
+import yaml
from peewee import SqliteDatabase, PostgresqlDatabase
from rich.logging import RichHandler
@@ -19,22 +17,21 @@ logger = logging.getLogger(__name__)


# load config file containing constants and secrets
-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)


# DEBUG MODE:
if os.getenv("DEBUG", "false") == "true":
    logger.warning("Found 'DEBUG=true', setting up dummy databases")

-    main_config["SLACK"]["archive_id"] = main_config["SLACK"]["debug_id"]
-    main_config["MAIL"]["recipient"] = main_config["MAIL"]["sender"]
-    main_config["DOWNLOADS"]["local_storage_path"] = main_config["DOWNLOADS"]["debug_storage_path"]
+    config["slack"]["archive_id"] = config["slack"]["debug_id"]
+    config["mail"]["recipient"] = config["mail"]["sender"]
+    config["downloads"]["local_storage_path"] = config["downloads"]["debug_storage_path"]

    download_db = SqliteDatabase(
-        main_config["DATABASE"]["download_db_debug"],
+        config["database"]["debug_db"],
        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
    )
@@ -43,9 +40,9 @@ else:
    logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")

    time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
-    cred = db_config["DATABASE"]
+    cred = config["database"]
    download_db = PostgresqlDatabase(
-        cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+        cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
    )
    # TODO Reimplement backup/printout
    # logger.info("Backing up databases")
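Both branches bind the database at import time; `models.py` declares `download_db = DatabaseProxy()` so the concrete backend can be chosen at runtime (the manual runner swaps in SQLite via `set_db`). A minimal sketch of that peewee pattern; the model field here is illustrative:

```python
from peewee import DatabaseProxy, Model, SqliteDatabase, TextField

download_db = DatabaseProxy()  # placeholder, bound later

class ArticleDownload(Model):
    title = TextField(default="")  # illustrative field
    class Meta:
        database = download_db

def set_db(database):
    download_db.initialize(database)  # point the proxy at a real backend
    database.create_tables([ArticleDownload])

# debug run: bind a local SQLite file instead of the production Postgres
set_db(SqliteDatabase("/tmp/downloads.db", pragmas={"journal_mode": "wal"}))
```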
@@ -10,3 +10,4 @@ markdown
rich
psycopg2
unidecode
+pyyaml
@@ -7,16 +7,20 @@ import logging
import configuration

logger = logging.getLogger(__name__)
-config = configuration.main_config["MAIL"]
+mail_config = configuration.config["mail"]

def send(article_model):
    mail = MIMEMultipart()
    mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
-    mail['From'] = config["sender"]
-    mail['To'] = config["recipient"]
-
-    msg, files = article_model.mail_info() # this is html
+    mail['From'] = mail_config["sender"]
+    mail['To'] = mail_config["recipient"]
+
+    try:
+        msg, files = article_model.mail_info() # this is html
+    except: # Raised by model if article has no associated file
+        logger.info("Skipping mail sending")
+        return

    content = MIMEText(msg, "html")
    mail.attach(content)
@@ -29,14 +33,14 @@ def send(article_model):
    try:
        try:
-            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+            smtp = smtplib.SMTP(mail_config["smtp_server"], mail_config["port"])
        except ConnectionRefusedError:
            logger.error("Server refused connection. Is this an error on your side?")
            return False

        smtp.starttls()
-        smtp.login(config["uname"], config["password"])
-        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
+        smtp.login(mail_config["uname"], mail_config["password"])
+        smtp.sendmail(mail_config["sender"], mail_config["recipient"], mail.as_string())
        smtp.quit()
        logger.info("Mail successfully sent.")
    except smtplib.SMTPException as e:
@@ -7,7 +7,7 @@ import re
import time

import configuration
-config = configuration.main_config["SLACK"]
+slack_config = configuration.config["slack"]
models = configuration.models

class MessageIsUnwanted(Exception):
@@ -61,7 +61,7 @@ class Message:

    @property
    def is_by_human(self):
-        return self.user.user_id != config["bot_id"]
+        return self.user.user_id != slack_config["bot_id"]


    @property
@@ -87,7 +87,7 @@ class BotApp(App):

    def say_substitute(self, *args, **kwargs):
        self.client.chat_postMessage(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
            text=" - ".join(args),
            **kwargs
        )
@@ -101,7 +101,7 @@ class BotApp(App):
            last_ts = presaved.slack_ts_full

        result = self.client.conversations_history(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
            oldest=last_ts
        )

@@ -116,7 +116,7 @@ class BotApp(App):
        while refetch: # we have not actually fetched them all
            try:
                result = self.client.conversations_history(
-                    channel = config["archive_id"],
+                    channel = slack_config["archive_id"],
                    cursor = result["response_metadata"]["next_cursor"],
                    oldest = last_ts
                ) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
@@ -126,8 +126,8 @@ class BotApp(App):
                for m in new_messages:
                    return_messages.append(Message(m))
            except SlackApiError: # Most likely a rate-limit
-                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
-                time.sleep(config["api_wait_time"])
+                self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(slack_config["api_wait_time"]))
+                time.sleep(slack_config["api_wait_time"])
                refetch = True

        self.logger.info(f"Fetched {len(return_messages)} new channel messages.")
@@ -181,7 +181,7 @@ class BotRunner():

    """Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
    def __init__(self, callback, *args, **kwargs) -> None:
-        self.bot_worker = BotApp(callback, token=config["auth_token"])
+        self.bot_worker = BotApp(callback, token=slack_config["auth_token"])

        @self.bot_worker.event(event="message", matchers=[is_message_in_archiving])
        def handle_incoming_message(message, say):
@@ -195,7 +195,7 @@ class BotRunner():
        def handle_all_other_reactions(event, say):
            self.logger.log("Ignoring slack event that isn't a message")

-        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
+        self.handler = SocketModeHandler(self.bot_worker, slack_config["app_token"])


    def start(self):
@@ -215,5 +215,5 @@ class BotRunner():


def is_message_in_archiving(message) -> bool:
-    return message["channel"] == config["archive_id"]
+    return message["channel"] == slack_config["archive_id"]

@@ -8,8 +8,7 @@ import configuration
import datetime

from . import helpers
-config = configuration.main_config["DOWNLOADS"]
-slack_config = configuration.main_config["SLACK"]
+downloads_config = configuration.config["downloads"]
FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB

@@ -34,7 +33,8 @@ class ArticleDownload(DownloadBaseModel):
    def is_title_bad(self):  # add incrementally
        return "PUR-Abo" in self.title \
            or "Redirecting" in self.title \
-            or "Error while running fetch" in self.title
+            or "Error while running fetch" in self.title \
+            or self.title == ""

    summary = TextField(default = '')
    source_name = CharField(default = '')
@@ -44,14 +44,14 @@ class ArticleDownload(DownloadBaseModel):
    file_name = TextField(default = '')
    @property
    def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
    @property
    def fname_nas(self, file_name=""):
        if self.download_date:
            if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
            else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
        else:
            return None
    @property
@@ -102,18 +102,22 @@ class ArticleDownload(DownloadBaseModel):
        answer_files = []
        # displays the summary in a blockquote

-        status = self.file_status
-        if status == 1: # file_name was empty
-            return None # there has been an error do not send any message
-        elif status == 2: # no file found at specified location
-            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
-        elif status == 3: # file found but deemed too big
-            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
-            answer_text += f"*{self.title}*\n{summary}\n{location}"
-        else: # everything nominal
+        try:
+            self.ensure_file_present()
+            answer_text += f"*{self.title}*\n{summary}"
+            answer_files.append(self.save_path + self.file_name)
+
+        except Exception as e:
+            msg = e.args[0]
+            logger.error(f"Article {self} has file-issues: {msg}")
+            if "file too big" in msg:
+                location = f"File too big to send directly. Location on NAS:\n`{self.fname_nas}`"
+                answer_text += f"*{self.title}*\n{summary}\n{location}"
+
+            else: # file not found, or filename not set
+                raise e
+                # reraise the exception, so that the caller can handle it

        # then the related files
        if self.related:
            rel_text = "Related files on NAS:"
@@ -144,19 +148,14 @@ class ArticleDownload(DownloadBaseModel):
                related_file_name = r
            )

-    @property
-    def file_status(self):
-        """0 = file exists, 1 = no file name!, 2 = file does not exit, 3 = file exists but is too large"""
+    def ensure_file_present(self):
        if not self.file_name:
            logger.error(f"Article {self} has no filename!")
-            return 2
+            raise Exception("no filename")
        file_path_abs = self.save_path + self.file_name
        if not os.path.exists(file_path_abs):
            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return 2
+            raise Exception("file not found")
        if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
            logger.warning(f"Article {self} has a file that exceeds the file size limit.")
-            return 3
+            raise Exception("file too big")

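The old integer status codes become exceptions whose first argument names the failure; `mail_info` above branches on that string. A minimal sketch of the new contract from a caller's side (the two send helpers are hypothetical):

```python
def notify(article):
    try:
        article.ensure_file_present()
        send_with_attachment(article)        # hypothetical helper
    except Exception as e:
        if "file too big" in e.args[0]:
            send_nas_location_only(article)  # hypothetical helper
        else:
            raise  # "no filename" / "file not found" bubble up to the caller
```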
@@ -11,7 +11,7 @@ from selenium import webdriver

import configuration

-config = configuration.main_config["DOWNLOADS"]
+download_config = configuration.config["downloads"]

def driver_running(f):
    def wrapper(*args, **kwargs):
@@ -66,74 +66,88 @@ class PDFDownloader:

    @driver_running
    def download(self, article_object):
-        sleep_time = int(config["browser_print_delay"])
        url = article_object.article_url

        if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
+            self.logger.info("Downloading existing pdf")
+            success = self.get_exisiting_pdf(article_object)
+            # get a page title if required
+            if article_object.is_title_bad:
+                article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
+                # will be propagated to the saved file (dst) as well
+        else:
+            success = self.get_new_pdf(article_object)
+
+        if not success:
+            self.logger.error("Download failed")
+        # TODO: need to reset the file name to empty?
+        return article_object # changes to this are saved later by the external caller
+
+
+    def get_exisiting_pdf(self, article_object):
+        # get a better page title if required
+        if article_object.is_title_bad:
+            article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
        try:
-            self.driver.get(url)
+            r = requests.get(article_object.article_url)
+            bytes = r.content
        except:
            return False
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_new_pdf(self, article_object):
+        sleep_time = int(download_config["browser_print_delay"])
+
+        try:
+            self.driver.get(article_object.article_url)
        except Exception as e:
            self.logger.critical("Selenium .get(url) failed with error {}".format(e))
            self.finish()
-            return article_object  # without changes
+            return False

        time.sleep(sleep_time)
        # leave the page time to do any funky business

        # in the meantime, get a page title if required
        if article_object.is_title_bad:
-            article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
-            # will be propagated to the saved file (dst) as well
+            article_object.title = self.driver.title

        try:
            result = self.driver.print_page()
            bytes = base64.b64decode(result, validate=True)
        except:
            self.logger.error("Failed, probably because the driver went extinct.")
            return False

+        return self.write_pdf(bytes, article_object)


    def get_file_destination(self, article_object):
        fname = article_object.fname_template
        fname = ensure_unique(article_object.save_path, fname)
        dst = os.path.join(article_object.save_path, fname)
        return dst, fname


-        if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
-            success = self.get_exisiting_pdf(url, dst)
-        else:
-            success = self.get_new_pdf(dst)
-
-        if success:
-            article_object.file_name = fname
-        else:
-            article_object.file_name = ""
-
-        return article_object # this change is saved later by the external caller
-
-
-    def get_exisiting_pdf(self, url, dst):
-        try:
-            r = requests.get(url)
-            bytes = r.content
-        except:
-            return False
-        return self.get_new_pdf(dst, other_bytes=bytes)


-    def get_new_pdf(self, dst, other_bytes=None):
+    def write_pdf(self, content, article_object):
+        dst, fname = self.get_file_destination(article_object)
        os.makedirs(os.path.dirname(dst), exist_ok=True)

-        if other_bytes is None:
-            try:
-                result = self.driver.print_page()
-                bytes = base64.b64decode(result, validate=True)
-            except:
-                self.logger.error("Failed, probably because the driver went extinct.")
-                return False
-        else:
-            bytes = other_bytes

        try:
            with open(dst, "wb+") as f:
-                f.write(bytes)
+                f.write(content)

            article_object.file_name = fname
            return True
        except Exception as e:
            self.logger.error(f"Failed, because of FS-operation: {e}")
            return False


-    def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
+    def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
        reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
        os.mkdir(reduced_profile_path)
        # copy needed directories
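For context: the PDF path is built on Selenium 4's `print_page`, which returns the rendered page as a base64-encoded PDF. A standalone sketch of that mechanism, assuming a Selenium server (the geckodriver container) on port 4444; URL and output path are placeholders:

```python
import base64
from selenium import webdriver

driver = webdriver.Remote("http://localhost:4444", options=webdriver.FirefoxOptions())
try:
    driver.get("https://example.com")  # placeholder URL
    pdf_b64 = driver.print_page()      # base64-encoded PDF (Selenium 4)
    with open("/tmp/page.pdf", "wb") as f:
        f.write(base64.b64decode(pdf_b64, validate=True))
finally:
    driver.quit()
```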
@@ -1,10 +1,11 @@
import youtube_dl
import os
import logging
import configuration

+download_config = configuration.config["downloads"]
logger = logging.getLogger(__name__)


class MyLogger(object):
    def debug(self, msg): pass
    def warning(self, msg): pass
@@ -19,7 +20,6 @@ class YouTubeDownloader:


    def post_download_hook(self, ret_code):
-        # print(ret_code)
        if ret_code['status'] == 'finished':
            file_loc = ret_code["filename"]
            fname = os.path.basename(file_loc)
@@ -35,9 +35,11 @@ class YouTubeDownloader:
        ydl_opts = {
            'format': 'best[height<=720]',
            'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
-            'logger': MyLogger(),
+            'logger': MyLogger(), # suppress verbosity
            'progress_hooks': [self.post_download_hook],
-            'updatetime': False
+            'updatetime': False,
+            # File is also used by firefox so make sure to not write to it!
+            # youtube dl apparently does not support cookies.sqlite and the documentation is not clear on how to use cookies.txt
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@@ -46,5 +48,9 @@ class YouTubeDownloader:
        except Exception as e:
            logger.error(f"Youtube download crashed: {e}")
            article_object.file_name = ""
+            logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv")
+            logger.info(f"Logging youtube errors separately to {logfile}")
+            with open(logfile, "a+") as f:
+                f.write(f"{url}\n")

        return article_object
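For context, youtube_dl reports progress through plain dict callbacks; that is how `post_download_hook` above receives the final filename. A minimal sketch of the mechanism (the video URL is a placeholder):

```python
import youtube_dl

def hook(d):
    # called by youtube_dl; 'finished' carries the output filename
    if d["status"] == "finished":
        print("saved to", d["filename"])

ydl_opts = {
    "format": "best[height<=720]",
    "progress_hooks": [hook],
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://www.youtube.com/watch?v=PLACEHOLDER"])
```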