Bug fixes, makefile for launch

This commit is contained in:
parent 24b3bc3b51
commit 647944d23c
Makefile (new file, 87 lines)
@@ -0,0 +1,87 @@
include config/local.env
export

build:
	@echo "Building..."
	docker compose build $(flags)


down:
	@echo "Stopping containers..."
	docker compose down -t 0 --volumes


# Variables specific to debug
debug: export DEBUG=true
debug: export HEADFULL=true
debug: export ENTRYPOINT=/bin/bash
debug: export CODE=./
debug:
	@echo "Running in debug mode..."
	docker compose up -d geckodriver
	docker compose run -it --service-ports $(target) $(flags) || true
	make down


production: export DEBUG=false
production:
	@echo "Running in production mode..."
	docker compose run -it --service-ports $(target) $(flags) || true
	make down


nas_sync:
	@echo "Syncing NAS..."
	SYNC_FOLDER=$(folder) docker compose run -it nas_sync $(flags) || true
	docker compose down
	docker container prune -f
	make down


## Misc:
edit_profile: export CODE=./
edit_profile: export HEADFULL=true
edit_profile:
	@echo "Editing profile..."
	docker compose up -d geckodriver
	sleep 5
	docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh || true
	# runs inside the container
	make down


db_interface:
	docker create \
		--name pgadmin \
		-p 8080:80 \
		-e 'PGADMIN_DEFAULT_EMAIL=${UNAME}@test.com' \
		-e 'PGADMIN_DEFAULT_PASSWORD=password' \
		-e 'PGADMIN_CONFIG_ENHANCED_COOKIE_PROTECTION=True' \
		-e 'PGADMIN_CONFIG_LOGIN_BANNER="Authorised users only!"' \
		dpage/pgadmin4

	docker start pgadmin

	sleep 5

	# TODO auto add the server to the list displayed in the browser
	# docker exec pgadmin sh -c "echo ${SERVER_DATA} > /tmp/servers.json"
	# docker exec pgadmin sh -c "/venv/bin/python setup.py --load-servers /tmp/servers.json --user remy@test.com"
	@echo "Go to http://localhost:8080 to access the database interface"
	@echo "Username: ${UNAME}@test.com"
	@echo "Password: password"
	@echo "Hit any key to stop (not ctrl+c)"
	read STOP

	docker stop pgadmin
	docker rm pgadmin


logs:
	docker compose logs -f $(target) $(flags)
	make down
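For orientation, a few typical invocations of these targets; the concrete values for `target`, `flags`, and `folder` are illustrative, not fixed (`news_fetch` is one of the compose services defined further down):

```bash
make build flags=--no-cache          # extra flags are forwarded to docker compose build
make production target=news_fetch    # run one service with production settings
make debug target=news_fetch         # interactive run: bash entrypoint, visible browser
make nas_sync folder=2022            # sync one subfolder of the local files to the NAS
make logs target=news_fetch          # follow the logs of a running service
```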
@@ -124,4 +124,4 @@ I use `rsync`. Mounting the NAS locally, I navigate to the location of the local
 `rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`
 where `<remote>` is the location where the NAS is mounted. (Options: `R` - relative paths, `a` - archive mode (multiple actions), `z` - compress during transfer, `q` - quiet. We also don't copy most of the metadata and we keep a log of the transfers.)

-You can also use your OS' native copy option and select *de not overwrite*. This should only copy the missing files, significantly speeding up the operation.
+You can also use your OS' native copy option and select *do not overwrite*. This should only copy the missing files, significantly speeding up the operation.
@@ -1,8 +0,0 @@
## Configuration: example

The files inside this directory (not the ones in `env/`) are a sample of the required configuration.

Please create a copy of these files under `<location of downloads>/config/...`.

> Note:
>
> Some of the fields are blank, please fill them in as needed.
config/container.yaml (new file, 37 lines)
@@ -0,0 +1,37 @@
mail:
  smtp_server: smtp.ethz.ch
  port: 587
  sender: "****************"
  recipient: "****************"
  uname: "****************"
  password: "************"


slack:
  bot_id: U02MR1R8UJH
  archive_id: C02MM7YG1V4
  debug_id: C02NM2H9J5Q
  api_wait_time: 90
  auth_token: "****************"
  app_token: "****************"


database:
  debug_db: /app/containerdata/debug/downloads.db
  db_printout: /app/containerdata/backups
  production_db_name: coss_archiving
  production_user_name: "ca_rw"
  production_password: "****************"

  ## user_name: ca_ro
  ## password: "****************"


downloads:
  local_storage_path: /app/containerdata/files
  debug_storage_path: /app/containerdata/debug/
  default_download_path: /app/containerdata/tmp
  remote_storage_path: /helbing_support/Archiving-Pipeline
  browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
  # please keep this exact name
  browser_print_delay: 3
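Since the application loads this file with `yaml.safe_load` (see the configuration changes below), a freshly edited copy can be sanity-checked from the shell; a minimal sketch, assuming the file sits at `config/container.yaml` in the checkout and `pyyaml` is installed locally:

```bash
python3 -c 'import yaml; c = yaml.safe_load(open("config/container.yaml")); print(c["database"]["production_db_name"])'
# expected output: coss_archiving
```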
@@ -1,7 +0,0 @@
[DATABASE]
db_name: coss_archiving
user_name: ****************
password: ****************

## user_name: ca_ro
## password: #TK5cLxA^YyoxWjR6
config/local.env (new file, 18 lines)
@@ -0,0 +1,18 @@
CONTAINER_DATA=***********
UNAME=***********
U_ID=***********

DB_HOST=***********


OPENCONNECT_URL=***********
OPENCONNECT_USER=***********
OPENCONNECT_PASSWORD=***********
OPENCONNECT_OPTIONS=--authgroup student-net


NAS_HOST=***********
NAS_PATH=/gess_coss_1/helbing_support/Archiving-Pipeline
NAS_USERNAME=***********
NAS_PASSWORD=***********
# Special characters like # need to be escaped (write: \#)
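The Makefile's `include config/local.env` followed by a bare `export` makes all of these variables visible to the `docker compose` child processes. The same effect can be reproduced by hand when debugging outside of make; a sketch:

```bash
set -a                     # auto-export every variable assigned below
source config/local.env    # CONTAINER_DATA, UNAME, DB_HOST, NAS_*, ...
set +a
docker compose config      # render the compose file with the values substituted
```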
@@ -1,3 +0,0 @@
user=remoll
domain=D
password=****************
@@ -1,12 +0,0 @@
settings {
    logfile = "/tmp/lsyncd.log",
    statusFile = "/tmp/lsyncd.status",
    nodaemon = true,
}

sync {
    default.rsync,
    source = "/sync/local_files",
    target = "/sync/remote_files",
    init = false,
}
@@ -1,31 +0,0 @@
[MAIL]
smtp_server: smtp.ethz.ch
port: 587
sender: ****************
recipient: ****************
uname: ****************
password: ****************


[SLACK]
bot_id: U02MR1R8UJH
archive_id: C02MM7YG1V4
debug_id: C02NM2H9J5Q
api_wait_time: 90
auth_token: ****************
app_token: ****************


[DATABASE]
download_db_debug: /app/containerdata/debug/downloads.db
db_printout: /app/containerdata/backups


[DOWNLOADS]
local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Archiving-Pipeline
browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
# please keep this exact name
browser_print_delay: 3
@@ -1,4 +0,0 @@
OPENCONNECT_URL=sslvpn.ethz.ch/student-net
OPENCONNECT_USER=****************
OPENCONNECT_PASSWORD=****************
OPENCONNECT_OPTIONS=--authgroup student-net
@@ -4,33 +4,17 @@ services:

   vpn: # Creates a connection behind the ETH Firewall to access NAS and Postgres
     image: wazum/openconnect-proxy:latest
-    env_file:
-      - ${CONTAINER_DATA}/config/vpn.config
+    environment:
+      - OPENCONNECT_URL=${OPENCONNECT_URL}
+      - OPENCONNECT_USER=${OPENCONNECT_USER}
+      - OPENCONNECT_PASSWORD=${OPENCONNECT_PASSWORD}
+      - OPENCONNECT_OPTIONS=${OPENCONNECT_OPTIONS}
     cap_add:
       - NET_ADMIN
     volumes:
       - /dev/net/tun:/dev/net/tun
     # alternative to cap_add & volumes: specify privileged: true
     expose: ["5432"] # exposed here because db_passthrough uses this network. See below for more details


-  nas_sync: # Syncs locally downloaded files with the NAS-share on nas22.ethz.ch/...
-    depends_on:
-      - vpn
-    network_mode: "service:vpn" # used to establish a connection to the SMB server from inside ETH network
-    build: nas_sync # local folder to build
-    image: nas_sync:latest
-    cap_add: # capabilities needed for mounting the SMB share
-      - SYS_ADMIN
-      - DAC_READ_SEARCH
-    volumes:
-      - ${CONTAINER_DATA}/files:/sync/local_files
-      - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
-      - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
-    command:
-      - nas22.ethz.ch/gess_coss_1/helbing_support/Archiving-Pipeline # first command is the target mount path
-      - lsyncd
-      - /sync/nas_sync.config
-
-
   geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
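The four values that used to live in the now-deleted `config/vpn.config` are passed through from `config/local.env` instead. Whether compose picks them up can be checked without starting anything; a sketch:

```bash
docker compose config | grep OPENCONNECT_
```

`docker compose config` prints the file with all `${...}` references substituted, so an unset variable shows up as an empty value (with a warning).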
@@ -40,7 +24,6 @@ services:
       - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
       - START_XVFB=${HEADFULL-false}
       - SE_VNC_NO_PASSWORD=1
-      # - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
     volumes:
       - ${CONTAINER_DATA}/dependencies:/firefox_profile/
       - ${CODE:-/dev/null}:/code
@@ -53,7 +36,7 @@ services:
   db_passthrough: # Allows a container on the local network to connect to a service (here postgres) through the vpn
     network_mode: "service:vpn"
     image: alpine/socat:latest
-    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:id-hdb-psgr-cp48.ethz.ch:5432"]
+    command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:${DB_HOST}:5432"]
     # expose: ["5432"] We would want this passthrough to expose its ports to the other containers
     # BUT since it uses the same network as the vpn-service, it can't expose ports on its own. 5432 is therefore exposed under service.vpn.expose

@@ -62,14 +45,14 @@ services:
     build: news_fetch
     image: news_fetch:latest
     depends_on: # when using docker compose run news_fetch, the dependencies are started as well
-      - nas_sync
       - geckodriver
       - db_passthrough

     volumes:
       - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
       - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
     environment:
+      - CONFIG_FILE=/app/config.yaml
       - DEBUG=${DEBUG}
       - UNAME=${UNAME}
     user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
@@ -86,10 +69,33 @@ services:
       - db_passthrough
     volumes:
       - ${CONTAINER_DATA}:/app/containerdata # always set
+      - ./config/container.yaml:/app/config.yaml
       - ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
     environment:
+      - CONFIG_FILE=/app/config.yaml
       - UNAME=${UNAME}
     ports:
       - "8080:80" # 80 inside container
     entrypoint: ${ENTRYPOINT:-python app.py} # by default launch workers as defined in the Dockerfile
-    tty: true
+
+
+  nas_sync:
+    image: alpine:latest
+    volumes:
+      - ${CONTAINER_DATA}/files:/sync/local_files
+      - coss_smb_share:/sync/remote_files
+    command:
+      - /bin/sh
+      - -c
+      - |
+        apk add rsync
+        rsync -av --no-perms --no-owner --no-group --progress /sync/local_files/${SYNC_FOLDER}/ /sync/remote_files/${SYNC_FOLDER} -n
+
+
+volumes:
+  coss_smb_share:
+    driver: local
+    driver_opts:
+      type: cifs
+      o: "addr=${NAS_HOST},nounix,file_mode=0777,dir_mode=0777,domain=D,username=${NAS_USERNAME},password=${NAS_PASSWORD}"
+      device: //${NAS_HOST}${NAS_PATH}
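Note that the `rsync` invocation above ends in `-n`, i.e. a dry run: files are listed but nothing is actually copied, presumably a safety default while this CIFS-volume replacement for the old lsyncd container is being tested. Dropping the flag performs the real sync. Through the Makefile this is driven as follows (the folder value is an example):

```bash
make nas_sync folder=2022   # runs rsync inside the alpine container against the CIFS mount
```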
launch (deleted, 70 lines)
@@ -1,70 +0,0 @@
#!/bin/bash
set -e
set -o ignoreeof

echo "Bash script launching COSS_ARCHIVING..."


# CHANGE ME ONCE!
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy
export U_ID=1000


### Main use cases ###
if [[ $1 == "debug" ]]
then
    export DEBUG=true
    export HEADFULL=true
    export CODE=./
    export ENTRYPOINT=/bin/bash
    # since service ports does not open ports on implicitly started containers, also start geckodriver:
    docker compose up -d geckodriver

elif [[ $1 == "production" ]]
then
    export DEBUG=false

elif [[ $1 == "build" ]]
then
    export DEBUG=false
    shift
    docker compose build "$@"
    exit 0


### Manual Shutdown ###
elif [[ $1 == "down" ]]
then
    docker compose down -t 0
    exit 0



### Edge cases -> for firefox ###
elif [[ $1 == "edit_profile" ]]
then
    export CODE=./
    export HEADFULL=true

    docker compose up -d geckodriver
    sleep 5
    docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
    docker compose down -t 0


### Fallback ####
else
    echo "Please specify the execution mode (debug/production/build/edit_profile/down) as the first argument"
    exit 1
fi



shift # consumes the variable set in $1 so that $@ only contains the remaining arguments

docker compose run -it --service-ports "$@"

echo "Docker run finished, shutting down containers..."
docker compose down -t 0
echo "Bye!"
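This launcher is what the new Makefile replaces (the commit title says as much); the hard-coded `CONTAINER_DATA`, `UNAME`, and `U_ID` exports move into `config/local.env`. The rough correspondence between old and new invocations (service name illustrative):

```bash
./launch debug news_fetch        # before  ->  make debug target=news_fetch
./launch production news_fetch   # before  ->  make production target=news_fetch
./launch build                   # before  ->  make build
./launch down                    # before  ->  make down
./launch edit_profile            # before  ->  make edit_profile
```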
@@ -46,7 +46,13 @@ def fetch():

 def show():
     for a in runner.models.ArticleDownload.select():
-        print(f"URL: {a.article_url} \nARCHIVE_URL: {a.archive_url} \nFILE_NAME: {a.file_name}")
+        print(f"""
+        URL: {a.article_url}
+        ARCHIVE_URL: {a.archive_url}
+        ARTICLE_SOURCE: {a.source_name}
+        FILE_NAME: {a.file_name}
+        """)


 if __name__ == "__main__":
     logger.info("Overwriting production values for single time media-fetch")
@@ -55,7 +61,7 @@ if __name__ == "__main__":
     runner.configuration.models.set_db(
         runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
     )
-    runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
+    runner.configuration.main_config["downloads"]["local_storage_path"] = "../.dev/"


     if len(sys.argv) == 1: # no additional arguments
@@ -1,9 +0,0 @@
FROM bash:latest
# alpine with bash instead of sh
ENV TZ=Europe/Berlin
RUN apk add lsyncd cifs-utils rsync
RUN mkdir -p /sync/remote_files
COPY entrypoint.sh /sync/entrypoint.sh


ENTRYPOINT ["bash", "/sync/entrypoint.sh"]
@@ -1,10 +0,0 @@
#!/bin/bash
set -e

sleep 5 # waits for the vpn to have an established connection
echo "Starting NAS sync"
mount -t cifs "//$1" -o credentials=/sync/nas_login.config /sync/remote_files
echo "Successfully mounted SAMBA remote: $1 --> /sync/remote_files"
shift # consumes the variable set in $1 so that $@ only contains the remaining arguments

exec "$@"
@@ -1,4 +1,5 @@
 flask
 peewee
 markdown
 psycopg2
+pyyaml
@@ -1,17 +1,16 @@
 from peewee import PostgresqlDatabase
-import configparser
 import time
+import yaml
+import os

-main_config = configparser.ConfigParser()
-main_config.read("/app/containerdata/config/news_fetch.config.ini")
+config_location = os.getenv("CONFIG_FILE")
+with open(config_location, "r") as f:
+    config = yaml.safe_load(f)

-db_config = configparser.ConfigParser()
-db_config.read("/app/containerdata/config/db.config.ini")
-
-cred = db_config["DATABASE"]
+cred = config["database"]

 time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
 db = PostgresqlDatabase(
-    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
+    cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
 )

 import models
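Both compose files mount `./config/container.yaml` to `/app/config.yaml` and set `CONFIG_FILE=/app/config.yaml`, which is exactly the path this module reads. Running the same code outside a container only needs the variable pointed at a local copy; a sketch:

```bash
CONFIG_FILE=./config/container.yaml python3 app.py   # app.py per the compose entrypoint
```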
@@ -6,7 +6,7 @@ import os
 import datetime
 import configuration

-config = configuration.main_config["DOWNLOADS"]
+downloads_config = configuration.config["downloads"]

 # set the nature of the db at runtime
 download_db = DatabaseProxy()
|
|||||||
file_name = TextField(default = '')
|
file_name = TextField(default = '')
|
||||||
@property
|
@property
|
||||||
def save_path(self):
|
def save_path(self):
|
||||||
return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
|
return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
|
||||||
@property
|
@property
|
||||||
def fname_nas(self, file_name=""):
|
def fname_nas(self, file_name=""):
|
||||||
if self.download_date:
|
if self.download_date:
|
||||||
if file_name:
|
if file_name:
|
||||||
return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
|
return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
|
||||||
else: # return the self. name
|
else: # return the self. name
|
||||||
return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
|
return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@@ -1,9 +1,7 @@
-import os
-import configparser
-import logging
 import time
-# import shutil
-# from datetime import datetime
+import os
+import logging
+import yaml
 from peewee import SqliteDatabase, PostgresqlDatabase
 from rich.logging import RichHandler

|
|||||||
|
|
||||||
|
|
||||||
# load config file containing constants and secrets
|
# load config file containing constants and secrets
|
||||||
main_config = configparser.ConfigParser()
|
config_location = os.getenv("CONFIG_FILE")
|
||||||
main_config.read("/app/containerdata/config/news_fetch.config.ini")
|
with open(config_location, "r") as f:
|
||||||
db_config = configparser.ConfigParser()
|
config = yaml.safe_load(f)
|
||||||
db_config.read("/app/containerdata/config/db.config.ini")
|
|
||||||
|
|
||||||
|
|
||||||
# DEBUG MODE:
|
# DEBUG MODE:
|
||||||
if os.getenv("DEBUG", "false") == "true":
|
if os.getenv("DEBUG", "false") == "true":
|
||||||
logger.warning("Found 'DEBUG=true', setting up dummy databases")
|
logger.warning("Found 'DEBUG=true', setting up dummy databases")
|
||||||
|
|
||||||
main_config["SLACK"]["archive_id"] = main_config["SLACK"]["debug_id"]
|
config["slack"]["archive_id"] = config["slack"]["debug_id"]
|
||||||
main_config["MAIL"]["recipient"] = main_config["MAIL"]["sender"]
|
config["mail"]["recipient"] = config["mail"]["sender"]
|
||||||
main_config["DOWNLOADS"]["local_storage_path"] = main_config["DOWNLOADS"]["debug_storage_path"]
|
config["downloads"]["local_storage_path"] = config["downloads"]["debug_storage_path"]
|
||||||
|
|
||||||
download_db = SqliteDatabase(
|
download_db = SqliteDatabase(
|
||||||
main_config["DATABASE"]["download_db_debug"],
|
config["database"]["debug_db"],
|
||||||
pragmas = {'journal_mode': 'wal'} # mutliple threads can read at once
|
pragmas = {'journal_mode': 'wal'} # mutliple threads can read at once
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -43,9 +40,9 @@ else:
|
|||||||
logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
|
logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
|
||||||
|
|
||||||
time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
|
time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
|
||||||
cred = db_config["DATABASE"]
|
cred = config["database"]
|
||||||
download_db = PostgresqlDatabase(
|
download_db = PostgresqlDatabase(
|
||||||
cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
|
cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
|
||||||
)
|
)
|
||||||
# TODO Reimplement backup/printout
|
# TODO Reimplement backup/printout
|
||||||
# logger.info("Backing up databases")
|
# logger.info("Backing up databases")
|
||||||
|
@@ -10,3 +10,4 @@ markdown
 rich
 psycopg2
 unidecode
+pyyaml
@@ -7,16 +7,20 @@ import logging
 import configuration

 logger = logging.getLogger(__name__)
-config = configuration.main_config["MAIL"]
+mail_config = configuration.config["mail"]

 def send(article_model):
     mail = MIMEMultipart()
     mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
-    mail['From'] = config["sender"]
-    mail['To'] = config["recipient"]
+    mail['From'] = mail_config["sender"]
+    mail['To'] = mail_config["recipient"]

-    msg, files = article_model.mail_info() # this is html
-
+    try:
+        msg, files = article_model.mail_info() # this is html
+    except: # Raised by model if article has no associated file
+        logger.info("Skipping mail sending")
+        return

     content = MIMEText(msg, "html")
     mail.attach(content)

@@ -29,14 +33,14 @@ def send(article_model):

     try:
         try:
-            smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+            smtp = smtplib.SMTP(mail_config["smtp_server"], mail_config["port"])
         except ConnectionRefusedError:
             logger.error("Server refused connection. Is this an error on your side?")
             return False

         smtp.starttls()
-        smtp.login(config["uname"], config["password"])
-        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
+        smtp.login(mail_config["uname"], mail_config["password"])
+        smtp.sendmail(mail_config["sender"], mail_config["recipient"], mail.as_string())
         smtp.quit()
         logger.info("Mail successfully sent.")
     except smtplib.SMTPException as e:
@@ -7,7 +7,7 @@ import re
 import time

 import configuration
-config = configuration.main_config["SLACK"]
+slack_config = configuration.config["slack"]
 models = configuration.models

 class MessageIsUnwanted(Exception):
@@ -61,7 +61,7 @@ class Message:

     @property
     def is_by_human(self):
-        return self.user.user_id != config["bot_id"]
+        return self.user.user_id != slack_config["bot_id"]


     @property
@@ -87,7 +87,7 @@ class BotApp(App):

     def say_substitute(self, *args, **kwargs):
         self.client.chat_postMessage(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
             text=" - ".join(args),
             **kwargs
         )
@@ -101,7 +101,7 @@ class BotApp(App):
             last_ts = presaved.slack_ts_full

         result = self.client.conversations_history(
-            channel=config["archive_id"],
+            channel=slack_config["archive_id"],
             oldest=last_ts
         )

|
|||||||
while refetch: # we have not actually fetched them all
|
while refetch: # we have not actually fetched them all
|
||||||
try:
|
try:
|
||||||
result = self.client.conversations_history(
|
result = self.client.conversations_history(
|
||||||
channel = config["archive_id"],
|
channel = slack_config["archive_id"],
|
||||||
cursor = result["response_metadata"]["next_cursor"],
|
cursor = result["response_metadata"]["next_cursor"],
|
||||||
oldest = last_ts
|
oldest = last_ts
|
||||||
) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
|
) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
|
||||||
@@ -126,8 +126,8 @@ class BotApp(App):
             for m in new_messages:
                 return_messages.append(Message(m))
         except SlackApiError: # Most likely a rate-limit
-            self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
-            time.sleep(config["api_wait_time"])
+            self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(slack_config["api_wait_time"]))
+            time.sleep(slack_config["api_wait_time"])
             refetch = True

         self.logger.info(f"Fetched {len(return_messages)} new channel messages.")
@@ -181,7 +181,7 @@ class BotRunner():

     """Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
     def __init__(self, callback, *args, **kwargs) -> None:
-        self.bot_worker = BotApp(callback, token=config["auth_token"])
+        self.bot_worker = BotApp(callback, token=slack_config["auth_token"])

         @self.bot_worker.event(event="message", matchers=[is_message_in_archiving])
         def handle_incoming_message(message, say):
@@ -195,7 +195,7 @@ class BotRunner():
         def handle_all_other_reactions(event, say):
             self.logger.log("Ignoring slack event that isn't a message")

-        self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
+        self.handler = SocketModeHandler(self.bot_worker, slack_config["app_token"])


     def start(self):
@@ -215,5 +215,5 @@ class BotRunner():


 def is_message_in_archiving(message) -> bool:
-    return message["channel"] == config["archive_id"]
+    return message["channel"] == slack_config["archive_id"]

@@ -8,8 +8,7 @@ import configuration
 import datetime

 from . import helpers
-config = configuration.main_config["DOWNLOADS"]
-slack_config = configuration.main_config["SLACK"]
+downloads_config = configuration.config["downloads"]
 FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB

@@ -34,7 +33,8 @@ class ArticleDownload(DownloadBaseModel):
     def is_title_bad(self): # add incrementally
         return "PUR-Abo" in self.title \
             or "Redirecting" in self.title \
-            or "Error while running fetch" in self.title
+            or "Error while running fetch" in self.title \
+            or self.title == ""

     summary = TextField(default = '')
     source_name = CharField(default = '')
@@ -44,14 +44,14 @@ class ArticleDownload(DownloadBaseModel):
     file_name = TextField(default = '')
     @property
     def save_path(self):
-        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
+        return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
     @property
     def fname_nas(self, file_name=""):
         if self.download_date:
             if file_name:
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
             else: # return the self. name
-                return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
+                return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
         else:
             return None
     @property
@@ -102,18 +102,22 @@ class ArticleDownload(DownloadBaseModel):
         answer_files = []
         # displays the summary in a blockquote

-        status = self.file_status
-        if status == 1: # file_name was empty
-            return None # there has been an error do not send any message
-        elif status == 2: # no file found at specified location
-            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
-        elif status == 3: # file found but deemed too big
-            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
-            answer_text += f"*{self.title}*\n{summary}\n{location}"
-        else: # everything nominal
+        try:
+            self.ensure_file_present()
             answer_text += f"*{self.title}*\n{summary}"
             answer_files.append(self.save_path + self.file_name)
+        except Exception as e:
+            msg = e.args[0]
+            logger.error(f"Article {self} has file-issues: {msg}")
+            if "file too big" in msg:
+                location = f"File too big to send directly. Location on NAS:\n`{self.fname_nas}`"
+                answer_text += f"*{self.title}*\n{summary}\n{location}"
+            else: # file not found, or filename not set
+                raise e
+                # reraise the exception, so that the caller can handle it

         # then the related files
         if self.related:
             rel_text = "Related files on NAS:"
@@ -144,19 +148,14 @@ class ArticleDownload(DownloadBaseModel):
                 related_file_name = r
             )

-    @property
-    def file_status(self):
-        """0 = file exists, 1 = no file name!, 2 = file does not exit, 3 = file exists but is too large"""
+    def ensure_file_present(self):
         if not self.file_name:
-            logger.error(f"Article {self} has no filename!")
-            return 2
+            raise Exception("no filename")
         file_path_abs = self.save_path + self.file_name
         if not os.path.exists(file_path_abs):
-            logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return 2
+            raise Exception("file not found")
         if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
-            logger.warning(f"Article {self} has a file that exceeds the file size limit.")
-            return 3
+            raise Exception("file too big")

@@ -11,7 +11,7 @@ from selenium import webdriver

 import configuration

-config = configuration.main_config["DOWNLOADS"]
+download_config = configuration.config["downloads"]

 def driver_running(f):
     def wrapper(*args, **kwargs):
@@ -66,74 +66,88 @@ class PDFDownloader:

     @driver_running
     def download(self, article_object):
-        sleep_time = int(config["browser_print_delay"])
         url = article_object.article_url

+        if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
+            self.logger.info("Downloading existing pdf")
+            success = self.get_exisiting_pdf(article_object)
+            # get a page title if required
+            if article_object.is_title_bad:
+                article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
+                # will be propagated to the saved file (dst) as well
+        else:
+            success = self.get_new_pdf(article_object)
+
+        if not success:
+            self.logger.error("Download failed")
+            # TODO: need to reset the file name to empty?
+        return article_object # changes to this are saved later by the external caller
+
+
+    def get_exisiting_pdf(self, article_object):
+        # get a better page title if required
+        if article_object.is_title_bad:
+            article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
         try:
-            self.driver.get(url)
+            r = requests.get(article_object.article_url)
+            bytes = r.content
+        except:
+            return False
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_new_pdf(self, article_object):
+        sleep_time = int(download_config["browser_print_delay"])
+
+        try:
+            self.driver.get(article_object.article_url)
         except Exception as e:
             self.logger.critical("Selenium .get(url) failed with error {}".format(e))
             self.finish()
-            return article_object # without changes
+            return False

         time.sleep(sleep_time)
         # leave the page time to do any funky business

-        # in the mean time, get a page title if required
         if article_object.is_title_bad:
-            article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
-            # will be propagated to the saved file (dst) as well
+            article_object.title = self.driver.title

+        try:
+            result = self.driver.print_page()
+            bytes = base64.b64decode(result, validate=True)
+        except:
+            self.logger.error("Failed, probably because the driver went extinct.")
+            return False
+
+        return self.write_pdf(bytes, article_object)
+
+
+    def get_file_destination(self, article_object):
         fname = article_object.fname_template
         fname = ensure_unique(article_object.save_path, fname)
         dst = os.path.join(article_object.save_path, fname)
+        return dst, fname

-        if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
-            success = self.get_exisiting_pdf(url, dst)
-        else:
-            success = self.get_new_pdf(dst)
-
-        if success:
-            article_object.file_name = fname
-        else:
-            article_object.file_name = ""
-
-        return article_object # this change is saved later by the external caller
-
-
-    def get_exisiting_pdf(self, url, dst):
-        try:
-            r = requests.get(url)
-            bytes = r.content
-        except:
-            return False
-        return self.get_new_pdf(dst, other_bytes=bytes)
-
-
-    def get_new_pdf(self, dst, other_bytes=None):
+
+    def write_pdf(self, content, article_object):
+        dst, fname = self.get_file_destination(article_object)
         os.makedirs(os.path.dirname(dst), exist_ok=True)

-        if other_bytes is None:
-            try:
-                result = self.driver.print_page()
-                bytes = base64.b64decode(result, validate=True)
-            except:
-                self.logger.error("Failed, probably because the driver went extinct.")
-                return False
-        else:
-            bytes = other_bytes
-
         try:
             with open(dst, "wb+") as f:
-                f.write(bytes)
+                f.write(content)
+            article_object.file_name = fname
             return True
         except Exception as e:
             self.logger.error(f"Failed, because of FS-operation: {e}")
             return False


-    def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
+    def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
         os.mkdir(reduced_profile_path)
         # copy needed directories
@@ -1,10 +1,11 @@
 import youtube_dl
 import os
 import logging
+import configuration

+download_config = configuration.config["downloads"]
 logger = logging.getLogger(__name__)


 class MyLogger(object):
     def debug(self, msg): pass
     def warning(self, msg): pass
@@ -19,7 +20,6 @@ class YouTubeDownloader:


     def post_download_hook(self, ret_code):
-        # print(ret_code)
         if ret_code['status'] == 'finished':
             file_loc = ret_code["filename"]
             fname = os.path.basename(file_loc)
@@ -35,9 +35,11 @@ class YouTubeDownloader:
         ydl_opts = {
             'format': 'best[height<=720]',
             'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
-            'logger': MyLogger(),
+            'logger': MyLogger(), # suppress verbosity
             'progress_hooks': [self.post_download_hook],
-            'updatetime': False
+            'updatetime': False,
+            # File is also used by firefox so make sure to not write to it!
+            # youtube dl apparently does not support cookies.sqlite and the documentation is not clear on how to use cookies.txt
         }
         try:
             with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@@ -46,5 +48,9 @@ class YouTubeDownloader:
         except Exception as e:
             logger.error(f"Youtube download crashed: {e}")
             article_object.file_name = ""
+            logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv")
+            logger.info(f"Logging youtube errors separately to {logfile}")
+            with open(logfile, "a+") as f:
+                f.write(f"{url}\n")

         return article_object