Compare commits

...

25 Commits

Author SHA1 Message Date
c46d0d3ecc Update usage instructions 2022-12-09 12:06:47 +01:00
ab2214c25e dont sync private config 2022-12-09 11:21:59 +01:00
647944d23c Bug fixes, makefile for launch 2022-12-09 11:19:45 +01:00
24b3bc3b51 Further restriction of possible file names, due to NAS issues 2022-11-04 15:35:59 +01:00
9e257f12a1 fix error where sqlite db could not be created 2022-11-04 14:40:30 +01:00
191d008451 Merge branch 'main' of ssh://git.kluster.moll.re:2222/remoll/coss_archiving 2022-10-29 17:19:46 +02:00
104b99df7e single use media archiving made usable again 2022-10-29 17:18:41 +02:00
6301a62de8 Update 'README.md' 2022-10-25 09:22:19 +00:00
e6bfe811d0 update nas target, documentation 2022-10-24 17:25:48 +02:00
6c08dec20a remove .vscode 2022-10-06 15:57:04 +02:00
5eab6c55da bug fixes 2022-10-06 15:55:30 +02:00
cca902b1f2 updates to documentation 2022-09-26 15:32:05 +02:00
9349b046d2 Fixed browser profile bug, line breaks and exceptions in news_check 2022-09-26 15:25:55 +02:00
db161e50c8 Switched from geckodriver to chrome 2022-09-18 19:26:55 +02:00
7cf7422b46 discern by archived_by 2022-09-12 13:30:32 +02:00
6661477599 turns out cached sites are suckers for ram 2022-09-12 13:09:46 +02:00
b00e5b4dff And mapped user permissions 2022-09-12 12:44:31 +02:00
118b5120fd And increase java heap space 2022-09-12 12:15:59 +02:00
6d8de7777f Forgot to add shm (else java crashes) 2022-09-12 11:56:10 +02:00
6b88b79e9d Sample config 2022-09-12 10:31:37 +02:00
d3d44dcdc9 working (feature complete) news_fetch 2022-09-09 22:32:22 +02:00
afead44d6c Working, refactored news_fetch, better documentation for launch 2022-09-08 16:19:15 +02:00
713406dc67 few bugs in news_fetch left, news_chek wip 2022-09-06 22:15:26 +02:00
2e65828bbb reduced slack functionality, higher ease of use. Database migration wip 2022-09-05 16:29:19 +02:00
60c9e88c7b WIP: Article checker with svelte 2022-08-31 12:09:21 +02:00
76 changed files with 4451 additions and 2125 deletions

31
.gitignore vendored
View File

@@ -1,5 +1,34 @@
.dev/
*.pyc
*.log
__pycache__/
config/container.yaml
config/local.env
## svelte:
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?

View File

@@ -1,4 +0,0 @@
{
"python.linting.flake8Enabled": true,
"python.linting.enabled": false
}

87
Makefile Normal file
View File

@@ -0,0 +1,87 @@
include config/local.env
export
build:
@echo "Building..."
docker compose build $(flags)
down:
@echo "Stopping containers..."
docker compose down -t 0 --volumes
# Variables specific to debug
debug: export DEBUG=true
debug: export HEADFULL=true
debug: export ENTRYPOINT=/bin/bash
debug: export CODE=./
debug:
@echo "Running in debug mode..."
docker compose up -d geckodriver
docker compose run -it --service-ports $(target) $(flags) || true
make down
production: export DEBUG=false
production:
@echo "Running in production mode..."
docker compose run -it --service-ports $(target) $(flags) || true
make down
nas_sync:
@echo "Syncing NAS..."
SYNC_FOLDER=$(folder) docker compose run -it nas_sync $(flags) || true
docker compose down
docker container prune -f
make down
## Misc:
edit_profile: export CODE=./
edit_profile: export HEADFULL=true
edit_profile:
@echo "Editing profile..."
docker compose up -d geckodriver
sleep 5
docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh || true
# runs inside the container
make down
db_interface:
docker create \
--name pgadmin \
-p 8080:80 \
-e 'PGADMIN_DEFAULT_EMAIL=${UNAME}@test.com' \
-e 'PGADMIN_DEFAULT_PASSWORD=password' \
-e 'PGADMIN_CONFIG_ENHANCED_COOKIE_PROTECTION=True' \
-e 'PGADMIN_CONFIG_LOGIN_BANNER="Authorised users only!"' \
dpage/pgadmin4
docker start pgadmin
sleep 5
# TODO auto add the server to the list displayed in the browser
# docker exec pgadmin sh -c "echo ${SERVER_DATA} > /tmp/servers.json"
# docker exec pgadmin sh -c "/venv/bin/python setup.py --load-servers /tmp/servers.json --user remy@test.com"
@echo "Go to http://localhost:8080 to access the database interface"
@echo "Username: ${UNAME}@test.com"
@echo "Password: password"
@echo "Hit any key to stop (not ctrl+c)"
read STOP
docker stop pgadmin
docker rm pgadmin
logs:
docker compose logs -f $(target) $(flags)
make down
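
The `db_interface` target above leaves a TODO for pre-loading the server list. A minimal sketch of what that could look like, assuming pgAdmin's standard `servers.json` import format (server name and username taken from `config/container.yaml`, `DB_HOST` from `config/local.env`; none of this is part of this commit):

```bash
# Hypothetical: generate /tmp/servers.json before re-enabling the commented docker exec lines
cat > /tmp/servers.json <<EOF
{
  "Servers": {
    "1": {
      "Name": "coss_archiving",
      "Group": "Servers",
      "Host": "${DB_HOST}",
      "Port": 5432,
      "MaintenanceDB": "postgres",
      "Username": "ca_rw",
      "SSLMode": "prefer"
    }
  }
}
EOF
```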

126
README.md
View File

@@ -11,75 +11,125 @@ A utility to
... fully automatically. Run it now, thank me later.
---
## Running - Docker compose
The included `docker-compose` file is now necessary for easy orchestration of the various services.
## Running - through makefile
All relevant passthroughs and mounts are specified through the env-file, for which I configured 4 versions:
Execute the file by running `make`. This won't do anything in itself. For the main usage you need to specify a mode and a target.
* production
* debug (development in general)
* upload
* check
These files will have to be adapted to your individual setup but won't change significantly once set up.
`make <mode> target=<target>`
### Overview of the modes
The production mode performs all automatic actions and therefore does not require any manual intervention. It queries the slack workspace, adds the new requests to the database, downloads all files and metadata, uploads the urls to archive.org and sends out the downloaded article. As a last step the newly created file is synced to the COSS-NAS.
The debug mode is more sophisticated and allows for big code changes without the need to recompile. It directly mounts the code-directory into the container. As a failsafe the environment-variable `DEBUG=true` is set. The whole utility is then run in a sandbox environment (slack-channel, database, email) so that Dirk is not affected by any mishaps.
The check mode is less sophisticated but shows the downloaded articles to the host for visual verification. This requires passthroughs for X11.
Upload mode is much simpler: it goes over the existing database and operates on the articles where the upload to archive.org has not yet occurred (archive.org is slow and the other operations usually finish before the queue is consumed). It retries their upload.
* For normal `production` mode run:
`docker compose --env-file env/production run news_fetch`
Two additional 'modes' are `build` and `down`. Build rebuilds the container, which is necessary after code changes. Down ensures a clean shutdown of *all* containers. Usually the launch-script handles this already but it sometimes fails, in which case `down` needs to be called again.
* For `debug` mode run:
### Overview of the targets
`docker compose --env-file env/debug run news_fetch`
which drops you into an interactive shell (`ctrl+d` to exit the container shell).
In essence a target is simply a service from docker-compose, which is run in an interactive environment. As such, all services defined in `docker-compose.yaml` can be called as a target. Only two of them are of real use:
> Note:
> The live-mounted code is now under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. Running `python runner.py` will now run the newly written code, but with the production database and storage.
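A minimal example of such a debug session (the `/code` path comes from the compose mount; the exact working directory is an assumption):

```bash
# inside the shell started by `make debug target=news_fetch`
cd /code/news_fetch   # live-mounted source (assumed layout)
python runner.py      # runs the current code against the sandbox db/storage while DEBUG=true
```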
`news_fetch` does the majority of the actions mentioned above. By default, that is, without any options, it runs a metadata-fetch, download, and upload to archive.org. The upload is usually the slowest step, which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload`, which only starts the upload for the concerned articles, as a catch-up if you will.
* For `check` mode, some env-variables are also changed and you still require interactivity. You don't need the geckodriver service however. The simplest way is to run
Example usage:
`docker compose --env-file env/check run --no-deps --rm news_fetch`
```bash
make production target=news_fetch # full mode
make production target=news_fetch flags=upload # upload mode (lighter resource usage)
make debug target=news_fetch # debug mode, which drops you inside a new shell
* Finally, for `upload` mode no interactivity is required and no additional services are required. Simply run:
`docker compose --env-file env/upload run --no-deps --rm news_fetch`
make production target=news_check
```
`news_check` starts a webapp, accessible under [http://localhost:8080](http://localhost:8080) and allows you to easily check the downloaded articles.
### Stopping
Run
`docker compose --env-file env/production down`
which terminates all containers associated with the `docker-compose.yaml`.
### Synchronising changes with NAS
I recommend `rsync`.
From within the ETH-network you can launch
```
make nas_sync folder=<target>
```
This will launch a docker container running `rsync`, connected to both the COSS NAS-share and your local files. Specifying a folder restricts the files that are watched for changes.
Example: `make nas_sync folder=2022/September` will take significantly less time than `make nas_sync folder=2022` but only considers files written to the September folder.
> Please check the logs for any suspicious messages. `rsync`ing to smb shares is prone to errors.
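For reference, the sync container essentially wraps a single `rsync` call over the mounted SMB share (see the `nas_sync` service in `docker-compose.yaml` further down in this diff); with `folder=2022/September` it boils down to roughly:

```bash
# sketch of what runs inside the nas_sync container; /sync/* are the mounts defined in docker-compose.yaml
rsync -av --no-perms --no-owner --no-group --progress \
    /sync/local_files/2022/September/ /sync/remote_files/2022/September
```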
### Misc. usage:
```bash
make build # rebuilds all containers to reflect code changes
make down # shuts down all containers (usually not necessary since this occurs automatically)
make edit_profile # opens a firefox window under localhost:7900 to edit the profile used by news_fetch
make db_interface # opens a postgres-interface to view the remote database (localhost:8080)
```
## First run:
> The program relies on a functioning firefox profile!
For the first run ever, run
`make edit_profile`
This will generate a new firefox profile under `coss_archiving/dependencies/news_fetch.profile`.
You can then go to [http://localhost:7900](http://localhost:7900) in your browser. Check the profile (under `about:profiles`).
Now install two addons: I don't care about cookies and Bypass Paywalls Clean (from `about:addons`). They ensure that most sites just work out of the box. You can additionally install adblockers such as uBlock Origin.
You can then use this profile to further tweak various sites. The state of the sites (namely their cookies) will be used by `news_fetch`.
> Whenever you need to make changes to the profile, for instance re-log in to websites, just rerun `make edit_profile`.
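For illustration only (the actual `news_fetch` code is not part of this diff): a Selenium client could point at the `geckodriver` service and reuse this profile roughly as below. The service name, port and profile path are taken from `docker-compose.yaml`; the API usage itself is an assumption, not the project's implementation.

```python
# sketch, not the project's actual code
from selenium import webdriver

options = webdriver.FirefoxOptions()
options.add_argument("-profile")
options.add_argument("/firefox_profile/news_fetch.profile")  # path as mounted inside the geckodriver container

driver = webdriver.Remote(command_executor="http://geckodriver:4444", options=options)
driver.get("https://example.com")
print(driver.title)
driver.quit()
```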
## Building
> The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly clean build the docker image! This is also crucial to update the code itself.
> The software **will** change. Because the images referenced in docker compose are usually the `latest` ones, it is sufficient to update the containers.
In docker compose, run
`docker compose --env-file env/production build`
Or simpler, just run
`make build` (should issues occur you can also run `make build flags=--no-cache`)
## Roadmap:
- [ ] handle paywalled sites like faz, spiegel, ... through their dedicated sites (see nexisuni.com for instance), available through the ETH network
- [ ] improve reliability of nas_sync. (+ logging)
- [ ] divide month folders into smaller ones
## Manual Sync to NAS:
I use `rsync`. Mounting the NAS locally, I navigate to the location of the local folder (notice the trailing slash). Then run
`rsync -Razq --no-perms --no-owner --no-group --temp-dir=/tmp --progress --log-file=rsync.log <local folder>/ "<remote>"`
where `<remote>` is the location where the NAS is mounted. (options: `R` - relative paths, `a` - archive mode (multiple actions), `z` - compress during transfer, `q` - quiet. We also don't copy most of the metadata and we keep a log of the transfers.)
## Appendix: (Running - Docker compose)
> I strongly recommend sticking to the usage of `make`.
Instead of using the makefile you can manually issue `docker compose` commands. Example: checking the logs.
All relevant mounts and env-variables are most easily specified through the env-file, for which I configured 2 versions:
* production
* debug (development in general)
These files will have to be adapted to your individual setup but won't change significantly once set up.
Example usage:
```bash
docker compose --env-file env/production run news_fetch # full mode
docker compose --env-file env/production run news_fetch upload # upload mode (lighter resource usage)
docker compose --env-file env/debug run news_fetch # debug mode, which drops you inside a new shell
docker compose --env-file env/production run news_check
# Misc:
docker compose --env-file env/production up # starts all services and shows their combined logs
docker compose --env-file env/production logs -f news_fetch # follows along with the logs of only one service
docker compose --env-file env/production down
```

37
config/container.yaml Normal file
View File

@@ -0,0 +1,37 @@
mail:
smtp_server: smtp.ethz.ch
port: 587
sender: "****************"
recipient: "****************"
uname: "****************"
password: "************"
slack:
bot_id: U02MR1R8UJH
archive_id: C02MM7YG1V4
debug_id: C02NM2H9J5Q
api_wait_time: 90
auth_token: "****************"
app_token: "****************"
database:
debug_db: /app/containerdata/debug/downloads.db
db_printout: /app/containerdata/backups
production_db_name: coss_archiving
production_user_name: "ca_rw"
production_password: "****************"
## user_name: ca_ro
## password: "****************"
downloads:
local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Archiving-Pipeline
browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
# please keep this exact name
browser_print_delay: 3
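The compose file further down passes this file to the containers via `CONFIG_FILE=/app/config.yaml`. How the code consumes it is not shown in this diff; a minimal sketch, assuming a plain YAML load:

```python
# illustration only; assumes pyyaml (or similar) is available inside the containers
import os
import yaml

with open(os.environ.get("CONFIG_FILE", "/app/config.yaml")) as f:
    config = yaml.safe_load(f)

smtp_server = config["mail"]["smtp_server"]           # smtp.ethz.ch
db_name = config["database"]["production_db_name"]    # coss_archiving
storage = config["downloads"]["local_storage_path"]   # /app/containerdata/files
```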

10
config/env/debug vendored Normal file
View File

@@ -0,0 +1,10 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy
export U_ID=1000
export DEBUG=true
export HEADFULL=true
export CODE=./
export ENTRYPOINT=/bin/bash

7
config/env/production vendored Normal file
View File

@@ -0,0 +1,7 @@
# Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup
CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy
export U_ID=1000
export DEBUG=false

18
config/local.env Normal file
View File

@@ -0,0 +1,18 @@
CONTAINER_DATA=***********
UNAME=***********
U_ID=***********
DB_HOST=***********
OPENCONNECT_URL=***********
OPENCONNECT_USER=***********
OPENCONNECT_PASSWORD=***********
OPENCONNECT_OPTIONS=--authgroup student-net
NAS_HOST=***********
NAS_PATH=/gess_coss_1/helbing_support/Archiving-Pipeline
NAS_USERNAME=***********
NAS_PASSWORD=***********
# Special characters like # need to be escaped (write: \#)

View File

@@ -1,75 +1,101 @@
# Usage:
# docker compose --env-file env/<mode> run <args> news_fetch && docker-compose --env-file env/production down
version: "3.9"
services:
geckodriver:
image: selenium/standalone-firefox:103.0
volumes:
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock}
- ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority
environment:
- DISPLAY=$DISPLAY
- START_VNC=false
- START_XVFB=false
user: 1001:1001
expose: # exposed to other docker-compose services only
- "4444"
vpn:
vpn: # Creates a connection behind the ETH Firewall to access NAS and Postgres
image: wazum/openconnect-proxy:latest
env_file:
- ${CONTAINER_DATA}/config/vpn.config
environment:
- OPENCONNECT_URL=${OPENCONNECT_URL}
- OPENCONNECT_USER=${OPENCONNECT_USER}
- OPENCONNECT_PASSWORD=${OPENCONNECT_PASSWORD}
- OPENCONNECT_OPTIONS=${OPENCONNECT_OPTIONS}
cap_add:
- NET_ADMIN
volumes:
- /dev/net/tun:/dev/net/tun
# alternative to cap_add & volumes: specify privileged: true
expose: ["5432"] # exposed here because db_passhtrough uses this network. See below for more details
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
image: selenium/standalone-firefox:latest
shm_size: 2gb
environment:
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
- START_XVFB=${HEADFULL-false}
- SE_VNC_NO_PASSWORD=1
volumes:
- ${CONTAINER_DATA}/dependencies:/firefox_profile/
- ${CODE:-/dev/null}:/code
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
expose: ["4444"] # exposed to other docker-compose services only
ports:
- 7900:7900 # port for webvnc
db_passthrough: # Allows a container on the local network to connect to a service (here postgres) through the vpn
network_mode: "service:vpn"
image: alpine/socat:latest
command: ["tcp-listen:5432,reuseaddr,fork", "tcp-connect:${DB_HOST}:5432"]
# expose: ["5432"] We would want this passthrough to expose its ports to the other containers
# BUT since it uses the same network as the vpn-service, it can't expose ports on its own. 5432 is therefore exposed under service.vpn.expose
news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snapshot (on archive.org) and writes to a db
build: news_fetch
image: news_fetch:latest
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- geckodriver
- db_passthrough
volumes:
- ${CONTAINER_DATA}:/app/containerdata # always set
- ./config/container.yaml:/app/config.yaml
- ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
environment:
- CONFIG_FILE=/app/config.yaml
- DEBUG=${DEBUG}
- UNAME=${UNAME}
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile
# stdin_open: ${INTERACTIVE:-false} # docker run -i
# tty: ${INTERACTIVE:-false} # docker run -t
news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such)
build: news_check
image: news_check:latest
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
depends_on:
- db_passthrough
volumes:
- ${CONTAINER_DATA}:/app/containerdata # always set
- ./config/container.yaml:/app/config.yaml
- ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
environment:
- CONFIG_FILE=/app/config.yaml
- UNAME=${UNAME}
ports:
- "8080:80" # 80 inside container
entrypoint: ${ENTRYPOINT:-python app.py} # by default launch workers as defined in the Dockerfile
nas_sync:
depends_on:
- vpn # used to establish a connection to the SMB server
network_mode: "service:vpn"
build: nas_sync
image: nas_sync:latest
cap_add: # capabilities needed for mounting the SMB share
- SYS_ADMIN
- DAC_READ_SEARCH
image: alpine:latest
volumes:
- ${CONTAINER_DATA}/files:/sync/local_files
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
command:
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
- lsyncd
- /sync/nas_sync.config
- coss_smb_share:/sync/remote_files
command:
- /bin/sh
- -c
- |
apk add rsync
rsync -av --no-perms --no-owner --no-group --progress /sync/local_files/${SYNC_FOLDER}/ /sync/remote_files/${SYNC_FOLDER} -n
news_fetch:
build: news_fetch
image: news_fetch:latest
depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync
- geckodriver
volumes:
- ${CONTAINER_DATA}:/app/containerdata # always set
- ${CODE:-/dev/null}:/code # not set in prod, defaults to /dev/null
- ${XSOCK-/dev/null}:${XSOCK-/tmp/sock} # x11 socket, needed for gui
# - ${XAUTHORITY-/dev/null}:/home/auto_news/.Xauthority # xauth needed for authenticating to x11
environment:
- DISPLAY=$DISPLAY # needed to let x11 apps know where to connect to
- DEBUG=${DEBUG}
- CHECK=${CHECK}
- UPLOAD=${UPLOAD}
- HEADLESS=${HEADLESS}
- REDUCEDFETCH=${REDUCEDFETCH}
entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
stdin_open: ${INTERACTIVE:-false} # docker run -i
tty: ${INTERACTIVE:-false} # docker run -t
volumes:
coss_smb_share:
driver: local
driver_opts:
type: cifs
o: "addr=${NAS_HOST},nounix,file_mode=0777,dir_mode=0777,domain=D,username=${NAS_USERNAME},password=${NAS_PASSWORD}"
device: //${NAS_HOST}${NAS_PATH}

15
env/check vendored
View File

@@ -1,15 +0,0 @@
# Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-acces via xauth
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
XAUTHORTIY=$XAUTHORTIY
XSOCK=/tmp/.X11-unix
DEBUG=false
CHECK=true
HEADLESS=true
UPLOAD=false
REDUCEDFETCH=false
# ENTRYPOINT="/bin/bash"
INTERACTIVE=true

16
env/debug vendored
View File

@@ -1,16 +0,0 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
CODE=./
XAUTHORTIY=$XAUTHORTIY
XSOCK=/tmp/.X11-unix
DEBUG=true
CHECK=false
UPLOAD=false
HEADLESS=false
REDUCEDFETCH=false
ENTRYPOINT="/bin/bash"
INTERACTIVE=true

10
env/production vendored
View File

@@ -1,10 +0,0 @@
# Runs on the main slack channel with the full worker setup. If nothing funky has occured, reducedfetch is a speedup
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
CONTAINERS_TO_RUN=nas_sync, geckodriver
DEBUG=false
CHECK=false
UPLOAD=false
HEADLESS=true
REDUCEDFETCH=true

10
env/upload vendored
View File

@@ -1,10 +0,0 @@
# Does not run any other workers and only upploads to archive the urls that weren't previously uploaded
CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
NEWS_FETCH_DEPENDS_ON="[]"
DEBUG=false
CHECK=false
UPLOAD=true
HEADLESS=true
REDUCEDFETCH=false

View File

@@ -0,0 +1,8 @@
if [ -d "/firefox_profile/news_fetch.profile" ]
then
echo "Profile already exists, skipping folder creation"
else
echo "Creating empty folder for profile"
mkdir -p /firefox_profile/news_fetch.profile/
fi
firefox --profile /firefox_profile/news_fetch.profile

7
manual/README.md Normal file
View File

@@ -0,0 +1,7 @@
### MANUAL TASKS
The files inside this directory contain scripts for repetitive but somewhat automatable tasks.
> ⚠️ warning:
>
> Most scripts still require manual intervention before/after running and probably require changes to the code. **Please make sure you understand them before using them!**

21
manual/batch_archive.py Normal file
View File

@@ -0,0 +1,21 @@
"""
Saves websites specified in 'batch_urls.txt' to the wayback machine. Outputs archive urls to terminal
Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file
"""
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import time
urls = []
with open ("batch_urls.txt", "r") as f:
    urls = f.readlines()

for i, url in enumerate(urls):
    print(f"Saving url {i+1} / {len(urls)}")
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
    wayback = WaybackMachineSaveAPI(url, user_agent)
    archive_url = wayback.save()
    print(archive_url)
    time.sleep(20) # Uploads to archive.org are rate limited

18
manual/batch_urls.txt Normal file
View File

@@ -0,0 +1,18 @@
https://id2020.org
https://www.weforum.org/platforms/the-centre-for-cybersecurity
https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf
https://en.wikipedia.org/wiki/Social_Credit_System
https://en.wikipedia.org/wiki/Customer_lifetime_value
https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance
https://www.un.org/en/about-us/universal-declaration-of-human-rights
https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines
https://www.wired.com/2008/06/pb-theory/
https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/
https://www.bbc.com/news/world-middle-east-52579475
https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/
https://www.delftdesignforvalues.nl
https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/
https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17
https://www.youtube.com/watch?v=_KhAsJRk2lo
https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/
https://climatecitycup.org

33
manual/batch_youtube.py Normal file
View File

@@ -0,0 +1,33 @@
"""
Saves youtube videos specified in 'batch_urls.txt' to the local folder. (to be copied manually)
"""
import youtube_dl
urls = []
with open ("batch_urls.txt", "r") as f:
    urls = f.readlines()

def post_download_hook(ret_code):
    if ret_code['status'] == 'finished':
        file_loc = ret_code["filename"]
        print(file_loc)

def save_video(url):
    """Saves video according to url and save path"""
    ydl_opts = {
        'format': 'best[height<=720]',
        'progress_hooks': [post_download_hook],
        'updatetime': False
    }
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        print(f"Youtube download crashed: {e}")

for i, url in enumerate(urls):
    print(f"Downloading video {i+1} / {len(urls)}")
    save_video(url)

View File

@@ -1,3 +1,6 @@
"""
Extracts all urls from a list of mails exported from thunderbird. Writes to 'mails_url_export.json'
"""
import os
import re
import json
@@ -19,5 +22,5 @@ for f in all_files:
print("Saved {} urls".format(len(all_urls)))
with open("media_mails_export.json", "w") as f:
with open("mails_url_export.json", "w") as f:
json.dump(all_urls, f)

View File

@@ -0,0 +1,70 @@
"""
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
"""
import sys
sys.path.append("../news_fetch")
import runner
import os
import logging
logger = logging.getLogger()
class DummyMessage:
"""Required by the dispatcher"""
ts = 0
def __init__(self, url):
self.urls = [url]
def fetch():
dispatcher = runner.Dispatcher()
dispatcher.workers_in = [
{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()},
{"UploadWorker": runner.UploadWorker()}
]
print_worker = runner.PrintWorker("Finished processing", sent = True)
dispatcher.workers_out = [{"PrintWorker": print_worker}]
dispatcher.start()
with open("media_urls.txt", "r") as f:
url_list = [l.replace("\n", "") for l in f.readlines()]
with open("media_urls.txt", "w") as f:
f.write("") # empty the file once it is read so that it does not get processed again
if url_list:
logger.info(f"Found {len(url_list)} media urls")
for u in url_list:
dispatcher.incoming_request(DummyMessage(u))
else:
logger.info(f"No additional media urls found. Running the pipeline with messages from db.")
print_worker.keep_alive()
def show():
for a in runner.models.ArticleDownload.select():
print(f"""
URL: {a.article_url}
ARCHIVE_URL: {a.archive_url}
ARTICLE_SOURCE: {a.source_name}
FILE_NAME: {a.file_name}
""")
if __name__ == "__main__":
logger.info("Overwriting production values for single time media-fetch")
if not os.path.exists("../.dev/"):
os.mkdir("../.dev/")
runner.configuration.models.set_db(
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.main_config["downloads"]["local_storage_path"] = "../.dev/"
if len(sys.argv) == 1: # no additional arguments
fetch()
elif sys.argv[1] == "show":
show()

View File

@@ -0,0 +1,170 @@
import datetime
import sys
sys.path.append("../news_fetch/")
import configuration # lives in app
from peewee import *
import os
import time
old_db = SqliteDatabase("/app/containerdata/downloads.db")
cred = configuration.db_config["DATABASE"]
download_db = PostgresqlDatabase(
cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
)
## OLD Models
class OLDModel(Model):
class Meta:
database = old_db
class OLDArticleDownload(OLDModel):
class Meta:
db_table = 'articledownload'
title = CharField(default='')
pub_date = DateField(default = '')
download_date = DateField(default = 0)
source_name = CharField(default = '')
article_url = TextField(default = '', unique=True)
archive_url = TextField(default = '')
file_name = TextField(default = '')
language = CharField(default = '')
summary = TextField(default = '')
comment = TextField(default = '')
verified = IntegerField(default = False)
# authors
# keywords
# ... are added through foreignkeys
class OLDArticleAuthor(OLDModel):
class Meta:
db_table = 'articleauthor'
article = ForeignKeyField(OLDArticleDownload, backref='authors')
author = CharField()
class OLDArticleRelated(OLDModel):
class Meta:
db_table = 'articlerelated'
article = ForeignKeyField(OLDArticleDownload, backref='related')
related_file_name = TextField(default = '')
## NEW Models
class NEWModel(Model):
class Meta:
database = download_db
class ArticleDownload(NEWModel):
# in the beginning this is all we have
article_url = TextField(default = '', unique=True)
# fetch then fills in the metadata
title = TextField(default='')
summary = TextField(default = '')
source_name = CharField(default = '')
language = CharField(default = '')
file_name = TextField(default = '')
archive_url = TextField(default = '')
pub_date = DateField(default = '')
download_date = DateField(default = 0)
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
sent = BooleanField(default = False)
archived_by = CharField(default = os.getenv("UNAME"))
# need to know who saved the message because the file needs to be on their computer in order to get verified
# verification happens in a different app, but the model has the fields here as well
comment = TextField(default = '')
verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad
def set_authors(self, authors):
for a in authors:
if len(a) < 100:
ArticleAuthor.create(
article = self,
author = a
)
def set_related(self, related):
for r in related:
ArticleRelated.create(
article = self,
related_file_name = r
)
# authors
# keywords
# ... are added through foreignkeys
# we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db
class ArticleAuthor(NEWModel):
article = ForeignKeyField(ArticleDownload, backref='authors')
author = CharField()
class ArticleRelated(NEWModel):
# Related files, such as the full text of a paper, audio files, etc.
article = ForeignKeyField(ArticleDownload, backref='related')
related_file_name = TextField(default = '')
####################################################################
# Migrate using sensible defaults:
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
it = 0
for old_art in OLDArticleDownload.select():
print("====================================================================")
it+=1
print(f"IT {it} New article with data:")
print(
old_art.article_url,
old_art.title,
old_art.summary,
old_art.source_name,
old_art.language,
old_art.file_name,
old_art.archive_url,
old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
old_art.download_date,
True,
old_art.comment,
old_art.verified
)
new_art = ArticleDownload.create(
article_url = old_art.article_url,
title = old_art.title,
summary = old_art.summary,
source_name = old_art.source_name,
language = old_art.language,
file_name = old_art.file_name,
archive_url = old_art.archive_url,
pub_date = old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
download_date = old_art.download_date,
# slack_ts = FloatField(default = 0)
sent = True,
# archived_by = CharField(default = os.getenv("UNAME"))
comment = old_art.comment,
verified = old_art.verified
)
new_art.set_related([r.related_file_name for r in old_art.related])
new_art.set_authors([a.author for a in old_art.authors])

View File

@@ -1,73 +0,0 @@
import sys
from webbrowser import get
sys.path.append("../app")
import runner
import logging
logger = logging.getLogger()
import json
from rich.console import Console
from rich.table import Table
console = Console()
logger.info("Overwriting production values for single time media-fetch")
runner.configuration.models.set_db(
runner.configuration.SqliteDatabase("../.dev/media_message_dummy.db"), # chat_db (not needed here)
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "../.dev/"
def fetch():
coordinator = runner.Coordinator()
kwargs = {
"worker_download" : runner.DownloadWorker(),
"worker_fetch" : runner.FetchWorker(),
"worker_upload" : runner.UploadWorker(),
"worker_compress" : runner.CompressWorker(),
}
coordinator.add_workers(**kwargs)
coordinator.start()
with open("media_urls.json", "r") as f:
url_list = json.loads(f.read())
logger.info(f"Found {len(url_list)} media urls")
for u in url_list:
msg_text = f"<{u}|dummy preview text>"
dummy_thread = runner.models.Thread()
msg = runner.models.Message(text= msg_text, thread=dummy_thread)
coordinator.incoming_request(msg)
def show():
t = Table(
title = "ArticleDownloads",
row_styles = ["white", "bright_black"],
)
entries = ["title", "article_url", "archive_url", "authors"]
for e in entries:
t.add_column(e, justify = "right")
sel = runner.models.ArticleDownload.select()
for s in sel:
c = [getattr(s, e) for e in entries]#
c[-1] = str([a.author for a in c[-1]])
print(c)
t.add_row(*c)
console.print(t)
# fetch()
show()

View File

@@ -1,88 +0,0 @@
import time
import keys
import slack_sdk
from slack_sdk.errors import SlackApiError
from peewee import SqliteDatabase
from persistence import message_models
# from bot_utils import messages
# Constant values...
MESSAGES_DB = "/app/containerdata/messages.db"
BOT_ID = "U02MR1R8UJH"
ARCHIVE_ID = "C02MM7YG1V4"
DEBUG_ID = "C02NM2H9J5Q"
client = slack_sdk.WebClient(token=keys.OAUTH_TOKEN)
message_models.set_db(SqliteDatabase(MESSAGES_DB))
def message_dict_to_model(message):
if message["type"] == "message":
thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
uid = message.get("user", "BAD USER")
user, _ = message_models.User.get_or_create(user_id = uid)
thread, _ = message_models.Thread.get_or_create(thread_ts = thread_ts)
m, new = message_models.Message.get_or_create(
user = user,
thread = thread,
ts = message["ts"],
channel_id = ARCHIVE_ID,
text = message["text"]
)
print("Saved (text) {} (new={})".format(m, new))
for f in message.get("files", []): #default: []
m.file_type = f["filetype"]
m.perma_link = f["url_private_download"]
m.save()
print("Saved permalink {} to {} (possibly overwriting)".format(f["name"], m))
if new:
return m
else:
return None
else:
print("What should I do of {}".format(message))
return None
def check_all_past_messages():
last_ts = 0
result = client.conversations_history(
channel=ARCHIVE_ID,
oldest=last_ts
)
new_messages = result.get("messages", []) # fetches 100 messages by default
new_fetches = []
for m in new_messages:
new_fetches.append(message_dict_to_model(m))
# print(result)
refetch = result.get("has_more", False)
print(f"Refetching : {refetch}")
while refetch: # we have not actually fetched them all
try:
result = client.conversations_history(
channel = ARCHIVE_ID,
cursor = result["response_metadata"]["next_cursor"],
oldest = last_ts
) # refetches in batches of 100 messages
refetch = result.get("has_more", False)
new_messages = result.get("messages", [])
for m in new_messages:
new_fetches.append(message_dict_to_model(m))
except SlackApiError: # Most likely a rate-limit
print("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(30))
time.sleep(30)
refetch = True
check_all_past_messages()

View File

@@ -1,38 +0,0 @@
from peewee import SqliteDatabase
from persistence import article_models, message_models
# Global logger setup:
# Constant values...
DOWNLOADS_DB = "../container_data/downloads.db"
MESSAGES_DB = "../container_data/messages.db"
BOT_ID = "U02MR1R8UJH"
ARCHIVE_ID = "C02MM7YG1V4"
DEBUG_ID = "C02NM2H9J5Q"
# DB Setup:
article_models.set_db(SqliteDatabase(
DOWNLOADS_DB,
pragmas = {'journal_mode': 'wal'} # mutliple threads can access at once
))
message_models.set_db(SqliteDatabase(MESSAGES_DB))
for reaction in message_models.Reaction.select():
print(reaction)
thread = reaction.message.thread
articles = message_models.get_referenced_articles(thread, article_models.ArticleDownload)
for a in articles:
print(a)
reaction = reaction.type
status = 1 if reaction == "white_check_mark" else -1
print(status)
for article in articles:
article.verified = status
article.save()

View File

@@ -1,151 +0,0 @@
[
"https://www.swissinfo.ch/ger/wirtschaft/koennen-ki-und-direkte-demokratie-nebeneinander-bestehen-/47542048",
"https://www.zeit.de/2011/33/CH-Oekonophysik",
"https://ourworld.unu.edu/en/green-idea-self-organizing-traffic-signals",
"https://www.youtube.com/watch?v=-FQD4ie9UYA",
"https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum",
"https://www.youtube.com/watch?v=upQM4Xzh8zM",
"https://www.youtube.com/watch?v=gAkoprZmW4k",
"https://www.youtube.com/watch?v=VMzfDVAWXHI&t=1s",
"https://www.youtube.com/watch?v=1SwTiIlkndE",
"https://www.informatik-aktuell.de/management-und-recht/digitalisierung/digitale-revolution-und-oekonomie-40-quo-vadis.html",
"https://www.youtube.com/watch?v=cSvvH0SBFOw",
"https://www.linkedin.com/posts/margit-osterloh-24198a104_pl%C3%A4doyer-gegen-sprechverbote-ugcPost-6925702100450480129-K7Dl?utm_source=linkedin_share&utm_medium=member_desktop_web",
"https://www.nebelspalter.ch/plaedoyer-gegen-sprechverbote",
"https://falling-walls.com/people/dirk-helbing/",
"https://digitalsensemaker.podigee.io/3-2-mit-dirk-helbing",
"https://www.blick.ch/wirtschaft/musk-als-hueter-der-redefreiheit-eth-experte-sagt-musks-vorhaben-hat-potenzial-aber-id17437811.html",
"https://www.trend.at/standpunkte/mit-verantwortung-zukunft-10082300",
"https://www.pantarhei.ch/podcast/",
"https://ethz.ch/en/industry/industry/news/data/2022/04/intelligent-traffic-lights-for-optimal-traffic-flow.html",
"https://ethz.ch/de/wirtschaft/industry/news/data/2022/04/optimaler-verkehrsfluss-mit-intelligenten-ampeln.html",
"https://www.spektrum.de/news/die-verschlungenen-wege-der-menschen/1181815",
"https://www.pcwelt.de/a/diktatur-4-0-schoene-neue-digitalisierte-welt,3447005",
"https://www.nzz.ch/english/cancel-culture-at-eth-a-professor-receives-death-threats-over-a-lecture-slide-ld.1675322",
"https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum",
"https://www.achgut.com/artikel/ausgestossene_der_woche_prinz_william_als_immaginierter_rassist",
"https://www.pinterpolitik.com/in-depth/klaim-big-data-luhut-perlu-diuji/",
"https://www.srf.ch/kultur/gesellschaft-religion/eklat-an-der-eth-wenn-ein-angeblicher-schweinevergleich-zur-staatsaffaere-wird",
"https://open.spotify.com/episode/6s1icdoplZeNOINvx6ZHTd?si=610a699eba004da2&nd=1",
"https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554",
"https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554",
"https://djmag.com/features/after-astroworld-what-being-done-stop-crowd-crushes-happening-again",
"https://prisma-hsg.ch/articles/meine-daten-deine-daten-unsere-daten/",
"https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661",
"https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916",
"https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html",
"https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/",
"https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un",
"https://www.youtube.com/watch?v=fRkCMC3zqSQ",
"https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/",
"https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/",
"https://futurism.com/astroworld-theory-deaths-bodies-fluid",
"https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11",
"https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169",
"https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931",
"https://magazine.swissinformatics.org/en/whats-wrong-with-ai/",
"https://magazine.swissinformatics.org/en/whats-wrong-with-ai/",
"https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm",
"https://youtu.be/ptm9zLG2KaE",
"https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/",
"https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/",
"https://trafo.hypotheses.org/23989",
"https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/",
"https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/",
"https://www.youtube.com/watch?v=gAkoprZmW4k",
"https://www.rhein-zeitung.de/region/aus-den-lokalredaktionen/nahe-zeitung_artikel,-peter-flaschels-lebenswerk-hat-die-sozialgeschichte-beeinflusst-_arid,2322161.html",
"https://www.blick.ch/wirtschaft/online-boom-ohne-ende-corona-befeuert-die-tech-revolution-id16359910.html",
"https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010",
"https://www.say.media/article/la-mort-par-algorithme",
"https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau",
"https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222",
"https://kow-berlin.com/exhibitions/illusion-einer-menschenmenge",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://www.politik-kommunikation.de/ressorts/artikel/eine-gefaehrliche-machtasymmetrie-1383558602",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://solutions.hamburg/ethik-und-digitalisierung-nicht-voneinander-getrennt-betrachten/",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://avenue.argusdatainsights.ch/Article/AvenueClip?artikelHash=d14d91ec9a8b4cb0b6bb3012c0cefd8b_27F0B19422F1F03723769C18906AA1EE&artikelDateiId=298862327",
"https://www.tagblatt.ch/kultur/grosses-ranking-ihre-stimme-hat-gewicht-das-sind-die-50-profiliertesten-intellektuellen-der-schweiz-ld.2182261",
"https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic",
"https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic",
"https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld",
"https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld",
"https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/",
"https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/",
"https://www.technologyreview.com/2010/04/30/204005/europes-plan-to-simulate-the-entire-planet/",
"https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html",
"https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html",
"https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/",
"https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761",
"https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916",
"https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html",
"https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661",
"https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/",
"https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un",
"https://www.youtube.com/watch?v=fRkCMC3zqSQ",
"https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/",
"https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/",
"https://futurism.com/astroworld-theory-deaths-bodies-fluid",
"https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11",
"https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169",
"https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931",
"https://www.pantarhei.ch/podcast/",
"https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni",
"https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni",
"https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm",
"https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/",
"https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/",
"https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904",
"https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904",
"https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://trafo.hypotheses.org/23989",
"https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/",
"https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/",
"https://www.youtube.com/watch?v=gAkoprZmW4k",
"https://futurium.de/de/gespraech/ranga-yogeshwar-1/ranga-yogeshwar-dirk-helbing-mit-musik-von-till-broenner",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://idw-online.de/en/news113518",
"https://blmplus.de/die-digitalcharta-ist-erst-der-anfang-ein-szenario-von-dirk-helbing/",
"https://www.risiko-dialog.ch/big-nudging-vom-computer-gelenkt-aber-wohin/",
"https://idw-online.de/de/news13986",
"https://www.uni-stuttgart.de/presse/archiv/uni-kurier/uk84_85/forschung/fw66.html",
"https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/",
"https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/",
"https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010",
"https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau",
"https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222",
"https://werteundwandel.de/inhalte/d2030-in-aufbruchstimmung-fuer-eine-lebenswerte-zukunft/",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://www.youtube.com/watch?v=n9e77iYZPEY",
"https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/",
"https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/",
"https://www.say.media/article/la-mort-par-algorithme",
"https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/",
"https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761",
"https://www.nesta.org.uk/report/digital-democracy-the-tools-transforming-political-engagement/",
"https://www.nature.com/articles/news.2010.351",
"https://www.focus.de/panorama/welt/tid-19265/gastkommentar-nutzt-die-moeglichkeiten-des-computers_aid_534372.html",
"https://www.theglobalist.com/democracy-technology-innovation-society-internet/",
"https://www.theglobalist.com/capitalism-democracy-technology-surveillance-privacy/",
"https://www.theglobalist.com/google-artificial-intelligence-big-data-technology-future/",
"https://www.theglobalist.com/fascism-big-data-artificial-intelligence-surveillance-democracy/",
"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
"https://www.theglobalist.com/technology-society-sustainability-future-humanity/",
"https://www.theglobalist.com/society-technology-peace-sustainability/",
"https://www.theglobalist.com/democracy-technology-social-media-artificial-intelligence/",
"https://www.theglobalist.com/financial-system-reform-economy-internet-of-things-capitalism/",
"https://www.theglobalist.com/capitalism-society-equality-sustainability-crowd-funding/",
"https://www.theglobalist.com/united-nations-world-government-peace-sustainability-society/",
"https://www.theglobalist.com/world-economy-sustainability-environment-society/"
]

View File

@@ -1,3 +0,0 @@
user=****
domain=D
password=**************

View File

@@ -1,33 +0,0 @@
[MAIL]
smtp_server: smtp.******
port: 587
sender: **************
recipient: **************
uname: **************
password: **************+
[SLACK]
bot_id: U02MR1R8UJH
responsible_id: U01AC9ZEN2G
archive_id: C02MM7YG1V4
debug_id: C02NM2H9J5Q
api_wait_time: 90
auth_token: xoxb-**************************************************
app_token: xapp-1-**************************************************
[DATABASE]
download_db_name: downloads.db
chat_db_name: messages.db
db_path_prod: /app/containerdata
db_path_dev: /code/.dev
db_backup: /app/containerdata/backups
[DOWNLOADS]
local_storage_path: /app/containerdata/files
default_download_path: /app/containerdata/tmp
remote_storage_path: /**********
browser_profile_path: /app/containerdata/dependencies/<profile name>
blacklisted_href_domains: ["google.", "facebook."]

View File

@@ -1,4 +0,0 @@
OPENCONNECT_URL=sslvpn.ethz.ch/student-net
OPENCONNECT_USER=***************
OPENCONNECT_PASSWORD=**************
OPENCONNECT_OPTIONS=--authgroup student-net

View File

@@ -1,9 +0,0 @@
FROM bash:latest
# alpine with bash instead of sh
ENV TZ=Europe/Berlin
RUN apk add lsyncd cifs-utils rsync
RUN mkdir -p /sync/remote_files
COPY entrypoint.sh /sync/entrypoint.sh
ENTRYPOINT ["bash", "/sync/entrypoint.sh"]

View File

@@ -1,10 +0,0 @@
#!/bin/bash
set -e
sleep 5 # waits for the vpn to have an established connection
echo "Starting NAS sync"
mount -t cifs "//$1" -o credentials=/sync/nas_login.config /sync/remote_files
echo "Successfully mounted SAMBA remote: $1 --> /sync/remote_files"
shift # consumes the variable set in $1 so tat $@ only contains the remaining arguments
exec "$@"

25
news_check/Dockerfile Normal file
View File

@@ -0,0 +1,25 @@
FROM node:18.8 as build-deps
WORKDIR /app/client
COPY client/package.json ./
COPY client/package-lock.json ./
COPY client/rollup.config.js ./
COPY client/src ./src/
RUN npm install
RUN npm run build
FROM python:latest
ENV TZ Europe/Zurich
WORKDIR /app/news_check
COPY requirements.txt requirements.txt
RUN python3 -m pip install -r requirements.txt
COPY client/public/index.html client/public/index.html
COPY --from=build-deps /app/client/public client/public/
COPY server server/
WORKDIR /app/news_check/server
# CMD python app.py

4
news_check/client/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
/node_modules/
/public/build/
.DS_Store

107
news_check/client/README.md Normal file
View File

@@ -0,0 +1,107 @@
# This repo is no longer maintained. Consider using `npm init vite` and selecting the `svelte` option or — if you want a full-fledged app framework and don't mind using pre-1.0 software — use [SvelteKit](https://kit.svelte.dev), the official application framework for Svelte.
---
# svelte app
This is a project template for [Svelte](https://svelte.dev) apps. It lives at https://github.com/sveltejs/template.
To create a new project based on this template using [degit](https://github.com/Rich-Harris/degit):
```bash
npx degit sveltejs/template svelte-app
cd svelte-app
```
*Note that you will need to have [Node.js](https://nodejs.org) installed.*
## Get started
Install the dependencies...
```bash
cd svelte-app
npm install
```
...then start [Rollup](https://rollupjs.org):
```bash
npm run dev
```
Navigate to [localhost:8080](http://localhost:8080). You should see your app running. Edit a component file in `src`, save it, and reload the page to see your changes.
By default, the server will only respond to requests from localhost. To allow connections from other computers, edit the `sirv` commands in package.json to include the option `--host 0.0.0.0`.
If you're using [Visual Studio Code](https://code.visualstudio.com/) we recommend installing the official extension [Svelte for VS Code](https://marketplace.visualstudio.com/items?itemName=svelte.svelte-vscode). If you are using other editors you may need to install a plugin in order to get syntax highlighting and intellisense.
## Building and running in production mode
To create an optimised version of the app:
```bash
npm run build
```
You can run the newly built app with `npm run start`. This uses [sirv](https://github.com/lukeed/sirv), which is included in your package.json's `dependencies` so that the app will work when you deploy to platforms like [Heroku](https://heroku.com).
## Single-page app mode
By default, sirv will only respond to requests that match files in `public`. This is to maximise compatibility with static fileservers, allowing you to deploy your app anywhere.
If you're building a single-page app (SPA) with multiple routes, sirv needs to be able to respond to requests for *any* path. You can make it so by editing the `"start"` command in package.json:
```js
"start": "sirv public --single"
```
## Using TypeScript
This template comes with a script to set up a TypeScript development environment, you can run it immediately after cloning the template with:
```bash
node scripts/setupTypeScript.js
```
Or remove the script via:
```bash
rm scripts/setupTypeScript.js
```
If you want to use `baseUrl` or `path` aliases within your `tsconfig`, you need to set up `@rollup/plugin-alias` to tell Rollup to resolve the aliases. For more info, see [this StackOverflow question](https://stackoverflow.com/questions/63427935/setup-tsconfig-path-in-svelte).
## Deploying to the web
### With [Vercel](https://vercel.com)
Install `vercel` if you haven't already:
```bash
npm install -g vercel
```
Then, from within your project folder:
```bash
cd public
vercel deploy --name my-project
```
### With [surge](https://surge.sh/)
Install `surge` if you haven't already:
```bash
npm install -g surge
```
Then, from within your project folder:
```bash
npm run build
surge public my-project.surge.sh
```

1955
news_check/client/package-lock.json generated Normal file

File diff suppressed because it is too large.

View File

@@ -0,0 +1,23 @@
{
"name": "svelte-app",
"version": "1.0.0",
"private": true,
"scripts": {
"build": "rollup -c",
"dev": "rollup -c -w",
"start": "sirv public --no-clear"
},
"devDependencies": {
"@rollup/plugin-commonjs": "^17.0.0",
"@rollup/plugin-node-resolve": "^11.0.0",
"rollup": "^2.3.4",
"rollup-plugin-css-only": "^3.1.0",
"rollup-plugin-livereload": "^2.0.0",
"rollup-plugin-svelte": "^7.0.0",
"rollup-plugin-terser": "^7.0.0",
"svelte": "^3.0.0"
},
"dependencies": {
"sirv-cli": "^2.0.0"
}
}

@@ -0,0 +1,25 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset='utf-8'>
<meta name='viewport' content='width=device-width,initial-scale=1'>
<title>NEWS CHECK</title>
<link rel='icon' type='image/png' href='https://ethz.ch/etc/designs/ethz/img/icons/ETH-APP-Icons-Theme-white/192-xxxhpdi.png'>
<link rel='stylesheet' href='/build/bundle.css'>
<script defer src='/build/bundle.js'></script>
<link href="https://cdn.jsdelivr.net/npm/daisyui@2.24.0/dist/full.css" rel="stylesheet" type="text/css" />
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.0.943/pdf.min.js"></script>
<html data-theme="light"></html> <!-- Daisy-ui theme -->
</head>
<body>
</body>
</html>

@@ -0,0 +1,76 @@
import svelte from 'rollup-plugin-svelte';
import commonjs from '@rollup/plugin-commonjs';
import resolve from '@rollup/plugin-node-resolve';
import livereload from 'rollup-plugin-livereload';
import { terser } from 'rollup-plugin-terser';
import css from 'rollup-plugin-css-only';
const production = !process.env.ROLLUP_WATCH;
function serve() {
let server;
function toExit() {
if (server) server.kill(0);
}
return {
writeBundle() {
if (server) return;
server = require('child_process').spawn('npm', ['run', 'start', '--', '--dev'], {
stdio: ['ignore', 'inherit', 'inherit'],
shell: true
});
process.on('SIGTERM', toExit);
process.on('exit', toExit);
}
};
}
export default {
input: 'src/main.js',
output: {
sourcemap: true,
format: 'iife',
name: 'app',
file: 'public/build/bundle.js'
},
plugins: [
svelte({
compilerOptions: {
// enable run-time checks when not in production
dev: !production
}
}),
// we'll extract any component CSS out into
// a separate file - better for performance
css({ output: 'bundle.css' }),
// If you have external dependencies installed from
// npm, you'll most likely need these plugins. In
// some cases you'll need additional configuration -
// consult the documentation for details:
// https://github.com/rollup/plugins/tree/master/packages/commonjs
resolve({
browser: true,
dedupe: ['svelte']
}),
commonjs(),
// In dev mode, call `npm run start` once
// the bundle has been generated
!production && serve(),
// Watch the `public` directory and refresh the
// browser on changes when not in production
!production && livereload('public'),
// If we're building for production (npm run build
// instead of npm run dev), minify
production && terser()
],
watch: {
clearScreen: false
}
};

@@ -0,0 +1,121 @@
// @ts-check
/** This script modifies the project to support TS code in .svelte files like:
<script lang="ts">
export let name: string;
</script>
As well as validating the code for CI.
*/
/** To work on this script:
rm -rf test-template template && git clone sveltejs/template test-template && node scripts/setupTypeScript.js test-template
*/
const fs = require("fs")
const path = require("path")
const { argv } = require("process")
const projectRoot = argv[2] || path.join(__dirname, "..")
// Add deps to pkg.json
const packageJSON = JSON.parse(fs.readFileSync(path.join(projectRoot, "package.json"), "utf8"))
packageJSON.devDependencies = Object.assign(packageJSON.devDependencies, {
"svelte-check": "^2.0.0",
"svelte-preprocess": "^4.0.0",
"@rollup/plugin-typescript": "^8.0.0",
"typescript": "^4.0.0",
"tslib": "^2.0.0",
"@tsconfig/svelte": "^2.0.0"
})
// Add script for checking
packageJSON.scripts = Object.assign(packageJSON.scripts, {
"check": "svelte-check --tsconfig ./tsconfig.json"
})
// Write the package JSON
fs.writeFileSync(path.join(projectRoot, "package.json"), JSON.stringify(packageJSON, null, " "))
// mv src/main.js to main.ts - note, we need to edit rollup.config.js for this too
const beforeMainJSPath = path.join(projectRoot, "src", "main.js")
const afterMainTSPath = path.join(projectRoot, "src", "main.ts")
fs.renameSync(beforeMainJSPath, afterMainTSPath)
// Switch the app.svelte file to use TS
const appSveltePath = path.join(projectRoot, "src", "App.svelte")
let appFile = fs.readFileSync(appSveltePath, "utf8")
appFile = appFile.replace("<script>", '<script lang="ts">')
appFile = appFile.replace("export let name;", 'export let name: string;')
fs.writeFileSync(appSveltePath, appFile)
// Edit rollup config
const rollupConfigPath = path.join(projectRoot, "rollup.config.js")
let rollupConfig = fs.readFileSync(rollupConfigPath, "utf8")
// Edit imports
rollupConfig = rollupConfig.replace(`'rollup-plugin-terser';`, `'rollup-plugin-terser';
import sveltePreprocess from 'svelte-preprocess';
import typescript from '@rollup/plugin-typescript';`)
// Replace name of entry point
rollupConfig = rollupConfig.replace(`'src/main.js'`, `'src/main.ts'`)
// Add preprocessor
rollupConfig = rollupConfig.replace(
'compilerOptions:',
'preprocess: sveltePreprocess({ sourceMap: !production }),\n\t\t\tcompilerOptions:'
);
// Add TypeScript
rollupConfig = rollupConfig.replace(
'commonjs(),',
'commonjs(),\n\t\ttypescript({\n\t\t\tsourceMap: !production,\n\t\t\tinlineSources: !production\n\t\t}),'
);
fs.writeFileSync(rollupConfigPath, rollupConfig)
// Add TSConfig
const tsconfig = `{
"extends": "@tsconfig/svelte/tsconfig.json",
"include": ["src/**/*"],
"exclude": ["node_modules/*", "__sapper__/*", "public/*"]
}`
const tsconfigPath = path.join(projectRoot, "tsconfig.json")
fs.writeFileSync(tsconfigPath, tsconfig)
// Add global.d.ts
const dtsPath = path.join(projectRoot, "src", "global.d.ts")
fs.writeFileSync(dtsPath, `/// <reference types="svelte" />`)
// Delete this script, but not during testing
if (!argv[2]) {
// Remove the script
fs.unlinkSync(path.join(__filename))
// Check for Mac's DS_store file, and if it's the only one left remove it
const remainingFiles = fs.readdirSync(path.join(__dirname))
if (remainingFiles.length === 1 && remainingFiles[0] === '.DS_store') {
fs.unlinkSync(path.join(__dirname, '.DS_store'))
}
// Check if the scripts folder is empty
if (fs.readdirSync(path.join(__dirname)).length === 0) {
// Remove the scripts folder
fs.rmdirSync(path.join(__dirname))
}
}
// Adds the extension recommendation
fs.mkdirSync(path.join(projectRoot, ".vscode"), { recursive: true })
fs.writeFileSync(path.join(projectRoot, ".vscode", "extensions.json"), `{
"recommendations": ["svelte.svelte-vscode"]
}
`)
console.log("Converted to TypeScript.")
if (fs.existsSync(path.join(projectRoot, "node_modules"))) {
console.log("\nYou will need to re-run your dependency manager to get started.")
}

@@ -0,0 +1,48 @@
<script>
import PDFView from './PDFView.svelte';
import ArticleStatus from './ArticleStatus.svelte';
import ArticleOperations from './ArticleOperations.svelte';
import Toast from './Toast.svelte';
let current_id = 0;
let interfaceState = updateInterface()
async function updateInterface () {
let url = '';
if (current_id == 0) {
url = '/api/article/first';
} else {
url = '/api/article/' + current_id + '/next';
}
const response = await fetch(url)
const data = await response.json()
current_id = data.id;
let article_url = '/api/article/' + current_id + '/get';
const article_response = await fetch(article_url);
const article_data = await article_response.json();
return article_data;
}
function triggerUpdate () {
interfaceState = updateInterface();
}
</script>
{#await interfaceState}
...
{:then article_data}
<div class="flex w-full h-screen gap-5 p-5">
<div class="w-3/5"><PDFView article_data={article_data}/></div>
<div class="divider divider-horizontal"></div>
<div class="w-2/5">
<ArticleStatus article_data={article_data}/>
<div class="divider divider-vertical"></div>
<ArticleOperations article_data={article_data} callback={triggerUpdate}/>
</div>
</div>
{/await}
<Toast/>

@@ -0,0 +1,119 @@
<script>
export let article_data;
export let callback;
window.focus()
import { addToast } from './Toast.svelte';
const actions = [
{name: 'Mark as good (and skip to next)', kbd: 'A'},
{name: 'Mark as bad (and skip to next)', kbd: 'B'},
{name: 'Upload related file', kbd: 'R', comment: "can be used multiple times"},
{name: 'Skip', kbd: 'S'},
]
let fileInput = document.createElement('input');
fileInput.type = 'file';
    fileInput.onchange = e => uploadRelatedFile(e.target.files[0]);
function onKeyDown(e) {apiAction(e.key)}
function apiAction(key) {
if (actions.map(d => d.kbd.toLowerCase()).includes(key.toLowerCase())){ // ignore other keypresses
const updateArticle = (async() => {
let success
if (key.toLowerCase() == "s") {
addToast('success', "Article skipped")
callback()
return
} else if (key.toLowerCase() == "r") {
                fileInput.click() // this triggers fileInput.onchange, which uploads the selected file
return
} else {
const response = await fetch('/api/article/' + article_data.id + '/set', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({
'action': key.toLowerCase(),
})
})
success = response.status == 200
}
if (success){
addToast('success')
callback()
} else {
addToast('error')
}
})()
}
}
async function uploadRelatedFile(file) {
const formData = new FormData()
formData.append('file', file)
const response = await fetch('/api/article/' + article_data.id + '/set', {
method: 'POST',
body : formData,
})
const success = response.status == 200;
if (success){
const data = await response.json()
let fname = data.file_path
addToast('success', "File uploaded as " + fname)
} else {
addToast('error', "File upload failed")
}
return success;
}
</script>
<div class="card bg-neutral-300 shadow-xl">
<div class="card-body">
<h2 class="card-title">Your options: (click on action or use keyboard)</h2>
<div class="overflow-x-auto">
<table class="table w-full table-compact">
<thead>
<tr>
<th>Action</th>
<th>Keyboard shortcut</th>
</tr>
</thead>
<tbody>
{#each actions as action}
<tr>
<td><button on:click={() => apiAction(action.kbd)}>{ action.name }</button></td>
<td><kbd class="kbd">
{ action.kbd }</kbd>
{#if action.comment}({ action.comment }){/if}
</td>
</tr>
{/each}
</tbody>
</table>
</div>
</div>
</div>
<!-- Listen for keypresses -->
<svelte:window on:keydown|preventDefault={onKeyDown} />

@@ -0,0 +1,53 @@
<script>
export let article_data;
const status_items = [
{name: 'Title', value: article_data.title},
{name: 'Url', value: article_data.article_url},
{name: 'Source', value: article_data.source_name},
{name: 'Filename', value: article_data.file_name},
{name: 'Location', value: article_data.save_path},
{name: 'Language', value: article_data.language},
{name: 'Authors', value: article_data.authors},
{name: "Related", value: article_data.related},
{name: "Sent", value: article_data.sent},
]
</script>
<style>
td {
overflow-wrap: break-word;
word-wrap: break-word;
word-break: break-word;
}
</style>
<div class="card bg-neutral-300 shadow-xl overflow-x-auto">
<div class="card-body">
<h2 class="card-title">Article overview:</h2>
<table class="table w-full table-compact" style="table-layout: fixed">
<thead>
<tr>
<th>Attribute</th>
<th>Value</th>
</tr>
</thead>
<tbody>
{#each status_items as item}
<tr>
<td>{ item.name }</td>
{#if (item.value != "" || status_items.value == false) }
<td class='bg-emerald-200' style="white-space: normal; width:70%">
{#if item.name == "Url"}
<a href="{ item.value }" target="_blank">{ item.value }</a>
{:else}
{ item.value }
{/if}
</td>
{:else}
<td class='bg-red-200'>not set</td>
{/if}
</tr>
{/each}
</tbody>
</table>
</div>
</div>

@@ -0,0 +1,15 @@
<script>
export let article_data;
</script>
<div class="h-full w-full shadow-xl">
<object class="pdf-view" data="{article_data.save_path + article_data.file_name}" title="Article PDF"> </object>
</div>
<style>
.pdf-view {
width: 100%;
height: 100%;
}
</style>

@@ -0,0 +1,34 @@
<script context="module">
import {fade} from 'svelte/transition';
import { writable } from 'svelte/store';
let toasts = writable([])
export function addToast (type, message="") {
if (message == "") {
message = toast_states[type]["text"]
}
toasts.update((all) => [{"class" : toast_states[type]["class"], "text": message}, ...all]);
toasts = toasts;
setTimeout(() => {
toasts.update((all) => all.slice(0, -1));
}, 2000);
}
const toast_states = {
'success' : {class: 'alert-success', text: 'Article updated successfully'},
'error' : {class: 'alert-error', text: 'Article update failed'},
}
</script>
<div class="toast">
{#each $toasts as toast}
<div class="alert { toast.class }" transition:fade>
<div> <span>{ toast.text }.</span> </div>
</div>
{/each}
</div>

@@ -0,0 +1,7 @@
import App from './App.svelte';
const app = new App({
target: document.body,
});
export default app;

@@ -0,0 +1,5 @@
flask
peewee
markdown
psycopg2
pyyaml

news_check/server/app.py (new file, 93 lines)
@@ -0,0 +1,93 @@
from flask import Flask, send_from_directory, request
import os
import configuration
models = configuration.models
db = configuration.db
app = Flask(__name__)
###############################################################################
# SVELTE 'STATIC' BACKEND. Always send index.html and the requested js-files. (compiled by npm)
@app.route("/") #index.html
def index():
return send_from_directory('../client/public', 'index.html')
@app.route("/<path:path>") #js-files
def js(path):
return send_from_directory('../client/public', path)
@app.route("/app/containerdata/files/<path:path>")
def static_pdf(path):
return send_from_directory('/app/containerdata/files/', path)
###############################################################################
# (simple) API for news_check.
@app.route("/api/article/<int:id>/get")
def get_article_by_id(id):
with db:
article = models.ArticleDownload.get_by_id(id)
return article.to_dict()
@app.route("/api/article/first")
def get_article_first(min_id=0):
with db:
article = models.ArticleDownload.select(models.ArticleDownload.id).where(
(models.ArticleDownload.verified == 0) &
(models.ArticleDownload.id > min_id) &
(models.ArticleDownload.archived_by == os.getenv("UNAME"))
).order_by(models.ArticleDownload.id).first()
return {"id" : article.id}
@app.route("/api/article/<int:id>/next")
def get_article_next(id):
with db:
if models.ArticleDownload.get_by_id(id + 1).verified == 0:
return {"id" : id + 1}
else:
return get_article_first(min_id=id) # if the current article was skipped, but the +1 is already verified, get_first will return the same article again. so specify min id.
@app.route("/api/article/<int:id>/set", methods=['POST'])
def set_article(id):
json = request.get_json(silent=True) # do not raise 400 if there is no json!
# no json usually means a file was uploaded
if json is None:
print("Detected likely file upload.")
action = None
else:
action = request.json.get('action', None) # action inside the json might still be empty
with db:
article = models.ArticleDownload.get_by_id(id)
if action:
if action == "a":
article.verified = 1
elif action == "b":
article.verified = -1
else: # implicitly action == "r":
# request.files is an immutable dict
file = request.files.get("file", None)
if file is None: # upload tends to crash
return "No file uploaded", 400
artname, _ = os.path.splitext(article.file_name)
fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}"
fpath = os.path.join(article.save_path, fname)
print(f"Saving file to {fpath}")
file.save(fpath)
article.set_related([fname])
return {"file_path": fpath}
article.save()
return "ok"
if __name__ == "__main__":
debug = os.getenv("DEBUG", "false") == "true"
app.run(host="0.0.0.0", port="80", debug=debug)

@@ -0,0 +1,17 @@
from peewee import PostgresqlDatabase
import time
import yaml
import os
config_location = os.getenv("CONFIG_FILE")
with open(config_location, "r") as f:
config = yaml.safe_load(f)
cred = config["database"]
time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
db = PostgresqlDatabase(
cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
)
import models
models.set_db(db)

news_check/server/models.py (new file, 134 lines)
@@ -0,0 +1,134 @@
import logging
logger = logging.getLogger(__name__)
from peewee import *
import os
import datetime
import configuration
downloads_config = configuration.config["downloads"]
# set the nature of the db at runtime
download_db = DatabaseProxy()
class DownloadBaseModel(Model):
class Meta:
database = download_db
## == Article related models == ##
class ArticleDownload(DownloadBaseModel):
# in the beginning this is all we have
article_url = TextField(default = '', unique=True)
# fetch then fills in the metadata
title = TextField(default='')
summary = TextField(default = '')
source_name = CharField(default = '')
language = CharField(default = '')
file_name = TextField(default = '')
@property
def save_path(self):
return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
    @property
    def fname_nas(self):
        # a property cannot take arguments, so this always refers to self.file_name
        if self.download_date:
            return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
        return None
archive_url = TextField(default = '')
pub_date = DateField(default = datetime.date.fromtimestamp(0))
download_date = DateField(default = datetime.date.today)
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
sent = BooleanField(default = False)
archived_by = CharField(default = os.getenv("UNAME"))
# need to know who saved the message because the file needs to be on their computer in order to get verified
# verification happens in a different app, but the model has the fields here as well
comment = TextField(default = '')
verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad
# authors
# keywords
# ... are added through foreignkeys
# we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db
def to_dict(self):
return {
"id": self.id,
"article_url": self.article_url,
"title": self.title,
"summary": self.summary,
"source_name": self.source_name,
"language": self.language,
"file_name": self.file_name,
"save_path": self.save_path,
"fname_nas": self.fname_nas,
"archive_url": self.archive_url,
"pub_date": self.pub_date.strftime("%Y-%m-%d"),
"download_date": self.download_date.strftime("%Y-%m-%d"),
"sent": self.sent,
"comment": self.comment,
"related": [r.related_file_name for r in self.related],
"authors": [a.author for a in self.authors]
}
def set_related(self, related):
for r in related:
if len(r) > 255:
raise Exception("Related file name too long for POSTGRES")
ArticleRelated.create(
article = self,
related_file_name = r
)
def file_status(self):
if not self.file_name:
logger.error(f"Article {self} has no filename!")
return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
file_path_abs = self.save_path + self.file_name
if not os.path.exists(file_path_abs):
logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
return True, {}
class ArticleAuthor(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='authors')
author = CharField()
class ArticleRelated(DownloadBaseModel):
# Related files, such as the full text of a paper, audio files, etc.
article = ForeignKeyField(ArticleDownload, backref='related')
related_file_name = TextField(default = '')
def set_db(download_db_object):
download_db.initialize(download_db_object)
with download_db: # create tables (does nothing if they exist already)
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])

@@ -1,2 +1,2 @@
.dev/
Dockerfile
__pycache__/

@@ -2,26 +2,10 @@ FROM python:latest
ENV TZ Europe/Zurich
RUN apt-get update && apt-get install -y \
evince \
# for checking
xauth \
#for gui
ghostscript
# for compression
RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
# id mapped to local user
# home directory needed for pip package installation
RUN mkdir -p /app/auto_news
RUN chown -R autonews:autonews /app
USER autonews
RUN export PATH=/home/autonews/.local/bin:$PATH
RUN mkdir -p /app/news_fetch
COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install -r /app/requirements.txt
COPY app /app/auto_news
WORKDIR /app/auto_news
COPY . /app/news_fetch
WORKDIR /app/news_fetch

@@ -1,59 +0,0 @@
from dataclasses import dataclass
import os
import shutil
import configparser
import logging
from datetime import datetime
from peewee import SqliteDatabase
from rich.logging import RichHandler
# first things first: logging
logging.basicConfig(
format='%(message)s',
level=logging.INFO,
datefmt='%H:%M:%S', # add %Y-%m-%d if needed
handlers=[RichHandler()]
)
logger = logging.getLogger(__name__)
# load config file containing constants and secrets
parsed = configparser.ConfigParser()
parsed.read("/app/containerdata/config/news_fetch.config.ini")
if os.getenv("DEBUG", "false") == "true":
logger.warning("Found 'DEBUG=true', setting up dummy databases")
db_base_path = parsed["DATABASE"]["db_path_dev"]
parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
parsed["DOWNLOADS"]["local_storage_path"] = parsed["DATABASE"]["db_path_dev"]
else:
logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
db_base_path = parsed["DATABASE"]["db_path_prod"]
logger.info("Backing up databases")
backup_dst = parsed["DATABASE"]["db_backup"]
today = datetime.today().strftime("%Y.%m.%d")
shutil.copyfile(
os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
os.path.join(backup_dst, today + "." + parsed["DATABASE"]["chat_db_name"]),
)
shutil.copyfile(
os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
os.path.join(backup_dst, today + "." + parsed["DATABASE"]["download_db_name"]),
)
from utils_storage import models
# Set up the database
models.set_db(
SqliteDatabase(
os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
    ),
    SqliteDatabase(
        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
)
)

@@ -1,205 +0,0 @@
"""Main coordination of other util classes. Handles inbound and outbound calls"""
import configuration
models = configuration.models
from threading import Thread
import logging
import os
import sys
logger = logging.getLogger(__name__)
from utils_mail import runner as mail_runner
from utils_slack import runner as slack_runner
from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
class ArticleWatcher:
"""Wrapper for a newly created article object. Notifies the coordinator upon change/completition"""
def __init__(self, article, thread, **kwargs) -> None:
self.article_id = article.id # in case article becomes None at any point, we can still track the article
self.article = article
self.thread = thread
self.completition_notifier = kwargs.get("notifier")
self.fetch = kwargs.get("worker_fetch", None)
self.download = kwargs.get("worker_download", None)
self.compress = kwargs.get("worker_compress", None)
self.upload = kwargs.get("worker_upload", None)
self.completition_notified = False
# self._download_called = self._compression_called = False
self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False
# first step: gather metadata
if self.fetch and self.upload:
self.fetch.process(self) # this will call the update_status method
            self.upload.process(self) # independent from the rest
        else: # the full kwargs were not provided, only do a manual run
            # overwrite update_status() because calls from the workers will result in errors
self.update_status = lambda completed: logger.info(f"Completed action {completed}")
for w in kwargs.get("workers_manual"):
w.process(self)
def update_status(self, completed_action):
"""Checks and notifies internal completition-status.
Article download is complete iff fetch and download were successfull and compression was run
"""
# if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done
if completed_action == "fetch":
self.download.process(self)
elif completed_action == "download":
self.compress.process(self)
elif completed_action == "compress": # last step
self.completition_notifier(self.article, self.thread)
# triggers action in Coordinator
elif completed_action == "upload":
# this case occurs when upload was faster than compression
pass
else:
logger.warning(f"update_status called with unusual configuration: {completed_action}")
# ====== Attributes to be modified by the util workers
@property
def fetch_completed(self):
return self._fetch_completed
@fetch_completed.setter
def fetch_completed(self, value: bool):
self._fetch_completed = value
self.update_status("fetch")
@property
def download_completed(self):
return self._download_completed
@download_completed.setter
def download_completed(self, value: bool):
self._download_completed = value
self.update_status("download")
@property
def compression_completed(self):
return self._compression_completed
@compression_completed.setter
def compression_completed(self, value: bool):
self._compression_completed = value
self.update_status("compress")
@property
def upload_completed(self):
return self._upload_completed
@upload_completed.setter
def upload_completed(self, value: bool):
self._upload_completed = value
self.update_status("upload")
def __str__(self) -> str:
return f"Article with id {self.article_id}"
class Coordinator(Thread):
def __init__(self, **kwargs) -> None:
"""Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
super().__init__(target = self.launch, daemon=True)
def add_workers(self, **kwargs):
self.worker_slack = kwargs.pop("worker_slack", None)
self.worker_mail = kwargs.pop("worker_mail", None)
# the two above won't be needed in the Watcher
self.worker_download = kwargs.get("worker_download", None)
self.worker_fetch = kwargs.get("worker_fetch", None)
self.worker_compress = kwargs.get("worker_compress", None)
self.worker_upload = kwargs.get("worker_upload", None)
self.kwargs = kwargs
def launch(self) -> None:
for w in [self.worker_download, self.worker_fetch, self.worker_upload, self.worker_compress]:
if not w is None:
w.start()
def incoming_request(self, message):
"""This method is passed onto the slack worker. It gets triggered when a new message is received."""
url = message.urls[0] # ignore all the other ones
article, is_new = models.ArticleDownload.get_or_create(article_url=url)
thread = message.thread
thread.article = article
thread.save()
self.kwargs.update({"notifier" : self.article_complete_notifier})
if is_new or (article.file_name == "" and article.verified == 0):
            # check for models that were created but were abandoned. This means they have missing information, most importantly no associated file
# this overwrites previously set information, but that should not be too important
ArticleWatcher(
article,
thread,
**self.kwargs
)
# All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
# fetch -> download -> compress -> complete
            # the watcher orchestrates the procedure and notifies upon completion
            # the watcher will notify once it is sufficiently populated
        else: # manually trigger notification immediately
logger.info(f"Found existing article {article}. Now sending")
self.article_complete_notifier(article, thread)
def manual_processing(self, articles, workers):
for w in workers:
w.start()
for article in articles:
notifier = lambda article: print(f"Completed manual actions for {article}")
ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
def article_complete_notifier(self, article, thread):
if self.worker_slack is None:
logger.warning("Not sending slack notifier")
else:
self.worker_slack.bot_worker.respond_channel_message(thread)
if self.worker_mail is None:
logger.warning("Not sending mail notifier")
else:
self.worker_mail.send(article)
if __name__ == "__main__":
coordinator = Coordinator()
if os.getenv("UPLOAD", "false") == "true":
articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
logger.info(f"Launching upload to archive for {len(articles)} articles.")
coordinator.manual_processing(articles, [UploadWorker()])
elif os.getenv("CHECK", "false") == "true":
from utils_check import runner as check_runner
check_runner.verify_unchecked()
else: # launch with full action
slack_runner = slack_runner.BotRunner(coordinator.incoming_request)
kwargs = {
"worker_download" : DownloadWorker(),
"worker_fetch" : FetchWorker(),
"worker_upload" : UploadWorker(),
"worker_compress" : CompressWorker(),
"worker_slack" : slack_runner,
"worker_mail" : mail_runner,
}
try:
coordinator.add_workers(**kwargs)
coordinator.start()
slack_runner.start()
except KeyboardInterrupt:
logger.info("Keyboard interrupt. Stopping Slack and Coordinator")
slack_runner.stop()
print("BYE!")
# coordinator was set as a daemon thread, so it will be stopped automatically
sys.exit(0)

@@ -1,208 +0,0 @@
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule
console = Console()
hline = Rule(style="white")
import os
import subprocess
from slack_sdk import WebClient
import configuration
models = configuration.models
u_options = {
"ENTER" : "Accept PDF as is. It gets marked as verified",
"D" : "set languange to DE and set verified",
"E" : "set languange to EN and set verified",
"O" : "set other language (prompted)",
"R" : "set related files (prompted multiple times)",
"B" : "reject and move to folder BAD",
"L" : "leave file as is, do not send reaction"
}
bot_client = WebClient(
token = configuration.parsed["SLACK"]["auth_token"]
)
def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
"""Prints a neat overview of the current article"""
file_table = Table(
title = file_url,
row_styles = ["white", "bright_black"],
min_width = 100
)
file_table.add_column("Attribute", justify = "right", no_wrap = True)
file_table.add_column("Value set by auto_news")
file_table.add_column("Status", justify = "right")
for attr in file_attributes:
file_table.add_row(attr["name"], attr["value"], attr["status"])
option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()])
option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()])
columns = Columns([option_key, option_action])
console.print(file_table)
console.print("Your options:")
console.print(columns)
def send_reaction_to_slack_thread(article, reaction):
"""Sends the verification status as a reaction to the associated slack thread."""
thread = article.slack_thread
messages = models.Message.select().where(models.Message.text.contains(article.article_url))
# TODO rewrite this shit
if len(messages) > 5:
print("Found more than 5 messages. Aborting reactions...")
return
for m in messages:
if m.is_processed_override:
print("Message already processed. Aborting reactions...")
elif not m.has_single_url:
print("Found thread but won't send reaction because thread has multiple urls")
else:
ts = m.slack_ts
bot_client.reactions_add(
channel=configuration.parsed["SLACK"]["archive_id"],
name=reaction,
timestamp=ts
)
print("Sent reaction to message")
def prompt_language(query):
not_set = True
while not_set:
uin = input("Set language (nation-code, 2 letters) ")
if len(uin) != 2:
print("Bad code, try again")
else:
not_set = False
query.language = uin
query.save()
def prompt_related(query):
file_list = []
finished = False
while not finished:
uin = input("Additional file for article? Type '1' to cancel ")
if uin == "1":
query.set_related(file_list)
finished = True
else:
file_list.append(uin)
def prompt_new_fname(query):
uin = input("New fname? ")
old_fname = query.file_name
query.file_name = uin
query.verified = 1
if old_fname != "":
os.remove(query.save_path + old_fname)
query.save()
def reject_article(article):
article.verified = -1
article.save()
print("Article marked as bad")
# also update the threads to not be monitored anymore
send_reaction_to_slack_thread(article, "x")
def unreject_article(query):
query.verified = 1
query.save()
# os.rename(badpdf, fname)
print("File set to verified")
def accept_article(article, last_accepted):
article.verified = 1
article.save()
print("Article accepted as GOOD")
# also update the threads to not be monitored anymore
send_reaction_to_slack_thread(article, "white_check_mark")
return "" # linked
def verify_unchecked():
query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
last_linked = None
for article in query:
console.print(hline)
core_info = []
for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]):
entry = {
"status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]",
"value" : e if len(e) != 0 else "not set",
"name" : name
}
core_info.append(entry)
try:
# close any previously opened windows:
# subprocess.call(["kill", "`pgrep evince`"])
os.system("pkill evince")
# then open a new one
subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # suppress evince gtk warnings
except Exception as e:
print(e)
continue
file_overview(
file_url = article.article_url,
file_attributes=core_info,
options = u_options
)
proceed = False
while not proceed:
proceed = False
uin = input("Choice ?").lower()
if uin == "":
last_linked = accept_article(article, last_linked) # last linked accelerates the whole process
proceed = True
elif uin == "d":
article.language = "de"
article.verified = 1
article.save()
proceed = True
elif uin == "e":
article.language = "en"
article.verified = 1
article.save()
proceed = True
elif uin == "o":
prompt_language(article)
elif uin == "r":
prompt_related(article)
elif uin == "b":
reject_article(article)
proceed = True
elif uin == "l":
# do nothing
proceed = True
else:
print("Invalid input")

@@ -1,285 +0,0 @@
import logging
import configuration
import requests
import os
import time
from threading import Thread
from slack_sdk.errors import SlackApiError
logger = logging.getLogger(__name__)
config = configuration.parsed["SLACK"]
models = configuration.models
slack_client = "dummy"
LATEST_RECORDED_REACTION = 0
def init(client) -> None:
"""Starts fetching past messages and returns the freshly launched thread"""
global slack_client
slack_client = client
global LATEST_RECORDED_REACTION
try:
LATEST_RECORDED_REACTION = models.Reaction.select(models.Reaction.id).order_by("id")[-1]
except IndexError: #query is actually empty, we have never fetched any messages until now
LATEST_RECORDED_REACTION = 0
    # fetch all the messages we could have possibly missed
logger.info("Querying missed messages, threads and reactions. This can take some time.")
fetch_missed_channel_messages() # not threaded
t = Thread(target = fetch_missed_channel_reactions, daemon=True) # threaded, runs in background (usually takes a long time)
t.start()
if os.getenv("REDUCEDFETCH", "false") == "true":
logger.warning("Only fetching empty threads for bot messages because 'REDUCEDFETCH=true'")
fetch_missed_thread_messages(reduced=True)
    else: # perform both asynchronously
fetch_missed_thread_messages()
def get_unhandled_messages():
"""Gets all messages that have not yet been handled, be it by mistake or by downtime
As the message handler makes no distinction between channel messages and thread messages,
we don't have to worry about them here.
"""
threaded_objects = []
for t in models.Thread.select():
if t.message_count > 1: # if only one message was written, it is the channel message
msg = t.last_message
if msg.is_by_human:
threaded_objects.append(msg)
# else don't, nothing to process
logger.info(f"Set {len(threaded_objects)} thread-messages as not yet handled.")
channel_objects = [t.initiator_message for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
logger.info(f"Set {len(channel_objects)} channel-messages as not yet handled.")
reaction_objects = list(models.Reaction.select().where(models.Reaction.id > LATEST_RECORDED_REACTION))
logger.info(f"Set {len(reaction_objects)} reactions as not yet handled.")
# the ones newer than the last before the fetch
all_messages = channel_objects + threaded_objects
return all_messages, reaction_objects
def fetch_missed_channel_messages():
# latest processed message_ts is:
presaved = models.Message.select().order_by(models.Message.ts)
if not presaved:
last_ts = 0
else:
last_message = presaved[-1]
last_ts = last_message.slack_ts
result = slack_client.conversations_history(
channel=config["archive_id"],
oldest=last_ts
)
new_messages = result.get("messages", [])
# # filter the last one, it is a duplicate! (only if the db is not empty!)
# if last_ts != 0 and len(new_messages) != 0:
# new_messages.pop(-1)
new_fetches = 0
for m in new_messages:
# print(m)
message_dict_to_model(m)
new_fetches += 1
refetch = result.get("has_more", False)
while refetch: # we have not actually fetched them all
try:
result = slack_client.conversations_history(
channel = config["archive_id"],
cursor = result["response_metadata"]["next_cursor"],
oldest = last_ts
) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
refetch = result.get("has_more", False)
new_messages = result.get("messages", [])
for m in new_messages:
message_dict_to_model(m)
new_fetches += 1
except SlackApiError: # Most likely a rate-limit
logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
time.sleep(config["api_wait_time"])
refetch = True
logger.info(f"Fetched {new_fetches} new channel messages.")
def fetch_missed_thread_messages(reduced=False):
"""After having gotten all base-threads, we need to fetch all their replies"""
# I don't know of a better way: we need to fetch this for each and every thread (except if it is marked as permanently solved)
logger.info("Starting fetch of thread messages...")
if reduced:
threads = [t for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
# this only fetches completely empty threads, which might be because the bot-message was not yet saved to the db.
# once we got all the bot-messages the remaining empty threads will be the ones we need to process.
else:
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
logger.info(f"Fetching history for {len(threads)} empty threads")
new_messages = []
for i,t in enumerate(threads):
try:
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
except SlackApiError:
logger.error("Hit rate limit while querying threaded messages, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
messages.pop(0) # the first message is the one posted in the channel. We already processed it!
for m in messages:
# only append *new* messages
res = message_dict_to_model(m)
if res:
new_messages.append(res)
logger.info("Fetched {} new threaded messages.".format(len(new_messages)))
def fetch_missed_channel_reactions():
logger.info("Starting background fetch of channel reactions...")
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
for i,t in enumerate(threads):
reactions = []
try:
query = slack_client.reactions_get(
channel = config["archive_id"],
timestamp = t.slack_ts
)
reactions = query.get("message", []).get("reactions", []) # default = []
except SlackApiError as e:
if e.response.get("error", "") == "message_not_found":
m = t.initiator_message
logger.warning(f"Message (id={m.id}) not found. Skipping and saving...")
# this usually means the message is past the 1000 message limit imposed by slack. Mark it as processed in the db
m.is_processed_override = True
m.save()
else: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
for r in reactions:
reaction_dict_to_model(r, t)
# Helpers for message conversion to db-objects
def reaction_dict_to_model(reaction, thread=None):
if thread is None:
m_ts = reaction["item"]["ts"]
message = models.Message.get(ts = float(m_ts))
thread = message.thread
if "name" in reaction.keys(): # fetched through manual api query
content = reaction["name"]
elif "reaction" in reaction.keys(): # fetched through events
content = reaction["reaction"]
else:
logger.error(f"Weird reaction received: {reaction}")
return None
r, _ = models.Reaction.get_or_create(
type = content,
message = thread.initiator_message
)
logger.info("Saved reaction [{}]".format(content))
return r
def message_dict_to_model(message):
if message["type"] == "message":
thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
uid = message.get("user", "BAD USER")
if uid == "BAD USER":
logger.critical("Message has no user?? {}".format(message))
return None
user, _ = models.User.get_or_create(user_id = uid)
thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
m, new = models.Message.get_or_create(
user = user,
thread = thread,
ts = message["ts"],
channel_id = config["archive_id"],
text = message["text"]
)
logger.info(f"Saved: {m} ({'new' if new else 'old'})")
files = message.get("files", [])
if len(files) >= 1:
f = files[0] #default: []
m.file_type = f["filetype"]
m.perma_link = f["url_private_download"]
m.save()
logger.info(f"Saved {m.file_type}-file for message (id={m.id})")
if new:
return m
else:
return None
else:
logger.warning("What should I do of {}".format(message))
return None
def say_substitute(*args, **kwargs):
logger.info("Now sending message through say-substitute: {}".format(" - ".join(args)))
slack_client.chat_postMessage(
channel=config["archive_id"],
text=" - ".join(args),
**kwargs
)
def save_as_related_file(url, article_object):
r = requests.get(url, headers={"Authorization": "Bearer {}".format(slack_client.token)})
saveto = article_object.save_path
ftype = url[url.rfind(".") + 1:]
fname = "{} - related no {}.{}".format(
article_object.file_name.replace(".pdf",""),
len(article_object.related) + 1,
ftype
)
with open(os.path.join(saveto, fname), "wb") as f:
f.write(r.content)
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return fname
def react_file_path_message(fname, article_object):
saveto = article_object.save_path
file_path = os.path.join(saveto, fname)
if os.path.exists(file_path):
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return True
else:
return False
def is_message_in_archiving(message) -> bool:
if isinstance(message, dict):
return message["channel"] == config["archive_id"]
else:
return message.channel_id == config["archive_id"]
def is_reaction_in_archiving(event) -> bool:
if isinstance(event, dict):
return event["item"]["channel"] == config["archive_id"]
else:
return event.message.channel_id == config["archive_id"]

@@ -1,189 +0,0 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler
from slack_sdk.errors import SlackApiError
import logging
import configuration
from . import message_helpers
config = configuration.parsed["SLACK"]
models = configuration.models
class BotApp(App):
logger = logging.getLogger(__name__)
def __init__(self, callback, *args, **kwargs):
super().__init__(*args, **kwargs)
self.callback = callback
def pre_start(self):
message_helpers.init(self.client)
missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
[self.handle_incoming_message(m) for m in missed_messages]
[self.handle_incoming_reaction(r) for r in missed_reactions]
# self.react_missed_reactions(missed_reactions)
# self.react_missed_messages(missed_messages)
self.startup_status()
def handle_incoming_reaction(self, reaction):
if isinstance(reaction, dict): #else: the reaction is already being passed as a model
# CAUTION: filter for 'changed reactions' those are nasty (usually when adding an url)
reaction = message_helpers.reaction_dict_to_model(reaction)
thread = reaction.message.thread
article_object = thread.article
if not article_object is None:
reaction = reaction.type
status = 1 if reaction == "white_check_mark" else -1
# self.logger.info(f"Applying reaction {reaction} to its root message.")
article_object.verified = status
article_object.save()
def handle_incoming_message(self, message):
"""Reacts to all messages inside channel archiving. Must then
distinguish between threaded replies and new requests
and react accordingly"""
if isinstance(message, dict): #else: the message is already being passed as a model
# CAUTION: filter for 'changed messages' those are nasty (usually when adding an url)
if message.get("subtype", "not bad") == "message_changed":
return False
message = message_helpers.message_dict_to_model(message)
# First check: belongs to thread?
is_threaded = message.thread.message_count > 1 and message != message.thread.initiator_message
if is_threaded:
self.incoming_thread_message(message)
else:
self.incoming_channel_message(message)
def incoming_thread_message(self, message):
if message.user.user_id == config["bot_id"]:
return True # ignore the files uploaded by the bot. We handled them already!
thread = message.thread
if thread.is_fully_processed:
return True
self.logger.info("Receiving thread-message")
self.respond_thread_message(message)
def incoming_channel_message(self, message):
self.logger.info(f"Handling message {message} ({len(message.urls)} urls)")
if not message.urls: # no urls in a root-message => IGNORE
message.is_processed_override = True
message.save()
return
# ensure thread is still empty, this is a scenario encountered only in testing, but let's just filter it
if message.thread.message_count > 1:
self.logger.info("Discarded message because it is actually processed.")
return
if len(message.urls) > 1:
message_helpers.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts)
self.callback(message)
# for url in message.urls:
# self.callback(url, message)
# stop here!
def respond_thread_message(self, message, say=message_helpers.say_substitute):
thread = message.thread
article = thread.article
if message.perma_link: # file upload means new data
fname = message_helpers.save_as_related_file(message.perma_link, article)
say("File was saved as 'related file' under `{}`.".format(fname),
thread_ts=thread.slack_ts
)
else: # either a pointer to a new file (too large to upload), or trash
success = message_helpers.react_file_path_message(message.text, article)
if success:
say("File was saved as 'related file'", thread_ts=thread.slack_ts)
else:
self.logger.error("User replied to thread {} but the response did not contain a file/path".format(thread))
say("Cannot process response without associated file.",
thread_ts=thread.slack_ts
)
def respond_channel_message(self, thread, say=message_helpers.say_substitute):
article = thread.article
answers = article.slack_info
for a in answers:
if a["file_path"]:
try: # upload resulted in an error
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
file = a["file_path"],
thread_ts = thread.slack_ts
)
status = True
except SlackApiError as e:
say(
"File {} could not be uploaded.".format(a),
thread_ts=thread.slack_ts
)
status = False
self.logger.error(f"File upload failed: {e}")
else: # anticipated that there is no file!
say(
f"<@{config['responsible_id']}> \n {a['reply_text']}",
thread_ts=thread.slack_ts
)
status = True
def startup_status(self):
threads = [t for t in models.Thread.select()]
all_threads = len(threads)
fully_processed = len([t for t in threads if t.is_fully_processed])
fully_unprocessed = len([t for t in threads if t.message_count == 1])
articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
class BotRunner():
"""Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
def __init__(self, callback, *args, **kwargs) -> None:
self.bot_worker = BotApp(callback, token=config["auth_token"])
@self.bot_worker.event(event="message", matchers=[message_helpers.is_message_in_archiving])
def handle_incoming_message(message, say):
return self.bot_worker.handle_incoming_message(message)
@self.bot_worker.event(event="reaction_added", matchers=[message_helpers.is_reaction_in_archiving])
def handle_incoming_reaction(event, say):
return self.bot_worker.handle_incoming_reaction(event)
self.handler = SocketModeHandler(self.bot_worker, config["app_token"])
def start(self):
self.bot_worker.pre_start()
self.handler.start()
def stop(self):
self.handler.close()
print("Bye handler!")
# def respond_to_message(self, message):
# self.bot_worker.handle_incoming_message(message)

@@ -1,331 +0,0 @@
import logging
logger = logging.getLogger(__name__)
from peewee import *
import os
import markdown
import re
import configuration
import datetime
config = configuration.parsed["DOWNLOADS"]
slack_config = configuration.parsed["SLACK"]
## Helpers
chat_db = DatabaseProxy()
download_db = DatabaseProxy()
# set the nature of the db at runtime
class DownloadBaseModel(Model):
class Meta:
database = download_db
class ChatBaseModel(Model):
class Meta:
database = chat_db
## == Article related models == ##
class ArticleDownload(DownloadBaseModel):
title = CharField(default='')
pub_date = DateField(default = '')
download_date = DateField(default = datetime.date.today)
source_name = CharField(default = '')
article_url = TextField(default = '', unique=True)
archive_url = TextField(default = '')
file_name = TextField(default = '')
language = CharField(default = '')
summary = TextField(default = '')
comment = TextField(default = '')
verified = IntegerField(default = False)
# authors
# keywords
# ... are added through foreignkeys
def __str__(self) -> str:
if self.title != '' and self.source_name != '':
desc = f"{shorten_name(self.title)} -- {self.source_name}"
else:
desc = f"{self.article_url}"
return f"ART [{desc}]"
## Useful Properties
@property
def save_path(self):
return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
def fname_nas(self, file_name=""):
if self.download_date:
if file_name:
return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), file_name)
else: # return the self. name
return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), self.file_name)
else:
return None
@property
def fname_template(self):
if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
fname = "{} -- {}".format(self.source_name, self.title)
else:
fname = "{} -- {}.pdf".format(self.source_name, self.title)
return clear_path_name(fname)
@property
def is_title_bad(self): # add incrementally
return "PUR-Abo" in self.title \
or "Redirecting" in self.title \
or "Error while running fetch" in self.title
@property
def slack_info(self):
status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
content = "\n>" + "\n>".join(self.summary.split("\n"))
file_status, msg = self.file_status()
if not file_status:
return [msg]
# everything alright: generate real content
# first the base file
if self.file_name[-4:] == ".pdf":
answer = [{ # main reply with the base pdf
"reply_text" : f"*{self.title}*\n{status}\n{content}",
"file_path" : self.save_path + self.file_name
}]
else: # don't upload if the file is too big!
location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas())
answer = [{ # main reply with the base pdf
"reply_text" : "*{}*\n{}\n{}\n{}".format(self.title, status, content, location),
"file_path" : None
}]
# then the related files
rel_text = ""
for r in self.related:
fname = r.related_file_name
lentry = "\n• `{}` ".format(self.fname_nas(fname))
if fname[-4:] == ".pdf": # this is a manageable file, directly upload
f_ret = self.save_path + fname
answer.append({"reply_text":"", "file_path" : f_ret})
else: # not pdf <=> too large. Don't upload but mention its existence
lentry += "(not uploaded to slack, but the file will be on the NAS)"
rel_text += lentry
if rel_text:
rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
return answer
@property
def mail_info(self):
base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path":None}] + self.slack_info
return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
## Helpers
def set_keywords(self, keywords):
for k in keywords:
ArticleKeyword.create(
article = self,
keyword = k
)
def set_authors(self, authors):
for a in authors:
ArticleAuthor.create(
article = self,
author = a
)
def set_references(self, references):
for r in references:
ArticleReference.create(
article = self,
reference_url = r
)
def set_related(self, related):
for r in related:
ArticleRelated.create(
article = self,
related_file_name = r
)
def file_status(self):
if not self.file_name:
logger.error("Article {} has no filename!".format(self))
return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
file_path_abs = self.save_path + self.file_name
if not os.path.exists(file_path_abs):
logger.error("Article {} has a filename, but the file does not exist at that location!".format(self))
return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
return True, {}
class ArticleKeyword(DownloadBaseModel):
# instance gets created for every one keyword -> flexible in size
article = ForeignKeyField(ArticleDownload, backref='keywords')
keyword = CharField()
class ArticleAuthor(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='authors')
author = CharField()
class ArticleReference(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='references')
reference_url = TextField(default = '')
class ArticleRelated(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='related')
related_file_name = TextField(default = '')
## == Slack-thread related models == ##
class User(ChatBaseModel):
user_id = CharField(default='', unique=True)
# messages
class Thread(ChatBaseModel):
"""The threads that concern us are only created if the base massage contains a url"""
thread_ts = FloatField(default = 0)
article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
# provides, ts, user, models
# messages
@property
def slack_ts(self):
str_ts = str(self.thread_ts)
        cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals; if there are fewer, that's a problem!
return "{}{}".format(str_ts, cut_zeros*"0")
@property
def initiator_message(self):
try:
return self.messages[0] # TODO check if this needs sorting
except IndexError:
logger.warning(f"Thread {self} is empty. How can that be?")
return None
@property
def message_count(self):
# logger.warning("message_count was called")
return self.messages.count()
@property
def last_message(self):
messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
return messages[-1]
@property
def is_fully_processed(self) -> bool:
init_message = self.initiator_message
if init_message is None:
return False
if init_message.is_processed_override:
return True
# this override is set for instance, when no url was sent at all. Then set this thread to be ignored
reactions = init_message.reaction
if not reactions:
return False
else:
r = reactions[0].type # can and should only have one reaction
return r == "white_check_mark" \
or r == "x"
class Message(ChatBaseModel):
ts = FloatField(unique=True) #for sorting
channel_id = CharField(default='')
user = ForeignKeyField(User, backref="messages")
text = TextField(default='')
thread = ForeignKeyField(Thread, backref="messages", default=None)
file_type = CharField(default='')
perma_link = CharField(default='')
is_processed_override = BooleanField(default=False)
# reaction
def __str__(self) -> str:
return "MSG [{}]".format(shorten_name(self.text).replace('\n','/'))
@property
def slack_ts(self):
str_ts = str(self.ts)
cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals; if there are fewer, pad with zeros
return "{}{}".format(str_ts, cut_zeros * "0")
@property
def urls(self):
pattern = r"<(.*?)>"
matches = re.findall(pattern, self.text)
matches = [m for m in matches if "." in m]
new_matches = []
for m in matches:
if "." in m: # must contain a tld, right?
# further complication: slack automatically abbreviates urls in the format:
# <url|link preview>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
if "|" in m:
keep = m.split("|")[0]
else:
keep = m
new_matches.append(keep)
return new_matches
@property
def is_by_human(self):
return self.user.user_id != slack_config["bot_id"]
@property
def has_single_url(self):
return len(self.urls) == 1
class Reaction(ChatBaseModel):
type = CharField(default = "")
message = ForeignKeyField(Message, backref="reaction")
def create_tables():
with download_db:
download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated])
with chat_db:
chat_db.create_tables([User, Message, Thread, Reaction])
def set_db(chat_db_object, download_db_object):
chat_db.initialize(chat_db_object)
download_db.initialize(download_db_object)
create_tables()
def clear_path_name(path):
keepcharacters = (' ','.','_', '-')
converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()
return converted
def shorten_name(name, offset = 50):
if len(name) > offset:
return name[:offset] + "..."
else:
return name

View File

@@ -1,47 +0,0 @@
import os
import subprocess
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
import configuration
config = configuration.parsed["DOWNLOADS"]
shrink_sizes = []
def shrink_pdf(article):
article_loc = Path(article.save_path) / article.file_name
initial_size = article_loc.stat().st_size
compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
if article_loc.suffix != "pdf":
return article # it probably was a youtube video
c = subprocess.run(
[
"gs",
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/screen",
"-dNOPAUSE",
"-dBATCH",
f"-sOutputFile={compressed_tmp}",
f"{article_loc}"
],
stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if c.returncode == 0:
try:
os.replace(compressed_tmp, article_loc)
except OSError as e:
logger.error(f"Compression ran but I could not copy back the file {e}")
final_size = article_loc.stat().st_size
shrink_sizes.append(initial_size - final_size)
logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
else:
logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
return article

View File

@@ -1,174 +0,0 @@
import time
import datetime
import logging
import os
import base64
import requests
from selenium import webdriver
import configuration
import json
config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__)
# status-variable for restarting:
running = False
def start(self):
self.finish() # clear up
options = webdriver.FirefoxOptions()
options.profile = config["browser_profile_path"]
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
if os.getenv("HEADLESS", "false") == "true":
options.add_argument('--headless')
else:
self.logger.warning("Opening browser GUI because of 'HEADLESS=false'")
options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already
# TODO: this is not working right now
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
options.set_preference("browser.download.folderList", 2)
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# options.set_preference("pdfjs.disabled", True)
options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting gecko driver")
# previously, in a single docker image:
# self.driver = webdriver.Firefox(
# options = options,
# service = webdriver.firefox.service.Service(
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
# ))
self.driver = webdriver.Remote(
command_executor = 'http://geckodriver:4444',
options = options,
# can't set log path...
)
residues = os.listdir(config["default_download_path"])
for res in residues:
os.remove(os.path.join(config["default_download_path"], res))
self.running = True
def autostart(self):
if not self.running:
self.start() # relaunch the dl util
def finish(self):
if self.running:
self.logger.info("Exiting gecko driver")
try:
self.driver.quit()
time.sleep(10)
except:
self.logger.critical("Connection to the driver broke off")
self.running = False
else:
self.logger.info("Gecko driver not yet running")
def download(self, article_object):
sleep_time = 2
self.autostart()
url = article_object.article_url
try:
self.driver.get(url)
except Exception as e:
self.logger.critical("Selenium .get(url) failed with error {}".format(e))
self.finish()
return article_object # without changes
time.sleep(sleep_time)
# leave the page time to do any funky business
# in the mean time, get a page title if required
if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf", "")
# will be propagated to the saved file (dst) as well
fname = article_object.fname_template
dst = os.path.join(article_object.save_path, fname)
if os.path.exists(dst):
fname = make_path_unique(fname)
dst = os.path.join(article_object.save_path, fname)
if url[-4:] == ".pdf":
# according to the browser preferences, calling the url will open pdfjs.
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
success = self.get_exisiting_pdf(url, dst)
else:
success = self.get_new_pdf(dst)
if success:
article_object.file_name = fname
article_object.set_references(self.get_references())
else:
article_object.file_name = ""
return article_object # this change is saved later by the external caller
def get_exisiting_pdf(self, url, dst):
try:
r = requests.get(url)
bytes = r.content
except:
return False
return self.get_new_pdf(dst, other_bytes=bytes)
def get_new_pdf(self, dst, other_bytes=None):
os.makedirs(os.path.dirname(dst), exist_ok=True)
if other_bytes is None:
try:
result = self.driver.print_page()
bytes = base64.b64decode(result, validate=True)
except:
self.logger.error("Failed, probably because the driver went extinct.")
return False
else:
bytes = other_bytes
try:
with open(dst, "wb+") as f:
f.write(bytes)
return True
except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}")
return False
def get_references(self):
try:
hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
except:
hrefs = []
# len_old = len(hrefs)
hrefs = [h for h in hrefs \
if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
] # filter a tiny bit at least
# self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})")
return hrefs
def make_path_unique(path):
fname, ending = os.path.splitext(path)
fname += datetime.datetime.now().strftime("%d-%H%M%S")
return fname + ending

View File

@@ -0,0 +1,65 @@
import time
import os
import logging
import yaml
from peewee import SqliteDatabase, PostgresqlDatabase
from rich.logging import RichHandler
# first things first: logging
logging.basicConfig(
format='%(message)s',
level=logging.INFO,
datefmt='%H:%M:%S', # add %Y-%m-%d if needed
handlers=[RichHandler()]
)
logger = logging.getLogger(__name__)
# load config file containing constants and secrets
config_location = os.getenv("CONFIG_FILE")
with open(config_location, "r") as f:
config = yaml.safe_load(f)
# DEBUG MODE:
if os.getenv("DEBUG", "false") == "true":
logger.warning("Found 'DEBUG=true', setting up dummy databases")
config["slack"]["archive_id"] = config["slack"]["debug_id"]
config["mail"]["recipient"] = config["mail"]["sender"]
config["downloads"]["local_storage_path"] = config["downloads"]["debug_storage_path"]
download_db = SqliteDatabase(
config["database"]["debug_db"],
pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
)
# PRODUCTION MODE:
else:
logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
time.sleep(10) # wait for the vpn to connect (can't use a healthcheck because there is no depends_on)
cred = config["database"]
download_db = PostgresqlDatabase(
cred["production_db_name"], user=cred["production_user_name"], password=cred["production_password"], host="vpn", port=5432
)
# TODO Reimplement backup/printout
# logger.info("Backing up databases")
# backup_dst = main_config["DATABASE"]["db_backup"]
# today = datetime.today().strftime("%Y.%m.%d")
# shutil.copyfile(
# os.path.join(db_base_path, main_config["DATABASE"]["chat_db_name"]),
# os.path.join(backup_dst, today + "." + main_config["DATABASE"]["chat_db_name"]),
# )
# shutil.copyfile(
# os.path.join(db_base_path, main_config["DATABASE"]["download_db_name"]),
# os.path.join(backup_dst, today + "." + main_config["DATABASE"]["download_db_name"]),
# )
from utils_storage import models
# Set up the database connection (also creates tables if they don't exist)
models.set_db(download_db)
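# Hedged sketch of the structure this module expects in the YAML file behind CONFIG_FILE.
# The key names below are the ones read throughout this diff; every value is a placeholder.
example_config = {
    "slack": {
        "archive_id": "C0XXXXXXX",      # channel that gets archived
        "debug_id": "C0YYYYYYY",        # channel used when DEBUG=true
        "bot_id": "B0XXXXXXX",
        "auth_token": "xoxb-placeholder",
        "app_token": "xapp-placeholder",
        "api_wait_time": 30,            # seconds to back off after a rate limit
    },
    "mail": {
        "sender": "bot@example.org",
        "recipient": "me@example.org",
        "uname": "placeholder",
        "password": "placeholder",
        "smtp_server": "smtp.example.org",
        "port": 587,
    },
    "downloads": {
        "local_storage_path": "/path/to/files",
        "debug_storage_path": "/tmp/files",
        "remote_storage_path": "/path/on/nas",
        "browser_profile_path": "/path/to/firefox_profile",
        "browser_print_delay": 3,
    },
    "database": {
        "debug_db": "/tmp/downloads.db",
        "production_db_name": "placeholder",
        "production_user_name": "placeholder",
        "production_password": "placeholder",
    },
}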

View File

@@ -8,3 +8,6 @@ newspaper3k
htmldate
markdown
rich
psycopg2
unidecode
pyyaml

190
news_fetch/runner.py Normal file
View File

@@ -0,0 +1,190 @@
"""Main coordination of other util classes. Handles inbound and outbound calls"""
from time import sleep
import configuration
models = configuration.models
from threading import Thread
import logging
logger = logging.getLogger(__name__)
import sys
from collections import OrderedDict
from utils_mail import runner as MailRunner
from utils_slack import runner as SlackRunner
from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker
class ArticleWatcher:
"""Wrapper for a newly created article object. Notifies the coordinator upon change/completition"""
def __init__(self, article, workers_in, workers_out) -> None:
self.article = article
self.workers_in = workers_in
self.workers_out = workers_out
self.completition_notified = False
for w_dict in self.workers_in:
worker = self.get_next_worker(w_dict) # gets the first worker of each dict (they get processed independently)
worker.process(self)
def get_next_worker(self, worker_dict, worker_name=""):
"""Returns the worker coming after the one with key worker_name"""
if worker_name == "": # first one
return worker_dict[list(worker_dict.keys())[0]]
# for i,w_dict in enumerate(workers_list):
keys = list(worker_dict.keys())
next_key_ind = keys.index(worker_name) + 1
try:
key = keys[next_key_ind]
return worker_dict[key]
except IndexError:
return None
def update(self, worker_name):
"""Called by the workers to notify the watcher of a completed step"""
for w_dict in self.workers_in:
if worker_name in w_dict.keys():
next_worker = self.get_next_worker(w_dict, worker_name)
if next_worker:
if next_worker == "out":
self.completion_notifier()
else: # it's just another in-worker
next_worker.process(self)
else: # no next worker, we are done
logger.info(f"No worker after {worker_name}")
def completion_notifier(self):
"""Triggers the out-workers to process the article, that is to send out a message"""
for w_dict in self.workers_out:
worker = self.get_next_worker(w_dict)
worker.send(self.article)
self.article.sent = True
self.article.save()
def __str__(self) -> str:
return f"ArticleWatcher with id {self.article_id}"
class Dispatcher(Thread):
def __init__(self) -> None:
"""Thread to handle handle incoming requests and control the workers"""
self.workers_in = []
self.workers_out = []
super().__init__(target = self.launch)
def launch(self) -> None:
# start workers (each worker is a thread)
for w_dict in self.workers_in: # for reduced operations such as upload, some workers are not set
for w in w_dict.values():
if isinstance(w, Thread):
w.start()
# get all articles not fully processed
unsent = models.ArticleDownload.filter(sent = False) # if past messages have not been sent, they must be reevaluated
for a in unsent:
self.incoming_request(article=a)
def incoming_request(self, message=None, article=None):
"""This method is passed onto the slack worker. It then is called when a new message is received."""
if message is not None:
try:
url = message.urls[0] # ignore all the other ones
except IndexError:
return
article, is_new = models.ArticleDownload.get_or_create(article_url=url)
article.slack_ts = message.ts # either update the timestamp (to the last reference to the article) or set it for the first time
article.save()
elif article is not None:
is_new = False
logger.info(f"Received article {article} in incoming_request")
else:
logger.error("Dispatcher.incoming_request called with no arguments")
return
if is_new or (article.file_name == "" and article.verified == 0) \
or (not is_new and len(self.workers_in) == 1): # this is for upload
# check for models that were created but were abandoned. This means they have missing information, most importantly no associated file
# this overwrites previously set information, but that should not be too important
ArticleWatcher(
article,
workers_in=self.workers_in,
workers_out=self.workers_out,
)
else: # manually trigger notification immediately
logger.info(f"Found existing article {article}. Now sending")
class PrintWorker:
def __init__(self, action, sent = False) -> None:
self.action = action
self.sent = sent
def send(self, article):
print(f"{self.action} article {article}")
if self.sent:
article.sent = True
article.save()
def keep_alive(self): # keeps script running, because there is nothing else in the main thread
while True: sleep(1)
if __name__ == "__main__":
dispatcher = Dispatcher()
if "upload" in sys.argv:
articles = models.ArticleDownload.select().where((models.ArticleDownload.archive_url == "") | (models.ArticleDownload.archive_url == "TODO:UPLOAD")).execute() # peewee needs the bitwise |; python's "or" would silently drop the second condition
logger.info(f"Launching upload to archive for {len(articles)} articles.")
dispatcher.workers_in = [{"UploadWorker": UploadWorker()}]
print_worker = PrintWorker("Uploaded")
dispatcher.workers_out = [{"PrintWorker": print_worker}]
dispatcher.start()
for a in articles:
dispatcher.incoming_request(article=a)
print_worker.keep_alive()
else: # launch with full action
try:
slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
# All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
# fetch -> download (-> compress) -> complete
# This is reflected in the following list of workers:
workers_in = [
OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
OrderedDict({"UploadWorker": UploadWorker()})
]
# The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict
# Inside a dict, the order of the keys gives the order of execution (only when the first element is done, the second is called, etc...)
workers_out = [{"SlackRunner": slack_runner},{"MailRunner": MailRunner}]
dispatcher.workers_in = workers_in
dispatcher.workers_out = workers_out
dispatcher.start() # starts the thread, (ie. runs launch())
slack_runner.start() # last one to start, inside the main thread
except KeyboardInterrupt:
logger.info("Keyboard interrupt. Stopping Slack and dispatcher")
slack_runner.stop()
dispatcher.join()
for w_dict in workers_in:
for w in w_dict.values():
if isinstance(w, Thread):
w.stop()
# All threads are launched as a daemon thread, meaning that any 'leftover' should exit along with the sys call
sys.exit(0)
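# Hedged sketch of how the worker dicts above are traversed (the values here are illustrative
# strings, not the real worker objects): each OrderedDict is an independent pipeline that is
# walked key by key, and the sentinel value "out" hands the article over to the out-workers.
from collections import OrderedDict

pipeline = OrderedDict([("FetchWorker", "fetch"), ("DownloadWorker", "download"), ("NotifyRunner", "out")])

def next_stage(pipeline, current=""):
    keys = list(pipeline.keys())
    if current == "":                               # first stage of the pipeline
        return pipeline[keys[0]]
    try:
        return pipeline[keys[keys.index(current) + 1]]
    except IndexError:                              # current was the last stage
        return None

assert next_stage(pipeline) == "fetch"
assert next_stage(pipeline, "DownloadWorker") == "out"
assert next_stage(pipeline, "NotifyRunner") is None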

View File

@@ -7,22 +7,23 @@ import logging
import configuration
logger = logging.getLogger(__name__)
config = configuration.parsed["MAIL"]
mail_config = configuration.config["mail"]
def send(article_model):
mail = MIMEMultipart()
mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
mail['From'] = config["sender"]
mail['To'] = config["recipient"]
msgs = article_model.mail_info # this is html
msg = [m["reply_text"] for m in msgs]
msg = "\n".join(msg)
mail['From'] = mail_config["sender"]
mail['To'] = mail_config["recipient"]
try:
msg, files = article_model.mail_info() # this is html
except: # Raised by model if article has no associated file
logger.info("Skipping mail sending")
return
content = MIMEText(msg, "html")
mail.attach(content)
files = [m["file_path"] for m in msgs if m["file_path"]]
for path in files:
with open(path, 'rb') as file:
part = MIMEApplication(file.read(), "pdf")
@@ -31,10 +32,15 @@ def send(article_model):
mail.attach(part)
try:
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
try:
smtp = smtplib.SMTP(mail_config["smtp_server"], mail_config["port"])
except ConnectionRefusedError:
logger.error("Server refused connection. Is this an error on your side?")
return False
smtp.starttls()
smtp.login(config["uname"], config["password"])
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
smtp.login(mail_config["uname"], mail_config["password"])
smtp.sendmail(mail_config["sender"], mail_config["recipient"], mail.as_string())
smtp.quit()
logger.info("Mail successfully sent.")
except smtplib.SMTPException as e:

View File

@@ -0,0 +1,219 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler
from slack_sdk.errors import SlackApiError
import logging
import re
import time
import configuration
slack_config = configuration.config["slack"]
models = configuration.models
class MessageIsUnwanted(Exception):
# This exception is triggered when the message is either threaded (reply to another message) or weird (like an edit, a deletion, etc)
pass
class Message:
ts = str
user_id = str
text = str
logger = logging.getLogger(__name__)
def __init__(self, message_dict):
if message_dict.get("subtype", "not bad") == "message_changed":
raise MessageIsUnwanted()
if message_dict["type"] == "message":
if "thread_ts" in message_dict and (message_dict["thread_ts"] != message_dict["ts"]): # meaning it's a reply to another message
raise MessageIsUnwanted()
self.user_id = message_dict.get("user", "BAD USER")
# self.channel_id = config["archive_id"] # by construction, other messages are not intercepted
self.ts = message_dict["ts"]
self.text = message_dict["text"]
else:
self.logger.warning(f"What should I do of {message_dict}")
raise MessageIsUnwanted()
def __str__(self) -> str:
return f"MSG [{self.text}]"
@property
def urls(self):
pattern = r"<(.*?)>"
matches = re.findall(pattern, self.text)
matches = [m for m in matches if "." in m] # must contain a tld, right?
new_matches = []
for m in matches:
# further complication: slack automatically abbreviates urls in the format:
# <url|link preview>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
if "|" in m:
keep = m.split("|")[0]
else:
keep = m
new_matches.append(keep)
return new_matches
@property
def is_by_human(self):
return self.user.user_id != slack_config["bot_id"]
@property
def has_single_url(self):
return len(self.urls) == 1
class BotApp(App):
logger = logging.getLogger(__name__)
def __init__(self, callback, *args, **kwargs):
super().__init__(*args, **kwargs)
self.callback = callback
def pre_start(self):
missed_messages = self.fetch_missed_channel_messages()
[self.handle_incoming_message(m) for m in missed_messages]
self.startup_status()
def say_substitute(self, *args, **kwargs):
self.client.chat_postMessage(
channel=slack_config["archive_id"],
text=" - ".join(args),
**kwargs
)
def fetch_missed_channel_messages(self):
# latest processed message_ts is:
presaved = models.ArticleDownload.select().order_by(models.ArticleDownload.slack_ts.desc()).get_or_none()
if presaved is None:
last_ts = 0
else:
last_ts = presaved.slack_ts_full
result = self.client.conversations_history(
channel=slack_config["archive_id"],
oldest=last_ts
)
new_messages = result.get("messages", [])
# # filter the last one, it is a duplicate! (only if the db is not empty!)
# if last_ts != 0 and len(new_messages) != 0:
# new_messages.pop(-1)
return_messages = [Message(m) for m in new_messages]
refetch = result.get("has_more", False)
while refetch: # we have not actually fetched them all
try:
result = self.client.conversations_history(
channel = slack_config["archive_id"],
cursor = result["response_metadata"]["next_cursor"],
oldest = last_ts
) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
refetch = result.get("has_more", False)
new_messages = result.get("messages", [])
for m in new_messages:
return_messages.append(Message(m))
except SlackApiError: # Most likely a rate-limit
self.logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(slack_config["api_wait_time"]))
time.sleep(slack_config["api_wait_time"])
refetch = True
self.logger.info(f"Fetched {len(return_messages)} new channel messages.")
return return_messages
def handle_incoming_message(self, message, say=None):
"""Reacts to all messages inside channel archiving. This either gets called when catching up on missed messages (by pre_start()) or by the SocketModeHandler in 'live' mode"""
if isinstance(message, dict):
try:
message = Message(message)
except MessageIsUnwanted:
return False
self.logger.info(f"Handling message {message} ({len(message.urls)} urls)")
if len(message.urls) > 1:
self.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts)
self.callback(message = message)
def respond_channel_message(self, article, say=None):
if article.slack_ts == 0:
self.logger.error(f"{article} has no slack_ts")
else:
self.logger.info("Skipping slack reply.")
def startup_status(self):
"""Prints an overview of the articles. This needs to be called here because it should run after having fetched the newly sent messages"""
total = models.ArticleDownload.select().count()
to_be_processed = models.ArticleDownload.select().where(models.ArticleDownload.title == "").count()
unchecked = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).count()
bad = models.ArticleDownload.select().where(models.ArticleDownload.verified == -1).count()
not_uploaded = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").count()
self.logger.info(
f"[bold]NEWS-FETCH DATABASE STATUS[/bold]: Total entries: {total}; Not yet downloaded: {to_be_processed}; Not yet checked: {unchecked}; Not yet uploaded to archive: {not_uploaded}; Marked as bad: {bad}",
extra={"markup": True}
)
class BotRunner():
"""Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
logger = logging.getLogger(__name__)
def __init__(self, callback, *args, **kwargs) -> None:
self.bot_worker = BotApp(callback, token=slack_config["auth_token"])
@self.bot_worker.event(event="message", matchers=[is_message_in_archiving])
def handle_incoming_message(message, say):
return self.bot_worker.handle_incoming_message(message, say)
# @self.bot_worker.event(event="reaction_added", matchers=[is_reaction_in_archiving])
# def handle_incoming_reaction(event, say):
# return self.bot_worker.handle_incoming_reaction(event)
@self.bot_worker.event(event="event")
def handle_all_other_reactions(event, say):
self.logger.log("Ignoring slack event that isn't a message")
self.handler = SocketModeHandler(self.bot_worker, slack_config["app_token"])
def start(self):
self.bot_worker.pre_start()
self.handler.start()
def stop(self):
self.handler.close()
self.logger.info("Closed Slack-Socketmodehandler")
def send(self, article):
"""Proxy function to send a message to the slack channel, Called by ArticleWatcher once the Article is ready"""
self.bot_worker.respond_channel_message(article)
def is_message_in_archiving(message) -> bool:
return message["channel"] == slack_config["archive_id"]

View File

@@ -0,0 +1,14 @@
import unidecode
KEEPCHARACTERS = (' ','.','_', '-')
def clear_path_name(path):
path = unidecode.unidecode(path) # remove umlauts, accents and others
path = "".join([c if (c.isalnum() or c in KEEPCHARACTERS) else "_" for c in path]) # remove all non-alphanumeric characters
path = path.rstrip() # remove trailing spaces
return path
def shorten_name(name, offset = 50):
if len(name) > offset:
return name[:offset] + "..."
else:
return name
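# Quick illustration of the helpers above (the title is a made-up example); assumes it runs in
# the same module so that clear_path_name and shorten_name are in scope:
assert clear_path_name("Zürich: ein Überblick (2022).pdf") == "Zurich_ ein Uberblick _2022_.pdf"
assert shorten_name("A" * 60) == "A" * 50 + "..."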

View File

@@ -0,0 +1,179 @@
import logging
logger = logging.getLogger(__name__)
from peewee import *
import os
import markdown
import configuration
import datetime
from . import helpers
downloads_config = configuration.config["downloads"]
FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
# set the nature of the db at runtime
download_db = DatabaseProxy()
class DownloadBaseModel(Model):
class Meta:
database = download_db
## == Article related models == ##
class ArticleDownload(DownloadBaseModel):
# in the beginning this is all we have
article_url = TextField(default = '', unique=True)
# fetch then fills in the metadata
title = TextField(default='')
@property
def is_title_bad(self): # add incrementally
return "PUR-Abo" in self.title \
or "Redirecting" in self.title \
or "Error while running fetch" in self.title \
or self.title == ""
summary = TextField(default = '')
source_name = CharField(default = '')
language = CharField(default = '')
file_name = TextField(default = '')
@property
def save_path(self):
return f"{downloads_config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
# regular method rather than a @property: it optionally takes a file name, and a property cannot be called with an argument
def fname_nas(self, file_name=""):
if self.download_date:
if file_name:
return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
else: # return the self. name
return f"NAS: {downloads_config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
else:
return None
@property
def fname_template(self):
if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
fname = f"{self.source_name} -- {self.title}"
else:
fname = f"{self.source_name} -- {self.title}.pdf"
return helpers.clear_path_name(fname)
archive_url = TextField(default = '')
pub_date = DateField(default = datetime.date.fromtimestamp(0))
download_date = DateField(default = datetime.date.today)
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
@property
def slack_ts_full(self):
str_ts = str(self.slack_ts)
cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals
return f"{str_ts}{cut_zeros * '0'}"
sent = BooleanField(default = False)
archived_by = CharField(default = os.getenv("UNAME"))
# need to know who saved the message because the file needs to be on their computer in order to get verified
# verification happens in a different app, but the model has the fields here as well
comment = TextField(default = '')
verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad
# authors
# keywords
# ... are added through foreignkeys
# we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db
## Helpers specific to a single article
def __str__(self) -> str:
if self.title != '' and self.source_name != '':
desc = f"{helpers.shorten_name(self.title)} -- {self.source_name}"
else:
desc = f"{self.article_url}"
return f"ART [{desc}]"
def mail_info(self):
summary = "\n> " + "\n> ".join(self.summary.split("\n"))
answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
answer_files = []
# displays the summary in a blockquote
try:
self.ensure_file_present()
answer_text += f"*{self.title}*\n{summary}"
answer_files.append(self.save_path + self.file_name)
except Exception as e:
msg = e.args[0]
logger.error(f"Article {self} has file-issues: {msg}")
if "file too big" in msg:
location = f"File too big to send directly. Location on NAS:\n`{self.fname_nas}`"
answer_text += f"*{self.title}*\n{summary}\n{location}"
else: # file not found, or filename not set
raise e
# reraise the exception, so that the caller can handle it
# then the related files
if self.related:
rel_text = "Related files on NAS:"
for r in self.related:
fname = r.related_file_name
rel_text += f"\n• `{self.fname_nas(fname)}` "
answer_text += "\n\n" + rel_text
return markdown.markdown(answer_text), answer_files
def set_authors(self, authors):
for a in authors:
if len(a) < 100: # otherwise it's a mismatched string
ArticleAuthor.create(
article = self,
author = a
)
def set_related(self, related):
for r in related:
if len(r) > 255:
raise Exception("Related file name too long for POSTGRES")
ArticleRelated.create(
article = self,
related_file_name = r
)
def ensure_file_present(self):
if not self.file_name:
raise Exception("no filename")
file_path_abs = self.save_path + self.file_name
if not os.path.exists(file_path_abs):
raise Exception("file not found")
if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
raise Exception("file too big")
class ArticleAuthor(DownloadBaseModel):
article = ForeignKeyField(ArticleDownload, backref='authors')
author = CharField()
class ArticleRelated(DownloadBaseModel):
# Related files, such as the full text of a paper, audio files, etc.
article = ForeignKeyField(ArticleDownload, backref='related')
related_file_name = TextField(default = '')
def set_db(download_db_object):
download_db.initialize(download_db_object)
with download_db: # create tables (does nothing if they exist already)
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
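# Worked example (the timestamp is made up) of the zero-padding done by slack_ts_full above:
# Slack timestamps carry 6 decimals, but storing them in a FloatField drops trailing zeros,
# so they are re-appended before the value is handed back to the Slack API.
ts = 1662040000.1234                # what the FloatField would return
s = str(ts)                         # "1662040000.1234" -> only 4 decimals survive
padded = s + (6 - (len(s) - s.find(".") - 1)) * "0"
assert padded == "1662040000.123400"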

View File

@@ -0,0 +1,180 @@
import logging
import time
import datetime
import os, shutil, uuid
from pathlib import Path
import base64
import requests
from selenium import webdriver
import configuration
download_config = configuration.config["downloads"]
def driver_running(f):
def wrapper(*args, **kwargs):
self = args[0]
if not self._running:
self.start()
return f(*args, **kwargs)
return wrapper
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__)
_running = False
def start(self):
"""Called externally to start the driver, but after an exception can also be called internally"""
if self._running:
self.finish() # clear up
self.logger.info("Starting geckodriver")
reduced_path = self.create_tmp_profile()
profile = webdriver.FirefoxProfile(reduced_path)
options = webdriver.FirefoxOptions()
if os.getenv("DEBUG", "false") == "true":
self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
else:
options.add_argument('--headless')
self.driver = webdriver.Remote(
command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container
options = options,
browser_profile = profile
)
self._running = True
def finish(self):
self.logger.info("Exiting Geckodriver")
try:
self.driver.quit()
time.sleep(10)
except:
self.logger.critical("Connection to the driver broke off")
self._running = False
@driver_running
def download(self, article_object):
url = article_object.article_url
if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
self.logger.info("Downloading existing pdf")
success = self.get_exisiting_pdf(article_object)
# get a page title if required
if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf", "") # some titles end with .pdf
# will be propagated to the saved file (dst) as well
else:
success = self.get_new_pdf(article_object)
if not success:
self.logger.error("Download failed")
# TODO: need to reset the file name to empty?
return article_object # changes to this are saved later by the external caller
def get_exisiting_pdf(self, article_object):
# get a better page title if required
if article_object.is_title_bad:
article_object.title = article_object.article_url.split("/")[-1].split(".pdf")[0]
try:
r = requests.get(article_object.article_url)
bytes = r.content
except:
return False
return self.write_pdf(bytes, article_object)
def get_new_pdf(self, article_object):
sleep_time = int(download_config["browser_print_delay"])
try:
self.driver.get(article_object.article_url)
except Exception as e:
self.logger.critical("Selenium .get(url) failed with error {}".format(e))
self.finish()
return False
time.sleep(sleep_time)
# leave the page time to do any funky business
if article_object.is_title_bad:
article_object.title = self.driver.title
try:
result = self.driver.print_page()
bytes = base64.b64decode(result, validate=True)
except:
self.logger.error("Failed, probably because the driver went extinct.")
return False
return self.write_pdf(bytes, article_object)
def get_file_destination(self, article_object):
fname = article_object.fname_template
fname = ensure_unique(article_object.save_path, fname)
dst = os.path.join(article_object.save_path, fname)
return dst, fname
def write_pdf(self, content, article_object):
dst, fname = self.get_file_destination(article_object)
os.makedirs(os.path.dirname(dst), exist_ok=True)
try:
with open(dst, "wb+") as f:
f.write(content)
article_object.file_name = fname
return True
except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}")
return False
def create_tmp_profile(self, full_profile_path: Path = Path(download_config["browser_profile_path"])) -> Path:
reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
os.mkdir(reduced_profile_path)
# copy needed directories
dirs = ["extensions", "storage"]
for dir in dirs:
shutil.copytree(full_profile_path / dir, reduced_profile_path / dir)
# copy needed files
files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"]
for f in files:
shutil.copy(full_profile_path / f, reduced_profile_path)
folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
return reduced_profile_path
def ensure_unique(path, fname):
fbase, ending = os.path.splitext(fname)
exists = os.path.exists(os.path.join(path, fname))
i = 1
while exists:
fname = fbase + f" -- fetch {i}" + ending
i += 1
exists = os.path.exists(os.path.join(path, fname))
return fname
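# Illustration (file names are hypothetical) of ensure_unique above, assuming it runs inside
# this module: a clashing name gets a " -- fetch N" suffix inserted before the extension.
import pathlib, tempfile

with tempfile.TemporaryDirectory() as d:
    pathlib.Path(d, "NZZ -- Some title.pdf").touch()
    assert ensure_unique(d, "NZZ -- Some title.pdf") == "NZZ -- Some title -- fetch 1.pdf"
    assert ensure_unique(d, "Other title.pdf") == "Other title.pdf"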

View File

@@ -1,11 +1,11 @@
from __future__ import unicode_literals
import youtube_dl
import os
import logging
import configuration
download_config = configuration.config["downloads"]
logger = logging.getLogger(__name__)
class MyLogger(object):
def debug(self, msg): pass
def warning(self, msg): pass
@@ -20,7 +20,6 @@ class YouTubeDownloader:
def post_download_hook(self, ret_code):
# print(ret_code)
if ret_code['status'] == 'finished':
file_loc = ret_code["filename"]
fname = os.path.basename(file_loc)
@@ -36,9 +35,11 @@ class YouTubeDownloader:
ydl_opts = {
'format': 'best[height<=720]',
'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
'logger': MyLogger(),
'logger': MyLogger(), # supress verbosity
'progress_hooks': [self.post_download_hook],
'updatetime': False
'updatetime': False,
# File is also used by firefox so make sure to not write to it!
# youtube dl apparently does not support cookies.sqlite and the documentation is not clear on how to use cookies.txt
}
try:
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
@@ -47,5 +48,9 @@ class YouTubeDownloader:
except Exception as e:
logger.error(f"Youtube download crashed: {e}")
article_object.file_name = ""
logfile = os.path.join(download_config["local_storage_path"], "failed_downloads.csv")
logger.info(f"Logging youtube errors seperately to {logfile}")
with open(logfile, "a+") as f:
f.write(f"{url}\n")
return article_object

View File

@@ -53,10 +53,5 @@ def get_description(article_object):
article_object.set_authors(news_article.authors)
except AttributeError:
pass # list would have been empty anyway
try:
article_object.set_keywords(news_article.keywords)
except AttributeError:
pass # list would have been empty anyway
return article_object

View File

@@ -1,4 +1,3 @@
import time
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import logging
logger = logging.getLogger(__name__)

View File

@@ -7,15 +7,13 @@ class TemplateWorker(Thread):
"""Parent class for any subsequent worker of the article-download pipeline. They should all run in parallel, thus the Thread subclassing"""
logger = logging.getLogger(__name__)
def __init__(self, *args, **kwargs) -> None:
def __init__(self, **kwargs) -> None:
target = self._queue_processor # will be executed on Worker.start()
group = kwargs.get("group", None)
name = kwargs.get("name", None)
super().__init__(group=group, target=target, name=name)
self.keep_running = True
super().__init__(target=target, daemon=True)
self._article_queue = []
self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully")
def process(self, article_watcher):
self._article_queue.append(article_watcher)#.article_model.article_url)
@@ -23,7 +21,7 @@ class TemplateWorker(Thread):
def _queue_processor(self):
"""This method is launched by thread.run() and idles when self._article_queue is empty. When an external caller appends to the queue it jumps into action"""
while True: # PLEASE tell me if I'm missing an obvious better way of doing this!
while self.keep_running: # PLEASE tell me if I'm missing an obvious better way of doing this!
if len(self._article_queue) == 0:
time.sleep(5)
else:
@@ -39,3 +37,10 @@ class TemplateWorker(Thread):
article = article_watcher.article
article = action(article) # action updates the article object but does not save the change
article.save()
article_watcher.update(self.__class__.__name__)
def stop(self):
self.logger.info(f"Stopping worker {self.__class__.__name__} whith {len(self._article_queue)} articles left in queue")
self.keep_running = False
self.join()
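# One possible answer to the "better way" question in _queue_processor above (a hedged sketch,
# not what the repository does): a blocking queue.Queue removes the sleep/poll loop and the
# keep_running flag entirely.
import queue, threading

class QueueWorker(threading.Thread):
    def __init__(self):
        super().__init__(daemon=True)
        self._queue = queue.Queue()

    def process(self, article_watcher):
        self._queue.put(article_watcher)

    def run(self):
        while True:
            watcher = self._queue.get()     # blocks until an item arrives
            if watcher is None:             # sentinel pushed by stop()
                break
            # ... handle the article here ...

    def stop(self):
        self._queue.put(None)
        self.join()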

View File

@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf
import time
import logging
@@ -25,7 +25,7 @@ class DownloadWorker(TemplateWorker):
action = self.dl_runner
super()._handle_article(article_watcher, action)
article_watcher.download_completed = True
# article_watcher.download_completed = True
@@ -36,7 +36,7 @@ class FetchWorker(TemplateWorker):
def _handle_article(self, article_watcher):
action = get_description # function
super()._handle_article(article_watcher, action)
article_watcher.fetch_completed = True
# article_watcher.fetch_completed = True
@@ -52,15 +52,4 @@ class UploadWorker(TemplateWorker):
return run_upload(*args, **kwargs)
super()._handle_article(article_watcher, action)
article_watcher.upload_completed = True
class CompressWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = shrink_pdf
super()._handle_article(article_watcher, action)
article_watcher.compression_completed = True
# article_watcher.upload_completed = True