diff --git a/README.md b/README.md index 3acebb3..c33120f 100644 --- a/README.md +++ b/README.md @@ -76,10 +76,24 @@ docker compose --env-file env/production logs -f news_fetch # follows along with docker compose --env-file env/production down ``` +### First run: +> The program relies on a functioning chrome profile! + +For the first run ever, run + +`./launch init` + +This will generate a new chrome profile under `coss_archiving/dependencies/news_fetch.profile`. +You can then go to [http://localhost:7900](http://localhost:7900) in your browser. Verify the profile (under chrome://profile-internals). + +Now install two addons: Idontcareaboutcookies (from chrome://extensions) and Bypass Paywalls (from https://github.com/iamadamdev/bypass-paywalls-chrome). The script already downloaded the file, so just enable developer mode, click "Load unpacked", go to `/user_data/news_fetch.profile`, and select the directory `bypass-paywalls-chrome-master`. + +Whenever you need to make changes to the profile, for instance re-log in to websites, just rerun `./launch init`. + ## Building -> The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly re build the docker image! This is also crucial to update the code itself. +> The software **will** change. Because the images referenced in docker compose are usually the `latest` ones, it is sufficient to update the containers. 
In docker compose, run diff --git a/chrome/change_configuration.sh b/chrome/change_configuration.sh new file mode 100644 index 0000000..c4fa27b --- /dev/null +++ b/chrome/change_configuration.sh @@ -0,0 +1,16 @@ +if [ -d "/user_data/news_fetch.profile" ] +then + echo "Profile already exists, skipping creation" +else + google-chrome & + sleep 5 + cp -r /home/seluser/.config/google-chrome/Default /user_data/news_fetch.profile + PID=$(pidof chrome) + echo "Now killing processes with pid:" $PID + kill $PID + cd /user_data/news_fetch.profile + wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip + unzip master +fi + +google-chrome --user-data-dir=/user_data/news_fetch.profile \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index e8b811e..69dbae5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -33,13 +33,17 @@ services: - /sync/nas_sync.config - geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) - image: ${GECKODRIVER_IMG} + chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) + image: selenium/standalone-chrome:latest shm_size: 2gb environment: - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. 
for websites that crash) - START_XVFB=${HEADFULL-false} - SE_VNC_NO_PASSWORD=1 + volumes: + - ${CONTAINER_DATA}/dependencies:/user_data + - ${CODE:-/dev/null}:/code + user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user expose: ["4444"] # exposed to other docker-compose services only ports: - 7900:7900 # port for webvnc @@ -59,7 +63,7 @@ services: depends_on: # when using docker compose run news_fetch, the dependencies are started as well - nas_sync - - geckodriver + - chrome - db_passthrough volumes: @@ -68,6 +72,7 @@ services: environment: - DEBUG=${DEBUG} - UNAME=${UNAME} + user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile # stdin_open: ${INTERACTIVE:-false} # docker run -i # tty: ${INTERACTIVE:-false} # docker run -t @@ -76,7 +81,7 @@ services: news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such) build: news_check image: news_check:latest - user: 1000:1000 # since the app writes files to the local filesystem, it must be run as the current user + user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user depends_on: - db_passthrough volumes: diff --git a/env/debug b/env/debug index 104a910..9811c57 100644 --- a/env/debug +++ b/env/debug @@ -1,9 +1,8 @@ # Runs in a debugging mode, does not launch anything at all but starts a bash process -export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving +export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy -export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 export DEBUG=true export HEADFULL=true export CODE=./ diff --git a/env/production b/env/production index 26eee70..c7f14d5 100644 --- a/env/production +++ 
b/env/production @@ -3,5 +3,5 @@ CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy -export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 +export U_ID=1000 export DEBUG=false diff --git a/launch b/launch index 34d0c1d..728ad95 100644 --- a/launch +++ b/launch @@ -8,9 +8,7 @@ echo "Bash script launching COSS_ARCHIVING..." # CHANGE ME ONCE! export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy -# CHANGE ME WHEN UPDATING FIREFOX -export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 -# version must be >= than the one on the host or firefox will not start (because of mismatched config) +export U_ID=1000 if [[ $1 == "debug" ]] then @@ -18,8 +16,8 @@ then export HEADFULL=true export CODE=./ export ENTRYPOINT=/bin/bash - # since service ports does not open ports on implicitly started containers, also start geckodriver: - docker compose up -d geckodriver + # since service ports does not open ports on implicitly started containers, also start chrome: + docker compose up -d chrome elif [[ $1 == "production" ]] then export DEBUG=false @@ -32,6 +30,14 @@ elif [[ $1 == "down" ]] then docker compose stop exit 0 +elif [[ $1 == "init" ]] +then + export CODE=./ + export HEADFULL=true + + docker compose up -d chrome + sleep 5 + docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh else echo "Please specify the execution mode (debug/production/build) as the first argument" exit 1 diff --git a/misc/sample_config/news_fetch.config.ini b/misc/sample_config/news_fetch.config.ini index 76c31a7..e8de2e9 100644 --- a/misc/sample_config/news_fetch.config.ini +++ b/misc/sample_config/news_fetch.config.ini @@ -26,5 +26,4 @@ local_storage_path: /app/containerdata/files debug_storage_path: /app/containerdata/debug/ default_download_path: /app/containerdata/tmp remote_storage_path: /helbing_support/Files RM/Archiving -browser_profile_path: /app/containerdata/dependencies/7hlyfqxt.Auto News 
-blacklisted_href_domains: ["google.", "facebook."] +browser_profile_path: /user_data/news_fetch.profile diff --git a/misc/youtube_batch.py b/misc/youtube_batch.py new file mode 100644 index 0000000..c2304f5 --- /dev/null +++ b/misc/youtube_batch.py @@ -0,0 +1,56 @@ +import youtube_dl +from waybackpy import WaybackMachineSaveAPI # upload to archive.org +import time + + +urls = [ + "https://www.youtube.com/watch?v=R4h_yiDIuQE", + "https://www.youtube.com/watch?v=-G8ZI1Jq8xA", + "https://www.youtube.com/watch?v=8eYBcASQIQI", + "https://www.thingiverse.com/thing:5463267", + "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s", + "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s", + "https://www.youtube.com/watch?v=bQQn_vET4ys", + "https://www.youtube.com/watch?v=6FqNctiO06E", + "https://www.youtube.com/watch?v=ImnuJgj8XJo", + "https://www.youtube.com/watch?v=4QZQtSqaC34", + "https://www.youtube.com/watch?v=cW4qIjPMGkQ", + "https://www.youtube.com/watch?v=QWsUGpKfP8A", + "https://www.youtube.com/watch?v=a0PwEwLG9No", + "https://www.youtube.com/watch?v=Hd3lnWVIIpo", + "https://www.youtube.com/watch?v=JNtdAp-BdzI", + "https://en.wikipedia.org/wiki/Viktor_Schauberger", + "https://de.wikipedia.org/wiki/Viktor_Schauberger", +] +def post_download_hook(ret_code): + # print(ret_code) + if ret_code['status'] == 'finished': + file_loc = ret_code["filename"] + print(file_loc) + + +def save_video(url): + """Saves video according to url and save path""" + ydl_opts = { + 'format': 'best[height<=720]', + # 'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download + 'progress_hooks': [post_download_hook], + 'updatetime': False + } + try: + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + # article file name is updated in self.post_download_hook + except Exception as e: + print(f"Youtube download crashed: {e}") + + +# for url in urls: +# save_video(url) + +for url in urls: + user_agent = 
"Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? + wayback = WaybackMachineSaveAPI(url, user_agent) + archive_url = wayback.save() + print(archive_url) + time.sleep(20) diff --git a/news_check/client/src/ArticleStatus.svelte b/news_check/client/src/ArticleStatus.svelte index 6c426dc..97bbd3a 100644 --- a/news_check/client/src/ArticleStatus.svelte +++ b/news_check/client/src/ArticleStatus.svelte @@ -9,12 +9,15 @@ {name: 'Language', value: article_data.language}, {name: 'Authors', value: article_data.authors}, {name: "Related", value: article_data.related}, + {name: "Sent", value: article_data.sent}, ]