From db161e50c8017ff01b701a38fe4597af9dc63440 Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Sun, 18 Sep 2022 19:26:55 +0200 Subject: [PATCH] Switched from geckodriver to chrome --- README.md | 16 +++++- chrome/change_configuration.sh | 16 ++++++ docker-compose.yaml | 13 +++-- env/debug | 3 +- env/production | 2 +- launch | 16 ++++-- misc/sample_config/news_fetch.config.ini | 3 +- misc/youtube_batch.py | 56 +++++++++++++++++++++ news_check/client/src/ArticleStatus.svelte | 11 ++-- news_fetch/Dockerfile | 7 --- news_fetch/utils_worker/download/browser.py | 51 +++++++------------ news_fetch/utils_worker/download/youtube.py | 1 - news_fetch/utils_worker/upload/runner.py | 1 - 13 files changed, 135 insertions(+), 61 deletions(-) create mode 100644 chrome/change_configuration.sh create mode 100644 misc/youtube_batch.py diff --git a/README.md b/README.md index 3acebb3..c33120f 100644 --- a/README.md +++ b/README.md @@ -76,10 +76,24 @@ docker compose --env-file env/production logs -f news_fetch # follows along with docker compose --env-file env/production down ``` +### First run: +> The program relies on a functioning chrome profile! + +For the first run ever, run + +`./launch init` + +This will generate a new chrome profile under `coss_archiving/dependencies/news_fetch.profile`. +You can then go to [http://localhost:7900](http://localhost:7900) in your browser. Verify the profile (under chrome://profile-internals). + +Now install two addons: Idontcareaboutcookies (from chrome://extensions) and Bypass Paywalls (from https://github.com/iamadamdev/bypass-paywalls-chrome). The init script has already downloaded and unpacked the latter, so just enable developer mode, click "Load unpacked", navigate to `/user_data/news_fetch.profile` and select the directory `bypass-paywalls-chrome-master`. + +Whenever you need to make changes to the profile, for instance to log in to websites again, just rerun `./launch init`. + ## Building -> The software (firefox, selenium, python) changes frequently.
For non-breaking changes it is useful to regularly re build the docker image! This is also crucial to update the code itself. +> The software **will** change. Because the images referenced in docker compose are usually the `latest` ones, it is sufficient to update the containers. In docker compose, run diff --git a/chrome/change_configuration.sh b/chrome/change_configuration.sh new file mode 100644 index 0000000..c4fa27b --- /dev/null +++ b/chrome/change_configuration.sh @@ -0,0 +1,16 @@ +if [ -d "/user_data/news_fetch.profile" ] +then + echo "Profile already exists, skipping creation" +else + google-chrome & + sleep 5 + cp -r /home/seluser/.config/google-chrome/Default /user_data/news_fetch.profile + PID=$(pidof chrome) + echo "Now killing processes with pid:" $PID + kill $PID + cd /user_data/news_fetch.profile + wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip + unzip master +fi + +google-chrome --user-data-dir=/user_data/news_fetch.profile \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index e8b811e..69dbae5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -33,13 +33,17 @@ services: - /sync/nas_sync.config - geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) - image: ${GECKODRIVER_IMG} + chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) + image: selenium/standalone-chrome:latest shm_size: 2gb environment: - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. 
for websites that crash) - START_XVFB=${HEADFULL-false} - SE_VNC_NO_PASSWORD=1 + volumes: + - ${CONTAINER_DATA}/dependencies:/user_data + - ${CODE:-/dev/null}:/code + user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user expose: ["4444"] # exposed to other docker-compose services only ports: - 7900:7900 # port for webvnc @@ -59,7 +63,7 @@ services: depends_on: # when using docker compose run news_fetch, the dependencies are started as well - nas_sync - - geckodriver + - chrome - db_passthrough volumes: @@ -68,6 +72,7 @@ services: environment: - DEBUG=${DEBUG} - UNAME=${UNAME} + user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile # stdin_open: ${INTERACTIVE:-false} # docker run -i # tty: ${INTERACTIVE:-false} # docker run -t @@ -76,7 +81,7 @@ services: news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such) build: news_check image: news_check:latest - user: 1000:1000 # since the app writes files to the local filesystem, it must be run as the current user + user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user depends_on: - db_passthrough volumes: diff --git a/env/debug b/env/debug index 104a910..9811c57 100644 --- a/env/debug +++ b/env/debug @@ -1,9 +1,8 @@ # Runs in a debugging mode, does not launch anything at all but starts a bash process -export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving +export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy -export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 export DEBUG=true export HEADFULL=true export CODE=./ diff --git a/env/production b/env/production index 26eee70..c7f14d5 100644 --- a/env/production +++ 
b/env/production @@ -3,5 +3,5 @@ CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy -export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 +export U_ID=1000 export DEBUG=false diff --git a/launch b/launch index 34d0c1d..728ad95 100644 --- a/launch +++ b/launch @@ -8,9 +8,7 @@ echo "Bash script launching COSS_ARCHIVING..." # CHANGE ME ONCE! export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export UNAME=remy -# CHANGE ME WHEN UPDATING FIREFOX -export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 -# version must be >= than the one on the host or firefox will not start (because of mismatched config) +export U_ID=1000 if [[ $1 == "debug" ]] then @@ -18,8 +16,8 @@ then export HEADFULL=true export CODE=./ export ENTRYPOINT=/bin/bash - # since service ports does not open ports on implicitly started containers, also start geckodriver: - docker compose up -d geckodriver + # since service ports does not open ports on implicitly started containers, also start chrome: + docker compose up -d chrome elif [[ $1 == "production" ]] then export DEBUG=false @@ -32,6 +30,14 @@ elif [[ $1 == "down" ]] then docker compose stop exit 0 +elif [[ $1 == "init" ]] +then + export CODE=./ + export HEADFULL=true + + docker compose up -d chrome + sleep 5 + docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh else echo "Please specify the execution mode (debug/production/build) as the first argument" exit 1 diff --git a/misc/sample_config/news_fetch.config.ini b/misc/sample_config/news_fetch.config.ini index 76c31a7..e8de2e9 100644 --- a/misc/sample_config/news_fetch.config.ini +++ b/misc/sample_config/news_fetch.config.ini @@ -26,5 +26,4 @@ local_storage_path: /app/containerdata/files debug_storage_path: /app/containerdata/debug/ default_download_path: /app/containerdata/tmp remote_storage_path: /helbing_support/Files RM/Archiving -browser_profile_path: /app/containerdata/dependencies/7hlyfqxt.Auto News 
-blacklisted_href_domains: ["google.", "facebook."] +browser_profile_path: /user_data/news_fetch.profile diff --git a/misc/youtube_batch.py b/misc/youtube_batch.py new file mode 100644 index 0000000..c2304f5 --- /dev/null +++ b/misc/youtube_batch.py @@ -0,0 +1,56 @@ +import youtube_dl +from waybackpy import WaybackMachineSaveAPI # upload to archive.org +import time + + +urls = [ + "https://www.youtube.com/watch?v=R4h_yiDIuQE", + "https://www.youtube.com/watch?v=-G8ZI1Jq8xA", + "https://www.youtube.com/watch?v=8eYBcASQIQI", + "https://www.thingiverse.com/thing:5463267", + "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s", + "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s", + "https://www.youtube.com/watch?v=bQQn_vET4ys", + "https://www.youtube.com/watch?v=6FqNctiO06E", + "https://www.youtube.com/watch?v=ImnuJgj8XJo", + "https://www.youtube.com/watch?v=4QZQtSqaC34", + "https://www.youtube.com/watch?v=cW4qIjPMGkQ", + "https://www.youtube.com/watch?v=QWsUGpKfP8A", + "https://www.youtube.com/watch?v=a0PwEwLG9No", + "https://www.youtube.com/watch?v=Hd3lnWVIIpo", + "https://www.youtube.com/watch?v=JNtdAp-BdzI", + "https://en.wikipedia.org/wiki/Viktor_Schauberger", + "https://de.wikipedia.org/wiki/Viktor_Schauberger", +] +def post_download_hook(ret_code): + # print(ret_code) + if ret_code['status'] == 'finished': + file_loc = ret_code["filename"] + print(file_loc) + + +def save_video(url): + """Saves the video at url (at most 720p) using the default youtube_dl output name""" + ydl_opts = { + 'format': 'best[height<=720]', + # 'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download + 'progress_hooks': [post_download_hook], + 'updatetime': False + } + try: + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + # the resulting file name is printed by post_download_hook + except Exception as e: + print(f"Youtube download crashed: {e}") + + +# for url in urls: +# save_video(url) + +for url in urls: + user_agent =
"Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? + wayback = WaybackMachineSaveAPI(url, user_agent) + archive_url = wayback.save() + print(archive_url) + time.sleep(20) diff --git a/news_check/client/src/ArticleStatus.svelte b/news_check/client/src/ArticleStatus.svelte index 6c426dc..97bbd3a 100644 --- a/news_check/client/src/ArticleStatus.svelte +++ b/news_check/client/src/ArticleStatus.svelte @@ -9,12 +9,15 @@ {name: 'Language', value: article_data.language}, {name: 'Authors', value: article_data.authors}, {name: "Related", value: article_data.related}, + {name: "Sent", value: article_data.sent}, ]
@@ -31,9 +34,9 @@ {#each status_items as item} { item.name } - {#if item.value != ""} + {#if (item.value != "" || item.value === false) } {#if item.name == "Url"} - { item.value } + { item.value } {:else} { item.value } {/if} diff --git a/news_fetch/Dockerfile b/news_fetch/Dockerfile index 58298f0..c9e6da6 100644 --- a/news_fetch/Dockerfile +++ b/news_fetch/Dockerfile @@ -5,15 +5,8 @@ ENV TZ Europe/Zurich RUN apt-get update && apt-get install -y ghostscript # for compression of pdfs -# RUN useradd --create-home --shell /bin/bash --uid 1001 autonews -# id mapped to local user -# home directory needed for pip package installation -# RUN export PATH=/home/autonews/.local/bin:$PATH - RUN mkdir -p /app/auto_news -# RUN chown -R autonews:autonews /app -# USER autonews COPY requirements.txt /app/requirements.txt RUN python3 -m pip install -r /app/requirements.txt diff --git a/news_fetch/utils_worker/download/browser.py b/news_fetch/utils_worker/download/browser.py index 068f16c..38f95d3 100644 --- a/news_fetch/utils_worker/download/browser.py +++ b/news_fetch/utils_worker/download/browser.py @@ -6,10 +6,8 @@ import base64 import requests from selenium import webdriver import configuration -import json config = configuration.main_config["DOWNLOADS"] -blacklisted = json.loads(config["blacklisted_href_domains"]) class PDFDownloader: @@ -21,42 +19,31 @@ class PDFDownloader: def start(self): self.finish() # clear up - options = webdriver.FirefoxOptions() - options.profile = config["browser_profile_path"] - # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work + options = webdriver.ChromeOptions() + options.add_argument(f"user-data-dir={config['browser_profile_path']}") + options.add_argument('--headless') - if os.getenv("DEBUG", "false") == "true": - self.logger.warning("Opening browser GUI because of 'DEBUG=true'") - else: - options.add_argument('--headless') + # if os.getenv("DEBUG", "false") == "true": +
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'") + # else: - options.set_preference('print.save_as_pdf.links.enabled', True) - # Just save if the filetype is pdf already - # TODO: this is not working right now + # options.set_preference('print.save_as_pdf.links.enabled', True) + # # Just save if the filetype is pdf already + # # TODO: this is not working right now - options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) - options.set_preference("browser.download.folderList", 2) - # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") - # options.set_preference("pdfjs.disabled", True) - options.set_preference("browser.download.dir", config["default_download_path"]) + # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) + # options.set_preference("browser.download.folderList", 2) + # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") + # # options.set_preference("pdfjs.disabled", True) + # options.set_preference("browser.download.dir", config["default_download_path"]) - self.logger.info("Starting gecko driver") - # peviously, in a single docker image: - # self.driver = webdriver.Firefox( - # options = options, - # service = webdriver.firefox.service.Service( - # log_path = f'{config["local_storage_path"]}/geckodriver.log' - # )) + self.logger.info("Starting chrome driver") self.driver = webdriver.Remote( - command_executor = 'http://geckodriver:4444', + command_executor = 'http://chrome:4444', # the host chrome points to the chrome container options = options, # can't set log path... 
) - residues = os.listdir(config["default_download_path"]) - for res in residues: - os.remove(os.path.join(config["default_download_path"], res)) - self.running = True def autostart(self): @@ -65,7 +52,7 @@ class PDFDownloader: def finish(self): if self.running: - self.logger.info("Exiting gecko driver") + self.logger.info("Exiting chrome driver") try: self.driver.quit() time.sleep(10) @@ -73,7 +60,7 @@ class PDFDownloader: self.logger.critical("Connection to the driver broke off") self.running = False else: - self.logger.info("Gecko driver not yet running") + self.logger.info("Chrome driver not yet running") def download(self, article_object): sleep_time = 2 @@ -153,8 +140,6 @@ class PDFDownloader: - - def make_path_unique(path): fname, ending = os.path.splitext(path) fname += datetime.datetime.now().strftime("%d-%H%M%S") diff --git a/news_fetch/utils_worker/download/youtube.py b/news_fetch/utils_worker/download/youtube.py index 77a34ff..a16305b 100644 --- a/news_fetch/utils_worker/download/youtube.py +++ b/news_fetch/utils_worker/download/youtube.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import youtube_dl import os import logging diff --git a/news_fetch/utils_worker/upload/runner.py b/news_fetch/utils_worker/upload/runner.py index f72d6f3..b02f5e0 100644 --- a/news_fetch/utils_worker/upload/runner.py +++ b/news_fetch/utils_worker/upload/runner.py @@ -1,4 +1,3 @@ -import time from waybackpy import WaybackMachineSaveAPI # upload to archive.org import logging logger = logging.getLogger(__name__)