Switched from geckodriver to chrome

This commit is contained in:
Remy Moll 2022-09-18 19:26:55 +02:00
parent 7cf7422b46
commit db161e50c8
13 changed files with 135 additions and 61 deletions

View File

@ -76,10 +76,24 @@ docker compose --env-file env/production logs -f news_fetch # follows along with
docker compose --env-file env/production down docker compose --env-file env/production down
``` ```
### First run:
> The program relies on a functioning chrome profile!
For the first run ever, run
`./launch init`
This will generate a new chrome profile under `coss_archiving/dependencies/news_fetch.profile`.
You can then go to [http://localhost:7900](http://localhost:7900) in your browser. Verify the profile (under chrome://profile-internals).
Now install two add-ons: "I don't care about cookies" (enable it via chrome://extensions) and "Bypass Paywalls" (from https://github.com/iamadamdev/bypass-paywalls-chrome). The script already downloaded the latter, so just enable developer mode, click "Load unpacked", navigate to `/user_data/dependencies/news_fetch.profile`, and select the directory `bypass-paywalls-chrome-master`.
Whenever you need to make changes to the profile, for instance re-log in to websites, just rerun `./launch init`.
## Building ## Building
> The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly re build the docker image! This is also crucial to update the code itself. > The software **will** change. Because the images referenced in docker compose are usually the `latest` ones, it is sufficient to update the containers.
In docker compose, run In docker compose, run

View File

@ -0,0 +1,16 @@
#!/bin/bash
# One-time Chrome profile bootstrap for the selenium/standalone-chrome container:
# creates a reusable profile under /user_data/news_fetch.profile, pre-downloads
# the Bypass Paywalls extension into it, then launches Chrome with that profile
# so the operator can finish configuration (log-ins, loading the add-ons).

PROFILE="/user_data/news_fetch.profile"

if [ -d "$PROFILE" ]
then
    echo "Profile already exists, skipping creation"
else
    # Start Chrome once so it writes its initial default profile, then copy
    # that profile out to the shared volume.
    google-chrome &
    sleep 5  # give Chrome time to create /home/seluser/.config/google-chrome/Default
    cp -r /home/seluser/.config/google-chrome/Default "$PROFILE"
    PID=$(pidof chrome)
    echo "Now killing processes with pid:" $PID
    kill $PID
    # Fetch the Bypass Paywalls extension into the profile directory so it can
    # be loaded unpacked from chrome://extensions. Abort if the cd fails so we
    # never download/unzip into the wrong directory.
    cd "$PROFILE" || exit 1
    wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip
    unzip master.zip
fi

# Hand the prepared profile to an interactive Chrome session for manual tweaks.
google-chrome --user-data-dir="$PROFILE"

View File

@ -33,13 +33,17 @@ services:
- /sync/nas_sync.config - /sync/nas_sync.config
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
image: ${GECKODRIVER_IMG} image: selenium/standalone-chrome:latest
shm_size: 2gb shm_size: 2gb
environment: environment:
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash) - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
- START_XVFB=${HEADFULL-false} - START_XVFB=${HEADFULL-false}
- SE_VNC_NO_PASSWORD=1 - SE_VNC_NO_PASSWORD=1
volumes:
- ${CONTAINER_DATA}/dependencies:/user_data
- ${CODE:-/dev/null}:/code
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
expose: ["4444"] # exposed to other docker-compose services only expose: ["4444"] # exposed to other docker-compose services only
ports: ports:
- 7900:7900 # port for webvnc - 7900:7900 # port for webvnc
@ -59,7 +63,7 @@ services:
depends_on: # when using docker compose run news_fetch, the dependencies are started as well depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync - nas_sync
- geckodriver - chrome
- db_passthrough - db_passthrough
volumes: volumes:
@ -68,6 +72,7 @@ services:
environment: environment:
- DEBUG=${DEBUG} - DEBUG=${DEBUG}
- UNAME=${UNAME} - UNAME=${UNAME}
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile
# stdin_open: ${INTERACTIVE:-false} # docker run -i # stdin_open: ${INTERACTIVE:-false} # docker run -i
# tty: ${INTERACTIVE:-false} # docker run -t # tty: ${INTERACTIVE:-false} # docker run -t
@ -76,7 +81,7 @@ services:
news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such) news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such)
build: news_check build: news_check
image: news_check:latest image: news_check:latest
user: 1000:1000 # since the app writes files to the local filesystem, it must be run as the current user user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
depends_on: depends_on:
- db_passthrough - db_passthrough
volumes: volumes:

3
env/debug vendored
View File

@ -1,9 +1,8 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process # Runs in a debugging mode, does not launch anything at all but starts a bash process
export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
export DEBUG=true export DEBUG=true
export HEADFULL=true export HEADFULL=true
export CODE=./ export CODE=./

2
env/production vendored
View File

@ -3,5 +3,5 @@
CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0 export U_ID=1000
export DEBUG=false export DEBUG=false

16
launch
View File

@ -8,9 +8,7 @@ echo "Bash script launching COSS_ARCHIVING..."
# CHANGE ME ONCE! # CHANGE ME ONCE!
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
# CHANGE ME WHEN UPDATING FIREFOX export U_ID=1000
export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
# version must be >= than the one on the host or firefox will not start (because of mismatched config)
if [[ $1 == "debug" ]] if [[ $1 == "debug" ]]
then then
@ -18,8 +16,8 @@ then
export HEADFULL=true export HEADFULL=true
export CODE=./ export CODE=./
export ENTRYPOINT=/bin/bash export ENTRYPOINT=/bin/bash
# since service ports does not open ports on implicitly started containers, also start geckodriver: # since service ports does not open ports on implicitly started containers, also start chrome:
docker compose up -d geckodriver docker compose up -d chrome
elif [[ $1 == "production" ]] elif [[ $1 == "production" ]]
then then
export DEBUG=false export DEBUG=false
@ -32,6 +30,14 @@ elif [[ $1 == "down" ]]
then then
docker compose stop docker compose stop
exit 0 exit 0
elif [[ $1 == "init" ]]
then
export CODE=./
export HEADFULL=true
docker compose up -d chrome
sleep 5
docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh
else else
echo "Please specify the execution mode (debug/production/build) as the first argument" echo "Please specify the execution mode (debug/production/build) as the first argument"
exit 1 exit 1

View File

@ -26,5 +26,4 @@ local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/ debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Files RM/Archiving remote_storage_path: /helbing_support/Files RM/Archiving
browser_profile_path: /app/containerdata/dependencies/7hlyfqxt.Auto News browser_profile_path: /user_data/news_fetch.profile
blacklisted_href_domains: ["google.", "facebook."]

56
misc/youtube_batch.py Normal file
View File

@ -0,0 +1,56 @@
import youtube_dl
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import time
# URLs to process. Mostly YouTube videos, plus a few non-video pages
# (Thingiverse, Wikipedia) — the loop at the bottom of this script submits
# every entry to the Wayback Machine regardless of type.
urls = [
    "https://www.youtube.com/watch?v=R4h_yiDIuQE",
    "https://www.youtube.com/watch?v=-G8ZI1Jq8xA",
    "https://www.youtube.com/watch?v=8eYBcASQIQI",
    "https://www.thingiverse.com/thing:5463267",
    "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s",
    "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s",
    "https://www.youtube.com/watch?v=bQQn_vET4ys",
    "https://www.youtube.com/watch?v=6FqNctiO06E",
    "https://www.youtube.com/watch?v=ImnuJgj8XJo",
    "https://www.youtube.com/watch?v=4QZQtSqaC34",
    "https://www.youtube.com/watch?v=cW4qIjPMGkQ",
    "https://www.youtube.com/watch?v=QWsUGpKfP8A",
    "https://www.youtube.com/watch?v=a0PwEwLG9No",
    "https://www.youtube.com/watch?v=Hd3lnWVIIpo",
    "https://www.youtube.com/watch?v=JNtdAp-BdzI",
    "https://en.wikipedia.org/wiki/Viktor_Schauberger",
    "https://de.wikipedia.org/wiki/Viktor_Schauberger",
]
def post_download_hook(ret_code):
    """youtube_dl progress hook: print the output path once a download finishes."""
    if ret_code['status'] == 'finished':
        print(ret_code["filename"])
def save_video(url):
    """Download the video at *url* via youtube_dl (best quality up to 720p).

    Failures are printed rather than raised so one bad URL cannot abort a batch.
    """
    options = {
        'format': 'best[height<=720]',
        # the resulting filename is reported by post_download_hook
        'progress_hooks': [post_download_hook],
        'updatetime': False,
    }
    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            downloader.download([url])
    except Exception as e:
        print(f"Youtube download crashed: {e}")
# Local download pass — currently disabled; only the Wayback upload below runs.
# for url in urls:
#     save_video(url)

# Submit every URL to the Internet Archive's Wayback Machine and print the
# resulting snapshot URL. The 20 s pause presumably keeps us under
# archive.org's rate limit — TODO confirm the actual limit.
for url in urls:
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
    wayback = WaybackMachineSaveAPI(url, user_agent)
    archive_url = wayback.save()
    print(archive_url)
    time.sleep(20)

View File

@ -9,12 +9,15 @@
{name: 'Language', value: article_data.language}, {name: 'Language', value: article_data.language},
{name: 'Authors', value: article_data.authors}, {name: 'Authors', value: article_data.authors},
{name: "Related", value: article_data.related}, {name: "Related", value: article_data.related},
{name: "Sent", value: article_data.sent},
] ]
</script> </script>
<style> <style>
a { td {
word-break: break-all; overflow-wrap: break-word;
word-wrap: break-word;
word-break: break-word;
} }
</style> </style>
<div class="card bg-neutral-300 shadow-xl overflow-x-auto"> <div class="card bg-neutral-300 shadow-xl overflow-x-auto">
@ -31,9 +34,9 @@
{#each status_items as item} {#each status_items as item}
<tr> <tr>
<td>{ item.name }</td> <td>{ item.name }</td>
{#if item.value != ""} {#if (item.value != "" || status_items.valze == false) }
{#if item.name == "Url"} {#if item.name == "Url"}
<td class='bg-emerald-200'><a href="{ item.value }">{ item.value }</a></td> <td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td>
{:else} {:else}
<td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td> <td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td>
{/if} {/if}

View File

@ -5,15 +5,8 @@ ENV TZ Europe/Zurich
RUN apt-get update && apt-get install -y ghostscript RUN apt-get update && apt-get install -y ghostscript
# for compression of pdfs # for compression of pdfs
# RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
# id mapped to local user
# home directory needed for pip package installation
# RUN export PATH=/home/autonews/.local/bin:$PATH
RUN mkdir -p /app/auto_news RUN mkdir -p /app/auto_news
# RUN chown -R autonews:autonews /app
# USER autonews
COPY requirements.txt /app/requirements.txt COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install -r /app/requirements.txt RUN python3 -m pip install -r /app/requirements.txt

View File

@ -6,10 +6,8 @@ import base64
import requests import requests
from selenium import webdriver from selenium import webdriver
import configuration import configuration
import json
config = configuration.main_config["DOWNLOADS"] config = configuration.main_config["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader: class PDFDownloader:
@ -21,42 +19,31 @@ class PDFDownloader:
def start(self): def start(self):
self.finish() # clear up self.finish() # clear up
options = webdriver.FirefoxOptions() options = webdriver.ChromeOptions()
options.profile = config["browser_profile_path"] options.add_argument(f"user-data-dir={config['browser_profile_path']}")
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work options.add_argument('--headless')
if os.getenv("DEBUG", "false") == "true": # if os.getenv("DEBUG", "false") == "true":
self.logger.warning("Opening browser GUI because of 'DEBUG=true'") # self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
else: # else:
options.add_argument('--headless')
options.set_preference('print.save_as_pdf.links.enabled', True) # options.set_preference('print.save_as_pdf.links.enabled', True)
# Just save if the filetype is pdf already # # Just save if the filetype is pdf already
# TODO: this is not working right now # # TODO: this is not working right now
options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
options.set_preference("browser.download.folderList", 2) # options.set_preference("browser.download.folderList", 2)
# options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# options.set_preference("pdfjs.disabled", True) # # options.set_preference("pdfjs.disabled", True)
options.set_preference("browser.download.dir", config["default_download_path"]) # options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting gecko driver") self.logger.info("Starting chrome driver")
# peviously, in a single docker image:
# self.driver = webdriver.Firefox(
# options = options,
# service = webdriver.firefox.service.Service(
# log_path = f'{config["local_storage_path"]}/geckodriver.log'
# ))
self.driver = webdriver.Remote( self.driver = webdriver.Remote(
command_executor = 'http://geckodriver:4444', command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
options = options, options = options,
# can't set log path... # can't set log path...
) )
residues = os.listdir(config["default_download_path"])
for res in residues:
os.remove(os.path.join(config["default_download_path"], res))
self.running = True self.running = True
def autostart(self): def autostart(self):
@ -65,7 +52,7 @@ class PDFDownloader:
def finish(self): def finish(self):
if self.running: if self.running:
self.logger.info("Exiting gecko driver") self.logger.info("Exiting chrome driver")
try: try:
self.driver.quit() self.driver.quit()
time.sleep(10) time.sleep(10)
@ -73,7 +60,7 @@ class PDFDownloader:
self.logger.critical("Connection to the driver broke off") self.logger.critical("Connection to the driver broke off")
self.running = False self.running = False
else: else:
self.logger.info("Gecko driver not yet running") self.logger.info("Chrome driver not yet running")
def download(self, article_object): def download(self, article_object):
sleep_time = 2 sleep_time = 2
@ -153,8 +140,6 @@ class PDFDownloader:
def make_path_unique(path): def make_path_unique(path):
fname, ending = os.path.splitext(path) fname, ending = os.path.splitext(path)
fname += datetime.datetime.now().strftime("%d-%H%M%S") fname += datetime.datetime.now().strftime("%d-%H%M%S")

View File

@ -1,4 +1,3 @@
from __future__ import unicode_literals
import youtube_dl import youtube_dl
import os import os
import logging import logging

View File

@ -1,4 +1,3 @@
import time
from waybackpy import WaybackMachineSaveAPI # upload to archive.org from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)