From db161e50c8017ff01b701a38fe4597af9dc63440 Mon Sep 17 00:00:00 2001
From: Remy Moll <me@moll.re>
Date: Sun, 18 Sep 2022 19:26:55 +0200
Subject: [PATCH] Switched from geckodriver to chrome

---
 README.md                                   | 16 +++++-
 chrome/change_configuration.sh              | 16 ++++++
 docker-compose.yaml                         | 13 +++--
 env/debug                                   |  3 +-
 env/production                              |  2 +-
 launch                                      | 16 ++++--
 misc/sample_config/news_fetch.config.ini    |  3 +-
 misc/youtube_batch.py                       | 56 +++++++++++++++++++++
 news_check/client/src/ArticleStatus.svelte  | 11 ++--
 news_fetch/Dockerfile                       |  7 ---
 news_fetch/utils_worker/download/browser.py | 51 +++++++------------
 news_fetch/utils_worker/download/youtube.py |  1 -
 news_fetch/utils_worker/upload/runner.py    |  1 -
 13 files changed, 135 insertions(+), 61 deletions(-)
 create mode 100644 chrome/change_configuration.sh
 create mode 100644 misc/youtube_batch.py

diff --git a/README.md b/README.md
index 3acebb3..c33120f 100644
--- a/README.md
+++ b/README.md
@@ -76,10 +76,24 @@ docker compose --env-file env/production logs -f news_fetch # follows along with
 docker compose --env-file env/production down
 ```
 
+### First run:
+> The program relies on a functioning chrome profile!
+
+For the first run ever, run 
+
+`./launch init`
+
+This will generate a new chrome profile under `coss_archiving/dependencies/news_fetch.profile`.
+You can then go to [http://localhost:7900](http://localhost:7900) in your browser. Verify the profile (under chrome://profile-internals).
+
+Now install two addons: Idontcareaboutcookies (from chrome://extensions) and Bypass Paywalls (from https://github.com/iamadamdev/bypass-paywalls-chrome). The script has already downloaded and unpacked the latter, so just enable developer mode, click "Load unpacked", go to `/user_data/news_fetch.profile`, and select the directory `bypass-paywalls-chrome-master`.
+
+Whenever you need to make changes to the profile, for instance re-log in to websites, just rerun `./launch init`.
+
 
 ## Building
 
-> The software (firefox, selenium, python) changes frequently. For non-breaking changes it is useful to regularly re build the docker image! This is also crucial to update the code itself.
+> The software **will** change. Because the images referenced in docker compose are usually the `latest` ones, it is sufficient to update the containers.
 
 In docker compose, run 
 
diff --git a/chrome/change_configuration.sh b/chrome/change_configuration.sh
new file mode 100644
index 0000000..c4fa27b
--- /dev/null
+++ b/chrome/change_configuration.sh
@@ -0,0 +1,16 @@
+if [ -d "/user_data/news_fetch.profile" ] # a profile from an earlier run is reused as-is
+then
+    echo "Profile already exists, skipping creation"
+else
+    google-chrome & # run chrome in the background; presumably so that it creates its Default profile directory
+    sleep 5 # fixed wait before the profile directory is copied
+    cp -r /home/seluser/.config/google-chrome/Default /user_data/news_fetch.profile
+    PID=$(pidof chrome)
+    echo "Now killing processes with pid:" $PID
+    kill $PID # pidof may print several pids; unquoted, each one becomes its own argument
+    cd /user_data/news_fetch.profile
+    wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip # saved as master.zip in the profile directory
+    unzip master # unzip also tries master.zip when no file named "master" exists
+fi
+
+google-chrome --user-data-dir=/user_data/news_fetch.profile
\ No newline at end of file
diff --git a/docker-compose.yaml b/docker-compose.yaml
index e8b811e..69dbae5 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -33,13 +33,17 @@ services:
       - /sync/nas_sync.config
 
 
-  geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
-    image: ${GECKODRIVER_IMG}
+  chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
+    image: selenium/standalone-chrome:latest
     shm_size: 2gb
     environment:
       - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
       - START_XVFB=${HEADFULL-false}
       - SE_VNC_NO_PASSWORD=1
+    volumes:
+      - ${CONTAINER_DATA}/dependencies:/user_data
+      - ${CODE:-/dev/null}:/code
+    user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
     expose: ["4444"] # exposed to other docker-compose services only
     ports:
       - 7900:7900 # port for webvnc
@@ -59,7 +63,7 @@ services:
 
     depends_on: # when using docker compose run news_fetch, the dependencies are started as well
       - nas_sync
-      - geckodriver
+      - chrome
       - db_passthrough
 
     volumes:
@@ -68,6 +72,7 @@ services:
     environment:
       - DEBUG=${DEBUG}
       - UNAME=${UNAME}
+    user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
     entrypoint: ${ENTRYPOINT:-python runner.py} # by default launch workers as defined in the Dockerfile
     # stdin_open: ${INTERACTIVE:-false} # docker run -i
     # tty: ${INTERACTIVE:-false}        # docker run -t
@@ -76,7 +81,7 @@ services:
   news_check: # Creates a small webapp on http://localhost:8080 to check previously generated pdfs (some of which are unusable and must be marked as such)
     build: news_check
     image: news_check:latest
-    user: 1000:1000 # since the app writes files to the local filesystem, it must be run as the current user
+    user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
     depends_on:
       - db_passthrough
     volumes:
diff --git a/env/debug b/env/debug
index 104a910..9811c57 100644
--- a/env/debug
+++ b/env/debug
@@ -1,9 +1,8 @@
 # Runs in a debugging mode, does not launch anything at all but starts a bash process
 
-export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
+export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
 export UNAME=remy
 
-export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
 export DEBUG=true
 export HEADFULL=true
 export CODE=./
diff --git a/env/production b/env/production
index 26eee70..c7f14d5 100644
--- a/env/production
+++ b/env/production
@@ -3,5 +3,5 @@
 CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
 
 export UNAME=remy
-export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
+export U_ID=1000
 export DEBUG=false
diff --git a/launch b/launch
index 34d0c1d..728ad95 100644
--- a/launch
+++ b/launch
@@ -8,9 +8,7 @@ echo "Bash script launching COSS_ARCHIVING..."
 # CHANGE ME ONCE!
 export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
 export UNAME=remy
-# CHANGE ME WHEN UPDATING FIREFOX
-export GECKODRIVER_IMG=selenium/standalone-firefox:104.0
-# version must be >= than the one on the host or firefox will not start (because of mismatched config)
+export U_ID=1000
 
 if [[ $1 == "debug" ]]
 then
@@ -18,8 +16,8 @@ then
     export HEADFULL=true
     export CODE=./
     export ENTRYPOINT=/bin/bash
-    # since service ports does not open ports on implicitly started containers, also start geckodriver:
-    docker compose up -d geckodriver
+    # since service ports does not open ports on implicitly started containers, also start chrome:
+    docker compose up -d chrome
 elif [[ $1 == "production" ]]
 then
     export DEBUG=false
@@ -32,6 +30,14 @@ elif [[ $1 == "down" ]]
 then
     docker compose stop
     exit 0
+elif [[ $1 == "init" ]]
+then
+    export CODE=./
+    export HEADFULL=true
+
+    docker compose up -d chrome
+    sleep 5
+    docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh
 else
     echo "Please specify the execution mode (debug/production/build) as the first argument"
     exit 1
diff --git a/misc/sample_config/news_fetch.config.ini b/misc/sample_config/news_fetch.config.ini
index 76c31a7..e8de2e9 100644
--- a/misc/sample_config/news_fetch.config.ini
+++ b/misc/sample_config/news_fetch.config.ini
@@ -26,5 +26,4 @@ local_storage_path: /app/containerdata/files
 debug_storage_path: /app/containerdata/debug/
 default_download_path: /app/containerdata/tmp
 remote_storage_path: /helbing_support/Files RM/Archiving
-browser_profile_path: /app/containerdata/dependencies/7hlyfqxt.Auto News
-blacklisted_href_domains: ["google.", "facebook."]
+browser_profile_path: /user_data/news_fetch.profile
diff --git a/misc/youtube_batch.py b/misc/youtube_batch.py
new file mode 100644
index 0000000..c2304f5
--- /dev/null
+++ b/misc/youtube_batch.py
@@ -0,0 +1,56 @@
+import youtube_dl
+from waybackpy import WaybackMachineSaveAPI # upload to archive.org
+import time
+
+
+urls = [
+    "https://www.youtube.com/watch?v=R4h_yiDIuQE",
+    "https://www.youtube.com/watch?v=-G8ZI1Jq8xA",
+    "https://www.youtube.com/watch?v=8eYBcASQIQI",
+    "https://www.thingiverse.com/thing:5463267",
+    "https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s",
+    "https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s",
+    "https://www.youtube.com/watch?v=bQQn_vET4ys",
+    "https://www.youtube.com/watch?v=6FqNctiO06E",
+    "https://www.youtube.com/watch?v=ImnuJgj8XJo",
+    "https://www.youtube.com/watch?v=4QZQtSqaC34",
+    "https://www.youtube.com/watch?v=cW4qIjPMGkQ",
+    "https://www.youtube.com/watch?v=QWsUGpKfP8A",
+    "https://www.youtube.com/watch?v=a0PwEwLG9No",
+    "https://www.youtube.com/watch?v=Hd3lnWVIIpo",
+    "https://www.youtube.com/watch?v=JNtdAp-BdzI",
+    "https://en.wikipedia.org/wiki/Viktor_Schauberger",
+    "https://de.wikipedia.org/wiki/Viktor_Schauberger",
+]
+def post_download_hook(ret_code):
+    # Progress hook registered in save_video: ret_code is a dict; once its 'status' is 'finished' the 'filename' is printed.
+    if ret_code['status'] == 'finished':
+        file_loc = ret_code["filename"]
+        print(file_loc)
+
+
+def save_video(url):
+    """Download the video at `url` with youtube_dl (format 'best[height<=720]'); failures are printed, not raised."""
+    ydl_opts = {
+        'format': 'best[height<=720]',
+        # 'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
+        'progress_hooks': [post_download_hook],
+        'updatetime': False
+    }
+    try:
+        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+            # the name of the downloaded file is printed by post_download_hook
+    except Exception as e:
+        print(f"Youtube download crashed: {e}")
+
+
+# Video download is currently disabled; to re-enable it, call save_video(url) for each entry of urls.
+# Every url is submitted to the Wayback Machine (archive.org) by the loop that follows.
+
+for url in urls:
+    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
+    wayback = WaybackMachineSaveAPI(url, user_agent)
+    archive_url = wayback.save()
+    print(archive_url)
+    time.sleep(20) # pause between submissions; presumably to avoid being rate-limited by archive.org
diff --git a/news_check/client/src/ArticleStatus.svelte b/news_check/client/src/ArticleStatus.svelte
index 6c426dc..97bbd3a 100644
--- a/news_check/client/src/ArticleStatus.svelte
+++ b/news_check/client/src/ArticleStatus.svelte
@@ -9,12 +9,15 @@
         {name: 'Language', value: article_data.language},
         {name: 'Authors', value: article_data.authors},
         {name: "Related", value: article_data.related},
+        {name: "Sent", value: article_data.sent},
     ]
 </script>
 
 <style>
-  a {
-    word-break: break-all;
+  td {
+    overflow-wrap: break-word;
+    word-wrap: break-word;
+    word-break: break-word;
   }
 </style>
 <div class="card bg-neutral-300 shadow-xl overflow-x-auto">
@@ -31,9 +34,9 @@
             {#each status_items as item}
                 <tr>
                     <td>{ item.name }</td>
-                    {#if item.value != ""}
+                    {#if (item.value != "" || item.value === false) }
                       {#if item.name == "Url"}
-                        <td class='bg-emerald-200'><a href="{ item.value }">{ item.value }</a></td>
+                        <td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td>
                       {:else}
                         <td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td>
                       {/if}
diff --git a/news_fetch/Dockerfile b/news_fetch/Dockerfile
index 58298f0..c9e6da6 100644
--- a/news_fetch/Dockerfile
+++ b/news_fetch/Dockerfile
@@ -5,15 +5,8 @@ ENV TZ Europe/Zurich
 RUN apt-get update && apt-get install -y ghostscript
 # for compression of pdfs
 
-# RUN useradd --create-home --shell /bin/bash --uid 1001 autonews
-# id mapped to local user
-# home directory needed for pip package installation
-# RUN export PATH=/home/autonews/.local/bin:$PATH
-
 
 RUN mkdir -p /app/auto_news
-# RUN chown -R autonews:autonews /app
-# USER autonews
 
 COPY requirements.txt /app/requirements.txt
 RUN python3 -m pip install -r /app/requirements.txt
diff --git a/news_fetch/utils_worker/download/browser.py b/news_fetch/utils_worker/download/browser.py
index 068f16c..38f95d3 100644
--- a/news_fetch/utils_worker/download/browser.py
+++ b/news_fetch/utils_worker/download/browser.py
@@ -6,10 +6,8 @@ import base64
 import requests
 from selenium import webdriver
 import configuration
-import json
 
 config = configuration.main_config["DOWNLOADS"]
-blacklisted = json.loads(config["blacklisted_href_domains"])
 
 
 class PDFDownloader:
@@ -21,42 +19,31 @@ class PDFDownloader:
     def start(self):
         self.finish() # clear up
             
-        options = webdriver.FirefoxOptions()
-        options.profile = config["browser_profile_path"]
-        # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
+        options = webdriver.ChromeOptions()
+        options.add_argument(f"user-data-dir={config['browser_profile_path']}")
+        options.add_argument('--headless')
 
-        if os.getenv("DEBUG", "false") == "true":
-            self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
-        else:
-            options.add_argument('--headless')
+        # if os.getenv("DEBUG", "false") == "true":
+        #     self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
+        # else:
 
-        options.set_preference('print.save_as_pdf.links.enabled', True)
-        # Just save if the filetype is pdf already
-        # TODO: this is not working right now
+        # options.set_preference('print.save_as_pdf.links.enabled', True)
+        # # Just save if the filetype is pdf already
+        # # TODO: this is not working right now
 
-        options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
-        options.set_preference("browser.download.folderList", 2)
-        # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
-        # options.set_preference("pdfjs.disabled", True)
-        options.set_preference("browser.download.dir", config["default_download_path"])
+        # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
+        # options.set_preference("browser.download.folderList", 2)
+        # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
+        # # options.set_preference("pdfjs.disabled", True)
+        # options.set_preference("browser.download.dir", config["default_download_path"])
 
-        self.logger.info("Starting gecko driver")
-        # peviously, in a single docker image:
-        # self.driver = webdriver.Firefox(
-        #     options = options,
-        #     service = webdriver.firefox.service.Service(
-        #         log_path = f'{config["local_storage_path"]}/geckodriver.log'
-        # ))
+        self.logger.info("Starting chrome driver")
         self.driver = webdriver.Remote(
-            command_executor = 'http://geckodriver:4444',
+            command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
             options = options,
             # can't set log path...
         )
         
-        residues = os.listdir(config["default_download_path"])
-        for res in residues:
-            os.remove(os.path.join(config["default_download_path"], res))
-
         self.running = True
 
     def autostart(self):
@@ -65,7 +52,7 @@ class PDFDownloader:
 
     def finish(self):
         if self.running:
-            self.logger.info("Exiting gecko driver")
+            self.logger.info("Exiting chrome driver")
             try:
                 self.driver.quit()
                 time.sleep(10)
@@ -73,7 +60,7 @@ class PDFDownloader:
                 self.logger.critical("Connection to the driver broke off")
             self.running = False
         else:
-            self.logger.info("Gecko driver not yet running")
+            self.logger.info("Chrome driver not yet running")
 
     def download(self, article_object):
         sleep_time = 2
@@ -153,8 +140,6 @@ class PDFDownloader:
 
 
 
-
-
 def make_path_unique(path):
     fname, ending = os.path.splitext(path)
     fname += datetime.datetime.now().strftime("%d-%H%M%S")
diff --git a/news_fetch/utils_worker/download/youtube.py b/news_fetch/utils_worker/download/youtube.py
index 77a34ff..a16305b 100644
--- a/news_fetch/utils_worker/download/youtube.py
+++ b/news_fetch/utils_worker/download/youtube.py
@@ -1,4 +1,3 @@
-from __future__ import unicode_literals
 import youtube_dl
 import os
 import logging
diff --git a/news_fetch/utils_worker/upload/runner.py b/news_fetch/utils_worker/upload/runner.py
index f72d6f3..b02f5e0 100644
--- a/news_fetch/utils_worker/upload/runner.py
+++ b/news_fetch/utils_worker/upload/runner.py
@@ -1,4 +1,3 @@
-import time
 from waybackpy import WaybackMachineSaveAPI # upload to archive.org
 import logging
 logger = logging.getLogger(__name__)