Fixed browser profile bug, line breaks and exceptions in news_check

This commit is contained in:
Remy Moll 2022-09-26 15:25:55 +02:00
parent db161e50c8
commit 9349b046d2
12 changed files with 150 additions and 319 deletions

View File

@ -1,16 +0,0 @@
# Bootstrap a persistent chrome profile on first run (seeded from the default
# profile and augmented with the bypass-paywalls extension archive), then
# launch chrome against that profile.
PROFILE="/user_data/news_fetch.profile"

if [ ! -d "$PROFILE" ]; then
    # First run: let chrome create its default profile, then snapshot it.
    google-chrome &
    sleep 5
    cp -r /home/seluser/.config/google-chrome/Default "$PROFILE"
    PID=$(pidof chrome)
    echo "Now killing processes with pid:" $PID
    kill $PID
    # Drop the paywall-bypass extension archive into the fresh profile.
    cd "$PROFILE"
    wget https://github.com/iamadamdev/bypass-paywalls-chrome/archive/master.zip
    unzip master
else
    echo "Profile already exists, skipping creation"
fi

google-chrome --user-data-dir="$PROFILE"

View File

@ -27,21 +27,22 @@ services:
- ${CONTAINER_DATA}/files:/sync/local_files - ${CONTAINER_DATA}/files:/sync/local_files
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
command: command:
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
- lsyncd - lsyncd
- /sync/nas_sync.config - /sync/nas_sync.config
chrome: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers) geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
image: selenium/standalone-chrome:latest image: selenium/standalone-firefox:latest
shm_size: 2gb shm_size: 2gb
environment: environment:
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash) - START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
- START_XVFB=${HEADFULL-false} - START_XVFB=${HEADFULL-false}
- SE_VNC_NO_PASSWORD=1 - SE_VNC_NO_PASSWORD=1
# - SE_OPTS="--profile /user_data/news_fetch.profile.firefox"
volumes: volumes:
- ${CONTAINER_DATA}/dependencies:/user_data - ${CONTAINER_DATA}/dependencies:/firefox_profile/
- ${CODE:-/dev/null}:/code - ${CODE:-/dev/null}:/code
user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user user: ${U_ID}:${U_ID} # since the app writes files to the local filesystem, it must be run as the current user
expose: ["4444"] # exposed to other docker-compose services only expose: ["4444"] # exposed to other docker-compose services only
@ -60,10 +61,9 @@ services:
news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snaphsot (on archive.org) and writes to a db news_fetch: # Orchestration of the automatic download. It generates pdfs (via the geckodriver container), fetches descriptions, triggers a snaphsot (on archive.org) and writes to a db
build: news_fetch build: news_fetch
image: news_fetch:latest image: news_fetch:latest
depends_on: # when using docker compose run news_fetch, the dependencies are started as well depends_on: # when using docker compose run news_fetch, the dependencies are started as well
- nas_sync - nas_sync
- chrome - geckodriver
- db_passthrough - db_passthrough
volumes: volumes:

1
env/debug vendored
View File

@ -2,6 +2,7 @@
export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export U_ID=1000
export DEBUG=true export DEBUG=true
export HEADFULL=true export HEADFULL=true

View File

@ -0,0 +1,8 @@
# Ensure the firefox profile directory exists, then launch firefox with it so
# the profile can be edited interactively inside the container.
PROFILE_DIR="/firefox_profile/news_fetch.profile"

if [ ! -d "$PROFILE_DIR" ]; then
    echo "Creating empty folder for profile"
    mkdir -p "$PROFILE_DIR"/
else
    echo "Profile already exists, skipping folder creation"
fi

firefox --profile "$PROFILE_DIR"

34
launch
View File

@ -10,43 +10,61 @@ export CONTAINER_DATA=/mnt/media/@Bulk/COSS/Downloads/coss_archiving
export UNAME=remy export UNAME=remy
export U_ID=1000 export U_ID=1000
### Main use cases ###
if [[ $1 == "debug" ]] if [[ $1 == "debug" ]]
then then
export DEBUG=true export DEBUG=true
export HEADFULL=true export HEADFULL=true
export CODE=./ export CODE=./
export ENTRYPOINT=/bin/bash export ENTRYPOINT=/bin/bash
# since service ports does not open ports on implicitly started containers, also start chrome: # since service ports does not open ports on implicitly started containers, also start geckodriver:
docker compose up -d chrome docker compose up -d geckodriver
elif [[ $1 == "production" ]] elif [[ $1 == "production" ]]
then then
export DEBUG=false export DEBUG=false
elif [[ $1 == "build" ]] elif [[ $1 == "build" ]]
then then
export DEBUG=false export DEBUG=false
docker compose build shift
docker compose build "$@"
exit 0 exit 0
### Manual Shutdown ###
elif [[ $1 == "down" ]] elif [[ $1 == "down" ]]
then then
docker compose stop docker compose down -t 0
exit 0 exit 0
elif [[ $1 == "init" ]]
### Edge cases -> for firefox ###
elif [[ $1 == "edit_profile" ]]
then then
export CODE=./ export CODE=./
export HEADFULL=true export HEADFULL=true
docker compose up -d chrome docker compose up -d geckodriver
sleep 5 sleep 5
docker compose exec chrome /bin/bash /code/chrome/change_configuration.sh docker compose exec geckodriver /bin/bash /code/geckodriver/edit_profile.sh # inside the container
docker compose down -t 0
### Fallback ####
else else
echo "Please specify the execution mode (debug/production/build) as the first argument" echo "Please specify the execution mode (debug/production/build) as the first argument"
exit 1 exit 1
fi fi
shift # consumes the variable set in $1 so that $@ only contains the remaining arguments shift # consumes the variable set in $1 so that $@ only contains the remaining arguments
docker compose run -it --service-ports "$@" docker compose run -it --service-ports "$@"
echo "Docker run finished, shutting down containers..." echo "Docker run finished, shutting down containers..."
docker compose stop docker compose down -t 0
echo "Bye!" echo "Bye!"

View File

@ -26,4 +26,6 @@ local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/ debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Files RM/Archiving remote_storage_path: /helbing_support/Files RM/Archiving
browser_profile_path: /user_data/news_fetch.profile browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
# please keep this exact name
browser_print_delay: 5

View File

@ -4,24 +4,27 @@ import time
urls = [ urls = [
"https://www.youtube.com/watch?v=R4h_yiDIuQE", "https://id2020.org",
"https://www.youtube.com/watch?v=-G8ZI1Jq8xA", "https://www.weforum.org/platforms/the-centre-for-cybersecurity",
"https://www.youtube.com/watch?v=8eYBcASQIQI", "https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf",
"https://www.thingiverse.com/thing:5463267", "https://en.wikipedia.org/wiki/Social_Credit_System",
"https://www.youtube.com/watch?v=cJoUSHJcV4E&t=0s", "https://en.wikipedia.org/wiki/Customer_lifetime_value",
"https://www.youtube.com/watch?v=UbBYZZBREBA&t=0s", "https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance",
"https://www.youtube.com/watch?v=bQQn_vET4ys", "https://www.un.org/en/about-us/universal-declaration-of-human-rights",
"https://www.youtube.com/watch?v=6FqNctiO06E", "https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines",
"https://www.youtube.com/watch?v=ImnuJgj8XJo", "https://www.wired.com/2008/06/pb-theory/",
"https://www.youtube.com/watch?v=4QZQtSqaC34", "https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/",
"https://www.youtube.com/watch?v=cW4qIjPMGkQ", "https://www.bbc.com/news/world-middle-east-52579475",
"https://www.youtube.com/watch?v=QWsUGpKfP8A", "https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/",
"https://www.youtube.com/watch?v=a0PwEwLG9No", "https://www.delftdesignforvalues.nl",
"https://www.youtube.com/watch?v=Hd3lnWVIIpo", "https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
"https://www.youtube.com/watch?v=JNtdAp-BdzI", "https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17",
"https://en.wikipedia.org/wiki/Viktor_Schauberger", "https://www.youtube.com/watch?v=_KhAsJRk2lo",
"https://de.wikipedia.org/wiki/Viktor_Schauberger", "https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/",
"https://climatecitycup.org",
] ]
def post_download_hook(ret_code): def post_download_hook(ret_code):
# print(ret_code) # print(ret_code)
if ret_code['status'] == 'finished': if ret_code['status'] == 'finished':
@ -45,10 +48,12 @@ def save_video(url):
print(f"Youtube download crashed: {e}") print(f"Youtube download crashed: {e}")
# for url in urls: # for i, url in enumerate(urls):
# save_video(url) # print(f"Downloading video {i+1} / {len(urls)}")
# save_video(url)
for url in urls: for i, url in enumerate(urls):
print(f"Saving url {i+1} / {len(urls)}")
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
wayback = WaybackMachineSaveAPI(url, user_agent) wayback = WaybackMachineSaveAPI(url, user_agent)
archive_url = wayback.save() archive_url = wayback.save()

View File

@ -34,12 +34,14 @@
{#each status_items as item} {#each status_items as item}
<tr> <tr>
<td>{ item.name }</td> <td>{ item.name }</td>
{#if (item.value != "" || status_items.valze == false) } {#if (item.value != "" || status_items.value == false) }
{#if item.name == "Url"} <td class='bg-emerald-200' style="white-space: normal; width:70%">
<td class='bg-emerald-200'><a href="{ item.value }" target="_blank">{ item.value }</a></td> {#if item.name == "Url"}
{:else} <a href="{ item.value }" target="_blank">{ item.value }</a>
<td class='bg-emerald-200' style="white-space: normal; width:70%">{ item.value }</td> {:else}
{/if} { item.value }
{/if}
</td>
{:else} {:else}
<td class='bg-red-200'>not set</td> <td class='bg-red-200'>not set</td>
{/if} {/if}

View File

@ -53,11 +53,14 @@ def get_article_next(id):
@app.route("/api/article/<int:id>/set", methods=['POST']) @app.route("/api/article/<int:id>/set", methods=['POST'])
def set_article(id): def set_article(id):
try: json = request.get_json(silent=True) # do not raise 400 if there is no json!
action = request.json.get('action', None) # no json usually means a file was uploaded
except Exception as e: if json is None:
print(f"Exception in set_article {e}") print("Detected likely file upload.")
action = None action = None
else:
action = request.json.get('action', None) # action inside the json might still be empty
with db: with db:
article = models.ArticleDownload.get_by_id(id) article = models.ArticleDownload.get_by_id(id)
if action: if action:
@ -66,7 +69,7 @@ def set_article(id):
elif action == "b": elif action == "b":
article.verified = -1 article.verified = -1
else: # implicitly action == "r": else: # implicitly action == "r":
print(request.files) # request.files is an immutable dict
file = request.files.get("file", None) file = request.files.get("file", None)
if file is None: # upload tends to crash if file is None: # upload tends to crash
return "No file uploaded", 400 return "No file uploaded", 400
@ -74,7 +77,7 @@ def set_article(id):
artname, _ = os.path.splitext(article.file_name) artname, _ = os.path.splitext(article.file_name)
fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}" fname = f"{artname} -- related_{article.related.count() + 1}.{file.filename.split('.')[-1]}"
fpath = os.path.join(article.save_path, fname) fpath = os.path.join(article.save_path, fname)
print(fpath) print(f"Saving file to {fpath}")
file.save(fpath) file.save(fpath)
article.set_related([fname]) article.set_related([fname])
return {"file_path": fpath} return {"file_path": fpath}

View File

@ -64,5 +64,5 @@ else:
from utils_storage import models from utils_storage import models
# Set up the database # Set up the database connection (also creates tables if they don't exist)
models.set_db(download_db) models.set_db(download_db)

View File

@ -1,208 +0,0 @@
# Interactive verification tool for downloaded articles: renders an overview of
# each unchecked article with `rich` and mirrors the verdict to slack.
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule

console = Console()
hline = Rule(style="white")

import os
import subprocess

from slack_sdk import WebClient

import configuration

models = configuration.models

# Keyboard options offered to the reviewer for each article.
# Fix: the D/E descriptions previously read "languange" (typo shown to the user).
u_options = {
    "ENTER" : "Accept PDF as is. It gets marked as verified",
    "D" : "set language to DE and set verified",
    "E" : "set language to EN and set verified",
    "O" : "set other language (prompted)",
    "R" : "set related files (prompted multiple times)",
    "B" : "reject and move to folder BAD",
    "L" : "leave file as is, do not send reaction"
}

# Slack client used to attach reaction emojis to the article's thread.
bot_client = WebClient(
    token = configuration.main_config["SLACK"]["auth_token"]
)
def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
    """Render a table of the article's attributes plus the available key commands."""
    overview = Table(
        title = file_url,
        row_styles = ["white", "bright_black"],
        min_width = 100
    )
    overview.add_column("Attribute", justify = "right", no_wrap = True)
    overview.add_column("Value set by auto_news")
    overview.add_column("Status", justify = "right")
    for entry in file_attributes:
        overview.add_row(entry["name"], entry["value"], entry["status"])

    # Two side-by-side columns: the hotkeys on the left, their meaning on the right.
    keys = "\n".join(f"[[bold]{key}[/bold]]" for key in options.keys())
    actions = "\n".join(f"[italic]{action}[/italic]" for action in options.values())

    console.print(overview)
    console.print("Your options:")
    console.print(Columns([keys, actions]))
def send_reaction_to_slack_thread(article, reaction):
    """Sends the verification status as a reaction to the associated slack thread."""
    thread = article.slack_thread
    # Every slack message that mentioned this article's url is a candidate.
    candidates = models.Message.select().where(
        models.Message.text.contains(article.article_url)
    )
    # TODO rewrite this shit
    if len(candidates) > 5:
        print("Found more than 5 messages. Aborting reactions...")
        return

    for message in candidates:
        if message.is_processed_override:
            print("Message already processed. Aborting reactions...")
            continue
        if not message.has_single_url:
            print("Found thread but won't send reaction because thread has multiple urls")
            continue
        bot_client.reactions_add(
            channel = configuration.main_config["SLACK"]["archive_id"],
            name = reaction,
            timestamp = message.slack_ts
        )
        print("Sent reaction to message")
def prompt_language(query):
    """Keep prompting until a 2-letter language code is entered, then persist it."""
    while True:
        code = input("Set language (nation-code, 2 letters) ")
        if len(code) == 2:
            query.language = code
            query.save()
            break
        print("Bad code, try again")
def prompt_related(query):
    """Collect related file names one per prompt; entering '1' stops and saves the list."""
    related = []
    while True:
        answer = input("Additional file for article? Type '1' to cancel ")
        if answer == "1":
            break
        related.append(answer)
    query.set_related(related)
def prompt_new_fname(query):
    """Ask for a new file name, mark the article verified and remove the old file.

    Fix: the old file was removed via `query.save_path + old_fname`, which
    targets the wrong path whenever save_path lacks a trailing separator;
    os.path.join handles both cases correctly.
    """
    new_name = input("New fname? ")
    old_fname = query.file_name
    query.file_name = new_name
    query.verified = 1
    if old_fname != "":
        os.remove(os.path.join(query.save_path, old_fname))
    query.save()
def reject_article(article):
    """Mark the article as bad (verified = -1) and flag its slack thread with an 'x'."""
    article.verified = -1
    article.save()
    print("Article marked as bad")
    # mirror the rejection to slack so the thread is no longer monitored
    send_reaction_to_slack_thread(article, "x")
def unreject_article(query):
    """Flip a previously rejected article back to verified and persist it."""
    query.verified = 1
    query.save()
    # NOTE(review): the pdf itself is not moved back out of the BAD folder here
    # (a commented-out os.rename hinted at that); confirm whether that is intended.
    print("File set to verified")
def accept_article(article, last_accepted):
    """Mark the article verified and send a check-mark reaction to its slack thread.

    Returns "" (the caller stores it as the new "last accepted" marker;
    last_accepted itself is currently unused here).
    """
    article.verified = 1
    article.save()
    print("Article accepted as GOOD")
    # mirror the acceptance to slack so the thread is no longer monitored
    send_reaction_to_slack_thread(article, "white_check_mark")
    return "" # linked
def verify_unchecked():
    """Interactively review every article still marked unchecked (verified == 0).

    For each article: prints a status table, opens the pdf in evince and then
    dispatches on a single-key choice (see u_options for the key bindings).
    """
    pending = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
    last_linked = None

    for article in pending:
        console.print(hline)

        # Build the attribute rows for the overview table; red block == unset.
        core_info = []
        attributes = [article.save_path, article.file_name, article.title, article.language]
        labels = ["Save path", "File name", "Title", "Language"]
        for value, label in zip(attributes, labels):
            core_info.append({
                "status" : "[red]██[/red]" if (len(value) == 0 or value == -1) else "[green]██[/green]",
                "value" : value if len(value) != 0 else "not set",
                "name" : label,
            })

        try:
            # close any previously opened viewer windows, then open a new one;
            # stdout/stderr are piped away to suppress evince gtk warnings
            os.system("pkill evince")
            subprocess.Popen(
                ["evince", f"file://{os.path.join(article.save_path, article.file_name)}"],
                stdout = subprocess.PIPE,
                stderr = subprocess.PIPE,
            )
        except Exception as exc:
            print(exc)
            continue

        file_overview(
            file_url = article.article_url,
            file_attributes = core_info,
            options = u_options,
        )

        # Re-prompt until the user picks an action that resolves this article.
        proceed = False
        while not proceed:
            choice = input("Choice ?").lower()
            if choice == "":
                # last_linked accelerates the whole process
                last_linked = accept_article(article, last_linked)
                proceed = True
            elif choice == "d":
                article.language = "de"
                article.verified = 1
                article.save()
                proceed = True
            elif choice == "e":
                article.language = "en"
                article.verified = 1
                article.save()
                proceed = True
            elif choice == "o":
                prompt_language(article)
            elif choice == "r":
                prompt_related(article)
            elif choice == "b":
                reject_article(article)
                proceed = True
            elif choice == "l":
                # leave the file as is, do nothing
                proceed = True
            else:
                print("Invalid input")

View File

@ -1,70 +1,72 @@
import logging
import time import time
import datetime import datetime
import logging
import os import os, shutil, uuid
from pathlib import Path
import base64 import base64
import requests import requests
from selenium import webdriver from selenium import webdriver
import configuration import configuration
config = configuration.main_config["DOWNLOADS"] config = configuration.main_config["DOWNLOADS"]
def driver_running(f):
def wrapper(*args, **kwargs):
self = args[0]
if not self._running:
self.start()
return f(*args, **kwargs)
return wrapper
class PDFDownloader: class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter""" """Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# status-variable for restarting: _running = False
running = False
def start(self): def start(self):
self.finish() # clear up """Called externally to start the driver, but after an exception can also be called internally"""
if self._running:
options = webdriver.ChromeOptions() self.finish() # clear up
options.add_argument(f"user-data-dir={config['browser_profile_path']}")
options.add_argument('--headless')
# if os.getenv("DEBUG", "false") == "true": self.logger.info("Starting geckodriver")
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
# else: reduced_path = self.create_tmp_profile()
profile = webdriver.FirefoxProfile(reduced_path)
options = webdriver.FirefoxOptions()
# options.set_preference('print.save_as_pdf.links.enabled', True) if os.getenv("DEBUG", "false") == "true":
# # Just save if the filetype is pdf already self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
# # TODO: this is not working right now else:
options.add_argument('--headless')
# options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
# options.set_preference("browser.download.folderList", 2)
# # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# # options.set_preference("pdfjs.disabled", True)
# options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting chrome driver")
self.driver = webdriver.Remote( self.driver = webdriver.Remote(
command_executor = 'http://chrome:4444', # the host chrome points to the chrome container command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container
options = options, options = options,
# can't set log path... browser_profile = profile
) )
self.running = True self._running = True
def autostart(self):
if not self.running:
self.start() # relaunch the dl util
def finish(self): def finish(self):
if self.running: self.logger.info("Exiting Geckodriver")
self.logger.info("Exiting chrome driver") try:
try: self.driver.quit()
self.driver.quit() time.sleep(10)
time.sleep(10) except:
except: self.logger.critical("Connection to the driver broke off")
self.logger.critical("Connection to the driver broke off") self._running = False
self.running = False
else:
self.logger.info("Chrome driver not yet running")
@driver_running
def download(self, article_object): def download(self, article_object):
sleep_time = 2 sleep_time = int(config["browser_print_delay"])
self.autostart()
url = article_object.article_url url = article_object.article_url
try: try:
@ -89,20 +91,17 @@ class PDFDownloader:
dst = os.path.join(article_object.save_path, fname) dst = os.path.join(article_object.save_path, fname)
if url[-4:] == ".pdf": if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
# according to the browser preferences, calling the url will open pdfjs.
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
success = self.get_exisiting_pdf(url, dst) success = self.get_exisiting_pdf(url, dst)
else: else:
success = self.get_new_pdf(dst) success = self.get_new_pdf(dst)
if success: if success:
article_object.file_name = fname article_object.file_name = fname
else: else:
article_object.file_name = "" article_object.file_name = ""
return article_object # this change is saved later by the external caller return article_object # this change is saved later by the external caller
def get_exisiting_pdf(self, url, dst): def get_exisiting_pdf(self, url, dst):
@ -134,9 +133,26 @@ class PDFDownloader:
except Exception as e: except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}") self.logger.error(f"Failed, because of FS-operation: {e}")
return False return False
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
print(reduced_profile_path, full_profile_path)
os.mkdir(reduced_profile_path)
# copy needed directories
dirs = ["extensions", "storage"]
for dir in dirs:
shutil.copytree(full_profile_path / dir, reduced_profile_path / dir)
# copy needed files
files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"]
for f in files:
shutil.copy(full_profile_path / f, reduced_profile_path)
folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
self.logger.info(f"Generated temporary profile with size {folder_size} MB")
return reduced_profile_path