FS updates and corrections

This commit is contained in:
Remy Moll 2022-06-15 11:14:08 +02:00
parent 54760abee4
commit 87d65fc988
14 changed files with 91 additions and 56 deletions

4
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,4 @@
{
"python.linting.flake8Enabled": true,
"python.linting.enabled": false
}

View File

@ -2,7 +2,6 @@ FROM python:latest
ENV TZ Euopre/Zurich ENV TZ Euopre/Zurich
RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \
evince \ evince \

View File

@ -49,9 +49,15 @@ I also wrote a rudimentary docker compose file which makes running much more sim
All relevant passthroughs and mounts are specified through the env-file, for which I configured 4 versions: production, debug (development in general), upload and check. These files will have to be adapted to your individual setup but can be reused more easily. All relevant passthroughs and mounts are specified through the env-file, for which I configured 4 versions: production, debug (development in general), upload and check. These files will have to be adapted to your individual setup but can be reused more easily.
> Note: For the debug env-file, you will likely want interactivity, so you need to run:
`docker compose --env-file env/debug run auto_news`
<!-- > Note:
> >
> The `debug` requires additional input. Once `docker compose up` is running, in a new session run `docker compose --env-file env/debug exec bash`. The live-mounted code is then under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. > The `debug` requires additional input. Once `docker compose up` is running, in a new session run `docker compose --env-file env/debug exec bash`. The live-mounted code is then under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`.
-->
## Building ## Building

View File

@ -8,7 +8,7 @@ from rich.logging import RichHandler
logging.basicConfig( logging.basicConfig(
format='%(message)s', format='%(message)s',
level=logging.INFO, level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S', datefmt='%H:%M:%S', # add %Y-%m-%d if needed
handlers=[RichHandler()] handlers=[RichHandler()]
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -158,10 +158,11 @@ def verify_unchecked():
try: try:
# close any previously opened windows: # close any previously opened windows:
subprocess.call("killall evince") subprocess.call(["kill", "`pgrep evince`"])
# then open a new one # then open a new one
subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# suppress evince gtk warnings # suppress evince gtk warnings
print("done")
except Exception as e: except Exception as e:
print(e) print(e)
continue continue

View File

@ -207,7 +207,11 @@ class Thread(ChatBaseModel):
@property @property
def initiator_message(self): def initiator_message(self):
try:
return self.messages[0] # TODO check if this needs sorting return self.messages[0] # TODO check if this needs sorting
except IndexError:
logger.warning(f"Thread {self} is empty. How can that be?")
return None
@property @property
def message_count(self): def message_count(self):
@ -222,6 +226,9 @@ class Thread(ChatBaseModel):
@property @property
def is_fully_processed(self) -> bool: def is_fully_processed(self) -> bool:
init_message = self.initiator_message init_message = self.initiator_message
if init_message is None:
return False
if init_message.is_processed_override: if init_message.is_processed_override:
return True return True
# this override is set for instance, when no url was sent at all. Then set this thread to be ignored # this override is set for instance, when no url was sent at all. Then set this thread to be ignored

View File

@ -5,13 +5,13 @@ import os
import base64 import base64
import requests import requests
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import configuration import configuration
import json import json
config = configuration.parsed["DOWNLOADS"] config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"]) blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader: class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter""" """Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -19,10 +19,8 @@ class PDFDownloader:
running = False running = False
def start(self): def start(self):
try: self.finish() # clear up
self.finish()
except:
self.logger.info("gecko driver not yet running")
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.profile = config["browser_profile_path"] options.profile = config["browser_profile_path"]
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
@ -59,10 +57,12 @@ class PDFDownloader:
self.start() # relaunch the dl util self.start() # relaunch the dl util
def finish(self): def finish(self):
if self.running:
self.logger.info("Exiting gecko driver") self.logger.info("Exiting gecko driver")
self.driver.quit() self.driver.quit()
self.running = False self.running = False
else:
self.logger.info("Gecko driver not yet running")
def download(self, article_object): def download(self, article_object):
sleep_time = 1 sleep_time = 1
@ -81,7 +81,7 @@ class PDFDownloader:
# in the mean time, get a page title if required # in the mean time, get a page title if required
if article_object.is_title_bad: if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf","") article_object.title = self.driver.title.replace(".pdf", "")
# will be propagated to dst as well # will be propagated to dst as well
fname = article_object.fname_template fname = article_object.fname_template

View File

@ -5,6 +5,7 @@ version: "3.9"
services: services:
auto_news: auto_news:
build: . build: .
image: auto_news:latest
volumes: volumes:
- ${CONTAINER_DATA}:/app/file_storage - ${CONTAINER_DATA}:/app/file_storage
- ${HOSTS_FILE}:/etc/hosts - ${HOSTS_FILE}:/etc/hosts
@ -14,13 +15,17 @@ services:
network_mode: host network_mode: host
environment: environment:
- DISPLAY=$DISPLAY - DISPLAY=$DISPLAY
- TERM=xterm-256color # colored logs
- COLUMNS=160 # for wider logs
- DEBUG=${DEBUG} - DEBUG=${DEBUG}
- CHECK=${CHECK} - CHECK=${CHECK}
- UPLOAD=${UPLOAD} - UPLOAD=${UPLOAD}
- HEADLESS=${HEADLESS} - HEADLESS=${HEADLESS}
- REDUCEDFETCH=${REDUCEDFETCH} - REDUCEDFETCH=${REDUCEDFETCH}
entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
stdin_open: ${INTERACTIVE:-false} # docker run -i
tty: ${INTERACTIVE:-false} # docker run -t
entrypoint: ${ENTRYPOINT:-"python3 runner.py"} # by default launch workers as defined in the Dockerfile
# geckodriver: # geckodriver:
# image: selenium/standalone-firefox:100.0 # image: selenium/standalone-firefox:100.0

4
env/check vendored
View File

@ -1,7 +1,7 @@
# Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-access via xauth # Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-access via xauth
CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
XAUTHORTIY=$XAUTHORTIY XAUTHORTIY=$XAUTHORTIY

7
env/debug vendored
View File

@ -1,7 +1,7 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process # Runs in a debugging mode, does not launch anything at all but starts a bash process
CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
CODE=./ CODE=./
XAUTHORTIY=$XAUTHORTIY XAUTHORTIY=$XAUTHORTIY
@ -12,4 +12,5 @@ UPLOAD=false
HEADLESS=false HEADLESS=false
REDUCEDFETCH=false REDUCEDFETCH=false
ENTRYPOINT="sleep infinity" ENTRYPOINT="/bin/bash"
INTERACTIVE=true

4
env/production vendored
View File

@ -1,7 +1,7 @@
# Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup # Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup
CONTAINER_DATA=/mnt/Data/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
DEBUG=false DEBUG=false
CHECK=false CHECK=false

4
env/upload vendored
View File

@ -1,7 +1,7 @@
# Does not run any other workers and only uploads to archive the urls that weren't previously uploaded # Does not run any other workers and only uploads to archive the urls that weren't previously uploaded
CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
DEBUG=false DEBUG=false

View File

@ -1,5 +1,3 @@
from cmath import log
from concurrent.futures import thread
import sys import sys
sys.path.append("../app") sys.path.append("../app")
import runner import runner
@ -8,33 +6,47 @@ logger = logging.getLogger()
import json import json
logger.info("Overwriting production values for single time media-fetch")
logger.info("Overwriting production values for single use media-fetch")
runner.configuration.models.set_db( runner.configuration.models.set_db(
runner.configuration.SqliteDatabase("media_message_dummy.db"), # chat_db (not needed here) runner.configuration.SqliteDatabase("../.dev/media_message_dummy.db"), # chat_db (not needed here)
runner.configuration.SqliteDatabase("media_downloads.db") runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
) )
runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "." runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "../.dev/"
coordinator = runner.Coordinator()
kwargs = { def fetch():
coordinator = runner.Coordinator()
kwargs = {
"worker_download" : runner.DownloadWorker(), "worker_download" : runner.DownloadWorker(),
"worker_fetch" : runner.FetchWorker(), "worker_fetch" : runner.FetchWorker(),
"worker_upload" : runner.UploadWorker(), "worker_upload" : runner.UploadWorker(),
"worker_compress" : runner.CompressWorker(), "worker_compress" : runner.CompressWorker(),
} }
coordinator.add_workers(**kwargs) coordinator.add_workers(**kwargs)
coordinator.start() coordinator.start()
with open("media_urls.json", "r") as f: with open("media_urls.json", "r") as f:
url_list = json.loads(f.read()) url_list = json.loads(f.read())
logger.info(f"Found {len(url_list)} media urls") logger.info(f"Found {len(url_list)} media urls")
for u in url_list: for u in url_list:
msg_text = f"<{u}|dummy preview text>" msg_text = f"<{u}|dummy preview text>"
dummy_thread = runner.models.Thread() dummy_thread = runner.models.Thread()
msg = runner.models.Message(text= msg_text, thread=dummy_thread) msg = runner.models.Message(text= msg_text, thread=dummy_thread)
coordinator.incoming_request(msg) coordinator.incoming_request(msg)
def show():
sel = runner.models.ArticleDownload.select()
entries = ["title"] #, "article_url", "archive_url"]
for e in entries:
r = [t.title for t in sel]
print(r)
# print([t for t in r])
show()