diff --git a/.gitignore b/.gitignore
index 39274ed..f1d4f21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,4 @@
 *.pyc
 *.log
 
-__pycache__/
\ No newline at end of file
+__pycache__/
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..ae4a7ff
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,4 @@
+{
+    "python.linting.flake8Enabled": true,
+    "python.linting.enabled": false
+}
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 2be5e50..b732a78 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,6 @@ FROM python:latest
 
 ENV TZ Europe/Zurich
 
-
 RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list
 RUN apt-get update && apt-get install -y \
     evince \
@@ -33,4 +32,4 @@ COPY app /app/auto_news
 WORKDIR /app/auto_news
 RUN python3 -m pip install -r requirements.txt
 
-ENTRYPOINT ["python3", "runner.py"]
\ No newline at end of file
+ENTRYPOINT ["python3", "runner.py"]
diff --git a/README.md b/README.md
index f7d206f..690f940 100644
--- a/README.md
+++ b/README.md
@@ -49,9 +49,15 @@ I also wrote a rudimentary docker compose file which makes running much more sim
 
 All relevant passthroughs and mounts are specified through the env-file, for which I configured 4 versions: production, debug (development in general), upload and check. These files will have to be adapted to your individual setup but can be reused more easily.
 
-> Note:
+For the debug env-file, you will likely want interactivity, so you need to run:
+
+`docker compose --env-file env/debug run auto_news`
+
+
 
 ## Building
diff --git a/app/configuration.py b/app/configuration.py
index 8e7c148..385cf44 100644
--- a/app/configuration.py
+++ b/app/configuration.py
@@ -8,7 +8,7 @@ from rich.logging import RichHandler
 logging.basicConfig(
     format='%(message)s',
     level=logging.INFO,
-    datefmt='%Y-%m-%d %H:%M:%S',
+    datefmt='%H:%M:%S', # add %Y-%m-%d if needed
     handlers=[RichHandler()]
 )
 logger = logging.getLogger(__name__)
diff --git a/app/utils_check/runner.py b/app/utils_check/runner.py
index a36b3c6..10feab3 100644
--- a/app/utils_check/runner.py
+++ b/app/utils_check/runner.py
@@ -158,10 +158,11 @@ def verify_unchecked():
 
         try:
             # close any previously opened windows:
-            subprocess.call("killall evince")
+            subprocess.call(["pkill", "evince"]) # an argument list gets no shell, so backtick substitution would not expand; pkill does the same job
             # then open a new one
             subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             # suppress evince gtk warnings
+            print("done")
         except Exception as e:
             print(e)
             continue
diff --git a/app/utils_storage/models.py b/app/utils_storage/models.py
index 6c76329..bfc1927 100644
--- a/app/utils_storage/models.py
+++ b/app/utils_storage/models.py
@@ -207,7 +207,11 @@ class Thread(ChatBaseModel):
 
     @property
     def initiator_message(self):
-        return self.messages[0] # TODO check if this needs sorting
+        try:
+            return self.messages[0] # TODO check if this needs sorting
+        except IndexError:
+            logger.warning(f"Thread {self} is empty. How can that be?")
+            return None
 
     @property
     def message_count(self):
@@ -222,6 +226,9 @@ class Thread(ChatBaseModel):
 
     @property
     def is_fully_processed(self) -> bool:
         init_message = self.initiator_message
+        if init_message is None:
+            return False
+
         if init_message.is_processed_override:
             return True # this override is set for instance, when no url was sent at all. Then set this thread to be ignored
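
Callers of `initiator_message` now have to handle a possible `None`, exactly as `is_fully_processed` does above. A minimal sketch of that calling pattern (hypothetical helper, not part of the diff; only `Message.text` is taken from the models shown here):

```python
# Hypothetical caller, illustrating the None-guard the new property implies.
def thread_summary(thread) -> str:
    init = thread.initiator_message  # may now be None for empty threads
    if init is None:
        return "<empty thread>"
    return f"thread started by: {init.text}"
```
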
diff --git a/app/utils_worker/download/browser.py b/app/utils_worker/download/browser.py
index 830f6ae..0eecd73 100644
--- a/app/utils_worker/download/browser.py
+++ b/app/utils_worker/download/browser.py
@@ -5,13 +5,13 @@ import os
 import base64
 import requests
 from selenium import webdriver
-from selenium.webdriver.firefox.options import Options
 import configuration
 import json
 
 config = configuration.parsed["DOWNLOADS"]
 blacklisted = json.loads(config["blacklisted_href_domains"])
+
 class PDFDownloader:
     """Saves a given url. Fills the object it got as a parameter"""
     logger = logging.getLogger(__name__)
@@ -19,10 +19,8 @@ class PDFDownloader:
     running = False
 
     def start(self):
-        try:
-            self.finish()
-        except:
-            self.logger.info("gecko driver not yet running")
+        self.finish() # clear up
+
         options = webdriver.FirefoxOptions()
         options.profile = config["browser_profile_path"] # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
@@ -56,13 +54,15 @@ class PDFDownloader:
 
     def autostart(self):
         if not self.running:
-            self.start() # relaunch the dl util
+            self.start()  # relaunch the dl util
 
     def finish(self):
-        self.logger.info("Exiting gecko driver")
-        self.driver.quit()
-        self.running = False
-
+        if self.running:
+            self.logger.info("Exiting gecko driver")
+            self.driver.quit()
+            self.running = False
+        else:
+            self.logger.info("Gecko driver not yet running")
 
     def download(self, article_object):
         sleep_time = 1
@@ -74,14 +74,14 @@ class PDFDownloader:
         except Exception as e:
             self.logger.critical("Selenium .get(url) failed with error {}".format(e))
             self.finish()
-            return article_object # without changes
+            return article_object  # without changes
 
         time.sleep(sleep_time) # leave the page time to do any funky business
 
         # in the mean time, get a page title if required
         if article_object.is_title_bad:
-            article_object.title = self.driver.title.replace(".pdf","")
+            article_object.title = self.driver.title.replace(".pdf", "")
             # will be propagated to dst as well
 
         fname = article_object.fname_template
@@ -105,7 +105,7 @@ class PDFDownloader:
         else:
             article_object.file_name = ""
 
-        return article_object # this change is saved later manually
+        return article_object  # this change is saved later manually
 
 
     def get_exisiting_pdf(self, url, dst):
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8cb90fe..bac1add 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -5,6 +5,7 @@ version: "3.9"
 services:
   auto_news:
     build: .
+    image: auto_news:latest
     volumes:
       - ${CONTAINER_DATA}:/app/file_storage
       - ${HOSTS_FILE}:/etc/hosts
@@ -14,13 +15,17 @@ services:
     network_mode: host
     environment:
       - DISPLAY=$DISPLAY
+      - TERM=xterm-256color # colored logs
+      - COLUMNS=160 # for wider logs
       - DEBUG=${DEBUG}
       - CHECK=${CHECK}
       - UPLOAD=${UPLOAD}
       - HEADLESS=${HEADLESS}
       - REDUCEDFETCH=${REDUCEDFETCH}
+    entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
+    stdin_open: ${INTERACTIVE:-false} # docker run -i
+    tty: ${INTERACTIVE:-false} # docker run -t
 
-    entrypoint: ${ENTRYPOINT:-"python3 runner.py"} # by default launch workers as defined in the Dockerfile
 
 # geckodriver:
 #   image: selenium/standalone-firefox:100.0
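
Since `finish()` is now a no-op when the driver is not running, startup and teardown can be paired safely. A minimal context-manager sketch around the `PDFDownloader` interface shown above (hypothetical wrapper; the import path is an assumption based on the file layout):

```python
from contextlib import contextmanager

from utils_worker.download.browser import PDFDownloader  # assumed import path

@contextmanager
def downloader_session():
    """Pair start() and finish() so the gecko driver is always cleaned up."""
    dl = PDFDownloader()
    dl.start()
    try:
        yield dl
    finally:
        dl.finish()  # idempotent now: the running flag makes a second call a no-op

# usage:
# with downloader_session() as dl:
#     article_object = dl.download(article_object)
```
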
diff --git a/env/check b/env/check
index fcd04c6..7b2b86e 100644
--- a/env/check
+++ b/env/check
@@ -1,7 +1,7 @@
 # Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-access via xauth
 
-CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container
-HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts
+CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
+HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
 
 XAUTHORTIY=$XAUTHORTIY
diff --git a/env/debug b/env/debug
index 969b7b3..041187c 100644
--- a/env/debug
+++ b/env/debug
@@ -1,7 +1,7 @@
 # Runs in a debugging mode, does not launch anything at all but starts a bash process
 
-CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container
-HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts
+CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
+HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
 
 CODE=./
 XAUTHORTIY=$XAUTHORTIY
@@ -12,4 +12,5 @@ UPLOAD=false
 HEADLESS=false
 REDUCEDFETCH=false
 
-ENTRYPOINT="sleep infinity"
\ No newline at end of file
+ENTRYPOINT="/bin/bash"
+INTERACTIVE=true
\ No newline at end of file
diff --git a/env/production b/env/production
index d4a9a24..a7d0b7a 100644
--- a/env/production
+++ b/env/production
@@ -1,7 +1,7 @@
 # Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup
 
-CONTAINER_DATA=/mnt/Data/Downloads/auto_news.container
-HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts
+CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
+HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
 
 DEBUG=false
 CHECK=false
diff --git a/env/upload b/env/upload
index 4ef11af..83da2ca 100644
--- a/env/upload
+++ b/env/upload
@@ -1,7 +1,7 @@
 # Does not run any other workers and only uploads to archive the urls that weren't previously uploaded
 
-CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container
-HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts
+CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
+HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
 
 DEBUG=false
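
All four env-files feed the same string flags (`DEBUG`, `CHECK`, `UPLOAD`, `HEADLESS`, `REDUCEDFETCH`) into the container through docker-compose. A minimal sketch of how such flags can be turned into booleans on the Python side (hypothetical helper; the project's actual parsing is not part of this diff):

```python
import os

def env_flag(name: str, default: bool = False) -> bool:
    """Interpret docker-compose style booleans such as DEBUG=false or UPLOAD=true."""
    return os.environ.get(name, str(default)).strip().lower() in ("1", "true", "yes")

DEBUG = env_flag("DEBUG")
CHECK = env_flag("CHECK")
UPLOAD = env_flag("UPLOAD")
HEADLESS = env_flag("HEADLESS")
REDUCEDFETCH = env_flag("REDUCEDFETCH")
```
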
diff --git a/misc/gather_media_files.py b/misc/gather_media_files.py
index bb7635c..1059b6d 100644
--- a/misc/gather_media_files.py
+++ b/misc/gather_media_files.py
@@ -1,5 +1,3 @@
-from cmath import log
-from concurrent.futures import thread
 import sys
 sys.path.append("../app")
 import runner
@@ -8,33 +6,47 @@ logger = logging.getLogger()
 import json
 
-
-logger.info("Overwriting production values for single use media-fetch")
+logger.info("Overwriting production values for one-time media-fetch")
 runner.configuration.models.set_db(
-    runner.configuration.SqliteDatabase("media_message_dummy.db"), # chat_db (not needed here)
-    runner.configuration.SqliteDatabase("media_downloads.db")
+    runner.configuration.SqliteDatabase("../.dev/media_message_dummy.db"), # chat_db (not needed here)
+    runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
 )
-runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "."
-
-coordinator = runner.Coordinator()
+runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "../.dev/"
 
-kwargs = {
-    "worker_download" : runner.DownloadWorker(),
-    "worker_fetch" : runner.FetchWorker(),
-    "worker_upload" : runner.UploadWorker(),
-    "worker_compress" : runner.CompressWorker(),
-}
+def fetch():
+    coordinator = runner.Coordinator()
 
-coordinator.add_workers(**kwargs)
-coordinator.start()
-with open("media_urls.json", "r") as f:
-    url_list = json.loads(f.read())
+    kwargs = {
+        "worker_download" : runner.DownloadWorker(),
+        "worker_fetch" : runner.FetchWorker(),
+        "worker_upload" : runner.UploadWorker(),
+        "worker_compress" : runner.CompressWorker(),
+    }
 
-logger.info(f"Found {len(url_list)} media urls")
-for u in url_list:
-    msg_text = f"<{u}|dummy preview text>"
-    dummy_thread = runner.models.Thread()
-    msg = runner.models.Message(text= msg_text, thread=dummy_thread)
-    coordinator.incoming_request(msg)
\ No newline at end of file
+    coordinator.add_workers(**kwargs)
+    coordinator.start()
+
+    with open("media_urls.json", "r") as f:
+        url_list = json.loads(f.read())
+
+    logger.info(f"Found {len(url_list)} media urls")
+    for u in url_list:
+        msg_text = f"<{u}|dummy preview text>"
+        dummy_thread = runner.models.Thread()
+        msg = runner.models.Message(text=msg_text, thread=dummy_thread)
+        coordinator.incoming_request(msg)
+
+
+def show():
+    sel = runner.models.ArticleDownload.select()
+    entries = ["title"] #, "article_url", "archive_url"]
+
+    for e in entries:
+        r = [getattr(t, e) for t in sel] # read the selected entry instead of hard-coding .title
+        print(r)
+
+
+show()
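
The rewritten script defines both `fetch()` and `show()` but unconditionally runs `show()`. A small `__main__` guard would make that choice explicit on the command line (hypothetical addition, assuming only the two functions above):

```python
import sys

# Hypothetical entry point: `python3 gather_media_files.py fetch` starts the
# worker pipeline, anything else falls back to the read-only show().
if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == "fetch":
        fetch()
    else:
        show()
```
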