FS updates and corrections

This commit is contained in:
Remy Moll 2022-06-15 11:14:08 +02:00
parent 54760abee4
commit 87d65fc988
14 changed files with 91 additions and 56 deletions

4
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,4 @@
{
"python.linting.flake8Enabled": true,
"python.linting.enabled": false
}

View File

@ -2,7 +2,6 @@ FROM python:latest
ENV TZ Euopre/Zurich ENV TZ Euopre/Zurich
RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list RUN echo "deb http://deb.debian.org/debian/ unstable main contrib non-free" >> /etc/apt/sources.list
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \
evince \ evince \

View File

@ -49,9 +49,15 @@ I also wrote a rudimentary docker compose file which makes running much more sim
All relevant passthroughs and mounts are specified through the env-file, for which I configured 4 versions: production, debug (development in general), upload and check. These files will have to be adapted to your individual setup but can be reused more easily. All relevant passthroughs and mounts are specified through the env-file, for which I configured 4 versions: production, debug (development in general), upload and check. These files will have to be adapted to your individual setup but can be reused more easily.
> Note: For the debug env-file, you will likely want interactivity, so you need to run:
`docker compose --env-file env/debug run auto_news`
<!-- > Note:
> >
> The `debug` requires additional input. Once `docker compose up` is running, in a new session run `docker compose --env-file env/debug exec bash`. The live-mounted code is then under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`. > The `debug` requires additional input. Once `docker compose up` is running, in a new session run `docker compose --env-file env/debug exec bash`. The live-mounted code is then under `/code`. Note that the `DEBUG=true` environment variable is still set. If you want to test things on production, run `export DEBUG=false`.
-->
## Building ## Building

View File

@ -8,7 +8,7 @@ from rich.logging import RichHandler
logging.basicConfig( logging.basicConfig(
format='%(message)s', format='%(message)s',
level=logging.INFO, level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S', datefmt='%H:%M:%S', # add %Y-%m-%d if needed
handlers=[RichHandler()] handlers=[RichHandler()]
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -158,10 +158,11 @@ def verify_unchecked():
try: try:
# close any previously opened windows: # close any previously opened windows:
subprocess.call("killall evince") subprocess.call(["kill", "`pgrep evince`"])
# then open a new one # then open a new one
subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# suppress evince gtk warnings # suppress evince gtk warnings
print("done")
except Exception as e: except Exception as e:
print(e) print(e)
continue continue

View File

@ -207,7 +207,11 @@ class Thread(ChatBaseModel):
@property @property
def initiator_message(self): def initiator_message(self):
try:
return self.messages[0] # TODO check if this needs sorting return self.messages[0] # TODO check if this needs sorting
except IndexError:
logger.warning(f"Thread {self} is empty. How can that be?")
return None
@property @property
def message_count(self): def message_count(self):
@ -222,6 +226,9 @@ class Thread(ChatBaseModel):
@property @property
def is_fully_processed(self) -> bool: def is_fully_processed(self) -> bool:
init_message = self.initiator_message init_message = self.initiator_message
if init_message is None:
return False
if init_message.is_processed_override: if init_message.is_processed_override:
return True return True
# this override is set for instance, when no url was sent at all. Then set this thread to be ignored # this override is set for instance, when no url was sent at all. Then set this thread to be ignored

View File

@ -5,13 +5,13 @@ import os
import base64 import base64
import requests import requests
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import configuration import configuration
import json import json
config = configuration.parsed["DOWNLOADS"] config = configuration.parsed["DOWNLOADS"]
blacklisted = json.loads(config["blacklisted_href_domains"]) blacklisted = json.loads(config["blacklisted_href_domains"])
class PDFDownloader: class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter""" """Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -19,10 +19,8 @@ class PDFDownloader:
running = False running = False
def start(self): def start(self):
try: self.finish() # clear up
self.finish()
except:
self.logger.info("gecko driver not yet running")
options = webdriver.FirefoxOptions() options = webdriver.FirefoxOptions()
options.profile = config["browser_profile_path"] options.profile = config["browser_profile_path"]
# should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work
@ -59,10 +57,12 @@ class PDFDownloader:
self.start() # relaunch the dl util self.start() # relaunch the dl util
def finish(self): def finish(self):
if self.running:
self.logger.info("Exiting gecko driver") self.logger.info("Exiting gecko driver")
self.driver.quit() self.driver.quit()
self.running = False self.running = False
else:
self.logger.info("Gecko driver not yet running")
def download(self, article_object): def download(self, article_object):
sleep_time = 1 sleep_time = 1
@ -81,7 +81,7 @@ class PDFDownloader:
# in the mean time, get a page title if required # in the mean time, get a page title if required
if article_object.is_title_bad: if article_object.is_title_bad:
article_object.title = self.driver.title.replace(".pdf","") article_object.title = self.driver.title.replace(".pdf", "")
# will be propagated to dst as well # will be propagated to dst as well
fname = article_object.fname_template fname = article_object.fname_template

View File

@ -5,6 +5,7 @@ version: "3.9"
services: services:
auto_news: auto_news:
build: . build: .
image: auto_news:latest
volumes: volumes:
- ${CONTAINER_DATA}:/app/file_storage - ${CONTAINER_DATA}:/app/file_storage
- ${HOSTS_FILE}:/etc/hosts - ${HOSTS_FILE}:/etc/hosts
@ -14,13 +15,17 @@ services:
network_mode: host network_mode: host
environment: environment:
- DISPLAY=$DISPLAY - DISPLAY=$DISPLAY
- TERM=xterm-256color # colored logs
- COLUMNS=160 # for wider logs
- DEBUG=${DEBUG} - DEBUG=${DEBUG}
- CHECK=${CHECK} - CHECK=${CHECK}
- UPLOAD=${UPLOAD} - UPLOAD=${UPLOAD}
- HEADLESS=${HEADLESS} - HEADLESS=${HEADLESS}
- REDUCEDFETCH=${REDUCEDFETCH} - REDUCEDFETCH=${REDUCEDFETCH}
entrypoint: ${ENTRYPOINT:-python3 runner.py} # by default launch workers as defined in the Dockerfile
stdin_open: ${INTERACTIVE:-false} # docker run -i
tty: ${INTERACTIVE:-false} # docker run -t
entrypoint: ${ENTRYPOINT:-"python3 runner.py"} # by default launch workers as defined in the Dockerfile
# geckodriver: # geckodriver:
# image: selenium/standalone-firefox:100.0 # image: selenium/standalone-firefox:100.0

4
env/check vendored
View File

@ -1,7 +1,7 @@
# Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-access via xauth # Does not run any downloads but displays the previously downloaded but not yet checked files. Requires display-access via xauth
CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
XAUTHORTIY=$XAUTHORTIY XAUTHORTIY=$XAUTHORTIY

7
env/debug vendored
View File

@ -1,7 +1,7 @@
# Runs in a debugging mode, does not launch anything at all but starts a bash process # Runs in a debugging mode, does not launch anything at all but starts a bash process
CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
CODE=./ CODE=./
XAUTHORTIY=$XAUTHORTIY XAUTHORTIY=$XAUTHORTIY
@ -12,4 +12,5 @@ UPLOAD=false
HEADLESS=false HEADLESS=false
REDUCEDFETCH=false REDUCEDFETCH=false
ENTRYPOINT="sleep infinity" ENTRYPOINT="/bin/bash"
INTERACTIVE=true

4
env/production vendored
View File

@ -1,7 +1,7 @@
# Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup # Runs on the main slack channel with the full worker setup. If nothing funky has occurred, reducedfetch is a speedup
CONTAINER_DATA=/mnt/Data/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
DEBUG=false DEBUG=false
CHECK=false CHECK=false

4
env/upload vendored
View File

@ -1,7 +1,7 @@
# Does not run any other workers and only uploads to archive the urls that weren't previously uploaded # Does not run any other workers and only uploads to archive the urls that weren't previously uploaded
CONTAINER_DATA=/mnt/Data/COSS/Downloads/auto_news.container CONTAINER_DATA=~/Bulk/COSS/Downloads/auto_news.container
HOSTS_FILE=/mnt/Data/COSS/Downloads/auto_news.container/dependencies/hosts HOSTS_FILE=~/Bulk/COSS/Downloads/auto_news.container/dependencies/hosts
DEBUG=false DEBUG=false

View File

@ -1,5 +1,3 @@
from cmath import log
from concurrent.futures import thread
import sys import sys
sys.path.append("../app") sys.path.append("../app")
import runner import runner
@ -8,33 +6,47 @@ logger = logging.getLogger()
import json import json
logger.info("Overwriting production values for single time media-fetch")
logger.info("Overwriting production values for single use media-fetch")
runner.configuration.models.set_db( runner.configuration.models.set_db(
runner.configuration.SqliteDatabase("media_message_dummy.db"), # chat_db (not needed here) runner.configuration.SqliteDatabase("../.dev/media_message_dummy.db"), # chat_db (not needed here)
runner.configuration.SqliteDatabase("media_downloads.db") runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
) )
runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "." runner.configuration.parsed["DOWNLOADS"]["local_storage_path"] = "../.dev/"
coordinator = runner.Coordinator()
kwargs = { def fetch():
coordinator = runner.Coordinator()
kwargs = {
"worker_download" : runner.DownloadWorker(), "worker_download" : runner.DownloadWorker(),
"worker_fetch" : runner.FetchWorker(), "worker_fetch" : runner.FetchWorker(),
"worker_upload" : runner.UploadWorker(), "worker_upload" : runner.UploadWorker(),
"worker_compress" : runner.CompressWorker(), "worker_compress" : runner.CompressWorker(),
} }
coordinator.add_workers(**kwargs) coordinator.add_workers(**kwargs)
coordinator.start() coordinator.start()
with open("media_urls.json", "r") as f: with open("media_urls.json", "r") as f:
url_list = json.loads(f.read()) url_list = json.loads(f.read())
logger.info(f"Found {len(url_list)} media urls") logger.info(f"Found {len(url_list)} media urls")
for u in url_list: for u in url_list:
msg_text = f"<{u}|dummy preview text>" msg_text = f"<{u}|dummy preview text>"
dummy_thread = runner.models.Thread() dummy_thread = runner.models.Thread()
msg = runner.models.Message(text= msg_text, thread=dummy_thread) msg = runner.models.Message(text= msg_text, thread=dummy_thread)
coordinator.incoming_request(msg) coordinator.incoming_request(msg)
def show():
sel = runner.models.ArticleDownload.select()
entries = ["title"] #, "article_url", "archive_url"]
for e in entries:
r = [t.title for t in sel]
print(r)
# print([t for t in r])
show()