bug fixes

Remy Moll 2022-10-06 15:55:30 +02:00
parent cca902b1f2
commit 5eab6c55da
12 changed files with 62 additions and 159 deletions

.gitignore vendored
View File

@@ -1,5 +1,4 @@
.dev/
.vscode/
*.pyc
*.log
__pycache__/
@@ -23,7 +22,6 @@ dist-ssr
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo

View File

@@ -34,7 +34,7 @@ Two additional 'modes' are `build` and `down`. Build rebuilds the container, whi
In essence, a command is simply a service from docker-compose, run in an interactive environment. As such, all services defined in `docker-compose.yaml` can be called as commands. Only two of them will be of real use:
`news_fetch` does the majority of the actions mentioned above. By default (that is, without any options) it runs a metadata-fetch, download, compression, and upload to archive.org. The upload is usually the slowest step, which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload`, which only starts the upload for the concerned articles, as a catch-up if you will.
`news_fetch` does the majority of the actions mentioned above. By default (that is, without any options) it runs a metadata-fetch, download, and upload to archive.org. The upload is usually the slowest step, which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload`, which only starts the upload for the concerned articles, as a catch-up if you will.
Example usage:
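For instance, a catch-up run that only performs the pending uploads might look like `docker compose run news_fetch upload`. This invocation is an assumption based on the description above, not taken verbatim from the README; the exact flags depend on the compose setup.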

View File

@@ -25,7 +25,6 @@ def fetch():
"worker_download" : runner.DownloadWorker(),
"worker_fetch" : runner.FetchWorker(),
"worker_upload" : runner.UploadWorker(),
"worker_compress" : runner.CompressWorker(),
}
coordinator.add_workers(**kwargs)
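For context, the workers are handed to the coordinator as keyword arguments. The coordinator class itself is not part of this diff, so the following is only a minimal sketch of what such an `add_workers` interface could look like; all names and behaviour here are assumptions:

```python
# Minimal sketch under assumptions, not the project's real coordinator.
# Because workers arrive as keyword arguments, removing "worker_compress"
# from the kwargs dict above drops the compression stage with no other change.
class Coordinator:
    def __init__(self):
        self.workers = {}

    def add_workers(self, **workers):
        for name, worker in workers.items():
            self.workers[name] = worker
            worker.start()  # assumption: each worker is a threading.Thread
```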

View File

@@ -28,4 +28,4 @@ default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Files RM/Archiving
browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
# please keep this exact name
browser_print_delay: 5
browser_print_delay: 3

View File

@@ -2,10 +2,6 @@ FROM python:latest
ENV TZ Europe/Zurich
RUN apt-get update && apt-get install -y ghostscript
# for compression of pdfs
RUN mkdir -p /app/auto_news
COPY requirements.txt /app/requirements.txt

View File

@@ -11,7 +11,7 @@ from collections import OrderedDict
from utils_mail import runner as MailRunner
from utils_slack import runner as SlackRunner
from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker
class ArticleWatcher:
@@ -160,10 +160,10 @@ if __name__ == "__main__":
try:
slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
# All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
# fetch -> download -> compress -> complete
# fetch -> download (-> compress) -> complete
# This is reflected in the following list of workers:
workers_in = [
OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "CompressWorker": CompressWorker(), "NotifyRunner": "out"}),
OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
OrderedDict({"UploadWorker": UploadWorker()})
]
# The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict
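To illustrate the comment above, here is a self-contained sketch of that arrangement, using stub workers rather than the project's real classes: each OrderedDict is one pipeline, both pipelines start concurrently for a given article, and the stages inside one pipeline run strictly in order.

```python
from collections import OrderedDict
from threading import Thread

# Illustrative sketch only, not the project's real coordinator.
class StubWorker:
    def __init__(self, name):
        self.name = name

    def process(self, article):
        print(f"{self.name} handled {article}")

def run_pipeline(stages, article):
    for worker in stages.values():
        worker.process(article)  # each stage blocks until it is done

pipelines = [
    OrderedDict(fetch=StubWorker("fetch"), download=StubWorker("download")),
    OrderedDict(upload=StubWorker("upload")),
]
# both pipelines start at the same time for the same article
for stages in pipelines:
    Thread(target=run_pipeline, args=(stages, "article-1")).start()
```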

View File

@@ -15,14 +15,11 @@ def send(article_model):
mail['From'] = config["sender"]
mail['To'] = config["recipient"]
msgs = article_model.mail_info # this is html
msg = [m["reply_text"] for m in msgs]
msg = "\n".join(msg)
msg, files = article_model.mail_info() # this is html
content = MIMEText(msg, "html")
mail.attach(content)
files = [m["file_path"] for m in msgs if m["file_path"]]
for path in files:
with open(path, 'rb') as file:
part = MIMEApplication(file.read(), "pdf")
@@ -30,8 +27,13 @@ def send(article_model):
part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path))
mail.attach(part)
try:
try:
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
except ConnectionRefusedError:
logger.error("Server refused connection. Is this an error on your side?")
return False
smtp.starttls()
smtp.login(config["uname"], config["password"])
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
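The new error handling above catches a refused connection before attempting STARTTLS. As a side note, the same flow can be written with `smtplib.SMTP` as a context manager, which also guarantees the connection is closed; a sketch under the same `config` keys, not the project's actual code:

```python
import smtplib

def send_raw(config, mail) -> bool:
    # Sketch of the equivalent flow: the context manager calls smtp.quit()
    # even if login or sendmail raises.
    try:
        with smtplib.SMTP(config["smtp_server"], config["port"]) as smtp:
            smtp.starttls()
            smtp.login(config["uname"], config["password"])
            smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
    except ConnectionRefusedError:
        return False
    return True
```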

View File

@@ -154,37 +154,10 @@ class BotApp(App):
def respond_channel_message(self, article, say=None):
if say is None:
say = self.say_substitute
answers = article.slack_info
if article.slack_ts == 0:
self.logger.error(f"{article} has no slack_ts")
else:
self.logger.info("Skipping slack reply because it is broken")
for a in []:
# for a in answers:
if a["file_path"]:
try:
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"{a['reply_text']}",
file = a["file_path"],
thread_ts = article.slack_ts_full
)
# status = True
except SlackApiError as e: # upload resulted in an error
say(
"File {} could not be uploaded.".format(a),
thread_ts = article.slack_ts_full
)
# status = False
self.logger.error(f"File upload failed: {e}")
else: # anticipated that there is no file!
say(
f"{a['reply_text']}",
thread_ts = article.slack_ts_full
)
# status = True
self.logger.info("Skipping slack reply.")
def startup_status(self):

View File

@@ -10,6 +10,8 @@ import datetime
from . import helpers
config = configuration.main_config["DOWNLOADS"]
slack_config = configuration.main_config["SLACK"]
FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
# set the nature of the db at runtime
download_db = DatabaseProxy()
@@ -94,50 +96,34 @@ class ArticleDownload(DownloadBaseModel):
desc = f"{self.article_url}"
return f"ART [{desc}]"
@property
def slack_info(self):
status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
content = "\n>" + "\n>".join(self.summary.split("\n"))
file_status, msg = self.file_status()
if not file_status:
return [msg]
def mail_info(self):
summary = "\n> " + "\n> ".join(self.summary.split("\n"))
answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
answer_files = []
# displays the summary in a blockquote
# everything alright: generate real content
# first the base file
if self.file_name[-4:] == ".pdf":
answer = [{ # main reply with the base pdf
"reply_text" : f"*{self.title}*\n{status}\n{content}",
"file_path" : self.save_path + self.file_name
}]
else: # don't upload if the file is too big!
location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
answer = [{ # main reply with the base pdf
"reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
"file_path" : None
}]
status = self.file_status
if status == 1: # file_name was empty
return None # there has been an error do not send any message
elif status == 2: # no file found at specified location
answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
elif status == 3: # file found but deemed too big
location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
answer_text += f"*{self.title}*\n{summary}\n{location}"
else: # everything nominal
answer_text += f"*{self.title}*\n{summary}"
answer_files.append(self.save_path + self.file_name)
# then the related files
rel_text = ""
if self.related:
rel_text = "Related files on NAS:"
for r in self.related:
fname = r.related_file_name
lentry = "\n• `{}` ".format(self.fname_nas(fname))
if fname[-4:] == ".pdf": # this is a manageable file, directly upload
f_ret = self.save_path + fname
answer.append({"reply_text":"", "file_path" : f_ret})
else: # not pdf <=> too large. Don't upload but mention its existence
lentry += "(not uploaded to slack, but the file will be on the NAS)"
rel_text += f"\n• `{self.fname_nas(fname)}` "
rel_text += lentry
answer_text += "\n\n" + rel_text
if rel_text:
rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
return answer
@property
def mail_info(self):
base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
return markdown.markdown(answer_text), answer_files
def set_authors(self, authors):
@@ -158,17 +144,20 @@ class ArticleDownload(DownloadBaseModel):
related_file_name = r
)
@property
def file_status(self):
"""0 = file exists, 1 = no file name!, 2 = file does not exit,3 = file exists but is too large"""
if not self.file_name:
logger.error(f"Article {self} has no filename!")
return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
return 1
file_path_abs = self.save_path + self.file_name
if not os.path.exists(file_path_abs):
logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
return 2
if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
logger.warning(f"Article {self} has a file that exceeds the file size limit.")
return 3
return True, {}
class ArticleAuthor(DownloadBaseModel):

View File

@@ -1,47 +0,0 @@
import os
import subprocess
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
import configuration
config = configuration.main_config["DOWNLOADS"]
shrink_sizes = []
def shrink_pdf(article):
article_loc = Path(article.save_path) / article.file_name
initial_size = article_loc.stat().st_size
compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
if article_loc.suffix != "pdf":
return article # it probably was a youtube video
c = subprocess.run(
[
"gs",
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/screen",
"-dNOPAUSE",
"-dBATCH",
f"-sOutputFile={compressed_tmp}",
f"{article_loc}"
],
stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if c.returncode == 0:
try:
os.replace(compressed_tmp, article_loc)
except OSError as e:
logger.error(f"Compression ran but I could not copy back the file {e}")
final_size = article_loc.stat().st_size
shrink_sizes.append(initial_size - final_size)
logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
else:
logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
return article

View File

@@ -85,9 +85,7 @@ class PDFDownloader:
# will be propagated to the saved file (dst) as well
fname = article_object.fname_template
dst = os.path.join(article_object.save_path, fname)
if os.path.exists(dst):
fname = make_path_unique(fname)
fname = ensure_unique(article_object.save_path, fname)
dst = os.path.join(article_object.save_path, fname)
@@ -137,7 +135,6 @@ class PDFDownloader:
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
print(reduced_profile_path, full_profile_path)
os.mkdir(reduced_profile_path)
# copy needed directories
dirs = ["extensions", "storage"]
@@ -150,13 +147,20 @@ class PDFDownloader:
shutil.copy(full_profile_path / f, reduced_profile_path)
folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
self.logger.info(f"Generated temporary profile with size {folder_size} MB")
self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
return reduced_profile_path
def make_path_unique(path):
fname, ending = os.path.splitext(path)
fname += datetime.datetime.now().strftime("%d-%H%M%S")
return fname + ending
def ensure_unique(path, fname):
fbase, ending = os.path.splitext(fname)
exists = os.path.exists(os.path.join(path, fname))
i = 1
while exists:
fname = fbase + f" -- fetch {i}" + ending
i += 1
exists = os.path.exists(os.path.join(path, fname))
return fname
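A quick demonstration of the new collision handling: where the old `make_path_unique` appended a timestamp, `ensure_unique` probes the target directory and appends an incrementing ` -- fetch N` suffix until the name is free. Assuming `ensure_unique` as defined above:

```python
import os
import tempfile

path = tempfile.mkdtemp()
print(ensure_unique(path, "report.pdf"))         # "report.pdf" (no clash yet)
open(os.path.join(path, "report.pdf"), "w").close()
print(ensure_unique(path, "report.pdf"))         # "report -- fetch 1.pdf"
```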

View File

@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf
import time
import logging
@@ -53,14 +53,3 @@ class UploadWorker(TemplateWorker):
super()._handle_article(article_watcher, action)
# article_watcher.upload_completed = True
class CompressWorker(TemplateWorker):
def __init__(self) -> None:
super().__init__()
def _handle_article(self, article_watcher):
action = shrink_pdf
super()._handle_article(article_watcher, action)
# article_watcher.compression_completed = True
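For context, the removed `CompressWorker` followed the same pattern as the remaining workers: subclass `TemplateWorker`, pick an `action`, and delegate. The base class is not part of this diff, so the following is only a hedged sketch of the shape it presumably has:

```python
import queue
import threading

# Assumed shape of TemplateWorker (not shown in this commit): a daemon thread
# draining a queue of article watchers. Subclasses override _handle_article
# to choose the action, then call super()._handle_article(watcher, action).
class TemplateWorker(threading.Thread):
    def __init__(self) -> None:
        super().__init__(daemon=True)
        self._queue: queue.Queue = queue.Queue()

    def process(self, article_watcher) -> None:
        self._queue.put(article_watcher)  # called by the coordinator

    def _handle_article(self, article_watcher, action=None) -> None:
        if action is not None:
            action(article_watcher.article)  # run this stage on the article

    def run(self) -> None:
        while True:
            # dispatches to the subclass override, which supplies the action
            self._handle_article(self._queue.get())
```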