bug fixes

This commit is contained in:
2022-10-06 15:55:30 +02:00
parent cca902b1f2
commit 5eab6c55da
12 changed files with 62 additions and 159 deletions

View File

@@ -2,10 +2,6 @@ FROM python:latest
ENV TZ Europe/Zurich
RUN apt-get update && apt-get install -y ghostscript
# for compression of pdfs
RUN mkdir -p /app/auto_news
COPY requirements.txt /app/requirements.txt

View File

@@ -11,7 +11,7 @@ from collections import OrderedDict
from utils_mail import runner as MailRunner
from utils_slack import runner as SlackRunner
from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker
class ArticleWatcher:
@@ -160,10 +160,10 @@ if __name__ == "__main__":
try:
slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
# All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
# fetch -> download -> compress -> complete
# fetch -> download (-> compress) -> complete
# This is reflected in the following list of workers:
workers_in = [
OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "CompressWorker": CompressWorker(), "NotifyRunner": "out"}),
OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
OrderedDict({"UploadWorker": UploadWorker()})
]
# The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict

View File

@@ -15,14 +15,11 @@ def send(article_model):
mail['From'] = config["sender"]
mail['To'] = config["recipient"]
msgs = article_model.mail_info # this is html
msg = [m["reply_text"] for m in msgs]
msg = "\n".join(msg)
msg, files = article_model.mail_info() # this is html
content = MIMEText(msg, "html")
mail.attach(content)
files = [m["file_path"] for m in msgs if m["file_path"]]
for path in files:
with open(path, 'rb') as file:
part = MIMEApplication(file.read(), "pdf")
@@ -31,7 +28,12 @@ def send(article_model):
mail.attach(part)
try:
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
try:
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
except ConnectionRefusedError:
logger.error("Server refused connection. Is this an error on your side?")
return False
smtp.starttls()
smtp.login(config["uname"], config["password"])
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())

View File

@@ -154,37 +154,10 @@ class BotApp(App):
def respond_channel_message(self, article, say=None):
if say is None:
say = self.say_substitute
answers = article.slack_info
if article.slack_ts == 0:
self.logger.error(f"{article} has no slack_ts")
else:
self.logger.info("Skipping slack reply because it is broken")
for a in []:
# for a in answers:
if a["file_path"]:
try:
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"{a['reply_text']}",
file = a["file_path"],
thread_ts = article.slack_ts_full
)
# status = True
except SlackApiError as e: # upload resulted in an error
say(
"File {} could not be uploaded.".format(a),
thread_ts = article.slack_ts_full
)
# status = False
self.logger.error(f"File upload failed: {e}")
else: # anticipated that there is no file!
say(
f"{a['reply_text']}",
thread_ts = article.slack_ts_full
)
# status = True
self.logger.info("Skipping slack reply.")
def startup_status(self):

View File

@@ -10,6 +10,8 @@ import datetime
from . import helpers
config = configuration.main_config["DOWNLOADS"]
slack_config = configuration.main_config["SLACK"]
FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
# set the nature of the db at runtime
download_db = DatabaseProxy()
@@ -94,50 +96,34 @@ class ArticleDownload(DownloadBaseModel):
desc = f"{self.article_url}"
return f"ART [{desc}]"
@property
def slack_info(self):
status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
content = "\n>" + "\n>".join(self.summary.split("\n"))
file_status, msg = self.file_status()
if not file_status:
return [msg]
# everything alright: generate real content
# first the base file
if self.file_name[-4:] == ".pdf":
answer = [{ # main reply with the base pdf
"reply_text" : f"*{self.title}*\n{status}\n{content}",
"file_path" : self.save_path + self.file_name
}]
else: # don't upload if the file is too big!
location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
answer = [{ # main reply with the base pdf
"reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
"file_path" : None
}]
def mail_info(self):
summary = "\n> " + "\n> ".join(self.summary.split("\n"))
answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
answer_files = []
# displays the summary in a blockquote
status = self.file_status
if status == 1: # file_name was empty
return None # there has been an error do not send any message
elif status == 2: # no file found at specified location
answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
elif status == 3: # file found but deemed too big
location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
answer_text += f"*{self.title}*\n{summary}\n{location}"
else: # everything nominal
answer_text += f"*{self.title}*\n{summary}"
answer_files.append(self.save_path + self.file_name)
# then the related files
rel_text = ""
for r in self.related:
fname = r.related_file_name
lentry = "\n• `{}` ".format(self.fname_nas(fname))
if fname[-4:] == ".pdf": # this is a manageable file, directly upload
f_ret = self.save_path + fname
answer.append({"reply_text":"", "file_path" : f_ret})
else: # not pdf <=> too large. Don't upload but mention its existence
lentry += "(not uploaded to slack, but the file will be on the NAS)"
rel_text += lentry
if rel_text:
rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
if self.related:
rel_text = "Related files on NAS:"
for r in self.related:
fname = r.related_file_name
rel_text += f"\n• `{self.fname_nas(fname)}` "
answer_text += "\n\n" + rel_text
return answer
@property
def mail_info(self):
base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
return markdown.markdown(answer_text), answer_files
def set_authors(self, authors):
@@ -157,18 +143,21 @@ class ArticleDownload(DownloadBaseModel):
article = self,
related_file_name = r
)
@property
def file_status(self):
    """Classify the state of the article's main file on disk.

    Returns:
        int: one of the following codes
            0 = the file exists, is a pdf and is within FILE_SIZE_THRESHOLD
            1 = the article has no file name
            2 = a file name is set, but no file exists at that location
            3 = the file exists but is not a pdf or is larger than FILE_SIZE_THRESHOLD
    """
    if not self.file_name:
        logger.error(f"Article {self} has no filename!")
        # Must be 1: returning 2 here made this case indistinguishable from
        # "file missing", so a `status == 1` check could never match.
        return 1

    file_path_abs = self.save_path + self.file_name
    if not os.path.exists(file_path_abs):
        logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
        return 2

    # Anything that is not a pdf is treated like an oversized file.
    if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
        logger.warning(f"Article {self} has a file that exceeds the file size limit.")
        return 3

    return 0
class ArticleAuthor(DownloadBaseModel):

View File

@@ -1,47 +0,0 @@
import os
import subprocess
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
import configuration
config = configuration.main_config["DOWNLOADS"]
shrink_sizes = []
def shrink_pdf(article):
    """Compress the article's pdf in place by running ghostscript (`gs`) on it.

    The compressed output is written to `compressed.pdf` inside
    config['default_download_path'] and then moved over the original file.
    Files whose suffix is not `.pdf` are left untouched.

    Args:
        article: object exposing `save_path` and `file_name`.

    Returns:
        The same `article` object, whether or not compression happened.
    """
    article_loc = Path(article.save_path) / article.file_name

    # Path.suffix includes the leading dot. Comparing against "pdf" never
    # matched, so every file (pdfs included) was skipped.
    if article_loc.suffix != ".pdf":
        return article  # it probably was a youtube video

    initial_size = article_loc.stat().st_size
    compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"

    try:
        c = subprocess.run(
            [
                "gs",
                "-sDEVICE=pdfwrite",
                "-dPDFSETTINGS=/screen",
                "-dNOPAUSE",
                "-dBATCH",
                f"-sOutputFile={compressed_tmp}",
                f"{article_loc}"
            ],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except OSError as e:
        # e.g. the gs binary is not installed; keep the uncompressed file
        logger.error(f"Could not run the compression! {e}")
        return article

    if c.returncode != 0:
        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
        return article

    try:
        os.replace(compressed_tmp, article_loc)
    except OSError as e:
        logger.error(f"Compression ran but I could not copy back the file {e}")
        # Nothing was replaced: do not record a shrinkage or report success.
        return article

    final_size = article_loc.stat().st_size
    shrink_sizes.append(initial_size - final_size)
    logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
    return article

View File

@@ -85,10 +85,8 @@ class PDFDownloader:
# will be propagated to the saved file (dst) as well
fname = article_object.fname_template
fname = ensure_unique(article_object.save_path, fname)
dst = os.path.join(article_object.save_path, fname)
if os.path.exists(dst):
fname = make_path_unique(fname)
dst = os.path.join(article_object.save_path, fname)
if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
@@ -137,7 +135,6 @@ class PDFDownloader:
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
print(reduced_profile_path, full_profile_path)
os.mkdir(reduced_profile_path)
# copy needed directories
dirs = ["extensions", "storage"]
@@ -150,13 +147,20 @@ class PDFDownloader:
shutil.copy(full_profile_path / f, reduced_profile_path)
folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
self.logger.info(f"Generated temporary profile with size {folder_size} MB")
self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
return reduced_profile_path
def make_path_unique(path):
    """Return `path` with the current day and time inserted before its extension.

    The stamp has the form `DD-HHMMSS` and is appended directly to the part
    of the path that precedes the extension.
    """
    stem, extension = os.path.splitext(path)
    stamp = datetime.datetime.now().strftime("%d-%H%M%S")
    return f"{stem}{stamp}{extension}"
def ensure_unique(path, fname):
    """Return a file name that does not yet exist inside the directory `path`.

    `fname` is returned unchanged when it is free. Otherwise ` -- fetch N`
    is inserted before the extension, with N counting up from 1 until an
    unused name is found.
    """
    stem, extension = os.path.splitext(fname)
    candidate = fname
    attempt = 0
    while os.path.exists(os.path.join(path, candidate)):
        attempt += 1
        candidate = f"{stem} -- fetch {attempt}{extension}"
    return candidate

View File

@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
from .download.youtube import YouTubeDownloader
from .fetch.runner import get_description
from .upload.runner import upload_to_archive as run_upload
from .compress.runner import shrink_pdf
import time
import logging
@@ -53,14 +53,3 @@ class UploadWorker(TemplateWorker):
super()._handle_article(article_watcher, action)
# article_watcher.upload_completed = True
class CompressWorker(TemplateWorker):
    """Worker whose per-article action is `shrink_pdf`."""

    def __init__(self) -> None:
        super().__init__()

    def _handle_article(self, article_watcher):
        """Forward the watcher to the parent handler with `shrink_pdf` as the action."""
        super()._handle_article(article_watcher, shrink_pdf)
        # article_watcher.compression_completed = True