bug fixes
This commit is contained in:
@@ -2,10 +2,6 @@ FROM python:latest
|
||||
|
||||
ENV TZ Europe/Zurich
|
||||
|
||||
RUN apt-get update && apt-get install -y ghostscript
|
||||
# for compression of pdfs
|
||||
|
||||
|
||||
RUN mkdir -p /app/auto_news
|
||||
|
||||
COPY requirements.txt /app/requirements.txt
|
||||
|
@@ -11,7 +11,7 @@ from collections import OrderedDict
|
||||
|
||||
from utils_mail import runner as MailRunner
|
||||
from utils_slack import runner as SlackRunner
|
||||
from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
|
||||
from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker
|
||||
|
||||
|
||||
class ArticleWatcher:
|
||||
@@ -160,10 +160,10 @@ if __name__ == "__main__":
|
||||
try:
|
||||
slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
|
||||
# All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
|
||||
# fetch -> download -> compress -> complete
|
||||
# fetch -> download (-> compress) -> complete
|
||||
# This is reflected in the following list of workers:
|
||||
workers_in = [
|
||||
OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "CompressWorker": CompressWorker(), "NotifyRunner": "out"}),
|
||||
OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
|
||||
OrderedDict({"UploadWorker": UploadWorker()})
|
||||
]
|
||||
# The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict
|
||||
|
@@ -15,14 +15,11 @@ def send(article_model):
|
||||
mail['From'] = config["sender"]
|
||||
mail['To'] = config["recipient"]
|
||||
|
||||
msgs = article_model.mail_info # this is html
|
||||
msg = [m["reply_text"] for m in msgs]
|
||||
msg = "\n".join(msg)
|
||||
msg, files = article_model.mail_info() # this is html
|
||||
|
||||
content = MIMEText(msg, "html")
|
||||
mail.attach(content)
|
||||
|
||||
files = [m["file_path"] for m in msgs if m["file_path"]]
|
||||
for path in files:
|
||||
with open(path, 'rb') as file:
|
||||
part = MIMEApplication(file.read(), "pdf")
|
||||
@@ -31,7 +28,12 @@ def send(article_model):
|
||||
mail.attach(part)
|
||||
|
||||
try:
|
||||
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
|
||||
try:
|
||||
smtp = smtplib.SMTP(config["smtp_server"], config["port"])
|
||||
except ConnectionRefusedError:
|
||||
logger.error("Server refused connection. Is this an error on your side?")
|
||||
return False
|
||||
|
||||
smtp.starttls()
|
||||
smtp.login(config["uname"], config["password"])
|
||||
smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
|
||||
|
@@ -154,37 +154,10 @@ class BotApp(App):
|
||||
|
||||
|
||||
def respond_channel_message(self, article, say=None):
|
||||
if say is None:
|
||||
say = self.say_substitute
|
||||
answers = article.slack_info
|
||||
if article.slack_ts == 0:
|
||||
self.logger.error(f"{article} has no slack_ts")
|
||||
else:
|
||||
self.logger.info("Skipping slack reply because it is broken")
|
||||
for a in []:
|
||||
# for a in answers:
|
||||
if a["file_path"]:
|
||||
try:
|
||||
self.client.files_upload(
|
||||
channels = config["archive_id"],
|
||||
initial_comment = f"{a['reply_text']}",
|
||||
file = a["file_path"],
|
||||
thread_ts = article.slack_ts_full
|
||||
)
|
||||
# status = True
|
||||
except SlackApiError as e: # upload resulted in an error
|
||||
say(
|
||||
"File {} could not be uploaded.".format(a),
|
||||
thread_ts = article.slack_ts_full
|
||||
)
|
||||
# status = False
|
||||
self.logger.error(f"File upload failed: {e}")
|
||||
else: # anticipated that there is no file!
|
||||
say(
|
||||
f"{a['reply_text']}",
|
||||
thread_ts = article.slack_ts_full
|
||||
)
|
||||
# status = True
|
||||
self.logger.info("Skipping slack reply.")
|
||||
|
||||
|
||||
def startup_status(self):
|
||||
|
@@ -10,6 +10,8 @@ import datetime
|
||||
from . import helpers
|
||||
config = configuration.main_config["DOWNLOADS"]
|
||||
slack_config = configuration.main_config["SLACK"]
|
||||
FILE_SIZE_THRESHOLD = 15 * 1024 * 1024 # 15MB
|
||||
|
||||
|
||||
# set the nature of the db at runtime
|
||||
download_db = DatabaseProxy()
|
||||
@@ -94,50 +96,34 @@ class ArticleDownload(DownloadBaseModel):
|
||||
desc = f"{self.article_url}"
|
||||
return f"ART [{desc}]"
|
||||
|
||||
@property
|
||||
def slack_info(self):
|
||||
status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
|
||||
content = "\n>" + "\n>".join(self.summary.split("\n"))
|
||||
file_status, msg = self.file_status()
|
||||
if not file_status:
|
||||
return [msg]
|
||||
|
||||
# everything alright: generate real content
|
||||
# first the base file
|
||||
if self.file_name[-4:] == ".pdf":
|
||||
answer = [{ # main reply with the base pdf
|
||||
"reply_text" : f"*{self.title}*\n{status}\n{content}",
|
||||
"file_path" : self.save_path + self.file_name
|
||||
}]
|
||||
else: # don't upload if the file is too big!
|
||||
location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
|
||||
answer = [{ # main reply with the base pdf
|
||||
"reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
|
||||
"file_path" : None
|
||||
}]
|
||||
def mail_info(self):
|
||||
summary = "\n> " + "\n> ".join(self.summary.split("\n"))
|
||||
answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
|
||||
answer_files = []
|
||||
# displays the summary in a blockquote
|
||||
|
||||
status = self.file_status
|
||||
if status == 1: # file_name was empty
|
||||
return None # there has been an error do not send any message
|
||||
elif status == 2: # no file found at specified location
|
||||
answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
|
||||
elif status == 3: # file found but deemed too big
|
||||
location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
|
||||
answer_text += f"*{self.title}*\n{summary}\n{location}"
|
||||
else: # everything nominal
|
||||
answer_text += f"*{self.title}*\n{summary}"
|
||||
answer_files.append(self.save_path + self.file_name)
|
||||
|
||||
# then the related files
|
||||
rel_text = ""
|
||||
for r in self.related:
|
||||
fname = r.related_file_name
|
||||
lentry = "\n• `{}` ".format(self.fname_nas(fname))
|
||||
if fname[-4:] == ".pdf": # this is a manageable file, directly upload
|
||||
f_ret = self.save_path + fname
|
||||
answer.append({"reply_text":"", "file_path" : f_ret})
|
||||
else: # not pdf <=> too large. Don't upload but mention its existence
|
||||
lentry += "(not uploaded to slack, but the file will be on the NAS)"
|
||||
|
||||
rel_text += lentry
|
||||
|
||||
if rel_text:
|
||||
rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
|
||||
if self.related:
|
||||
rel_text = "Related files on NAS:"
|
||||
for r in self.related:
|
||||
fname = r.related_file_name
|
||||
rel_text += f"\n• `{self.fname_nas(fname)}` "
|
||||
|
||||
answer_text += "\n\n" + rel_text
|
||||
|
||||
return answer
|
||||
|
||||
@property
|
||||
def mail_info(self):
|
||||
base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
|
||||
return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
|
||||
return markdown.markdown(answer_text), answer_files
|
||||
|
||||
|
||||
def set_authors(self, authors):
|
||||
@@ -157,18 +143,21 @@ class ArticleDownload(DownloadBaseModel):
|
||||
article = self,
|
||||
related_file_name = r
|
||||
)
|
||||
|
||||
|
||||
@property
|
||||
def file_status(self):
|
||||
"""0 = file exists, 1 = no file name!, 2 = file does not exit,3 = file exists but is too large"""
|
||||
if not self.file_name:
|
||||
logger.error(f"Article {self} has no filename!")
|
||||
return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
|
||||
|
||||
return 2
|
||||
file_path_abs = self.save_path + self.file_name
|
||||
if not os.path.exists(file_path_abs):
|
||||
logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
|
||||
return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
|
||||
return 2
|
||||
if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
|
||||
logger.warning(f"Article {self} has a file that exceeds the file size limit.")
|
||||
return 3
|
||||
|
||||
return True, {}
|
||||
|
||||
|
||||
class ArticleAuthor(DownloadBaseModel):
|
||||
|
@@ -1,47 +0,0 @@
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import configuration
|
||||
config = configuration.main_config["DOWNLOADS"]
|
||||
|
||||
shrink_sizes = []
|
||||
|
||||
def shrink_pdf(article):
|
||||
article_loc = Path(article.save_path) / article.file_name
|
||||
initial_size = article_loc.stat().st_size
|
||||
compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
|
||||
|
||||
if article_loc.suffix != "pdf":
|
||||
return article # it probably was a youtube video
|
||||
|
||||
c = subprocess.run(
|
||||
[
|
||||
"gs",
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dPDFSETTINGS=/screen",
|
||||
"-dNOPAUSE",
|
||||
"-dBATCH",
|
||||
f"-sOutputFile={compressed_tmp}",
|
||||
f"{article_loc}"
|
||||
],
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
|
||||
if c.returncode == 0:
|
||||
try:
|
||||
os.replace(compressed_tmp, article_loc)
|
||||
except OSError as e:
|
||||
logger.error(f"Compression ran but I could not copy back the file {e}")
|
||||
|
||||
final_size = article_loc.stat().st_size
|
||||
shrink_sizes.append(initial_size - final_size)
|
||||
logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
|
||||
|
||||
|
||||
else:
|
||||
logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
|
||||
|
||||
return article
|
@@ -85,10 +85,8 @@ class PDFDownloader:
|
||||
# will be propagated to the saved file (dst) as well
|
||||
|
||||
fname = article_object.fname_template
|
||||
fname = ensure_unique(article_object.save_path, fname)
|
||||
dst = os.path.join(article_object.save_path, fname)
|
||||
if os.path.exists(dst):
|
||||
fname = make_path_unique(fname)
|
||||
dst = os.path.join(article_object.save_path, fname)
|
||||
|
||||
|
||||
if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
|
||||
@@ -137,7 +135,6 @@ class PDFDownloader:
|
||||
|
||||
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
|
||||
reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
|
||||
print(reduced_profile_path, full_profile_path)
|
||||
os.mkdir(reduced_profile_path)
|
||||
# copy needed directories
|
||||
dirs = ["extensions", "storage"]
|
||||
@@ -150,13 +147,20 @@ class PDFDownloader:
|
||||
shutil.copy(full_profile_path / f, reduced_profile_path)
|
||||
|
||||
folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
|
||||
self.logger.info(f"Generated temporary profile with size {folder_size} MB")
|
||||
self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
|
||||
return reduced_profile_path
|
||||
|
||||
|
||||
|
||||
|
||||
def make_path_unique(path):
|
||||
fname, ending = os.path.splitext(path)
|
||||
fname += datetime.datetime.now().strftime("%d-%H%M%S")
|
||||
return fname + ending
|
||||
def ensure_unique(path, fname):
|
||||
fbase, ending = os.path.splitext(fname)
|
||||
|
||||
exists = os.path.exists(os.path.join(path, fname))
|
||||
i = 1
|
||||
while exists:
|
||||
fname = fbase + f" -- fetch {i}" + ending
|
||||
i += 1
|
||||
exists = os.path.exists(os.path.join(path, fname))
|
||||
|
||||
return fname
|
||||
|
@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
|
||||
from .download.youtube import YouTubeDownloader
|
||||
from .fetch.runner import get_description
|
||||
from .upload.runner import upload_to_archive as run_upload
|
||||
from .compress.runner import shrink_pdf
|
||||
|
||||
|
||||
import time
|
||||
import logging
|
||||
@@ -53,14 +53,3 @@ class UploadWorker(TemplateWorker):
|
||||
|
||||
super()._handle_article(article_watcher, action)
|
||||
# article_watcher.upload_completed = True
|
||||
|
||||
|
||||
|
||||
class CompressWorker(TemplateWorker):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def _handle_article(self, article_watcher):
|
||||
action = shrink_pdf
|
||||
super()._handle_article(article_watcher, action)
|
||||
# article_watcher.compression_completed = True
|
Reference in New Issue
Block a user