bug fixes

commit 5eab6c55da (parent cca902b1f2)

.gitignore (vendored): 2 changes
@@ -1,5 +1,4 @@
 .dev/
-.vscode/
 *.pyc
 *.log
 __pycache__/
@@ -23,7 +22,6 @@ dist-ssr
-
 # Editor directories and files
 .vscode/*
 !.vscode/extensions.json
 .idea
 .DS_Store
 *.suo
@@ -34,7 +34,7 @@ Two additional 'modes' are `build` and `down`. Build rebuilds the container, whi

 In essence a command is simply a service from docker-compose, which is run in an interactive environment. As such all services defined in `docker-compose.yaml` can be called as commands. Only two of them will be of real use:

-`news_fetch` does the majority of the actions mentioned above. By default, that is without any options, it runs a metadata-fetch, download, compression, and upload to archive.org. The upload is usually the slowest which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload` which only starts the upload for the concerned articles, as a catch-up if you will.
+`news_fetch` does the majority of the actions mentioned above. By default, that is without any options, it runs a metadata-fetch, download, and upload to archive.org. The upload is usually the slowest which is why articles that are processed but don't yet have an archive.org url tend to pile up. You can therefore specify the option `upload` which only starts the upload for the concerned articles, as a catch-up if you will.

 Example usage:
@@ -25,7 +25,6 @@ def fetch():
         "worker_download" : runner.DownloadWorker(),
         "worker_fetch" : runner.FetchWorker(),
         "worker_upload" : runner.UploadWorker(),
-        "worker_compress" : runner.CompressWorker(),
     }

     coordinator.add_workers(**kwargs)
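
Since the workers are handed over as keyword arguments, dropping the `worker_compress` entry is all it takes to retire the compression stage. A minimal sketch of the receiving side, with a hypothetical `Coordinator` standing in for the project's real class:

    class Coordinator:
        """Hypothetical stand-in: attaches whatever workers it is handed."""
        def __init__(self):
            self.workers = {}

        def add_workers(self, **kwargs):
            # e.g. worker_fetch=FetchWorker(), worker_upload=UploadWorker()
            self.workers.update(kwargs)

        def start_all(self):
            for name, worker in self.workers.items():
                print(f"starting {name}: {worker!r}")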
@@ -28,4 +28,4 @@ default_download_path: /app/containerdata/tmp
 remote_storage_path: /helbing_support/Files RM/Archiving
 browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
 # please keep this exact name
-browser_print_delay: 5
+browser_print_delay: 3
@@ -2,10 +2,6 @@ FROM python:latest

 ENV TZ Europe/Zurich

-RUN apt-get update && apt-get install -y ghostscript
-# for compression of pdfs
-
-
 RUN mkdir -p /app/auto_news

 COPY requirements.txt /app/requirements.txt
@@ -11,7 +11,7 @@ from collections import OrderedDict

 from utils_mail import runner as MailRunner
 from utils_slack import runner as SlackRunner
-from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker
+from utils_worker.workers import DownloadWorker, FetchWorker, UploadWorker


 class ArticleWatcher:
@@ -160,10 +160,10 @@ if __name__ == "__main__":
     try:
         slack_runner = SlackRunner.BotRunner(dispatcher.incoming_request)
         # All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
-        # fetch -> download -> compress -> complete
+        # fetch -> download (-> compress) -> complete
         # This is reflected in the following list of workers:
         workers_in = [
-            OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "CompressWorker": CompressWorker(), "NotifyRunner": "out"}),
+            OrderedDict({"FetchWorker": FetchWorker(), "DownloadWorker": DownloadWorker(), "NotifyRunner": "out"}),
             OrderedDict({"UploadWorker": UploadWorker()})
         ]
         # The two dicts are processed independently. First element of first dict is called at the same time as the first element of the second dict
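
The comment above is the heart of the scheduling model. Below is a self-contained toy sketch of that scheme (the worker classes here are dummies, not the project's): each OrderedDict is one pipeline, order is strict inside a pipeline, and the pipelines run side by side.

    from collections import OrderedDict
    from threading import Thread

    class FetchWorker:      # toy stand-in
        def process(self, article): article["done"].append("fetch")

    class DownloadWorker:   # toy stand-in
        def process(self, article): article["done"].append("download")

    class UploadWorker:     # toy stand-in
        def process(self, article): article["done"].append("upload")

    pipelines = [
        OrderedDict(fetch=FetchWorker(), download=DownloadWorker()),
        OrderedDict(upload=UploadWorker()),
    ]

    def run_pipeline(pipeline, article):
        for worker in pipeline.values():  # insertion order == processing order
            worker.process(article)

    article = {"url": "https://example.com", "done": []}
    threads = [Thread(target=run_pipeline, args=(p, article)) for p in pipelines]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(article["done"])  # fetch always precedes download; upload interleaves freely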
@@ -15,14 +15,11 @@ def send(article_model):
     mail['From'] = config["sender"]
     mail['To'] = config["recipient"]

-    msgs = article_model.mail_info # this is html
-    msg = [m["reply_text"] for m in msgs]
-    msg = "\n".join(msg)
+    msg, files = article_model.mail_info() # this is html

     content = MIMEText(msg, "html")
     mail.attach(content)

-    files = [m["file_path"] for m in msgs if m["file_path"]]
     for path in files:
         with open(path, 'rb') as file:
             part = MIMEApplication(file.read(), "pdf")
@@ -30,8 +27,13 @@ def send(article_model):
             part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path))
             mail.attach(part)

     try:
+        try:
             smtp = smtplib.SMTP(config["smtp_server"], config["port"])
+        except ConnectionRefusedError:
+            logger.error("Server refused connection. Is this an error on your side?")
+            return False
+
         smtp.starttls()
         smtp.login(config["uname"], config["password"])
         smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
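
With the new contract, `mail_info()` hands back a ready-made HTML string plus a list of file paths, and `send()` only assembles and ships the message. A sketch of that assembly in isolation (the `build_mail` helper and the subject line are illustrative, not part of the codebase):

    import os
    from email.mime.application import MIMEApplication
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    def build_mail(sender, recipient, html, files):
        mail = MIMEMultipart()
        mail["Subject"] = "New article"  # hypothetical; not shown in the diff
        mail["From"] = sender
        mail["To"] = recipient
        mail.attach(MIMEText(html, "html"))  # the html returned by mail_info()
        for path in files:                   # the files returned by mail_info()
            with open(path, "rb") as f:
                part = MIMEApplication(f.read(), "pdf")
            part.add_header("Content-Disposition", "attachment",
                            filename=os.path.basename(path))
            mail.attach(part)
        return mail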
@@ -154,37 +154,10 @@ class BotApp(App):

     def respond_channel_message(self, article, say=None):
         if say is None:
             say = self.say_substitute
         answers = article.slack_info
-        if article.slack_ts == 0:
-            self.logger.error(f"{article} has no slack_ts")
-        else:
-            self.logger.info("Skipping slack reply because it is broken")
-            for a in []:
-            # for a in answers:
-                if a["file_path"]:
-                    try:
-                        self.client.files_upload(
-                            channels = config["archive_id"],
-                            initial_comment = f"{a['reply_text']}",
-                            file = a["file_path"],
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = True
-                    except SlackApiError as e: # upload resulted in an error
-                        say(
-                            "File {} could not be uploaded.".format(a),
-                            thread_ts = article.slack_ts_full
-                        )
-                        # status = False
-                        self.logger.error(f"File upload failed: {e}")
-                else: # anticipated that there is no file!
-                    say(
-                        f"{a['reply_text']}",
-                        thread_ts = article.slack_ts_full
-                    )
-                    # status = True
         self.logger.info("Skipping slack reply.")


     def startup_status(self):
@@ -10,6 +10,8 @@ import datetime
 from . import helpers
 config = configuration.main_config["DOWNLOADS"]
+slack_config = configuration.main_config["SLACK"]
+FILE_SIZE_THRESHOLD = 15 * 1024 * 1024  # 15MB


 # set the nature of the db at runtime
 download_db = DatabaseProxy()
@@ -94,50 +96,34 @@ class ArticleDownload(DownloadBaseModel):
             desc = f"{self.article_url}"
         return f"ART [{desc}]"

-    @property
-    def slack_info(self):
-        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
-        content = "\n>" + "\n>".join(self.summary.split("\n"))
-        file_status, msg = self.file_status()
-        if not file_status:
-            return [msg]
+    def mail_info(self):
+        summary = "\n> " + "\n> ".join(self.summary.split("\n"))
+        answer_text = f"[{self.article_url}]({self.article_url})\n\n" # first the url
+        answer_files = []
+        # displays the summary in a blockquote

-        # everything alright: generate real content
-        # first the base file
-        if self.file_name[-4:] == ".pdf":
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}",
-                "file_path" : self.save_path + self.file_name
-            }]
-        else: # don't upload if the file is too big!
-            location = f"Not uploaded to slack, but the file will be on the NAS:\n`{self.fname_nas}`"
-            answer = [{ # main reply with the base pdf
-                "reply_text" : f"*{self.title}*\n{status}\n{content}\n{location}",
-                "file_path" : None
-            }]
+        status = self.file_status
+        if status == 1: # file_name was empty
+            return None # there has been an error, do not send any message
+        elif status == 2: # no file found at specified location
+            answer_text += f"*{self.title}*\n{summary}\nFilename: {self.file_name}"
+        elif status == 3: # file found but deemed too big
+            location = f"File not sent directly. Location on NAS:\n`{self.fname_nas}`"
+            answer_text += f"*{self.title}*\n{summary}\n{location}"
+        else: # everything nominal
+            answer_text += f"*{self.title}*\n{summary}"
+            answer_files.append(self.save_path + self.file_name)

         # then the related files
         rel_text = ""
         if self.related:
+            rel_text = "Related files on NAS:"
             for r in self.related:
                 fname = r.related_file_name
-                lentry = "\n• `{}` ".format(self.fname_nas(fname))
-                if fname[-4:] == ".pdf": # this is a manageable file, directly upload
-                    f_ret = self.save_path + fname
-                    answer.append({"reply_text":"", "file_path" : f_ret})
-                else: # not pdf <=> too large. Don't upload but mention its existence
-                    lentry += "(not uploaded to slack, but the file will be on the NAS)"
-                rel_text += lentry
+                rel_text += f"\n• `{self.fname_nas(fname)}` "
+            answer_text += "\n\n" + rel_text

-        if rel_text:
-            rel_text = answer[0]["reply_text"] = answer[0]["reply_text"] + "\nRelated files:\n" + rel_text
-
-        return answer
-
-    @property
-    def mail_info(self):
-        base = [{"reply_text": f"[{self.article_url}]({self.article_url})\n", "file_path":None}] + self.slack_info
-        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]
+        return markdown.markdown(answer_text), answer_files


     def set_authors(self, authors):
@@ -158,17 +144,20 @@ class ArticleDownload(DownloadBaseModel):
                 related_file_name = r
             )

+    @property
     def file_status(self):
+        """0 = file exists, 1 = no file name!, 2 = file does not exist, 3 = file exists but is too large"""
         if not self.file_name:
             logger.error(f"Article {self} has no filename!")
-            return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
+            return 2

         file_path_abs = self.save_path + self.file_name
         if not os.path.exists(file_path_abs):
             logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
-            return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
+            return 2
         if (os.path.splitext(file_path_abs)[1] != ".pdf") or (os.path.getsize(file_path_abs) > FILE_SIZE_THRESHOLD):
             logger.warning(f"Article {self} has a file that exceeds the file size limit.")
+            return 3

         return True, {}


 class ArticleAuthor(DownloadBaseModel):
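
For reference, here is the documented status contract as a standalone function, following the docstring (note that the code above returns 2 in the no-file-name branch, so docstring and code disagree on status 1; this sketch sides with the docstring):

    import os

    FILE_SIZE_THRESHOLD = 15 * 1024 * 1024  # 15 MB, as in the models module

    def file_status(save_path: str, file_name: str) -> int:
        """0 = file exists, 1 = no file name, 2 = file missing, 3 = not a pdf or too large."""
        if not file_name:
            return 1
        path = os.path.join(save_path, file_name)
        if not os.path.exists(path):
            return 2
        if os.path.splitext(path)[1] != ".pdf" or os.path.getsize(path) > FILE_SIZE_THRESHOLD:
            return 3
        return 0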
@@ -1,47 +0,0 @@
-import os
-import subprocess
-from pathlib import Path
-
-import logging
-logger = logging.getLogger(__name__)
-import configuration
-config = configuration.main_config["DOWNLOADS"]
-
-shrink_sizes = []
-
-def shrink_pdf(article):
-    article_loc = Path(article.save_path) / article.file_name
-    initial_size = article_loc.stat().st_size
-    compressed_tmp = Path(config['default_download_path']) / "compressed.pdf"
-
-    if article_loc.suffix != "pdf":
-        return article # it probably was a youtube video
-
-    c = subprocess.run(
-        [
-            "gs",
-            "-sDEVICE=pdfwrite",
-            "-dPDFSETTINGS=/screen",
-            "-dNOPAUSE",
-            "-dBATCH",
-            f"-sOutputFile={compressed_tmp}",
-            f"{article_loc}"
-        ],
-        stdout=subprocess.PIPE, stderr=subprocess.PIPE
-    )
-
-    if c.returncode == 0:
-        try:
-            os.replace(compressed_tmp, article_loc)
-        except OSError as e:
-            logger.error(f"Compression ran but I could not copy back the file {e}")
-
-        final_size = article_loc.stat().st_size
-        shrink_sizes.append(initial_size - final_size)
-        logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB")
-
-    else:
-        logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}")
-
-    return article
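
One detail worth recording about the deleted module: `Path.suffix` keeps the leading dot, so the guard `article_loc.suffix != "pdf"` was always true and `shrink_pdf` returned before ever invoking ghostscript. A correct check would have been:

    from pathlib import Path

    def is_pdf(path: Path) -> bool:
        # Path.suffix includes the dot: Path("a.pdf").suffix == ".pdf"
        return path.suffix == ".pdf"

    assert is_pdf(Path("article.pdf"))
    assert not is_pdf(Path("video.mp4"))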
@@ -85,9 +85,7 @@ class PDFDownloader:
         # will be propagated to the saved file (dst) as well

         fname = article_object.fname_template
         dst = os.path.join(article_object.save_path, fname)
         if os.path.exists(dst):
-            fname = make_path_unique(fname)
+            fname = ensure_unique(article_object.save_path, fname)
             dst = os.path.join(article_object.save_path, fname)
@@ -137,7 +135,6 @@ class PDFDownloader:

     def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
         reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
-        print(reduced_profile_path, full_profile_path)
         os.mkdir(reduced_profile_path)
         # copy needed directories
         dirs = ["extensions", "storage"]
@@ -150,13 +147,20 @@ class PDFDownloader:
             shutil.copy(full_profile_path / f, reduced_profile_path)

         folder_size = round(sum(p.stat().st_size for p in Path(reduced_profile_path).rglob('*')) / 1024 / 1024, 3)
-        self.logger.info(f"Generated temporary profile with size {folder_size} MB")
+        self.logger.info(f"Generated temporary profile at {reduced_profile_path} with size {folder_size} MB")
         return reduced_profile_path


-def make_path_unique(path):
-    fname, ending = os.path.splitext(path)
-    fname += datetime.datetime.now().strftime("%d-%H%M%S")
-    return fname + ending
+def ensure_unique(path, fname):
+    fbase, ending = os.path.splitext(fname)
+
+    exists = os.path.exists(os.path.join(path, fname))
+    i = 1
+    while exists:
+        fname = fbase + f" -- fetch {i}" + ending
+        i += 1
+        exists = os.path.exists(os.path.join(path, fname))
+
+    return fname
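
Unlike the old timestamp-based `make_path_unique`, the new `ensure_unique` produces predictable " -- fetch N" names. A quick, self-contained demonstration (same logic as above):

    import os
    import tempfile

    def ensure_unique(path, fname):
        # Append " -- fetch N" before the extension until the name is free.
        fbase, ending = os.path.splitext(fname)
        exists = os.path.exists(os.path.join(path, fname))
        i = 1
        while exists:
            fname = fbase + f" -- fetch {i}" + ending
            i += 1
            exists = os.path.exists(os.path.join(path, fname))
        return fname

    with tempfile.TemporaryDirectory() as d:
        open(os.path.join(d, "article.pdf"), "w").close()
        print(ensure_unique(d, "article.pdf"))  # article -- fetch 1.pdf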
@@ -3,7 +3,7 @@ from .download.browser import PDFDownloader
 from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
-from .compress.runner import shrink_pdf


 import time
 import logging
@@ -53,14 +53,3 @@ class UploadWorker(TemplateWorker):

         super()._handle_article(article_watcher, action)
         # article_watcher.upload_completed = True
-
-
-
-class CompressWorker(TemplateWorker):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _handle_article(self, article_watcher):
-        action = shrink_pdf
-        super()._handle_article(article_watcher, action)
-        # article_watcher.compression_completed = True