Fixed browser profile bug, line breaks and exceptions in news_check

This commit is contained in:
2022-09-26 15:25:55 +02:00
parent db161e50c8
commit 9349b046d2
12 changed files with 150 additions and 319 deletions

View File

@@ -64,5 +64,5 @@ else:
from utils_storage import models
# Set up the database
# Set up the database connection (also creates tables if they don't exist)
models.set_db(download_db)

View File

@@ -1,208 +0,0 @@
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule
# Shared rich console used to render the tables, rules and option columns below
console = Console()
# White horizontal rule; printed before each article in verify_unchecked
hline = Rule(style="white")
import os
import subprocess
from slack_sdk import WebClient
import configuration
# Database models, as exposed by the project's configuration module
models = configuration.models
# Keyboard choices offered for each article: key -> description shown to the user
u_options = {
"ENTER" : "Accept PDF as is. It gets marked as verified",
"D" : "set languange to DE and set verified",
"E" : "set languange to EN and set verified",
"O" : "set other language (prompted)",
"R" : "set related files (prompted multiple times)",
"B" : "reject and move to folder BAD",
"L" : "leave file as is, do not send reaction"
}
# Slack client, authenticated with the token from the "SLACK" section of the main configuration
bot_client = WebClient(
token = configuration.main_config["SLACK"]["auth_token"]
)
def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
    """Print a summary table for one article, followed by the available choices.

    Args:
        file_url: Used as the title of the table.
        file_attributes: Dicts with the keys "name", "value" and "status";
            each one becomes one table row.
        options: Mapping of key to description. Keys and descriptions are
            printed as two columns below the table.
    """
    overview = Table(
        title=file_url,
        row_styles=["white", "bright_black"],
        min_width=100,
    )
    overview.add_column("Attribute", justify="right", no_wrap=True)
    overview.add_column("Value set by auto_news")
    overview.add_column("Status", justify="right")
    for attribute in file_attributes:
        overview.add_row(attribute["name"], attribute["value"], attribute["status"])

    # One line per option: the key in bold brackets, the description in italics
    key_lines = [f"[[bold]{key}[/bold]]" for key in options]
    action_lines = [f"[italic]{description}[/italic]" for description in options.values()]
    option_columns = Columns(["\n".join(key_lines), "\n".join(action_lines)])

    console.print(overview)
    console.print("Your options:")
    console.print(option_columns)
def send_reaction_to_slack_thread(article, reaction):
    """Add `reaction` to the stored slack messages that mention the article url.

    Messages are selected by checking whether their text contains
    ``article.article_url``. Nothing is sent when more than 5 messages match.
    A message is skipped when it is flagged ``is_processed_override`` or when
    ``has_single_url`` is false; every other match receives the reaction in
    the channel configured as ``archive_id``.

    Args:
        article: Object providing ``slack_thread`` and ``article_url``.
        reaction: Name of the reaction to add, e.g. "x".
    """
    thread = article.slack_thread  # looked up here, but not used further down
    matches = models.Message.select().where(models.Message.text.contains(article.article_url))
    # TODO: rework this lookup
    if len(matches) > 5:
        print("Found more than 5 messages. Aborting reactions...")
        return

    for message in matches:
        if message.is_processed_override:
            print("Message already processed. Aborting reactions...")
            continue
        if not message.has_single_url:
            print("Found thread but won't send reaction because thread has multiple urls")
            continue

        message_ts = message.slack_ts
        bot_client.reactions_add(
            channel=configuration.main_config["SLACK"]["archive_id"],
            name=reaction,
            timestamp=message_ts,
        )
        print("Sent reaction to message")
def prompt_language(query):
    """Ask for a two-letter language code until one is given, then store it.

    The answer is stripped of surrounding whitespace and lower-cased, so that
    it is stored in the same form as the "de"/"en" codes written by the D and
    E shortcuts of the verification loop.

    Args:
        query: Object with a writable ``language`` attribute and a ``save()``
            method; ``save()`` is called once after the code has been set.
    """
    while True:
        code = input("Set language (nation-code, 2 letters) ").strip().lower()
        if len(code) == 2:
            break
        print("Bad code, try again")

    query.language = code
    query.save()
def prompt_related(query):
    """Collect related file names from the user and hand them to the article.

    Every answer is recorded until the user types '1'. The collected list
    (possibly empty) is then passed to ``query.set_related``.

    Args:
        query: Object providing a ``set_related(list)`` method.
    """
    def ask():
        return input("Additional file for article? Type '1' to cancel ")

    # iter(callable, sentinel) keeps asking until the answer equals "1"
    related_files = list(iter(ask, "1"))
    query.set_related(related_files)
def prompt_new_fname(query):
    """Ask for a new file name, mark the article as verified and delete the old file.

    Args:
        query: Object with ``file_name``, ``save_path`` and ``verified``
            attributes and a ``save()`` method.

    Raises:
        OSError: If the previous file cannot be removed (e.g. it does not
            exist). In that case ``save()`` is not called.
    """
    new_name = input("New fname? ")
    old_name = query.file_name
    query.file_name = new_name
    query.verified = 1
    if old_name != "":
        # Join the parts instead of concatenating them, so that a save_path
        # without a trailing separator still points at the right file
        os.remove(os.path.join(query.save_path, old_name))
    query.save()
def reject_article(article):
    """Mark the article as rejected (verified = -1), save it and react with "x" on slack."""
    rejected_state = -1
    article.verified = rejected_state
    article.save()
    print("Article marked as bad")

    # Signal the rejection on the slack messages that mention this article
    send_reaction_to_slack_thread(article, "x")
def unreject_article(query):
    """Set the article to verified (verified = 1) and save it.

    Args:
        query: Object with a writable ``verified`` attribute and a ``save()`` method.
    """
    verified_state = 1
    query.verified = verified_state
    query.save()
    print("File set to verified")
def accept_article(article, last_accepted):
    """Mark the article as verified, save it and react with a check mark on slack.

    Args:
        article: Object with a writable ``verified`` attribute and a ``save()`` method.
        last_accepted: Not read by this function.

    Returns:
        Always the empty string.
    """
    accepted_state = 1
    article.verified = accepted_state
    article.save()
    print("Article accepted as GOOD")

    # Signal the acceptance on the slack messages that mention this article
    send_reaction_to_slack_thread(article, "white_check_mark")

    linked = ""
    return linked
def _attribute_rows(article):
    """Build the overview rows (name, value, status) for the core attributes of `article`."""
    attributes = (
        ("Save path", article.save_path),
        ("File name", article.file_name),
        ("Title", article.title),
        ("Language", article.language),
    )
    rows = []
    for name, value in attributes:
        # None, -1 and empty values all count as "not set". They are checked
        # before any length test so that non-string values cannot raise.
        unset = value is None or value == -1 or len(str(value)) == 0
        rows.append({
            "status": "[red]██[/red]" if unset else "[green]██[/green]",
            "value": "not set" if unset else str(value),
            "name": name,
        })
    return rows


def _open_in_viewer(article):
    """Close running evince windows, then open the article's file in a new one."""
    os.system("pkill evince")
    # The viewer's output is discarded (this hides the gtk warnings). DEVNULL
    # is used rather than PIPE because nothing ever reads from the pipes.
    subprocess.Popen(
        ["evince", f"file://{os.path.join(article.save_path, article.file_name)}"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )


def verify_unchecked():
    """Interactively walk through every article whose ``verified`` field is 0.

    For each article the file is opened in evince, an overview is printed and
    the user is asked for a choice (case-insensitive):

    - ENTER: accept the article via ``accept_article``
    - d / e: set the language to "de" / "en" and mark it as verified
    - o: prompt for another language, then ask again
    - r: prompt for related files, then ask again
    - b: reject the article via ``reject_article``
    - l: leave the article unchanged

    Any other input prints "Invalid input" and asks again. If the viewer
    cannot be started, the error is printed and the article is skipped.
    """
    quick_languages = {"d": "de", "e": "en"}

    query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
    last_linked = None
    for article in query:
        console.print(hline)
        core_info = _attribute_rows(article)

        try:
            _open_in_viewer(article)
        except Exception as e:
            print(e)
            continue

        file_overview(
            file_url=article.article_url,
            file_attributes=core_info,
            options=u_options,
        )

        proceed = False
        while not proceed:
            uin = input("Choice ?").lower()
            if uin == "":
                last_linked = accept_article(article, last_linked)
                proceed = True
            elif uin in quick_languages:
                article.language = quick_languages[uin]
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "o":
                prompt_language(article)
            elif uin == "r":
                prompt_related(article)
            elif uin == "b":
                reject_article(article)
                proceed = True
            elif uin == "l":
                # leave the article untouched and move on
                proceed = True
            else:
                print("Invalid input")

View File

@@ -1,70 +1,72 @@
import logging
import time
import datetime
import logging
import os
import os, shutil, uuid
from pathlib import Path
import base64
import requests
from selenium import webdriver
import configuration
# Settings from the "DOWNLOADS" section of the main configuration
config = configuration.main_config["DOWNLOADS"]
def driver_running(f):
    """Decorator for methods that need the driver to be started first.

    Before the wrapped method runs, ``self.start()`` is called when
    ``self._running`` is falsy. The wrapped method's name and docstring are
    preserved on the returned wrapper.

    Args:
        f: The method to wrap; its first positional argument must be the
            instance providing ``_running`` and ``start()``.

    Returns:
        The wrapping function, which returns whatever ``f`` returns.
    """
    import functools  # local import so the decorator does not depend on the file header

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        if not self._running:
            self.start()
        return f(self, *args, **kwargs)

    return wrapper
class PDFDownloader:
"""Saves a given url. Fills the object it got as a parameter"""
logger = logging.getLogger(__name__)
# status-variable for restarting:
running = False
_running = False
def start(self):
self.finish() # clear up
options = webdriver.ChromeOptions()
options.add_argument(f"user-data-dir={config['browser_profile_path']}")
options.add_argument('--headless')
"""Called externally to start the driver, but after an exception can also be called internally"""
if self._running:
self.finish() # clear up
# if os.getenv("DEBUG", "false") == "true":
# self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
# else:
self.logger.info("Starting geckodriver")
reduced_path = self.create_tmp_profile()
profile = webdriver.FirefoxProfile(reduced_path)
options = webdriver.FirefoxOptions()
# options.set_preference('print.save_as_pdf.links.enabled', True)
# # Just save if the filetype is pdf already
# # TODO: this is not working right now
if os.getenv("DEBUG", "false") == "true":
self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
else:
options.add_argument('--headless')
# options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
# options.set_preference("browser.download.folderList", 2)
# # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
# # options.set_preference("pdfjs.disabled", True)
# options.set_preference("browser.download.dir", config["default_download_path"])
self.logger.info("Starting chrome driver")
self.driver = webdriver.Remote(
command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container
options = options,
# can't set log path...
browser_profile = profile
)
self.running = True
self._running = True
def autostart(self):
if not self.running:
self.start() # relaunch the dl util
def finish(self):
if self.running:
self.logger.info("Exiting chrome driver")
try:
self.driver.quit()
time.sleep(10)
except:
self.logger.critical("Connection to the driver broke off")
self.running = False
else:
self.logger.info("Chrome driver not yet running")
self.logger.info("Exiting Geckodriver")
try:
self.driver.quit()
time.sleep(10)
except:
self.logger.critical("Connection to the driver broke off")
self._running = False
@driver_running
def download(self, article_object):
sleep_time = 2
self.autostart()
sleep_time = int(config["browser_print_delay"])
url = article_object.article_url
try:
@@ -89,20 +91,17 @@ class PDFDownloader:
dst = os.path.join(article_object.save_path, fname)
if url[-4:] == ".pdf":
# according to the browser preferences, calling the url will open pdfjs.
# If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
if url[-4:] == ".pdf": # calling the ususal pdf generation would not yield a nice pdf, just download it directly
success = self.get_exisiting_pdf(url, dst)
else:
success = self.get_new_pdf(dst)
if success:
article_object.file_name = fname
else:
article_object.file_name = ""
return article_object # this change is saved later by the external caller
return article_object # this change is saved later by the external caller
def get_exisiting_pdf(self, url, dst):
@@ -134,9 +133,26 @@ class PDFDownloader:
except Exception as e:
self.logger.error(f"Failed, because of FS-operation: {e}")
return False
def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
    """Copy the needed parts of a browser profile into a new directory under /tmp.

    Only the "extensions" and "storage" directories and a fixed list of
    preference, add-on and cookie files are copied.

    Args:
        full_profile_path: Directory of the complete profile. Defaults to the
            configured "browser_profile_path"; plain strings are accepted too.

    Returns:
        Path of the newly created, reduced profile directory.

    Raises:
        OSError: If one of the expected directories or files is missing in
            ``full_profile_path`` or cannot be copied.
    """
    full_profile_path = Path(full_profile_path)
    reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
    self.logger.debug(f"Copying profile from {full_profile_path} to {reduced_profile_path}")
    reduced_profile_path.mkdir()

    # copy needed directories
    for dir_name in ("extensions", "storage"):
        shutil.copytree(full_profile_path / dir_name, reduced_profile_path / dir_name)

    # copy needed files
    needed_files = (
        "extension-preferences.json",
        "addons.json",
        "addonStartup.json.lz4",
        "prefs.js",
        "extensions.json",
        "cookies.sqlite",
    )
    for file_name in needed_files:
        shutil.copy(full_profile_path / file_name, reduced_profile_path)

    # Sum over every entry below the new directory, reported in MB
    total_bytes = sum(p.stat().st_size for p in reduced_profile_path.rglob('*'))
    folder_size = round(total_bytes / 1024 / 1024, 3)
    self.logger.info(f"Generated temporary profile with size {folder_size} MB")
    return reduced_profile_path