Fixed browser profile bug, line breaks and exceptions in news_check

2022-09-26 15:25:55 +02:00
parent db161e50c8
commit 9349b046d2
12 changed files with 150 additions and 319 deletions
--- a/news_fetch/configuration.py
+++ b/news_fetch/configuration.py
@@ -64,5 +64,5 @@ else:

 from utils_storage import models

-# Set up the database
+# Set up the database connection (also creates tables if they don't exist)
 models.set_db(download_db)
--- a/news_fetch/utils_check/runner.py
+++ b/news_fetch/utils_check/runner.py
@@ -1,208 +0,0 @@
-from rich.console import Console
-from rich.table import Table
-from rich.columns import Columns
-from rich.rule import Rule
-console = Console()
-hline = Rule(style="white")
-
-import os
-import subprocess
-from slack_sdk import WebClient
-import configuration
-models = configuration.models
-
-u_options = {
-    "ENTER" : "Accept PDF as is. It gets marked as verified",
-    "D" : "set languange to DE and set verified",
-    "E" : "set languange to EN and set verified",
-    "O" : "set other language (prompted)",
-    "R" : "set related files (prompted multiple times)",
-    "B" : "reject and move to folder BAD",
-    "L" : "leave file as is, do not send reaction"
-}
-
-
-bot_client = WebClient(
-    token = configuration.main_config["SLACK"]["auth_token"]
-)
-
-
-
-
-
-def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
-    """Prints a neat overview of the current article"""
-    file_table = Table(
-        title = file_url,
-        row_styles = ["white", "bright_black"],
-        min_width = 100
-    )
-
-    file_table.add_column("Attribute", justify = "right", no_wrap = True)
-    file_table.add_column("Value set by auto_news")
-    file_table.add_column("Status", justify = "right")
-    for attr in file_attributes:
-        file_table.add_row(attr["name"], attr["value"], attr["status"])
-
-    
-    option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()])
-    option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()])
-    columns = Columns([option_key, option_action])
-
-    console.print(file_table)
-    console.print("Your options:")
-    console.print(columns)
-
-
-def send_reaction_to_slack_thread(article, reaction):
-    """Sends the verification status as a reaction to the associated slack thread."""
-    thread = article.slack_thread
-    messages = models.Message.select().where(models.Message.text.contains(article.article_url))
-    # TODO rewrite this shit
-    if len(messages) > 5:
-        print("Found more than 5 messages. Aborting reactions...")
-        return
-    for m in messages:
-        if m.is_processed_override:
-            print("Message already processed. Aborting reactions...")
-        elif not m.has_single_url:
-            print("Found thread but won't send reaction because thread has multiple urls")
-        else:
-            ts = m.slack_ts
-            bot_client.reactions_add(
-                channel=configuration.main_config["SLACK"]["archive_id"],
-                name=reaction,
-                timestamp=ts
-            )
-            print("Sent reaction to message")
-
-
-def prompt_language(query):
-    not_set = True
-    while not_set:
-        uin = input("Set language (nation-code, 2 letters) ")
-        if len(uin) != 2:
-            print("Bad code, try again")
-        else:
-            not_set = False
-            query.language = uin
-            query.save()
-
-
-def prompt_related(query):
-    file_list = []
-    finished = False
-    while not finished:
-        uin = input("Additional file for article? Type '1' to cancel ")
-        if uin == "1":
-            query.set_related(file_list)
-            finished = True
-        else:
-            file_list.append(uin)
-
-
-def prompt_new_fname(query):
-    uin = input("New fname? ")
-    old_fname =  query.file_name
-    query.file_name = uin
-    query.verified = 1
-    if old_fname != "":
-        os.remove(query.save_path + old_fname)
-    query.save()    
-
-
-
-def reject_article(article):
-    article.verified = -1
-    article.save()
-    print("Article marked as bad")
-    # also update the threads to not be monitored anymore
-    send_reaction_to_slack_thread(article, "x")
-
-
-def unreject_article(query):
-    query.verified = 1
-    query.save()
-    # os.rename(badpdf, fname)
-    print("File set to verified")
-
-
-def accept_article(article, last_accepted):
-    article.verified = 1
-    article.save()
-    print("Article accepted as GOOD")
-
-    # also update the threads to not be monitored anymore
-    send_reaction_to_slack_thread(article, "white_check_mark")
-
-    return "" # linked
-
-
-
-
-
-
-def verify_unchecked():
-    query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
-    last_linked = None
-
-    for article in query:
-        console.print(hline)
-        core_info = []
-        for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]):
-            entry = {
-                "status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]",
-                "value" : e if len(e) != 0 else "not set",
-                "name" : name
-            }
-            core_info.append(entry)
-        
-        try:
-            # close any previously opened windows:
-            # subprocess.call(["kill", "`pgrep evince`"])
-            os.system("pkill evince")
-            # then open a new one
-            subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            # supress evince gtk warnings
-        except Exception as e:
-            print(e)
-            continue
-
-        
-
-        file_overview(
-            file_url = article.article_url, 
-            file_attributes=core_info,
-            options = u_options
-        )
-
-
-        proceed = False
-        while not proceed:
-            proceed = False
-            uin = input("Choice ?").lower()
-            if uin == "":
-                last_linked = accept_article(article, last_linked) # last linked accelerates the whole process
-                proceed = True
-            elif uin == "d":
-                article.language = "de"
-                article.verified = 1
-                article.save()
-                proceed = True
-            elif uin == "e":
-                article.language = "en"
-                article.verified = 1
-                article.save()
-                proceed = True
-            elif uin == "o":
-                prompt_language(article)
-            elif uin == "r":
-                prompt_related(article)
-            elif uin == "b":
-                reject_article(article)
-                proceed = True
-            elif uin == "l":
-                # do nothing
-                proceed = True
-            else:
-                print("Invalid input")
--- a/news_fetch/utils_worker/download/browser.py
+++ b/news_fetch/utils_worker/download/browser.py
@@ -1,70 +1,72 @@
+import logging
 import time
 import datetime
-import logging
-import os
+
+import os, shutil, uuid
+from pathlib import Path
+
 import base64
 import requests
 from selenium import webdriver
+
 import configuration

 config = configuration.main_config["DOWNLOADS"]

+def driver_running(f):  # decorator: makes sure the driver is started before the wrapped method runs
+    def wrapper(*args, **kwargs):
+        self = args[0]  # first positional argument is the instance the method was called on
+        if not self._running:
+            self.start()  # start on demand, so callers need no explicit start() beforehand
+        return f(*args, **kwargs)
+    return wrapper
+
+

 class PDFDownloader:
    """Saves a given url. Fills the object it got as a parameter"""
    logger = logging.getLogger(__name__)
-    # status-variable for restarting:
-    running = False
-    
+    _running = False
+
+
    def start(self):
-        self.finish() # clear up
-            
-        options = webdriver.ChromeOptions()
-        options.add_argument(f"user-data-dir={config['browser_profile_path']}")
-        options.add_argument('--headless')
+        """Called externally to start the driver, but after an exception can also be called internally"""
+        if self._running:
+            self.finish() # clear up

-        # if os.getenv("DEBUG", "false") == "true":
-        #     self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
-        # else:
+        self.logger.info("Starting geckodriver")
+        
+        reduced_path = self.create_tmp_profile()
+        profile = webdriver.FirefoxProfile(reduced_path)
+        options = webdriver.FirefoxOptions()

-        # options.set_preference('print.save_as_pdf.links.enabled', True)
-        # # Just save if the filetype is pdf already
-        # # TODO: this is not working right now
+        if os.getenv("DEBUG", "false") == "true":
+            self.logger.warning("Opening browser GUI because of 'DEBUG=true'")
+        else:
+            options.add_argument('--headless')

-        # options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True)
-        # options.set_preference("browser.download.folderList", 2)
-        # # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
-        # # options.set_preference("pdfjs.disabled", True)
-        # options.set_preference("browser.download.dir", config["default_download_path"])
-
-        self.logger.info("Starting chrome driver")
        self.driver = webdriver.Remote(
-            command_executor = 'http://chrome:4444', # the host chrome points to the chrome container
+            command_executor = 'http://geckodriver:4444', # the host geckodriver points to the geckodriver container
            options = options,
-            # can't set log path...
+            browser_profile = profile
        )
        
-        self.running = True
+        self._running = True

-    def autostart(self):
-        if not self.running:
-            self.start()  # relaunch the dl util

    def finish(self):
-        if self.running:
-            self.logger.info("Exiting chrome driver")
-            try:
-                self.driver.quit()
-                time.sleep(10)
-            except:
-                self.logger.critical("Connection to the driver broke off")
-            self.running = False
-        else:
-            self.logger.info("Chrome driver not yet running")
+        self.logger.info("Exiting Geckodriver")
+        try:
+            self.driver.quit()
+            time.sleep(10)
+        except:
+            self.logger.critical("Connection to the driver broke off")
+        self._running = False

+
+    @driver_running
    def download(self, article_object):
-        sleep_time = 2
-        self.autostart()
+        sleep_time = int(config["browser_print_delay"])
        url = article_object.article_url

        try:
@@ -89,20 +91,17 @@ class PDFDownloader:
            dst = os.path.join(article_object.save_path, fname)


-        if url[-4:] == ".pdf":
-            # according to the browser preferences, calling the url will open pdfjs.
-            # If not handled separately, printing would require the ctrl+p route, but setup is janky to say the least
+        if url[-4:] == ".pdf": # calling the usual pdf generation would not yield a nice pdf, just download it directly
            success = self.get_exisiting_pdf(url, dst)
        else:
            success = self.get_new_pdf(dst)

-
        if success:
            article_object.file_name = fname
        else:
            article_object.file_name = ""
        
-        return article_object  # this change is saved later by the external caller
+        return article_object # this change is saved later by the external caller


    def get_exisiting_pdf(self, url, dst):
@@ -134,9 +133,26 @@ class PDFDownloader:
        except Exception as e:
            self.logger.error(f"Failed, because of FS-operation: {e}")
            return False
-        


+    def create_tmp_profile(self, full_profile_path: Path = Path(config["browser_profile_path"])) -> Path:
+        """Copy the needed directories and files of the full browser profile into a fresh /tmp directory and return its path."""
+        reduced_profile_path = Path(f"/tmp/firefox_profile_{uuid.uuid4()}")
+        self.logger.debug(f"Copying profile {full_profile_path} to {reduced_profile_path}")
+        reduced_profile_path.mkdir()
+        # copy needed directories
+        for dir_name in ("extensions", "storage"):
+            shutil.copytree(full_profile_path / dir_name, reduced_profile_path / dir_name)
+
+        # copy needed files
+        files = ["extension-preferences.json", "addons.json", "addonStartup.json.lz4", "prefs.js", "extensions.json", "cookies.sqlite"]
+        for f in files:
+            shutil.copy(full_profile_path / f, reduced_profile_path)
+
+        folder_size = round(sum(p.stat().st_size for p in reduced_profile_path.rglob('*')) / 1024 / 1024, 3)
+        self.logger.info(f"Generated temporary profile with size {folder_size} MB")
+        return reduced_profile_path
+