new component - upload to NAS

This commit is contained in:
Remy Moll
2022-07-23 17:21:00 +02:00
parent 79e3f54955
commit 8e46f30f07
29 changed files with 132 additions and 63 deletions


@@ -0,0 +1,277 @@
import logging
import configuration
import requests
import os
import time
from threading import Thread
from slack_sdk.errors import SlackApiError
logger = logging.getLogger(__name__)
config = configuration.parsed["SLACK"]
models = configuration.models
slack_client = "dummy"
LATEST_RECORDED_REACTION = 0
def init(client) -> None:
global slack_client
slack_client = client
global LATEST_RECORDED_REACTION
try:
LATEST_RECORDED_REACTION = models.Reaction.select(models.Reaction.id).order_by(models.Reaction.id)[-1]
except IndexError: # the query is empty: no reactions have been recorded yet
LATEST_RECORDED_REACTION = 0
# fetch all the messages we could have possibly missed
logger.info("Querying missed messages, threads and reactions. This can take some time.")
fetch_missed_channel_messages() # not threaded
t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
t.start()
if os.getenv("REDUCEDFETCH", "false") == "true":
logger.warning("Only fetching empty threads for bot messages because 'REDUCEDFETCH=true'")
fetch_missed_thread_messages(reduced=True)
else: # do the full thread fetch; the reaction fetch keeps running in its background thread
fetch_missed_thread_messages()
def get_unhandled_messages():
"""Gets all messages that have not yet been handled, be it by mistake or by downtime
As the message handler makes no distinction between channel messages and thread messages,
we don't have to worry about them here.
"""
threaded_objects = []
for t in models.Thread.select():
if t.message_count > 1: # if only one message was written, it is the channel message
msg = t.last_message
if msg.is_by_human:
threaded_objects.append(msg)
# else don't, nothing to process
logger.info(f"Set {len(threaded_objects)} thread-messages as not yet handled.")
channel_objects = [t.initiator_message for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
logger.info(f"Set {len(channel_objects)} channel-messages as not yet handled.")
reaction_objects = list(models.Reaction.select().where(models.Reaction.id > LATEST_RECORDED_REACTION))
logger.info(f"Set {len(reaction_objects)} reactions as not yet handled.")
# the ones newer than the last before the fetch
all_messages = channel_objects + threaded_objects
return all_messages, reaction_objects
def fetch_missed_channel_messages():
# find the latest processed message timestamp so that only newer messages are fetched
presaved = models.Message.select().order_by(models.Message.ts)
if not presaved:
last_ts = 0
else:
last_message = presaved[-1]
last_ts = last_message.slack_ts
result = slack_client.conversations_history(
channel=config["archive_id"],
oldest=last_ts
)
new_messages = result.get("messages", [])
# # filter the last one, it is a duplicate! (only if the db is not empty!)
# if last_ts != 0 and len(new_messages) != 0:
# new_messages.pop(-1)
new_fetches = 0
for m in new_messages:
# print(m)
message_dict_to_model(m)
new_fetches += 1
refetch = result.get("has_more", False)
while refetch: # we have not actually fetched them all
try:
result = slack_client.conversations_history(
channel = config["archive_id"],
cursor = result["response_metadata"]["next_cursor"],
oldest = last_ts
) # fetches the next batch of messages (older than the ones already fetched), following the pagination cursor
refetch = result.get("has_more", False)
new_messages = result.get("messages", [])
for m in new_messages:
message_dict_to_model(m)
new_fetches += 1
except SlackApiError: # Most likely a rate-limit
logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
time.sleep(config["api_wait_time"])
refetch = True
logger.info(f"Fetched {new_fetches} new channel messages.")
def fetch_missed_thread_messages(reduced=False):
"""After having gotten all base-threads, we need to fetch all their replies"""
# I don't know of a better way: we need to fetch this for each and every thread (except if it is marked as permanently solved)
logger.info("Starting fetch of thread messages...")
if reduced:
threads = [t for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
# this only fetches completely empty threads, which might be because the bot-message was not yet saved to the db.
# once we have all the bot messages, the remaining empty threads are the ones that still need processing.
else:
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
logger.info(f"Fetching history for {len(threads)} empty threads")
new_messages = []
for i,t in enumerate(threads):
try:
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
except SlackApiError:
logger.error("Hit rate limit while querying threaded messages, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
messages = slack_client.conversations_replies(
channel = config["archive_id"],
ts = t.slack_ts,
oldest = t.messages[-1].slack_ts
)["messages"]
messages.pop(0) # the first message is the one posted in the channel. We already processed it!
for m in messages:
# only append *new* messages
res = message_dict_to_model(m)
if res:
new_messages.append(res)
logger.info("Fetched {} new threaded messages.".format(len(new_messages)))
def fetch_missed_channel_reactions():
logger.info("Starting background fetch of channel reactions...")
threads = [t for t in models.Thread.select() if not t.is_fully_processed]
for i,t in enumerate(threads):
try:
query = slack_client.reactions_get(
channel = config["archive_id"],
timestamp = t.slack_ts
)
reactions = query.get("message", []).get("reactions", []) # default = []
except SlackApiError: # probably a rate_limit:
logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
time.sleep(int(config["api_wait_time"]))
reactions = query.get("message", []).get("reactions", [])
for r in reactions:
reaction_dict_to_model(r, t)
# Helpers for message conversion to db-objects
def reaction_dict_to_model(reaction, thread=None):
if thread is None:
m_ts = reaction["item"]["ts"]
message = models.Message.get(ts = float(m_ts))
thread = message.thread
if "name" in reaction.keys(): # fetched through manual api query
content = reaction["name"]
elif "reaction" in reaction.keys(): # fetched through events
content = reaction["reaction"]
else:
logger.error(f"Weird reaction received: {reaction}")
return None
r, _ = models.Reaction.get_or_create(
type = content,
message = thread.initiator_message
)
logger.info("Saved reaction [{}]".format(content))
return r
def message_dict_to_model(message):
if message["type"] == "message":
thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
uid = message.get("user", "BAD USER")
if uid == "BAD USER":
logger.critical("Message has no user?? {}".format(message))
return None
user, _ = models.User.get_or_create(user_id = uid)
thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
m, new = models.Message.get_or_create(
user = user,
thread = thread,
ts = message["ts"],
channel_id = config["archive_id"],
text = message["text"]
)
logger.info(f"Saved: {m} ({'new' if new else 'old'})")
files = message.get("files", [])
if len(files) >= 1:
f = files[0] # only the first attached file is handled
m.file_type = f["filetype"]
m.perma_link = f["url_private_download"]
m.save()
logger.info(f"Saved {m.file_type}-file for message (id={m.id})")
if new:
return m
else:
return None
else:
logger.warning("What should I do of {}".format(message))
return None
def say_substitute(*args, **kwargs):
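# stand-in for slack_bolt's say(): posts directly to the archive channel, used where no event context provides a say callback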
logger.info("Now sending message through say-substitute: {}".format(" - ".join(args)))
slack_client.chat_postMessage(
channel=config["archive_id"],
text=" - ".join(args),
**kwargs
)
def save_as_related_file(url, article_object):
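# download the file behind the private slack url; the bot token is required as bearer authorization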
r = requests.get(url, headers={"Authorization": "Bearer {}".format(slack_client.token)})
saveto = article_object.save_path
ftype = url[url.rfind(".") + 1:]
fname = "{} - related no {}.{}".format(
article_object.file_name.replace(".pdf",""),
len(article_object.related) + 1,
ftype
)
with open(os.path.join(saveto, fname), "wb") as f:
f.write(r.content)
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return fname
def react_file_path_message(fname, article_object):
saveto = article_object.save_path
file_path = os.path.join(saveto, fname)
if os.path.exists(file_path):
article_object.set_related([fname])
logger.info("Added {} to model {}".format(fname, article_object))
return True
else:
return False
def is_message_in_archiving(message) -> bool:
if isinstance(message, dict):
return message["channel"] == config["archive_id"]
else:
return message.channel_id == config["archive_id"]
def is_reaction_in_archiving(event) -> bool:
if isinstance(event, dict):
return event["item"]["channel"] == config["archive_id"]
else:
return event.message.channel_id == config["archive_id"]


@@ -0,0 +1,184 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler
import logging
import configuration
from . import message_helpers
config = configuration.parsed["SLACK"]
models = configuration.models
class BotApp(App):
logger = logging.getLogger(__name__)
def __init__(self, callback, *args, **kwargs):
super().__init__(*args, **kwargs)
self.callback = callback
def start(self):
message_helpers.init(self.client)
missed_messages, missed_reactions = message_helpers.get_unhandled_messages()
[self.handle_incoming_message(m) for m in missed_messages]
[self.handle_incoming_reaction(r) for r in missed_reactions]
# self.react_missed_reactions(missed_reactions)
# self.react_missed_messages(missed_messages)
self.startup_status()
def handle_incoming_reaction(self, reaction):
if isinstance(reaction, dict): #else: the reaction is already being passed as a model
# CAUTION: watch out for 'changed reactions'; those are nasty (usually when adding an url)
reaction = message_helpers.reaction_dict_to_model(reaction)
thread = reaction.message.thread
article_object = thread.article
if article_object is not None:
reaction = reaction.type
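# a white_check_mark marks the article as verified (1); any other reaction rejects it (-1)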
status = 1 if reaction == "white_check_mark" else -1
# self.logger.info(f"Applying reaction {reaction} to its root message.")
article_object.verified = status
article_object.save()
def handle_incoming_message(self, message):
"""Reacts to all messages inside channel archiving. Must then
distinguish between threaded replies and new requests
and react accordingly"""
if isinstance(message, dict): #else: the message is already being passed as a model
# CAUTION: filter out 'message_changed' events; those are nasty (usually triggered when an url is added)
if message.get("subtype", "not bad") == "message_changed":
return False
message = message_helpers.message_dict_to_model(message)
# First check: belongs to thread?
is_threaded = message.thread.message_count > 1 and message != message.thread.initiator_message
if is_threaded:
self.incoming_thread_message(message)
else:
self.incoming_channel_message(message)
def incoming_thread_message(self, message):
if message.user.user_id == config["bot_id"]:
return True # ignore the files uploaded by the bot. We handled them already!
thread = message.thread
if thread.is_fully_processed:
return True
self.logger.info("Receiving thread-message")
self.respond_thread_message(message)
def incoming_channel_message(self, message):
self.logger.info(f"Handling message {message} ({len(message.urls)} urls)")
if not message.urls: # no urls in a root-message => IGNORE
message.is_processed_override = True
message.save()
return
# ensure the thread is still empty; this scenario is only encountered in testing, but filter it anyway
if message.thread.message_count > 1:
self.logger.info("Discarded message because it is actually processed.")
return
if len(message.urls) > 1:
message_helpers.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts)
self.callback(message)
# for url in message.urls:
# self.callback(url, message)
# stop here!
def respond_thread_message(self, message, say=message_helpers.say_substitute):
thread = message.thread
article = thread.article
if message.perma_link: # file upload means new data
fname = message_helpers.save_as_related_file(message.perma_link, article)
say("File was saved as 'related file' under `{}`.".format(fname),
thread_ts=thread.slack_ts
)
else: # either a pointer to a new file (too large to upload), or trash
success = message_helpers.react_file_path_message(message.text, article)
if success:
say("File was saved as 'related file'", thread_ts=thread.slack_ts)
else:
self.logger.error("User replied to thread {} but the response did not contain a file/path".format(thread))
say("Cannot process response without associated file.",
thread_ts=thread.slack_ts
)
def respond_channel_message(self, thread, say=message_helpers.say_substitute):
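# post the article's results back into its thread, uploading the generated file when one exists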
article = thread.article
answers = article.slack_info
for a in answers:
if a["file_path"]:
try: # either, a["file_path"] does not exist, or the upload resulted in an error
self.client.files_upload(
channels = config["archive_id"],
initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
file = a["file_path"],
thread_ts = thread.slack_ts
)
status = True
except Exception: # file missing or upload failed
say(
"File {} could not be uploaded.".format(a["file_path"]),
thread_ts=thread.slack_ts
)
status = False
else: # anticipated that there is no file!
say(
f"<@{config['responsible_id']}> \n {a['reply_text']}",
thread_ts=thread.slack_ts
)
status = True
def startup_status(self):
threads = list(models.Thread.select())
all_threads = len(threads)
fully_processed = len([t for t in threads if t.is_fully_processed])
fully_unprocessed = len([t for t in threads if t.message_count == 1])
articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
class BotRunner():
"""Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
def __init__(self, callback, *args, **kwargs) -> None:
self.bot_worker = BotApp(callback, token=config["auth_token"])
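# register the event handlers on the BotApp instance; the matchers restrict both to the archive channel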
@self.bot_worker.event(event="message", matchers=[message_helpers.is_message_in_archiving])
def handle_incoming_message(message, say):
return self.bot_worker.handle_incoming_message(message)
@self.bot_worker.event(event="reaction_added", matchers=[message_helpers.is_reaction_in_archiving])
def handle_incoming_reaction(event, say):
return self.bot_worker.handle_incoming_reaction(event)
# target = self.launch
# super().__init__(target=target)
def start(self):
self.bot_worker.start()
SocketModeHandler(self.bot_worker, config["app_token"]).start()
# def respond_to_message(self, message):
# self.bot_worker.handle_incoming_message(message)