new component - upload to NAS
news_fetch/app/configuration.py (new file, 59 lines)
@@ -0,0 +1,59 @@
from dataclasses import dataclass
import os
import shutil
import configparser
import logging
from datetime import datetime
from peewee import SqliteDatabase
from rich.logging import RichHandler

# first things first: logging
logging.basicConfig(
    format='%(message)s',
    level=logging.INFO,
    datefmt='%H:%M:%S', # add %Y-%m-%d if needed
    handlers=[RichHandler()]
    )
logger = logging.getLogger(__name__)


# load config file containing constants and secrets
parsed = configparser.ConfigParser()
parsed.read("/app/containerdata/config/news_fetch.config.ini")

if os.getenv("DEBUG", "false") == "true":
    logger.warning("Found 'DEBUG=true', setting up dummy databases")

    db_base_path = parsed["DATABASE"]["db_path_dev"]
    parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
    parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
    parsed["DOWNLOADS"]["local_storage_path"] = parsed["DATABASE"]["db_path_dev"]
else:
    logger.warning("Found 'DEBUG=false' and running on production databases, I hope you know what you're doing...")
    db_base_path = parsed["DATABASE"]["db_path_prod"]
    logger.info("Backing up databases")
    backup_dst = parsed["DATABASE"]["db_backup"]
    today = datetime.today().strftime("%Y.%m.%d")
    shutil.copyfile(
        os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
        os.path.join(backup_dst, today + "." + parsed["DATABASE"]["chat_db_name"]),
        )
    shutil.copyfile(
        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
        os.path.join(backup_dst, today + "." + parsed["DATABASE"]["download_db_name"]),
        )


from utils_storage import models

# Set up the database
models.set_db(
    SqliteDatabase(
        os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
    ),
    SqliteDatabase(
        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
    )
)
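For reference, a minimal sketch of the sections this module (and the other files in this commit) reads from news_fetch.config.ini. The section and key names are taken from the reads in the code; every value below is a placeholder, not a real default.

import configparser

# Sketch only, not part of the commit: the keys news_fetch expects.
sample = configparser.ConfigParser()
sample.read_dict({
    "DATABASE": {
        "db_path_dev": "/app/containerdata/dev/",
        "db_path_prod": "/app/containerdata/prod/",
        "db_backup": "/app/containerdata/backup/",
        "chat_db_name": "messages.db",
        "download_db_name": "downloads.db",
    },
    "SLACK": {
        "archive_id": "C-placeholder",
        "debug_id": "C-placeholder-dev",
        "bot_id": "U-placeholder",
        "responsible_id": "U-placeholder",
        "auth_token": "xoxb-placeholder",
        "app_token": "xapp-placeholder",
        "api_wait_time": "90",
    },
    "MAIL": {
        "sender": "bot@example.com",
        "recipient": "staff@example.com",
        "smtp_server": "smtp.example.com",
        "port": "587",
        "uname": "bot",
        "password": "placeholder",
    },
    "DOWNLOADS": {
        "local_storage_path": "/app/containerdata/files/",
        "remote_storage_path": "/remote/news/files",
    },
})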
							
								
								
									
news_fetch/app/runner.py (new file, 197 lines)
@@ -0,0 +1,197 @@
| """Main coordination of other util classes. Handles inbound and outbound calls""" | ||||
| import configuration | ||||
| models = configuration.models | ||||
| from threading import Thread | ||||
| import logging | ||||
| import os | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
| from utils_mail import runner as mail_runner | ||||
| from utils_slack import runner as slack_runner | ||||
| from utils_worker.workers import CompressWorker, DownloadWorker, FetchWorker, UploadWorker | ||||
|  | ||||
|  | ||||
| class ArticleWatcher: | ||||
|     """Wrapper for a newly created article object. Notifies the coordinator upon change/completition""" | ||||
|     def __init__(self, article, thread, **kwargs) -> None: | ||||
|         self.article_id = article.id # in case article becomes None at any point, we can still track the article | ||||
|         self.article = article | ||||
|         self.thread = thread | ||||
|  | ||||
|         self.completition_notifier = kwargs.get("notifier") | ||||
|         self.fetch = kwargs.get("worker_fetch", None) | ||||
|         self.download = kwargs.get("worker_download", None) | ||||
|         self.compress = kwargs.get("worker_compress", None) | ||||
|         self.upload = kwargs.get("worker_upload", None) | ||||
|  | ||||
|         self.completition_notified = False | ||||
|         # self._download_called = self._compression_called = False | ||||
|         self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False | ||||
|  | ||||
|         # first step: gather metadata | ||||
|         if self.fetch and self.upload: | ||||
|             self.fetch.process(self) # this will call the update_status method | ||||
|             self.upload.process(self) # idependent from the rest | ||||
|         else: # the full kwargs were not provided, only do a manual run | ||||
|             # overwrite update_status() because calls from the workers will result in erros | ||||
|             self.update_status = lambda completed: logger.info(f"Completed action {completed}") | ||||
|             for w in kwargs.get("workers_manual"): | ||||
|                 w.process(self) | ||||
|  | ||||
|  | ||||
|     def update_status(self, completed_action): | ||||
|         """Checks and notifies internal completition-status. | ||||
|         Article download is complete iff fetch and download were successfull and compression was run | ||||
|         """ | ||||
|         # if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done | ||||
|         if completed_action == "fetch": | ||||
|             self.download.process(self) | ||||
|         elif completed_action == "download": | ||||
|             self.compress.process(self) | ||||
|         elif completed_action == "compress": # last step | ||||
|             self.completition_notifier(self.article, self.thread) | ||||
|             # triggers action in Coordinator | ||||
|         elif completed_action == "upload": | ||||
|             # this case occurs when upload was faster than compression | ||||
|             pass | ||||
|         else: | ||||
|             logger.warning(f"update_status called with unusual configuration: {completed_action}") | ||||
|  | ||||
|  | ||||
    # ====== Attributes to be modified by the util workers
    @property
    def fetch_completed(self):
        return self._fetch_completed

    @fetch_completed.setter
    def fetch_completed(self, value: bool):
        self._fetch_completed = value
        self.update_status("fetch")

    @property
    def download_completed(self):
        return self._download_completed

    @download_completed.setter
    def download_completed(self, value: bool):
        self._download_completed = value
        self.update_status("download")

    @property
    def compression_completed(self):
        return self._compression_completed

    @compression_completed.setter
    def compression_completed(self, value: bool):
        self._compression_completed = value
        self.update_status("compress")

    @property
    def upload_completed(self):
        return self._upload_completed

    @upload_completed.setter
    def upload_completed(self, value: bool):
        self._upload_completed = value
        self.update_status("upload")

    def __str__(self) -> str:
        return f"Article with id {self.article_id}"

class Coordinator(Thread):
    def __init__(self, **kwargs) -> None:
        """Launcher calls this Coordinator as the main thread to handle connections between the other workers (threaded)."""
        super().__init__(target = self.launch)

    def add_workers(self, **kwargs):
        self.worker_slack = kwargs.pop("worker_slack", None)
        self.worker_mail = kwargs.pop("worker_mail", None)
        # the two above won't be needed in the Watcher
        self.worker_download = kwargs.get("worker_download", None)
        self.worker_fetch = kwargs.get("worker_fetch", None)
        self.worker_compress = kwargs.get("worker_compress", None)
        self.worker_upload = kwargs.get("worker_upload", None)

        self.kwargs = kwargs

    def launch(self) -> None:
        for w in [self.worker_download, self.worker_fetch, self.worker_upload, self.worker_compress]:
            if w is not None:
                w.start()


    def incoming_request(self, message):
        """This method is passed onto the slack worker. It gets triggered when a new message is received."""
        url = message.urls[0] # ignore all the other ones
        article, is_new = models.ArticleDownload.get_or_create(article_url=url)
        thread = message.thread
        thread.article = article
        thread.save()
        self.kwargs.update({"notifier" : self.article_complete_notifier})

        if is_new or (article.file_name == "" and article.verified == 0):
            # check for models that were created but then abandoned. These have missing information, most importantly no associated file
            # this overwrites previously set information, but that should not be too important
            ArticleWatcher(
                article,
                thread,
                **self.kwargs
            )

            # All workers are implemented as a threaded queue. But the individual model requires a specific processing order:
            # fetch -> download -> compress -> complete
            # the watcher orchestrates the procedure and notifies upon completion
            # the watcher will notify once it is sufficiently populated
        else: # manually trigger notification immediately
            logger.info(f"Found existing article {article}. Now sending")
            self.article_complete_notifier(article, thread)



    def manual_processing(self, articles, workers):
        for w in workers:
            w.start()

        for article in articles:
            # in manual mode update_status is overridden, so this notifier is informational only
            notifier = lambda article: print(f"Completed manual actions for {article}")
            ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # the ArticleWatcher wants a thread to link the article to. TODO: handle threads as a kwarg

    def article_complete_notifier(self, article, thread):
        if self.worker_slack is None:
            logger.warning("Not sending slack notifier")
        else:
            self.worker_slack.bot_worker.respond_channel_message(thread)
        if self.worker_mail is None:
            logger.warning("Not sending mail notifier")
        else:
            self.worker_mail.send(article)


if __name__ == "__main__":
    coordinator = Coordinator()


    if os.getenv("UPLOAD", "false") == "true":
        articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
        logger.info(f"Launching upload to archive for {len(articles)} articles.")
        coordinator.manual_processing(articles, [UploadWorker()])

    elif os.getenv("CHECK", "false") == "true":
        from utils_check import runner as check_runner
        check_runner.verify_unchecked()

    else: # launch with full action
        slack_runner = slack_runner.BotRunner(coordinator.incoming_request)
        kwargs = {
            "worker_download" : DownloadWorker(),
            "worker_fetch" : FetchWorker(),
            "worker_upload" : UploadWorker(),
            "worker_compress" : CompressWorker(),
            "worker_slack" : slack_runner,
            "worker_mail" : mail_runner,
        }
        coordinator.add_workers(**kwargs)
        coordinator.start()
        slack_runner.start()
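The workers imported from utils_worker.workers are not part of this commit, but the contract the ArticleWatcher relies on can be read off the code above: each worker exposes start() and process(watcher), and process() eventually sets the matching *_completed property on the watcher, which drives update_status. A hedged sketch of a conforming worker, under those assumptions:

# Sketch only (the real workers live in utils_worker.workers and are
# threaded queues): the minimal interface runner.py expects of a worker.
class DummyFetchWorker:
    def start(self):
        pass # real workers would start their consumer thread here

    def process(self, watcher):
        # ... fetch metadata for watcher.article here ...
        # setting the property chains into watcher.update_status("fetch"),
        # which then calls worker_download.process(watcher)
        watcher.fetch_completed = True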
							
								
								
									
news_fetch/app/utils_check/runner.py (new file, 207 lines)
@@ -0,0 +1,207 @@
from rich.console import Console
from rich.table import Table
from rich.columns import Columns
from rich.rule import Rule
console = Console()
hline = Rule(style="white")

import os
import subprocess
from slack_sdk import WebClient
import configuration
models = configuration.models

u_options = {
    "ENTER" : "Accept PDF as is. It gets marked as verified",
    "D" : "Set language to DE and set verified",
    "E" : "Set language to EN and set verified",
    "O" : "Set other language (prompted)",
    "R" : "Set related files (prompted multiple times)",
    "B" : "Reject and move to folder BAD",
    "L" : "Leave file as is, do not send reaction"
}


bot_client = WebClient(
    token = configuration.parsed["SLACK"]["auth_token"]
)


def file_overview(file_url: str, file_attributes: list, options: dict) -> None:
    """Prints a neat overview of the current article"""
    file_table = Table(
        title = file_url,
        row_styles = ["white", "bright_black"],
        min_width = 100
    )

    file_table.add_column("Attribute", justify = "right", no_wrap = True)
    file_table.add_column("Value set by auto_news")
    file_table.add_column("Status", justify = "right")
    for attr in file_attributes:
        file_table.add_row(attr["name"], attr["value"], attr["status"])

    option_key = "\n".join([f"[[bold]{k}[/bold]]" for k in options.keys()])
    option_action = "\n".join([f"[italic]{k}[/italic]" for k in options.values()])
    columns = Columns([option_key, option_action])

    console.print(file_table)
    console.print("Your options:")
    console.print(columns)


def send_reaction_to_slack_thread(article, reaction):
    """Sends the verification status as a reaction to the associated slack thread. This will significantly decrease load times of the bot"""
    thread = article.slack_thread
    messages = models.Message.select().where(models.Message.text.contains(article.article_url))
    # TODO rewrite this shit
    if len(messages) > 5:
        print("Found more than 5 messages. Aborting reactions...")
        return
    for m in messages:
        if not m.has_single_url:
            print("Found thread but won't send reaction because thread has multiple urls")
            continue
        ts = m.slack_ts
        bot_client.reactions_add(
            channel=configuration.parsed["SLACK"]["archive_id"],
            name=reaction,
            timestamp=ts
        )
        print("Sent reaction to message")

def prompt_language(query):
    not_set = True
    while not_set:
        uin = input("Set language (nation-code, 2 letters) ")
        if len(uin) != 2:
            print("Bad code, try again")
        else:
            not_set = False
            query.language = uin
            query.save()


def prompt_related(query):
    file_list = []
    finished = False
    while not finished:
        uin = input("Additional file for article? Type '1' to cancel ")
        if uin == "1":
            query.set_related(file_list)
            finished = True
        else:
            file_list.append(uin)


def prompt_new_fname(query):
    uin = input("New fname? ")
    old_fname = query.file_name
    query.file_name = uin
    query.verified = 1
    if old_fname != "":
        os.remove(query.save_path + old_fname)
    query.save()



def reject_article(article):
    article.verified = -1
    article.save()
    print("Article marked as bad")
    # also update the threads to not be monitored anymore
    send_reaction_to_slack_thread(article, "x")


def unreject_article(query):
    query.verified = 1
    query.save()
    # os.rename(badpdf, fname)
    print("File set to verified")


def accept_article(article, last_accepted):
    article.verified = 1
    article.save()
    print("Article accepted as GOOD")

    # also update the threads to not be monitored anymore
    send_reaction_to_slack_thread(article, "white_check_mark")

    return "" # linked


def verify_unchecked():
    query = models.ArticleDownload.select().where(models.ArticleDownload.verified == 0).execute()
    last_linked = None

    for article in query:
        console.print(hline)
        core_info = []
        for e, name in zip([article.save_path, article.file_name, article.title, article.language], ["Save path", "File name", "Title", "Language"]):
            entry = {
                "status" : "[red]██[/red]" if (len(e) == 0 or e == -1) else "[green]██[/green]",
                "value" : e if len(e) != 0 else "not set",
                "name" : name
            }
            core_info.append(entry)

        try:
            # close any previously opened windows:
            # subprocess.call(["kill", "`pgrep evince`"])
            os.system("pkill evince")
            # then open a new one
            subprocess.Popen(["evince", f"file://{os.path.join(article.save_path, article.file_name)}"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # suppress evince gtk warnings
        except Exception as e:
            print(e)
            continue


        file_overview(
            file_url = article.article_url,
            file_attributes = core_info,
            options = u_options
        )


        proceed = False
        while not proceed:
            uin = input("Choice? ").lower()
            if uin == "":
                last_linked = accept_article(article, last_linked) # last linked accelerates the whole process
                proceed = True
            elif uin == "d":
                article.language = "de"
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "e":
                article.language = "en"
                article.verified = 1
                article.save()
                proceed = True
            elif uin == "o":
                prompt_language(article)
            elif uin == "r":
                prompt_related(article)
            elif uin == "b":
                reject_article(article)
                proceed = True
            elif uin == "l":
                # do nothing
                proceed = True
            else:
                print("Invalid input")
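Reading reject_article, accept_article and the verified == 0 query together fixes the integer semantics of ArticleDownload.verified used throughout this commit; models.py's slack_info relies on the same encoding when it indexes its status list with verified + 1. The names below are illustrative only, the code stores raw integers:

# Illustrative constants, not defined anywhere in the commit.
VERIFIED_BAD = -1      # rejected via "B" (reaction :x:)
VERIFIED_PENDING = 0   # not yet reviewed; picked up by verify_unchecked()
VERIFIED_GOOD = 1      # accepted via ENTER/"D"/"E" (reaction :white_check_mark:)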
							
								
								
									
news_fetch/app/utils_mail/runner.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
import os
import logging
import configuration

logger = logging.getLogger(__name__)
config = configuration.parsed["MAIL"]

def send(article_model):
    mail = MIMEMultipart()
    mail['Subject'] = "{} -- {}".format(article_model.source_name, article_model.title)
    mail['From'] = config["sender"]
    mail['To'] = config["recipient"]

    msgs = article_model.mail_info # this is html
    msg = [m["reply_text"] for m in msgs]
    msg = "\n".join(msg)

    content = MIMEText(msg, "html")
    mail.attach(content)

    files = [m["file_path"] for m in msgs if m["file_path"]]
    for path in files:
        with open(path, 'rb') as file:
            part = MIMEApplication(file.read(), "pdf")
        # encoders.encode_base64(part)
        part.add_header('Content-Disposition', 'attachment', filename=os.path.basename(path))
        mail.attach(part)

    try:
        smtp = smtplib.SMTP(config["smtp_server"], config["port"])
        smtp.starttls()
        smtp.login(config["uname"], config["password"])
        smtp.sendmail(config["sender"], config["recipient"], mail.as_string())
        smtp.quit()
        logger.info("Mail successfully sent.")
    except Exception as e:
        logger.error("Could not send mail for article {}".format(article_model))
        logger.info(e)
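send() only touches three attributes of its argument, so any object shaped like the sketch below would work. The stub mirrors the fields of utils_storage.models.ArticleDownload; mail_info is the list-of-dicts structure models.py builds (reply_text as HTML, file_path or None):

# Hedged sketch of the interface send() relies on; not part of the commit.
class _ArticleStub:
    source_name = "example.com"
    title = "Some headline"
    mail_info = [{"reply_text": "<p>summary</p>", "file_path": None}]

# send(_ArticleStub())  # would attempt real SMTP delivery using the [MAIL] config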
							
								
								
									
news_fetch/app/utils_slack/message_helpers.py (new file, 277 lines)
@@ -0,0 +1,277 @@
import logging
import configuration
import requests
import os
import time
from threading import Thread
from slack_sdk.errors import SlackApiError

logger = logging.getLogger(__name__)
config = configuration.parsed["SLACK"]
models = configuration.models
slack_client = "dummy" # placeholder, replaced in init()
LATEST_RECORDED_REACTION = 0


def init(client) -> None:
    global slack_client
    slack_client = client

    global LATEST_RECORDED_REACTION
    try:
        LATEST_RECORDED_REACTION = models.Reaction.select(models.Reaction.id).order_by("id")[-1]
    except IndexError: # query is actually empty, we have never fetched any messages until now
        LATEST_RECORDED_REACTION = 0

    # fetch all the messages we could have possibly missed
    logger.info("Querying missed messages, threads and reactions. This can take some time.")
    fetch_missed_channel_messages() # not threaded
    t = Thread(target = fetch_missed_channel_reactions) # threaded, runs in background (usually takes a long time)
    t.start()

    if os.getenv("REDUCEDFETCH", "false") == "true":
        logger.warning("Only fetching empty threads for bot messages because 'REDUCEDFETCH=true'")
        fetch_missed_thread_messages(reduced=True)
    else: # perform both asynchronously
        fetch_missed_thread_messages()

def get_unhandled_messages():
    """Gets all messages that have not yet been handled, be it by mistake or by downtime
    As the message handler makes no distinction between channel messages and thread messages,
    we don't have to worry about them here.
    """

    threaded_objects = []
    for t in models.Thread.select():
        if t.message_count > 1: # if only one message was written, it is the channel message
            msg = t.last_message
            if msg.is_by_human:
                threaded_objects.append(msg)
            # else don't, nothing to process
    logger.info(f"Set {len(threaded_objects)} thread-messages as not yet handled.")


    channel_objects = [t.initiator_message for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
    logger.info(f"Set {len(channel_objects)} channel-messages as not yet handled.")

    reaction_objects = list(models.Reaction.select().where(models.Reaction.id > LATEST_RECORDED_REACTION))
    logger.info(f"Set {len(reaction_objects)} reactions as not yet handled.")
    # the ones newer than the last before the fetch

    all_messages = channel_objects + threaded_objects
    return all_messages, reaction_objects

def fetch_missed_channel_messages():
    # latest processed message_ts is:
    presaved = models.Message.select().order_by(models.Message.ts)
    if not presaved:
        last_ts = 0
    else:
        last_message = presaved[-1]
        last_ts = last_message.slack_ts

    result = slack_client.conversations_history(
        channel=config["archive_id"],
        oldest=last_ts
    )

    new_messages = result.get("messages", [])
    # # filter the last one, it is a duplicate! (only if the db is not empty!)
    # if last_ts != 0 and len(new_messages) != 0:
    #     new_messages.pop(-1)

    new_fetches = 0
    for m in new_messages:
        # print(m)
        message_dict_to_model(m)
        new_fetches += 1

    refetch = result.get("has_more", False)
    while refetch: # we have not actually fetched them all
        try:
            result = slack_client.conversations_history(
                channel = config["archive_id"],
                cursor = result["response_metadata"]["next_cursor"],
                oldest = last_ts
            ) # fetches 100 messages, older than the [-1](=oldest) element of new_fetches
            refetch = result.get("has_more", False)

            new_messages = result.get("messages", [])
            for m in new_messages:
                message_dict_to_model(m)
                new_fetches += 1
        except SlackApiError: # Most likely a rate-limit
            logger.error("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(config["api_wait_time"]))
            time.sleep(int(config["api_wait_time"])) # config values are strings
            refetch = True

    logger.info(f"Fetched {new_fetches} new channel messages.")


def fetch_missed_thread_messages(reduced=False):
    """After having gotten all base-threads, we need to fetch all their replies"""
    # I don't know of a better way: we need to fetch this for each and every thread (except if it is marked as permanently solved)
    logger.info("Starting fetch of thread messages...")
    if reduced:
        threads = [t for t in models.Thread.select() if (t.message_count == 1 and not t.is_fully_processed)]
        # this only fetches completely empty threads, which might be because the bot-message was not yet saved to the db.
        # once we got all the bot-messages the remaining empty threads will be the ones we need to process.
    else:
        threads = [t for t in models.Thread.select() if not t.is_fully_processed]
    logger.info(f"Fetching history for {len(threads)} empty threads")
    new_messages = []
    for i,t in enumerate(threads):
        try:
            messages = slack_client.conversations_replies(
                channel = config["archive_id"],
                ts = t.slack_ts,
                oldest = t.messages[-1].slack_ts
            )["messages"]
        except SlackApiError:
            logger.error("Hit rate limit while querying threaded messages, retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
            time.sleep(int(config["api_wait_time"]))
            messages = slack_client.conversations_replies(
                channel = config["archive_id"],
                ts = t.slack_ts,
                oldest = t.messages[-1].slack_ts
            )["messages"]

        messages.pop(0) # the first message is the one posted in the channel. We already processed it!

        for m in messages:
            # only append *new* messages
            res = message_dict_to_model(m)
            if res:
                new_messages.append(res)
    logger.info("Fetched {} new threaded messages.".format(len(new_messages)))

def fetch_missed_channel_reactions():
    logger.info("Starting background fetch of channel reactions...")
    threads = [t for t in models.Thread.select() if not t.is_fully_processed]
    for i,t in enumerate(threads):
        try:
            query = slack_client.reactions_get(
                channel = config["archive_id"],
                timestamp = t.slack_ts
            )
        except SlackApiError: # probably a rate_limit:
            logger.error("Hit rate limit while querying reactions. retrying in {}s ({}/{} queries elapsed)".format(config["api_wait_time"], i, len(threads)))
            time.sleep(int(config["api_wait_time"]))
            query = slack_client.reactions_get( # retry once after waiting
                channel = config["archive_id"],
                timestamp = t.slack_ts
            )
        reactions = query.get("message", {}).get("reactions", []) # default = []

        for r in reactions:
            reaction_dict_to_model(r, t)


# Helpers for message conversion to db-objects
def reaction_dict_to_model(reaction, thread=None):
    if thread is None:
        m_ts = reaction["item"]["ts"]
        message = models.Message.get(ts = float(m_ts))
        thread = message.thread
    if "name" in reaction.keys(): # fetched through manual api query
        content = reaction["name"]
    elif "reaction" in reaction.keys(): # fetched through events
        content = reaction["reaction"]
    else:
        logger.error(f"Weird reaction received: {reaction}")
        return None

    r, _ = models.Reaction.get_or_create(
        type = content,
        message = thread.initiator_message
    )
    logger.info("Saved reaction [{}]".format(content))
    return r


def message_dict_to_model(message):
    if message["type"] == "message":
        thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
        uid = message.get("user", "BAD USER")
        if uid == "BAD USER":
            logger.critical("Message has no user?? {}".format(message))
            return None

        user, _ = models.User.get_or_create(user_id = uid)
        thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
        m, new = models.Message.get_or_create(
            user = user,
            thread = thread,
            ts = message["ts"],
            channel_id = config["archive_id"],
            text = message["text"]
        )
        logger.info(f"Saved: {m} ({'new' if new else 'old'})")

        files = message.get("files", [])
        if len(files) >= 1:
            f = files[0] # we only handle the first file
            m.file_type = f["filetype"]
            m.perma_link = f["url_private_download"]
            m.save()
            logger.info(f"Saved {m.file_type}-file for message (id={m.id})")
        if new:
            return m
        else:
            return None
    else:
        logger.warning("What should I do with {}".format(message))
        return None

def say_substitute(*args, **kwargs):
    logger.info("Now sending message through say-substitute: {}".format(" - ".join(args)))
    slack_client.chat_postMessage(
        channel=config["archive_id"],
        text=" - ".join(args),
        **kwargs
    )


def save_as_related_file(url, article_object):
    r = requests.get(url, headers={"Authorization": "Bearer {}".format(slack_client.token)})
    saveto = article_object.save_path
    ftype = url[url.rfind(".") + 1:]
    fname = "{} - related no {}.{}".format(
        article_object.file_name.replace(".pdf",""),
        len(article_object.related) + 1,
        ftype
    )
    with open(os.path.join(saveto, fname), "wb") as f:
        f.write(r.content)
    article_object.set_related([fname])
    logger.info("Added {} to model {}".format(fname, article_object))
    return fname


def react_file_path_message(fname, article_object):
    saveto = article_object.save_path
    file_path = os.path.join(saveto, fname)
    if os.path.exists(file_path):
        article_object.set_related([fname])
        logger.info("Added {} to model {}".format(fname, article_object))
        return True
    else:
        return False


def is_message_in_archiving(message) -> bool:
    if isinstance(message, dict):
        return message["channel"] == config["archive_id"]
    else:
        return message.channel_id == config["archive_id"]


def is_reaction_in_archiving(event) -> bool:
    if isinstance(event, dict):
        return event["item"]["channel"] == config["archive_id"]
    else:
        return event.message.channel_id == config["archive_id"]
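The two fetch functions above share the same Slack pagination idiom: call the endpoint, consume the page, then keep re-calling with response_metadata.next_cursor while has_more is set. A generic restatement of that loop (a sketch, not part of the commit; error handling and the oldest parameter omitted):

# Generic cursor pagination over a slack_sdk list endpoint.
def paginate(call, **kwargs):
    result = call(**kwargs)
    yield from result.get("messages", [])
    while result.get("has_more", False):
        result = call(cursor=result["response_metadata"]["next_cursor"], **kwargs)
        yield from result.get("messages", [])

# e.g. for m in paginate(slack_client.conversations_history, channel=config["archive_id"]): ...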
							
								
								
									
news_fetch/app/utils_slack/runner.py (new file, 184 lines)
@@ -0,0 +1,184 @@
from slack_bolt import App
from slack_bolt.adapter.socket_mode import SocketModeHandler

import logging
import configuration

from . import message_helpers


config = configuration.parsed["SLACK"]
models = configuration.models

class BotApp(App):
    logger = logging.getLogger(__name__)

    def __init__(self, callback, *args, **kwargs):

        super().__init__(*args, **kwargs)
        self.callback = callback

    def start(self):
        message_helpers.init(self.client)
        missed_messages, missed_reactions = message_helpers.get_unhandled_messages()

        [self.handle_incoming_message(m) for m in missed_messages]
        [self.handle_incoming_reaction(r) for r in missed_reactions]

        # self.react_missed_reactions(missed_reactions)
        # self.react_missed_messages(missed_messages)
        self.startup_status()



    def handle_incoming_reaction(self, reaction):
        if isinstance(reaction, dict): # else: the reaction is already being passed as a model
            # CAUTION: filter for 'changed reactions'; those are nasty (usually when adding an url)
            reaction = message_helpers.reaction_dict_to_model(reaction)

        thread = reaction.message.thread
        article_object = thread.article
        if article_object is not None:
            reaction = reaction.type
            status = 1 if reaction == "white_check_mark" else -1

            # self.logger.info(f"Applying reaction {reaction} to its root message.")
            article_object.verified = status
            article_object.save()


    def handle_incoming_message(self, message):
        """Reacts to all messages inside channel archiving. Must then
        distinguish between threaded replies and new requests
        and react accordingly"""
        if isinstance(message, dict): # else: the message is already being passed as a model
            # CAUTION: filter for 'changed messages'; those are nasty (usually when adding an url)
            if message.get("subtype", "not bad") == "message_changed":
                return False
            message = message_helpers.message_dict_to_model(message)

        # First check: belongs to thread?
        is_threaded = message.thread.message_count > 1 and message != message.thread.initiator_message
        if is_threaded:
            self.incoming_thread_message(message)
        else:
            self.incoming_channel_message(message)

    def incoming_thread_message(self, message):
        if message.user.user_id == config["bot_id"]:
            return True # ignore the files uploaded by the bot. We handled them already!

        thread = message.thread
        if thread.is_fully_processed:
            return True

        self.logger.info("Receiving thread-message")
        self.respond_thread_message(message)


    def incoming_channel_message(self, message):
        self.logger.info(f"Handling message {message} ({len(message.urls)} urls)")

        if not message.urls: # no urls in a root-message => IGNORE
            message.is_processed_override = True
            message.save()
            return

        # ensure thread is still empty, this is a scenario encountered only in testing, but let's just filter it
        if message.thread.message_count > 1:
            self.logger.info("Discarded message because it is actually processed.")
            return

        if len(message.urls) > 1:
            message_helpers.say_substitute("Only the first url is being handled. Please send any subsequent url as a separate message", thread_ts=message.thread.slack_ts)

        self.callback(message)
        # for url in message.urls:
            # self.callback(url, message)
            # stop here!



    def respond_thread_message(self, message, say=message_helpers.say_substitute):
        thread = message.thread
        article = thread.article
        if message.perma_link: # file upload means new data
            fname = message_helpers.save_as_related_file(message.perma_link, article)
            say("File was saved as 'related file' under `{}`.".format(fname),
                thread_ts=thread.slack_ts
            )
        else: # either a pointer to a new file (too large to upload), or trash
            success = message_helpers.react_file_path_message(message.text, article)
            if success:
                say("File was saved as 'related file'", thread_ts=thread.slack_ts)
            else:
                self.logger.error("User replied to thread {} but the response did not contain a file/path".format(thread))
                say("Cannot process response without associated file.",
                    thread_ts=thread.slack_ts
                )

    def respond_channel_message(self, thread, say=message_helpers.say_substitute):
        article = thread.article
        answers = article.slack_info
        for a in answers:
            if a["file_path"]:
                try: # either a["file_path"] does not exist, or the upload resulted in an error
                    self.client.files_upload(
                        channels = config["archive_id"],
                        initial_comment = f"<@{config['responsible_id']}> \n {a['reply_text']}",
                        file = a["file_path"],
                        thread_ts = thread.slack_ts
                    )
                    status = True
                except Exception:
                    say(
                        "File {} could not be uploaded.".format(a),
                        thread_ts=thread.slack_ts
                    )
                    status = False
            else: # anticipated that there is no file!
                say(
                    f"<@{config['responsible_id']}> \n {a['reply_text']}",
                    thread_ts=thread.slack_ts
                )
                status = True


    def startup_status(self):
        threads = [t for t in models.Thread.select()]
        all_threads = len(threads)
        fully_processed = len([t for t in threads if t.is_fully_processed])
        fully_unprocessed = len([t for t in threads if t.message_count == 1])
        articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})

|  | ||||
class BotRunner():
    """Stupid encapsulation so that we can apply the slack decorators to the BotApp"""
    def __init__(self, callback, *args, **kwargs) -> None:
        self.bot_worker = BotApp(callback, token=config["auth_token"])

        @self.bot_worker.event(event="message", matchers=[message_helpers.is_message_in_archiving])
        def handle_incoming_message(message, say):
            return self.bot_worker.handle_incoming_message(message)

        @self.bot_worker.event(event="reaction_added", matchers=[message_helpers.is_reaction_in_archiving])
        def handle_incoming_reaction(event, say):
            return self.bot_worker.handle_incoming_reaction(event)

        # target = self.launch
        # super().__init__(target=target)


    def start(self):
        self.bot_worker.start()
        SocketModeHandler(self.bot_worker, config["app_token"]).start()


    # def respond_to_message(self, message):
    #     self.bot_worker.handle_incoming_message(message)
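How this plugs together (compare app/runner.py above): the Coordinator hands its incoming_request in as the callback, BotApp.start() first replays everything missed while offline, and SocketModeHandler then blocks on the live event stream. A hedged usage sketch:

# Sketch of the wiring done in runner.py's __main__ block.
# coordinator = Coordinator()
# bot = BotRunner(callback=coordinator.incoming_request)
# coordinator.add_workers(worker_slack=bot, ...) # plus the other workers
# coordinator.start() # background thread: starts the workers
# bot.start()         # blocks: replay missed events, then socket mode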
							
								
								
									
news_fetch/app/utils_storage/migrations/migration.001.py (new file, 67 lines)
@@ -0,0 +1,67 @@
from playhouse.migrate import *


"""
This migration assumes that downloads.db kept the exact same structure as before.
messages.db should drop the table articlemodelreference in favor of a new field article in the thread table.
Since each thread is constrained to exactly one article, this makes the most sense.

This migration assumes that messages.db gets a new field in the table thread:
id  |   thread_ts | article_id

We now need to migrate from the table articlemodelreference and then delete it.
"""


db = SqliteDatabase("/code/.dev/messages.db")
migrator = SqliteMigrator(db)


article_field = IntegerField(null=True)


migrate(
    migrator.add_column('thread', 'article_id', article_field),
    # migrator.drop_column('some_table', 'old_column'),
)

|  | ||||
# these are the old models, adapted to the migration

class BaseModel(Model):
    class Meta:
        database = db

class User(BaseModel):
    user_id = CharField(default='', unique=True)

class Thread(BaseModel):
    """The threads that concern us are only created for messages that contain urls"""
    thread_ts = FloatField(default = 0)
    article_id = IntegerField(null=True)


class Message(BaseModel):
    ts = FloatField(unique=True) # for sorting
    channel_id = CharField(default='')
    user = ForeignKeyField(User, backref="messages")
    text = TextField(default='')
    thread = ForeignKeyField(Thread, backref="messages", default=None)
    file_type = CharField(default='')
    perma_link = CharField(default='')
    is_processed_override = BooleanField(default=False)


class ArticleModelReference(BaseModel):
    message = ForeignKeyField(Message, backref='article_model_references')
    article_model_id = IntegerField(default = 0)




for ref in ArticleModelReference.select():
    ref.message.thread.article_id = ref.article_model_id
    ref.message.thread.save()

db.drop_tables((ArticleModelReference,)) # one-element tuple; drop_tables expects an iterable of models
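A quick sanity check after running the migration, using the models defined above (a sketch, not part of the commit):

# Spot-check that article ids were carried over to the thread table.
for t in Thread.select().where(Thread.article_id.is_null(False)).limit(5):
    print(t.id, t.thread_ts, t.article_id)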
							
								
								
									
news_fetch/app/utils_storage/models.py (new file, 322 lines)
@@ -0,0 +1,322 @@
import logging
logger = logging.getLogger(__name__)

from peewee import *
import os
import markdown
import re
import configuration
import datetime

config = configuration.parsed["DOWNLOADS"]
slack_config = configuration.parsed["SLACK"]

## Helpers
chat_db = DatabaseProxy()
download_db = DatabaseProxy()

# set the nature of the db at runtime

class DownloadBaseModel(Model):
    class Meta:
        database = download_db

class ChatBaseModel(Model):
    class Meta:
        database = chat_db



## == Article related models == ##
class ArticleDownload(DownloadBaseModel):
    title = CharField(default='')
    pub_date = DateField(default = '')
    download_date = DateField(default = datetime.date.today)
    source_name = CharField(default = '')
    article_url = TextField(default = '', unique=True)
    archive_url = TextField(default = '')
    file_name = TextField(default = '')
    language = CharField(default = '')
    summary = TextField(default = '')
    comment = TextField(default = '')
    verified = IntegerField(default = 0) # -1 = rejected, 0 = pending, 1 = verified
    # authors
    # keywords
    # ... are added through foreignkeys

    def __str__(self) -> str:
        return f"ART [{self.title} -- {self.source_name}]"

|  | ||||
    ## Useful Properties
    @property
    def save_path(self):
        return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"

    def fname_nas(self, file_name=""):
        if self.download_date:
            if file_name:
                return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), file_name)
            else: # fall back to self.file_name
                return "NAS: {}/{}/{}/{}".format(config["remote_storage_path"], self.download_date.year, self.download_date.strftime("%B"), self.file_name)
        else:
            return None

    @property
    def fname_template(self):
        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
            fname = "{} -- {}".format(self.source_name, self.title)
        else:
            fname = "{} -- {}.pdf".format(self.source_name, self.title)
        return clear_path_name(fname)

    @property
    def is_title_bad(self): # add incrementally
        return "PUR-Abo" in self.title \
            or "Redirecting" in self.title \
            or "Error while running fetch" in self.title

    @property
    def slack_info(self):
        status = [":x: No better version available", ":gear: Verification pending", ":white_check_mark: Verified by human"][self.verified + 1]
        content = "\n>" + "\n>".join(self.summary.split("\n"))
        file_status, msg = self.file_status()
        if not file_status:
            return [msg]

        # everything alright: generate real content
        # first the base file
        if self.file_name[-4:] == ".pdf":
            answer = [{ # main reply with the base pdf
                "reply_text" : f"*{self.title}*\n{status}\n{content}",
                "file_path" : self.save_path + self.file_name
            }]
        else: # don't upload if the file is too big!
            location = "Not uploaded to slack, but the file will be on the NAS:\n`{}`".format(self.fname_nas())
            answer = [{ # main reply with the base pdf
                "reply_text" : "*{}*\n{}\n{}\n{}".format(self.title, status, content, location),
                "file_path" : None
            }]

        # then the related files
        rel_text = ""
        for r in self.related:
            fname = r.related_file_name
            lentry = "\n• `{}` ".format(self.fname_nas(fname))
            if fname[-4:] == ".pdf": # this is a manageable file, directly upload
                f_ret = self.save_path + fname
                answer.append({"reply_text":"", "file_path" : f_ret})
            else: # not pdf <=> too large. Don't upload but mention its existence
                lentry += "(not uploaded to slack, but the file will be on the NAS)"

            rel_text += lentry

        if rel_text:
            answer[0]["reply_text"] += "\nRelated files:\n" + rel_text

        return answer

    @property
    def mail_info(self):
        base = [{"reply_text": "[{}]({})\n".format(self.article_url, self.article_url), "file_path":None}] + self.slack_info
        return [{"reply_text": markdown.markdown(m["reply_text"]), "file_path": m["file_path"]} for m in base]

|  | ||||
|     ## Helpers | ||||
|     def set_keywords(self, keywords): | ||||
|         for k in keywords: | ||||
|             ArticleKeyword.create( | ||||
|                 article = self, | ||||
|                 keyword = k | ||||
|                 ) | ||||
|  | ||||
|     def set_authors(self, authors): | ||||
|         for a in authors: | ||||
|             ArticleAuthor.create( | ||||
|                 article = self, | ||||
|                 author = a | ||||
|                 ) | ||||
|  | ||||
|     def set_references(self, references): | ||||
|         for r in references: | ||||
|             ArticleReference.create( | ||||
|                 article = self, | ||||
|                 reference_url = r | ||||
|             ) | ||||
|  | ||||
|     def set_related(self, related): | ||||
|         for r in related: | ||||
|             ArticleRelated.create( | ||||
|                 article = self, | ||||
|                 related_file_name = r | ||||
|             ) | ||||
|  | ||||
|     def file_status(self): | ||||
|         if not self.file_name: | ||||
|             logger.error("Article {} has no filename!".format(self)) | ||||
|             return False, {"reply_text": "Download failed, no file was saved.", "file_path": None} | ||||
|          | ||||
|         file_path_abs = self.save_path + self.file_name | ||||
|         if not os.path.exists(file_path_abs): | ||||
|             logger.error("Article {} has a filename, but the file does not exist at that location!".format(self)) | ||||
|             return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None} | ||||
|  | ||||
|         return True, {} | ||||
|  | ||||
|  | ||||
| class ArticleKeyword(DownloadBaseModel): | ||||
|     # one instance is created per keyword, so the keyword list can grow freely | ||||
|     article = ForeignKeyField(ArticleDownload, backref='keywords') | ||||
|     keyword = CharField() | ||||
|  | ||||
|  | ||||
| class ArticleAuthor(DownloadBaseModel): | ||||
|     article = ForeignKeyField(ArticleDownload, backref='authors') | ||||
|     author = CharField() | ||||
|  | ||||
|  | ||||
| class ArticleReference(DownloadBaseModel): | ||||
|     article = ForeignKeyField(ArticleDownload, backref='references') | ||||
|     reference_url = TextField(default = '') | ||||
|  | ||||
|  | ||||
| class ArticleRelated(DownloadBaseModel): | ||||
|     article = ForeignKeyField(ArticleDownload, backref='related') | ||||
|     related_file_name = TextField(default = '') | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| ## == Slack-thread related models == ## | ||||
| class User(ChatBaseModel): | ||||
|     user_id = CharField(default='', unique=True) | ||||
|     # messages | ||||
|  | ||||
|  | ||||
| class Thread(ChatBaseModel): | ||||
|     """The threads that concern us are only created if the base massage contains a url""" | ||||
|     thread_ts = FloatField(default = 0) | ||||
|     article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None) | ||||
|     # provides, ts, user, models | ||||
|     # messages | ||||
|  | ||||
|     @property | ||||
|     def slack_ts(self): | ||||
|         str_ts = str(self.thread_ts) | ||||
|         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # slack timestamps carry 6 decimals; pad with zeros if the float repr has fewer | ||||
|         return "{}{}".format(str_ts, cut_zeros * "0") | ||||
|  | ||||
|     @property | ||||
|     def initiator_message(self): | ||||
|         try: | ||||
|             return self.messages[0] # TODO check if this needs sorting | ||||
|         except IndexError: | ||||
|             logger.warning(f"Thread {self} is empty. How can that be?") | ||||
|             return None | ||||
|  | ||||
|     @property | ||||
|     def message_count(self): | ||||
|         # logger.warning("message_count was called") | ||||
|         return self.messages.count() | ||||
|  | ||||
|     @property | ||||
|     def last_message(self): | ||||
|         messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation | ||||
|         return messages[-1] | ||||
|  | ||||
|     @property | ||||
|     def is_fully_processed(self) -> bool: | ||||
|         init_message = self.initiator_message | ||||
|         if init_message is None: | ||||
|             return False | ||||
|          | ||||
|         if init_message.is_processed_override: | ||||
|             return True | ||||
|         # the override is set, for instance, when no url was sent at all; such threads are marked processed and ignored | ||||
|          | ||||
|         reactions = init_message.reaction | ||||
|         if not reactions: | ||||
|             return False | ||||
|         else: | ||||
|             r = reactions[0].type # can and should only have one reaction | ||||
|             return r == "white_check_mark" \ | ||||
|                 or r == "x" | ||||
|  | ||||
|  | ||||
|      | ||||
| class Message(ChatBaseModel): | ||||
|     ts = FloatField(unique=True) #for sorting | ||||
|     channel_id = CharField(default='') | ||||
|     user = ForeignKeyField(User, backref="messages") | ||||
|     text = TextField(default='') | ||||
|     thread = ForeignKeyField(Thread, backref="messages", default=None) | ||||
|     file_type = CharField(default='') | ||||
|     perma_link = CharField(default='') | ||||
|     is_processed_override = BooleanField(default=False) | ||||
|     # reaction | ||||
|  | ||||
|     def __str__(self) -> str: | ||||
|         return "MSG [{}]".format(self.text[:min(len(self.text), 30)].replace('\n','/') + '...') | ||||
|  | ||||
|     @property | ||||
|     def slack_ts(self): | ||||
|         str_ts = str(self.ts) | ||||
|         cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # slack timestamps carry 6 decimals; pad with zeros if the float repr has fewer | ||||
|         return "{}{}".format(str_ts, cut_zeros * "0") | ||||
|  | ||||
|  | ||||
|     @property | ||||
|     def urls(self): | ||||
|         pattern = r"<(.*?)>" | ||||
|         matches = re.findall(pattern, self.text) | ||||
|         matches = [m for m in matches if "." in m] # must contain a tld | ||||
|  | ||||
|         new_matches = [] | ||||
|         for m in matches: | ||||
|             # further complication: slack automatically abbreviates urls in the format | ||||
|             # <url|link preview>. Luckily "|" is discouraged in urls, so we can safely split on it and keep the first half | ||||
|             new_matches.append(m.split("|")[0] if "|" in m else m) | ||||
|         return new_matches | ||||
|      | ||||
|     @property | ||||
|     def is_by_human(self): | ||||
|         return self.user.user_id != slack_config["bot_id"] | ||||
|  | ||||
|      | ||||
|     @property | ||||
|     def has_single_url(self): | ||||
|         return len(self.urls) == 1 | ||||
|  | ||||
|  | ||||
| class Reaction(ChatBaseModel): | ||||
|     type = CharField(default = "") | ||||
|     message = ForeignKeyField(Message, backref="reaction") | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| def create_tables(): | ||||
|     with download_db: | ||||
|         download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated]) | ||||
|     with chat_db: | ||||
|         chat_db.create_tables([User, Message, Thread, Reaction]) | ||||
|  | ||||
|  | ||||
| def set_db(chat_db_object, download_db_object): | ||||
|     chat_db.initialize(chat_db_object) | ||||
|     download_db.initialize(download_db_object) | ||||
|     create_tables() | ||||
|  | ||||
| def clear_path_name(path): | ||||
|     keepcharacters = (' ','.','_', '-') | ||||
|     converted = "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip() | ||||
|     return converted | ||||
|      | ||||
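clear_path_name is what keeps arbitrary titles filesystem-safe: every character outside the alphanumerics and the keep-list becomes an underscore. A minimal standalone sketch of the behavior (the sample name is made up):

def clear_path_name(path):
    keepcharacters = (' ', '.', '_', '-')
    return "".join([c if (c.isalnum() or c in keepcharacters) else "_" for c in path]).rstrip()

# ":", "/" and "*" are replaced, the keep-list characters survive
assert clear_path_name("source: a/b*c.pdf") == "source_ a_b_c.pdf"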
							
								
								
									
0  news_fetch/app/utils_worker/_init__.py  Normal file
47  news_fetch/app/utils_worker/compress/runner.py  Normal file
							| @@ -0,0 +1,47 @@ | ||||
| import os | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
|  | ||||
| import logging | ||||
| logger = logging.getLogger(__name__) | ||||
| import configuration | ||||
| config = configuration.parsed["DOWNLOADS"] | ||||
|  | ||||
| shrink_sizes = [] | ||||
|  | ||||
| def shrink_pdf(article): | ||||
|     article_loc = Path(article.save_path) / article.file_name | ||||
|     initial_size = article_loc.stat().st_size | ||||
|     compressed_tmp = Path(config['default_download_path']) / "compressed.pdf" | ||||
|  | ||||
|     if article_loc.suffix != ".pdf": # Path.suffix includes the leading dot | ||||
|         return article # it probably was a youtube video | ||||
|          | ||||
|     c = subprocess.run( | ||||
|         [ | ||||
|             "gs", | ||||
|             "-sDEVICE=pdfwrite", | ||||
|             "-dPDFSETTINGS=/screen", | ||||
|             "-dNOPAUSE", | ||||
|             "-dBATCH", | ||||
|             f"-sOutputFile={compressed_tmp}",  | ||||
|             f"{article_loc}" | ||||
|         ], | ||||
|         stdout=subprocess.PIPE, stderr=subprocess.PIPE | ||||
|     ) | ||||
|  | ||||
|     if c.returncode == 0: | ||||
|         try: | ||||
|             os.replace(compressed_tmp, article_loc) | ||||
|         except OSError as e: | ||||
|             logger.error(f"Compression ran but I could not copy back the file {e}") | ||||
|  | ||||
|         final_size = article_loc.stat().st_size | ||||
|         shrink_sizes.append(initial_size - final_size) | ||||
|         logger.info(f"Compression worked. Avg shrinkage: {int(sum(shrink_sizes)/len(shrink_sizes) / 1000)} KB") | ||||
|  | ||||
|  | ||||
|     else: | ||||
|         logger.error(f"Could not run the compression! {c.stderr.decode()} - {c.stdout.decode()}") | ||||
|      | ||||
|     return article | ||||
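For reference, the subprocess call above maps onto a plain Ghostscript invocation; /screen is the most aggressive of the predefined quality presets (/screen, /ebook, /printer, /prepress). A standalone sketch, assuming gs is on PATH and input.pdf exists (both placeholders):

import subprocess
from pathlib import Path

def compress(src: Path, dst: Path) -> bool:
    # same flags as shrink_pdf above, applied to a single file
    c = subprocess.run(
        ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen",
         "-dNOPAUSE", "-dBATCH", f"-sOutputFile={dst}", str(src)],
        capture_output=True,
    )
    return c.returncode == 0

if compress(Path("input.pdf"), Path("compressed.pdf")):
    print("shrunk to", Path("compressed.pdf").stat().st_size, "bytes")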
							
								
								
									
0  news_fetch/app/utils_worker/download/__init__.py  Normal file
172  news_fetch/app/utils_worker/download/browser.py  Normal file
							| @@ -0,0 +1,172 @@ | ||||
| import time | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| import base64 | ||||
| import requests | ||||
| from selenium import webdriver | ||||
| from selenium.webdriver.common.by import By | ||||
| import configuration | ||||
| import json | ||||
|  | ||||
| config = configuration.parsed["DOWNLOADS"] | ||||
| blacklisted = json.loads(config["blacklisted_href_domains"]) | ||||
|  | ||||
|  | ||||
| class PDFDownloader: | ||||
|     """Saves a given url. Fills the object it got as a parameter""" | ||||
|     logger = logging.getLogger(__name__) | ||||
|     # status-variable for restarting: | ||||
|     running = False | ||||
|      | ||||
|     def start(self): | ||||
|         self.finish() # clean up any previous driver instance | ||||
|              | ||||
|         options = webdriver.FirefoxOptions() | ||||
|         options.profile = config["browser_profile_path"] | ||||
|         # should be options.set_preference("profile", config["browser_profile_path"]) as of selenium 4 but that doesn't work | ||||
|  | ||||
|         if os.getenv("HEADLESS", "false") == "true": | ||||
|             options.add_argument('--headless') | ||||
|         else: | ||||
|             self.logger.warning("Opening browser GUI because of 'HEADLESS=false'") | ||||
|  | ||||
|         options.set_preference('print.save_as_pdf.links.enabled', True) | ||||
|         # simply saving when the filetype is already pdf does not work, hence the special handling in download() | ||||
|  | ||||
|         options.set_preference("print.printer_Mozilla_Save_to_PDF.print_to_file", True) | ||||
|         options.set_preference("browser.download.folderList", 2) | ||||
|         # options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf") | ||||
|         # options.set_preference("pdfjs.disabled", True) | ||||
|         options.set_preference("browser.download.dir", config["default_download_path"]) | ||||
|  | ||||
|         self.logger.info("Starting gecko driver") | ||||
|         # self.driver = webdriver.Firefox( | ||||
|         #     options = options, | ||||
|         #     service = webdriver.firefox.service.Service( | ||||
|         #         log_path = f'{config["local_storage_path"]}/geckodriver.log' | ||||
|         # )) | ||||
|         self.driver = webdriver.Remote( | ||||
|             command_executor = 'http://geckodriver:4444', | ||||
|             options = options, | ||||
|             # can't set log path... | ||||
|         ) | ||||
|          | ||||
|         residues = os.listdir(config["default_download_path"]) | ||||
|         for res in residues: | ||||
|             os.remove(os.path.join(config["default_download_path"], res)) | ||||
|  | ||||
|         self.running = True | ||||
|  | ||||
|     def autostart(self): | ||||
|         if not self.running: | ||||
|             self.start()  # relaunch the dl util | ||||
|  | ||||
|     def finish(self): | ||||
|         if self.running: | ||||
|             self.logger.info("Exiting gecko driver") | ||||
|             try: | ||||
|                 self.driver.quit() | ||||
|                 time.sleep(10) | ||||
|             except Exception: | ||||
|                 self.logger.critical("Connection to the driver broke off") | ||||
|             self.running = False | ||||
|         else: | ||||
|             self.logger.info("Gecko driver not yet running") | ||||
|  | ||||
|     def download(self, article_object): | ||||
|         sleep_time = 2 | ||||
|         self.autostart() | ||||
|         url = article_object.article_url | ||||
|  | ||||
|         try: | ||||
|             self.driver.get(url) | ||||
|         except Exception as e: | ||||
|             self.logger.critical("Selenium .get(url) failed with error {}".format(e)) | ||||
|             self.finish() | ||||
|             return article_object  # without changes | ||||
|          | ||||
|         time.sleep(sleep_time) | ||||
|         # leave the page time to do any funky business | ||||
|  | ||||
|         # in the meantime, get a page title if required | ||||
|         if article_object.is_title_bad: | ||||
|             article_object.title = self.driver.title.replace(".pdf", "") | ||||
|             # will be propagated to the saved file (dst) as well | ||||
|  | ||||
|         fname = article_object.fname_template | ||||
|         dst = os.path.join(article_object.save_path, fname) | ||||
|         if os.path.exists(dst): | ||||
|             fname = make_path_unique(fname) | ||||
|             dst = os.path.join(article_object.save_path, fname) | ||||
|  | ||||
|  | ||||
|         if url.endswith(".pdf"): | ||||
|             # according to the browser preferences, calling the url will open pdfjs. | ||||
|             # If not handled separately, printing would require the ctrl+p route, but that setup is janky to say the least | ||||
|             success = self.get_existing_pdf(url, dst) | ||||
|         else: | ||||
|             success = self.get_new_pdf(dst) | ||||
|  | ||||
|  | ||||
|         if success: | ||||
|             article_object.file_name = fname | ||||
|             article_object.set_references(self.get_references()) | ||||
|         else: | ||||
|             article_object.file_name = "" | ||||
|          | ||||
|         return article_object  # this change is saved later by the external caller | ||||
|  | ||||
|  | ||||
|     def get_existing_pdf(self, url, dst): | ||||
|         try: | ||||
|             r = requests.get(url) | ||||
|             content = r.content # avoid shadowing the builtin bytes | ||||
|         except Exception: | ||||
|             return False | ||||
|         return self.get_new_pdf(dst, other_bytes=content) | ||||
|  | ||||
|  | ||||
|     def get_new_pdf(self, dst, other_bytes=None): | ||||
|         os.makedirs(os.path.dirname(dst), exist_ok=True) | ||||
|  | ||||
|         if other_bytes is None: | ||||
|             try: | ||||
|                 result = self.driver.print_page() | ||||
|                 pdf_bytes = base64.b64decode(result, validate=True) | ||||
|             except Exception: | ||||
|                 self.logger.error("Failed, probably because the driver went extinct.") | ||||
|                 return False | ||||
|         else: | ||||
|             pdf_bytes = other_bytes | ||||
|  | ||||
|         try: | ||||
|             with open(dst, "wb+") as f: | ||||
|                 f.write(pdf_bytes) | ||||
|             return True | ||||
|         except Exception as e: | ||||
|             self.logger.error(f"Failed, because of FS-operation: {e}") | ||||
|             return False | ||||
|          | ||||
|  | ||||
|     def get_references(self): | ||||
|         try: | ||||
|             hrefs = [e.get_attribute("href") for e in self.driver.find_elements(By.XPATH, "//a[@href]")] | ||||
|         except Exception: | ||||
|             hrefs = [] | ||||
|         len_old = len(hrefs) | ||||
|         hrefs = [h for h in hrefs if not any(domain in h for domain in blacklisted)] # filter out blacklisted domains, a tiny bit at least | ||||
|         self.logger.info(f"Hrefs filtered (before: {len_old}, after: {len(hrefs)})") | ||||
|         return hrefs | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| def make_path_unique(path): | ||||
|     fname, ending = os.path.splitext(path) | ||||
|     fname += datetime.datetime.now().strftime("%d-%H%M%S") | ||||
|     return fname + ending | ||||
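make_path_unique only appends a day-hour-minute-second stamp, so two collisions within the same second would still clash; acceptable here, but worth knowing. A quick sketch:

import datetime
import os

def make_path_unique(path):
    fname, ending = os.path.splitext(path)
    fname += datetime.datetime.now().strftime("%d-%H%M%S")
    return fname + ending

# e.g. "nzz.ch -- title23-154502.pdf"; the exact value depends on the current time
print(make_path_unique("nzz.ch -- title.pdf"))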
							
								
								
									
0  news_fetch/app/utils_worker/download/runner.py  Normal file
51  news_fetch/app/utils_worker/download/youtube.py  Normal file
							| @@ -0,0 +1,51 @@ | ||||
| from __future__ import unicode_literals | ||||
| import youtube_dl | ||||
| import os | ||||
| import logging | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class MyLogger(object): | ||||
|     def debug(self, msg): pass | ||||
|     def warning(self, msg): pass | ||||
|     def error(self, msg): | ||||
|         logger.error(msg) | ||||
|  | ||||
|  | ||||
|  | ||||
| class YouTubeDownloader: | ||||
|     def __init__(self) -> None: | ||||
|         pass | ||||
|  | ||||
|  | ||||
|     def post_download_hook(self, ret_code): | ||||
|         # print(ret_code) | ||||
|         if ret_code['status'] == 'finished': | ||||
|             file_loc = ret_code["filename"] | ||||
|             fname = os.path.basename(file_loc) | ||||
|             self.article_object.file_name = fname | ||||
|  | ||||
|  | ||||
|     def save_video(self, article_object): | ||||
|         """Saves video accoring to url and save path""" | ||||
|         self.article_object = article_object | ||||
|         url = article_object.article_url | ||||
|         logger.info("Saving new video") | ||||
|         file_path = os.path.join(article_object.save_path, article_object.fname_template) | ||||
|         ydl_opts = { | ||||
|             'format': 'best[height<=720]', | ||||
|             'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download | ||||
|             'logger': MyLogger(), | ||||
|             'progress_hooks': [self.post_download_hook], | ||||
|             'updatetime': False | ||||
|         } | ||||
|         try: | ||||
|             with youtube_dl.YoutubeDL(ydl_opts) as ydl: | ||||
|                 ydl.download([url]) | ||||
|                 # article file name is updated in self.post_download_hook | ||||
|         except Exception as e: | ||||
|             logger.error(f"Youtube download crashed: {e}") | ||||
|             article_object.file_name = "" | ||||
|  | ||||
|         return article_object | ||||
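A hedged usage sketch of YouTubeDownloader; the SimpleNamespace stands in for the peewee ArticleDownload model and only mocks the attributes save_video actually touches (all values are made up):

from types import SimpleNamespace

article = SimpleNamespace(
    article_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # placeholder url
    save_path="/tmp/videos/",
    fname_template="youtube.com -- Example",
    file_name="",
)

article = YouTubeDownloader().save_video(article)
print(article.file_name)  # set by post_download_hook once the download finishes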
							
								
								
									
62  news_fetch/app/utils_worker/fetch/runner.py  Normal file
							| @@ -0,0 +1,62 @@ | ||||
| from newspaper import Article | ||||
| from urllib.parse import urlparse | ||||
| from htmldate import find_date | ||||
| import datetime | ||||
| import logging | ||||
| logging.getLogger('newspaper').setLevel(logging.ERROR) # quieter logs | ||||
| logging.getLogger('urllib').setLevel(logging.ERROR) # quieter logs | ||||
| logging.getLogger('urllib3.poolmanager').setLevel(logging.ERROR) # quieter logs | ||||
| logging.getLogger('htmldate').setLevel(logging.ERROR) #quieter logs | ||||
| logging.getLogger('charset_normalizer').setLevel(logging.ERROR) #quieter logs | ||||
| logger = logging.getLogger("fetch") | ||||
|  | ||||
|  | ||||
| def get_description(article_object): | ||||
|     url = article_object.article_url | ||||
|     website = urlparse(url).netloc | ||||
|     article_object.source_name = website | ||||
|  | ||||
|     try: | ||||
|         article_object.pub_date = datetime.datetime.strptime(find_date(url), '%Y-%m-%d') # find_date returns an iso date string | ||||
|     except Exception: # other file types, or no date found | ||||
|         article_object.pub_date = datetime.datetime(year=1900, month=1, day=1) | ||||
|  | ||||
|     try: | ||||
|         news_article = Article(url) | ||||
|         news_article.download() | ||||
|         news_article.parse() | ||||
|     except Exception: | ||||
|         news_article = object() # bare fallback object: the attribute accesses below raise AttributeError and hit the defaults | ||||
|  | ||||
|     try: | ||||
|         article_object.title = news_article.title | ||||
|     except AttributeError: | ||||
|         article_object.title = "Error while running fetch" | ||||
|  | ||||
|     try: | ||||
|         if news_article.summary: | ||||
|             article_object.summary = news_article.summary | ||||
|         elif news_article.text: | ||||
|             ind = min(500, len(news_article.text)) | ||||
|             article_object.summary = news_article.text[:ind] + "..." | ||||
|         else: | ||||
|             article_object.summary = "" | ||||
|     except AttributeError: | ||||
|         article_object.summary = "" | ||||
|  | ||||
|     try: | ||||
|         article_object.language = news_article.meta_lang | ||||
|     except AttributeError: | ||||
|         article_object.language = "" | ||||
|  | ||||
|     try: | ||||
|         article_object.set_authors(news_article.authors) | ||||
|     except AttributeError: | ||||
|         pass # list would have been empty anyway | ||||
|      | ||||
|     try: | ||||
|         article_object.set_keywords(news_article.keywords) | ||||
|     except AttributeError: | ||||
|         pass  # list would have been empty anyway | ||||
|      | ||||
|     return article_object | ||||
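find_date returns its result as an ISO-style date string by default, which is why '%Y-%m-%d' is the matching strptime format. A quick standalone check (the url is a placeholder), assuming htmldate is installed:

import datetime
from htmldate import find_date

date_str = find_date("https://example.com/some-article")  # e.g. "2022-06-09", or None
if date_str:
    pub_date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
else:
    pub_date = datetime.datetime(year=1900, month=1, day=1)  # same fallback as above
print(pub_date)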
							
								
								
									
20  news_fetch/app/utils_worker/upload/runner.py  Normal file
							| @@ -0,0 +1,20 @@ | ||||
| import time | ||||
| from waybackpy import WaybackMachineSaveAPI # upload to archive.org | ||||
| import logging | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
| def upload_to_archive(article_object): | ||||
|     """uploads to archive.org and returns the archived url""" | ||||
|     user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? | ||||
|     url = article_object.article_url | ||||
|     try: | ||||
|         wayback = WaybackMachineSaveAPI(url, user_agent) | ||||
|         archive_url = wayback.save() | ||||
|         # logger.info(f"{url} uploaded to archive successfully") | ||||
|         article_object.archive_url = archive_url | ||||
|  | ||||
|     except Exception as e: | ||||
|         article_object.archive_url = "Error while uploading: {}".format(e) | ||||
|         logger.error(f"Error while generating archive url: {e}") | ||||
|  | ||||
|     return article_object | ||||
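A minimal standalone exercise of upload_to_archive; the stub only carries the two attributes the function touches, and the run assumes network access to archive.org:

from types import SimpleNamespace

article = SimpleNamespace(article_url="https://example.com", archive_url="")
article = upload_to_archive(article)
print(article.archive_url)  # the snapshot url, or the error message on failure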
							
								
								
									
41  news_fetch/app/utils_worker/worker_template.py  Normal file
							| @@ -0,0 +1,41 @@ | ||||
| from threading import Thread | ||||
| import time | ||||
| import logging | ||||
|  | ||||
|  | ||||
| class TemplateWorker(Thread): | ||||
|     """Parent class for any subsequent worker of the article-download pipeline. They should all run in parallel, thus the Thread subclassing""" | ||||
|     logger = logging.getLogger(__name__) | ||||
|  | ||||
|     def __init__(self, *args, **kwargs) -> None: | ||||
|         target = self._queue_processor # will be executed on Worker.start() | ||||
|         group = kwargs.get("group", None) | ||||
|         name = kwargs.get("name", None) | ||||
|  | ||||
|         super().__init__(group=group, target=target, name=name) | ||||
|         self._article_queue = [] | ||||
|         self.logger.info(f"Worker thread {self.__class__.__name__} initialized successfully") | ||||
|      | ||||
|  | ||||
|     def process(self, article_watcher): | ||||
|         self._article_queue.append(article_watcher) | ||||
|  | ||||
|  | ||||
|     def _queue_processor(self): | ||||
|         """This method is launched by thread.run() and idles when self._article_queue is empty. When an external caller appends to the queue it jumps into action""" | ||||
|         while True: # PLEASE tell me if I'm missing an obvious better way of doing this! (see the queue.Queue sketch after this file) | ||||
|             if len(self._article_queue) == 0: | ||||
|                 time.sleep(5) | ||||
|             else: | ||||
|                 article_watcher = self._article_queue.pop(0) | ||||
|                 self.logger.info(f"{self.__class__.__name__} now processing from queue (length: {len(self._article_queue)}) - {article_watcher.article}") | ||||
|                 self._handle_article(article_watcher) | ||||
|                  | ||||
|  | ||||
|     def _handle_article(self, article_watcher, action=None): | ||||
|         if action is None: | ||||
|             self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod") | ||||
|         else: | ||||
|             article = article_watcher.article | ||||
|             article = action(article) # action updates the article object but does not save the change | ||||
|             article.save() | ||||
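On the question in _queue_processor: the standard-library answer is queue.Queue, whose get() blocks until an item arrives, so the poll-and-sleep loop disappears. A sketch mirroring the class above:

import queue
from threading import Thread

class BlockingTemplateWorker(Thread):
    """Variant of TemplateWorker that blocks on a queue instead of polling."""
    def __init__(self):
        super().__init__(target=self._queue_processor, daemon=True)
        self._article_queue = queue.Queue()

    def process(self, article_watcher):
        self._article_queue.put(article_watcher)

    def _queue_processor(self):
        while True:
            article_watcher = self._article_queue.get()  # blocks until an item arrives
            self._handle_article(article_watcher)
            self._article_queue.task_done()

    def _handle_article(self, article_watcher):
        raise NotImplementedError  # subclasses supply the action, as above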
							
								
								
									
66  news_fetch/app/utils_worker/workers.py  Normal file
							| @@ -0,0 +1,66 @@ | ||||
| from .worker_template import TemplateWorker | ||||
| from .download.browser import PDFDownloader | ||||
| from .download.youtube import YouTubeDownloader | ||||
| from .fetch.runner import get_description | ||||
| from .upload.runner import upload_to_archive as run_upload | ||||
| from .compress.runner import shrink_pdf | ||||
|  | ||||
| import time | ||||
| import logging | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
| class DownloadWorker(TemplateWorker): | ||||
|     def __init__(self) -> None: | ||||
|         self.dl_runner = PDFDownloader().download | ||||
|         self.yt_runner = YouTubeDownloader().save_video | ||||
|         super().__init__() | ||||
|  | ||||
|     def _handle_article(self, article_watcher): | ||||
|         article = article_watcher.article | ||||
|         u = article.article_url | ||||
|  | ||||
|         if "youtu.be/" in u or "youtube.com/" in u: | ||||
|             action = self.yt_runner | ||||
|         else: | ||||
|             action = self.dl_runner | ||||
|  | ||||
|         super()._handle_article(article_watcher, action) | ||||
|         article_watcher.download_completed = True | ||||
|  | ||||
|  | ||||
|  | ||||
| class FetchWorker(TemplateWorker): | ||||
|     def __init__(self) -> None: | ||||
|         super().__init__() | ||||
|  | ||||
|     def _handle_article(self, article_watcher): | ||||
|         action = get_description # function | ||||
|         super()._handle_article(article_watcher, action) | ||||
|         article_watcher.fetch_completed = True | ||||
|  | ||||
|  | ||||
|  | ||||
| class UploadWorker(TemplateWorker): | ||||
|     def __init__(self) -> None: | ||||
|         super().__init__() | ||||
|      | ||||
|  | ||||
|  | ||||
|     def _handle_article(self, article_watcher): | ||||
|         def action(*args, **kwargs): | ||||
|             time.sleep(10) # uploads to archive are throttled to 15/minute, but 5s still triggers a blacklisting | ||||
|             return run_upload(*args, **kwargs) | ||||
|  | ||||
|         super()._handle_article(article_watcher, action) | ||||
|         article_watcher.upload_completed = True | ||||
|  | ||||
|  | ||||
|  | ||||
| class CompressWorker(TemplateWorker): | ||||
|     def __init__(self) -> None: | ||||
|         super().__init__() | ||||
|  | ||||
|     def _handle_article(self, article_watcher): | ||||
|         action = shrink_pdf | ||||
|         super()._handle_article(article_watcher, action) | ||||
|         article_watcher.compression_completed = True | ||||
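Putting the workers together: presumably the coordinator in runner.py wires them up roughly like the sketch below (the ArticleWatcher line is hypothetical; the real class lives outside this diff and carries the article plus the *_completed flags the workers set):

workers = [FetchWorker(), DownloadWorker(), UploadWorker(), CompressWorker()]
for w in workers:
    w.start()  # each launches its _queue_processor thread

# for every new article the coordinator would then do roughly:
# watcher = ArticleWatcher(article)  # hypothetical, defined in runner.py
# workers[0].process(watcher)        # enqueue; each worker flips its *_completed flag when done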