From 8f3ea25662d501e89b2912dee2f5cdd616f9ddde Mon Sep 17 00:00:00 2001
From: Remy Moll
Date: Wed, 20 Apr 2022 16:49:55 +0200
Subject: [PATCH] Many bug fixes

---
 Dockerfile                                    | 20 +++--
 README.md                                     |  8 +-
 app/configuration.py                          | 10 ++-
 app/runner.py                                 | 61 ++++++---------
 app/utils_slack/message_helpers.py            |  1 +
 app/utils_slack/runner.py                     |  2 +-
 app/utils_storage/migrations/migration.001.py | 67 ++++++++++++++++
 app/utils_storage/models.py                   | 51 ++++++------
 app/utils_worker/compress/runner.py           |  3 +
 app/utils_worker/download/browser.py          | 16 ++--
 app/utils_worker/download/youtube.py          | 78 +++++++++++++------
 app/utils_worker/fetch/runner.py              | 12 ++-
 app/utils_worker/upload/runner.py             |  4 +-
 app/utils_worker/worker_template.py           |  2 -
 app/utils_worker/workers.py                   |  4 +-
 requirements.txt                              |  2 +-
 16 files changed, 223 insertions(+), 118 deletions(-)
 create mode 100644 app/utils_storage/migrations/migration.001.py

diff --git a/Dockerfile b/Dockerfile
index d4d0203..093f2f1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,18 +1,22 @@
-FROM ubuntu:latest
-# UGH, timezone issues
-RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
+FROM python:latest
 
-RUN apt-get update && apt-get install -y evince libcanberra-gtk-module && apt-get install -y xauth wget tar python3 python3-pip python3-setuptools python3-wheel python3-dev build-essential firefox ghostscript
+RUN apt-get update && apt-get install -y \
+evince libcanberra-gtk-module \
+# for checking
+xauth wget tar firefox \
+# for geckodriver + gui
+ghostscript
+# for compression
 
 # Download gecko (firefox) driver for selenium
-RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz
-RUN tar -x geckodriver -zf geckodriver-v0.30.0-linux64.tar.gz -O > /usr/bin/geckodriver
+RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckodriver-v0.31.0-linux64.tar.gz
+RUN tar -x geckodriver -zf geckodriver-v0.31.0-linux64.tar.gz -O > /usr/bin/geckodriver
 RUN chmod +x /usr/bin/geckodriver
-RUN rm geckodriver-v0.30.0-linux64.tar.gz
+RUN rm geckodriver-v0.31.0-linux64.tar.gz
 
 RUN echo "127.0.0.1 localhost" >> /etc/hosts
 
 COPY requirements.txt /app/
-RUN python3 -m pip install --upgrade pip && python3 -m pip install -r /app/requirements.txt
+RUN python3 -m pip install -r /app/requirements.txt
 
 RUN mkdir -p /app/auto_news
 COPY app /app/auto_news
diff --git a/README.md b/README.md
index 9970347..3bb1884 100644
--- a/README.md
+++ b/README.md
@@ -40,9 +40,9 @@ where the `Dockerfile` has to be in the working directory
 
 ## Cheat-sheet Remy:
 
-`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news`
+`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ auto_news`
 
-`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news`
+`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news`
 
 `docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check`
 
@@ -51,6 +51,6 @@
 ## Roadmap:
 
-[] automatically upload files to NAS
-[] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites
+[ ] automatically upload files to NAS
+[ ] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites
 ...
\ No newline at end of file
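A quick way to sanity-check the Firefox plus geckodriver stack the Dockerfile above installs is a headless selenium round trip from inside the container. This is a hedged sketch, not part of the patch: the target URL is a placeholder and headless mode is an assumption (the real check mode drives a visible browser over X11, as the third cheat-sheet command shows).

import logging
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

logging.basicConfig(level=logging.INFO)
options = Options()
options.headless = True  # no X server needed for a smoke test
driver = webdriver.Firefox(options=options)  # resolves /usr/bin/geckodriver via PATH
driver.get("https://example.org")
logging.info("Loaded page with title %r", driver.title)
driver.quit()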
diff --git a/app/configuration.py b/app/configuration.py
index 25bcd82..0981f26 100644
--- a/app/configuration.py
+++ b/app/configuration.py
@@ -23,20 +23,24 @@ if "debug" in sys.argv:
     logger.warning("Running in debugging mode because launched with argument 'debug'")
     # parsed.read("/code/config.ini")
 
-    db_path = os.path.join(parsed["DATABASE"]["db_path_dev"], parsed["DATABASE"]["db_name"])
+    db_base_path = parsed["DATABASE"]["db_path_dev"]
     parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
     parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
 else:
     logger.warning("Using production values, I hope you know what you're doing...")
-    db_path = os.path.join(parsed["DATABASE"]["db_path_prod"], parsed["DATABASE"]["db_name"])
+    db_base_path = parsed["DATABASE"]["db_path_prod"]
 
 from utils_storage import models
 
 # Set up the database
 models.set_db(
     SqliteDatabase(
-        db_path,
+        os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
+        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
+    ),
+    SqliteDatabase(
+        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
         pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
     )
 )
\ No newline at end of file
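The two-argument set_db() call above works because utils_storage.models declares DatabaseProxy placeholders and binds concrete databases to them at runtime (the full model changes follow in the models.py diff below). A condensed sketch of the pattern, with names shortened for illustration:

from peewee import DatabaseProxy, SqliteDatabase, Model, CharField

chat_db = DatabaseProxy()  # placeholder until the config has been parsed

class ChatBaseModel(Model):
    class Meta:
        database = chat_db  # models reference the proxy, not a concrete db

class User(ChatBaseModel):
    user_id = CharField(default='', unique=True)

# at startup, once the paths are known:
chat_db.initialize(SqliteDatabase("messages.db", pragmas={"journal_mode": "wal"}))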
diff --git a/app/runner.py b/app/runner.py
index 82c4c72..ed96736 100644
--- a/app/runner.py
+++ b/app/runner.py
@@ -27,8 +27,14 @@ class ArticleWatcher:
         self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False
 
         # first step: gather metadata
-        self.fetch.process(self) # this will call the update_status method
-        self.upload.process(self) # idependdent from the rest
+        if self.fetch and self.upload:
+            self.fetch.process(self) # this will call the update_status method
+            self.upload.process(self) # independent from the rest
+        else: # the full kwargs were not provided, only do a manual run
+            # overwrite update_status() because calls from the workers would result in errors
+            self.update_status = lambda completed: logger.info(f"Completed action {completed}")
+            for w in kwargs.get("workers_manual"):
+                w.process(self)
 
     def update_status(self, completed_action):
         """
@@ -36,23 +42,6 @@
         Article download is complete iff fetch and download were successful and compression was run
         """
         # if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done - we don't need to delete self though, because it is then automatically garbage-collected
-        all_done = self._fetch_completed and self._download_completed and self._compression_completed and self._upload_completed
-        # if self._fetch_completed and not self._download_called:
-        #     self._download_called = True
-        #     self.download.process(self)
-        # elif self._download_completed and not self._compression_called:
-        #     self._compression_called = True
-        #     self.compress.process(self)
-        # elif self._compression_completed: # last step
-        #     self.completition_notifier(self.article)
-        #     # triggers action in Coordinator
-        # elif self._upload_completed:
-        #     # this case occurs when upload was faster than compression
-        #     pass
-        # else:
-        #     logger.warning(f"update_status called with unusual configuration {self._fetch_completed},{self._download_completed},{self._compression_completed}")
-
         if completed_action == "fetch":
             self.download.process(self)
         elif completed_action == "download":
@@ -129,15 +118,16 @@ class Coordinator:
 
 
     def incoming_request(self, message):
-        # TODO CHECK ME!
        """This method is passed onto the slack worker. It gets triggered when a new message is received."""
         url = message.urls[0] # ignore all the other ones
         a, is_new = models.ArticleDownload.get_or_create(article_url=url)
         message.thread.article = a
         message.thread.save()
+        self.kwargs.update({"notifier" : self.article_complete_notifier})
 
-        if is_new:
-            self.kwargs.update({"notifier" : self.article_complete_notifier})
+        if is_new or (a.file_name == "" and a.verified == 0):
+            # check for models that were created but were abandoned. This means they have missing information, most importantly no associated file
+            # this overwrites previously set information, but that should not be too important
             ArticleWatcher(
                 a,
                 **self.kwargs
             )
@@ -152,12 +142,13 @@
 
 
-    def manual_processing(self, url_list, target_calls):
-        for url in url_list:
-            article = models.ArticleDownload.get_or_none(article_url=url)
-            watcher = ArticleWatcher(article, self.article_complete_notifier)
-            for t in target_calls:
-                t.process(watcher)
+    def manual_processing(self, articles, workers):
+        for w in workers:
+            w.start()
+
+        for article in articles:
+            notifier = lambda article: print(f"Completed manual actions for {article}")
+            ArticleWatcher(article, workers_manual = workers, notifier = notifier)
 
     def article_complete_notifier(self, article):
         self.worker_slack.bot_worker.respond_channel_message(article)
@@ -170,12 +161,14 @@ if __name__ == "__main__":
 
     if "upload" in sys.argv:
-        urls = models.ArticleDownload.select(models.ArticleDownload.article_url).where(models.ArticleDownload.archive_url == "").execute()
-        logger.info(f"Launching upload to archive for {len(urls)} urls.")
-        coordinator.manual_processing(urls, [UploadWorker()])
+        articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
+        logger.info(f"Launching upload to archive for {len(articles)} articles.")
+        coordinator.manual_processing(articles, [UploadWorker()])
+
     elif "check" in sys.argv:
         from utils_check import runner as check_runner
         check_runner.verify_unchecked()
+
     else: # launch with full action
         kwargs = {
             "worker_download" : DownloadWorker(),
@@ -186,9 +179,3 @@ if __name__ == "__main__":
             "worker_mail" : mail_runner,
         }
         coordinator.add_workers(**kwargs)
-
-
-
-
-# TODO
-# Resume interrupted article models
\ No newline at end of file
diff --git a/app/utils_slack/message_helpers.py b/app/utils_slack/message_helpers.py
index 84cbc58..aeb71c1 100644
--- a/app/utils_slack/message_helpers.py
+++ b/app/utils_slack/message_helpers.py
@@ -189,6 +189,7 @@ def message_dict_to_model(message):
     uid = message.get("user", "BAD USER")
     if uid == "BAD USER":
         logger.critical("Message has no user?? {}".format(message))
+        return None
 
     user, _ = models.User.get_or_create(user_id = uid)
     thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
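The manual-run branch in ArticleWatcher above works because assigning to self.update_status shadows the class method on that single instance; callbacks from the manually started workers then hit the logging lambda instead of driving the fetch, download and compress chain. A stripped-down sketch of that mechanism (the class and names here are stand-ins, not project code):

class Watcher:
    def update_status(self, completed_action):
        print(f"driving the pipeline after {completed_action}")

w = Watcher()
w.update_status = lambda completed: print(f"Completed action {completed}")
w.update_status("fetch")          # instance attribute wins: only logs
Watcher().update_status("fetch")  # a fresh instance still drives the pipeline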
diff --git a/app/utils_slack/runner.py b/app/utils_slack/runner.py
index 92a88e3..2e20ea5 100644
--- a/app/utils_slack/runner.py
+++ b/app/utils_slack/runner.py
@@ -158,7 +158,7 @@ class BotApp(App):
         fully_processed = len([t for t in threads if t.is_fully_processed])
         fully_unprocessed = len([t for t in threads if t.message_count == 1])
         articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
-        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {all_threads}/{fully_processed} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
+        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
diff --git a/app/utils_storage/migrations/migration.001.py b/app/utils_storage/migrations/migration.001.py
new file mode 100644
index 0000000..bb5bb8c
--- /dev/null
+++ b/app/utils_storage/migrations/migration.001.py
@@ -0,0 +1,67 @@
+from playhouse.migrate import *
+
+
+"""
+This migration assumes that downloads.db kept the exact same structure as before.
+messages.db should drop the table articlemodelreference in favor of a new field article in the thread-table.
+Since each thread is constrained to exactly one article this makes the most sense.
+
+This migration assumes that messages.db gets a new field in the table thread:
+id | thread_ts | article_id
+
+We now need to migrate from the table articlemodelreference and then delete it.
+"""
+
+
+db = SqliteDatabase("/code/.dev/messages.db")
+migrator = SqliteMigrator(db)
+
+
+article_field = IntegerField(null=True)
+
+
+migrate(
+    migrator.add_column('thread', 'article_id', article_field),
+    # migrator.drop_column('some_table', 'old_column'),
+)
+
+
+
+# these are the old models, adapted to the migration
+
+class BaseModel(Model):
+    class Meta:
+        database = db
+
+class User(BaseModel):
+    user_id = CharField(default='', unique=True)
+
+class Thread(BaseModel):
+    """The threads that concern us are only created if the messages that contain urls"""
+    thread_ts = FloatField(default = 0)
+    article_id = IntegerField(null=True)
+
+
+class Message(BaseModel):
+    ts = FloatField(unique=True) #for sorting
+    channel_id = CharField(default='')
+    user = ForeignKeyField(User, backref="messages")
+    text = TextField(default='')
+    thread = ForeignKeyField(Thread, backref="messages", default=None)
+    file_type = CharField(default='')
+    perma_link = CharField(default='')
+    is_processed_override = BooleanField(default=False)
+
+
+class ArticleModelReference(BaseModel):
+    message = ForeignKeyField(Message, backref='article_model_references')
+    article_model_id = IntegerField(default = 0)
+
+
+
+
+for ref in ArticleModelReference.select():
+    ref.message.thread.article_id = ref.article_model_id
+    ref.message.thread.save()
+
+db.drop_tables((ArticleModelReference,)) # note the trailing comma: drop_tables expects an iterable of model classes
\ No newline at end of file
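One hardening worth considering for migration.001.py: playhouse applies each migrate() operation immediately, so a crash between the column add and the row copy would leave messages.db half-migrated. Wrapping all steps in a transaction is a small, safe extension. A hedged sketch under the same db and migrator definitions as above, not part of the patch (SQLite DDL participates in transactions, so a failure rolls the column add back too):

from peewee import IntegerField, SqliteDatabase
from playhouse.migrate import SqliteMigrator, migrate

db = SqliteDatabase("/code/.dev/messages.db")
migrator = SqliteMigrator(db)

with db.atomic():  # everything rolls back if any step raises
    migrate(migrator.add_column("thread", "article_id", IntegerField(null=True)))
    # ... copy the ArticleModelReference rows here, then drop the table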
{}".format(self.source_name, self.title) else: fname = "{} -- {}.pdf".format(self.source_name, self.title) @@ -155,23 +163,23 @@ class ArticleDownload(BaseModel): return True, {} -class ArticleKeyword(BaseModel): +class ArticleKeyword(DownloadBaseModel): # instance gets created for every one keyword -> flexible in size article = ForeignKeyField(ArticleDownload, backref='keywords') keyword = CharField() -class ArticleAuthor(BaseModel): +class ArticleAuthor(DownloadBaseModel): article = ForeignKeyField(ArticleDownload, backref='authors') author = CharField() -class ArticleReference(BaseModel): +class ArticleReference(DownloadBaseModel): article = ForeignKeyField(ArticleDownload, backref='references') reference_url = TextField(default = '') -class ArticleRelated(BaseModel): +class ArticleRelated(DownloadBaseModel): article = ForeignKeyField(ArticleDownload, backref='related') related_file_name = TextField(default = '') @@ -179,13 +187,13 @@ class ArticleRelated(BaseModel): ## == Slack-thread related models == ## -class User(BaseModel): +class User(ChatBaseModel): user_id = CharField(default='', unique=True) # messages -class Thread(BaseModel): - """The threads that concern us are only created if the messages that contain urls""" +class Thread(ChatBaseModel): + """The threads that concern us are only created if the base massage contains a url""" thread_ts = FloatField(default = 0) article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None) # provides, ts, user, models @@ -227,7 +235,7 @@ class Thread(BaseModel): -class Message(BaseModel): +class Message(ChatBaseModel): ts = FloatField(unique=True) #for sorting channel_id = CharField(default='') user = ForeignKeyField(User, backref="messages") @@ -275,7 +283,7 @@ class Message(BaseModel): return len(self.urls) == 1 -class Reaction(BaseModel): +class Reaction(ChatBaseModel): type = CharField(default = "") message = ForeignKeyField(Message, backref="reaction") @@ -286,17 +294,16 @@ class Reaction(BaseModel): - - - - def create_tables(): - with db: - db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated, User, Message, Thread, Reaction]) + with download_db: + download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated]) + with chat_db: + chat_db.create_tables([User, Message, Thread, Reaction]) -def set_db(db_object): - db.initialize(db_object) +def set_db(chat_db_object, download_db_object): + chat_db.initialize(chat_db_object) + download_db.initialize(download_db_object) create_tables() def clear_path_name(path): diff --git a/app/utils_worker/compress/runner.py b/app/utils_worker/compress/runner.py index 5f67bb9..8a99fcb 100644 --- a/app/utils_worker/compress/runner.py +++ b/app/utils_worker/compress/runner.py @@ -9,6 +9,9 @@ shrink_sizes = [] def shrink_pdf(article): initial_size = os.path.getsize(article.save_path + article.file_name) + if article.file_name[-4:] != ".pdf": + return article # it probably was a youtube video + c = subprocess.run( ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'], stdout=subprocess.PIPE, diff --git a/app/utils_worker/download/browser.py b/app/utils_worker/download/browser.py index 3af0b36..f1767b2 100644 --- a/app/utils_worker/download/browser.py +++ b/app/utils_worker/download/browser.py @@ -7,10 +7,10 @@ import requests from selenium import webdriver 
diff --git a/app/utils_worker/download/browser.py b/app/utils_worker/download/browser.py
index 3af0b36..f1767b2 100644
--- a/app/utils_worker/download/browser.py
+++ b/app/utils_worker/download/browser.py
@@ -7,10 +7,10 @@
 import requests
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 import configuration
+import json
 
 config = configuration.parsed["DOWNLOADS"]
-
-
+blacklisted = json.loads(config["blacklisted_href_domains"])
 
 class PDFDownloader:
     """Saves a given url. Fills the object it got as a parameter"""
@@ -61,10 +61,6 @@ class PDFDownloader:
             self.autostart()
         url = article_object.article_url
 
-        # arbitrary bug fixes:
-        if "focus.de" in url or "bloomberg.com" in url:
-            url = url.replace("https://", "https://outline.com/")
-            sleep_time += 5
         try:
             self.driver.get(url)
         except Exception as e:
@@ -97,7 +93,7 @@ class PDFDownloader:
 
         if success:
             article_object.file_name = fname
-            article_object.set_references = self.get_references()
+            article_object.set_references(self.get_references())
         else:
             article_object.file_name = ""
 
@@ -140,10 +136,12 @@ class PDFDownloader:
             hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
         except:
             hrefs = []
-        # TODO TEST THIS
+        old = hrefs
         hrefs = [h for h in hrefs \
-            if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
+            if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
             ] # filter a tiny bit at least
+        diff = set(old) ^ set(hrefs)
+        self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before:{len(old)}, after: {len(hrefs)})")
         return hrefs
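The sum()-based blacklist test above counts substring hits and negates the count. Python's built-in any() expresses the same check more directly and stops at the first hit; an equivalent sketch (the blacklist values are placeholders, the real list comes from config):

blacklisted = ["facebook.com", "doubleclick.net"]  # placeholder values
hrefs = ["https://example.org/a", "https://facebook.com/share?u=x"]
kept = [h for h in hrefs if not any(domain in h for domain in blacklisted)]
# kept == ["https://example.org/a"]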
downloading" - else: - article_object.file_name = "No streams available" - - return article_object + article_object.file_name = "" + + return article_object + + + +# class DummyArticle: +# article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html" +# save_path = "/app/file_storage/" +# fname_template = "www.youtube.com -- Test" +# file_name = "" + +# m = DummyArticle() +# t = YouTubeDownloader() +# t.save_video(m) + +# print(m.file_name) diff --git a/app/utils_worker/fetch/runner.py b/app/utils_worker/fetch/runner.py index 1fc227e..960a0f2 100644 --- a/app/utils_worker/fetch/runner.py +++ b/app/utils_worker/fetch/runner.py @@ -37,24 +37,28 @@ def get_description(article_object): except: news_article = fallback - if news_article.title: title = news_article.title else: title = fallback.title - if news_article.summary: summary = news_article.summary elif news_article.text: ind = min(500, len(news_article.text)) summary = news_article.text[:ind] + "..." else: - summary = fallback.summary + summary = fallback.summary + + if news_article.meta_lang: + lang = news_article.meta_lang + else: + lang = "" article_object.title = title article_object.summary = summary + article_object.language = lang article_object.set_authors(news_article.authors) article_object.set_keywords(news_article.keywords) - + return article_object diff --git a/app/utils_worker/upload/runner.py b/app/utils_worker/upload/runner.py index b8d188f..5542d16 100644 --- a/app/utils_worker/upload/runner.py +++ b/app/utils_worker/upload/runner.py @@ -9,10 +9,10 @@ def upload_to_archive(article_object): try: wayback = WaybackMachineSaveAPI(url, user_agent) archive_url = wayback.save() - logger.info(f"{url} uploaded to archive successfully") + # logger.info(f"{url} uploaded to archive successfully") article_object.archive_url = archive_url except Exception as e: article_object.archive_url = "Error while uploading: {}".format(e) - logger.error(f"Error while generating new url: {e}") + logger.error(f"Error while generating archive url: {e}") return article_object \ No newline at end of file diff --git a/app/utils_worker/worker_template.py b/app/utils_worker/worker_template.py index 96be787..a19a726 100644 --- a/app/utils_worker/worker_template.py +++ b/app/utils_worker/worker_template.py @@ -1,7 +1,6 @@ from threading import Thread import time import logging -# logger = logging.getLogger(__name__) class TemplateWorker(Thread): @@ -34,7 +33,6 @@ class TemplateWorker(Thread): def _handle_article(self, article_watcher, action=None): - # TODO Overload in children classes if action is None: self.logger.error("Unoverloaded call of _handle_article(). 
diff --git a/app/utils_worker/fetch/runner.py b/app/utils_worker/fetch/runner.py
index 1fc227e..960a0f2 100644
--- a/app/utils_worker/fetch/runner.py
+++ b/app/utils_worker/fetch/runner.py
@@ -37,24 +37,28 @@ def get_description(article_object):
     except:
         news_article = fallback
 
-
     if news_article.title:
         title = news_article.title
     else:
         title = fallback.title
 
-
     if news_article.summary:
         summary = news_article.summary
     elif news_article.text:
         ind = min(500, len(news_article.text))
         summary = news_article.text[:ind] + "..."
     else:
-        summary = fallback.summary
+        summary = fallback.summary
+
+    if news_article.meta_lang:
+        lang = news_article.meta_lang
+    else:
+        lang = ""
 
     article_object.title = title
     article_object.summary = summary
+    article_object.language = lang
     article_object.set_authors(news_article.authors)
     article_object.set_keywords(news_article.keywords)
-    
+
     return article_object
diff --git a/app/utils_worker/upload/runner.py b/app/utils_worker/upload/runner.py
index b8d188f..5542d16 100644
--- a/app/utils_worker/upload/runner.py
+++ b/app/utils_worker/upload/runner.py
@@ -9,10 +9,10 @@ def upload_to_archive(article_object):
     try:
         wayback = WaybackMachineSaveAPI(url, user_agent)
         archive_url = wayback.save()
-        logger.info(f"{url} uploaded to archive successfully")
+        # logger.info(f"{url} uploaded to archive successfully")
         article_object.archive_url = archive_url
     except Exception as e:
         article_object.archive_url = "Error while uploading: {}".format(e)
-        logger.error(f"Error while generating new url: {e}")
+        logger.error(f"Error while generating archive url: {e}")
 
     return article_object
\ No newline at end of file
diff --git a/app/utils_worker/worker_template.py b/app/utils_worker/worker_template.py
index 96be787..a19a726 100644
--- a/app/utils_worker/worker_template.py
+++ b/app/utils_worker/worker_template.py
@@ -1,7 +1,6 @@
 from threading import Thread
 import time
 import logging
-# logger = logging.getLogger(__name__)
 
 
 class TemplateWorker(Thread):
@@ -34,7 +33,6 @@ class TemplateWorker(Thread):
 
     def _handle_article(self, article_watcher, action=None):
-        # TODO Overload in children classes
         if action is None:
             self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
         else:
diff --git a/app/utils_worker/workers.py b/app/utils_worker/workers.py
index f29aab0..21b4388 100644
--- a/app/utils_worker/workers.py
+++ b/app/utils_worker/workers.py
@@ -1,6 +1,6 @@
 from .worker_template import TemplateWorker
 from .download.browser import PDFDownloader
-from .download.youtube import save_video
+from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
 from .compress.runner import shrink_pdf
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 class DownloadWorker(TemplateWorker):
     def __init__(self) -> None:
         self.dl_runner = PDFDownloader().download
-        self.yt_runner = save_video
+        self.yt_runner = YouTubeDownloader().save_video
         super().__init__()
 
     def _handle_article(self, article_watcher):
diff --git a/requirements.txt b/requirements.txt
index 50f3bca..5347fd9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 peewee
 selenium
-pytube
+youtube-dl
 waybackpy
 slack_bolt # relies on slack_sdk
 newspaper3k
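For orientation: the workers touched above (DownloadWorker and friends) all inherit from TemplateWorker, a Thread whose internals this patch barely changes. The following is a plausible minimal sketch of that queue-driven pattern, stated as an assumption since the full class body is not part of this diff:

import logging
from queue import Queue
from threading import Thread

class SketchWorker(Thread):
    """Pulls article watchers off a queue and applies one action to each."""

    def __init__(self, action):
        super().__init__(daemon=True)
        self.logger = logging.getLogger(__name__)
        self._queue = Queue()
        self._action = action

    def process(self, article_watcher):
        self._queue.put(article_watcher)  # called from the coordinator thread

    def run(self):
        while True:
            watcher = self._queue.get()  # blocks until work arrives
            watcher.article = self._action(watcher.article)
            watcher.update_status(self._action.__name__)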