Many bug fixes

Dockerfile (20 lines changed)
@@ -1,18 +1,22 @@
-FROM ubuntu:latest
-# UGH, timezone issues
-RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
+FROM python:latest
 
-RUN apt-get update && apt-get install -y evince libcanberra-gtk-module && apt-get install -y xauth wget tar python3 python3-pip python3-setuptools python3-wheel python3-dev build-essential firefox ghostscript
+RUN apt-get update && apt-get install -y \
+    evince libcanberra-gtk-module \
+    # for checking
+    xauth wget tar firefox \
+    # for geckodriver + gui
+    ghostscript
+    # for compression
 
 # Download gecko (firefox) driver for selenium
-RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz
-RUN tar -x geckodriver -zf geckodriver-v0.30.0-linux64.tar.gz -O > /usr/bin/geckodriver
+RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckodriver-v0.31.0-linux64.tar.gz
+RUN tar -x geckodriver -zf geckodriver-v0.31.0-linux64.tar.gz -O > /usr/bin/geckodriver
 RUN chmod +x /usr/bin/geckodriver
-RUN rm geckodriver-v0.30.0-linux64.tar.gz
+RUN rm geckodriver-v0.31.0-linux64.tar.gz
 RUN echo "127.0.0.1 localhost" >> /etc/hosts
 
 COPY requirements.txt /app/
-RUN python3 -m pip install --upgrade pip && python3 -m pip install -r /app/requirements.txt
+RUN python3 -m pip install -r /app/requirements.txt
 
 RUN mkdir -p /app/auto_news
 COPY app /app/auto_news
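
Since the base image and the driver version both change here, a quick in-container smoke test can confirm the pinned geckodriver 0.31.0 still drives the distro Firefox. A minimal sketch (not part of the commit), reusing the same selenium imports browser.py uses below:

```python
# hedged smoke test: assumes it runs inside the image built from this Dockerfile
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.headless = True  # the container has no display unless X is forwarded
driver = webdriver.Firefox(options=options)  # picks up geckodriver from /usr/bin
try:
    driver.get("https://example.com")
    print(driver.title)  # -> "Example Domain"
finally:
    driver.quit()
```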
@@ -40,9 +40,9 @@ where the `Dockerfile` has to be in the working directory
 
 ## Cheat-sheet Remy:
 
-`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news`
+`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ auto_news`
 
-`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news`
+`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news`
 
 
 `docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check`
@@ -51,6 +51,6 @@ where the `Dockerfile` has to be in the working directory
 
 ## Roadmap:
 
-[] automatically upload files to NAS
-[] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites
+[ ] automatically upload files to NAS
+[ ] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites
 ...
@@ -23,20 +23,24 @@ if "debug" in sys.argv:
     logger.warning("Running in debugging mode because launched with argument 'debug'")
     # parsed.read("/code/config.ini")
 
-    db_path = os.path.join(parsed["DATABASE"]["db_path_dev"], parsed["DATABASE"]["db_name"])
+    db_base_path = parsed["DATABASE"]["db_path_dev"]
     parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
     parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
 else:
     logger.warning("Using production values, I hope you know what you're doing...")
 
-    db_path = os.path.join(parsed["DATABASE"]["db_path_prod"], parsed["DATABASE"]["db_name"])
+    db_base_path = parsed["DATABASE"]["db_path_prod"]
 
 from utils_storage import models
 
 # Set up the database
 models.set_db(
     SqliteDatabase(
-        db_path,
+        os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
+        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
+    ),
+    SqliteDatabase(
+        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
         pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
     )
 )
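
The two SqliteDatabase objects above are positional, so their order has to match the new set_db(chat_db_object, download_db_object) signature in models.py (see that hunk further down). A minimal sketch of the call, with illustrative paths standing in for the config.ini values:

```python
from peewee import SqliteDatabase
from utils_storage import models

# illustrative paths; the real ones come from the DATABASE section of config.ini
chat = SqliteDatabase("/app/file_storage/messages.db", pragmas={"journal_mode": "wal"})
downloads = SqliteDatabase("/app/file_storage/downloads.db", pragmas={"journal_mode": "wal"})

models.set_db(chat, downloads)  # chat db first, download db second; creates tables on both
```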
@@ -27,8 +27,14 @@ class ArticleWatcher:
         self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False
 
         # first step: gather metadata
-        self.fetch.process(self) # this will call the update_status method
-        self.upload.process(self) # idependdent from the rest
+        if self.fetch and self.upload:
+            self.fetch.process(self) # this will call the update_status method
+            self.upload.process(self) # independent from the rest
+        else: # the full kwargs were not provided, only do a manual run
+            # overwrite update_status() because calls from the workers would result in errors
+            self.update_status = lambda completed: logger.info(f"Completed action {completed}")
+            for w in kwargs.get("workers_manual"):
+                w.process(self)
 
 
     def update_status(self, completed_action):
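
The new else-branch is what manual_processing (further down) relies on: without the full worker kwargs, update_status becomes a logging stub and only the explicitly passed workers run. A hedged sketch of a single manual run; the UploadWorker import path and the select() row are illustrative assumptions:

```python
from utils_storage import models
from utils_worker.workers import UploadWorker  # import path is an assumption

w = UploadWorker()
w.start()  # TemplateWorker subclasses Thread, so start() launches its run loop

article = models.ArticleDownload.select().first()  # any existing row will do
ArticleWatcher(
    article,
    workers_manual = [w],  # only these workers touch the article
    notifier = lambda a: print(f"Completed manual actions for {a}"),
)
```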
@@ -36,23 +42,6 @@ class ArticleWatcher:
         Article download is complete iff fetch and download were successful and compression was run
         """
         # if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done
-        # we don't need to delete self though, because it is then automatically garbage-collected
-        # all_done = self._fetch_completed and self._download_completed and self._compression_completed and self._upload_completed
-        # if self._fetch_completed and not self._download_called:
-        #     self._download_called = True
-        #     self.download.process(self)
-        # elif self._download_completed and not self._compression_called:
-        #     self._compression_called = True
-        #     self.compress.process(self)
-        # elif self._compression_completed: # last step
-        #     self.completition_notifier(self.article)
-        #     # triggers action in Coordinator
-        # elif self._upload_completed:
-        #     # this case occurs when upload was faster than compression
-        #     pass
-        # else:
-        #     logger.warning(f"update_status called with unusual configuration {self._fetch_completed},{self._download_completed},{self._compression_completed}")
-
         if completed_action == "fetch":
             self.download.process(self)
         elif completed_action == "download":
@@ -129,15 +118,16 @@ class Coordinator:
 
 
     def incoming_request(self, message):
-        # TODO CHECK ME!
         """This method is passed onto the slack worker. It gets triggered when a new message is received."""
        url = message.urls[0] # ignore all the other ones
         a, is_new = models.ArticleDownload.get_or_create(article_url=url)
         message.thread.article = a
         message.thread.save()
+        self.kwargs.update({"notifier" : self.article_complete_notifier})
 
-        if is_new:
-            self.kwargs.update({"notifier" : self.article_complete_notifier})
+        if is_new or (a.file_name == "" and a.verified == 0):
+            # also catch models that were created but then abandoned: they are missing information, most importantly an associated file
+            # this overwrites previously set information, but that should not be too important
             ArticleWatcher(
                 a,
                 **self.kwargs
@@ -152,12 +142,13 @@ class Coordinator:
 
 
 
-    def manual_processing(self, url_list, target_calls):
-        for url in url_list:
-            article = models.ArticleDownload.get_or_none(article_url=url)
-            watcher = ArticleWatcher(article, self.article_complete_notifier)
-            for t in target_calls:
-                t.process(watcher)
+    def manual_processing(self, articles, workers):
+        for w in workers:
+            w.start()
+
+        for article in articles:
+            notifier = lambda article: print(f"Completed manual actions for {article}")
+            ArticleWatcher(article, workers_manual = workers, notifier = notifier)
 
     def article_complete_notifier(self, article):
         self.worker_slack.bot_worker.respond_channel_message(article)
@@ -170,12 +161,14 @@ if __name__ == "__main__":
 
 
     if "upload" in sys.argv:
-        urls = models.ArticleDownload.select(models.ArticleDownload.article_url).where(models.ArticleDownload.archive_url == "").execute()
-        logger.info(f"Launching upload to archive for {len(urls)} urls.")
-        coordinator.manual_processing(urls, [UploadWorker()])
+        articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
+        logger.info(f"Launching upload to archive for {len(articles)} articles.")
+        coordinator.manual_processing(articles, [UploadWorker()])
 
     elif "check" in sys.argv:
         from utils_check import runner as check_runner
         check_runner.verify_unchecked()
 
     else: # launch with full action
         kwargs = {
             "worker_download" : DownloadWorker(),
@@ -186,9 +179,3 @@ if __name__ == "__main__":
             "worker_mail" : mail_runner,
         }
         coordinator.add_workers(**kwargs)
-
-
-
-
-# TODO
-# Resume interrupted article models
@@ -189,6 +189,7 @@ def message_dict_to_model(message):
         uid = message.get("user", "BAD USER")
         if uid == "BAD USER":
             logger.critical("Message has no user?? {}".format(message))
+            return None
 
         user, _ = models.User.get_or_create(user_id = uid)
         thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
@@ -158,7 +158,7 @@ class BotApp(App):
         fully_processed = len([t for t in threads if t.is_fully_processed])
         fully_unprocessed = len([t for t in threads if t.message_count == 1])
         articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
-        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {all_threads}/{fully_processed} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
+        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})


app/utils_storage/migrations/migration.001.py (new file, 67 lines)
@@ -0,0 +1,67 @@
+from playhouse.migrate import *
+
+
+"""
+This migration assumes that downloads.db kept the exact same structure as before.
+messages.db should drop the table articlemodelreference in favor of a new field article in the thread-table.
+Since each thread is constrained to exactly one article this makes the most sense.
+
+This migration assumes that messages.db gets a new field in the table thread:
+id  |   thread_ts | article_id
+
+We now need to migrate from the table articlemodelreference and then delete it.
+"""
+
+
+db = SqliteDatabase("/code/.dev/messages.db")
+migrator = SqliteMigrator(db)
+
+
+article_field = IntegerField(null=True)
+
+
+migrate(
+    migrator.add_column('thread', 'article_id', article_field),
+    # migrator.drop_column('some_table', 'old_column'),
+)
+
+
+
+# these are the old models, adapted to the migration
+
+class BaseModel(Model):
+    class Meta:
+        database = db
+
+class User(BaseModel):
+    user_id = CharField(default='', unique=True)
+
+class Thread(BaseModel):
+    """The threads that concern us are only created for messages that contain urls"""
+    thread_ts = FloatField(default = 0)
+    article_id = IntegerField(null=True)
+
+
+class Message(BaseModel):
+    ts = FloatField(unique=True) #for sorting
+    channel_id = CharField(default='')
+    user = ForeignKeyField(User, backref="messages")
+    text = TextField(default='')
+    thread = ForeignKeyField(Thread, backref="messages", default=None)
+    file_type = CharField(default='')
+    perma_link = CharField(default='')
+    is_processed_override = BooleanField(default=False)
+
+
+class ArticleModelReference(BaseModel):
+    message = ForeignKeyField(Message, backref='article_model_references')
+    article_model_id = IntegerField(default = 0)
+
+
+
+
+for ref in ArticleModelReference.select():
+    ref.message.thread.article_id = ref.article_model_id
+    ref.message.thread.save()
+
+db.drop_tables([ArticleModelReference]) # a bare (X) is not a tuple, so pass a list
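
After running the migration once (inside the container, against the hardcoded path above), a quick sanity check might look like the following sketch, using peewee's introspection helpers:

```python
from peewee import SqliteDatabase

db = SqliteDatabase("/code/.dev/messages.db")  # same hardcoded path as the migration
db.connect()
print([col.name for col in db.get_columns("thread")])  # should now include 'article_id'
print("articlemodelreference" in db.get_tables())      # should be False after the drop
db.close()
```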
@@ -12,15 +12,23 @@ config = configuration.parsed["DOWNLOADS"]
 slack_config = configuration.parsed["SLACK"]
 
 ## Helpers
-db = DatabaseProxy()
+chat_db = DatabaseProxy()
+download_db = DatabaseProxy()
+
 # set the nature of the db at runtime
-class BaseModel(Model):
+class DownloadBaseModel(Model):
     class Meta:
-        database = db
+        database = download_db
+
+class ChatBaseModel(Model):
+    class Meta:
+        database = chat_db
+
 
 ## == Article related models == ##
-class ArticleDownload(BaseModel):
+class ArticleDownload(DownloadBaseModel):
     title = CharField(default='')
     pub_date = DateField(default = '')
     download_date = DateField(default = datetime.date.today)
@@ -55,7 +63,7 @@ class ArticleDownload(BaseModel):
 
     @property
     def fname_template(self):
-        if self.source_name == "youtube.com":
+        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
             fname = "{} -- {}".format(self.source_name, self.title)
         else:
             fname = "{} -- {}.pdf".format(self.source_name, self.title)
@@ -155,23 +163,23 @@ class ArticleDownload(BaseModel):
         return True, {}
 
 
-class ArticleKeyword(BaseModel):
+class ArticleKeyword(DownloadBaseModel):
     # one instance gets created for every keyword -> flexible in size
     article = ForeignKeyField(ArticleDownload, backref='keywords')
     keyword = CharField()
 
 
-class ArticleAuthor(BaseModel):
+class ArticleAuthor(DownloadBaseModel):
     article = ForeignKeyField(ArticleDownload, backref='authors')
     author = CharField()
 
 
-class ArticleReference(BaseModel):
+class ArticleReference(DownloadBaseModel):
     article = ForeignKeyField(ArticleDownload, backref='references')
     reference_url = TextField(default = '')
 
 
-class ArticleRelated(BaseModel):
+class ArticleRelated(DownloadBaseModel):
     article = ForeignKeyField(ArticleDownload, backref='related')
     related_file_name = TextField(default = '')
 
@@ -179,13 +187,13 @@ class ArticleRelated(BaseModel):
 
 
 ## == Slack-thread related models == ##
-class User(BaseModel):
+class User(ChatBaseModel):
     user_id = CharField(default='', unique=True)
     # messages
 
 
-class Thread(BaseModel):
-    """The threads that concern us are only created if the messages that contain urls"""
+class Thread(ChatBaseModel):
+    """The threads that concern us are only created if the base message contains a url"""
     thread_ts = FloatField(default = 0)
     article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
     # provides ts, user, models
@@ -227,7 +235,7 @@ class Thread(BaseModel):
 
 
 
-class Message(BaseModel):
+class Message(ChatBaseModel):
     ts = FloatField(unique=True) #for sorting
     channel_id = CharField(default='')
     user = ForeignKeyField(User, backref="messages")
@@ -275,7 +283,7 @@ class Message(BaseModel):
         return len(self.urls) == 1
 
 
-class Reaction(BaseModel):
+class Reaction(ChatBaseModel):
     type = CharField(default = "")
     message = ForeignKeyField(Message, backref="reaction")
 
@@ -286,17 +294,16 @@ class Reaction(BaseModel):
 
 
 
-
-
-
 def create_tables():
-    with db:
-        db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated, User, Message, Thread, Reaction])
+    with download_db:
+        download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated])
+    with chat_db:
+        chat_db.create_tables([User, Message, Thread, Reaction])
 
 
-def set_db(db_object):
-    db.initialize(db_object)
+def set_db(chat_db_object, download_db_object):
+    chat_db.initialize(chat_db_object)
+    download_db.initialize(download_db_object)
     create_tables()
 
 def clear_path_name(path):
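
One subtlety of the split: Thread.article now points from chat_db into download_db. SQLite cannot enforce a foreign key across two database files, but peewee resolves the accessor with a second query against the related model's own database, so lookups keep working at the application level. A minimal sketch, assuming set_db() has already run as in runner.py:

```python
from utils_storage import models

thread = models.Thread.select().first()  # query against chat_db
if thread is not None and thread.article is not None:
    print(thread.article.title)          # lazy FK resolution queries download_db
```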
@@ -9,6 +9,9 @@ shrink_sizes = []
 
 def shrink_pdf(article):
     initial_size = os.path.getsize(article.save_path + article.file_name)
+    if article.file_name[-4:] != ".pdf":
+        return article # it probably was a youtube video
+
     c = subprocess.run(
         ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'],
         stdout=subprocess.PIPE,
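
Separate from the early return added here: the unchanged gs invocation wraps the input path in literal quotes, but with an argument list subprocess never goes through a shell, so those quotes become part of the filename gs sees. A hedged sketch of the quote-free list form (paths are illustrative):

```python
import subprocess

src = "/app/file_storage/example -- title.pdf"  # illustrative path containing spaces
dst = "/app/file_storage/compressed.pdf"

# with a list of args there is no shell, so spaces are safe and no quoting is needed
c = subprocess.run(
    ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH",
     f"-sOutputFile={dst}", src],
    stdout=subprocess.PIPE,
)
print(c.returncode)  # 0 on success
```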
@@ -7,10 +7,10 @@ import requests
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 import configuration
+import json
 
 config = configuration.parsed["DOWNLOADS"]
+blacklisted = json.loads(config["blacklisted_href_domains"])
 
 
 class PDFDownloader:
     """Saves a given url. Fills the object it got as a parameter"""
@@ -61,10 +61,6 @@ class PDFDownloader:
         self.autostart()
         url = article_object.article_url
 
-        # arbitrary bug fixes:
-        if "focus.de" in url or "bloomberg.com" in url:
-            url = url.replace("https://", "https://outline.com/")
-            sleep_time += 5
         try:
             self.driver.get(url)
         except Exception as e:
@@ -97,7 +93,7 @@ class PDFDownloader:
 
         if success:
             article_object.file_name = fname
-            article_object.set_references = self.get_references()
+            article_object.set_references(self.get_references())
         else:
             article_object.file_name = ""
 
@@ -140,10 +136,12 @@ class PDFDownloader:
             hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
         except:
             hrefs = []
-        # TODO TEST THIS
+        old = hrefs
         hrefs = [h for h in hrefs \
-            if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
+            if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (in particular, not 0)
             ] # filter a tiny bit at least
+        diff = set(old) ^ set(hrefs)
+        self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before: {len(old)}, after: {len(hrefs)})")
         return hrefs
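
The rewritten filter keeps an href only when none of the blacklisted domains occur in it; the sum over booleans counts the matches. A standalone check of that logic (the domain list is illustrative, the real one comes from config via json.loads):

```python
blacklisted = ["facebook.com", "twitter.com"]  # illustrative; config supplies the real list

hrefs = [
    "https://example.com/some-article",
    "https://www.facebook.com/sharer/sharer.php?u=https://example.com",
]

kept = [h for h in hrefs if not sum(domain in h for domain in blacklisted)]
print(kept)  # -> ['https://example.com/some-article']
```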
@@ -1,33 +1,65 @@
-import logging
-import os
-from pytube import YouTube
+from __future__ import unicode_literals
+import youtube_dl
+import os
+import logging
 
 logger = logging.getLogger(__name__)
 
 
-def save_video(article_object):
-    """Saves video according to url and save path"""
-    url = article_object.article_url
-    logger.info("Saving new video")
-    try:
-        yt = YouTube(url)
-        streams = yt.streams.filter(progressive=True).order_by('resolution')
-    except Exception as e:
-        article_object.file_name = "ERROR: {}".format(e)
-        return article_object
-
-    if streams: # if it's not empty
-        vid = streams[-1]
-        article_object.source_name = "youtube.com"
-        article_object.title = yt.title
-        file_path = os.path.join(article_object.save_path, article_object.fname_template)
-        try:
-            vid.download(file_path)
-            article_object.file_name = article_object.fname_template
-        except Exception as e:
-            logger.error(f"Youtube download crashed: {e}")
-            article_object.file_name = "Error while downloading"
-    else:
-        article_object.file_name = "No streams available"
-
-    return article_object
+class MyLogger(object):
+    def debug(self, msg): pass
+    def warning(self, msg): pass
+    def error(self, msg):
+        logger.error(msg)
+
+
+class YouTubeDownloader:
+    def __init__(self) -> None:
+        pass
+
+
+    def post_download_hook(self, ret_code):
+        # print(ret_code)
+        if ret_code['status'] == 'finished':
+            file_loc = ret_code["filename"]
+            fname = os.path.basename(file_loc)
+            self.article_object.file_name = fname
+
+
+    def save_video(self, article_object):
+        """Saves video according to url and save path"""
+        self.article_object = article_object
+        url = article_object.article_url
+        logger.info("Saving new video")
+        file_path = os.path.join(article_object.save_path, article_object.fname_template)
+        ydl_opts = {
+            'format': 'best[height<=720]',
+            'outtmpl': f"{file_path}.%(ext)s", # the filename from the object, but with a custom extension depending on the download
+            'logger': MyLogger(),
+            'progress_hooks': [self.post_download_hook],
+            'updatetime': False
+        }
+        try:
+            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+                # the article file name is updated in self.post_download_hook
+        except Exception as e:
+            logger.error(f"Youtube download crashed: {e}")
+            article_object.file_name = ""
+
+        return article_object
+
+
+
+# class DummyArticle:
+#     article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html"
+#     save_path = "/app/file_storage/"
+#     fname_template = "www.youtube.com -- Test"
+#     file_name = ""
+
+# m = DummyArticle()
+# t = YouTubeDownloader()
+# t.save_video(m)
+
+# print(m.file_name)
@@ -37,13 +37,11 @@ def get_description(article_object):
     except:
         news_article = fallback
 
-
     if news_article.title:
         title = news_article.title
     else:
         title = fallback.title
 
-
     if news_article.summary:
         summary = news_article.summary
     elif news_article.text:
@@ -52,8 +50,14 @@ def get_description(article_object):
     else:
         summary = fallback.summary
 
+    if news_article.meta_lang:
+        lang = news_article.meta_lang
+    else:
+        lang = ""
+
     article_object.title = title
     article_object.summary = summary
+    article_object.language = lang
     article_object.set_authors(news_article.authors)
     article_object.set_keywords(news_article.keywords)
@@ -9,10 +9,10 @@ def upload_to_archive(article_object):
     try:
         wayback = WaybackMachineSaveAPI(url, user_agent)
         archive_url = wayback.save()
-        logger.info(f"{url} uploaded to archive successfully")
+        # logger.info(f"{url} uploaded to archive successfully")
         article_object.archive_url = archive_url
     except Exception as e:
         article_object.archive_url = "Error while uploading: {}".format(e)
-        logger.error(f"Error while generating new url: {e}")
+        logger.error(f"Error while generating archive url: {e}")
 
     return article_object
@@ -1,7 +1,6 @@
 from threading import Thread
 import time
 import logging
-# logger = logging.getLogger(__name__)
 
 
 class TemplateWorker(Thread):
@@ -34,7 +33,6 @@ class TemplateWorker(Thread):
 
 
     def _handle_article(self, article_watcher, action=None):
-        # TODO Overload in children classes
         if action is None:
             self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
         else:
@@ -1,6 +1,6 @@
 from .worker_template import TemplateWorker
 from .download.browser import PDFDownloader
-from .download.youtube import save_video
+from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
 from .compress.runner import shrink_pdf
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 class DownloadWorker(TemplateWorker):
     def __init__(self) -> None:
         self.dl_runner = PDFDownloader().download
-        self.yt_runner = save_video
+        self.yt_runner = YouTubeDownloader().save_video
         super().__init__()
 
     def _handle_article(self, article_watcher):
@@ -1,6 +1,6 @@
 peewee
 selenium
-pytube
+youtube-dl
 waybackpy
 slack_bolt # relies on slack_sdk
 newspaper3k
Author: Remy Moll