Many bug fixes
Parent: 0d76bcbb98
Commit: 8f3ea25662

Dockerfile | 20
@@ -1,18 +1,22 @@
-FROM ubuntu:latest
+FROM python:latest
-# UGH, timezone issues
-RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
 
-RUN apt-get update && apt-get install -y evince libcanberra-gtk-module && apt-get install -y xauth wget tar python3 python3-pip python3-setuptools python3-wheel python3-dev build-essential firefox ghostscript
+RUN apt-get update && apt-get install -y \
+    evince libcanberra-gtk-module \
+    # for checking
+    xauth wget tar firefox \
+    # for geckodriver + gui
+    ghostscript
+    # for compression
 
 # Download gecko (firefox) driver for selenium
-RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz
+RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.31.0/geckodriver-v0.31.0-linux64.tar.gz
-RUN tar -x geckodriver -zf geckodriver-v0.30.0-linux64.tar.gz -O > /usr/bin/geckodriver
+RUN tar -x geckodriver -zf geckodriver-v0.31.0-linux64.tar.gz -O > /usr/bin/geckodriver
 RUN chmod +x /usr/bin/geckodriver
-RUN rm geckodriver-v0.30.0-linux64.tar.gz
+RUN rm geckodriver-v0.31.0-linux64.tar.gz
 RUN echo "127.0.0.1 localhost" >> /etc/hosts
 
 COPY requirements.txt /app/
-RUN python3 -m pip install --upgrade pip && python3 -m pip install -r /app/requirements.txt
+RUN python3 -m pip install -r /app/requirements.txt
 
 RUN mkdir -p /app/auto_news
 COPY app /app/auto_news
@@ -40,9 +40,9 @@ where the `Dockerfile` has to be in the working directory
 
 ## Cheat-sheet Remy:
 
-`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ auto_news`
+`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ auto_news`
 
-`docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news`
+`docker run -it -v /mnt/Data/COSS/CONTAINERDATA/:/app/file_storage/ -v /mnt/Data/COSS/auto_news/app:/code --entrypoint /bin/bash auto_news`
 
 
 `docker run -it -v /mnt/Data/COSS/DOWNLOADS/auto_news/container_data/:/app/file_storage/ -e DISPLAY=":0" --network host -v \$XAUTHORITY:/root/.Xauthority auto_news check`
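(As a reminder of the build step referenced by "where the `Dockerfile` has to be in the working directory": the image is presumably built beforehand with something like `docker build -t auto_news .`; the tag `auto_news` is inferred from the run commands above.)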
@@ -51,6 +51,6 @@ where the `Dockerfile` has to be in the working directory
 
 ## Roadmap:
 
-[] automatically upload files to NAS
+[ ] automatically upload files to NAS
-[] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites
+[ ] handle paywalled sites like faz, spiegel, .. through their dedicated edu-sites
 ...
@@ -23,20 +23,24 @@ if "debug" in sys.argv:
     logger.warning("Running in debugging mode because launched with argument 'debug'")
     # parsed.read("/code/config.ini")
 
-    db_path = os.path.join(parsed["DATABASE"]["db_path_dev"], parsed["DATABASE"]["db_name"])
+    db_base_path = parsed["DATABASE"]["db_path_dev"]
     parsed["SLACK"]["archive_id"] = parsed["SLACK"]["debug_id"]
     parsed["MAIL"]["recipient"] = parsed["MAIL"]["sender"]
 else:
     logger.warning("Using production values, I hope you know what you're doing...")
 
-    db_path = os.path.join(parsed["DATABASE"]["db_path_prod"], parsed["DATABASE"]["db_name"])
+    db_base_path = parsed["DATABASE"]["db_path_prod"]
 
 from utils_storage import models
 
 # Set up the database
 models.set_db(
     SqliteDatabase(
-        db_path,
+        os.path.join(db_base_path, parsed["DATABASE"]["chat_db_name"]),
+        pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
+    ),
+    SqliteDatabase(
+        os.path.join(db_base_path, parsed["DATABASE"]["download_db_name"]),
         pragmas = {'journal_mode': 'wal'} # multiple threads can read at once
     )
 )
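A rough sketch of the [DATABASE] section of config.ini that this block now expects: the key names come from the code above, while the values are plausible placeholders pieced together from paths that appear elsewhere in this commit, not the actual configuration.

    [DATABASE]
    db_path_dev = /code/.dev/
    db_path_prod = /app/file_storage/
    chat_db_name = messages.db
    download_db_name = downloads.db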
@@ -27,8 +27,14 @@ class ArticleWatcher:
         self._fetch_completed = self._download_completed = self._compression_completed = self._upload_completed = False
 
         # first step: gather metadata
-        self.fetch.process(self) # this will call the update_status method
-        self.upload.process(self) # independent from the rest
+        if self.fetch and self.upload:
+            self.fetch.process(self) # this will call the update_status method
+            self.upload.process(self) # independent from the rest
+        else: # the full kwargs were not provided, only do a manual run
+            # overwrite update_status() because calls from the workers will result in errors
+            self.update_status = lambda completed: logger.info(f"Completed action {completed}")
+            for w in kwargs.get("workers_manual"):
+                w.process(self)
 
 
     def update_status(self, completed_action):
@@ -36,23 +42,6 @@ class ArticleWatcher:
         Article download is complete iff fetch and download were successful and compression was run
         """
         # if self.completition_notified and self._compression_completed and self._fetch_completed and self._download_completed and self._upload_completed, we are done
-        # we don't need to delete self though, because it is then automatically garbage-collected
-        # all_done = self._fetch_completed and self._download_completed and self._compression_completed and self._upload_completed
-        # if self._fetch_completed and not self._download_called:
-        #     self._download_called = True
-        #     self.download.process(self)
-        # elif self._download_completed and not self._compression_called:
-        #     self._compression_called = True
-        #     self.compress.process(self)
-        # elif self._compression_completed: # last step
-        #     self.completition_notifier(self.article)
-        #     # triggers action in Coordinator
-        # elif self._upload_completed:
-        #     # this case occurs when upload was faster than compression
-        #     pass
-        # else:
-        #     logger.warning(f"update_status called with unusual configuration {self._fetch_completed},{self._download_completed},{self._compression_completed}")
 
         if completed_action == "fetch":
             self.download.process(self)
         elif completed_action == "download":
@@ -129,15 +118,16 @@ class Coordinator:
 
 
     def incoming_request(self, message):
-        # TODO CHECK ME!
         """This method is passed onto the slack worker. It gets triggered when a new message is received."""
         url = message.urls[0] # ignore all the other ones
         a, is_new = models.ArticleDownload.get_or_create(article_url=url)
         message.thread.article = a
         message.thread.save()
+        self.kwargs.update({"notifier" : self.article_complete_notifier})
 
-        if is_new:
-            self.kwargs.update({"notifier" : self.article_complete_notifier})
+        if is_new or (a.file_name == "" and a.verified == 0):
+            # check for models that were created but were abandoned. This means they have missing information, most importantly no associated file
+            # this overwrites previously set information, but that should not be too important
             ArticleWatcher(
                 a,
                 **self.kwargs
@@ -152,12 +142,13 @@ class Coordinator:
 
 
 
-    def manual_processing(self, url_list, target_calls):
-        for url in url_list:
-            article = models.ArticleDownload.get_or_none(article_url=url)
-            watcher = ArticleWatcher(article, self.article_complete_notifier)
-            for t in target_calls:
-                t.process(watcher)
+    def manual_processing(self, articles, workers):
+        for w in workers:
+            w.start()
+
+        for article in articles:
+            notifier = lambda article: print(f"Completed manual actions for {article}")
+            ArticleWatcher(article, workers_manual = workers, notifier = notifier)
 
     def article_complete_notifier(self, article):
         self.worker_slack.bot_worker.respond_channel_message(article)
@@ -170,12 +161,14 @@ if __name__ == "__main__":
 
 
     if "upload" in sys.argv:
-        urls = models.ArticleDownload.select(models.ArticleDownload.article_url).where(models.ArticleDownload.archive_url == "").execute()
-        logger.info(f"Launching upload to archive for {len(urls)} urls.")
-        coordinator.manual_processing(urls, [UploadWorker()])
+        articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
+        logger.info(f"Launching upload to archive for {len(articles)} articles.")
+        coordinator.manual_processing(articles, [UploadWorker()])
 
     elif "check" in sys.argv:
         from utils_check import runner as check_runner
         check_runner.verify_unchecked()
 
     else: # launch with full action
         kwargs = {
             "worker_download" : DownloadWorker(),
@@ -186,9 +179,3 @@ if __name__ == "__main__":
             "worker_mail" : mail_runner,
         }
         coordinator.add_workers(**kwargs)
-
-
-
-
-# TODO
-# Resume interrupted article models
@@ -189,6 +189,7 @@ def message_dict_to_model(message):
     uid = message.get("user", "BAD USER")
     if uid == "BAD USER":
         logger.critical("Message has no user?? {}".format(message))
+        return None
 
     user, _ = models.User.get_or_create(user_id = uid)
     thread, _ = models.Thread.get_or_create(thread_ts = thread_ts)
@@ -158,7 +158,7 @@ class BotApp(App):
         fully_processed = len([t for t in threads if t.is_fully_processed])
         fully_unprocessed = len([t for t in threads if t.message_count == 1])
         articles_unprocessed = len(models.ArticleDownload.select().where(models.ArticleDownload.verified < 1))
-        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {all_threads}/{fully_processed} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
+        self.logger.info(f"[bold]STATUS[/bold]: Fully processed {fully_processed}/{all_threads} threads. {fully_unprocessed} threads have 0 replies. Article-objects to verify: {articles_unprocessed}", extra={"markup": True})
 
 
 
app/utils_storage/migrations/migration.001.py | 67 (new file)
@@ -0,0 +1,67 @@
+from playhouse.migrate import *
+
+
+"""
+This migration assumes that downloads.db kept the exact same structure as before.
+messages.db should drop the table articlemodelreference in favor of a new field article in the thread-table
+Since each thread is constrained to exactly one article this makes the most sense.
+
+This migration assumes that messages.db gets a new field in the table thread:
+id | thread_ts | article_id
+
+We now need to migrate from the table articlemodelreference and then delete it.
+"""
+
+
+db = SqliteDatabase("/code/.dev/messages.db")
+migrator = SqliteMigrator(db)
+
+
+article_field = IntegerField(null=True)
+
+
+migrate(
+    migrator.add_column('thread', 'article_id', article_field),
+    # migrator.drop_column('some_table', 'old_column'),
+)
+
+
+
+# these are the old models, adapted to the migration
+
+class BaseModel(Model):
+    class Meta:
+        database = db
+
+class User(BaseModel):
+    user_id = CharField(default='', unique=True)
+
+class Thread(BaseModel):
+    """The threads that concern us are only created if the messages that contain urls"""
+    thread_ts = FloatField(default = 0)
+    article_id = IntegerField(null=True)
+
+
+class Message(BaseModel):
+    ts = FloatField(unique=True) #for sorting
+    channel_id = CharField(default='')
+    user = ForeignKeyField(User, backref="messages")
+    text = TextField(default='')
+    thread = ForeignKeyField(Thread, backref="messages", default=None)
+    file_type = CharField(default='')
+    perma_link = CharField(default='')
+    is_processed_override = BooleanField(default=False)
+
+
+class ArticleModelReference(BaseModel):
+    message = ForeignKeyField(Message, backref='article_model_references')
+    article_model_id = IntegerField(default = 0)
+
+
+
+
+for ref in ArticleModelReference.select():
+    ref.message.thread.article_id = ref.article_model_id
+    ref.message.thread.save()
+
+db.drop_tables([ArticleModelReference])
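Presumably this one-off script is meant to be run once by hand, e.g. `python3 app/utils_storage/migrations/migration.001.py`; the hard-coded `/code/.dev/messages.db` suggests it targets the development copy of the chat database reachable through the `/code` mount from the cheat-sheet above.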
@@ -12,15 +12,23 @@ config = configuration.parsed["DOWNLOADS"]
 slack_config = configuration.parsed["SLACK"]
 
 ## Helpers
-db = DatabaseProxy()
+chat_db = DatabaseProxy()
+download_db = DatabaseProxy()
 
 # set the nature of the db at runtime
-class BaseModel(Model):
+class DownloadBaseModel(Model):
     class Meta:
-        database = db
+        database = download_db
+
+class ChatBaseModel(Model):
+    class Meta:
+        database = chat_db
 
 
 
 ## == Article related models == ##
-class ArticleDownload(BaseModel):
+class ArticleDownload(DownloadBaseModel):
     title = CharField(default='')
     pub_date = DateField(default = '')
     download_date = DateField(default = datetime.date.today)
@@ -55,7 +63,7 @@ class ArticleDownload(BaseModel):
 
     @property
     def fname_template(self):
-        if self.source_name == "youtube.com":
+        if "youtube.com" in self.source_name or "youtu.be" in self.source_name:
             fname = "{} -- {}".format(self.source_name, self.title)
         else:
             fname = "{} -- {}.pdf".format(self.source_name, self.title)
@@ -155,23 +163,23 @@ class ArticleDownload(BaseModel):
         return True, {}
 
 
-class ArticleKeyword(BaseModel):
+class ArticleKeyword(DownloadBaseModel):
     # instance gets created for every one keyword -> flexible in size
     article = ForeignKeyField(ArticleDownload, backref='keywords')
     keyword = CharField()
 
 
-class ArticleAuthor(BaseModel):
+class ArticleAuthor(DownloadBaseModel):
     article = ForeignKeyField(ArticleDownload, backref='authors')
     author = CharField()
 
 
-class ArticleReference(BaseModel):
+class ArticleReference(DownloadBaseModel):
     article = ForeignKeyField(ArticleDownload, backref='references')
     reference_url = TextField(default = '')
 
 
-class ArticleRelated(BaseModel):
+class ArticleRelated(DownloadBaseModel):
     article = ForeignKeyField(ArticleDownload, backref='related')
     related_file_name = TextField(default = '')
 
@@ -179,13 +187,13 @@ class ArticleRelated(BaseModel):
 
 
 ## == Slack-thread related models == ##
-class User(BaseModel):
+class User(ChatBaseModel):
     user_id = CharField(default='', unique=True)
     # messages
 
 
-class Thread(BaseModel):
-    """The threads that concern us are only created if the messages that contain urls"""
+class Thread(ChatBaseModel):
+    """The threads that concern us are only created if the base message contains a url"""
     thread_ts = FloatField(default = 0)
     article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
     # provides, ts, user, models
@@ -227,7 +235,7 @@ class Thread(BaseModel):
 
 
 
-class Message(BaseModel):
+class Message(ChatBaseModel):
     ts = FloatField(unique=True) #for sorting
     channel_id = CharField(default='')
     user = ForeignKeyField(User, backref="messages")
@@ -275,7 +283,7 @@ class Message(BaseModel):
         return len(self.urls) == 1
 
 
-class Reaction(BaseModel):
+class Reaction(ChatBaseModel):
     type = CharField(default = "")
     message = ForeignKeyField(Message, backref="reaction")
 
@@ -286,17 +294,16 @@ class Reaction(BaseModel):
 
 
 
 def create_tables():
-    with db:
-        db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated, User, Message, Thread, Reaction])
+    with download_db:
+        download_db.create_tables([ArticleDownload, ArticleKeyword, ArticleAuthor, ArticleReference, ArticleRelated])
+    with chat_db:
+        chat_db.create_tables([User, Message, Thread, Reaction])
 
 
-def set_db(db_object):
-    db.initialize(db_object)
+def set_db(chat_db_object, download_db_object):
+    chat_db.initialize(chat_db_object)
+    download_db.initialize(download_db_object)
     create_tables()
 
 def clear_path_name(path):
@@ -9,6 +9,9 @@ shrink_sizes = []
 
 def shrink_pdf(article):
     initial_size = os.path.getsize(article.save_path + article.file_name)
+    if article.file_name[-4:] != ".pdf":
+        return article # it probably was a youtube video
+
     c = subprocess.run(
         ["gs", "-sDEVICE=pdfwrite", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH", f"-sOutputFile={config['default_download_path']}/compressed.pdf", f'"{article.save_path + article.file_name}"'],
         stdout=subprocess.PIPE,
@@ -7,10 +7,10 @@ import requests
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
 import configuration
+import json
 
 config = configuration.parsed["DOWNLOADS"]
+blacklisted = json.loads(config["blacklisted_href_domains"])
 
 
 class PDFDownloader:
     """Saves a given url. Fills the object it got as a parameter"""
@@ -61,10 +61,6 @@ class PDFDownloader:
             self.autostart()
         url = article_object.article_url
 
-        # arbitrary bug fixes:
-        if "focus.de" in url or "bloomberg.com" in url:
-            url = url.replace("https://", "https://outline.com/")
-            sleep_time += 5
         try:
             self.driver.get(url)
         except Exception as e:
@@ -97,7 +93,7 @@ class PDFDownloader:
 
         if success:
             article_object.file_name = fname
-            article_object.set_references = self.get_references()
+            article_object.set_references(self.get_references())
         else:
             article_object.file_name = ""
 
@@ -140,10 +136,12 @@ class PDFDownloader:
             hrefs = [e.get_attribute("href") for e in self.driver.find_elements_by_xpath("//a[@href]")]
         except:
             hrefs = []
-        # TODO TEST THIS
+        old = hrefs
         hrefs = [h for h in hrefs \
-            if bool([(domain in h) for domain in config["blacklisted_href_domains"]])
+            if not sum([(domain in h) for domain in blacklisted]) # sum([True, False, False, False]) == 1 (esp. not 0)
             ] # filter a tiny bit at least
+        diff = set(old) ^ set(hrefs)
+        self.logger.info(f"Removed {len(diff)} hrefs: {diff} (before:{len(old)}, after: {len(hrefs)})")
         return hrefs
 
 
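Aside on the new filter: `not sum([...])` keeps an href only when no blacklisted domain occurs in it (the sum over the boolean list is 0). An equivalent and arguably clearer spelling, not what the commit uses, would be:

    hrefs = [h for h in hrefs if not any(domain in h for domain in blacklisted)]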
@@ -1,33 +1,65 @@
-import logging
+from __future__ import unicode_literals
+import youtube_dl
 import os
-from pytube import YouTube
+import logging
 
 logger = logging.getLogger(__name__)
 
 
-def save_video(article_object):
-    """Saves video according to url and save path"""
-    url = article_object.article_url
-    logger.info("Saving new video")
-    try:
-        yt = YouTube(url)
-        streams = yt.streams.filter(progressive=True).order_by('resolution')
-    except Exception as e:
-        article_object.file_name = "ERROR: {}".format(e)
-        return article_object
-
-    if streams: # if it's not empty
-        vid = streams[-1]
-        article_object.source_name = "youtube.com"
-        article_object.title = yt.title
+class MyLogger(object):
+    def debug(self, msg): pass
+    def warning(self, msg): pass
+    def error(self, msg):
+        logger.error(msg)
+
+
+class YouTubeDownloader:
+    def __init__(self) -> None:
+        pass
+
+
+    def post_download_hook(self, ret_code):
+        # print(ret_code)
+        if ret_code['status'] == 'finished':
+            file_loc = ret_code["filename"]
+            fname = os.path.basename(file_loc)
+            self.article_object.file_name = fname
+
+
+    def save_video(self, article_object):
+        """Saves video according to url and save path"""
+        self.article_object = article_object
+        url = article_object.article_url
+        logger.info("Saving new video")
         file_path = os.path.join(article_object.save_path, article_object.fname_template)
+        ydl_opts = {
+            'format': 'best[height<=720]',
+            'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
+            'logger': MyLogger(),
+            'progress_hooks': [self.post_download_hook],
+            'updatetime': False
+        }
         try:
-            vid.download(file_path)
-            article_object.file_name = article_object.fname_template
+            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+            # article file name is updated in self.post_download_hook
         except Exception as e:
             logger.error(f"Youtube download crashed: {e}")
-            article_object.file_name = "Error while downloading"
-    else:
-        article_object.file_name = "No streams available"
+            article_object.file_name = ""
 
-    return article_object
+        return article_object
+
+
+# class DummyArticle:
+#     article_url = "https://www.welt.de/politik/ausland/article238267261/Baerbock-Lieferung-gepanzerter-Fahrzeuge-an-die-Ukraine-kein-Tabu.html"
+#     save_path = "/app/file_storage/"
+#     fname_template = "www.youtube.com -- Test"
+#     file_name = ""
+
+# m = DummyArticle()
+# t = YouTubeDownloader()
+# t.save_video(m)
+
+# print(m.file_name)
@@ -37,24 +37,28 @@ def get_description(article_object):
     except:
         news_article = fallback
 
 
     if news_article.title:
         title = news_article.title
     else:
         title = fallback.title
 
 
     if news_article.summary:
         summary = news_article.summary
     elif news_article.text:
         ind = min(500, len(news_article.text))
         summary = news_article.text[:ind] + "..."
     else:
         summary = fallback.summary
 
+    if news_article.meta_lang:
+        lang = news_article.meta_lang
+    else:
+        lang = ""
+
     article_object.title = title
     article_object.summary = summary
+    article_object.language = lang
     article_object.set_authors(news_article.authors)
     article_object.set_keywords(news_article.keywords)
 
     return article_object
@@ -9,10 +9,10 @@ def upload_to_archive(article_object):
     try:
         wayback = WaybackMachineSaveAPI(url, user_agent)
         archive_url = wayback.save()
-        logger.info(f"{url} uploaded to archive successfully")
+        # logger.info(f"{url} uploaded to archive successfully")
         article_object.archive_url = archive_url
     except Exception as e:
         article_object.archive_url = "Error while uploading: {}".format(e)
-        logger.error(f"Error while generating new url: {e}")
+        logger.error(f"Error while generating archive url: {e}")
 
     return article_object
@@ -1,7 +1,6 @@
 from threading import Thread
 import time
 import logging
-# logger = logging.getLogger(__name__)
 
 
 class TemplateWorker(Thread):
@@ -34,7 +33,6 @@ class TemplateWorker(Thread):
 
 
     def _handle_article(self, article_watcher, action=None):
-        # TODO Overload in children classes
         if action is None:
             self.logger.error("Unoverloaded call of _handle_article(). This should not occur in prod")
         else:
@@ -1,6 +1,6 @@
 from .worker_template import TemplateWorker
 from .download.browser import PDFDownloader
-from .download.youtube import save_video
+from .download.youtube import YouTubeDownloader
 from .fetch.runner import get_description
 from .upload.runner import upload_to_archive as run_upload
 from .compress.runner import shrink_pdf
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 class DownloadWorker(TemplateWorker):
     def __init__(self) -> None:
         self.dl_runner = PDFDownloader().download
-        self.yt_runner = save_video
+        self.yt_runner = YouTubeDownloader().save_video
         super().__init__()
 
     def _handle_article(self, article_watcher):
@@ -1,6 +1,6 @@
 peewee
 selenium
-pytube
+youtube-dl
 waybackpy
 slack_bolt # relies on slack_sdk
 newspaper3k