update nas target, documentation
7  manual/README.md  Normal file
@@ -0,0 +1,7 @@
### MANUAL TASKS

The scripts in this directory cover repetitive but only partially automatable tasks.

> ⚠️ Warning:
>
> Most scripts still require manual intervention before/after running and will probably require changes to the code. **Please make sure you understand them before using them!**
21  manual/batch_archive.py  Normal file
@@ -0,0 +1,21 @@
"""
|
||||
Saves websites specified in 'batch_urls.txt' to the wayback machine. Outputs archive urls to terminal
|
||||
Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file
|
||||
"""
|
||||
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
|
||||
import time
|
||||
|
||||
urls = []
|
||||
with open ("batch_urls.txt", "r") as f:
|
||||
urls = f.readlines()
|
||||
|
||||
|
||||
|
||||
for i, url in enumerate(urls):
|
||||
print(f"Saving url {i+1} / {len(urls)}")
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
|
||||
wayback = WaybackMachineSaveAPI(url, user_agent)
|
||||
archive_url = wayback.save()
|
||||
print(archive_url)
|
||||
time.sleep(20)
|
||||
# Uploads to archive.org are rate limited
|
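Because archive.org rate-limits save requests, a single failed `wayback.save()` currently aborts the whole batch. A minimal retry sketch, assuming only that waybackpy signals failures by raising an exception (the concrete exception types are not checked here):

```python
from waybackpy import WaybackMachineSaveAPI
import time

def save_with_retries(url, user_agent, attempts=3, wait=60):
    """Try to archive a url a few times, sleeping between attempts; returns None if every attempt fails."""
    for attempt in range(attempts):
        try:
            return WaybackMachineSaveAPI(url, user_agent).save()
        except Exception as e:  # assumption: rate-limit errors surface as exceptions
            print(f"Attempt {attempt + 1} for {url} failed: {e}")
            time.sleep(wait)
    return None
```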
18  manual/batch_urls.txt  Normal file
@@ -0,0 +1,18 @@
https://id2020.org
https://www.weforum.org/platforms/the-centre-for-cybersecurity
https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf
https://en.wikipedia.org/wiki/Social_Credit_System
https://en.wikipedia.org/wiki/Customer_lifetime_value
https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance
https://www.un.org/en/about-us/universal-declaration-of-human-rights
https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines
https://www.wired.com/2008/06/pb-theory/
https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/
https://www.bbc.com/news/world-middle-east-52579475
https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/
https://www.delftdesignforvalues.nl
https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/
https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17
https://www.youtube.com/watch?v=_KhAsJRk2lo
https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/
https://climatecitycup.org
33  manual/batch_youtube.py  Normal file
@@ -0,0 +1,33 @@
"""
|
||||
Saves youtube videos specified in 'batch_urls.txt' to the local folder. (to be copied manually)
|
||||
"""
|
||||
import youtube_dl
|
||||
|
||||
urls = []
|
||||
with open ("batch_urls.txt", "r") as f:
|
||||
urls = f.readlines()
|
||||
|
||||
|
||||
def post_download_hook(ret_code):
|
||||
if ret_code['status'] == 'finished':
|
||||
file_loc = ret_code["filename"]
|
||||
print(file_loc)
|
||||
|
||||
|
||||
def save_video(url):
|
||||
"""Saves video accoring to url and save path"""
|
||||
ydl_opts = {
|
||||
'format': 'best[height<=720]',
|
||||
'progress_hooks': [post_download_hook],
|
||||
'updatetime': False
|
||||
}
|
||||
try:
|
||||
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
|
||||
ydl.download([url])
|
||||
except Exception as e:
|
||||
print(f"Youtube download crashed: {e}")
|
||||
|
||||
|
||||
for i, url in enumerate(urls):
|
||||
print(f"Downloading video {i+1} / {len(urls)}")
|
||||
save_video(url)
|
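As committed, the videos land in the current working directory under youtube_dl's default file naming, which is why the docstring says they have to be copied manually. If they should instead be written straight to a target folder, an `outtmpl` entry could be added to the options; the path below is purely illustrative:

```python
ydl_opts = {
    'format': 'best[height<=720]',
    'progress_hooks': [post_download_hook],
    'updatetime': False,
    # hypothetical target directory; adjust to the actual mount point
    'outtmpl': '/mnt/nas/videos/%(title)s-%(id)s.%(ext)s',
}
```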
26  manual/exctract_from_mail_backup.py  Normal file
@@ -0,0 +1,26 @@
"""
|
||||
Extracts all urls from a list of mails exported from thunderbird. Writes to 'mails_url_export.json'
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
os.chdir("/home/remy/Downloads/mails/")
|
||||
|
||||
regex = "(?P<url>https?://[^\s]+)"
|
||||
|
||||
all_files = os.listdir(".")
|
||||
all_urls = []
|
||||
|
||||
for f in all_files:
|
||||
with open(f, "r", encoding="utf8") as mail:
|
||||
content = mail.readlines()
|
||||
|
||||
search = "".join(content)
|
||||
urls = re.findall(regex, search)
|
||||
all_urls += urls
|
||||
|
||||
print("Saved {} urls".format(len(all_urls)))
|
||||
|
||||
with open("mails_url_export.json", "w") as f:
|
||||
json.dump(all_urls, f)
|
67  manual/gather_media_files.py  Normal file
@@ -0,0 +1,67 @@
"""
|
||||
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
|
||||
"""
|
||||
import sys
|
||||
sys.path.append("../app/news_fetch")
|
||||
import runner
|
||||
import logging
|
||||
logger = logging.getLogger()
|
||||
import json
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
console = Console()
|
||||
|
||||
logger.info("Overwriting production values for single time media-fetch")
|
||||
runner.configuration.models.set_db(
|
||||
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
|
||||
)
|
||||
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
|
||||
|
||||
|
||||
def fetch():
|
||||
dispatcher = runner.Dispatcher()
|
||||
|
||||
dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
|
||||
dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
|
||||
|
||||
dispatcher.start()
|
||||
|
||||
with open("media_urls.json", "r") as f:
|
||||
url_list = json.loads(f.read())
|
||||
|
||||
logger.info(f"Found {len(url_list)} media urls")
|
||||
for u in url_list:
|
||||
msg_text = f"<{u}|dummy preview text>"
|
||||
dispatcher.incoming_request(msg)
|
||||
|
||||
|
||||
|
||||
def show():
|
||||
|
||||
t = Table(
|
||||
title = "ArticleDownloads",
|
||||
row_styles = ["white", "bright_black"],
|
||||
)
|
||||
|
||||
entries = ["title", "article_url", "archive_url", "authors"]
|
||||
|
||||
for e in entries:
|
||||
t.add_column(e, justify = "right")
|
||||
|
||||
sel = runner.models.ArticleDownload.select()
|
||||
|
||||
for s in sel:
|
||||
c = [getattr(s, e) for e in entries]#
|
||||
c[-1] = str([a.author for a in c[-1]])
|
||||
print(c)
|
||||
t.add_row(*c)
|
||||
|
||||
|
||||
console.print(t)
|
||||
|
||||
|
||||
|
||||
|
||||
# fetch()
|
||||
show()
|
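As committed, `fetch()` is commented out, so running the script only prints the table of articles already present in the local database. A one-off media fetch would presumably enable both calls (a sketch based on the two functions defined above):

```python
fetch()  # push every url from media_urls.json through the news_fetch pipeline
show()   # then render the resulting ArticleDownload table
```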
170  manual/migration.to_postgres.py  Normal file
@@ -0,0 +1,170 @@
import datetime
import sys
sys.path.append("../news_fetch/")
import configuration  # lives in app
from peewee import *

import os
import time

old_db = SqliteDatabase("/app/containerdata/downloads.db")

cred = configuration.db_config["DATABASE"]
download_db = PostgresqlDatabase(
    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
)


## OLD Models
class OLDModel(Model):
    class Meta:
        database = old_db


class OLDArticleDownload(OLDModel):
    class Meta:
        db_table = 'articledownload'

    title = CharField(default='')
    pub_date = DateField(default = '')
    download_date = DateField(default = 0)
    source_name = CharField(default = '')
    article_url = TextField(default = '', unique=True)
    archive_url = TextField(default = '')
    file_name = TextField(default = '')
    language = CharField(default = '')
    summary = TextField(default = '')
    comment = TextField(default = '')
    verified = IntegerField(default = False)
    # authors
    # keywords
    # ... are added through foreignkeys


class OLDArticleAuthor(OLDModel):
    class Meta:
        db_table = 'articleauthor'

    article = ForeignKeyField(OLDArticleDownload, backref='authors')
    author = CharField()


class OLDArticleRelated(OLDModel):
    class Meta:
        db_table = 'articlerelated'

    article = ForeignKeyField(OLDArticleDownload, backref='related')
    related_file_name = TextField(default = '')


## NEW Models
class NEWModel(Model):
    class Meta:
        database = download_db


class ArticleDownload(NEWModel):
    # in the beginning this is all we have
    article_url = TextField(default = '', unique=True)
    # fetch then fills in the metadata
    title = TextField(default='')
    summary = TextField(default = '')
    source_name = CharField(default = '')
    language = CharField(default = '')
    file_name = TextField(default = '')
    archive_url = TextField(default = '')
    pub_date = DateField(default = '')
    download_date = DateField(default = 0)
    slack_ts = FloatField(default = 0)  # should be a fixed-length string but float is easier to sort by
    sent = BooleanField(default = False)
    archived_by = CharField(default = os.getenv("UNAME"))
    # need to know who saved the message because the file needs to be on their computer in order to get verified
    # verification happens in a different app, but the model has the fields here as well
    comment = TextField(default = '')
    verified = IntegerField(default = 0)  # 0 = not verified, 1 = verified, -1 = marked as bad

    def set_authors(self, authors):
        for a in authors:
            if len(a) < 100:
                ArticleAuthor.create(
                    article = self,
                    author = a
                )

    def set_related(self, related):
        for r in related:
            ArticleRelated.create(
                article = self,
                related_file_name = r
            )

    # authors
    # keywords
    # ... are added through foreignkeys
    # we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db


class ArticleAuthor(NEWModel):
    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()


class ArticleRelated(NEWModel):
    # Related files, such as the full text of a paper, audio files, etc.
    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default = '')




####################################################################
# Migrate using sensible defaults:
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])

it = 0
for old_art in OLDArticleDownload.select():
    print("====================================================================")
    it += 1
    print(f"IT {it} New article with data:")
    print(
        old_art.article_url,
        old_art.title,
        old_art.summary,
        old_art.source_name,
        old_art.language,
        old_art.file_name,
        old_art.archive_url,
        old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        old_art.download_date,
        True,
        old_art.comment,
        old_art.verified
    )
    new_art = ArticleDownload.create(
        article_url = old_art.article_url,
        title = old_art.title,
        summary = old_art.summary,
        source_name = old_art.source_name,
        language = old_art.language,
        file_name = old_art.file_name,
        archive_url = old_art.archive_url,
        pub_date = old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        download_date = old_art.download_date,
        # slack_ts = FloatField(default = 0)
        sent = True,
        # archived_by = CharField(default = os.getenv("UNAME"))
        comment = old_art.comment,
        verified = old_art.verified
    )

    new_art.set_related([r.related_file_name for r in old_art.related])
    new_art.set_authors([a.author for a in old_art.authors])
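Two of the new fields are deliberately left out of the `create()` call and therefore take their model defaults: `slack_ts` stays 0 and `archived_by` falls back to the `UNAME` environment variable, while every migrated row is marked `sent = True`. A small pre-flight check along these lines might be worth adding before the loop (a sketch, assuming the script runs in an environment where `UNAME` should be set):

```python
import os
assert os.getenv("UNAME"), "UNAME must be set: it becomes the default for ArticleDownload.archived_by"
```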