From e6bfe811d0cd173a3e381af9bf3f751cb5f731ea Mon Sep 17 00:00:00 2001
From: Remy Moll <me@moll.re>
Date: Mon, 24 Oct 2022 17:25:48 +0200
Subject: [PATCH] update nas target, documentation

---
 config/README.md                              |   8 +
 {misc/sample_config => config}/db.config.ini  |   0
 {env => config/env}/debug                     |   0
 {env => config/env}/production                |   0
 .../sample_config => config}/nas_login.config |   0
 .../sample_config => config}/nas_sync.config  |   0
 .../news_fetch.config.ini                     |   2 +-
 {misc/sample_config => config}/vpn.config     |   0
 docker-compose.yaml                           |   2 +-
 manual/README.md                              |   7 +
 manual/batch_archive.py                       |  21 +++
 manual/batch_urls.txt                         |  18 +++
 manual/batch_youtube.py                       |  33 ++++
 {misc => manual}/exctract_from_mail_backup.py |   5 +-
 {misc => manual}/gather_media_files.py        |  24 ++-
 {misc => manual}/migration.to_postgres.py     |   0
 misc/hotfix_missed_messages.py                |  88 ----------
 misc/hotfix_reactions.py                      |  38 -----
 misc/media_urls.json                          | 151 ------------------
 misc/youtube_batch.py                         |  61 -------
 news_fetch/Dockerfile                         |   6 +-
 news_fetch/runner.py                          |  16 +-
 22 files changed, 111 insertions(+), 369 deletions(-)
 create mode 100644 config/README.md
 rename {misc/sample_config => config}/db.config.ini (100%)
 rename {env => config/env}/debug (100%)
 rename {env => config/env}/production (100%)
 rename {misc/sample_config => config}/nas_login.config (100%)
 rename {misc/sample_config => config}/nas_sync.config (100%)
 rename {misc/sample_config => config}/news_fetch.config.ini (92%)
 rename {misc/sample_config => config}/vpn.config (100%)
 create mode 100644 manual/README.md
 create mode 100644 manual/batch_archive.py
 create mode 100644 manual/batch_urls.txt
 create mode 100644 manual/batch_youtube.py
 rename {misc => manual}/exctract_from_mail_backup.py (70%)
 rename {misc => manual}/gather_media_files.py (67%)
 rename {misc => manual}/migration.to_postgres.py (100%)
 delete mode 100644 misc/hotfix_missed_messages.py
 delete mode 100644 misc/hotfix_reactions.py
 delete mode 100644 misc/media_urls.json
 delete mode 100644 misc/youtube_batch.py
diff --git a/config/README.md b/config/README.md
new file mode 100644
index 0000000..cefa4d9
--- /dev/null
+++ b/config/README.md
@@ -0,0 +1,8 @@
+## Configuration: example
+The files inside this directory (not the ones in `env/`) are a sample of the required configuration.
+
+Please create a copy of these files under `<location of downloads>/config/...`.
+
+> Note:
+>
+> Some of the fields are blank, please fill them in as needed.
\ No newline at end of file
diff --git a/misc/sample_config/db.config.ini b/config/db.config.ini
similarity index 100%
rename from misc/sample_config/db.config.ini
rename to config/db.config.ini
diff --git a/env/debug b/config/env/debug
similarity index 100%
rename from env/debug
rename to config/env/debug
diff --git a/env/production b/config/env/production
similarity index 100%
rename from env/production
rename to config/env/production
diff --git a/misc/sample_config/nas_login.config b/config/nas_login.config
similarity index 100%
rename from misc/sample_config/nas_login.config
rename to config/nas_login.config
diff --git a/misc/sample_config/nas_sync.config b/config/nas_sync.config
similarity index 100%
rename from misc/sample_config/nas_sync.config
rename to config/nas_sync.config
diff --git a/misc/sample_config/news_fetch.config.ini b/config/news_fetch.config.ini
similarity index 92%
rename from misc/sample_config/news_fetch.config.ini
rename to config/news_fetch.config.ini
index a220818..24155c2 100644
--- a/misc/sample_config/news_fetch.config.ini
+++ b/config/news_fetch.config.ini
@@ -25,7 +25,7 @@ db_printout: /app/containerdata/backups
 local_storage_path: /app/containerdata/files
 debug_storage_path: /app/containerdata/debug/
 default_download_path: /app/containerdata/tmp
-remote_storage_path: /helbing_support/Files RM/Archiving
+remote_storage_path: /helbing_support/Archiving-Pipeline
 browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
 # please keep this exact name
 browser_print_delay: 3
diff --git a/misc/sample_config/vpn.config b/config/vpn.config
similarity index 100%
rename from misc/sample_config/vpn.config
rename to config/vpn.config
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8027b99..84fa678 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -28,7 +28,7 @@ services:
       - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
       - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
     command:
-      - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
+      - nas22.ethz.ch/gess_coss_1/helbing_support/Archiving-Pipeline # first command is the target mount path
       - lsyncd
       - /sync/nas_sync.config
 
diff --git a/manual/README.md b/manual/README.md
new file mode 100644
index 0000000..80d6e66
--- /dev/null
+++ b/manual/README.md
@@ -0,0 +1,7 @@
+### MANUAL TASKS
+
+The files inside this directory contain scripts for repetitive but somewhat automatable tasks.
+
+> ⚠️ warning:
+> 
+> Most scripts still require manual intervention before/after running and probably require changes to the code. **Please make sure you understand them before using them!**
\ No newline at end of file
diff --git a/manual/batch_archive.py b/manual/batch_archive.py
new file mode 100644
index 0000000..1d9bdbf
--- /dev/null
+++ b/manual/batch_archive.py
@@ -0,0 +1,21 @@
+"""
+Saves websites specified in 'batch_urls.txt' to the wayback machine. Outputs archive urls to terminal
+Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file
+"""
+from waybackpy import WaybackMachineSaveAPI # upload to archive.org
+import time
+
+# Browser-like user agent handed to the save API (the original author was unsure whether it is needed)
+USER_AGENT = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
+
+with open("batch_urls.txt", "r") as f:
+    # one url per line: drop the trailing newline that readlines() keeps and skip blank lines
+    urls = [line.strip() for line in f if line.strip()]
+
+for i, url in enumerate(urls):
+    print(f"Saving url {i+1} / {len(urls)}")
+    wayback = WaybackMachineSaveAPI(url, USER_AGENT)
+    archive_url = wayback.save()
+    print(archive_url)
+    # Uploads to archive.org are rate limited
+    time.sleep(20)
diff --git a/manual/batch_urls.txt b/manual/batch_urls.txt
new file mode 100644
index 0000000..d8aeb8b
--- /dev/null
+++ b/manual/batch_urls.txt
@@ -0,0 +1,18 @@
+https://id2020.org
+https://www.weforum.org/platforms/the-centre-for-cybersecurity
+https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf
+https://en.wikipedia.org/wiki/Social_Credit_System
+https://en.wikipedia.org/wiki/Customer_lifetime_value
+https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance
+https://www.un.org/en/about-us/universal-declaration-of-human-rights
+https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines
+https://www.wired.com/2008/06/pb-theory/
+https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/
+https://www.bbc.com/news/world-middle-east-52579475
+https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/
+https://www.delftdesignforvalues.nl
+https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/
+https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17
+https://www.youtube.com/watch?v=_KhAsJRk2lo
+https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/
+https://climatecitycup.org
diff --git a/manual/batch_youtube.py b/manual/batch_youtube.py
new file mode 100644
index 0000000..de61551
--- /dev/null
+++ b/manual/batch_youtube.py
@@ -0,0 +1,33 @@
+"""
+Saves youtube videos specified in 'batch_urls.txt' to the local folder. (to be copied manually)
+"""
+import youtube_dl
+
+with open("batch_urls.txt", "r") as f:
+    # one url per line: drop the trailing newline that readlines() keeps and skip blank lines
+    urls = [line.strip() for line in f if line.strip()]
+
+
+def post_download_hook(ret_code):
+    """Progress hook: prints the file name once a download reports the status 'finished'"""
+    if ret_code['status'] == 'finished':
+        print(ret_code["filename"])
+
+
+def save_video(url):
+    """Downloads the video found at `url` (height capped at 720) using the default output location"""
+    ydl_opts = {
+        'format': 'best[height<=720]',
+        'progress_hooks': [post_download_hook],
+        'updatetime': False
+    }
+    try:
+        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+    except Exception as e: # best effort: report the failure and carry on with the next url
+        print(f"Youtube download crashed: {e}")
+
+
+for i, url in enumerate(urls):
+    print(f"Downloading video {i+1} / {len(urls)}")
+    save_video(url)
diff --git a/misc/exctract_from_mail_backup.py b/manual/exctract_from_mail_backup.py
similarity index 70%
rename from misc/exctract_from_mail_backup.py
rename to manual/exctract_from_mail_backup.py
index a00dc63..f21e8e0 100644
--- a/misc/exctract_from_mail_backup.py
+++ b/manual/exctract_from_mail_backup.py
@@ -1,3 +1,6 @@
+"""
+Extracts all urls from a list of mails exported from thunderbird. Writes to 'mails_url_export.json'
+"""
 import os
 import re
 import json
@@ -19,5 +22,5 @@ for f in all_files:
 
 print("Saved {} urls".format(len(all_urls)))
 
-with open("media_mails_export.json", "w") as f:
+with open("mails_url_export.json", "w") as f:
     json.dump(all_urls, f)  
\ No newline at end of file
diff --git a/misc/gather_media_files.py b/manual/gather_media_files.py
similarity index 67%
rename from misc/gather_media_files.py
rename to manual/gather_media_files.py
index a9a2c68..e1d9e0b 100644
--- a/misc/gather_media_files.py
+++ b/manual/gather_media_files.py
@@ -1,5 +1,8 @@
+"""
+Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
+"""
 import sys
-sys.path.append("../app")
+sys.path.append("../app/news_fetch")
 import runner
 import logging
 logger = logging.getLogger()
@@ -11,24 +14,18 @@ console = Console()
 
 logger.info("Overwriting production values for single time media-fetch")
 runner.configuration.models.set_db(
-    runner.configuration.SqliteDatabase("../.dev/media_message_dummy.db"),  # chat_db (not needed here)
     runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
 )
 runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
 
 
 def fetch():
-    coordinator = runner.Coordinator()
+    dispatcher = runner.Dispatcher()
 
+    dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
+    dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
 
-    kwargs = {
-        "worker_download" : runner.DownloadWorker(),
-        "worker_fetch" : runner.FetchWorker(),
-        "worker_upload" : runner.UploadWorker(),
-    }
-
-    coordinator.add_workers(**kwargs)
-    coordinator.start()
+    dispatcher.start()
 
     with open("media_urls.json", "r") as f:
         url_list = json.loads(f.read()) 
@@ -36,9 +33,8 @@ def fetch():
     logger.info(f"Found {len(url_list)} media urls")
     for u in url_list:
         msg_text = f"<{u}|dummy preview text>"
-        dummy_thread = runner.models.Thread()
-        msg = runner.models.Message(text= msg_text, thread=dummy_thread)
-        coordinator.incoming_request(msg)
+        # FIXME(review): `msg` is undefined here - this change removes the lines that built it (only `msg_text` is left); confirm what Dispatcher.incoming_request expects
+        dispatcher.incoming_request(msg)
 
 
 def show():
diff --git a/misc/migration.to_postgres.py b/manual/migration.to_postgres.py
similarity index 100%
rename from misc/migration.to_postgres.py
rename to manual/migration.to_postgres.py
diff --git a/misc/hotfix_missed_messages.py b/misc/hotfix_missed_messages.py
deleted file mode 100644
index 6cdb69d..0000000
--- a/misc/hotfix_missed_messages.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import time
-import keys
-import slack_sdk
-from slack_sdk.errors import SlackApiError
-from peewee import SqliteDatabase
-
-from persistence import  message_models
-# from bot_utils import messages
-
-
-
-# Constant values...
-MESSAGES_DB = "/app/containerdata/messages.db"
-
-BOT_ID = "U02MR1R8UJH"
-ARCHIVE_ID = "C02MM7YG1V4"
-DEBUG_ID = "C02NM2H9J5Q"
-
-
-
-client = slack_sdk.WebClient(token=keys.OAUTH_TOKEN)
-
-message_models.set_db(SqliteDatabase(MESSAGES_DB))
-
-
-def message_dict_to_model(message):
-    if message["type"] == "message":
-        thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"]
-        uid = message.get("user", "BAD USER")
-        user, _ = message_models.User.get_or_create(user_id = uid)
-        thread, _ = message_models.Thread.get_or_create(thread_ts = thread_ts)
-        m, new = message_models.Message.get_or_create(
-            user = user,
-            thread = thread,
-            ts = message["ts"],
-            channel_id = ARCHIVE_ID,
-            text = message["text"]
-        )
-        print("Saved (text) {} (new={})".format(m, new))
-
-        for f in message.get("files", []): #default: []
-            m.file_type = f["filetype"]
-            m.perma_link = f["url_private_download"]
-            m.save()
-            print("Saved permalink {} to {} (possibly overwriting)".format(f["name"], m))
-        if new:
-            return m
-        else:
-            return None
-    else:
-        print("What should I do of {}".format(message))
-        return None
-
-
-def check_all_past_messages():
-    last_ts = 0
-    
-    result = client.conversations_history(
-        channel=ARCHIVE_ID,
-        oldest=last_ts
-    )
-
-    new_messages = result.get("messages", []) # fetches 100 messages by default
-
-    new_fetches = []
-    for m in new_messages:
-        new_fetches.append(message_dict_to_model(m))
-    # print(result)
-    refetch = result.get("has_more", False)
-    print(f"Refetching : {refetch}")
-    while refetch: # we have not actually fetched them all
-        try:
-            result = client.conversations_history(
-                channel = ARCHIVE_ID,
-                cursor = result["response_metadata"]["next_cursor"],
-                oldest = last_ts
-            ) # refetches in batches of 100 messages
-            refetch = result.get("has_more", False)
-            new_messages = result.get("messages", [])
-            for m in new_messages:
-                new_fetches.append(message_dict_to_model(m))
-        except SlackApiError: # Most likely a rate-limit
-            print("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(30))
-            time.sleep(30)
-            refetch = True
-
-
-check_all_past_messages()
\ No newline at end of file
diff --git a/misc/hotfix_reactions.py b/misc/hotfix_reactions.py
deleted file mode 100644
index 5e43b37..0000000
--- a/misc/hotfix_reactions.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from peewee import SqliteDatabase
-
-from persistence import article_models, message_models
-
-# Global logger setup:
-
-
-# Constant values...
-DOWNLOADS_DB = "../container_data/downloads.db"
-MESSAGES_DB = "../container_data/messages.db"
-
-BOT_ID = "U02MR1R8UJH"
-ARCHIVE_ID = "C02MM7YG1V4"
-DEBUG_ID = "C02NM2H9J5Q"
-
-
-# DB Setup:
-article_models.set_db(SqliteDatabase(
-    DOWNLOADS_DB,
-    pragmas = {'journal_mode': 'wal'} # mutliple threads can access at once
-))
-
-message_models.set_db(SqliteDatabase(MESSAGES_DB))
-
-
-
-for reaction in message_models.Reaction.select():
-    print(reaction)        
-    thread = reaction.message.thread
-    articles = message_models.get_referenced_articles(thread, article_models.ArticleDownload)
-    for a in articles:
-        print(a)
-    reaction = reaction.type
-    status = 1 if reaction == "white_check_mark" else -1
-    print(status)
-    for article in articles:
-        article.verified = status
-        article.save()
diff --git a/misc/media_urls.json b/misc/media_urls.json
deleted file mode 100644
index 158c9df..0000000
--- a/misc/media_urls.json
+++ /dev/null
@@ -1,151 +0,0 @@
-[
-    "https://www.swissinfo.ch/ger/wirtschaft/koennen-ki-und-direkte-demokratie-nebeneinander-bestehen-/47542048",
-    "https://www.zeit.de/2011/33/CH-Oekonophysik",
-    "https://ourworld.unu.edu/en/green-idea-self-organizing-traffic-signals",
-    "https://www.youtube.com/watch?v=-FQD4ie9UYA",
-    "https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum",
-    "https://www.youtube.com/watch?v=upQM4Xzh8zM",
-    "https://www.youtube.com/watch?v=gAkoprZmW4k",
-    "https://www.youtube.com/watch?v=VMzfDVAWXHI&t=1s",
-    "https://www.youtube.com/watch?v=1SwTiIlkndE",
-    "https://www.informatik-aktuell.de/management-und-recht/digitalisierung/digitale-revolution-und-oekonomie-40-quo-vadis.html",
-    "https://www.youtube.com/watch?v=cSvvH0SBFOw",
-    "https://www.linkedin.com/posts/margit-osterloh-24198a104_pl%C3%A4doyer-gegen-sprechverbote-ugcPost-6925702100450480129-K7Dl?utm_source=linkedin_share&utm_medium=member_desktop_web",
-    "https://www.nebelspalter.ch/plaedoyer-gegen-sprechverbote",
-    "https://falling-walls.com/people/dirk-helbing/",
-    "https://digitalsensemaker.podigee.io/3-2-mit-dirk-helbing",
-    "https://www.blick.ch/wirtschaft/musk-als-hueter-der-redefreiheit-eth-experte-sagt-musks-vorhaben-hat-potenzial-aber-id17437811.html",
-    "https://www.trend.at/standpunkte/mit-verantwortung-zukunft-10082300",
-    "https://www.pantarhei.ch/podcast/",
-    "https://ethz.ch/en/industry/industry/news/data/2022/04/intelligent-traffic-lights-for-optimal-traffic-flow.html",
-    "https://ethz.ch/de/wirtschaft/industry/news/data/2022/04/optimaler-verkehrsfluss-mit-intelligenten-ampeln.html",
-    "https://www.spektrum.de/news/die-verschlungenen-wege-der-menschen/1181815",
-    "https://www.pcwelt.de/a/diktatur-4-0-schoene-neue-digitalisierte-welt,3447005",
-    "https://www.nzz.ch/english/cancel-culture-at-eth-a-professor-receives-death-threats-over-a-lecture-slide-ld.1675322",
-    "https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum",
-    "https://www.achgut.com/artikel/ausgestossene_der_woche_prinz_william_als_immaginierter_rassist",
-    "https://www.pinterpolitik.com/in-depth/klaim-big-data-luhut-perlu-diuji/",
-    "https://www.srf.ch/kultur/gesellschaft-religion/eklat-an-der-eth-wenn-ein-angeblicher-schweinevergleich-zur-staatsaffaere-wird",
-    "https://open.spotify.com/episode/6s1icdoplZeNOINvx6ZHTd?si=610a699eba004da2&nd=1",
-    "https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554",
-    "https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554",
-    "https://djmag.com/features/after-astroworld-what-being-done-stop-crowd-crushes-happening-again",
-    "https://prisma-hsg.ch/articles/meine-daten-deine-daten-unsere-daten/",
-    "https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661",
-    "https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916",
-    "https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html",
-    "https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/",
-    "https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un",
-    "https://www.youtube.com/watch?v=fRkCMC3zqSQ",
-    "https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/",
-    "https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/",
-    "https://futurism.com/astroworld-theory-deaths-bodies-fluid",
-    "https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11",
-    "https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169",
-    "https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931",
-    "https://magazine.swissinformatics.org/en/whats-wrong-with-ai/",
-    "https://magazine.swissinformatics.org/en/whats-wrong-with-ai/",
-    "https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm",
-    "https://youtu.be/ptm9zLG2KaE",
-    "https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341",
-    "https://www.springer.com/gp/book/9783642240034",
-    "https://www.springer.com/de/book/9783319908687",
-    "https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/",
-    "https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/",
-    "https://trafo.hypotheses.org/23989",
-    "https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/",
-    "https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/",
-    "https://www.youtube.com/watch?v=gAkoprZmW4k",
-    "https://www.rhein-zeitung.de/region/aus-den-lokalredaktionen/nahe-zeitung_artikel,-peter-flaschels-lebenswerk-hat-die-sozialgeschichte-beeinflusst-_arid,2322161.html",
-    "https://www.blick.ch/wirtschaft/online-boom-ohne-ende-corona-befeuert-die-tech-revolution-id16359910.html",
-    "https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010",
-    "https://www.say.media/article/la-mort-par-algorithme",
-    "https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau",
-    "https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222",
-    "https://kow-berlin.com/exhibitions/illusion-einer-menschenmenge",
-    "https://www.springer.com/gp/book/9783642240034",
-    "https://www.springer.com/de/book/9783319908687",
-    "https://www.politik-kommunikation.de/ressorts/artikel/eine-gefaehrliche-machtasymmetrie-1383558602",
-    "https://www.springer.com/gp/book/9783642240034",
-    "https://www.springer.com/de/book/9783319908687",
-    "https://solutions.hamburg/ethik-und-digitalisierung-nicht-voneinander-getrennt-betrachten/",
-    "https://www.springer.com/gp/book/9783642240034",
-    "https://www.springer.com/de/book/9783319908687",
-    "https://avenue.argusdatainsights.ch/Article/AvenueClip?artikelHash=d14d91ec9a8b4cb0b6bb3012c0cefd8b_27F0B19422F1F03723769C18906AA1EE&artikelDateiId=298862327",
-    "https://www.tagblatt.ch/kultur/grosses-ranking-ihre-stimme-hat-gewicht-das-sind-die-50-profiliertesten-intellektuellen-der-schweiz-ld.2182261",
-    "https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic",
-    "https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic",
-    "https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld",
-    "https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld",
-    "https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/",
-    "https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/",
-    "https://www.technologyreview.com/2010/04/30/204005/europes-plan-to-simulate-the-entire-planet/",
-    "https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html",
-    "https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html",
-    "https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/",
-    "https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761",
-    "https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916",
-    "https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html",
-    "https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661",
-    "https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/",
-    "https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un",
-    "https://www.youtube.com/watch?v=fRkCMC3zqSQ",
-    "https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/",
-    "https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/",
-    "https://futurism.com/astroworld-theory-deaths-bodies-fluid",
-    "https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11",
-    "https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169",
-    "https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931",
-    "https://www.pantarhei.ch/podcast/",
-    "https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni",
-    "https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni",
-    "https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm",
-    "https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/",
-    "https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/",
-    "https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904",
-    "https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904",
-    "https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341",
-    "https://www.springer.com/gp/book/9783642240034",
-    "https://www.springer.com/de/book/9783319908687",
-    "https://trafo.hypotheses.org/23989",
-    "https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/",
-    "https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/",
-    "https://www.youtube.com/watch?v=gAkoprZmW4k",
-    "https://futurium.de/de/gespraech/ranga-yogeshwar-1/ranga-yogeshwar-dirk-helbing-mit-musik-von-till-broenner",
-    "https://www.springer.com/gp/book/9783642240034",
-    "https://www.springer.com/de/book/9783319908687",
-    "https://idw-online.de/en/news113518",
-    "https://blmplus.de/die-digitalcharta-ist-erst-der-anfang-ein-szenario-von-dirk-helbing/",
-    "https://www.risiko-dialog.ch/big-nudging-vom-computer-gelenkt-aber-wohin/",
-    "https://idw-online.de/de/news13986",
-    "https://www.uni-stuttgart.de/presse/archiv/uni-kurier/uk84_85/forschung/fw66.html",
-    "https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/",
-    "https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/",
-    "https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010",
-    "https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau",
-    "https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222",
-    "https://werteundwandel.de/inhalte/d2030-in-aufbruchstimmung-fuer-eine-lebenswerte-zukunft/",
-    "https://www.springer.com/gp/book/9783642240034",
-    "https://www.springer.com/de/book/9783319908687",
-    "https://www.youtube.com/watch?v=n9e77iYZPEY",
-    "https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/",
-    "https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/",
-    "https://www.say.media/article/la-mort-par-algorithme",
-    "https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/",
-    "https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761",
-    "https://www.nesta.org.uk/report/digital-democracy-the-tools-transforming-political-engagement/",
-    "https://www.nature.com/articles/news.2010.351",
-    "https://www.focus.de/panorama/welt/tid-19265/gastkommentar-nutzt-die-moeglichkeiten-des-computers_aid_534372.html",
-    "https://www.theglobalist.com/democracy-technology-innovation-society-internet/",
-    "https://www.theglobalist.com/capitalism-democracy-technology-surveillance-privacy/",
-    "https://www.theglobalist.com/google-artificial-intelligence-big-data-technology-future/",
-    "https://www.theglobalist.com/fascism-big-data-artificial-intelligence-surveillance-democracy/",
-    "https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
-    "https://www.theglobalist.com/technology-society-sustainability-future-humanity/",
-    "https://www.theglobalist.com/society-technology-peace-sustainability/",
-    "https://www.theglobalist.com/democracy-technology-social-media-artificial-intelligence/",
-    "https://www.theglobalist.com/financial-system-reform-economy-internet-of-things-capitalism/",
-    "https://www.theglobalist.com/capitalism-society-equality-sustainability-crowd-funding/",
-    "https://www.theglobalist.com/united-nations-world-government-peace-sustainability-society/",
-    "https://www.theglobalist.com/world-economy-sustainability-environment-society/"
-]
\ No newline at end of file
diff --git a/misc/youtube_batch.py b/misc/youtube_batch.py
deleted file mode 100644
index 155540a..0000000
--- a/misc/youtube_batch.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import youtube_dl
-from waybackpy import WaybackMachineSaveAPI # upload to archive.org
-import time
-
-
-urls = [
-"https://id2020.org",
-"https://www.weforum.org/platforms/the-centre-for-cybersecurity",
-"https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf",
-"https://en.wikipedia.org/wiki/Social_Credit_System",
-"https://en.wikipedia.org/wiki/Customer_lifetime_value",
-"https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance",
-"https://www.un.org/en/about-us/universal-declaration-of-human-rights",
-"https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines",
-"https://www.wired.com/2008/06/pb-theory/",
-"https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/",
-"https://www.bbc.com/news/world-middle-east-52579475",
-"https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/",
-"https://www.delftdesignforvalues.nl",
-"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
-"https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17",
-"https://www.youtube.com/watch?v=_KhAsJRk2lo",
-"https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/",
-"https://climatecitycup.org",
-
-]
-
-def post_download_hook(ret_code):
-    # print(ret_code)
-    if ret_code['status'] == 'finished':
-        file_loc = ret_code["filename"]
-        print(file_loc)
-
-
-def save_video(url):
-    """Saves video accoring to url and save path"""
-    ydl_opts = {
-        'format': 'best[height<=720]',
-        # 'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download
-        'progress_hooks': [post_download_hook],
-        'updatetime': False
-    }
-    try:
-        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([url])
-            # article file name is updated in self.post_download_hook
-    except Exception as e:
-        print(f"Youtube download crashed: {e}")
-
-
-# for i, url in enumerate(urls):
-#     print(f"Downloading video {i+1} / {len(urls)}")
-    # save_video(url)
-
-for i, url in enumerate(urls):
-    print(f"Saving url {i+1} / {len(urls)}")
-    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
-    wayback = WaybackMachineSaveAPI(url, user_agent)
-    archive_url = wayback.save()
-    print(archive_url)
-    time.sleep(20)
diff --git a/news_fetch/Dockerfile b/news_fetch/Dockerfile
index 2f4bb4e..cdba50b 100644
--- a/news_fetch/Dockerfile
+++ b/news_fetch/Dockerfile
@@ -2,10 +2,10 @@ FROM python:latest
 
 ENV TZ Europe/Zurich
 
-RUN mkdir -p /app/auto_news
+RUN mkdir -p /app/news_fetch
 
 COPY requirements.txt /app/requirements.txt
 RUN python3 -m pip install -r /app/requirements.txt
 
-COPY . /app/auto_news
-WORKDIR /app/auto_news
+COPY . /app/news_fetch
+WORKDIR /app/news_fetch
diff --git a/news_fetch/runner.py b/news_fetch/runner.py
index 92a9fdf..1185a52 100644
--- a/news_fetch/runner.py
+++ b/news_fetch/runner.py
@@ -126,13 +126,12 @@ class Dispatcher(Thread):
 
 
 
-    # def manual_processing(self, articles, workers):
-    #     for w in workers:
-    #         w.start()
 
-    #     for article in articles:
-    #         notifier = lambda article: logger.info(f"Completed manual actions for {article}")
-    #         ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg 
+class PrintWorker: # minimal worker: send() only prints the article, keep_alive() blocks forever
+    def send(self, article):
+        print(f"Uploaded article {article}")
+    def keep_alive(self): # keeps script running, because there is nothing else in the main thread
+        while True: sleep(1)
 
 
 
@@ -140,11 +139,6 @@ if __name__ == "__main__":
     dispatcher = Dispatcher()
 
     if "upload" in sys.argv:
-        class PrintWorker:
-            def send(self, article):
-                print(f"Uploaded article {article}")
-            def keep_alive(self): # keeps script running, because there is nothing else in the main thread
-                while True: sleep(1)
 
         articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "" or models.ArticleDownload.archive_url == "TODO:UPLOAD").execute()
         logger.info(f"Launching upload to archive for {len(articles)} articles.")