From e6bfe811d0cd173a3e381af9bf3f751cb5f731ea Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Mon, 24 Oct 2022 17:25:48 +0200 Subject: [PATCH] update nas target, documentation --- config/README.md | 8 + {misc/sample_config => config}/db.config.ini | 0 {env => config/env}/debug | 0 {env => config/env}/production | 0 .../sample_config => config}/nas_login.config | 0 .../sample_config => config}/nas_sync.config | 0 .../news_fetch.config.ini | 2 +- {misc/sample_config => config}/vpn.config | 0 docker-compose.yaml | 2 +- manual/README.md | 7 + manual/batch_archive.py | 21 +++ manual/batch_urls.txt | 18 +++ manual/batch_youtube.py | 33 ++++ {misc => manual}/exctract_from_mail_backup.py | 5 +- {misc => manual}/gather_media_files.py | 24 ++- {misc => manual}/migration.to_postgres.py | 0 misc/hotfix_missed_messages.py | 88 ---------- misc/hotfix_reactions.py | 38 ----- misc/media_urls.json | 151 ------------------ misc/youtube_batch.py | 61 ------- news_fetch/Dockerfile | 6 +- news_fetch/runner.py | 16 +- 22 files changed, 111 insertions(+), 369 deletions(-) create mode 100644 config/README.md rename {misc/sample_config => config}/db.config.ini (100%) rename {env => config/env}/debug (100%) rename {env => config/env}/production (100%) rename {misc/sample_config => config}/nas_login.config (100%) rename {misc/sample_config => config}/nas_sync.config (100%) rename {misc/sample_config => config}/news_fetch.config.ini (92%) rename {misc/sample_config => config}/vpn.config (100%) create mode 100644 manual/README.md create mode 100644 manual/batch_archive.py create mode 100644 manual/batch_urls.txt create mode 100644 manual/batch_youtube.py rename {misc => manual}/exctract_from_mail_backup.py (70%) rename {misc => manual}/gather_media_files.py (67%) rename {misc => manual}/migration.to_postgres.py (100%) delete mode 100644 misc/hotfix_missed_messages.py delete mode 100644 misc/hotfix_reactions.py delete mode 100644 misc/media_urls.json delete mode 100644 misc/youtube_batch.py diff 
--git a/config/README.md b/config/README.md new file mode 100644 index 0000000..cefa4d9 --- /dev/null +++ b/config/README.md @@ -0,0 +1,8 @@ +## Configuration: example +The files inside this directory (not the ones in `env/`) are a sample of the required configuration. + +Please create a copy of these files under `/config/...`. + +> Note: +> +> Some of the fields are blank, please fill them in as needed. \ No newline at end of file diff --git a/misc/sample_config/db.config.ini b/config/db.config.ini similarity index 100% rename from misc/sample_config/db.config.ini rename to config/db.config.ini diff --git a/env/debug b/config/env/debug similarity index 100% rename from env/debug rename to config/env/debug diff --git a/env/production b/config/env/production similarity index 100% rename from env/production rename to config/env/production diff --git a/misc/sample_config/nas_login.config b/config/nas_login.config similarity index 100% rename from misc/sample_config/nas_login.config rename to config/nas_login.config diff --git a/misc/sample_config/nas_sync.config b/config/nas_sync.config similarity index 100% rename from misc/sample_config/nas_sync.config rename to config/nas_sync.config diff --git a/misc/sample_config/news_fetch.config.ini b/config/news_fetch.config.ini similarity index 92% rename from misc/sample_config/news_fetch.config.ini rename to config/news_fetch.config.ini index a220818..24155c2 100644 --- a/misc/sample_config/news_fetch.config.ini +++ b/config/news_fetch.config.ini @@ -25,7 +25,7 @@ db_printout: /app/containerdata/backups local_storage_path: /app/containerdata/files debug_storage_path: /app/containerdata/debug/ default_download_path: /app/containerdata/tmp -remote_storage_path: /helbing_support/Files RM/Archiving +remote_storage_path: /helbing_support/Archiving-Pipeline browser_profile_path: /app/containerdata/dependencies/news_fetch.profile # please keep this exact name browser_print_delay: 3 diff --git a/misc/sample_config/vpn.config 
b/config/vpn.config similarity index 100% rename from misc/sample_config/vpn.config rename to config/vpn.config diff --git a/docker-compose.yaml b/docker-compose.yaml index 8027b99..84fa678 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -28,7 +28,7 @@ services: - ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config - ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config command: - - nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path + - nas22.ethz.ch/gess_coss_1/helbing_support/Archiving-Pipeline # first command is the target mount path - lsyncd - /sync/nas_sync.config diff --git a/manual/README.md b/manual/README.md new file mode 100644 index 0000000..80d6e66 --- /dev/null +++ b/manual/README.md @@ -0,0 +1,7 @@ +### MANUAL TASKS + +The files inside this directory contain scripts for repetitive but somewhat automatable tasks. + +> ⚠️ warning: +> +> Most scripts still require manual intervention before/after running and probably require changes to the code. **Please make sure you understand them before using them!** \ No newline at end of file diff --git a/manual/batch_archive.py b/manual/batch_archive.py new file mode 100644 index 0000000..1d9bdbf --- /dev/null +++ b/manual/batch_archive.py @@ -0,0 +1,21 @@ +""" +Saves websites specified in 'batch_urls.txt' to the wayback machine. Outputs archive urls to terminal +Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file +""" +from waybackpy import WaybackMachineSaveAPI # upload to archive.org +import time + +urls = [] +with open ("batch_urls.txt", "r") as f: + urls = f.readlines() + + + +for i, url in enumerate(urls): + print(f"Saving url {i+1} / {len(urls)}") + user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? 
+ wayback = WaybackMachineSaveAPI(url, user_agent) + archive_url = wayback.save() + print(archive_url) + time.sleep(20) + # Uploads to archive.org are rate limited diff --git a/manual/batch_urls.txt b/manual/batch_urls.txt new file mode 100644 index 0000000..d8aeb8b --- /dev/null +++ b/manual/batch_urls.txt @@ -0,0 +1,18 @@ +https://id2020.org +https://www.weforum.org/platforms/the-centre-for-cybersecurity +https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf +https://en.wikipedia.org/wiki/Social_Credit_System +https://en.wikipedia.org/wiki/Customer_lifetime_value +https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance +https://www.un.org/en/about-us/universal-declaration-of-human-rights +https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines +https://www.wired.com/2008/06/pb-theory/ +https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/ +https://www.bbc.com/news/world-middle-east-52579475 +https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/ +https://www.delftdesignforvalues.nl +https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/ +https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17 +https://www.youtube.com/watch?v=_KhAsJRk2lo +https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/ +https://climatecitycup.org diff --git a/manual/batch_youtube.py b/manual/batch_youtube.py new file mode 100644 index 0000000..de61551 --- /dev/null +++ b/manual/batch_youtube.py @@ -0,0 +1,33 @@ +""" +Saves youtube videos specified in 'batch_urls.txt' to the local folder. 
(to be copied manually) +""" +import youtube_dl + +urls = [] +with open ("batch_urls.txt", "r") as f: + urls = f.readlines() + + +def post_download_hook(ret_code): + if ret_code['status'] == 'finished': + file_loc = ret_code["filename"] + print(file_loc) + + +def save_video(url): + """Saves video according to url and save path""" + ydl_opts = { + 'format': 'best[height<=720]', + 'progress_hooks': [post_download_hook], + 'updatetime': False + } + try: + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + except Exception as e: + print(f"Youtube download crashed: {e}") + + +for i, url in enumerate(urls): + print(f"Downloading video {i+1} / {len(urls)}") + save_video(url) diff --git a/misc/exctract_from_mail_backup.py b/manual/exctract_from_mail_backup.py similarity index 70% rename from misc/exctract_from_mail_backup.py rename to manual/exctract_from_mail_backup.py index a00dc63..f21e8e0 100644 --- a/misc/exctract_from_mail_backup.py +++ b/manual/exctract_from_mail_backup.py @@ -1,3 +1,6 @@ +""" +Extracts all urls from a list of mails exported from Thunderbird. 
Writes to 'mails_url_export.json' +""" import os import re import json @@ -19,5 +22,5 @@ for f in all_files: print("Saved {} urls".format(len(all_urls))) -with open("media_mails_export.json", "w") as f: +with open("mails_url_export.json", "w") as f: json.dump(all_urls, f) \ No newline at end of file diff --git a/misc/gather_media_files.py b/manual/gather_media_files.py similarity index 67% rename from misc/gather_media_files.py rename to manual/gather_media_files.py index a9a2c68..e1d9e0b 100644 --- a/misc/gather_media_files.py +++ b/manual/gather_media_files.py @@ -1,5 +1,8 @@ +""" +Runs the news_fetch pipeline against a manually curated list of urls and saves them locally +""" import sys -sys.path.append("../app") +sys.path.append("../app/news_fetch") import runner import logging logger = logging.getLogger() @@ -11,24 +14,18 @@ console = Console() logger.info("Overwriting production values for single time media-fetch") runner.configuration.models.set_db( - runner.configuration.SqliteDatabase("../.dev/media_message_dummy.db"), # chat_db (not needed here) runner.configuration.SqliteDatabase("../.dev/media_downloads.db") ) runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/" def fetch(): - coordinator = runner.Coordinator() + dispatcher = runner.Dispatcher() + dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}] + dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}] - kwargs = { - "worker_download" : runner.DownloadWorker(), - "worker_fetch" : runner.FetchWorker(), - "worker_upload" : runner.UploadWorker(), - } - - coordinator.add_workers(**kwargs) - coordinator.start() + dispatcher.start() with open("media_urls.json", "r") as f: url_list = json.loads(f.read()) @@ -36,9 +33,8 @@ def fetch(): logger.info(f"Found {len(url_list)} media urls") for u in url_list: msg_text = f"<{u}|dummy preview text>" - dummy_thread = runner.models.Thread() - msg = runner.models.Message(text= 
msg_text, thread=dummy_thread) - coordinator.incoming_request(msg) + dispatcher.incoming_request(msg) + def show(): diff --git a/misc/migration.to_postgres.py b/manual/migration.to_postgres.py similarity index 100% rename from misc/migration.to_postgres.py rename to manual/migration.to_postgres.py diff --git a/misc/hotfix_missed_messages.py b/misc/hotfix_missed_messages.py deleted file mode 100644 index 6cdb69d..0000000 --- a/misc/hotfix_missed_messages.py +++ /dev/null @@ -1,88 +0,0 @@ -import time -import keys -import slack_sdk -from slack_sdk.errors import SlackApiError -from peewee import SqliteDatabase - -from persistence import message_models -# from bot_utils import messages - - - -# Constant values... -MESSAGES_DB = "/app/containerdata/messages.db" - -BOT_ID = "U02MR1R8UJH" -ARCHIVE_ID = "C02MM7YG1V4" -DEBUG_ID = "C02NM2H9J5Q" - - - -client = slack_sdk.WebClient(token=keys.OAUTH_TOKEN) - -message_models.set_db(SqliteDatabase(MESSAGES_DB)) - - -def message_dict_to_model(message): - if message["type"] == "message": - thread_ts = message["thread_ts"] if "thread_ts" in message else message["ts"] - uid = message.get("user", "BAD USER") - user, _ = message_models.User.get_or_create(user_id = uid) - thread, _ = message_models.Thread.get_or_create(thread_ts = thread_ts) - m, new = message_models.Message.get_or_create( - user = user, - thread = thread, - ts = message["ts"], - channel_id = ARCHIVE_ID, - text = message["text"] - ) - print("Saved (text) {} (new={})".format(m, new)) - - for f in message.get("files", []): #default: [] - m.file_type = f["filetype"] - m.perma_link = f["url_private_download"] - m.save() - print("Saved permalink {} to {} (possibly overwriting)".format(f["name"], m)) - if new: - return m - else: - return None - else: - print("What should I do of {}".format(message)) - return None - - -def check_all_past_messages(): - last_ts = 0 - - result = client.conversations_history( - channel=ARCHIVE_ID, - oldest=last_ts - ) - - new_messages = 
result.get("messages", []) # fetches 100 messages by default - - new_fetches = [] - for m in new_messages: - new_fetches.append(message_dict_to_model(m)) - # print(result) - refetch = result.get("has_more", False) - print(f"Refetching : {refetch}") - while refetch: # we have not actually fetched them all - try: - result = client.conversations_history( - channel = ARCHIVE_ID, - cursor = result["response_metadata"]["next_cursor"], - oldest = last_ts - ) # refetches in batches of 100 messages - refetch = result.get("has_more", False) - new_messages = result.get("messages", []) - for m in new_messages: - new_fetches.append(message_dict_to_model(m)) - except SlackApiError: # Most likely a rate-limit - print("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(30)) - time.sleep(30) - refetch = True - - -check_all_past_messages() \ No newline at end of file diff --git a/misc/hotfix_reactions.py b/misc/hotfix_reactions.py deleted file mode 100644 index 5e43b37..0000000 --- a/misc/hotfix_reactions.py +++ /dev/null @@ -1,38 +0,0 @@ -from peewee import SqliteDatabase - -from persistence import article_models, message_models - -# Global logger setup: - - -# Constant values... 
-DOWNLOADS_DB = "../container_data/downloads.db" -MESSAGES_DB = "../container_data/messages.db" - -BOT_ID = "U02MR1R8UJH" -ARCHIVE_ID = "C02MM7YG1V4" -DEBUG_ID = "C02NM2H9J5Q" - - -# DB Setup: -article_models.set_db(SqliteDatabase( - DOWNLOADS_DB, - pragmas = {'journal_mode': 'wal'} # mutliple threads can access at once -)) - -message_models.set_db(SqliteDatabase(MESSAGES_DB)) - - - -for reaction in message_models.Reaction.select(): - print(reaction) - thread = reaction.message.thread - articles = message_models.get_referenced_articles(thread, article_models.ArticleDownload) - for a in articles: - print(a) - reaction = reaction.type - status = 1 if reaction == "white_check_mark" else -1 - print(status) - for article in articles: - article.verified = status - article.save() diff --git a/misc/media_urls.json b/misc/media_urls.json deleted file mode 100644 index 158c9df..0000000 --- a/misc/media_urls.json +++ /dev/null @@ -1,151 +0,0 @@ -[ - "https://www.swissinfo.ch/ger/wirtschaft/koennen-ki-und-direkte-demokratie-nebeneinander-bestehen-/47542048", - "https://www.zeit.de/2011/33/CH-Oekonophysik", - "https://ourworld.unu.edu/en/green-idea-self-organizing-traffic-signals", - "https://www.youtube.com/watch?v=-FQD4ie9UYA", - "https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum", - "https://www.youtube.com/watch?v=upQM4Xzh8zM", - "https://www.youtube.com/watch?v=gAkoprZmW4k", - "https://www.youtube.com/watch?v=VMzfDVAWXHI&t=1s", - "https://www.youtube.com/watch?v=1SwTiIlkndE", - "https://www.informatik-aktuell.de/management-und-recht/digitalisierung/digitale-revolution-und-oekonomie-40-quo-vadis.html", - "https://www.youtube.com/watch?v=cSvvH0SBFOw", - "https://www.linkedin.com/posts/margit-osterloh-24198a104_pl%C3%A4doyer-gegen-sprechverbote-ugcPost-6925702100450480129-K7Dl?utm_source=linkedin_share&utm_medium=member_desktop_web", - "https://www.nebelspalter.ch/plaedoyer-gegen-sprechverbote", - 
"https://falling-walls.com/people/dirk-helbing/", - "https://digitalsensemaker.podigee.io/3-2-mit-dirk-helbing", - "https://www.blick.ch/wirtschaft/musk-als-hueter-der-redefreiheit-eth-experte-sagt-musks-vorhaben-hat-potenzial-aber-id17437811.html", - "https://www.trend.at/standpunkte/mit-verantwortung-zukunft-10082300", - "https://www.pantarhei.ch/podcast/", - "https://ethz.ch/en/industry/industry/news/data/2022/04/intelligent-traffic-lights-for-optimal-traffic-flow.html", - "https://ethz.ch/de/wirtschaft/industry/news/data/2022/04/optimaler-verkehrsfluss-mit-intelligenten-ampeln.html", - "https://www.spektrum.de/news/die-verschlungenen-wege-der-menschen/1181815", - "https://www.pcwelt.de/a/diktatur-4-0-schoene-neue-digitalisierte-welt,3447005", - "https://www.nzz.ch/english/cancel-culture-at-eth-a-professor-receives-death-threats-over-a-lecture-slide-ld.1675322", - "https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum", - "https://www.achgut.com/artikel/ausgestossene_der_woche_prinz_william_als_immaginierter_rassist", - "https://www.pinterpolitik.com/in-depth/klaim-big-data-luhut-perlu-diuji/", - "https://www.srf.ch/kultur/gesellschaft-religion/eklat-an-der-eth-wenn-ein-angeblicher-schweinevergleich-zur-staatsaffaere-wird", - "https://open.spotify.com/episode/6s1icdoplZeNOINvx6ZHTd?si=610a699eba004da2&nd=1", - "https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554", - "https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554", - "https://djmag.com/features/after-astroworld-what-being-done-stop-crowd-crushes-happening-again", - "https://prisma-hsg.ch/articles/meine-daten-deine-daten-unsere-daten/", - "https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661", - "https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916", - 
"https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html", - "https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/", - "https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un", - "https://www.youtube.com/watch?v=fRkCMC3zqSQ", - "https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/", - "https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/", - "https://futurism.com/astroworld-theory-deaths-bodies-fluid", - "https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11", - "https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169", - "https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931", - "https://magazine.swissinformatics.org/en/whats-wrong-with-ai/", - "https://magazine.swissinformatics.org/en/whats-wrong-with-ai/", - "https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm", - "https://youtu.be/ptm9zLG2KaE", - "https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341", - "https://www.springer.com/gp/book/9783642240034", - "https://www.springer.com/de/book/9783319908687", - "https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/", - "https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/", - "https://trafo.hypotheses.org/23989", - "https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/", 
- "https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/", - "https://www.youtube.com/watch?v=gAkoprZmW4k", - "https://www.rhein-zeitung.de/region/aus-den-lokalredaktionen/nahe-zeitung_artikel,-peter-flaschels-lebenswerk-hat-die-sozialgeschichte-beeinflusst-_arid,2322161.html", - "https://www.blick.ch/wirtschaft/online-boom-ohne-ende-corona-befeuert-die-tech-revolution-id16359910.html", - "https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010", - "https://www.say.media/article/la-mort-par-algorithme", - "https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau", - "https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222", - "https://kow-berlin.com/exhibitions/illusion-einer-menschenmenge", - "https://www.springer.com/gp/book/9783642240034", - "https://www.springer.com/de/book/9783319908687", - "https://www.politik-kommunikation.de/ressorts/artikel/eine-gefaehrliche-machtasymmetrie-1383558602", - "https://www.springer.com/gp/book/9783642240034", - "https://www.springer.com/de/book/9783319908687", - "https://solutions.hamburg/ethik-und-digitalisierung-nicht-voneinander-getrennt-betrachten/", - "https://www.springer.com/gp/book/9783642240034", - "https://www.springer.com/de/book/9783319908687", - "https://avenue.argusdatainsights.ch/Article/AvenueClip?artikelHash=d14d91ec9a8b4cb0b6bb3012c0cefd8b_27F0B19422F1F03723769C18906AA1EE&artikelDateiId=298862327", - "https://www.tagblatt.ch/kultur/grosses-ranking-ihre-stimme-hat-gewicht-das-sind-die-50-profiliertesten-intellektuellen-der-schweiz-ld.2182261", - "https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic", - "https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic", - 
"https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld", - "https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld", - "https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/", - "https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/", - "https://www.technologyreview.com/2010/04/30/204005/europes-plan-to-simulate-the-entire-planet/", - "https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html", - "https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html", - "https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/", - "https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761", - "https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916", - "https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html", - "https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661", - "https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/", - "https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un", - "https://www.youtube.com/watch?v=fRkCMC3zqSQ", - "https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/", - "https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/", - "https://futurism.com/astroworld-theory-deaths-bodies-fluid", - "https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11", - "https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169", - 
"https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931", - "https://www.pantarhei.ch/podcast/", - "https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni", - "https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni", - "https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm", - "https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/", - "https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/", - "https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904", - "https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904", - "https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341", - "https://www.springer.com/gp/book/9783642240034", - "https://www.springer.com/de/book/9783319908687", - "https://trafo.hypotheses.org/23989", - "https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/", - "https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/", - "https://www.youtube.com/watch?v=gAkoprZmW4k", - "https://futurium.de/de/gespraech/ranga-yogeshwar-1/ranga-yogeshwar-dirk-helbing-mit-musik-von-till-broenner", - "https://www.springer.com/gp/book/9783642240034", - "https://www.springer.com/de/book/9783319908687", - "https://idw-online.de/en/news113518", - "https://blmplus.de/die-digitalcharta-ist-erst-der-anfang-ein-szenario-von-dirk-helbing/", - "https://www.risiko-dialog.ch/big-nudging-vom-computer-gelenkt-aber-wohin/", - 
"https://idw-online.de/de/news13986", - "https://www.uni-stuttgart.de/presse/archiv/uni-kurier/uk84_85/forschung/fw66.html", - "https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/", - "https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/", - "https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010", - "https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau", - "https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222", - "https://werteundwandel.de/inhalte/d2030-in-aufbruchstimmung-fuer-eine-lebenswerte-zukunft/", - "https://www.springer.com/gp/book/9783642240034", - "https://www.springer.com/de/book/9783319908687", - "https://www.youtube.com/watch?v=n9e77iYZPEY", - "https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/", - "https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/", - "https://www.say.media/article/la-mort-par-algorithme", - "https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/", - "https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761", - "https://www.nesta.org.uk/report/digital-democracy-the-tools-transforming-political-engagement/", - "https://www.nature.com/articles/news.2010.351", - "https://www.focus.de/panorama/welt/tid-19265/gastkommentar-nutzt-die-moeglichkeiten-des-computers_aid_534372.html", - "https://www.theglobalist.com/democracy-technology-innovation-society-internet/", - "https://www.theglobalist.com/capitalism-democracy-technology-surveillance-privacy/", - "https://www.theglobalist.com/google-artificial-intelligence-big-data-technology-future/", - "https://www.theglobalist.com/fascism-big-data-artificial-intelligence-surveillance-democracy/", - 
"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/", - "https://www.theglobalist.com/technology-society-sustainability-future-humanity/", - "https://www.theglobalist.com/society-technology-peace-sustainability/", - "https://www.theglobalist.com/democracy-technology-social-media-artificial-intelligence/", - "https://www.theglobalist.com/financial-system-reform-economy-internet-of-things-capitalism/", - "https://www.theglobalist.com/capitalism-society-equality-sustainability-crowd-funding/", - "https://www.theglobalist.com/united-nations-world-government-peace-sustainability-society/", - "https://www.theglobalist.com/world-economy-sustainability-environment-society/" -] \ No newline at end of file diff --git a/misc/youtube_batch.py b/misc/youtube_batch.py deleted file mode 100644 index 155540a..0000000 --- a/misc/youtube_batch.py +++ /dev/null @@ -1,61 +0,0 @@ -import youtube_dl -from waybackpy import WaybackMachineSaveAPI # upload to archive.org -import time - - -urls = [ -"https://id2020.org", -"https://www.weforum.org/platforms/the-centre-for-cybersecurity", -"https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf", -"https://en.wikipedia.org/wiki/Social_Credit_System", -"https://en.wikipedia.org/wiki/Customer_lifetime_value", -"https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance", -"https://www.un.org/en/about-us/universal-declaration-of-human-rights", -"https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines", -"https://www.wired.com/2008/06/pb-theory/", -"https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/", -"https://www.bbc.com/news/world-middle-east-52579475", -"https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/", -"https://www.delftdesignforvalues.nl", 
-"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/", -"https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17", -"https://www.youtube.com/watch?v=_KhAsJRk2lo", -"https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/", -"https://climatecitycup.org", - -] - -def post_download_hook(ret_code): - # print(ret_code) - if ret_code['status'] == 'finished': - file_loc = ret_code["filename"] - print(file_loc) - - -def save_video(url): - """Saves video accoring to url and save path""" - ydl_opts = { - 'format': 'best[height<=720]', - # 'outtmpl': f"{file_path}.%(ext)s", # basically the filename from the object, but with a custom extension depending on the download - 'progress_hooks': [post_download_hook], - 'updatetime': False - } - try: - with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download([url]) - # article file name is updated in self.post_download_hook - except Exception as e: - print(f"Youtube download crashed: {e}") - - -# for i, url in enumerate(urls): -# print(f"Downloading video {i+1} / {len(urls)}") - # save_video(url) - -for i, url in enumerate(urls): - print(f"Saving url {i+1} / {len(urls)}") - user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed? - wayback = WaybackMachineSaveAPI(url, user_agent) - archive_url = wayback.save() - print(archive_url) - time.sleep(20) diff --git a/news_fetch/Dockerfile b/news_fetch/Dockerfile index 2f4bb4e..cdba50b 100644 --- a/news_fetch/Dockerfile +++ b/news_fetch/Dockerfile @@ -2,10 +2,10 @@ FROM python:latest ENV TZ Europe/Zurich -RUN mkdir -p /app/auto_news +RUN mkdir -p /app/news_fetch COPY requirements.txt /app/requirements.txt RUN python3 -m pip install -r /app/requirements.txt -COPY . /app/auto_news -WORKDIR /app/auto_news +COPY . 
/app/news_fetch +WORKDIR /app/news_fetch diff --git a/news_fetch/runner.py b/news_fetch/runner.py index 92a9fdf..1185a52 100644 --- a/news_fetch/runner.py +++ b/news_fetch/runner.py @@ -126,13 +126,12 @@ class Dispatcher(Thread): - # def manual_processing(self, articles, workers): - # for w in workers: - # w.start() - # for article in articles: - # notifier = lambda article: logger.info(f"Completed manual actions for {article}") - # ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg +class PrintWorker: + def send(self, article): + print(f"Uploaded article {article}") + def keep_alive(self): # keeps script running, because there is nothing else in the main thread + while True: sleep(1) @@ -140,11 +139,6 @@ if __name__ == "__main__": dispatcher = Dispatcher() if "upload" in sys.argv: - class PrintWorker: - def send(self, article): - print(f"Uploaded article {article}") - def keep_alive(self): # keeps script running, because there is nothing else in the main thread - while True: sleep(1) articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "" or models.ArticleDownload.archive_url == "TODO:UPLOAD").execute() logger.info(f"Launching upload to archive for {len(articles)} articles.")