update nas target, documentation

This commit is contained in:
Remy Moll 2022-10-24 17:25:48 +02:00
parent 6c08dec20a
commit e6bfe811d0
22 changed files with 111 additions and 369 deletions

8
config/README.md Normal file
View File

@ -0,0 +1,8 @@
## Configuration: example
The files inside this directory (not the ones in `env/`) are a sample of the required configuration.
Please create a copy of these files under `<location of downloads>/config/...`.
> Note:
>
> Some of the fields are blank, please fill them in as needed.

View File

View File

@ -25,7 +25,7 @@ db_printout: /app/containerdata/backups
local_storage_path: /app/containerdata/files
debug_storage_path: /app/containerdata/debug/
default_download_path: /app/containerdata/tmp
remote_storage_path: /helbing_support/Files RM/Archiving
remote_storage_path: /helbing_support/Archiving-Pipeline
browser_profile_path: /app/containerdata/dependencies/news_fetch.profile
# please keep this exact name
browser_print_delay: 3

View File

@ -28,7 +28,7 @@ services:
- ${CONTAINER_DATA}/config/nas_sync.config:/sync/nas_sync.config
- ${CONTAINER_DATA}/config/nas_login.config:/sync/nas_login.config
command:
- nas22.ethz.ch/gess_coss_1/helbing_support/Files RM/Archiving/TEST # first command is the target mount path
- nas22.ethz.ch/gess_coss_1/helbing_support/Archiving-Pipeline # first command is the target mount path
- lsyncd
- /sync/nas_sync.config

7
manual/README.md Normal file
View File

@ -0,0 +1,7 @@
### MANUAL TASKS
The files inside this directory contain scripts for repetitive but somewhat automatable tasks.
> ⚠️ warning:
>
> Most scripts still require manual intervention before/after running and probably require changes to the code. **Please make sure you understand them before using them!**

21
manual/batch_archive.py Normal file
View File

@ -0,0 +1,21 @@
"""
Saves websites specified in 'batch_urls.txt' to the wayback machine. Outputs archive urls to terminal
Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file
"""
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import time
# Read the target URLs, one per line. Each line keeps its trailing newline when
# read from the file, so strip it and drop blank lines before handing the URL
# to the Wayback Machine API.
with open("batch_urls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

# The user agent is the same for every request, so build it once.
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?

for i, url in enumerate(urls):
    print(f"Saving url {i+1} / {len(urls)}")
    wayback = WaybackMachineSaveAPI(url, user_agent)
    archive_url = wayback.save()
    print(archive_url)
    # Uploads to archive.org are rate limited, so pause between requests
    time.sleep(20)

18
manual/batch_urls.txt Normal file
View File

@ -0,0 +1,18 @@
https://id2020.org
https://www.weforum.org/platforms/the-centre-for-cybersecurity
https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf
https://en.wikipedia.org/wiki/Social_Credit_System
https://en.wikipedia.org/wiki/Customer_lifetime_value
https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance
https://www.un.org/en/about-us/universal-declaration-of-human-rights
https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines
https://www.wired.com/2008/06/pb-theory/
https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/
https://www.bbc.com/news/world-middle-east-52579475
https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/
https://www.delftdesignforvalues.nl
https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/
https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17
https://www.youtube.com/watch?v=_KhAsJRk2lo
https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/
https://climatecitycup.org

33
manual/batch_youtube.py Normal file
View File

@ -0,0 +1,33 @@
"""
Saves youtube videos specified in 'batch_urls.txt' to the local folder. (to be copied manually)
"""
import youtube_dl
# Read the target URLs, one per line. Each line keeps its trailing newline when
# read from the file, so strip it and drop blank lines before downloading.
with open("batch_urls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]
def post_download_hook(ret_code):
    """Print the file name reported by a progress update once it is finished.

    `ret_code` is the status dict handed to the progress hook. Updates whose
    'status' is anything other than 'finished' are ignored.
    """
    if ret_code['status'] != 'finished':
        return
    print(ret_code["filename"])
def save_video(url):
    """Download the video behind `url` in the best format up to 720 px height.

    Any exception raised while downloading is caught and printed instead of
    being propagated to the caller.
    """
    options = dict(
        format='best[height<=720]',
        progress_hooks=[post_download_hook],
        updatetime=False,
    )
    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            downloader.download([url])
    except Exception as e:
        print(f"Youtube download crashed: {e}")
# Download every listed URL in order, printing 1-based progress before each one.
for i, url in enumerate(urls):
    print(f"Downloading video {i+1} / {len(urls)}")
    save_video(url)

View File

@ -1,3 +1,6 @@
"""
Extracts all URLs from a list of mails exported from Thunderbird. Writes to 'mails_url_export.json'
"""
import os
import re
import json
@ -19,5 +22,5 @@ for f in all_files:
print("Saved {} urls".format(len(all_urls)))
with open("media_mails_export.json", "w") as f:
with open("mails_url_export.json", "w") as f:
json.dump(all_urls, f)

View File

@ -1,5 +1,8 @@
"""
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
"""
import sys
sys.path.append("../app")
sys.path.append("../app/news_fetch")
import runner
import logging
logger = logging.getLogger()
@ -11,24 +14,18 @@ console = Console()
logger.info("Overwriting production values for single time media-fetch")
runner.configuration.models.set_db(
runner.configuration.SqliteDatabase("../.dev/media_message_dummy.db"), # chat_db (not needed here)
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
def fetch():
coordinator = runner.Coordinator()
dispatcher = runner.Dispatcher()
dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
kwargs = {
"worker_download" : runner.DownloadWorker(),
"worker_fetch" : runner.FetchWorker(),
"worker_upload" : runner.UploadWorker(),
}
coordinator.add_workers(**kwargs)
coordinator.start()
dispatcher.start()
with open("media_urls.json", "r") as f:
url_list = json.loads(f.read())
@ -36,9 +33,8 @@ def fetch():
logger.info(f"Found {len(url_list)} media urls")
for u in url_list:
msg_text = f"<{u}|dummy preview text>"
dummy_thread = runner.models.Thread()
msg = runner.models.Message(text= msg_text, thread=dummy_thread)
coordinator.incoming_request(msg)
dispatcher.incoming_request(msg)
def show():

View File

@ -1,88 +0,0 @@
import time
import keys
import slack_sdk
from slack_sdk.errors import SlackApiError
from peewee import SqliteDatabase
from persistence import message_models
# from bot_utils import messages
# Constant values...
MESSAGES_DB = "/app/containerdata/messages.db"
BOT_ID = "U02MR1R8UJH"
ARCHIVE_ID = "C02MM7YG1V4"
DEBUG_ID = "C02NM2H9J5Q"
client = slack_sdk.WebClient(token=keys.OAUTH_TOKEN)
message_models.set_db(SqliteDatabase(MESSAGES_DB))
def message_dict_to_model(message):
    """Store a message dict in the messages database.

    Returns the stored `Message` row when it was newly created. Returns None
    when the row already existed or when `message["type"]` is not "message".
    Each entry in `message["files"]` overwrites the row's file type and
    permalink, so only the last file's values remain.
    """
    if message["type"] != "message":
        print("What should I do of {}".format(message))
        return None

    # A message that is not part of a thread acts as its own thread root.
    if "thread_ts" in message:
        thread_ts = message["thread_ts"]
    else:
        thread_ts = message["ts"]

    uid = message.get("user", "BAD USER")
    user, _ = message_models.User.get_or_create(user_id=uid)
    thread, _ = message_models.Thread.get_or_create(thread_ts=thread_ts)
    record, is_new = message_models.Message.get_or_create(
        user=user,
        thread=thread,
        ts=message["ts"],
        channel_id=ARCHIVE_ID,
        text=message["text"],
    )
    print("Saved (text) {} (new={})".format(record, is_new))

    for attachment in message.get("files", []):
        record.file_type = attachment["filetype"]
        record.perma_link = attachment["url_private_download"]
        record.save()
        print("Saved permalink {} to {} (possibly overwriting)".format(attachment["name"], record))

    return record if is_new else None
def check_all_past_messages():
    """Fetch the history of the archive channel and store every message.

    The first request starts at timestamp 0. Further pages are requested via
    the response's `next_cursor` for as long as it reports `has_more`. A
    `SlackApiError` while paging is assumed to be a rate limit: the function
    waits 30 seconds and tries again.
    """
    last_ts = 0
    new_fetches = []

    # First page (100 messages by default).
    result = client.conversations_history(channel=ARCHIVE_ID, oldest=last_ts)
    new_fetches.extend(
        message_dict_to_model(entry) for entry in result.get("messages", [])
    )

    refetch = result.get("has_more", False)
    print(f"Refetching : {refetch}")

    while refetch:  # there are still pages left to fetch
        try:
            # Subsequent pages arrive in batches of 100 messages.
            result = client.conversations_history(
                channel=ARCHIVE_ID,
                cursor=result["response_metadata"]["next_cursor"],
                oldest=last_ts,
            )
            refetch = result.get("has_more", False)
            new_fetches.extend(
                message_dict_to_model(entry) for entry in result.get("messages", [])
            )
        except SlackApiError:  # Most likely a rate-limit
            print("Error while fetching channel messages. (likely rate limit) Retrying in {} seconds...".format(30))
            time.sleep(30)
            refetch = True
check_all_past_messages()

View File

@ -1,38 +0,0 @@
from peewee import SqliteDatabase
from persistence import article_models, message_models
# Global logger setup:
# Constant values...
DOWNLOADS_DB = "../container_data/downloads.db"
MESSAGES_DB = "../container_data/messages.db"
BOT_ID = "U02MR1R8UJH"
ARCHIVE_ID = "C02MM7YG1V4"
DEBUG_ID = "C02NM2H9J5Q"
# DB Setup:
article_models.set_db(SqliteDatabase(
DOWNLOADS_DB,
pragmas = {'journal_mode': 'wal'} # mutliple threads can access at once
))
message_models.set_db(SqliteDatabase(MESSAGES_DB))
# For every stored reaction, mark the articles referenced in the reaction's
# thread as verified (1) or rejected (-1).
for reaction in message_models.Reaction.select():
    print(reaction)
    thread = reaction.message.thread
    articles = message_models.get_referenced_articles(thread, article_models.ArticleDownload)
    for a in articles:
        print(a)
    # NOTE: rebinds the loop variable to the reaction's type string
    reaction = reaction.type
    # "white_check_mark" counts as approval, any other reaction as rejection
    status = 1 if reaction == "white_check_mark" else -1
    print(status)
    for article in articles:
        article.verified = status
        article.save()

View File

@ -1,151 +0,0 @@
[
"https://www.swissinfo.ch/ger/wirtschaft/koennen-ki-und-direkte-demokratie-nebeneinander-bestehen-/47542048",
"https://www.zeit.de/2011/33/CH-Oekonophysik",
"https://ourworld.unu.edu/en/green-idea-self-organizing-traffic-signals",
"https://www.youtube.com/watch?v=-FQD4ie9UYA",
"https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum",
"https://www.youtube.com/watch?v=upQM4Xzh8zM",
"https://www.youtube.com/watch?v=gAkoprZmW4k",
"https://www.youtube.com/watch?v=VMzfDVAWXHI&t=1s",
"https://www.youtube.com/watch?v=1SwTiIlkndE",
"https://www.informatik-aktuell.de/management-und-recht/digitalisierung/digitale-revolution-und-oekonomie-40-quo-vadis.html",
"https://www.youtube.com/watch?v=cSvvH0SBFOw",
"https://www.linkedin.com/posts/margit-osterloh-24198a104_pl%C3%A4doyer-gegen-sprechverbote-ugcPost-6925702100450480129-K7Dl?utm_source=linkedin_share&utm_medium=member_desktop_web",
"https://www.nebelspalter.ch/plaedoyer-gegen-sprechverbote",
"https://falling-walls.com/people/dirk-helbing/",
"https://digitalsensemaker.podigee.io/3-2-mit-dirk-helbing",
"https://www.blick.ch/wirtschaft/musk-als-hueter-der-redefreiheit-eth-experte-sagt-musks-vorhaben-hat-potenzial-aber-id17437811.html",
"https://www.trend.at/standpunkte/mit-verantwortung-zukunft-10082300",
"https://www.pantarhei.ch/podcast/",
"https://ethz.ch/en/industry/industry/news/data/2022/04/intelligent-traffic-lights-for-optimal-traffic-flow.html",
"https://ethz.ch/de/wirtschaft/industry/news/data/2022/04/optimaler-verkehrsfluss-mit-intelligenten-ampeln.html",
"https://www.spektrum.de/news/die-verschlungenen-wege-der-menschen/1181815",
"https://www.pcwelt.de/a/diktatur-4-0-schoene-neue-digitalisierte-welt,3447005",
"https://www.nzz.ch/english/cancel-culture-at-eth-a-professor-receives-death-threats-over-a-lecture-slide-ld.1675322",
"https://www.brandeins.de/corporate-services/mck-wissen/mck-wissen-logistik/schwaermen-fuer-das-optimum",
"https://www.achgut.com/artikel/ausgestossene_der_woche_prinz_william_als_immaginierter_rassist",
"https://www.pinterpolitik.com/in-depth/klaim-big-data-luhut-perlu-diuji/",
"https://www.srf.ch/kultur/gesellschaft-religion/eklat-an-der-eth-wenn-ein-angeblicher-schweinevergleich-zur-staatsaffaere-wird",
"https://open.spotify.com/episode/6s1icdoplZeNOINvx6ZHTd?si=610a699eba004da2&nd=1",
"https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554",
"https://www.nzz.ch/schweiz/shitstorm-an-der-eth-ein-professor-erhaelt-morddrohungen-ld.1673554",
"https://djmag.com/features/after-astroworld-what-being-done-stop-crowd-crushes-happening-again",
"https://prisma-hsg.ch/articles/meine-daten-deine-daten-unsere-daten/",
"https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661",
"https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916",
"https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html",
"https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/",
"https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un",
"https://www.youtube.com/watch?v=fRkCMC3zqSQ",
"https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/",
"https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/",
"https://futurism.com/astroworld-theory-deaths-bodies-fluid",
"https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11",
"https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169",
"https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931",
"https://magazine.swissinformatics.org/en/whats-wrong-with-ai/",
"https://magazine.swissinformatics.org/en/whats-wrong-with-ai/",
"https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm",
"https://youtu.be/ptm9zLG2KaE",
"https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/",
"https://technikjournal.de/2017/08/02/ein-plaedoyer-fuer-die-digitale-demokratie/",
"https://trafo.hypotheses.org/23989",
"https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/",
"https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/",
"https://www.youtube.com/watch?v=gAkoprZmW4k",
"https://www.rhein-zeitung.de/region/aus-den-lokalredaktionen/nahe-zeitung_artikel,-peter-flaschels-lebenswerk-hat-die-sozialgeschichte-beeinflusst-_arid,2322161.html",
"https://www.blick.ch/wirtschaft/online-boom-ohne-ende-corona-befeuert-die-tech-revolution-id16359910.html",
"https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010",
"https://www.say.media/article/la-mort-par-algorithme",
"https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau",
"https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222",
"https://kow-berlin.com/exhibitions/illusion-einer-menschenmenge",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://www.politik-kommunikation.de/ressorts/artikel/eine-gefaehrliche-machtasymmetrie-1383558602",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://solutions.hamburg/ethik-und-digitalisierung-nicht-voneinander-getrennt-betrachten/",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://avenue.argusdatainsights.ch/Article/AvenueClip?artikelHash=d14d91ec9a8b4cb0b6bb3012c0cefd8b_27F0B19422F1F03723769C18906AA1EE&artikelDateiId=298862327",
"https://www.tagblatt.ch/kultur/grosses-ranking-ihre-stimme-hat-gewicht-das-sind-die-50-profiliertesten-intellektuellen-der-schweiz-ld.2182261",
"https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic",
"https://reliefweb.int/report/world/building-multisystemic-understanding-societal-resilience-covid-19-pandemic",
"https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld",
"https://www.events.at/e/wie-wir-in-zukunft-leben-wollen-die-stadt-als-datenfeld",
"https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/",
"https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/",
"https://www.technologyreview.com/2010/04/30/204005/europes-plan-to-simulate-the-entire-planet/",
"https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html",
"https://komentare.sme.sk/c/22543617/smrt-podla-algoritmu.html",
"https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/",
"https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761",
"https://www.20min.ch/story/roboter-fuer-hunde-machen-wenig-sinn-647302764916",
"https://www.wienerzeitung.at/nachrichten/wissen/mensch/942890-Roboter-als-Praesidentschaftskandidaten.html",
"https://www.srf.ch/audio/focus/zukunftsforscher-dirk-helbing-die-welt-ist-keine-maschine?id=10756661",
"https://disruptors.fm/11-building-a-crystal-ball-of-the-world-unseating-capitalism-and-creating-a-new-world-order-with-prof-dirk-helbing/",
"https://www.spreaker.com/user/disruptorsfm/11-building-crystal-ball-of-the-world-un",
"https://www.youtube.com/watch?v=fRkCMC3zqSQ",
"https://arstechnica.com/science/2021/11/what-the-physics-of-crowds-can-tell-us-about-the-tragic-deaths-at-astroworld/",
"https://www.fox23.com/news/trending/astroworld-festival-big-crowds-can-flow-like-liquid-with-terrifying-results/37QH6Q4RGFELHGCZSZTBV46STU/",
"https://futurism.com/astroworld-theory-deaths-bodies-fluid",
"https://www.businessinsider.com/why-people-died-astroworld-crowd-crush-physics-fluid-dynamics-2021-11",
"https://theconversation.com/ten-tips-for-surviving-a-crowd-crush-112169",
"https://www.limmattalerzeitung.ch/basel/das-wort-zum-tag-kopie-von-4-januar-hypotenuse-schlaegt-kathete-trivia-trampel-pandemie-ld.2233931",
"https://www.pantarhei.ch/podcast/",
"https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni",
"https://www.focus.it/scienza/scienze/folla-fisica-modelli-simulazioni",
"https://www.netkwesties.nl/1541/wrr-ai-wordt-de-verbrandingsmotor-van.htm",
"https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/",
"https://www.transformationbeats.com/de/transformation/digitale-gesellschaft/",
"https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904",
"https://www.suedkurier.de/ueberregional/wirtschaft/Wie-uns-der-Staat-heimlich-erzieht-sogar-auf-dem-Klo;art416,8763904",
"https://www.deutschlandfunkkultur.de/die-zukunft-der-demokratie-mehr-teilhabe-von-unten-wagen.976.de.html?dram:article_id=468341",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://trafo.hypotheses.org/23989",
"https://web.archive.org/web/20200609053329/https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/the-corona-crisis-reveals-the-struggle-for-a-sustainable-digital-future/",
"https://www.wiko-berlin.de/institution/projekte-kooperationen/projekte/working-futures/wiko-briefs-working-futures-in-corona-times/",
"https://www.youtube.com/watch?v=gAkoprZmW4k",
"https://futurium.de/de/gespraech/ranga-yogeshwar-1/ranga-yogeshwar-dirk-helbing-mit-musik-von-till-broenner",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://idw-online.de/en/news113518",
"https://blmplus.de/die-digitalcharta-ist-erst-der-anfang-ein-szenario-von-dirk-helbing/",
"https://www.risiko-dialog.ch/big-nudging-vom-computer-gelenkt-aber-wohin/",
"https://idw-online.de/de/news13986",
"https://www.uni-stuttgart.de/presse/archiv/uni-kurier/uk84_85/forschung/fw66.html",
"https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/",
"https://www.infosperber.ch/medien/trends/rankings-oft-unbrauchbar-so-oder-so-aber-immer-schadlich/",
"https://www.nzz.ch/meinung/china-unterwirft-tech-und-social-media-das-geht-auch-europa-an-ld.1643010",
"https://www.suedostschweiz.ch/aus-dem-leben/2021-08-14/stau-ist-nicht-gleich-stau",
"https://www.swissinfo.ch/eng/directdemocracy/political-perspectives_digital-democracy--too-risky--or-the-chance-of-a-generation-/43836222",
"https://werteundwandel.de/inhalte/d2030-in-aufbruchstimmung-fuer-eine-lebenswerte-zukunft/",
"https://www.springer.com/gp/book/9783642240034",
"https://www.springer.com/de/book/9783319908687",
"https://www.youtube.com/watch?v=n9e77iYZPEY",
"https://greennetproject.org/en/2018/11/27/prof-dirk-helbing-es-braucht-vor-allem-tolle-ideen-in-die-sich-die-leute-verlieben/",
"https://www.hpcwire.com/2011/05/06/simulating_society_at_the_global_scale/",
"https://www.say.media/article/la-mort-par-algorithme",
"https://www.confidencial.com.ni/opinion/muerte-por-algoritmo/",
"https://www.nzz.ch/panorama/wie-kann-eine-massenpanik-verhindert-werden-ld.1614761",
"https://www.nesta.org.uk/report/digital-democracy-the-tools-transforming-political-engagement/",
"https://www.nature.com/articles/news.2010.351",
"https://www.focus.de/panorama/welt/tid-19265/gastkommentar-nutzt-die-moeglichkeiten-des-computers_aid_534372.html",
"https://www.theglobalist.com/democracy-technology-innovation-society-internet/",
"https://www.theglobalist.com/capitalism-democracy-technology-surveillance-privacy/",
"https://www.theglobalist.com/google-artificial-intelligence-big-data-technology-future/",
"https://www.theglobalist.com/fascism-big-data-artificial-intelligence-surveillance-democracy/",
"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
"https://www.theglobalist.com/technology-society-sustainability-future-humanity/",
"https://www.theglobalist.com/society-technology-peace-sustainability/",
"https://www.theglobalist.com/democracy-technology-social-media-artificial-intelligence/",
"https://www.theglobalist.com/financial-system-reform-economy-internet-of-things-capitalism/",
"https://www.theglobalist.com/capitalism-society-equality-sustainability-crowd-funding/",
"https://www.theglobalist.com/united-nations-world-government-peace-sustainability-society/",
"https://www.theglobalist.com/world-economy-sustainability-environment-society/"
]

View File

@ -1,61 +0,0 @@
import youtube_dl
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import time
urls = [
"https://id2020.org",
"https://www.weforum.org/platforms/the-centre-for-cybersecurity",
"https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf",
"https://en.wikipedia.org/wiki/Social_Credit_System",
"https://en.wikipedia.org/wiki/Customer_lifetime_value",
"https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance",
"https://www.un.org/en/about-us/universal-declaration-of-human-rights",
"https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines",
"https://www.wired.com/2008/06/pb-theory/",
"https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/",
"https://www.bbc.com/news/world-middle-east-52579475",
"https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/",
"https://www.delftdesignforvalues.nl",
"https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/",
"https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17",
"https://www.youtube.com/watch?v=_KhAsJRk2lo",
"https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/",
"https://climatecitycup.org",
]
def post_download_hook(ret_code):
    """Print the file name reported by a progress update once it is finished.

    `ret_code` is the status dict handed to the progress hook. Updates whose
    'status' is anything other than 'finished' are ignored.
    """
    if ret_code['status'] != 'finished':
        return
    print(ret_code["filename"])
def save_video(url):
    """Download the video behind `url` in the best format up to 720 px height.

    No output template is configured, so the downloader picks the file name.
    Any exception raised while downloading is caught and printed instead of
    being propagated to the caller.
    """
    options = dict(
        format='best[height<=720]',
        progress_hooks=[post_download_hook],
        updatetime=False,
    )
    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            downloader.download([url])
    except Exception as e:
        print(f"Youtube download crashed: {e}")
# for i, url in enumerate(urls):
# print(f"Downloading video {i+1} / {len(urls)}")
# save_video(url)
# Submit every URL to the Wayback Machine and print the resulting archive URL.
for i, url in enumerate(urls):
    print(f"Saving url {i+1} / {len(urls)}")
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
    archive_url = WaybackMachineSaveAPI(url, user_agent).save()
    print(archive_url)
    # pause between submissions
    time.sleep(20)

View File

@ -2,10 +2,10 @@ FROM python:latest
ENV TZ Europe/Zurich
RUN mkdir -p /app/auto_news
RUN mkdir -p /app/news_fetch
COPY requirements.txt /app/requirements.txt
RUN python3 -m pip install -r /app/requirements.txt
COPY . /app/auto_news
WORKDIR /app/auto_news
COPY . /app/news_fetch
WORKDIR /app/news_fetch

View File

@ -126,13 +126,12 @@ class Dispatcher(Thread):
# def manual_processing(self, articles, workers):
# for w in workers:
# w.start()
# for article in articles:
# notifier = lambda article: logger.info(f"Completed manual actions for {article}")
# ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
class PrintWorker:
def send(self, article):
print(f"Uploaded article {article}")
def keep_alive(self): # keeps script running, because there is nothing else in the main thread
while True: sleep(1)
@ -140,11 +139,6 @@ if __name__ == "__main__":
dispatcher = Dispatcher()
if "upload" in sys.argv:
class PrintWorker:
def send(self, article):
print(f"Uploaded article {article}")
def keep_alive(self): # keeps script running, because there is nothing else in the main thread
while True: sleep(1)
# Python's `or` cannot be overloaded: it treats both comparisons as booleans and
# would hand only one of them to `where()`. Combine the two conditions with
# peewee's `|` operator (each side parenthesised) so both values are matched.
articles = models.ArticleDownload.select().where(
    (models.ArticleDownload.archive_url == "")
    | (models.ArticleDownload.archive_url == "TODO:UPLOAD")
).execute()
logger.info(f"Launching upload to archive for {len(articles)} articles.")