update nas target, documentation

2022-10-24 17:25:48 +02:00
parent 6c08dec20a
commit e6bfe811d0
22 changed files with 111 additions and 369 deletions

manual/README.md Normal file

@@ -0,0 +1,7 @@
### MANUAL TASKS
This directory contains scripts for repetitive but only partially automatable tasks.
> ⚠️ warning:
>
> Most scripts still require manual intervention before/after running and will likely need changes to the code. **Please make sure you understand them before using them!**

manual/batch_archive.py Normal file

@@ -0,0 +1,21 @@
"""
Saves websites specified in 'batch_urls.txt' to the wayback machine. Outputs archive urls to terminal
Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file
"""
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
import time
urls = []
with open ("batch_urls.txt", "r") as f:
urls = f.readlines()
for i, url in enumerate(urls):
print(f"Saving url {i+1} / {len(urls)}")
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
wayback = WaybackMachineSaveAPI(url, user_agent)
archive_url = wayback.save()
print(archive_url)
time.sleep(20)
# Uploads to archive.org are rate limited
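Because of that rate limit, a single failed save() currently aborts the whole batch. A minimal retry sketch (the retry count and delays are assumptions, not part of the script above) could wrap the save call:

# hypothetical helper: retry a save with growing delays instead of crashing the batch
def save_with_retry(url, user_agent, retries=3, base_delay=20):
    for attempt in range(retries):
        try:
            return WaybackMachineSaveAPI(url, user_agent).save()
        except Exception as e:  # waybackpy raises on failed or throttled saves
            wait = base_delay * (attempt + 1)  # back off a little more each time
            print(f"Save failed ({e}), retrying in {wait}s")
            time.sleep(wait)
    return None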

manual/batch_urls.txt Normal file

@@ -0,0 +1,18 @@
https://id2020.org
https://www.weforum.org/platforms/the-centre-for-cybersecurity
https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf
https://en.wikipedia.org/wiki/Social_Credit_System
https://en.wikipedia.org/wiki/Customer_lifetime_value
https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance
https://www.un.org/en/about-us/universal-declaration-of-human-rights
https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines
https://www.wired.com/2008/06/pb-theory/
https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/
https://www.bbc.com/news/world-middle-east-52579475
https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/
https://www.delftdesignforvalues.nl
https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/
https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17
https://www.youtube.com/watch?v=_KhAsJRk2lo
https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/
https://climatecitycup.org

manual/batch_youtube.py Normal file

@@ -0,0 +1,33 @@
"""
Saves youtube videos specified in 'batch_urls.txt' to the local folder. (to be copied manually)
"""
import youtube_dl
urls = []
with open ("batch_urls.txt", "r") as f:
urls = f.readlines()
def post_download_hook(ret_code):
if ret_code['status'] == 'finished':
file_loc = ret_code["filename"]
print(file_loc)
def save_video(url):
"""Saves video accoring to url and save path"""
ydl_opts = {
'format': 'best[height<=720]',
'progress_hooks': [post_download_hook],
'updatetime': False
}
try:
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
except Exception as e:
print(f"Youtube download crashed: {e}")
for i, url in enumerate(urls):
print(f"Downloading video {i+1} / {len(urls)}")
save_video(url)
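youtube_dl writes into the current working directory with its default file naming; if the manual copy step needs predictable paths, an 'outtmpl' entry can be added to ydl_opts. The template below is only an illustration, not what the script above uses:

# example only: pin the output location and naming to simplify the manual copy step
ydl_opts = {
    'format': 'best[height<=720]',
    'progress_hooks': [post_download_hook],
    'updatetime': False,
    'outtmpl': 'downloads/%(title)s-%(id)s.%(ext)s',  # hypothetical target path
}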


@@ -0,0 +1,26 @@
"""
Extracts all urls from a list of mails exported from thunderbird. Writes to 'mails_url_export.json'
"""
import os
import re
import json
os.chdir("/home/remy/Downloads/mails/")
regex = "(?P<url>https?://[^\s]+)"
all_files = os.listdir(".")
all_urls = []
for f in all_files:
with open(f, "r", encoding="utf8") as mail:
content = mail.readlines()
search = "".join(content)
urls = re.findall(regex, search)
all_urls += urls
print("Saved {} urls".format(len(all_urls)))
with open("mails_url_export.json", "w") as f:
json.dump(all_urls, f)
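The export can then feed the batch scripts above; a one-off conversion to the 'batch_urls.txt' format (deduplicating on the way) might look like this:

# hypothetical glue step: turn mails_url_export.json into the one-url-per-line
# batch_urls.txt consumed by batch_archive.py and batch_youtube.py
import json

with open("mails_url_export.json", "r") as f:
    urls = json.load(f)
with open("batch_urls.txt", "w") as f:
    f.write("\n".join(sorted(set(urls))))  # deduplicate and sort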


@@ -0,0 +1,67 @@
"""
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
"""
import sys
sys.path.append("../app/news_fetch")
import runner
import logging
logger = logging.getLogger()
import json
from rich.console import Console
from rich.table import Table
console = Console()
logger.info("Overwriting production values for single time media-fetch")
runner.configuration.models.set_db(
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
def fetch():
dispatcher = runner.Dispatcher()
dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
dispatcher.start()
with open("media_urls.json", "r") as f:
url_list = json.loads(f.read())
logger.info(f"Found {len(url_list)} media urls")
for u in url_list:
msg_text = f"<{u}|dummy preview text>"
dispatcher.incoming_request(msg)
def show():
t = Table(
title = "ArticleDownloads",
row_styles = ["white", "bright_black"],
)
entries = ["title", "article_url", "archive_url", "authors"]
for e in entries:
t.add_column(e, justify = "right")
sel = runner.models.ArticleDownload.select()
for s in sel:
c = [getattr(s, e) for e in entries]#
c[-1] = str([a.author for a in c[-1]])
print(c)
t.add_row(*c)
console.print(t)
# fetch()
show()


@@ -0,0 +1,170 @@
import datetime
import os
import sys
sys.path.append("../news_fetch/")
import configuration  # lives in app
from peewee import *

old_db = SqliteDatabase("/app/containerdata/downloads.db")

cred = configuration.db_config["DATABASE"]
download_db = PostgresqlDatabase(
    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
)

## OLD Models
class OLDModel(Model):
    class Meta:
        database = old_db

class OLDArticleDownload(OLDModel):
    class Meta:
        db_table = 'articledownload'

    title = CharField(default='')
    pub_date = DateField(default='')
    download_date = DateField(default=0)
    source_name = CharField(default='')
    article_url = TextField(default='', unique=True)
    archive_url = TextField(default='')
    file_name = TextField(default='')
    language = CharField(default='')
    summary = TextField(default='')
    comment = TextField(default='')
    verified = IntegerField(default=False)
    # authors, keywords, ... are added through foreignkeys

class OLDArticleAuthor(OLDModel):
    class Meta:
        db_table = 'articleauthor'

    article = ForeignKeyField(OLDArticleDownload, backref='authors')
    author = CharField()

class OLDArticleRelated(OLDModel):
    class Meta:
        db_table = 'articlerelated'

    article = ForeignKeyField(OLDArticleDownload, backref='related')
    related_file_name = TextField(default='')

## NEW Models
class NEWModel(Model):
    class Meta:
        database = download_db

class ArticleDownload(NEWModel):
    # in the beginning this is all we have
    article_url = TextField(default='', unique=True)
    # fetch then fills in the metadata
    title = TextField(default='')
    summary = TextField(default='')
    source_name = CharField(default='')
    language = CharField(default='')
    file_name = TextField(default='')
    archive_url = TextField(default='')
    pub_date = DateField(default='')
    download_date = DateField(default=0)
    slack_ts = FloatField(default=0)  # should be a fixed-length string but float is easier to sort by
    sent = BooleanField(default=False)
    archived_by = CharField(default=os.getenv("UNAME"))
    # we need to know who saved the message because the file needs to be on their computer in order to get verified
    # verification happens in a different app, but the model has the fields here as well
    comment = TextField(default='')
    verified = IntegerField(default=0)  # 0 = not verified, 1 = verified, -1 = marked as bad

    def set_authors(self, authors):
        for a in authors:
            if len(a) < 100:
                ArticleAuthor.create(
                    article=self,
                    author=a
                )

    def set_related(self, related):
        for r in related:
            ArticleRelated.create(
                article=self,
                related_file_name=r
            )
    # authors, keywords, ... are added through foreignkeys
    # we will also add an attribute named message, to reference which message should be replied to.
    # This attribute does not need to be saved in the db.

class ArticleAuthor(NEWModel):
    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()

class ArticleRelated(NEWModel):
    # Related files, such as the full text of a paper, audio files, etc.
    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default='')

####################################################################
# Migrate using sensible defaults:
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])

it = 0
for old_art in OLDArticleDownload.select():
    it += 1
    print("====================================================================")
    print(f"IT {it} New article with data:")
    print(
        old_art.article_url,
        old_art.title,
        old_art.summary,
        old_art.source_name,
        old_art.language,
        old_art.file_name,
        old_art.archive_url,
        old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        old_art.download_date,
        True,
        old_art.comment,
        old_art.verified
    )
    new_art = ArticleDownload.create(
        article_url = old_art.article_url,
        title = old_art.title,
        summary = old_art.summary,
        source_name = old_art.source_name,
        language = old_art.language,
        file_name = old_art.file_name,
        archive_url = old_art.archive_url,
        pub_date = old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        download_date = old_art.download_date,
        # slack_ts keeps its default of 0, the old db has no equivalent
        sent = True,
        # archived_by keeps its default of os.getenv("UNAME")
        comment = old_art.comment,
        verified = old_art.verified
    )
    new_art.set_related([r.related_file_name for r in old_art.related])
    new_art.set_authors([a.author for a in old_art.authors])
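A cheap sanity check after the migration is to compare row counts between both databases using the models defined above; note that author counts can legitimately differ, since set_authors drops names of 100 characters or more:

# sanity check (sketch): each new table should roughly match its old counterpart
for old_model, new_model in [
    (OLDArticleDownload, ArticleDownload),
    (OLDArticleAuthor, ArticleAuthor),
    (OLDArticleRelated, ArticleRelated),
]:
    print(f"{new_model.__name__}: {old_model.select().count()} old rows -> {new_model.select().count()} new rows")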