update nas target, documentation
7  manual/README.md  Normal file
@@ -0,0 +1,7 @@
### MANUAL TASKS

The scripts in this directory cover repetitive but only partially automatable tasks.

> ⚠️ Warning:
>
> Most scripts still require manual intervention before/after running and will probably require changes to the code. **Please make sure you understand them before using them!**
21  manual/batch_archive.py  Normal file
@@ -0,0 +1,21 @@
"""
|
||||
Saves websites specified in 'batch_urls.txt' to the wayback machine. Outputs archive urls to terminal
|
||||
Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file
|
||||
"""
|
||||
from waybackpy import WaybackMachineSaveAPI # upload to archive.org
|
||||
import time
|
||||
|
||||
urls = []
|
||||
with open ("batch_urls.txt", "r") as f:
|
||||
urls = f.readlines()
|
||||
|
||||
|
||||
|
||||
for i, url in enumerate(urls):
|
||||
print(f"Saving url {i+1} / {len(urls)}")
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" # needed?
|
||||
wayback = WaybackMachineSaveAPI(url, user_agent)
|
||||
archive_url = wayback.save()
|
||||
print(archive_url)
|
||||
time.sleep(20)
|
||||
# Uploads to archive.org are rate limited
|
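Because archive.org rate-limits save requests, a single failed `wayback.save()` currently aborts the whole batch. A minimal retry sketch, assuming only that waybackpy signals failures by raising an exception (the concrete exception types are not checked here):

```python
from waybackpy import WaybackMachineSaveAPI
import time

def save_with_retries(url, user_agent, attempts=3, wait=60):
    """Try to archive a url a few times, sleeping between attempts; returns None if every attempt fails."""
    for attempt in range(attempts):
        try:
            return WaybackMachineSaveAPI(url, user_agent).save()
        except Exception as e:  # assumption: rate-limit errors surface as exceptions
            print(f"Attempt {attempt + 1} for {url} failed: {e}")
            time.sleep(wait)
    return None
```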
18  manual/batch_urls.txt  Normal file
@@ -0,0 +1,18 @@
https://id2020.org
https://www.weforum.org/platforms/the-centre-for-cybersecurity
https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf
https://en.wikipedia.org/wiki/Social_Credit_System
https://en.wikipedia.org/wiki/Customer_lifetime_value
https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance
https://www.un.org/en/about-us/universal-declaration-of-human-rights
https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines
https://www.wired.com/2008/06/pb-theory/
https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/
https://www.bbc.com/news/world-middle-east-52579475
https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/
https://www.delftdesignforvalues.nl
https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/
https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17
https://www.youtube.com/watch?v=_KhAsJRk2lo
https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/
https://climatecitycup.org
33  manual/batch_youtube.py  Normal file
@@ -0,0 +1,33 @@
"""
|
||||
Saves youtube videos specified in 'batch_urls.txt' to the local folder. (to be copied manually)
|
||||
"""
|
||||
import youtube_dl
|
||||
|
||||
urls = []
|
||||
with open ("batch_urls.txt", "r") as f:
|
||||
urls = f.readlines()
|
||||
|
||||
|
||||
def post_download_hook(ret_code):
|
||||
if ret_code['status'] == 'finished':
|
||||
file_loc = ret_code["filename"]
|
||||
print(file_loc)
|
||||
|
||||
|
||||
def save_video(url):
|
||||
"""Saves video accoring to url and save path"""
|
||||
ydl_opts = {
|
||||
'format': 'best[height<=720]',
|
||||
'progress_hooks': [post_download_hook],
|
||||
'updatetime': False
|
||||
}
|
||||
try:
|
||||
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
|
||||
ydl.download([url])
|
||||
except Exception as e:
|
||||
print(f"Youtube download crashed: {e}")
|
||||
|
||||
|
||||
for i, url in enumerate(urls):
|
||||
print(f"Downloading video {i+1} / {len(urls)}")
|
||||
save_video(url)
|
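As committed, the videos land in the current working directory under youtube_dl's default file naming, which is why the docstring says they have to be copied manually. If they should instead be written straight to a target folder, an `outtmpl` entry could be added to the options; the path below is purely illustrative:

```python
ydl_opts = {
    'format': 'best[height<=720]',
    'progress_hooks': [post_download_hook],
    'updatetime': False,
    # hypothetical target directory; adjust to the actual mount point
    'outtmpl': '/mnt/nas/videos/%(title)s-%(id)s.%(ext)s',
}
```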
26  manual/exctract_from_mail_backup.py  Normal file
@@ -0,0 +1,26 @@
"""
|
||||
Extracts all urls from a list of mails exported from thunderbird. Writes to 'mails_url_export.json'
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
os.chdir("/home/remy/Downloads/mails/")
|
||||
|
||||
regex = "(?P<url>https?://[^\s]+)"
|
||||
|
||||
all_files = os.listdir(".")
|
||||
all_urls = []
|
||||
|
||||
for f in all_files:
|
||||
with open(f, "r", encoding="utf8") as mail:
|
||||
content = mail.readlines()
|
||||
|
||||
search = "".join(content)
|
||||
urls = re.findall(regex, search)
|
||||
all_urls += urls
|
||||
|
||||
print("Saved {} urls".format(len(all_urls)))
|
||||
|
||||
with open("mails_url_export.json", "w") as f:
|
||||
json.dump(all_urls, f)
|
67  manual/gather_media_files.py  Normal file
@@ -0,0 +1,67 @@
"""
|
||||
Runs the news_fetch pipeline against a manually curated list of urls and saves them locally
|
||||
"""
|
||||
import sys
|
||||
sys.path.append("../app/news_fetch")
|
||||
import runner
|
||||
import logging
|
||||
logger = logging.getLogger()
|
||||
import json
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
console = Console()
|
||||
|
||||
logger.info("Overwriting production values for single time media-fetch")
|
||||
runner.configuration.models.set_db(
|
||||
runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
|
||||
)
|
||||
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"
|
||||
|
||||
|
||||
def fetch():
|
||||
dispatcher = runner.Dispatcher()
|
||||
|
||||
dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
|
||||
dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]
|
||||
|
||||
dispatcher.start()
|
||||
|
||||
with open("media_urls.json", "r") as f:
|
||||
url_list = json.loads(f.read())
|
||||
|
||||
logger.info(f"Found {len(url_list)} media urls")
|
||||
for u in url_list:
|
||||
msg_text = f"<{u}|dummy preview text>"
|
||||
dispatcher.incoming_request(msg)
|
||||
|
||||
|
||||
|
||||
def show():
|
||||
|
||||
t = Table(
|
||||
title = "ArticleDownloads",
|
||||
row_styles = ["white", "bright_black"],
|
||||
)
|
||||
|
||||
entries = ["title", "article_url", "archive_url", "authors"]
|
||||
|
||||
for e in entries:
|
||||
t.add_column(e, justify = "right")
|
||||
|
||||
sel = runner.models.ArticleDownload.select()
|
||||
|
||||
for s in sel:
|
||||
c = [getattr(s, e) for e in entries]#
|
||||
c[-1] = str([a.author for a in c[-1]])
|
||||
print(c)
|
||||
t.add_row(*c)
|
||||
|
||||
|
||||
console.print(t)
|
||||
|
||||
|
||||
|
||||
|
||||
# fetch()
|
||||
show()
|
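As committed, `fetch()` is commented out, so running the script only prints the table of articles already present in the local database. A one-off media fetch would presumably enable both calls (a sketch based on the two functions defined above):

```python
fetch()  # push every url from media_urls.json through the news_fetch pipeline
show()   # then render the resulting ArticleDownload table
```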
170  manual/migration.to_postgres.py  Normal file
@@ -0,0 +1,170 @@
import datetime
import sys
sys.path.append("../news_fetch/")
import configuration  # lives in app
from peewee import *

import os
import time

old_db = SqliteDatabase("/app/containerdata/downloads.db")

cred = configuration.db_config["DATABASE"]
download_db = PostgresqlDatabase(
    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
)


## OLD Models
class OLDModel(Model):
    class Meta:
        database = old_db


class OLDArticleDownload(OLDModel):
    class Meta:
        db_table = 'articledownload'

    title = CharField(default='')
    pub_date = DateField(default = '')
    download_date = DateField(default = 0)
    source_name = CharField(default = '')
    article_url = TextField(default = '', unique=True)
    archive_url = TextField(default = '')
    file_name = TextField(default = '')
    language = CharField(default = '')
    summary = TextField(default = '')
    comment = TextField(default = '')
    verified = IntegerField(default = False)
    # authors
    # keywords
    # ... are added through foreignkeys


class OLDArticleAuthor(OLDModel):
    class Meta:
        db_table = 'articleauthor'

    article = ForeignKeyField(OLDArticleDownload, backref='authors')
    author = CharField()


class OLDArticleRelated(OLDModel):
    class Meta:
        db_table = 'articlerelated'

    article = ForeignKeyField(OLDArticleDownload, backref='related')
    related_file_name = TextField(default = '')


## NEW Models
class NEWModel(Model):
    class Meta:
        database = download_db


class ArticleDownload(NEWModel):
    # in the beginning this is all we have
    article_url = TextField(default = '', unique=True)
    # fetch then fills in the metadata
    title = TextField(default='')
    summary = TextField(default = '')
    source_name = CharField(default = '')
    language = CharField(default = '')
    file_name = TextField(default = '')
    archive_url = TextField(default = '')
    pub_date = DateField(default = '')
    download_date = DateField(default = 0)
    slack_ts = FloatField(default = 0)  # should be a fixed-length string but float is easier to sort by
    sent = BooleanField(default = False)
    archived_by = CharField(default = os.getenv("UNAME"))
    # need to know who saved the message because the file needs to be on their computer in order to get verified
    # verification happens in a different app, but the model has the fields here as well
    comment = TextField(default = '')
    verified = IntegerField(default = 0)  # 0 = not verified, 1 = verified, -1 = marked as bad

    def set_authors(self, authors):
        for a in authors:
            if len(a) < 100:
                ArticleAuthor.create(
                    article = self,
                    author = a
                )

    def set_related(self, related):
        for r in related:
            ArticleRelated.create(
                article = self,
                related_file_name = r
            )

    # authors
    # keywords
    # ... are added through foreignkeys
    # we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db


class ArticleAuthor(NEWModel):
    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()


class ArticleRelated(NEWModel):
    # Related files, such as the full text of a paper, audio files, etc.
    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default = '')




####################################################################
# Migrate using sensible defaults:
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])

it = 0
for old_art in OLDArticleDownload.select():
    print("====================================================================")
    it += 1
    print(f"IT {it} New article with data:")
    print(
        old_art.article_url,
        old_art.title,
        old_art.summary,
        old_art.source_name,
        old_art.language,
        old_art.file_name,
        old_art.archive_url,
        old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        old_art.download_date,
        True,
        old_art.comment,
        old_art.verified
    )
    new_art = ArticleDownload.create(
        article_url = old_art.article_url,
        title = old_art.title,
        summary = old_art.summary,
        source_name = old_art.source_name,
        language = old_art.language,
        file_name = old_art.file_name,
        archive_url = old_art.archive_url,
        pub_date = old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        download_date = old_art.download_date,
        # slack_ts = FloatField(default = 0)
        sent = True,
        # archived_by = CharField(default = os.getenv("UNAME"))
        comment = old_art.comment,
        verified = old_art.verified
    )

    new_art.set_related([r.related_file_name for r in old_art.related])
    new_art.set_authors([a.author for a in old_art.authors])
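Two of the new fields are deliberately left out of the `create()` call and therefore take their model defaults: `slack_ts` stays 0 and `archived_by` falls back to the `UNAME` environment variable, while every migrated row is marked `sent = True`. A small pre-flight check along these lines might be worth adding before the loop (a sketch, assuming the script runs in an environment where `UNAME` should be set):

```python
import os
assert os.getenv("UNAME"), "UNAME must be set: it becomes the default for ArticleDownload.archived_by"
```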