update nas target, documentation

manual/README.md (new file)

### MANUAL TASKS

The files inside this directory contain scripts for repetitive but somewhat automatable tasks.

> ⚠️ Warning:
>
> Most scripts still require manual intervention before/after running and probably require changes to the code. **Please make sure you understand them before using them!**
							
								
								
									
manual/batch_archive.py (new file)

"""
Saves the websites listed in 'batch_urls.txt' to the Wayback Machine and prints the archive urls to the terminal.
Hint: use 'python batch_archive.py > batch_archive.txt' to save the output to a file.
"""
from waybackpy import WaybackMachineSaveAPI  # upload to archive.org
import time

with open("batch_urls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]  # drop trailing newlines and empty lines

user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

for i, url in enumerate(urls):
    print(f"Saving url {i+1} / {len(urls)}")
    wayback = WaybackMachineSaveAPI(url, user_agent)
    archive_url = wayback.save()
    print(archive_url)
    # uploads to archive.org are rate limited, so wait between requests
    time.sleep(20)
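
A possible follow-up step, not part of this commit (the file names come from the hint above; the filtering logic is an assumption): after a redirected run, the output file mixes progress lines with archive urls, so pairing each source url with its capture needs a small filter.

# Sketch: rebuild a {source_url: archive_url} mapping from a run redirected
# with 'python batch_archive.py > batch_archive.txt'.
# Assumes every url was archived successfully and in the original order.
with open("batch_urls.txt") as src, open("batch_archive.txt") as out:
    originals = [line.strip() for line in src if line.strip()]
    archives = [line.strip() for line in out if line.strip().startswith("http")]

for original, archive in zip(originals, archives):
    print(original, "->", archive)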
							
								
								
									
manual/batch_urls.txt (new file)

https://id2020.org
https://www.weforum.org/platforms/the-centre-for-cybersecurity
https://www.unhcr.org/blogs/wp-content/uploads/sites/48/2018/04/fs.pdf
https://en.wikipedia.org/wiki/Social_Credit_System
https://en.wikipedia.org/wiki/Customer_lifetime_value
https://www.weforum.org/reports/the-internet-of-bodies-is-here-tackling-new-challenges-of-technology-governance
https://www.un.org/en/about-us/universal-declaration-of-human-rights
https://www.biometricupdate.com/201909/id2020-and-partners-launch-program-to-provide-digital-id-with-vaccines
https://www.wired.com/2008/06/pb-theory/
https://www.medtechdive.com/news/fda-warns-of-false-positives-with-bd-coronavirus-diagnostic/581115/
https://www.bbc.com/news/world-middle-east-52579475
https://www.timesofisrael.com/over-12000-mistakenly-quarantined-by-phone-tracking-health-ministry-admits/
https://www.delftdesignforvalues.nl
https://www.theglobalist.com/technology-big-data-artificial-intelligence-future-peace-rooms/
https://link.springer.com/chapter/10.1007/978-3-319-90869-4_17
https://www.youtube.com/watch?v=_KhAsJRk2lo
https://www.bloomberg.org/environment/supporting-sustainable-cities/american-cities-climate-challenge/
https://climatecitycup.org
							
								
								
									
manual/batch_youtube.py (new file)

"""
Saves the youtube videos listed in 'batch_urls.txt' to the local folder (to be copied manually).
"""
import youtube_dl

with open("batch_urls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]  # drop trailing newlines and empty lines


def post_download_hook(ret_code):
    # progress hook: print the file location once a download finishes
    if ret_code['status'] == 'finished':
        file_loc = ret_code["filename"]
        print(file_loc)


def save_video(url):
    """Downloads the video at the given url into the current working directory"""
    ydl_opts = {
        'format': 'best[height<=720]',
        'progress_hooks': [post_download_hook],
        'updatetime': False
    }
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        print(f"Youtube download crashed: {e}")


for i, url in enumerate(urls):
    print(f"Downloading video {i+1} / {len(urls)}")
    save_video(url)
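
A small, hypothetical variation on the options above (not part of the commit): youtube_dl's 'outtmpl' setting can steer the downloads into a dedicated folder instead of the working directory, which would make the manual copy step easier. The folder name below is an assumption.

# Sketch only: same options as in save_video(), plus an output template.
ydl_opts = {
    'format': 'best[height<=720]',
    'outtmpl': 'downloads/%(title)s.%(ext)s',  # assumed folder; youtube_dl fills in title/extension
    'progress_hooks': [post_download_hook],
    'updatetime': False
}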
							
								
								
									
manual/exctract_from_mail_backup.py (new file)

"""
Extracts all urls from a list of mails exported from Thunderbird. Writes them to 'mails_url_export.json'.
"""
import os
import re
import json

os.chdir("/home/remy/Downloads/mails/")  # adjust to the location of the exported mails

regex = r"(?P<url>https?://[^\s]+)"  # raw string so \s is not treated as an escape sequence

all_files = os.listdir(".")
all_urls = []

for f in all_files:
    with open(f, "r", encoding="utf8") as mail:
        content = mail.read()

    urls = re.findall(regex, content)
    all_urls += urls

print("Extracted {} urls".format(len(all_urls)))

with open("mails_url_export.json", "w") as f:
    json.dump(all_urls, f)
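
An optional refinement, not part of the commit: the same link often appears in several mails, so deduplicating before the dump keeps the export smaller. A minimal sketch, assuming it replaces the final json.dump above:

# Drop duplicate urls while preserving their first-seen order.
unique_urls = list(dict.fromkeys(all_urls))
print("Kept {} unique urls out of {}".format(len(unique_urls), len(all_urls)))

with open("mails_url_export.json", "w") as f:
    json.dump(unique_urls, f)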
							
								
								
									
manual/gather_media_files.py (new file)

"""
Runs the news_fetch pipeline against a manually curated list of urls and saves the articles locally.
"""
import sys
sys.path.append("../app/news_fetch")
import runner
import logging
import json

from rich.console import Console
from rich.table import Table

logger = logging.getLogger()
console = Console()

logger.info("Overwriting production values for single time media-fetch")
runner.configuration.models.set_db(
    runner.configuration.SqliteDatabase("../.dev/media_downloads.db")
)
runner.configuration.main_config["DOWNLOADS"]["local_storage_path"] = "../.dev/"


def fetch():
    dispatcher = runner.Dispatcher()

    dispatcher.workers_in = [{"FetchWorker": runner.FetchWorker(), "DownloadWorker": runner.DownloadWorker()}]
    dispatcher.workers_out = [{"PrintWorker": runner.PrintWorker()}]

    dispatcher.start()

    with open("media_urls.json", "r") as f:
        url_list = json.loads(f.read())

    logger.info(f"Found {len(url_list)} media urls")
    for u in url_list:
        msg_text = f"<{u}|dummy preview text>"
        dispatcher.incoming_request(msg_text)  # hand the constructed message text to the pipeline


def show():
    t = Table(
        title = "ArticleDownloads",
        row_styles = ["white", "bright_black"],
    )

    entries = ["title", "article_url", "archive_url", "authors"]

    for e in entries:
        t.add_column(e, justify = "right")

    sel = runner.models.ArticleDownload.select()

    for s in sel:
        c = [getattr(s, e) for e in entries]
        c[-1] = str([a.author for a in c[-1]])  # flatten the authors backref into a printable string
        print(c)
        t.add_row(*c)

    console.print(t)


# Run fetch() first to populate ../.dev/media_downloads.db, then rerun with show() to inspect the results.
# fetch()
show()
							
								
								
									
manual/migration.to_postgres.py (new file)

import datetime
import sys
sys.path.append("../news_fetch/")
import configuration # lives in app
from peewee import *

import os
import time

old_db = SqliteDatabase("/app/containerdata/downloads.db")

cred = configuration.db_config["DATABASE"]
download_db = PostgresqlDatabase(
    cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
)

## OLD Models
class OLDModel(Model):
    class Meta:
        database = old_db


class OLDArticleDownload(OLDModel):
    class Meta:
        db_table = 'articledownload'

    title = CharField(default='')
    pub_date = DateField(default = '')
    download_date = DateField(default = 0)
    source_name = CharField(default = '')
    article_url = TextField(default = '', unique=True)
    archive_url = TextField(default = '')
    file_name = TextField(default = '')
    language = CharField(default = '')
    summary = TextField(default = '')
    comment = TextField(default = '')
    verified = IntegerField(default = False)
    # authors
    # keywords
    # ... are added through foreignkeys


class OLDArticleAuthor(OLDModel):
    class Meta:
        db_table = 'articleauthor'

    article = ForeignKeyField(OLDArticleDownload, backref='authors')
    author = CharField()


class OLDArticleRelated(OLDModel):
    class Meta:
        db_table = 'articlerelated'

    article = ForeignKeyField(OLDArticleDownload, backref='related')
    related_file_name = TextField(default = '')


## NEW Models
class NEWModel(Model):
    class Meta:
        database = download_db


class ArticleDownload(NEWModel):
    # in the beginning this is all we have
    article_url = TextField(default = '', unique=True)
    # fetch then fills in the metadata
    title = TextField(default='')
    summary = TextField(default = '')
    source_name = CharField(default = '')
    language = CharField(default = '')
    file_name = TextField(default = '')
    archive_url = TextField(default = '')
    pub_date = DateField(default = '')
    download_date = DateField(default = 0)
    slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
    sent = BooleanField(default = False)
    archived_by = CharField(default = os.getenv("UNAME"))
    # need to know who saved the message because the file needs to be on their computer in order to get verified
    # verification happens in a different app, but the model has the fields here as well
    comment = TextField(default = '')
    verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad

    def set_authors(self, authors):
        for a in authors:
            if len(a) < 100:
                ArticleAuthor.create(
                    article = self,
                    author = a
                )

    def set_related(self, related):
        for r in related:
            ArticleRelated.create(
                article = self,
                related_file_name = r
            )

    # authors
    # keywords
    # ... are added through foreignkeys
    # we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db


class ArticleAuthor(NEWModel):
    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()


class ArticleRelated(NEWModel):
    # Related files, such as the full text of a paper, audio files, etc.
    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default = '')


####################################################################
# Migrate using sensible defaults:
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])

it = 0
for old_art in OLDArticleDownload.select():
    print("====================================================================")
    it += 1
    print(f"IT {it} New article with data:")
    print(
        old_art.article_url,
        old_art.title,
        old_art.summary,
        old_art.source_name,
        old_art.language,
        old_art.file_name,
        old_art.archive_url,
        old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        old_art.download_date,
        True,
        old_art.comment,
        old_art.verified
    )
    new_art = ArticleDownload.create(
        article_url = old_art.article_url,
        title = old_art.title,
        summary = old_art.summary,
        source_name = old_art.source_name,
        language = old_art.language,
        file_name = old_art.file_name,
        archive_url = old_art.archive_url,
        pub_date = old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
        download_date = old_art.download_date,
        # slack_ts = FloatField(default = 0)
        sent = True,
        # archived_by = CharField(default = os.getenv("UNAME"))
        comment = old_art.comment,
        verified = old_art.verified
    )

    new_art.set_related([r.related_file_name for r in old_art.related])
    new_art.set_authors([a.author for a in old_art.authors])
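
The migration stops after inserting the rows. A minimal sanity check, not part of the commit but using only the models defined above, could compare row counts between the two databases at the end of the run:

# Sketch: verify that every old row arrived in postgres (append after the migration loop).
for old_model, new_model in [
    (OLDArticleDownload, ArticleDownload),
    (OLDArticleAuthor, ArticleAuthor),
    (OLDArticleRelated, ArticleRelated),
]:
    print(f"{new_model.__name__}: {new_model.select().count()} / {old_model.select().count()} rows migrated")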