coss_archiving/manual/migration.to_postgres.py

170 lines
5.0 KiB
Python

import datetime
import sys
sys.path.append("../news_fetch/")
import configuration # lives in app
from peewee import *
import os
import time
# Source of the migration: the legacy SQLite file.
old_db = SqliteDatabase("/app/containerdata/downloads.db")

# Target of the migration: PostgreSQL. Database name and credentials come
# from the "DATABASE" section of the configuration module; host and port
# are fixed here.
cred = configuration.db_config["DATABASE"]
download_db = PostgresqlDatabase(
    cred["db_name"],
    user=cred["user_name"],
    password=cred["password"],
    host="vpn",
    port=5432,
)
## OLD Models
class OLDModel(Model):
    """Common base for the legacy models; binds them to the SQLite file."""

    class Meta:
        database = old_db
class OLDArticleDownload(OLDModel):
    """Article row as stored in the legacy ``articledownload`` table.

    Only read from during the migration. Authors and related files are
    stored in their own tables and point back at this model through
    foreign keys (back-references ``authors`` and ``related``).
    """

    class Meta:
        # ``table_name`` is the peewee 3 spelling; the former ``db_table``
        # option is deprecated. The explicit name is required because the
        # class name carries an ``OLD`` prefix the real table does not have.
        table_name = 'articledownload'

    title = CharField(default='')
    pub_date = DateField(default='')  # may be an empty string in old rows
    download_date = DateField(default=0)
    source_name = CharField(default='')
    article_url = TextField(default='', unique=True)
    archive_url = TextField(default='')
    file_name = TextField(default='')
    language = CharField(default='')
    summary = TextField(default='')
    comment = TextField(default='')
    verified = IntegerField(default=False)
    # authors
    # keywords
    # ... are added through foreignkeys
class OLDArticleAuthor(OLDModel):
    """One author of a legacy article (row of the ``articleauthor`` table)."""

    class Meta:
        # ``table_name`` replaces the deprecated peewee option ``db_table``.
        table_name = 'articleauthor'

    # Exposed on the article as ``article.authors``.
    article = ForeignKeyField(OLDArticleDownload, backref='authors')
    author = CharField()
class OLDArticleRelated(OLDModel):
    """One related file of a legacy article (``articlerelated`` table)."""

    class Meta:
        # ``table_name`` replaces the deprecated peewee option ``db_table``.
        table_name = 'articlerelated'

    # Exposed on the article as ``article.related``.
    article = ForeignKeyField(OLDArticleDownload, backref='related')
    related_file_name = TextField(default='')
## NEW Models
class NEWModel(Model):
    """Common base for the target models; binds them to PostgreSQL."""

    class Meta:
        database = download_db
class ArticleDownload(NEWModel):
    """Article row in the target PostgreSQL database.

    Authors and related files are not columns of this table; they are
    stored through ``ArticleAuthor`` / ``ArticleRelated`` rows that
    reference the article (see ``set_authors`` and ``set_related``).
    """

    # The URL is the only thing known when a row is first created.
    article_url = TextField(default='', unique=True)

    # Metadata filled in later by the fetch step.
    title = TextField(default='')
    summary = TextField(default='')
    source_name = CharField(default='')
    language = CharField(default='')

    file_name = TextField(default='')
    archive_url = TextField(default='')
    pub_date = DateField(default='')
    download_date = DateField(default=0)

    # Should be a fixed-length string, but a float is easier to sort by.
    slack_ts = FloatField(default=0)
    sent = BooleanField(default=False)
    # Who saved the message: the file has to be on that person's computer
    # in order to get verified.
    archived_by = CharField(default=os.getenv("UNAME"))

    # Verification happens in a different app, but the model carries the
    # fields here as well.
    comment = TextField(default='')
    # 0 = not verified, 1 = verified, -1 = marked as bad
    verified = IntegerField(default=0)

    def set_authors(self, authors):
        """Create one ``ArticleAuthor`` row per author name.

        Names of 100 characters or more are skipped.
        """
        accepted = (name for name in authors if len(name) < 100)
        for name in accepted:
            ArticleAuthor.create(article=self, author=name)

    def set_related(self, related):
        """Create one ``ArticleRelated`` row per related file name."""
        for related_name in related:
            ArticleRelated.create(
                article=self,
                related_file_name=related_name,
            )

    # authors
    # keywords
    # ... are added through foreignkeys
    # We will also add an attribute named message, to reference which
    # message should be replied to. This attribute does not need to be
    # saved in the db.
class ArticleAuthor(NEWModel):
    """A single author name attached to an ``ArticleDownload``."""

    # Reachable from the article as ``article.authors``.
    article = ForeignKeyField(ArticleDownload, backref='authors')
    author = CharField()
class ArticleRelated(NEWModel):
    """A related file (full text of a paper, audio file, etc.) of an article."""

    # Reachable from the article as ``article.related``.
    article = ForeignKeyField(ArticleDownload, backref='related')
    related_file_name = TextField(default='')
####################################################################
# Migrate using sensible defaults:
# Make sure the target tables exist before copying anything into them.
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])

# Publication date used for legacy rows whose pub_date is an empty string.
# Loop-invariant, so it is computed once instead of once per row.
fallback_pub_date = datetime.date.fromtimestamp(0)

# Run the whole copy inside a single transaction on the target database:
# if any row fails, nothing is committed. Without this, a crash half-way
# leaves a partially migrated database, and the next run then fails on the
# unique article_url constraint for the rows that were already copied.
with download_db.atomic():
    for it, old_art in enumerate(OLDArticleDownload.select(), start=1):
        print("====================================================================")
        print(f"IT {it} New article with data:")

        # Single source of truth for the copied columns; used both for the
        # progress output and for the insert, so the two cannot drift apart.
        # slack_ts and archived_by are deliberately left to the model
        # defaults; every migrated article is marked as already sent.
        values = {
            "article_url": old_art.article_url,
            "title": old_art.title,
            "summary": old_art.summary,
            "source_name": old_art.source_name,
            "language": old_art.language,
            "file_name": old_art.file_name,
            "archive_url": old_art.archive_url,
            "pub_date": (
                old_art.pub_date if old_art.pub_date != "" else fallback_pub_date
            ),
            "download_date": old_art.download_date,
            "sent": True,
            "comment": old_art.comment,
            "verified": old_art.verified,
        }
        print(*values.values())

        new_art = ArticleDownload.create(**values)

        # Child rows come from the legacy back-references.
        new_art.set_related([r.related_file_name for r in old_art.related])
        new_art.set_authors([a.author for a in old_art.authors])