few bugs in news_fetch left, news_check wip
This commit is contained in:
parent
2e65828bbb
commit
713406dc67
@ -34,7 +34,7 @@ services:
|
||||
|
||||
|
||||
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
|
||||
image: selenium/standalone-firefox:103.0 # latest version because it mirrors the locally installed version (which is automatically updated)
|
||||
image: ${GECKODRIVER_IMG}
|
||||
environment:
|
||||
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
|
||||
- START_XVFB=${HEADFULL-false}
|
||||
|
10
launch
10
launch
@ -5,10 +5,12 @@ set -o ignoreeof
|
||||
echo "Bash script launching COSS_ARCHIVING..."
|
||||
|
||||
|
||||
# CHANGE ME!
|
||||
# CHANGE ME ONCE!
|
||||
export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
|
||||
export UNAME=remy
|
||||
|
||||
# CHANGE ME WHEN UPDATING FIREFOX
|
||||
export GECKODRIVER_IMG=selenium/standalone-firefox:103.0
|
||||
# version must be >= the one on the host, or Firefox will not start (because of mismatched config)
|
||||
|
||||
if [[ $1 == "debug" ]]
|
||||
then
|
||||
@ -16,8 +18,8 @@ then
|
||||
export HEADFULL=true
|
||||
export CODE=./
|
||||
export ENTRYPOINT=/bin/bash
|
||||
# since service ports is not enough here, also execute up, which will
|
||||
docker compose up -d
|
||||
# since service ports does not open ports on implicitly started containers, also start geckodriver:
|
||||
docker compose up -d geckodriver
|
||||
elif [[ $1 == "production" ]]
|
||||
then
|
||||
export DEBUG=false
|
||||
|
@ -1,5 +1,4 @@
|
||||
import sys
|
||||
from webbrowser import get
|
||||
sys.path.append("../app")
|
||||
import runner
|
||||
import logging
|
||||
|
170
misc/migration.to_postgres.py
Normal file
170
misc/migration.to_postgres.py
Normal file
@ -0,0 +1,170 @@
|
||||
import datetime
|
||||
import sys
|
||||
sys.path.append("../news_fetch/")
|
||||
import configuration # lives in app
|
||||
from peewee import *
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
old_db = SqliteDatabase("/app/containerdata/downloads.db")
|
||||
|
||||
cred = configuration.db_config["DATABASE"]
|
||||
download_db = PostgresqlDatabase(
|
||||
cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
|
||||
)
|
||||
|
||||
## OLD Models
|
||||
class OLDModel(Model):
|
||||
class Meta:
|
||||
database = old_db
|
||||
|
||||
|
||||
class OLDArticleDownload(OLDModel):
|
||||
class Meta:
|
||||
db_table = 'articledownload'
|
||||
|
||||
title = CharField(default='')
|
||||
pub_date = DateField(default = '')
|
||||
download_date = DateField(default = 0)
|
||||
source_name = CharField(default = '')
|
||||
article_url = TextField(default = '', unique=True)
|
||||
archive_url = TextField(default = '')
|
||||
file_name = TextField(default = '')
|
||||
language = CharField(default = '')
|
||||
summary = TextField(default = '')
|
||||
comment = TextField(default = '')
|
||||
verified = IntegerField(default = False)
|
||||
# authors
|
||||
# keywords
|
||||
# ... are added through foreignkeys
|
||||
|
||||
|
||||
|
||||
|
||||
class OLDArticleAuthor(OLDModel):
|
||||
class Meta:
|
||||
db_table = 'articleauthor'
|
||||
|
||||
article = ForeignKeyField(OLDArticleDownload, backref='authors')
|
||||
author = CharField()
|
||||
|
||||
|
||||
|
||||
class OLDArticleRelated(OLDModel):
|
||||
class Meta:
|
||||
db_table = 'articlerelated'
|
||||
|
||||
article = ForeignKeyField(OLDArticleDownload, backref='related')
|
||||
related_file_name = TextField(default = '')
|
||||
|
||||
|
||||
|
||||
|
||||
## NEW Models
|
||||
class NEWModel(Model):
|
||||
class Meta:
|
||||
database = download_db
|
||||
|
||||
|
||||
class ArticleDownload(NEWModel):
|
||||
# in the beginning this is all we have
|
||||
article_url = TextField(default = '', unique=True)
|
||||
# fetch then fills in the metadata
|
||||
title = TextField(default='')
|
||||
summary = TextField(default = '')
|
||||
source_name = CharField(default = '')
|
||||
language = CharField(default = '')
|
||||
file_name = TextField(default = '')
|
||||
archive_url = TextField(default = '')
|
||||
pub_date = DateField(default = '')
|
||||
download_date = DateField(default = 0)
|
||||
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
||||
sent = BooleanField(default = False)
|
||||
archived_by = CharField(default = os.getenv("UNAME"))
|
||||
# need to know who saved the message because the file needs to be on their computer in order to get verified
|
||||
# verification happens in a different app, but the model has the fields here as well
|
||||
comment = TextField(default = '')
|
||||
verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad
|
||||
|
||||
def set_authors(self, authors):
|
||||
for a in authors:
|
||||
if len(a) < 100:
|
||||
ArticleAuthor.create(
|
||||
article = self,
|
||||
author = a
|
||||
)
|
||||
|
||||
def set_related(self, related):
|
||||
for r in related:
|
||||
ArticleRelated.create(
|
||||
article = self,
|
||||
related_file_name = r
|
||||
)
|
||||
|
||||
# authors
|
||||
# keywords
|
||||
# ... are added through foreignkeys
|
||||
# we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db
|
||||
|
||||
|
||||
|
||||
class ArticleAuthor(NEWModel):
|
||||
article = ForeignKeyField(ArticleDownload, backref='authors')
|
||||
author = CharField()
|
||||
|
||||
|
||||
class ArticleRelated(NEWModel):
|
||||
# Related files, such as the full text of a paper, audio files, etc.
|
||||
article = ForeignKeyField(ArticleDownload, backref='related')
|
||||
related_file_name = TextField(default = '')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################
|
||||
# Migrate using sensible defaults:
|
||||
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
||||
|
||||
# Copy every article from the old SQLite database into Postgres, together with
# its related-file and author rows.
for it, old_art in enumerate(OLDArticleDownload.select(), start=1):
    print("====================================================================")
    print(f"IT {it} New article with data:")

    # The old schema may hold "" as publication date; map that to the epoch date.
    pub_date = old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0)

    print(
        old_art.article_url,
        old_art.title,
        old_art.summary,
        old_art.source_name,
        old_art.language,
        old_art.file_name,
        old_art.archive_url,
        pub_date,
        old_art.download_date,
        True,
        old_art.comment,
        old_art.verified
    )

    # One transaction per article: if creating the related/author rows fails,
    # the article row is rolled back too, so no half-migrated article remains.
    with download_db.atomic():
        new_art = ArticleDownload.create(
            article_url = old_art.article_url,
            title = old_art.title,
            summary = old_art.summary,
            source_name = old_art.source_name,
            language = old_art.language,
            file_name = old_art.file_name,
            archive_url = old_art.archive_url,
            pub_date = pub_date,
            download_date = old_art.download_date,
            # slack_ts keeps its model default (0)
            sent = True,
            # archived_by keeps its model default (the UNAME environment variable)
            comment = old_art.comment,
            verified = old_art.verified
        )

        new_art.set_related([r.related_file_name for r in old_art.related])
        new_art.set_authors([a.author for a in old_art.authors])
|
||||
|
@ -2,16 +2,38 @@
|
||||
import PDFView from './PDFView.svelte';
|
||||
import ArticleStatus from './ArticleStatus.svelte';
|
||||
import ArticleOperations from './ArticleOperations.svelte';
|
||||
|
||||
let current_id = 0;
|
||||
|
||||
const updateInterface = (async () => {
|
||||
let url = '';
|
||||
if (current_id == 0) {
|
||||
url = '/api/article/first';
|
||||
} else {
|
||||
url = '/api/article/' + current_id + '/next';
|
||||
}
|
||||
const response = await fetch(url)
|
||||
const data = await response.json()
|
||||
current_id = data.id;
|
||||
let article_url = '/api/article/' + current_id + '/get';
|
||||
const article_response = await fetch(article_url);
|
||||
const article_data = await article_response.json();
|
||||
return article_data;
|
||||
})()
|
||||
|
||||
|
||||
</script>
|
||||
|
||||
|
||||
<div class="flex w-full h-full gap-5 p-5">
|
||||
<div class="w-3/5"><PDFView/></div>
|
||||
{#await updateInterface}
|
||||
...
|
||||
{:then article_data}
|
||||
<div class="flex w-full h-screen gap-5 p-5">
|
||||
<div class="w-3/5"><PDFView article_data={article_data}/></div>
|
||||
<div class="divider divider-horizontal"></div>
|
||||
<div class="w-2/5">
|
||||
<ArticleStatus article_id={42}/>
|
||||
<ArticleStatus article_data={article_data}/>
|
||||
<div class="divider divider-vertical"></div>
|
||||
<ArticleOperations/>
|
||||
<ArticleOperations article_data={article_data}/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/await}
|
||||
|
@ -1,28 +1,93 @@
|
||||
<div class="toast">
|
||||
<div class="alert alert-info">
|
||||
<div>
|
||||
<span>New message arrived.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
import {fade} from 'svelte/transition';
|
||||
|
||||
<div class="grid grid-cols-3 gap-4">
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
<div class="highlight">01</div>
|
||||
export let article_data;
|
||||
|
||||
</div>
|
||||
<style>
|
||||
.highlight {
|
||||
background-color: #f5f5f5;
|
||||
border-radius: 5px;
|
||||
padding: 10px;
|
||||
margin: 10px;
|
||||
const actions = [
|
||||
{name: 'Mark as good (and skip to next)', kbd: 'A'},
|
||||
{name: 'Mark as bad (and skip to next)', kbd: 'B'},
|
||||
{name: 'Upload related file', kbd: 'R'},
|
||||
{name: 'Skip', kbd: 'ctrl'},
|
||||
]
|
||||
|
||||
const toast_states = {
|
||||
'success' : {class: 'alert-success', text: 'Article updated successfully'},
|
||||
'error' : {class: 'alert-error', text: 'Article update failed'},
|
||||
}
|
||||
</style>
|
||||
let toast_state = {};
|
||||
let toast_visible = false;
|
||||
|
||||
|
||||
function onKeyDown(e) {apiAction(e.key)}
|
||||
function apiAction(key) {
|
||||
if (actions.map(d => d.kbd.toLowerCase()).includes(key.toLowerCase())){ // ignore other keypresses
|
||||
|
||||
const updateArticle = (async() => {
|
||||
const response = await fetch('/api/article/' + article_data.id + '/set', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({
|
||||
'action': key.toLowerCase(),
|
||||
})
|
||||
})
|
||||
const success = response.status == 200;
|
||||
|
||||
if (success){
|
||||
showToast('success');
|
||||
} else {
|
||||
showToast('error');
|
||||
}
|
||||
|
||||
})()
|
||||
}
|
||||
}
|
||||
|
||||
function showToast(state){
|
||||
toast_visible = true;
|
||||
toast_state = toast_states[state];
|
||||
setTimeout(() => {
|
||||
toast_visible = false;
|
||||
}, 1000)
|
||||
|
||||
}
|
||||
</script>
|
||||
|
||||
|
||||
<div class="card bg-neutral-300 shadow-xl">
|
||||
<div class="card-body">
|
||||
<h2 class="card-title">Your options: (click on action or use keyboard)</h2>
|
||||
<div class="overflow-x-auto">
|
||||
<table class="table w-full table-compact">
|
||||
<!-- head -->
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Action</th>
|
||||
<th>Keyboard shortcut</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{#each actions as action}
|
||||
|
||||
<tr>
|
||||
<td><button on:click={() => apiAction(action.kbd)}>{ action.name }</button></td>
|
||||
<td><kbd class="kbd">{ action.kbd }</kbd></td>
|
||||
</tr>
|
||||
|
||||
{/each}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<svelte:window on:keydown|preventDefault={onKeyDown} />
|
||||
|
||||
{#if toast_visible}
|
||||
<div class="toast" transition:fade>
|
||||
<div class="alert { toast_state.class }">
|
||||
<div>
|
||||
<span>{ toast_state.text }.</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{/if}
|
@ -1,25 +1,38 @@
|
||||
<script>
|
||||
export let article_id;
|
||||
const Article = (async () => {
|
||||
const response = await fetch('/api/article/' + article_id + '/get')
|
||||
return await response.json()
|
||||
})()
|
||||
console.log(Article)
|
||||
export let article_data;
|
||||
const status_items = [
|
||||
{name: 'Title', value: article_data.title},
|
||||
{name: 'Filename', value: article_data.file_name},
|
||||
{name: 'Language', value: article_data.language},
|
||||
{name: 'Authors', value: article_data.authors},
|
||||
{name: "Related", value: article_data.related},
|
||||
]
|
||||
</script>
|
||||
|
||||
<div class="mockup-window border bg-base-300">
|
||||
<h1 class="center">Article overview</h1>
|
||||
<ul tabindex="0" class="menu p-2 shadow bg-base-100 rounded-box w-52">
|
||||
{#await Article}
|
||||
<li>...waiting</li>
|
||||
{:then data}
|
||||
<li><a href="#">{data.value}</a></li>
|
||||
<li><a href="#">Item 2</a></li>
|
||||
{:catch error}
|
||||
<li>An error occurred!</li>
|
||||
{/await}
|
||||
|
||||
|
||||
</ul>
|
||||
<div class="card bg-neutral-300 shadow-xl overflow-x-auto">
|
||||
<div class="card-body">
|
||||
<h2 class="card-title">Article overview:</h2>
|
||||
<table class="table w-full table-compact" style="table-layout: fixed">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Attribute</th>
|
||||
<th>Value</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{#each status_items as item}
|
||||
<tr>
|
||||
<td>{ item.name }</td>
|
||||
<!-- <td>Quality Control Specialist</td> -->
|
||||
{#if item.value != ""}
|
||||
<td class='bg-emerald-200' style="white-space: normal">{ item.value }</td>
|
||||
{:else}
|
||||
<td class='bg-red-200'>{ item.value }</td>
|
||||
{/if}
|
||||
</tr>
|
||||
{/each}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
</div>
|
@ -1,64 +1,10 @@
|
||||
<!--
|
||||
<script>
|
||||
var myState = {
|
||||
pdf: null,
|
||||
currentPage: 1,
|
||||
zoom: 1
|
||||
}
|
||||
|
||||
pdfjsLib.getDocument('test.pdf').then((pdf) => {
|
||||
|
||||
myState.pdf = pdf;
|
||||
render();
|
||||
|
||||
});
|
||||
|
||||
function render() {
|
||||
myState.pdf.getPage(myState.currentPage).then((page) => {
|
||||
|
||||
var canvas = document.getElementById("pdf_renderer");
|
||||
var ctx = canvas.getContext('2d');
|
||||
|
||||
var viewport = page.getViewport(myState.zoom);
|
||||
|
||||
canvas.width = viewport.width;
|
||||
canvas.height = viewport.height;
|
||||
|
||||
page.render({
|
||||
canvasContext: ctx,
|
||||
viewport: viewport
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
</script>
|
||||
-->
|
||||
<!-- <div id="my_pdf_viewer">
|
||||
<div class="mockup-window border bg-base-300">
|
||||
<div id="canvas_container" class="flex justify-center">
|
||||
<canvas id="pdf_renderer"></canvas>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div id="navigation_controls">
|
||||
<button id="go_previous">Previous</button>
|
||||
<input id="current_page" value="1" type="number"/>
|
||||
<button id="go_next">Next</button>
|
||||
</div>
|
||||
|
||||
<div id="zoom_controls">
|
||||
<button id="zoom_in">+</button>
|
||||
<button id="zoom_out">-</button>
|
||||
</div>
|
||||
</div> -->
|
||||
|
||||
<script>
|
||||
let pdf_file = 'test.pdf';
|
||||
export let article_data;
|
||||
</script>
|
||||
|
||||
<div class="mockup-window border bg-base-300 h-full w-full">
|
||||
<object class="pdf-view" data="{pdf_file}" title="Article PDF"> </object>
|
||||
<div class="h-full w-full shadow-xl">
|
||||
<object class="pdf-view" data="{article_data.save_path + article_data.file_name}" title="Article PDF"> </object>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
|
@ -1,7 +1,7 @@
|
||||
import json
|
||||
from flask import Flask, send_from_directory, jsonify
|
||||
import random
|
||||
|
||||
from flask import Flask, send_from_directory, request
|
||||
import configuration
|
||||
models = configuration.models
|
||||
db = configuration.db
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
@ -9,26 +9,55 @@ app = Flask(__name__)
|
||||
# SVELTE 'STATIC' BACKEND. Always send index.html and the requested js-files. (compiled by npm)
|
||||
|
||||
@app.route("/") #index.html
|
||||
def base():
|
||||
def index():
|
||||
return send_from_directory('../client/public', 'index.html')
|
||||
@app.route("/<path:path>") #js-files
|
||||
def home(path):
|
||||
def js(path):
|
||||
return send_from_directory('../client/public', path)
|
||||
|
||||
@app.route("/app/containerdata/files/<path:path>")
|
||||
def static_pdf(path):
|
||||
return send_from_directory('/app/containerdata/files/', path)
|
||||
|
||||
|
||||
|
||||
###############################################################################
|
||||
# API for news_check.
|
||||
# (simple) API for news_check.
|
||||
|
||||
@app.route("/api/article/<int:id>/get")
|
||||
def get_article(id):
|
||||
res = {"value": id}
|
||||
return jsonify(res)
|
||||
def get_article_by_id(id):
|
||||
with db:
|
||||
article = models.ArticleDownload.get_by_id(id)
|
||||
return article.to_dict()
|
||||
|
||||
@app.route("/api/article/first")
def get_article_first():
    """Return the id of the unverified article (verified == 0) with the lowest id.

    Returns:
        {"id": <int>} when such an article exists, otherwise a JSON error
        body with HTTP status 404.
    """
    with db:
        article = (
            models.ArticleDownload
            .select(models.ArticleDownload.id)
            .where(models.ArticleDownload.verified == 0)
            .order_by(models.ArticleDownload.id)
            .first()
        )

    # .first() yields None on an empty result; answer with 404 instead of
    # failing with an AttributeError on `article.id`.
    if article is None:
        return {"error": "no unverified article left"}, 404
    return {"id": article.id}
|
||||
|
||||
@app.route("/api/article/<int:id>/next")
def get_article_next(id):
    """Return the id of the next unverified article after the given id.

    Looks for the lowest id greater than `id` whose `verified` field is 0.
    When there is none, falls back to get_article_first().
    """
    with db:
        # Query by "id > current" rather than "id + 1": ids may have gaps, and
        # get_by_id() raises DoesNotExist for a missing row.
        following = (
            models.ArticleDownload
            .select(models.ArticleDownload.id)
            .where(
                (models.ArticleDownload.id > id)
                & (models.ArticleDownload.verified == 0)
            )
            .order_by(models.ArticleDownload.id)
            .first()
        )

    if following is not None:
        return {"id": following.id}
    return get_article_first()
|
||||
|
||||
|
||||
|
||||
@app.route("/api/article/<int:id>/set", methods=['POST'])
def set_article(id):
    """Apply a reviewer action to the article with the given id.

    The JSON request body must contain "action":
      "a" sets verified to 1, "b" sets verified to -1, "r" attaches related
      file names. Any other action changes nothing. The article is saved in
      every case and the response body is "ok".
    """
    action = request.json['action']
    with db:
        article = models.ArticleDownload.get_by_id(id)
        if action == "a":
            article.verified = 1
        elif action == "b":
            article.verified = -1
        elif action == "r":
            # set_related() requires the list of file names. Read it from an
            # optional "related" field so that a request without it is a
            # no-op instead of a TypeError.
            # NOTE(review): the client does not send "related" yet - confirm the field name.
            article.set_related(request.json.get("related", []))
        article.save()
        return "ok"
|
||||
|
||||
|
||||
|
||||
|
16
news_check/server/configuration.py
Normal file
16
news_check/server/configuration.py
Normal file
@ -0,0 +1,16 @@
|
||||
from peewee import PostgresqlDatabase
|
||||
import configparser
|
||||
|
||||
main_config = configparser.ConfigParser()
|
||||
main_config.read("/app/containerdata/config/news_fetch.config.ini")
|
||||
|
||||
db_config = configparser.ConfigParser()
|
||||
db_config.read("/app/containerdata/config/db.config.ini")
|
||||
|
||||
cred = db_config["DATABASE"]
|
||||
db = PostgresqlDatabase(
|
||||
cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
|
||||
)
|
||||
|
||||
import models
|
||||
models.set_db(db)
|
134
news_check/server/models.py
Normal file
134
news_check/server/models.py
Normal file
@ -0,0 +1,134 @@
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from peewee import *
|
||||
import os
|
||||
import datetime
|
||||
import configuration
|
||||
|
||||
config = configuration.main_config["DOWNLOADS"]
|
||||
|
||||
# set the nature of the db at runtime
|
||||
download_db = DatabaseProxy()
|
||||
|
||||
|
||||
class DownloadBaseModel(Model):
|
||||
class Meta:
|
||||
database = download_db
|
||||
|
||||
|
||||
|
||||
## == Article related models == ##
|
||||
class ArticleDownload(DownloadBaseModel):
|
||||
# in the beginning this is all we have
|
||||
article_url = TextField(default = '', unique=True)
|
||||
|
||||
# fetch then fills in the metadata
|
||||
title = TextField(default='')
|
||||
|
||||
summary = TextField(default = '')
|
||||
source_name = CharField(default = '')
|
||||
language = CharField(default = '')
|
||||
|
||||
|
||||
file_name = TextField(default = '')
|
||||
@property
|
||||
def save_path(self):
|
||||
return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
|
||||
@property
def fname_nas(self, file_name=""):
    """Describe where the article's file lives on the NAS.

    Returns a string of the form "NAS: <remote_storage_path>/<year>/<month name>/<file>",
    or None when the article has no download date.
    """
    if not self.download_date:
        return None

    # Accessed as a property, only `self` is ever passed, so `file_name`
    # keeps its default and the article's own file name is used.
    name = file_name if file_name else self.file_name
    month_dir = f"{config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}"
    return f"NAS: {month_dir}/{name}"
|
||||
|
||||
|
||||
archive_url = TextField(default = '')
|
||||
pub_date = DateField(default = datetime.date.fromtimestamp(0))
|
||||
download_date = DateField(default = datetime.date.today)
|
||||
|
||||
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
||||
|
||||
sent = BooleanField(default = False)
|
||||
|
||||
archived_by = CharField(default = os.getenv("UNAME"))
|
||||
# need to know who saved the message because the file needs to be on their computer in order to get verified
|
||||
# verification happens in a different app, but the model has the fields here as well
|
||||
comment = TextField(default = '')
|
||||
verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad
|
||||
|
||||
# authors
|
||||
# keywords
|
||||
# ... are added through foreignkeys
|
||||
# we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
"id": self.id,
|
||||
"article_url": self.article_url,
|
||||
"title": self.title,
|
||||
"summary": self.summary,
|
||||
"source_name": self.source_name,
|
||||
"language": self.language,
|
||||
"file_name": self.file_name,
|
||||
"save_path": self.save_path,
|
||||
"fname_nas": self.fname_nas,
|
||||
"archive_url": self.archive_url,
|
||||
"pub_date": self.pub_date.strftime("%Y-%m-%d"),
|
||||
"download_date": self.download_date.strftime("%Y-%m-%d"),
|
||||
"sent": self.sent,
|
||||
"comment": self.comment,
|
||||
"related": [r.related_file_name for r in self.related],
|
||||
"authors": [a.author for a in self.authors]
|
||||
}
|
||||
|
||||
|
||||
|
||||
def set_related(self, related):
    """Attach related file names to this article.

    Args:
        related: iterable of file-name strings.

    Raises:
        Exception: if any name is longer than 255 characters. All names are
            checked before the first row is created, so a rejected call
            writes nothing.
    """
    names = list(related)  # materialise once: the input is traversed twice

    # Validate everything up front so a bad name cannot leave a partial write.
    for name in names:
        if len(name) > 255:
            raise Exception("Related file name too long for POSTGRES")

    for name in names:
        ArticleRelated.create(
            article = self,
            related_file_name = name
        )
|
||||
|
||||
def file_status(self):
    """Check that the article's file is present on disk.

    Returns:
        (True, {}) when `save_path + file_name` exists; otherwise
        (False, {"reply_text": <reason>, "file_path": None}) after logging
        the reason as an error.
    """
    if self.file_name:
        location = self.save_path + self.file_name
        if os.path.exists(location):
            return True, {}
        logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
        reason = "Can't find file. Either the download failed or the file was moved."
    else:
        logger.error(f"Article {self} has no filename!")
        reason = "Download failed, no file was saved."

    return False, {"reply_text": reason, "file_path": None}
|
||||
|
||||
|
||||
class ArticleAuthor(DownloadBaseModel):
|
||||
article = ForeignKeyField(ArticleDownload, backref='authors')
|
||||
author = CharField()
|
||||
|
||||
|
||||
class ArticleRelated(DownloadBaseModel):
|
||||
# Related files, such as the full text of a paper, audio files, etc.
|
||||
article = ForeignKeyField(ArticleDownload, backref='related')
|
||||
related_file_name = TextField(default = '')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def set_db(download_db_object):
|
||||
download_db.initialize(download_db_object)
|
||||
with download_db: # create tables (does nothing if they exist already)
|
||||
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
||||
|
||||
|
@ -1,20 +0,0 @@
|
||||
import peewee
|
||||
|
||||
db = peewee.PostgresqlDatabase('coss_archiving', user='ca_rw', password='pleasechangeit', host='vpn', port=5432)
|
||||
# db.connect()
|
||||
|
||||
|
||||
class Pet(peewee.Model):
|
||||
name = peewee.CharField()
|
||||
animal_type = peewee.CharField()
|
||||
|
||||
class Meta:
|
||||
database = db # this model uses the "people.db" database
|
||||
with db:
|
||||
db.create_tables([Pet])
|
||||
db.get_tables()
|
||||
|
||||
t = Pet.create(name="Test", animal_type="test")
|
||||
|
||||
for pet in Pet.select():
|
||||
print(pet.name)
|
@ -8,3 +8,4 @@ newspaper3k
|
||||
htmldate
|
||||
markdown
|
||||
rich
|
||||
psycopg2
|
@ -123,7 +123,6 @@ class Coordinator(Thread):
|
||||
unsent = models.ArticleDownload.filter(sent = False)
|
||||
# .objects.filter(sent = False)
|
||||
for a in unsent:
|
||||
print(a)
|
||||
self.incoming_request(article=a)
|
||||
|
||||
|
||||
@ -170,7 +169,7 @@ class Coordinator(Thread):
|
||||
|
||||
for article in articles:
|
||||
notifier = lambda article: logger.info(f"Completed manual actions for {article}")
|
||||
ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
|
||||
ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
|
||||
|
||||
def article_complete_notifier(self, article):
|
||||
if self.worker_slack is None:
|
||||
@ -192,7 +191,7 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
if "upload" in sys.argv:
|
||||
articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
|
||||
# Combine the two conditions with peewee's `|` operator (each side parenthesised);
# Python's `or` would evaluate to the first expression only and drop the second.
articles = models.ArticleDownload.select().where((models.ArticleDownload.archive_url == "") | (models.ArticleDownload.archive_url == "TODO:UPLOAD")).execute()
|
||||
logger.info(f"Launching upload to archive for {len(articles)} articles.")
|
||||
coordinator.manual_processing(articles, [UploadWorker()])
|
||||
|
||||
|
@ -4,7 +4,6 @@ logger = logging.getLogger(__name__)
|
||||
from peewee import *
|
||||
import os
|
||||
import markdown
|
||||
import re
|
||||
import configuration
|
||||
import datetime
|
||||
|
||||
@ -28,7 +27,7 @@ class ArticleDownload(DownloadBaseModel):
|
||||
article_url = TextField(default = '', unique=True)
|
||||
|
||||
# fetch then fills in the metadata
|
||||
title = CharField(default='')
|
||||
title = TextField(default='')
|
||||
@property
|
||||
def is_title_bad(self): # add incrementally
|
||||
return "PUR-Abo" in self.title \
|
||||
@ -63,7 +62,7 @@ class ArticleDownload(DownloadBaseModel):
|
||||
|
||||
|
||||
archive_url = TextField(default = '')
|
||||
pub_date = DateField(default = '')
|
||||
pub_date = DateField(default = datetime.date.fromtimestamp(0))
|
||||
download_date = DateField(default = datetime.date.today)
|
||||
|
||||
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
||||
@ -143,6 +142,7 @@ class ArticleDownload(DownloadBaseModel):
|
||||
|
||||
def set_authors(self, authors):
|
||||
for a in authors:
|
||||
if len(a) < 100: # otherwise it's a mismatched string
|
||||
ArticleAuthor.create(
|
||||
article = self,
|
||||
author = a
|
||||
@ -150,6 +150,9 @@ class ArticleDownload(DownloadBaseModel):
|
||||
|
||||
def set_related(self, related):
|
||||
for r in related:
|
||||
if len(r) > 255:
|
||||
raise Exception("Related file name too long for POSTGRES")
|
||||
|
||||
ArticleRelated.create(
|
||||
article = self,
|
||||
related_file_name = r
|
||||
@ -182,116 +185,7 @@ class ArticleRelated(DownloadBaseModel):
|
||||
|
||||
|
||||
|
||||
# class Thread(ChatBaseModel):
|
||||
# """The threads that concern us are only created if the base massage contains a url"""
|
||||
# thread_ts = FloatField(default = 0)
|
||||
# article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
|
||||
# # provides, ts, user, models
|
||||
# # messages
|
||||
|
||||
# @property
|
||||
# def slack_ts(self):
|
||||
# str_ts = str(self.thread_ts)
|
||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem!
|
||||
# return "{}{}".format(str_ts, cut_zeros*"0")
|
||||
|
||||
# @property
|
||||
# def initiator_message(self):
|
||||
# try:
|
||||
# return self.messages[0] # TODO check if this needs sorting
|
||||
# except IndexError:
|
||||
# logger.warning(f"Thread {self} is empty. How can that be?")
|
||||
# return None
|
||||
|
||||
# @property
|
||||
# def message_count(self):
|
||||
# # logger.warning("message_count was called")
|
||||
# return self.messages.count()
|
||||
|
||||
# @property
|
||||
# def last_message(self):
|
||||
# messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
|
||||
# return messages[-1]
|
||||
|
||||
# @property
|
||||
# def is_fully_processed(self) -> bool:
|
||||
# init_message = self.initiator_message
|
||||
# if init_message is None:
|
||||
# return False
|
||||
|
||||
# if init_message.is_processed_override:
|
||||
# return True
|
||||
# # this override is set for instance, when no url was sent at all. Then set this thread to be ignored
|
||||
|
||||
# reactions = init_message.reaction
|
||||
# if not reactions:
|
||||
# return False
|
||||
# else:
|
||||
# r = reactions[0].type # can and should only have one reaction
|
||||
# return r == "white_check_mark" \
|
||||
# or r == "x"
|
||||
|
||||
|
||||
|
||||
# class Message(ChatBaseModel):
|
||||
# ts = FloatField(unique=True) #for sorting
|
||||
# channel_id = CharField(default='')
|
||||
# user = ForeignKeyField(User, backref="messages")
|
||||
# text = TextField(default='')
|
||||
# thread = ForeignKeyField(Thread, backref="messages", default=None)
|
||||
# file_type = CharField(default='')
|
||||
# perma_link = CharField(default='')
|
||||
# is_processed_override = BooleanField(default=False)
|
||||
# # reaction
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return "MSG [{}]".format(shorten_name(self.text).replace('\n','/'))
|
||||
|
||||
# @property
|
||||
# def slack_ts(self):
|
||||
# str_ts = str(self.ts)
|
||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there a 6 decimals. If there are less, problem!
|
||||
# return "{}{}".format(str_ts, cut_zeros * "0")
|
||||
|
||||
|
||||
# @property
|
||||
# def urls(self):
|
||||
# pattern = r"<(.*?)>"
|
||||
# matches = re.findall(pattern, self.text)
|
||||
# matches = [m for m in matches if "." in m]
|
||||
|
||||
# new_matches = []
|
||||
# for m in matches:
|
||||
# if "." in m: # must contain a tld, right?
|
||||
# # further complication: slack automatically abreviates urls in the format:
|
||||
# # <url|link preview>. Lucky for us, "|" is a character derecommended in urls, meaning we can "safely" split for it and retain the first half
|
||||
# if "|" in m:
|
||||
# keep = m.split("|")[0]
|
||||
# else:
|
||||
# keep = m
|
||||
# new_matches.append(keep)
|
||||
# return new_matches
|
||||
|
||||
# @property
|
||||
# def is_by_human(self):
|
||||
# return self.user.user_id != slack_config["bot_id"]
|
||||
|
||||
|
||||
# @property
|
||||
# def has_single_url(self):
|
||||
# return len(self.urls) == 1
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def set_db(download_db_object):
|
||||
download_db.initialize(download_db_object)
|
||||
with download_db: # create tables (does nothing if they exist already)
|
||||
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user