few bugs in news_fetch left, news_check wip
This commit is contained in:
parent
2e65828bbb
commit
713406dc67
@ -34,7 +34,7 @@ services:
|
|||||||
|
|
||||||
|
|
||||||
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
|
geckodriver: # separate docker container for pdf-download. This hugely improves stability (and creates shorter build times for the containers)
|
||||||
image: selenium/standalone-firefox:103.0 # latest version because it mirrors the locally installed version (which is automatically updated)
|
image: ${GECKODRIVER_IMG}
|
||||||
environment:
|
environment:
|
||||||
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
|
- START_VNC=${HEADFULL-false} # as opposed to headless, used when requiring supervision (eg. for websites that crash)
|
||||||
- START_XVFB=${HEADFULL-false}
|
- START_XVFB=${HEADFULL-false}
|
||||||
|
10
launch
10
launch
@ -5,10 +5,12 @@ set -o ignoreeof
|
|||||||
echo "Bash script launching COSS_ARCHIVING..."
|
echo "Bash script launching COSS_ARCHIVING..."
|
||||||
|
|
||||||
|
|
||||||
# CHANGE ME!
|
# CHANGE ME ONCE!
|
||||||
export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
|
export CONTAINER_DATA=~/Bulk/COSS/Downloads/coss_archiving
|
||||||
export UNAME=remy
|
export UNAME=remy
|
||||||
|
# CHANGE ME WHEN UPDATING FIREFOX
|
||||||
|
export GECKODRIVER_IMG=selenium/standalone-firefox:103.0
|
||||||
|
# version must be >= the one on the host, or firefox will not start (because of mismatched config)
|
||||||
|
|
||||||
if [[ $1 == "debug" ]]
|
if [[ $1 == "debug" ]]
|
||||||
then
|
then
|
||||||
@ -16,8 +18,8 @@ then
|
|||||||
export HEADFULL=true
|
export HEADFULL=true
|
||||||
export CODE=./
|
export CODE=./
|
||||||
export ENTRYPOINT=/bin/bash
|
export ENTRYPOINT=/bin/bash
|
||||||
# since service ports is not enough here, also execute up, which will
|
# since service ports does not open ports on implicitly started containers, also start geckodriver:
|
||||||
docker compose up -d
|
docker compose up -d geckodriver
|
||||||
elif [[ $1 == "production" ]]
|
elif [[ $1 == "production" ]]
|
||||||
then
|
then
|
||||||
export DEBUG=false
|
export DEBUG=false
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import sys
|
import sys
|
||||||
from webbrowser import get
|
|
||||||
sys.path.append("../app")
|
sys.path.append("../app")
|
||||||
import runner
|
import runner
|
||||||
import logging
|
import logging
|
||||||
|
170
misc/migration.to_postgres.py
Normal file
170
misc/migration.to_postgres.py
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
import datetime
|
||||||
|
import sys
|
||||||
|
sys.path.append("../news_fetch/")
|
||||||
|
import configuration # lives in app
|
||||||
|
from peewee import *
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
old_db = SqliteDatabase("/app/containerdata/downloads.db")
|
||||||
|
|
||||||
|
cred = configuration.db_config["DATABASE"]
|
||||||
|
download_db = PostgresqlDatabase(
|
||||||
|
cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
|
||||||
|
)
|
||||||
|
|
||||||
|
## OLD Models
|
||||||
|
class OLDModel(Model):
|
||||||
|
class Meta:
|
||||||
|
database = old_db
|
||||||
|
|
||||||
|
|
||||||
|
class OLDArticleDownload(OLDModel):
|
||||||
|
class Meta:
|
||||||
|
db_table = 'articledownload'
|
||||||
|
|
||||||
|
title = CharField(default='')
|
||||||
|
pub_date = DateField(default = '')
|
||||||
|
download_date = DateField(default = 0)
|
||||||
|
source_name = CharField(default = '')
|
||||||
|
article_url = TextField(default = '', unique=True)
|
||||||
|
archive_url = TextField(default = '')
|
||||||
|
file_name = TextField(default = '')
|
||||||
|
language = CharField(default = '')
|
||||||
|
summary = TextField(default = '')
|
||||||
|
comment = TextField(default = '')
|
||||||
|
verified = IntegerField(default = False)
|
||||||
|
# authors
|
||||||
|
# keywords
|
||||||
|
# ... are added through foreignkeys
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class OLDArticleAuthor(OLDModel):
|
||||||
|
class Meta:
|
||||||
|
db_table = 'articleauthor'
|
||||||
|
|
||||||
|
article = ForeignKeyField(OLDArticleDownload, backref='authors')
|
||||||
|
author = CharField()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class OLDArticleRelated(OLDModel):
|
||||||
|
class Meta:
|
||||||
|
db_table = 'articlerelated'
|
||||||
|
|
||||||
|
article = ForeignKeyField(OLDArticleDownload, backref='related')
|
||||||
|
related_file_name = TextField(default = '')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## NEW Models
|
||||||
|
class NEWModel(Model):
|
||||||
|
class Meta:
|
||||||
|
database = download_db
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleDownload(NEWModel):
|
||||||
|
# in the beginning this is all we have
|
||||||
|
article_url = TextField(default = '', unique=True)
|
||||||
|
# fetch then fills in the metadata
|
||||||
|
title = TextField(default='')
|
||||||
|
summary = TextField(default = '')
|
||||||
|
source_name = CharField(default = '')
|
||||||
|
language = CharField(default = '')
|
||||||
|
file_name = TextField(default = '')
|
||||||
|
archive_url = TextField(default = '')
|
||||||
|
pub_date = DateField(default = '')
|
||||||
|
download_date = DateField(default = 0)
|
||||||
|
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
||||||
|
sent = BooleanField(default = False)
|
||||||
|
archived_by = CharField(default = os.getenv("UNAME"))
|
||||||
|
# need to know who saved the message because the file needs to be on their computer in order to get verified
|
||||||
|
# verification happens in a different app, but the model has the fields here as well
|
||||||
|
comment = TextField(default = '')
|
||||||
|
verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad
|
||||||
|
|
||||||
|
def set_authors(self, authors):
|
||||||
|
for a in authors:
|
||||||
|
if len(a) < 100:
|
||||||
|
ArticleAuthor.create(
|
||||||
|
article = self,
|
||||||
|
author = a
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_related(self, related):
|
||||||
|
for r in related:
|
||||||
|
ArticleRelated.create(
|
||||||
|
article = self,
|
||||||
|
related_file_name = r
|
||||||
|
)
|
||||||
|
|
||||||
|
# authors
|
||||||
|
# keywords
|
||||||
|
# ... are added through foreignkeys
|
||||||
|
# we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleAuthor(NEWModel):
|
||||||
|
article = ForeignKeyField(ArticleDownload, backref='authors')
|
||||||
|
author = CharField()
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleRelated(NEWModel):
|
||||||
|
# Related files, such as the full text of a paper, audio files, etc.
|
||||||
|
article = ForeignKeyField(ArticleDownload, backref='related')
|
||||||
|
related_file_name = TextField(default = '')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
####################################################################
|
||||||
|
# Migrate using sensible defaults:
|
||||||
|
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
||||||
|
|
||||||
|
it = 0
|
||||||
|
for old_art in OLDArticleDownload.select():
|
||||||
|
print("====================================================================")
|
||||||
|
it+=1
|
||||||
|
print(f"IT {it} New article with data:")
|
||||||
|
print(
|
||||||
|
old_art.article_url,
|
||||||
|
old_art.title,
|
||||||
|
old_art.summary,
|
||||||
|
old_art.source_name,
|
||||||
|
old_art.language,
|
||||||
|
old_art.file_name,
|
||||||
|
old_art.archive_url,
|
||||||
|
old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
|
||||||
|
old_art.download_date,
|
||||||
|
True,
|
||||||
|
old_art.comment,
|
||||||
|
old_art.verified
|
||||||
|
)
|
||||||
|
new_art = ArticleDownload.create(
|
||||||
|
article_url = old_art.article_url,
|
||||||
|
title = old_art.title,
|
||||||
|
summary = old_art.summary,
|
||||||
|
source_name = old_art.source_name,
|
||||||
|
language = old_art.language,
|
||||||
|
file_name = old_art.file_name,
|
||||||
|
archive_url = old_art.archive_url,
|
||||||
|
pub_date = old_art.pub_date if old_art.pub_date != "" else datetime.date.fromtimestamp(0),
|
||||||
|
download_date = old_art.download_date,
|
||||||
|
# slack_ts = FloatField(default = 0)
|
||||||
|
sent = True,
|
||||||
|
# archived_by = CharField(default = os.getenv("UNAME"))
|
||||||
|
comment = old_art.comment,
|
||||||
|
verified = old_art.verified
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
new_art.set_related([r.related_file_name for r in old_art.related])
|
||||||
|
new_art.set_authors([a.author for a in old_art.authors])
|
||||||
|
|
@ -2,16 +2,38 @@
|
|||||||
import PDFView from './PDFView.svelte';
|
import PDFView from './PDFView.svelte';
|
||||||
import ArticleStatus from './ArticleStatus.svelte';
|
import ArticleStatus from './ArticleStatus.svelte';
|
||||||
import ArticleOperations from './ArticleOperations.svelte';
|
import ArticleOperations from './ArticleOperations.svelte';
|
||||||
|
|
||||||
|
let current_id = 0;
|
||||||
|
|
||||||
|
const updateInterface = (async () => {
|
||||||
|
let url = '';
|
||||||
|
if (current_id == 0) {
|
||||||
|
url = '/api/article/first';
|
||||||
|
} else {
|
||||||
|
url = '/api/article/' + current_id + '/next';
|
||||||
|
}
|
||||||
|
const response = await fetch(url)
|
||||||
|
const data = await response.json()
|
||||||
|
current_id = data.id;
|
||||||
|
let article_url = '/api/article/' + current_id + '/get';
|
||||||
|
const article_response = await fetch(article_url);
|
||||||
|
const article_data = await article_response.json();
|
||||||
|
return article_data;
|
||||||
|
})()
|
||||||
|
|
||||||
|
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
{#await updateInterface}
|
||||||
<div class="flex w-full h-full gap-5 p-5">
|
...
|
||||||
<div class="w-3/5"><PDFView/></div>
|
{:then article_data}
|
||||||
|
<div class="flex w-full h-screen gap-5 p-5">
|
||||||
|
<div class="w-3/5"><PDFView article_data={article_data}/></div>
|
||||||
<div class="divider divider-horizontal"></div>
|
<div class="divider divider-horizontal"></div>
|
||||||
<div class="w-2/5">
|
<div class="w-2/5">
|
||||||
<ArticleStatus article_id={42}/>
|
<ArticleStatus article_data={article_data}/>
|
||||||
<div class="divider divider-vertical"></div>
|
<div class="divider divider-vertical"></div>
|
||||||
<ArticleOperations/>
|
<ArticleOperations article_data={article_data}/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{/await}
|
||||||
|
@ -1,28 +1,93 @@
|
|||||||
<div class="toast">
|
<script>
|
||||||
<div class="alert alert-info">
|
import {fade} from 'svelte/transition';
|
||||||
<div>
|
|
||||||
<span>New message arrived.</span>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="grid grid-cols-3 gap-4">
|
export let article_data;
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
<div class="highlight">01</div>
|
|
||||||
|
|
||||||
</div>
|
const actions = [
|
||||||
<style>
|
{name: 'Mark as good (and skip to next)', kbd: 'A'},
|
||||||
.highlight {
|
{name: 'Mark as bad (and skip to next)', kbd: 'B'},
|
||||||
background-color: #f5f5f5;
|
{name: 'Upload related file', kbd: 'R'},
|
||||||
border-radius: 5px;
|
{name: 'Skip', kbd: 'ctrl'},
|
||||||
padding: 10px;
|
]
|
||||||
margin: 10px;
|
|
||||||
|
const toast_states = {
|
||||||
|
'success' : {class: 'alert-success', text: 'Article updated successfully'},
|
||||||
|
'error' : {class: 'alert-error', text: 'Article update failed'},
|
||||||
}
|
}
|
||||||
</style>
|
let toast_state = {};
|
||||||
|
let toast_visible = false;
|
||||||
|
|
||||||
|
|
||||||
|
function onKeyDown(e) {apiAction(e.key)}
|
||||||
|
function apiAction(key) {
|
||||||
|
if (actions.map(d => d.kbd.toLowerCase()).includes(key.toLowerCase())){ // ignore other keypresses
|
||||||
|
|
||||||
|
const updateArticle = (async() => {
|
||||||
|
const response = await fetch('/api/article/' + article_data.id + '/set', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type': 'application/json'},
|
||||||
|
body: JSON.stringify({
|
||||||
|
'action': key.toLowerCase(),
|
||||||
|
})
|
||||||
|
})
|
||||||
|
const success = response.status == 200;
|
||||||
|
|
||||||
|
if (success){
|
||||||
|
showToast('success');
|
||||||
|
} else {
|
||||||
|
showToast('error');
|
||||||
|
}
|
||||||
|
|
||||||
|
})()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function showToast(state){
|
||||||
|
toast_visible = true;
|
||||||
|
toast_state = toast_states[state];
|
||||||
|
setTimeout(() => {
|
||||||
|
toast_visible = false;
|
||||||
|
}, 1000)
|
||||||
|
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="card bg-neutral-300 shadow-xl">
|
||||||
|
<div class="card-body">
|
||||||
|
<h2 class="card-title">Your options: (click on action or use keyboard)</h2>
|
||||||
|
<div class="overflow-x-auto">
|
||||||
|
<table class="table w-full table-compact">
|
||||||
|
<!-- head -->
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Action</th>
|
||||||
|
<th>Keyboard shortcut</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{#each actions as action}
|
||||||
|
|
||||||
|
<tr>
|
||||||
|
<td><button on:click={() => apiAction(action.kbd)}>{ action.name }</button></td>
|
||||||
|
<td><kbd class="kbd">{ action.kbd }</kbd></td>
|
||||||
|
</tr>
|
||||||
|
|
||||||
|
{/each}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<svelte:window on:keydown|preventDefault={onKeyDown} />
|
||||||
|
|
||||||
|
{#if toast_visible}
|
||||||
|
<div class="toast" transition:fade>
|
||||||
|
<div class="alert { toast_state.class }">
|
||||||
|
<div>
|
||||||
|
<span>{ toast_state.text }.</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{/if}
|
@ -1,25 +1,38 @@
|
|||||||
<script>
|
<script>
|
||||||
export let article_id;
|
export let article_data;
|
||||||
const Article = (async () => {
|
const status_items = [
|
||||||
const response = await fetch('/api/article/' + article_id + '/get')
|
{name: 'Title', value: article_data.title},
|
||||||
return await response.json()
|
{name: 'Filename', value: article_data.file_name},
|
||||||
})()
|
{name: 'Language', value: article_data.language},
|
||||||
console.log(Article)
|
{name: 'Authors', value: article_data.authors},
|
||||||
|
{name: "Related", value: article_data.related},
|
||||||
|
]
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<div class="mockup-window border bg-base-300">
|
<div class="card bg-neutral-300 shadow-xl overflow-x-auto">
|
||||||
<h1 class="center">Article overview</h1>
|
<div class="card-body">
|
||||||
<ul tabindex="0" class="menu p-2 shadow bg-base-100 rounded-box w-52">
|
<h2 class="card-title">Article overview:</h2>
|
||||||
{#await Article}
|
<table class="table w-full table-compact" style="table-layout: fixed">
|
||||||
<li>...waiting</li>
|
<thead>
|
||||||
{:then data}
|
<tr>
|
||||||
<li><a href="#">{data.value}</a></li>
|
<th>Attribute</th>
|
||||||
<li><a href="#">Item 2</a></li>
|
<th>Value</th>
|
||||||
{:catch error}
|
</tr>
|
||||||
<li>An error occurred!</li>
|
</thead>
|
||||||
{/await}
|
<tbody>
|
||||||
|
{#each status_items as item}
|
||||||
|
<tr>
|
||||||
</ul>
|
<td>{ item.name }</td>
|
||||||
|
<!-- <td>Quality Control Specialist</td> -->
|
||||||
|
{#if item.value != ""}
|
||||||
|
<td class='bg-emerald-200' style="white-space: normal">{ item.value }</td>
|
||||||
|
{:else}
|
||||||
|
<td class='bg-red-200'>{ item.value }</td>
|
||||||
|
{/if}
|
||||||
|
</tr>
|
||||||
|
{/each}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
@ -1,64 +1,10 @@
|
|||||||
<!--
|
|
||||||
<script>
|
|
||||||
var myState = {
|
|
||||||
pdf: null,
|
|
||||||
currentPage: 1,
|
|
||||||
zoom: 1
|
|
||||||
}
|
|
||||||
|
|
||||||
pdfjsLib.getDocument('test.pdf').then((pdf) => {
|
|
||||||
|
|
||||||
myState.pdf = pdf;
|
|
||||||
render();
|
|
||||||
|
|
||||||
});
|
|
||||||
|
|
||||||
function render() {
|
|
||||||
myState.pdf.getPage(myState.currentPage).then((page) => {
|
|
||||||
|
|
||||||
var canvas = document.getElementById("pdf_renderer");
|
|
||||||
var ctx = canvas.getContext('2d');
|
|
||||||
|
|
||||||
var viewport = page.getViewport(myState.zoom);
|
|
||||||
|
|
||||||
canvas.width = viewport.width;
|
|
||||||
canvas.height = viewport.height;
|
|
||||||
|
|
||||||
page.render({
|
|
||||||
canvasContext: ctx,
|
|
||||||
viewport: viewport
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
</script>
|
|
||||||
-->
|
|
||||||
<!-- <div id="my_pdf_viewer">
|
|
||||||
<div class="mockup-window border bg-base-300">
|
|
||||||
<div id="canvas_container" class="flex justify-center">
|
|
||||||
<canvas id="pdf_renderer"></canvas>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
|
|
||||||
<div id="navigation_controls">
|
|
||||||
<button id="go_previous">Previous</button>
|
|
||||||
<input id="current_page" value="1" type="number"/>
|
|
||||||
<button id="go_next">Next</button>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="zoom_controls">
|
|
||||||
<button id="zoom_in">+</button>
|
|
||||||
<button id="zoom_out">-</button>
|
|
||||||
</div>
|
|
||||||
</div> -->
|
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
let pdf_file = 'test.pdf';
|
export let article_data;
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<div class="mockup-window border bg-base-300 h-full w-full">
|
<div class="h-full w-full shadow-xl">
|
||||||
<object class="pdf-view" data="{pdf_file}" title="Article PDF"> </object>
|
<object class="pdf-view" data="{article_data.save_path + article_data.file_name}" title="Article PDF"> </object>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import json
|
from flask import Flask, send_from_directory, request
|
||||||
from flask import Flask, send_from_directory, jsonify
|
import configuration
|
||||||
import random
|
models = configuration.models
|
||||||
|
db = configuration.db
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
|
||||||
@ -9,26 +9,55 @@ app = Flask(__name__)
|
|||||||
# SVELTE 'STATIC' BACKEND. Always send index.html and the requested js-files. (compiled by npm)
|
# SVELTE 'STATIC' BACKEND. Always send index.html and the requested js-files. (compiled by npm)
|
||||||
|
|
||||||
@app.route("/") #index.html
|
@app.route("/") #index.html
|
||||||
def base():
|
def index():
|
||||||
return send_from_directory('../client/public', 'index.html')
|
return send_from_directory('../client/public', 'index.html')
|
||||||
@app.route("/<path:path>") #js-files
|
@app.route("/<path:path>") #js-files
|
||||||
def home(path):
|
def js(path):
|
||||||
return send_from_directory('../client/public', path)
|
return send_from_directory('../client/public', path)
|
||||||
|
@app.route("/app/containerdata/files/<path:path>")
|
||||||
|
def static_pdf(path):
|
||||||
|
return send_from_directory('/app/containerdata/files/', path)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
# API for news_check.
|
# (simple) API for news_check.
|
||||||
|
|
||||||
@app.route("/api/article/<int:id>/get")
|
@app.route("/api/article/<int:id>/get")
|
||||||
def get_article(id):
|
def get_article_by_id(id):
|
||||||
res = {"value": id}
|
with db:
|
||||||
return jsonify(res)
|
article = models.ArticleDownload.get_by_id(id)
|
||||||
|
return article.to_dict()
|
||||||
|
|
||||||
|
@app.route("/api/article/first")
def get_article_first():
    """Return the id of the unverified article with the lowest id.

    Responds with ``{"id": <int>}``. When no article has ``verified == 0``
    a 404 with a JSON error message is returned instead of crashing.
    """
    with db:
        article = (
            models.ArticleDownload
            .select(models.ArticleDownload.id)
            .where(models.ArticleDownload.verified == 0)
            .order_by(models.ArticleDownload.id)
            .first()
        )
        if article is None:
            # .first() yields None for an empty result set; reading .id on it
            # would raise AttributeError and surface as an HTTP 500.
            return {"error": "no unverified articles left"}, 404
        return {"id": article.id}
|
||||||
|
|
||||||
|
@app.route("/api/article/<int:id>/next")
def get_article_next(id):
    """Return the id of the article to review after article ``id``.

    If the row with primary key ``id + 1`` exists and is still unverified
    (``verified == 0``), ``{"id": id + 1}`` is returned. In every other case
    the response of ``get_article_first()`` is returned.
    """
    with db:
        # get_or_none() instead of get_by_id(): the latter raises DoesNotExist
        # (-> HTTP 500) when id + 1 is past the last row or falls into a gap
        # in the id sequence.
        candidate = models.ArticleDownload.get_or_none(
            models.ArticleDownload.id == id + 1
        )
        if candidate is not None and candidate.verified == 0:
            return {"id": id + 1}
        return get_article_first()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/api/article/<int:id>/set", methods=['POST'])
|
@app.route("/api/article/<int:id>/set", methods=['POST'])
|
||||||
def set_article(id):
|
def set_article(id):
|
||||||
return str(random.randint(0, 100))
|
action = request.json['action']
|
||||||
|
with db:
|
||||||
|
article = models.ArticleDownload.get_by_id(id)
|
||||||
|
if action == "a":
|
||||||
|
article.verified = 1
|
||||||
|
elif action == "b":
|
||||||
|
article.verified = -1
|
||||||
|
elif action == "r":
|
||||||
|
article.set_related()
|
||||||
|
article.save()
|
||||||
|
return "ok"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
16
news_check/server/configuration.py
Normal file
16
news_check/server/configuration.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from peewee import PostgresqlDatabase
|
||||||
|
import configparser
|
||||||
|
|
||||||
|
main_config = configparser.ConfigParser()
|
||||||
|
main_config.read("/app/containerdata/config/news_fetch.config.ini")
|
||||||
|
|
||||||
|
db_config = configparser.ConfigParser()
|
||||||
|
db_config.read("/app/containerdata/config/db.config.ini")
|
||||||
|
|
||||||
|
cred = db_config["DATABASE"]
|
||||||
|
db = PostgresqlDatabase(
|
||||||
|
cred["db_name"], user=cred["user_name"], password=cred["password"], host="vpn", port=5432
|
||||||
|
)
|
||||||
|
|
||||||
|
import models
|
||||||
|
models.set_db(db)
|
134
news_check/server/models.py
Normal file
134
news_check/server/models.py
Normal file
@ -0,0 +1,134 @@
|
|||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
from peewee import *
|
||||||
|
import os
|
||||||
|
import datetime
|
||||||
|
import configuration
|
||||||
|
|
||||||
|
config = configuration.main_config["DOWNLOADS"]
|
||||||
|
|
||||||
|
# set the nature of the db at runtime
|
||||||
|
download_db = DatabaseProxy()
|
||||||
|
|
||||||
|
|
||||||
|
class DownloadBaseModel(Model):
|
||||||
|
class Meta:
|
||||||
|
database = download_db
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## == Article related models == ##
|
||||||
|
class ArticleDownload(DownloadBaseModel):
|
||||||
|
# in the beginning this is all we have
|
||||||
|
article_url = TextField(default = '', unique=True)
|
||||||
|
|
||||||
|
# fetch then fills in the metadata
|
||||||
|
title = TextField(default='')
|
||||||
|
|
||||||
|
summary = TextField(default = '')
|
||||||
|
source_name = CharField(default = '')
|
||||||
|
language = CharField(default = '')
|
||||||
|
|
||||||
|
|
||||||
|
file_name = TextField(default = '')
|
||||||
|
@property
|
||||||
|
def save_path(self):
|
||||||
|
return f"{config['local_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/"
|
||||||
|
@property
|
||||||
|
def fname_nas(self, file_name=""):
|
||||||
|
if self.download_date:
|
||||||
|
if file_name:
|
||||||
|
return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{file_name}"
|
||||||
|
else: # return the self. name
|
||||||
|
return f"NAS: {config['remote_storage_path']}/{self.download_date.year}/{self.download_date.strftime('%B')}/{self.file_name}"
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
archive_url = TextField(default = '')
|
||||||
|
pub_date = DateField(default = datetime.date.fromtimestamp(0))
|
||||||
|
download_date = DateField(default = datetime.date.today)
|
||||||
|
|
||||||
|
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
||||||
|
|
||||||
|
sent = BooleanField(default = False)
|
||||||
|
|
||||||
|
archived_by = CharField(default = os.getenv("UNAME"))
|
||||||
|
# need to know who saved the message because the file needs to be on their computer in order to get verified
|
||||||
|
# verification happens in a different app, but the model has the fields here as well
|
||||||
|
comment = TextField(default = '')
|
||||||
|
verified = IntegerField(default = 0) # 0 = not verified, 1 = verified, -1 = marked as bad
|
||||||
|
|
||||||
|
# authors
|
||||||
|
# keywords
|
||||||
|
# ... are added through foreignkeys
|
||||||
|
# we will also add an attribute named message, to reference which message should be replied to. This attribute does not need to be saved in the db
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
return {
|
||||||
|
"id": self.id,
|
||||||
|
"article_url": self.article_url,
|
||||||
|
"title": self.title,
|
||||||
|
"summary": self.summary,
|
||||||
|
"source_name": self.source_name,
|
||||||
|
"language": self.language,
|
||||||
|
"file_name": self.file_name,
|
||||||
|
"save_path": self.save_path,
|
||||||
|
"fname_nas": self.fname_nas,
|
||||||
|
"archive_url": self.archive_url,
|
||||||
|
"pub_date": self.pub_date.strftime("%Y-%m-%d"),
|
||||||
|
"download_date": self.download_date.strftime("%Y-%m-%d"),
|
||||||
|
"sent": self.sent,
|
||||||
|
"comment": self.comment,
|
||||||
|
"related": [r.related_file_name for r in self.related],
|
||||||
|
"authors": [a.author for a in self.authors]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def set_related(self, related):
|
||||||
|
for r in related:
|
||||||
|
if len(r) > 255:
|
||||||
|
raise Exception("Related file name too long for POSTGRES")
|
||||||
|
|
||||||
|
ArticleRelated.create(
|
||||||
|
article = self,
|
||||||
|
related_file_name = r
|
||||||
|
)
|
||||||
|
|
||||||
|
def file_status(self):
|
||||||
|
if not self.file_name:
|
||||||
|
logger.error(f"Article {self} has no filename!")
|
||||||
|
return False, {"reply_text": "Download failed, no file was saved.", "file_path": None}
|
||||||
|
|
||||||
|
file_path_abs = self.save_path + self.file_name
|
||||||
|
if not os.path.exists(file_path_abs):
|
||||||
|
logger.error(f"Article {self} has a filename, but the file does not exist at that location!")
|
||||||
|
return False, {"reply_text": "Can't find file. Either the download failed or the file was moved.", "file_path": None}
|
||||||
|
|
||||||
|
return True, {}
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleAuthor(DownloadBaseModel):
|
||||||
|
article = ForeignKeyField(ArticleDownload, backref='authors')
|
||||||
|
author = CharField()
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleRelated(DownloadBaseModel):
|
||||||
|
# Related files, such as the full text of a paper, audio files, etc.
|
||||||
|
article = ForeignKeyField(ArticleDownload, backref='related')
|
||||||
|
related_file_name = TextField(default = '')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def set_db(download_db_object):
|
||||||
|
download_db.initialize(download_db_object)
|
||||||
|
with download_db: # create tables (does nothing if they exist already)
|
||||||
|
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
||||||
|
|
||||||
|
|
@ -1,20 +0,0 @@
|
|||||||
import peewee
|
|
||||||
|
|
||||||
db = peewee.PostgresqlDatabase('coss_archiving', user='ca_rw', password='pleasechangeit', host='vpn', port=5432)
|
|
||||||
# db.connect()
|
|
||||||
|
|
||||||
|
|
||||||
class Pet(peewee.Model):
|
|
||||||
name = peewee.CharField()
|
|
||||||
animal_type = peewee.CharField()
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
database = db # this model uses the "people.db" database
|
|
||||||
with db:
|
|
||||||
db.create_tables([Pet])
|
|
||||||
db.get_tables()
|
|
||||||
|
|
||||||
t = Pet.create(name="Test", animal_type="test")
|
|
||||||
|
|
||||||
for pet in Pet.select():
|
|
||||||
print(pet.name)
|
|
@ -8,3 +8,4 @@ newspaper3k
|
|||||||
htmldate
|
htmldate
|
||||||
markdown
|
markdown
|
||||||
rich
|
rich
|
||||||
|
psycopg2
|
@ -123,7 +123,6 @@ class Coordinator(Thread):
|
|||||||
unsent = models.ArticleDownload.filter(sent = False)
|
unsent = models.ArticleDownload.filter(sent = False)
|
||||||
# .objects.filter(sent = False)
|
# .objects.filter(sent = False)
|
||||||
for a in unsent:
|
for a in unsent:
|
||||||
print(a)
|
|
||||||
self.incoming_request(article=a)
|
self.incoming_request(article=a)
|
||||||
|
|
||||||
|
|
||||||
@ -170,7 +169,7 @@ class Coordinator(Thread):
|
|||||||
|
|
||||||
for article in articles:
|
for article in articles:
|
||||||
notifier = lambda article: logger.info(f"Completed manual actions for {article}")
|
notifier = lambda article: logger.info(f"Completed manual actions for {article}")
|
||||||
ArticleWatcher(article, None, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
|
ArticleWatcher(article, workers_manual = workers, notifier = notifier) # Article watcher wants a thread to link article to TODO: handle threads as a kwarg
|
||||||
|
|
||||||
def article_complete_notifier(self, article):
|
def article_complete_notifier(self, article):
|
||||||
if self.worker_slack is None:
|
if self.worker_slack is None:
|
||||||
@ -192,7 +191,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
|
|
||||||
if "upload" in sys.argv:
|
if "upload" in sys.argv:
|
||||||
articles = models.ArticleDownload.select().where(models.ArticleDownload.archive_url == "").execute()
|
# Combine the two conditions with peewee's `|` operator. Python's `or` is evaluated
# eagerly and would keep only the first expression, dropping the "TODO:UPLOAD" check.
articles = models.ArticleDownload.select().where((models.ArticleDownload.archive_url == "") | (models.ArticleDownload.archive_url == "TODO:UPLOAD")).execute()
|
||||||
logger.info(f"Launching upload to archive for {len(articles)} articles.")
|
logger.info(f"Launching upload to archive for {len(articles)} articles.")
|
||||||
coordinator.manual_processing(articles, [UploadWorker()])
|
coordinator.manual_processing(articles, [UploadWorker()])
|
||||||
|
|
||||||
|
@ -4,7 +4,6 @@ logger = logging.getLogger(__name__)
|
|||||||
from peewee import *
|
from peewee import *
|
||||||
import os
|
import os
|
||||||
import markdown
|
import markdown
|
||||||
import re
|
|
||||||
import configuration
|
import configuration
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
@ -28,7 +27,7 @@ class ArticleDownload(DownloadBaseModel):
|
|||||||
article_url = TextField(default = '', unique=True)
|
article_url = TextField(default = '', unique=True)
|
||||||
|
|
||||||
# fetch then fills in the metadata
|
# fetch then fills in the metadata
|
||||||
title = CharField(default='')
|
title = TextField(default='')
|
||||||
@property
|
@property
|
||||||
def is_title_bad(self): # add incrementally
|
def is_title_bad(self): # add incrementally
|
||||||
return "PUR-Abo" in self.title \
|
return "PUR-Abo" in self.title \
|
||||||
@ -63,7 +62,7 @@ class ArticleDownload(DownloadBaseModel):
|
|||||||
|
|
||||||
|
|
||||||
archive_url = TextField(default = '')
|
archive_url = TextField(default = '')
|
||||||
pub_date = DateField(default = '')
|
pub_date = DateField(default = datetime.date.fromtimestamp(0))
|
||||||
download_date = DateField(default = datetime.date.today)
|
download_date = DateField(default = datetime.date.today)
|
||||||
|
|
||||||
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
slack_ts = FloatField(default = 0) # should be a fixed-length string but float is easier to sort by
|
||||||
@ -143,6 +142,7 @@ class ArticleDownload(DownloadBaseModel):
|
|||||||
|
|
||||||
def set_authors(self, authors):
|
def set_authors(self, authors):
|
||||||
for a in authors:
|
for a in authors:
|
||||||
|
if len(a) < 100: # otherwise it's a mismatched string
|
||||||
ArticleAuthor.create(
|
ArticleAuthor.create(
|
||||||
article = self,
|
article = self,
|
||||||
author = a
|
author = a
|
||||||
@ -150,6 +150,9 @@ class ArticleDownload(DownloadBaseModel):
|
|||||||
|
|
||||||
def set_related(self, related):
|
def set_related(self, related):
|
||||||
for r in related:
|
for r in related:
|
||||||
|
if len(r) > 255:
|
||||||
|
raise Exception("Related file name too long for POSTGRES")
|
||||||
|
|
||||||
ArticleRelated.create(
|
ArticleRelated.create(
|
||||||
article = self,
|
article = self,
|
||||||
related_file_name = r
|
related_file_name = r
|
||||||
@ -182,116 +185,7 @@ class ArticleRelated(DownloadBaseModel):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# class Thread(ChatBaseModel):
|
|
||||||
# """The threads that concern us are only created if the base message contains a url"""
|
|
||||||
# thread_ts = FloatField(default = 0)
|
|
||||||
# article = ForeignKeyField(ArticleDownload, backref="slack_thread", null=True, default=None)
|
|
||||||
# # provides, ts, user, models
|
|
||||||
# # messages
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def slack_ts(self):
|
|
||||||
# str_ts = str(self.thread_ts)
|
|
||||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem!
|
|
||||||
# return "{}{}".format(str_ts, cut_zeros*"0")
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def initiator_message(self):
|
|
||||||
# try:
|
|
||||||
# return self.messages[0] # TODO check if this needs sorting
|
|
||||||
# except IndexError:
|
|
||||||
# logger.warning(f"Thread {self} is empty. How can that be?")
|
|
||||||
# return None
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def message_count(self):
|
|
||||||
# # logger.warning("message_count was called")
|
|
||||||
# return self.messages.count()
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def last_message(self):
|
|
||||||
# messages = Message.select().where(Message.thread == self).order_by(Message.ts) # can't be empty by definition/creation
|
|
||||||
# return messages[-1]
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def is_fully_processed(self) -> bool:
|
|
||||||
# init_message = self.initiator_message
|
|
||||||
# if init_message is None:
|
|
||||||
# return False
|
|
||||||
|
|
||||||
# if init_message.is_processed_override:
|
|
||||||
# return True
|
|
||||||
# # this override is set for instance, when no url was sent at all. Then set this thread to be ignored
|
|
||||||
|
|
||||||
# reactions = init_message.reaction
|
|
||||||
# if not reactions:
|
|
||||||
# return False
|
|
||||||
# else:
|
|
||||||
# r = reactions[0].type # can and should only have one reaction
|
|
||||||
# return r == "white_check_mark" \
|
|
||||||
# or r == "x"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# class Message(ChatBaseModel):
|
|
||||||
# ts = FloatField(unique=True) #for sorting
|
|
||||||
# channel_id = CharField(default='')
|
|
||||||
# user = ForeignKeyField(User, backref="messages")
|
|
||||||
# text = TextField(default='')
|
|
||||||
# thread = ForeignKeyField(Thread, backref="messages", default=None)
|
|
||||||
# file_type = CharField(default='')
|
|
||||||
# perma_link = CharField(default='')
|
|
||||||
# is_processed_override = BooleanField(default=False)
|
|
||||||
# # reaction
|
|
||||||
|
|
||||||
# def __str__(self) -> str:
|
|
||||||
# return "MSG [{}]".format(shorten_name(self.text).replace('\n','/'))
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def slack_ts(self):
|
|
||||||
# str_ts = str(self.ts)
|
|
||||||
# cut_zeros = 6 - (len(str_ts) - str_ts.find(".") - 1) # usually there are 6 decimals. If there are fewer, problem!
|
|
||||||
# return "{}{}".format(str_ts, cut_zeros * "0")
|
|
||||||
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def urls(self):
|
|
||||||
# pattern = r"<(.*?)>"
|
|
||||||
# matches = re.findall(pattern, self.text)
|
|
||||||
# matches = [m for m in matches if "." in m]
|
|
||||||
|
|
||||||
# new_matches = []
|
|
||||||
# for m in matches:
|
|
||||||
# if "." in m: # must contain a tld, right?
|
|
||||||
# # further complication: slack automatically abbreviates urls in the format:
|
|
||||||
# # <url|link preview>. Lucky for us, "|" is a character discouraged in urls, meaning we can "safely" split on it and retain the first half
|
|
||||||
# if "|" in m:
|
|
||||||
# keep = m.split("|")[0]
|
|
||||||
# else:
|
|
||||||
# keep = m
|
|
||||||
# new_matches.append(keep)
|
|
||||||
# return new_matches
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def is_by_human(self):
|
|
||||||
# return self.user.user_id != slack_config["bot_id"]
|
|
||||||
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def has_single_url(self):
|
|
||||||
# return len(self.urls) == 1
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def set_db(download_db_object):
|
def set_db(download_db_object):
|
||||||
download_db.initialize(download_db_object)
|
download_db.initialize(download_db_object)
|
||||||
with download_db: # create tables (does nothing if they exist already)
|
with download_db: # create tables (does nothing if they exist already)
|
||||||
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
download_db.create_tables([ArticleDownload, ArticleAuthor, ArticleRelated])
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user