add node code

This commit is contained in:
Bruce Röttgers 2025-04-09 13:50:08 +02:00
parent e242a18408
commit 04b6e56d14
7 changed files with 414 additions and 0 deletions

8
.dockerignore Normal file

@ -0,0 +1,8 @@
.env
# NODE
node_modules
# PYTHON
__pycache__
.venv

3
.gitignore vendored

@ -8,3 +8,6 @@ wheels/
# Virtual environments
.venv
.env
node_modules

1
.prettierrc Normal file

@ -0,0 +1 @@
{}

12
Dockerfile Normal file

@ -0,0 +1,12 @@
# Container image that runs the Node-based extraction script.
FROM node:22
WORKDIR /app
# Copy only the manifests first so the dependency layer stays cached until
# package.json or package-lock.json changes.
COPY package.json package-lock.json ./
# `npm ci` installs exactly what package-lock.json pins and fails if the lock
# file and package.json disagree, instead of silently re-resolving versions
# the way `npm install` may.
RUN npm ci
# NOTE(review): the TypeScript script added next to this Dockerfile is named
# split-dump.ts - confirm that index.ts is really the intended entry point.
COPY index.ts .
# --experimental-strip-types lets Node execute the .ts file directly;
# --max-old-space-size raises the V8 old-space heap limit to 4096 MiB.
CMD [ "node", "--max-old-space-size=4096", "--experimental-strip-types", "index.ts" ]

183
package-lock.json generated Normal file

@ -0,0 +1,183 @@
{
"name": "mapvoyage-extract",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"dependencies": {
"sax": "^1.4.1",
"unbzip2-stream": "^1.4.3"
},
"devDependencies": {
"@types/node": "^22.14.0",
"@types/sax": "^1.2.7",
"@types/unbzip2-stream": "^1.4.3",
"prettier": "^3.4.2",
"typescript": "^5.8.2"
}
},
"node_modules/@types/node": {
"version": "22.14.0",
"resolved": "https://registry.npmjs.org/@types/node/-/node-22.14.0.tgz",
"integrity": "sha512-Kmpl+z84ILoG+3T/zQFyAJsU6EPTmOCj8/2+83fSN6djd6I4o7uOuGIH6vq3PrjY5BGitSbFuMN18j3iknubbA==",
"dev": true,
"license": "MIT",
"dependencies": {
"undici-types": "~6.21.0"
}
},
"node_modules/@types/sax": {
"version": "1.2.7",
"resolved": "https://registry.npmjs.org/@types/sax/-/sax-1.2.7.tgz",
"integrity": "sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/through": {
"version": "0.0.33",
"resolved": "https://registry.npmjs.org/@types/through/-/through-0.0.33.tgz",
"integrity": "sha512-HsJ+z3QuETzP3cswwtzt2vEIiHBk/dCcHGhbmG5X3ecnwFD/lPrMpliGXxSCg03L9AhrdwA4Oz/qfspkDW+xGQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/unbzip2-stream": {
"version": "1.4.3",
"resolved": "https://registry.npmjs.org/@types/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
"integrity": "sha512-D8X5uuJRISqc8YtwL8jNW2FpPdUOCYXbfD6zNROCTbVXK9nawucxh10tVXE3MPjnHdRA1LvB0zDxVya/lBsnYw==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/through": "*"
}
},
"node_modules/base64-js": {
"version": "1.5.1",
"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "MIT"
},
"node_modules/buffer": {
"version": "5.7.1",
"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "MIT",
"dependencies": {
"base64-js": "^1.3.1",
"ieee754": "^1.1.13"
}
},
"node_modules/ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"license": "BSD-3-Clause"
},
"node_modules/prettier": {
"version": "3.5.3",
"resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz",
"integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==",
"dev": true,
"license": "MIT",
"bin": {
"prettier": "bin/prettier.cjs"
},
"engines": {
"node": ">=14"
},
"funding": {
"url": "https://github.com/prettier/prettier?sponsor=1"
}
},
"node_modules/sax": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz",
"integrity": "sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==",
"license": "ISC"
},
"node_modules/through": {
"version": "2.3.8",
"resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
"integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==",
"license": "MIT"
},
"node_modules/typescript": {
"version": "5.8.2",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.2.tgz",
"integrity": "sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==",
"dev": true,
"license": "Apache-2.0",
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
},
"engines": {
"node": ">=14.17"
}
},
"node_modules/unbzip2-stream": {
"version": "1.4.3",
"resolved": "https://registry.npmjs.org/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
"integrity": "sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==",
"license": "MIT",
"dependencies": {
"buffer": "^5.2.1",
"through": "^2.3.8"
}
},
"node_modules/undici-types": {
"version": "6.21.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
"dev": true,
"license": "MIT"
}
}
}

15
package.json Normal file

@ -0,0 +1,15 @@
{
"private": true,
"packageManager": "pnpm@9.13.2+sha512.88c9c3864450350e65a33587ab801acf946d7c814ed1134da4a924f6df5a2120fd36b46aab68f7cd1d413149112d53c7db3a4136624cfd00ff1846a0c6cef48a",
"devDependencies": {
"@types/node": "^22.14.0",
"@types/sax": "^1.2.7",
"@types/unbzip2-stream": "^1.4.3",
"prettier": "^3.4.2",
"typescript": "^5.8.2"
},
"dependencies": {
"sax": "^1.4.1",
"unbzip2-stream": "^1.4.3"
}
}

192
split-dump.ts Normal file

@ -0,0 +1,192 @@
import fs from "fs";
import https from "https";
import path from "path";
import sax from "sax";
import bz2 from "unbzip2-stream";
import { createGunzip } from "zlib";
// Local storage configuration
// Directory that receives every generated file. saveToLocalFile joins each
// filename onto this path and creates the directory when it does not exist.
const OUTPUT_FOLDER = "myfolder";
// --- Step 1: Fetch mappings from SQL dump ---
/**
 * Streams the gzipped `page_props` SQL dump and collects every row whose
 * property name is `wikibase_item`.
 *
 * The dump is scanned incrementally with a regular expression, so the whole
 * file never has to be held in memory.
 *
 * @returns A record that maps `pp_page` (kept as a string) to the matching
 *   `pp_value`.
 * @throws The returned promise rejects when the HTTP status is not 200 or
 *   when the request, the response stream, or the gunzip stream emits an
 *   error.
 */
async function fetchMappings(): Promise<Record<string, string>> {
  const sqlUrl =
    "https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-page_props.sql.gz";
  // One value tuple: (digits,'propname','value',NULL-or-number).
  // Hoisted so it is compiled once rather than once per chunk.
  const rowPattern = /\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)/g;
  // Upper bound on the unparsed text carried over between chunks.
  const MAX_TAIL_LENGTH = 1000;

  return new Promise((resolve, reject) => {
    https
      .get(sqlUrl, (res) => {
        if (res.statusCode !== 200) {
          // Discard the body so the underlying socket is released.
          res.resume();
          return reject(
            new Error(`Failed to get SQL dump, status code: ${res.statusCode}`),
          );
        }

        const gunzip = createGunzip();
        // Decode inside the stream so a multi-byte UTF-8 sequence that is
        // split across two chunks is reassembled instead of being corrupted.
        gunzip.setEncoding("utf8");

        const mappings: Record<string, string> = {};
        let buffer = "";

        gunzip.on("data", (chunk: string) => {
          buffer += chunk;

          rowPattern.lastIndex = 0;
          let consumed = 0;
          let match: RegExpExecArray | null;
          while ((match = rowPattern.exec(buffer)) !== null) {
            const [, pp_page, pp_propname, pp_value] = match;
            if (pp_propname === "wikibase_item") {
              mappings[pp_page] = pp_value;
            }
            // exec() resets lastIndex once it fails, so remember the end of
            // the last successful match here.
            consumed = rowPattern.lastIndex;
          }

          // Drop everything that was already parsed and keep only a bounded
          // tail, which may hold the start of a tuple split across chunks.
          buffer = buffer.slice(consumed);
          if (buffer.length > MAX_TAIL_LENGTH) {
            buffer = buffer.slice(-MAX_TAIL_LENGTH);
          }
        });
        gunzip.on("end", () => resolve(mappings));
        gunzip.on("error", reject);
        // pipe() does not forward errors, so watch the response stream too.
        res.on("error", reject);

        res.pipe(gunzip);
      })
      .on("error", reject);
  });
}
// --- Helper to save file locally ---
// Running total of files written so far; only used in the log line below.
let saveCount = 0;

/**
 * Writes `data` to `filename` inside OUTPUT_FOLDER, creating the folder first
 * when it is missing.
 *
 * @param filename Name of the file, relative to OUTPUT_FOLDER.
 * @param data Text content to write.
 * @returns A promise that resolves after the write finished and rejects with
 *   the file-system error otherwise.
 */
function saveToLocalFile(filename: string, data: string): Promise<void> {
  return new Promise<void>((done, fail) => {
    // Make sure the target directory is there before writing into it.
    const folderMissing = !fs.existsSync(OUTPUT_FOLDER);
    if (folderMissing) {
      fs.mkdirSync(OUTPUT_FOLDER, { recursive: true });
    }

    const target = path.join(OUTPUT_FOLDER, filename);
    fs.writeFile(target, data, (writeError) => {
      if (writeError) {
        fail(writeError);
        return;
      }
      saveCount += 1;
      console.log(`File saved successfully (${saveCount}): ${target}`);
      done();
    });
  });
}
// Simple semaphore to limit concurrency
/**
 * Counting semaphore for asynchronous code.
 *
 * At most `count` callers hold a permit at the same time. Further callers
 * wait in FIFO order until a holder invokes the release function it received.
 */
class Semaphore {
  /** Grant callbacks of callers still waiting for a permit, oldest first. */
  private tasks: (() => void)[] = [];
  /** Number of permits that are currently free. */
  private count: number;

  /**
   * @param count Number of permits that may be held simultaneously.
   */
  constructor(count: number) {
    this.count = count;
  }

  /**
   * Waits for a free permit.
   *
   * @returns A promise resolving to a release function. Call it once the
   *   protected work is done; additional calls are ignored, so releasing
   *   twice can never hand out more permits than the semaphore was created
   *   with.
   */
  acquire(): Promise<() => void> {
    return new Promise((grant) => {
      const task = () => {
        this.count--;
        let released = false;
        grant(() => {
          // Guard against double release, which would otherwise inflate the
          // permit count and let an extra waiter through.
          if (released) {
            return;
          }
          released = true;
          this.count++;
          const next = this.tasks.shift();
          if (next !== undefined) {
            next();
          }
        });
      };

      if (this.count > 0) {
        task();
      } else {
        this.tasks.push(task);
      }
    });
  }
}
// --- Step 2: Process the XML dump ---
/**
 * Streams the bzip2-compressed pages/articles XML dump, and for every page
 * whose ID has an entry in `mappings` writes the page text to
 * `<mapped value>.wiki.txt` via saveToLocalFile.
 *
 * @param mappings Lookup from page ID to the value used as the file name
 *   stem (as produced by fetchMappings).
 * @returns A promise that resolves once the whole dump was parsed AND every
 *   queued file write has settled, so the caller may exit safely afterwards.
 * @throws The returned promise rejects when the HTTP status is not 200 or
 *   when the request, response, decompressor, or XML parser emits an error.
 *   Failures of individual file writes are logged and do not reject.
 */
async function processXML(mappings: Record<string, string>): Promise<void> {
  const xmlUrl =
    "https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-pages-articles.xml.bz2";
  // Maximum number of file writes in flight at the same time.
  const MAX_CONCURRENT_SAVES = 16;

  return new Promise((resolve, reject) => {
    https
      .get(xmlUrl, (res) => {
        if (res.statusCode !== 200) {
          // Discard the body so the underlying socket is released.
          res.resume();
          return reject(
            new Error(`Failed to fetch XML dump: ${res.statusCode}`),
          );
        }

        // Pipe through bz2 decompressor
        const stream = res.pipe(bz2());
        // Use sax for streaming XML parsing
        const parser = sax.createStream(true, {});

        const saveLimiter = new Semaphore(MAX_CONCURRENT_SAVES);
        const pendingSaves: Promise<void>[] = [];

        // Queues one write. The returned promise never rejects: a failed
        // write is logged with the ID of the page it belongs to.
        const queueSave = (
          pageId: string,
          filename: string,
          text: string,
        ): void => {
          pendingSaves.push(
            (async () => {
              const release = await saveLimiter.acquire();
              try {
                await saveToLocalFile(filename, text);
              } catch (err) {
                console.error(`Save error for page ${pageId}:`, err);
              } finally {
                release();
              }
            })(),
          );
        };

        // Parser state for the page currently being read.
        let currentPageId: string | null = null;
        let currentText: string | null = null;
        let inPage = false;
        let inRevision = false;
        let inText = false;
        let currentTag: string | null = null; // Most recently opened tag

        parser.on("opentag", (node) => {
          currentTag = node.name;
          if (node.name === "page") {
            inPage = true;
            currentPageId = null;
            currentText = null;
          } else if (node.name === "revision") {
            inRevision = true;
          } else if (inRevision && node.name === "text") {
            inText = true;
          }
        });

        parser.on("closetag", (tagName) => {
          if (tagName === "page") {
            // Snapshot the state now: the shared variables are reset below
            // and reused for the next page long before the write completes.
            const pageId = currentPageId;
            const pageText = currentText === null ? "" : currentText.trim();
            const wikidataId = pageId === null ? undefined : mappings[pageId];
            if (pageId !== null && pageText && wikidataId) {
              queueSave(pageId, `${wikidataId}.wiki.txt`, pageText);
            }
            // Reset state for the next page
            inPage = false;
            currentPageId = null;
            currentText = null;
          } else if (tagName === "revision") {
            inRevision = false;
          } else if (tagName === "text") {
            inText = false;
          }
          currentTag = null; // Text after a closing tag belongs to no tag
        });

        parser.on("text", (text) => {
          if (currentTag === "id" && inPage && !inRevision && !currentPageId) {
            // Only the page-level <id> counts; IDs inside <revision> are
            // excluded by the !inRevision check.
            const id = text.trim();
            if (id) {
              currentPageId = id;
            }
          } else if (inText) {
            // Append the raw chunk: the parser can deliver one long text node
            // as several "text" events, and trimming every piece would delete
            // whitespace at the seams. The result is trimmed once per page.
            currentText = (currentText ?? "") + text;
          }
        });

        parser.on("error", reject);
        // pipe() does not forward errors, so watch the upstream streams too.
        stream.on("error", reject);
        res.on("error", reject);

        parser.on("end", () => {
          // Parsing is finished, but writes may still be running or queued.
          Promise.all(pendingSaves).then(() => resolve(), reject);
        });

        stream.pipe(parser);
      })
      .on("error", reject);
  });
}
// --- Main integration ---
/**
 * Runs the pipeline: fetch the page-ID mappings, then process the XML dump.
 *
 * Any failure is logged and recorded in `process.exitCode`, so the process
 * reports a non-zero status instead of looking successful.
 */
async function main(): Promise<void> {
  try {
    console.log("Fetching mappings from SQL dump...");
    const mappings = await fetchMappings();
    console.log(`Fetched ${Object.keys(mappings).length} mappings.`);
    console.log("Processing XML dump...");
    await processXML(mappings);
    console.log("Processing complete.");
  } catch (err) {
    console.error("Error:", err);
    process.exitCode = 1;
  }
}

// process.exit() without an argument uses process.exitCode when it was set,
// so a failed run exits with status 1 and a successful run with 0.
main().then(() => process.exit());