mirror of
https://github.com/bcye/structured-wikivoyage-exports.git
synced 2025-05-12 03:00:20 +00:00
add node code
This commit is contained in:
parent
e242a18408
commit
04b6e56d14
8
.dockerignore
Normal file
8
.dockerignore
Normal file
@ -0,0 +1,8 @@
|
||||
.env
|
||||
|
||||
# NODE
|
||||
node_modules
|
||||
|
||||
# PYTHON
|
||||
__pycache__
|
||||
.venv
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -8,3 +8,6 @@ wheels/
|
||||
|
||||
# Virtual environments
|
||||
.venv
|
||||
|
||||
.env
|
||||
node_modules
|
||||
|
1
.prettierrc
Normal file
1
.prettierrc
Normal file
@ -0,0 +1 @@
|
||||
{}
|
12
Dockerfile
Normal file
12
Dockerfile
Normal file
@ -0,0 +1,12 @@
|
||||
# Runtime image for the dump splitter; Node 22 is needed for --experimental-strip-types.
FROM node:22

WORKDIR /app

# Copy the manifests first so the dependency layer is cached across source edits.
COPY package.json .
COPY package-lock.json .

# Install exactly what package-lock.json pins instead of re-resolving version ranges.
RUN npm ci

# The TypeScript entry point added alongside this Dockerfile is split-dump.ts.
COPY split-dump.ts .

# Raise the V8 heap limit and run the TypeScript source directly via type stripping.
CMD [ "node", "--max-old-space-size=4096", "--experimental-strip-types", "split-dump.ts" ]
|
183
package-lock.json
generated
Normal file
183
package-lock.json
generated
Normal file
@ -0,0 +1,183 @@
|
||||
{
|
||||
"name": "mapvoyage-extract",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"dependencies": {
|
||||
"sax": "^1.4.1",
|
||||
"unbzip2-stream": "^1.4.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.14.0",
|
||||
"@types/sax": "^1.2.7",
|
||||
"@types/unbzip2-stream": "^1.4.3",
|
||||
"prettier": "^3.4.2",
|
||||
"typescript": "^5.8.2"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/node": {
|
||||
"version": "22.14.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-22.14.0.tgz",
|
||||
"integrity": "sha512-Kmpl+z84ILoG+3T/zQFyAJsU6EPTmOCj8/2+83fSN6djd6I4o7uOuGIH6vq3PrjY5BGitSbFuMN18j3iknubbA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"undici-types": "~6.21.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/sax": {
|
||||
"version": "1.2.7",
|
||||
"resolved": "https://registry.npmjs.org/@types/sax/-/sax-1.2.7.tgz",
|
||||
"integrity": "sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/through": {
|
||||
"version": "0.0.33",
|
||||
"resolved": "https://registry.npmjs.org/@types/through/-/through-0.0.33.tgz",
|
||||
"integrity": "sha512-HsJ+z3QuETzP3cswwtzt2vEIiHBk/dCcHGhbmG5X3ecnwFD/lPrMpliGXxSCg03L9AhrdwA4Oz/qfspkDW+xGQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/unbzip2-stream": {
|
||||
"version": "1.4.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
|
||||
"integrity": "sha512-D8X5uuJRISqc8YtwL8jNW2FpPdUOCYXbfD6zNROCTbVXK9nawucxh10tVXE3MPjnHdRA1LvB0zDxVya/lBsnYw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/through": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/base64-js": {
|
||||
"version": "1.5.1",
|
||||
"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
|
||||
"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/feross"
|
||||
},
|
||||
{
|
||||
"type": "patreon",
|
||||
"url": "https://www.patreon.com/feross"
|
||||
},
|
||||
{
|
||||
"type": "consulting",
|
||||
"url": "https://feross.org/support"
|
||||
}
|
||||
],
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/buffer": {
|
||||
"version": "5.7.1",
|
||||
"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
|
||||
"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/feross"
|
||||
},
|
||||
{
|
||||
"type": "patreon",
|
||||
"url": "https://www.patreon.com/feross"
|
||||
},
|
||||
{
|
||||
"type": "consulting",
|
||||
"url": "https://feross.org/support"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"base64-js": "^1.3.1",
|
||||
"ieee754": "^1.1.13"
|
||||
}
|
||||
},
|
||||
"node_modules/ieee754": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
|
||||
"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/feross"
|
||||
},
|
||||
{
|
||||
"type": "patreon",
|
||||
"url": "https://www.patreon.com/feross"
|
||||
},
|
||||
{
|
||||
"type": "consulting",
|
||||
"url": "https://feross.org/support"
|
||||
}
|
||||
],
|
||||
"license": "BSD-3-Clause"
|
||||
},
|
||||
"node_modules/prettier": {
|
||||
"version": "3.5.3",
|
||||
"resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz",
|
||||
"integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"prettier": "bin/prettier.cjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/prettier/prettier?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/sax": {
|
||||
"version": "1.4.1",
|
||||
"resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz",
|
||||
"integrity": "sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/through": {
|
||||
"version": "2.3.8",
|
||||
"resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
|
||||
"integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/typescript": {
|
||||
"version": "5.8.2",
|
||||
"resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.2.tgz",
|
||||
"integrity": "sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.17"
|
||||
}
|
||||
},
|
||||
"node_modules/unbzip2-stream": {
|
||||
"version": "1.4.3",
|
||||
"resolved": "https://registry.npmjs.org/unbzip2-stream/-/unbzip2-stream-1.4.3.tgz",
|
||||
"integrity": "sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"buffer": "^5.2.1",
|
||||
"through": "^2.3.8"
|
||||
}
|
||||
},
|
||||
"node_modules/undici-types": {
|
||||
"version": "6.21.0",
|
||||
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
|
||||
"integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
}
|
||||
}
|
||||
}
|
15
package.json
Normal file
15
package.json
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"private": true,
|
||||
"packageManager": "pnpm@9.13.2+sha512.88c9c3864450350e65a33587ab801acf946d7c814ed1134da4a924f6df5a2120fd36b46aab68f7cd1d413149112d53c7db3a4136624cfd00ff1846a0c6cef48a",
|
||||
"devDependencies": {
|
||||
"@types/node": "^22.14.0",
|
||||
"@types/sax": "^1.2.7",
|
||||
"@types/unbzip2-stream": "^1.4.3",
|
||||
"prettier": "^3.4.2",
|
||||
"typescript": "^5.8.2"
|
||||
},
|
||||
"dependencies": {
|
||||
"sax": "^1.4.1",
|
||||
"unbzip2-stream": "^1.4.3"
|
||||
}
|
||||
}
|
192
split-dump.ts
Normal file
192
split-dump.ts
Normal file
@ -0,0 +1,192 @@
|
||||
import fs from "fs";
|
||||
import https from "https";
|
||||
import path from "path";
|
||||
import sax from "sax";
|
||||
import bz2 from "unbzip2-stream";
|
||||
import { createGunzip } from "zlib";
|
||||
|
||||
// Local storage configuration.
// The target directory can be overridden with the OUTPUT_FOLDER environment
// variable; an unset or empty value falls back to the previous default.
const OUTPUT_FOLDER = process.env.OUTPUT_FOLDER || "myfolder";
|
||||
|
||||
// --- Step 1: Fetch mappings from SQL dump ---
/**
 * Streams the gzipped `page_props` SQL dump and collects the value of every
 * `wikibase_item` property, keyed by the page id it belongs to.
 *
 * @returns A promise resolving to an object mapping page id -> `wikibase_item`
 *   value once the whole dump has been decompressed and scanned.
 * @throws (rejects) when the HTTP status is not 200, or when the request, the
 *   response stream or the gunzip stream emits an error.
 */
async function fetchMappings(): Promise<Record<string, string>> {
  return new Promise((resolve, reject) => {
    const sqlUrl =
      "https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-page_props.sql.gz";

    // One row tuple: (<digits>,'<propname>','<value>',<NULL or number>).
    const tupleRegex = /\((\d+),'([^']+)','([^']+)',(NULL|[\d\.]+)\)/g;
    // Upper bound on unmatched text carried over to the next chunk.
    const MAX_TAIL_LENGTH = 1000;

    https
      .get(sqlUrl, (res) => {
        if (res.statusCode !== 200) {
          // Drain the body so the socket is released.
          res.resume();
          return reject(
            new Error(`Failed to get SQL dump, status code: ${res.statusCode}`),
          );
        }

        const gunzip = createGunzip();
        // Let the stream decode UTF-8 itself so a multi-byte character that is
        // split across two chunks is not corrupted.
        gunzip.setEncoding("utf8");

        let buffer = "";
        const mappings: Record<string, string> = {};

        res.on("error", reject);
        res.pipe(gunzip);

        gunzip.on("data", (chunk: string) => {
          buffer += chunk;

          // The regex is shared between chunks, so restart it explicitly.
          tupleRegex.lastIndex = 0;
          let consumed = 0;
          let match: RegExpExecArray | null;
          while ((match = tupleRegex.exec(buffer)) !== null) {
            const [, pp_page, pp_propname, pp_value] = match;
            if (pp_propname === "wikibase_item") {
              mappings[pp_page] = pp_value;
            }
            consumed = tupleRegex.lastIndex;
          }

          // Drop everything up to the end of the last complete tuple so it is
          // not scanned again, then cap the remainder: it only has to hold a
          // tuple that was cut in half by the chunk boundary.
          buffer = buffer.slice(consumed);
          if (buffer.length > MAX_TAIL_LENGTH) {
            buffer = buffer.slice(-MAX_TAIL_LENGTH);
          }
        });

        gunzip.on("end", () => resolve(mappings));
        gunzip.on("error", reject);
      })
      .on("error", reject);
  });
}
|
||||
|
||||
// --- Helper to save file locally ---
// Running total of files written so far; only used for the progress log line.
let saveCount = 0;

/**
 * Writes `data` to `filename` inside OUTPUT_FOLDER, creating the folder first
 * when it is missing.
 *
 * @param filename Name of the file, relative to OUTPUT_FOLDER.
 * @param data Text content to write.
 * @returns A promise that resolves after the write succeeded (and the progress
 *   line was logged) and rejects with the file system error otherwise.
 */
function saveToLocalFile(filename: string, data: string): Promise<void> {
  return new Promise((done, fail) => {
    // Make sure the target directory is there before writing into it.
    const folderMissing = !fs.existsSync(OUTPUT_FOLDER);
    if (folderMissing) {
      fs.mkdirSync(OUTPUT_FOLDER, { recursive: true });
    }

    const filePath = path.join(OUTPUT_FOLDER, filename);

    fs.writeFile(filePath, data, (writeError) => {
      if (writeError) {
        fail(writeError);
        return;
      }
      saveCount += 1;
      console.log(`File saved successfully (${saveCount}): ${filePath}`);
      done();
    });
  });
}
|
||||
|
||||
// Simple semaphore to limit concurrency
/**
 * Counting semaphore built on promises.
 *
 * `acquire()` resolves with a release callback as soon as a slot is free;
 * callers that arrive while no slot is free are queued and served in FIFO
 * order each time a release callback is invoked.
 */
class Semaphore {
  // Waiters that could not get a slot immediately, oldest first.
  private tasks: (() => void)[] = [];
  // Number of slots currently available.
  private count: number;

  /**
   * @param count Number of slots that may be held at the same time.
   */
  constructor(count: number) {
    this.count = count;
  }

  /**
   * Waits for a free slot.
   *
   * @returns A promise resolving to the function that gives the slot back.
   */
  async acquire(): Promise<() => void> {
    return new Promise((grant) => {
      // Takes one slot and hands the matching release callback to the caller.
      const takeSlot = () => {
        this.count -= 1;
        grant(() => this.giveBack());
      };

      if (this.count <= 0) {
        // No slot free: park the caller until someone releases.
        this.tasks.push(takeSlot);
        return;
      }
      takeSlot();
    });
  }

  // Returns one slot and wakes the oldest queued waiter, if there is one.
  private giveBack(): void {
    this.count += 1;
    const waiter = this.tasks.shift();
    if (waiter !== undefined) {
      waiter();
    }
  }
}
|
||||
|
||||
// --- Step 3: Process the XML dump ---
/**
 * Streams the bz2-compressed pages/articles XML dump, and for every `<page>`
 * whose id has an entry in `mappings` writes the revision text to
 * `<mapped value>.wiki.txt` via `saveToLocalFile`.
 *
 * @param mappings Object mapping page id -> identifier used as the file name.
 * @returns A promise that resolves after the parser has ended AND every file
 *   write that was started has settled.
 * @throws (rejects) when the HTTP status is not 200, or when the request, the
 *   response, the bz2 stream or the XML parser emits an error. Individual save
 *   failures are logged and do not reject.
 */
async function processXML(mappings: Record<string, string>): Promise<void> {
  return new Promise((resolve, reject) => {
    const xmlUrl =
      "https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-pages-articles.xml.bz2";

    https
      .get(xmlUrl, (res) => {
        if (res.statusCode !== 200) {
          // Drain the body so the socket is released.
          res.resume();
          return reject(
            new Error(`Failed to fetch XML dump: ${res.statusCode}`),
          );
        }

        // Pipe through bz2 decompressor
        const stream = res.pipe(bz2());
        // Use sax for streaming XML parsing
        const parser = sax.createStream(true, {});

        let currentPageId: string | null = null;
        let currentText: string | null = null;
        let inPage = false;
        let inRevision = false;
        let inText = false;
        let currentTag: string | null = null; // Track current tag

        // Writes that were started but have not settled yet. The caller exits
        // the process once this function resolves, so they must be awaited.
        const pendingSaves = new Set<Promise<void>>();

        parser.on("opentag", (node) => {
          currentTag = node.name; // Track current tag
          if (node.name === "page") {
            inPage = true;
            currentPageId = null;
            currentText = null;
          } else if (node.name === "revision") {
            inRevision = true;
          } else if (inRevision && node.name === "text") {
            inText = true;
          }
        });

        parser.on("closetag", (tagName) => {
          if (tagName === "page") {
            // Snapshot the per-page state: the shared variables are reset
            // below, long before the asynchronous save (or its error handler)
            // runs.
            const pageId = currentPageId;
            const textToSave = currentText === null ? "" : currentText.trim();
            const wikidataId = pageId === null ? undefined : mappings[pageId];

            if (pageId !== null && textToSave && wikidataId) {
              const filename = `${wikidataId}.wiki.txt`;
              const save: Promise<void> = saveToLocalFile(filename, textToSave)
                .catch((err) =>
                  console.error(`Save error for page ${pageId}:`, err),
                )
                .finally(() => {
                  pendingSaves.delete(save);
                });
              pendingSaves.add(save);
            }

            // Reset state for the next page
            inPage = false;
            currentPageId = null;
            currentText = null;
          } else if (tagName === "revision") {
            inRevision = false;
          } else if (tagName === "text") {
            inText = false;
          }
          currentTag = null; // Reset current tag
        });

        parser.on("text", (text) => {
          if (inText) {
            // Append verbatim: sax may deliver one text node across several
            // `text` events, so trimming each piece would eat whitespace in
            // the middle of the article. The result is trimmed once, when the
            // page closes.
            currentText = (currentText ?? "") + text;
            return;
          }

          const trimmedText = text.trim();
          if (!trimmedText) return;

          // The first <id> directly under <page> (not the revision's id) is
          // the page id.
          if (currentTag === "id" && inPage && !inRevision && !currentPageId) {
            currentPageId = trimmedText;
          }
        });

        parser.on("error", reject);
        parser.on("end", () => {
          // Failed saves are already caught and logged above, so this only
          // waits for completion.
          Promise.all(pendingSaves).then(() => resolve(), reject);
        });

        // Without these handlers a network or decompression failure would be
        // an unhandled 'error' event instead of a rejection.
        res.on("error", reject);
        stream.on("error", reject);

        stream.pipe(parser);
      })
      .on("error", reject);
  });
}
|
||||
|
||||
// --- Main integration ---
/**
 * Runs the two pipeline steps in order: fetch the page-id mappings, then
 * process the XML dump with them.
 *
 * Never rejects: any error is logged and recorded in `process.exitCode` so the
 * process reports failure to its caller instead of exiting with status 0.
 */
async function main() {
  try {
    console.log("Fetching mappings from SQL dump...");
    const mappings = await fetchMappings();
    console.log(`Fetched ${Object.keys(mappings).length} mappings.`);
    console.log("Processing XML dump...");
    await processXML(mappings);
    console.log("Processing complete.");
  } catch (err) {
    console.error("Error:", err);
    // process.exit() without an argument honours process.exitCode.
    process.exitCode = 1;
  }
}

// Exit explicitly once the pipeline is done.
main().then(() => process.exit());
|
Loading…
x
Reference in New Issue
Block a user