344 lines
12 KiB
JavaScript
344 lines
12 KiB
JavaScript
|
// OWL Data Format
|
||
|
//
|
||
|
// The Official WordList data format exported by this encoder
|
||
|
// encodes sorted latin-1 words (letters only) based on the
|
||
|
// fact that sorted words have prefixes with substantial
|
||
|
// overlap.
|
||
|
//
|
||
|
// For example, the words:
|
||
|
// [ Another, Apple, Apricot, Bread ]
|
||
|
// could be folded once with a single special character, such
|
||
|
// as ":" to yield:
|
||
|
// [ nother, pple, pricot, :, read ].
|
||
|
// The First letter has been removed, but can be inferred by
|
||
|
// starting at A and incrementing to the next letter when ":"
|
||
|
// is encountered.
|
||
|
//
|
||
|
// The fold operation can be repeated for large sets as even within
|
||
|
// each folded set, there is substatial overlap in prefix. With the
|
||
|
// second special symbol ";", we get:
|
||
|
// [ ; x 13, other, :, ple, ricot, :, ; x 18, ead ]
|
||
|
// which can be further compressed by using numbers instead of the
|
||
|
// special character:
|
||
|
// [ 13, other, :, ple, ricot, :, 18, ead ]
|
||
|
// and to keep all values within a single byte, we only allow a
|
||
|
// maximum value of 10 (using 0 through 9 to represent 1 through 10),
|
||
|
// we get:
|
||
|
// [ 9, 2, other, :, ple, ricot, :, 9, 7, ead ]
|
||
|
// and we use camel-case to imply the bounrary, giving the final string:
|
||
|
// "92Other:PleRicot:97Ead"
|
||
|
//
|
||
|
// Once the entire latin-1 set has been collapsed, we use the remaining
|
||
|
// printable characters (except " and \, which require 2 bytes to represent
|
||
|
// in string) to substiture for the most common 2-letter pairs of letters
|
||
|
// in the string.
|
||
|
//
|
||
|
// OWLA Accent Format
|
||
|
//
|
||
|
// OWLA first removes all accents, and encodes that data using the OWL
|
||
|
// data format and encodes the accents as a base-64 series of 6-bit
|
||
|
// packed bits representing the distance from one followed letter to the
|
||
|
// next.
|
||
|
//
|
||
|
// For example, the acute accent in a given language may follow either
|
||
|
// a or e, in which case the follow-set is "ae". Each letter in the entire
|
||
|
// set is indexed, so the set of words with the accents:
|
||
|
// "thisA/ppleDoe/sNotMa/tterToMe/"
|
||
|
// " 1^ 2^ 3^ 4^ 5^ 6^ " <-- follow-set members, ALL a's and e's
|
||
|
// which gives the positions:
|
||
|
// [ 0, 2, 3, 4, 6 ]
|
||
|
// which then reduce to the distances
|
||
|
// [ 0, 2, 1, 1, 2 ]
|
||
|
// each of which fit into a 2-bit value, so this can be encoded as the
|
||
|
// base-64 encoded string:
|
||
|
// 00 10 01 01 10 = 001001 1010xx
|
||
|
//
|
||
|
// The base-64 set used has all number replaced with their
|
||
|
// shifted-counterparts to prevent comflicting with the numbers used in
|
||
|
// the fold operation to indicate the number of ";".
|
||
|
import fs from "fs";
|
||
|
import { id } from "../../hash/id.js";
|
||
|
import { decodeOwl } from "../decode-owl.js";
|
||
|
import { decodeOwlA } from "../decode-owla.js";
|
||
|
const subsChrs = " !#$%&'()*+,-./<=>?@[]^_`{|}~";
|
||
|
const Word = /^[a-z'`]*$/i;
|
||
|
function fold(words, sep) {
|
||
|
const output = [];
|
||
|
let initial = 97;
|
||
|
for (const word of words) {
|
||
|
if (word.match(Word)) {
|
||
|
while (initial < word.charCodeAt(0)) {
|
||
|
initial++;
|
||
|
output.push(sep);
|
||
|
}
|
||
|
output.push(word.substring(1));
|
||
|
}
|
||
|
else {
|
||
|
initial = 97;
|
||
|
output.push(word);
|
||
|
}
|
||
|
}
|
||
|
return output;
|
||
|
}
|
||
|
function camelcase(words) {
|
||
|
return words.map((word) => {
|
||
|
if (word.match(Word)) {
|
||
|
return word[0].toUpperCase() + word.substring(1);
|
||
|
}
|
||
|
else {
|
||
|
return word;
|
||
|
}
|
||
|
}).join("");
|
||
|
}
|
||
|
//let cc = 0, ce = 0;
|
||
|
/*
|
||
|
function getChar(c: string): string {
|
||
|
//if (c === "e") { ce++; }
|
||
|
if (c >= 'a' && c <= 'z') { return c; }
|
||
|
if (c.charCodeAt(1)) {
|
||
|
throw new Error(`bad char: "${ c }"`);
|
||
|
}
|
||
|
//cc++;
|
||
|
return "";
|
||
|
if (c.charCodeAt(0) === 768) { return "`"; }
|
||
|
if (c.charCodeAt(0) === 769) { return "'"; }
|
||
|
if (c.charCodeAt(0) === 771) { return "~"; }
|
||
|
throw new Error(`Unsupported character: ${ c } (${ c.charCodeAt(0) }, ${ c.charCodeAt(1) })`);
|
||
|
}
|
||
|
function mangle(text: string): { word: string, special: string } {
|
||
|
const result: Array<string> = [ ];
|
||
|
for (let i = 0; i < text.length; i++) {
|
||
|
const c = getChar(text[i]);
|
||
|
result.push(c);
|
||
|
}
|
||
|
|
||
|
const word = result.join("");
|
||
|
if (word[1] >= 'a' && word[1] <= 'z') { return { word, special: " " }; }
|
||
|
return { word: word[0] + word.substring(2), special: word[1] };
|
||
|
}
|
||
|
*/
|
||
|
/*
|
||
|
Store: [ accent ][ targets ][ rle data; base64-tail ]
|
||
|
` ae 3, 100 = (63, 37), 15
|
||
|
~ n 63, 64 = (63, 1), 27
|
||
|
*/
|
||
|
const Base64 = ")!@#$%^&*(ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_";
|
||
|
export class BitWriter {
|
||
|
width;
|
||
|
#data;
|
||
|
#bitLength;
|
||
|
constructor(width) {
|
||
|
this.width = width;
|
||
|
this.#data = [];
|
||
|
this.#bitLength = 0;
|
||
|
}
|
||
|
write(value) {
|
||
|
const maxValue = ((1 << this.width) - 1);
|
||
|
while (value > maxValue) {
|
||
|
this.#data.push(0);
|
||
|
this.#bitLength += this.width;
|
||
|
value -= maxValue;
|
||
|
}
|
||
|
this.#data.push(value);
|
||
|
this.#bitLength += this.width;
|
||
|
}
|
||
|
get length() {
|
||
|
return 1 + Math.trunc((this.#bitLength + 5) / 6);
|
||
|
}
|
||
|
get data() {
|
||
|
let result = String(this.width);
|
||
|
let bits = 0;
|
||
|
let accum = 0;
|
||
|
const data = this.#data.slice();
|
||
|
let bitMod = this.#bitLength % 6;
|
||
|
while (bitMod !== 0 && bitMod < 6) {
|
||
|
data.push(0);
|
||
|
bitMod += this.width;
|
||
|
}
|
||
|
for (const value of data) {
|
||
|
accum <<= this.width;
|
||
|
accum |= value;
|
||
|
bits += this.width;
|
||
|
if (bits < 6) {
|
||
|
continue;
|
||
|
}
|
||
|
result += Base64[accum >> (bits - 6)];
|
||
|
bits -= 6;
|
||
|
accum &= ((1 << bits) - 1);
|
||
|
}
|
||
|
if (result.length !== this.length) {
|
||
|
throw new Error(`Hmm: ${this.length} ${result.length} ${result}`);
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
}
|
||
|
;
|
||
|
function sorted(text) {
|
||
|
const letters = text.split("");
|
||
|
letters.sort();
|
||
|
return letters.join("");
|
||
|
}
|
||
|
// if (c.charCodeAt(0) === 768) { return "`"; }
|
||
|
// if (c.charCodeAt(0) === 769) { return "'"; }
|
||
|
// if (c.charCodeAt(0) === 771) { return "~"; }
|
||
|
export function extractAccents(words) {
|
||
|
// Build a list that maps accents to the letters it can follow
|
||
|
const followsMap = new Map();
|
||
|
for (const word of words) {
|
||
|
for (let i = 0; i < word.length; i++) {
|
||
|
const c = word[i];
|
||
|
if (c >= 'a' && c <= 'z') {
|
||
|
continue;
|
||
|
}
|
||
|
// Make sure this positions and codepoint make sense
|
||
|
if (c.charCodeAt(1)) {
|
||
|
throw new Error(`unsupported codepoint: "${c}"`);
|
||
|
}
|
||
|
if (i === 0) {
|
||
|
throw new Error(`unmatched accent: ${c}`);
|
||
|
}
|
||
|
const ac = c.charCodeAt(0), lastLetter = word[i - 1];
|
||
|
;
|
||
|
const follows = (followsMap.get(ac) || "");
|
||
|
if (follows.indexOf(lastLetter) === -1) {
|
||
|
followsMap.set(ac, sorted(follows + lastLetter));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Build the positions of each follow-set for those accents
|
||
|
const positionsMap = new Map();
|
||
|
for (const [accent, follows] of followsMap) {
|
||
|
let count = 0;
|
||
|
for (const word of words) {
|
||
|
for (let i = 0; i < word.length; i++) {
|
||
|
const c = word[i], ac = c.charCodeAt(0);
|
||
|
if (follows.indexOf(c) >= 0) {
|
||
|
count++;
|
||
|
}
|
||
|
if (ac === accent) {
|
||
|
const pos = positionsMap.get(ac) || [];
|
||
|
pos.push(count);
|
||
|
positionsMap.set(ac, pos);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
const accents = [];
|
||
|
for (const [accent, follows] of followsMap) {
|
||
|
let last = -1;
|
||
|
const positions = (positionsMap.get(accent) || []).map((value, index) => {
|
||
|
const delta = value - last;
|
||
|
last = value;
|
||
|
if (index === 0) {
|
||
|
return value;
|
||
|
}
|
||
|
return delta;
|
||
|
});
|
||
|
// Find the best encoding of the position data
|
||
|
let positionData = "";
|
||
|
for (let i = 2; i < 7; i++) {
|
||
|
const bitWriter = new BitWriter(i);
|
||
|
for (const p of positions) {
|
||
|
bitWriter.write(p);
|
||
|
}
|
||
|
if (positionData === "" || bitWriter.length < positionData.length) {
|
||
|
positionData = bitWriter.data;
|
||
|
}
|
||
|
}
|
||
|
const positionsLength = positions.length;
|
||
|
const positionDataLength = positionData.length;
|
||
|
accents.push({ accent, follows, positions, positionsLength, positionData, positionDataLength });
|
||
|
}
|
||
|
words = words.map((word) => {
|
||
|
let result = "";
|
||
|
for (let i = 0; i < word.length; i++) {
|
||
|
const c = word[i];
|
||
|
if (c >= 'a' && c <= 'z') {
|
||
|
result += c;
|
||
|
}
|
||
|
}
|
||
|
return result;
|
||
|
});
|
||
|
return { accents, words };
|
||
|
}
|
||
|
// Encode Official WordList
|
||
|
export function encodeOwl(words) {
|
||
|
// Fold the sorted words by indicating delta for the first 2 letters
|
||
|
let data = camelcase(fold(fold(words, ":"), ";"));
|
||
|
// Replace semicolons with counts (e.g. ";;;" with "3")
|
||
|
data = data.replace(/(;+)/g, (all, semis) => {
|
||
|
let result = "";
|
||
|
while (semis.length) {
|
||
|
let count = semis.length;
|
||
|
if (count > 10) {
|
||
|
count = 10;
|
||
|
}
|
||
|
result += String(count - 1);
|
||
|
semis = semis.substring(count);
|
||
|
}
|
||
|
return result;
|
||
|
});
|
||
|
// Finds the best option for a shortcut replacement using the
|
||
|
// unused ascii7 characters
|
||
|
function findBest() {
|
||
|
const tally = {};
|
||
|
const l = 2;
|
||
|
for (let i = l; i < data.length; i++) {
|
||
|
const key = data.substring(i - l, i);
|
||
|
tally[key] = (tally[key] || 0) + 1;
|
||
|
}
|
||
|
const sorted = Object.keys(tally).map((text) => {
|
||
|
return { text, count: tally[text], save: (tally[text] * (text.length - 1)) };
|
||
|
});
|
||
|
sorted.sort((a, b) => (b.save - a.save));
|
||
|
return sorted[0].text;
|
||
|
}
|
||
|
// Make substitutions
|
||
|
let subs = "";
|
||
|
for (let i = 0; i < subsChrs.length; i++) {
|
||
|
const n = subsChrs[i], o = findBest();
|
||
|
subs += o;
|
||
|
data = data.split(o).join(n);
|
||
|
}
|
||
|
return { data, subs };
|
||
|
}
|
||
|
// Returns either:
|
||
|
// - OWL data for accent-free latin-1: { data, accentds: "" }
|
||
|
// - OWLA data for accented latin-1: { data, accents }
|
||
|
function encodeWords(_words) {
|
||
|
const { accents, words } = extractAccents(_words);
|
||
|
const { data, subs } = encodeOwl(words);
|
||
|
const accentData = accents.map(({ accent, follows, positionData }) => {
|
||
|
return `${follows}${accent}${positionData}`;
|
||
|
}).join(",");
|
||
|
return {
|
||
|
data: `0${subs}${data}`,
|
||
|
accents: accentData
|
||
|
};
|
||
|
}
|
||
|
// CLI
|
||
|
const content = fs.readFileSync(process.argv[2]).toString();
|
||
|
const words = content.split("\n").filter(Boolean);
|
||
|
const { data, accents } = encodeWords(words);
|
||
|
if (accents) {
|
||
|
const rec = decodeOwlA(data, accents);
|
||
|
console.log("DATA: ", JSON.stringify(data));
|
||
|
console.log("ACCENTS: ", JSON.stringify(accents));
|
||
|
console.log("LENGTH: ", data.length);
|
||
|
console.log("CHECKSUM: ", id(content));
|
||
|
console.log("RATIO: ", Math.trunc(100 * data.length / content.length) + "%");
|
||
|
if (rec.join("\n") !== words.join("\n")) {
|
||
|
throw new Error("no match!");
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
const rec = decodeOwl(data);
|
||
|
console.log("DATA: ", JSON.stringify(data));
|
||
|
console.log("LENGTH: ", data.length);
|
||
|
console.log("CHECKSUM: ", id(content));
|
||
|
console.log("RATIO: ", Math.trunc(100 * data.length / content.length) + "%");
|
||
|
if (rec.join("\n") !== words.join("\n")) {
|
||
|
throw new Error("no match!");
|
||
|
}
|
||
|
}
|
||
|
//# sourceMappingURL=encode-latin.js.map
|