// OWL Data Format // // The Official WordList data format exported by this encoder // encodes sorted latin-1 words (letters only) based on the // fact that sorted words have prefixes with substantial // overlap. // // For example, the words: // [ Another, Apple, Apricot, Bread ] // could be folded once with a single special character, such // as ":" to yield: // [ nother, pple, pricot, :, read ]. // The First letter has been removed, but can be inferred by // starting at A and incrementing to the next letter when ":" // is encountered. // // The fold operation can be repeated for large sets as even within // each folded set, there is substatial overlap in prefix. With the // second special symbol ";", we get: // [ ; x 13, other, :, ple, ricot, :, ; x 18, ead ] // which can be further compressed by using numbers instead of the // special character: // [ 13, other, :, ple, ricot, :, 18, ead ] // and to keep all values within a single byte, we only allow a // maximum value of 10 (using 0 through 9 to represent 1 through 10), // we get: // [ 9, 2, other, :, ple, ricot, :, 9, 7, ead ] // and we use camel-case to imply the bounrary, giving the final string: // "92Other:PleRicot:97Ead" // // Once the entire latin-1 set has been collapsed, we use the remaining // printable characters (except " and \, which require 2 bytes to represent // in string) to substiture for the most common 2-letter pairs of letters // in the string. // // OWLA Accent Format // // OWLA first removes all accents, and encodes that data using the OWL // data format and encodes the accents as a base-64 series of 6-bit // packed bits representing the distance from one followed letter to the // next. // // For example, the acute accent in a given language may follow either // a or e, in which case the follow-set is "ae". Each letter in the entire // set is indexed, so the set of words with the accents: // "thisA/ppleDoe/sNotMa/tterToMe/" // " 1^ 2^ 3^ 4^ 5^ 6^ " <-- follow-set members, ALL a's and e's // which gives the positions: // [ 0, 2, 3, 4, 6 ] // which then reduce to the distances // [ 0, 2, 1, 1, 2 ] // each of which fit into a 2-bit value, so this can be encoded as the // base-64 encoded string: // 00 10 01 01 10 = 001001 1010xx // // The base-64 set used has all number replaced with their // shifted-counterparts to prevent comflicting with the numbers used in // the fold operation to indicate the number of ";". import fs from "fs"; import { id } from "../../hash/id.js"; import { decodeOwl } from "../decode-owl.js"; import { decodeOwlA } from "../decode-owla.js"; const subsChrs = " !#$%&'()*+,-./<=>?@[]^_`{|}~"; const Word = /^[a-z'`]*$/i; function fold(words: Array, sep: string): Array { const output: Array = [ ]; let initial = 97; for (const word of words) { if (word.match(Word)) { while (initial < word.charCodeAt(0)) { initial++; output.push(sep); } output.push(word.substring(1)); } else { initial = 97; output.push(word); } } return output; } function camelcase(words: Array): string { return words.map((word) => { if (word.match(Word)) { return word[0].toUpperCase() + word.substring(1); } else { return word; } }).join(""); } //let cc = 0, ce = 0; /* function getChar(c: string): string { //if (c === "e") { ce++; } if (c >= 'a' && c <= 'z') { return c; } if (c.charCodeAt(1)) { throw new Error(`bad char: "${ c }"`); } //cc++; return ""; if (c.charCodeAt(0) === 768) { return "`"; } if (c.charCodeAt(0) === 769) { return "'"; } if (c.charCodeAt(0) === 771) { return "~"; } throw new Error(`Unsupported character: ${ c } (${ c.charCodeAt(0) }, ${ c.charCodeAt(1) })`); } function mangle(text: string): { word: string, special: string } { const result: Array = [ ]; for (let i = 0; i < text.length; i++) { const c = getChar(text[i]); result.push(c); } const word = result.join(""); if (word[1] >= 'a' && word[1] <= 'z') { return { word, special: " " }; } return { word: word[0] + word.substring(2), special: word[1] }; } */ /* Store: [ accent ][ targets ][ rle data; base64-tail ] ` ae 3, 100 = (63, 37), 15 ~ n 63, 64 = (63, 1), 27 */ const Base64 = ")!@#$%^&*(ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_"; export class BitWriter { readonly width: number; readonly #data: Array; #bitLength: number; constructor(width: number) { this.width = width; this.#data = [ ]; this.#bitLength = 0; } write(value: number): void { const maxValue = ((1 << this.width) - 1); while (value > maxValue) { this.#data.push(0); this.#bitLength += this.width; value -= maxValue; } this.#data.push(value); this.#bitLength += this.width; } get length(): number { return 1 + Math.trunc((this.#bitLength + 5) / 6); } get data(): string { let result = String(this.width); let bits = 0; let accum = 0; const data = this.#data.slice(); let bitMod = this.#bitLength % 6; while (bitMod !== 0 && bitMod < 6) { data.push(0); bitMod += this.width; } for (const value of data) { accum <<= this.width; accum |= value; bits += this.width; if (bits < 6) { continue; } result += Base64[accum >> (bits - 6)]; bits -= 6; accum &= ((1 << bits) - 1); } if (result.length !== this.length) { throw new Error(`Hmm: ${ this.length } ${ result.length } ${ result }`); } return result; } } export interface AccentSet { accent: number; follows: string; positions: Array; positionsLength: number; positionData: string; positionDataLength: number; }; function sorted(text: string): string { const letters = text.split(""); letters.sort(); return letters.join(""); } // if (c.charCodeAt(0) === 768) { return "`"; } // if (c.charCodeAt(0) === 769) { return "'"; } // if (c.charCodeAt(0) === 771) { return "~"; } export function extractAccents(words: Array): { accents: Array, words: Array } { // Build a list that maps accents to the letters it can follow const followsMap: Map = new Map(); for (const word of words) { for (let i = 0; i < word.length; i++) { const c = word[i]; if (c >= 'a' && c <= 'z') { continue; } // Make sure this positions and codepoint make sense if (c.charCodeAt(1)) { throw new Error(`unsupported codepoint: "${ c }"`); } if (i === 0) { throw new Error(`unmatched accent: ${ c }`); } const ac = c.charCodeAt(0), lastLetter = word[i - 1];; const follows = (followsMap.get(ac) || ""); if (follows.indexOf(lastLetter) === -1) { followsMap.set(ac, sorted(follows + lastLetter)); } } } // Build the positions of each follow-set for those accents const positionsMap: Map> = new Map(); for (const [ accent, follows ] of followsMap) { let count = 0; for (const word of words) { for (let i = 0; i < word.length; i++) { const c = word[i], ac = c.charCodeAt(0); if (follows.indexOf(c) >= 0) { count++; } if (ac === accent) { const pos = positionsMap.get(ac) || [ ]; pos.push(count); positionsMap.set(ac, pos); } } } } const accents: Array = [ ]; for (const [ accent, follows ] of followsMap) { let last = -1; const positions = (positionsMap.get(accent) || [ ]).map((value, index) => { const delta = value - last; last = value; if (index === 0) { return value; } return delta; }); // Find the best encoding of the position data let positionData = ""; for (let i = 2; i < 7; i++) { const bitWriter = new BitWriter(i); for (const p of positions) { bitWriter.write(p); } if (positionData === "" || bitWriter.length < positionData.length) { positionData = bitWriter.data; } } const positionsLength = positions.length; const positionDataLength = positionData.length; accents.push({ accent, follows, positions, positionsLength, positionData, positionDataLength }); } words = words.map((word) => { let result = ""; for (let i = 0; i < word.length; i++) { const c = word[i]; if (c >= 'a' && c <= 'z') { result += c } } return result; }); return { accents, words }; } // Encode Official WordList export function encodeOwl(words: Array): { subs: string, data: string } { // Fold the sorted words by indicating delta for the first 2 letters let data = camelcase(fold(fold(words, ":"), ";")); // Replace semicolons with counts (e.g. ";;;" with "3") data = data.replace(/(;+)/g, (all, semis) => { let result = ""; while (semis.length) { let count = semis.length; if (count > 10) { count = 10; } result += String(count - 1); semis = semis.substring(count); } return result; }); // Finds the best option for a shortcut replacement using the // unused ascii7 characters function findBest(): string { const tally: Record = { }; const l = 2; for (let i = l; i < data.length; i++) { const key = data.substring(i - l, i); tally[key] = (tally[key] || 0) + 1; } const sorted: Array<{ text: string, count: number, save: number }> = Object.keys(tally).map((text) => { return { text, count: tally[text], save: (tally[text] * (text.length - 1)) } }); sorted.sort((a, b) => (b.save - a.save)); return sorted[0].text; } // Make substitutions let subs = ""; for (let i = 0; i < subsChrs.length; i++) { const n = subsChrs[i], o = findBest(); subs += o; data = data.split(o).join(n); } return { data, subs }; } // Returns either: // - OWL data for accent-free latin-1: { data, accentds: "" } // - OWLA data for accented latin-1: { data, accents } function encodeWords(_words: Array): { data: string, accents: string } { const { accents, words } = extractAccents(_words); const { data, subs } = encodeOwl(words); const accentData = accents.map(({ accent, follows, positionData }) => { return `${ follows }${ accent }${ positionData }`; }).join(","); return { data: `0${ subs }${data}`, accents: accentData }; } // CLI const content = fs.readFileSync(process.argv[2]).toString(); const words = content.split("\n").filter(Boolean); const { data, accents } = encodeWords(words); if (accents) { const rec = decodeOwlA(data, accents); console.log("DATA: ", JSON.stringify(data)); console.log("ACCENTS: ", JSON.stringify(accents)); console.log("LENGTH: ", data.length); console.log("CHECKSUM: ", id(content)); console.log("RATIO: ", Math.trunc(100 * data.length / content.length) + "%"); if (rec.join("\n") !== words.join("\n")) { throw new Error("no match!"); } } else { const rec = decodeOwl(data); console.log("DATA: ", JSON.stringify(data)); console.log("LENGTH: ", data.length); console.log("CHECKSUM: ", id(content)); console.log("RATIO: ", Math.trunc(100 * data.length / content.length) + "%"); if (rec.join("\n") !== words.join("\n")) { throw new Error("no match!"); } }