'use strict';

import { arrayify, Arrayish } from './bytes';

/**
 * Unicode normalization forms that can be applied to a string before it is
 * encoded by `toUtf8Bytes`. `current` (the empty string) means "leave the
 * string exactly as it is"; the other members are passed straight through to
 * `String.prototype.normalize`.
 */
export enum UnicodeNormalizationForm {
    current = '',
    NFC = 'NFC',
    NFD = 'NFD',
    NFKC = 'NFKC',
    NFKD = 'NFKD'
}

// http://stackoverflow.com/questions/18729405/how-to-convert-utf8-string-to-byte-array
/**
 * Encodes a JavaScript (UTF-16) string as UTF-8.
 *
 * @param str - The string to encode.
 * @param form - Optional normalization applied to `str` before encoding;
 *   defaults to `UnicodeNormalizationForm.current` (no normalization).
 * @returns The encoded bytes, as produced by passing the byte list to `arrayify`.
 *
 * NOTE: an unpaired surrogate (a high surrogate not followed by a low one, or
 * a stray low surrogate) is not rejected; it is emitted as the 3-byte encoding
 * of its own code unit. This is not well-formed UTF-8, but the behavior is kept
 * so existing callers and round-trips through `toUtf8String` are unaffected.
 */
export function toUtf8Bytes(str: string, form: UnicodeNormalizationForm = UnicodeNormalizationForm.current): Uint8Array {
    if (form !== UnicodeNormalizationForm.current) {
        str = str.normalize(form);
    }

    const result: number[] = [];
    for (let i = 0; i < str.length; i++) {
        let c = str.charCodeAt(i);

        if (c < 0x80) {
            // 1 byte: 0xxxxxxx
            result.push(c);

        } else if (c < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            result.push((c >> 6) | 0xc0, (c & 0x3f) | 0x80);

        } else if ((c & 0xfc00) === 0xd800 && (i + 1) < str.length && (str.charCodeAt(i + 1) & 0xfc00) === 0xdc00) {
            // Surrogate pair: combine both code units into one code point and
            // consume the low surrogate (++i). 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            c = 0x10000 + ((c & 0x03ff) << 10) + (str.charCodeAt(++i) & 0x03ff);
            result.push(
                (c >> 18) | 0xf0,
                ((c >> 12) & 0x3f) | 0x80,
                ((c >> 6) & 0x3f) | 0x80,
                (c & 0x3f) | 0x80
            );

        } else {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (also reached by unpaired surrogates)
            result.push(
                (c >> 12) | 0xe0,
                ((c >> 6) & 0x3f) | 0x80,
                (c & 0x3f) | 0x80
            );
        }
    }

    return arrayify(result);
}

// http://stackoverflow.com/questions/13356493/decode-utf-8-with-javascript#13691499
/**
 * Decodes UTF-8 bytes into a JavaScript string, silently skipping anything
 * that is not valid instead of throwing.
 *
 * Skipped input:
 *  - stray continuation bytes and the bytes 0xFE / 0xFF;
 *  - multi-byte sequences that are truncated or interrupted by a byte that is
 *    not a continuation byte (decoding resumes at the interrupting byte);
 *  - overlong encodings (a code point stored in more bytes than necessary);
 *  - sequences that decode to a value above U+10FFFF, which includes every
 *    obsolete 5- and 6-byte form.
 *
 * Code points in the surrogate range (U+D800-U+DFFF) encoded as 3 bytes are
 * still accepted, because `toUtf8Bytes` emits exactly that form for unpaired
 * surrogates.
 *
 * @param bytes - The UTF-8 data; converted with `arrayify` before decoding.
 * @returns The decoded string.
 */
export function toUtf8String(bytes: Arrayish): string {
    const data = arrayify(bytes);

    let result = '';
    let i = 0;

    // Invalid bytes are ignored
    while (i < data.length) {
        const c = data[i++];

        // 0xxx xxxx: plain ASCII
        if (c >> 7 === 0) {
            result += String.fromCharCode(c);
            continue;
        }

        // Multibyte; how many bytes are left for this character, and what is
        // the smallest code point that genuinely needs that many bytes?
        let extraLength: number;
        let minCodePoint: number;
        if (c >> 5 === 0x06) {
            extraLength = 1;
            minCodePoint = 0x80;
        } else if (c >> 4 === 0x0e) {
            extraLength = 2;
            minCodePoint = 0x800;
        } else if (c >> 3 === 0x1e) {
            extraLength = 3;
            minCodePoint = 0x10000;
        } else if (c >> 2 === 0x3e) {
            // Obsolete 5-byte form: parsed only so the whole sequence is dropped together
            extraLength = 4;
            minCodePoint = 0x200000;
        } else if (c >> 1 === 0x7e) {
            // Obsolete 6-byte form: parsed only so the whole sequence is dropped together
            extraLength = 5;
            minCodePoint = 0x4000000;
        } else {
            // Invalid starting byte: a continuation byte (10xx xxxx) or 0xFE / 0xFF
            continue;
        }

        // Do we have enough bytes in our data?
        if (i + extraLength > data.length) {
            // Truncated sequence: drop the continuation bytes that belong to it.
            // If a non-continuation byte remains, the outer loop resumes there;
            // otherwise the loop condition fails and the result is returned.
            while (i < data.length && (data[i] >> 6) === 0x02) {
                i++;
            }
            continue;
        }

        // Remove the UTF-8 prefix from the lead byte, keeping only its payload bits
        let res = c & ((1 << (8 - extraLength - 1)) - 1);

        let count = 0;
        for (; count < extraLength; count++) {
            const nextChar = data[i];

            // Not a continuation byte: leave it unconsumed so it is re-examined
            // as the start of the next character
            if (nextChar >> 6 !== 0x02) {
                break;
            }

            i++;
            res = (res << 6) | (nextChar & 0x3f);
        }

        if (count !== extraLength) {
            continue;
        }

        // Overlong encoding, or beyond the last Unicode code point; emitting
        // either would produce a character the input never legitimately encoded
        if (res < minCodePoint || res > 0x10ffff) {
            continue;
        }

        // Emits one UTF-16 code unit for the BMP and a surrogate pair above it
        result += String.fromCodePoint(res);
    }

    return result;
}