| /** |
| * @fileoverview UTF8 encoding and decoding routines |
| */ |
| goog.provide('jspb.binary.utf8'); |
| |
| goog.require('jspb.asserts'); |
| |
| |
/**
 * Compile-time switch for relying on the browser's native `TextEncoder` and
 * `TextDecoder` APIs when handling utf8.
 *
 * <p>Defaults to true when `goog.FEATURESET_YEAR >= 2020`. The native APIs are
 * still feature-detected at runtime and used whenever they are present; this
 * flag exists so that the polyfill does not have to be shipped.
 *
 * <p>See http://go/jscompiler-flags#browser-featureset-year-options for the
 * behavior of the featureset year.
 *
 * @define {boolean}
 */
const USE_TEXT_ENCODING =
    goog.define('jspb.binary.USE_TEXTENCODING', goog.FEATURESET_YEAR >= 2020);

// UTF-16 surrogate code unit ranges. High (leading) surrogates occupy the
// lower half of the surrogate range, low (trailing) surrogates the upper half.

/** @const {number} */
const MIN_HIGH_SURROGATE = 0xD800;
/** @const {number} */
const MAX_HIGH_SURROGATE = 0xDBFF;
/** @const {number} */
const MIN_LOW_SURROGATE = 0xDC00;
/** @const {number} */
const MAX_LOW_SURROGATE = 0xDFFF;

// The full surrogate range spans from the first high surrogate to the last
// low surrogate.

/** @const {number} */
const MIN_SURROGATE = MIN_HIGH_SURROGATE;
/** @const {number} */
const MAX_SURROGATE = MAX_LOW_SURROGATE;
| |
/**
 * Tests whether a byte fails to match the utf8 continuation pattern
 * '10XXXXXX'.
 * @return {boolean} true when the byte is NOT a continuation byte.
 */
function isNotTrailingByte(/** number */ byte) {
  // Keep only the two most significant bits of the byte; a continuation byte
  // has exactly '10' there.
  const topTwoBits = byte & 0b11000000;
  return topTwoBits !== 0b10000000;
}
| |
| |
/**
 * Reports one invalid utf8 sequence: throws when errors are fatal, otherwise
 * appends the Unicode replacement character (U+FFFD) to `codeUnits`.
 */
function invalid(
    /** boolean */ parsingErrorsAreFatal, /** !Array<number> */ codeUnits) {
  if (!parsingErrorsAreFatal) {
    // Lenient mode: substitute U+FFFD for the bad sequence and carry on.
    codeUnits.push(0xFFFD);
    return;
  }
  throw new Error('Invalid UTF8');
}
| |
/**
 * Converts `utf16CodeUnits` to a string and appends it to `accum`.
 *
 * A null or undefined `accum` is treated as "nothing accumulated yet".
 * @return {string}
 */
function codeUnitsToString(
    /** string? */ accum, /** !Array<number> */ utf16CodeUnits) {
  const chunk = String.fromCharCode.apply(null, utf16CodeUnits);
  if (accum == null) {
    return chunk;
  }
  return accum + chunk;
}
| |
/**
 * Hand-written UTF8 decoder.
 *
 * Decodes `length` bytes of `bytes` starting at `offset` into a string.
 *
 * The bit layout is described at https://en.wikipedia.org/wiki/UTF-8#Encoding
 *
 * The malformed inputs that must be detected are listed at
 * https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling :
 *  - overlong encodings, where a value encoded in N bytes would have fit in
 *    fewer bytes
 *  - invalid bytes, i.e. bytes that are out of range
 *  - surrogate codepoints, since utf8 never directly encodes a utf16
 *    surrogate
 *  - underflow, where the input ends before the sequence being parsed does
 *  - out of range codepoints.
 *
 * Every malformed sequence is reported through `invalid`, which throws when
 * `parsingErrorsAreFatal` is true and otherwise emits one replacement
 * character. A byte that was read but rejected is pushed back so the next
 * loop iteration re-examines it as a potential leading byte.
 *
 * @return {string}
 */
jspb.binary.utf8.polyfillDecodeUtf8 = function(
    /** !Uint8Array */ bytes, /** number */ offset, /** number */ length,
    /** boolean */ parsingErrorsAreFatal) {
  const limit = offset + length;
  // UTF-16 code units decoded so far that have not been turned into a string.
  const pending = [];
  let decoded = null;
  let pos = offset;

  // This is significantly slower than the TextDecoder implementation.
  // Ideas for improving performance:
  // 1. Reduce branching with non-shortcircuting operators, e.g.
  //    https://stackoverflow.com/q/5652363
  // 2. improve isNotTrailingByte using xor?
  // 3. consider having a dedicated ascii loop (java impls do this)
  while (pos < limit) {
    const lead = bytes[pos++];
    if (lead < 0x80) {
      // Plain 7-bit ASCII maps directly to a code unit.
      pending.push(lead);
    } else if (lead < 0xE0) {
      // Two byte sequence.
      if (pos >= limit) {
        // Input ends before the trailing byte.
        invalid(parsingErrorsAreFatal, pending);
      } else {
        const second = bytes[pos++];
        // 0xC2 is '11000010'; a smaller leading byte would be an overlong
        // encoding because only 7 significant bits would remain. The second
        // byte must also be a proper trailing byte.
        if (lead < 0xC2 || isNotTrailingByte(second)) {
          pos--;  // the second byte was not accepted, hand it back
          invalid(parsingErrorsAreFatal, pending);
        } else {
          // Low 5 bits of the leading byte followed by the low 6 bits of the
          // trailing byte.
          const codeUnit = ((lead & 0x1F) << 6) | (second & 0x3F);
          // Consistency check: the valid range for a 2 byte sequence.
          jspb.asserts.assert(codeUnit >= 0x80 && codeUnit <= 0x07FF);
          pending.push(codeUnit);
        }
      }
    } else if (lead < 0xF0) {
      // Three byte sequence.
      if (pos >= limit - 1) {
        // Fewer than two bytes remain.
        invalid(parsingErrorsAreFatal, pending);
      } else {
        const second = bytes[pos++];
        // These checks were taken from java/com/google/protobuf/Utf8.java
        let wellFormed = !isNotTrailingByte(second) &&
            // overlong? 5 most significant bits must not all be zero
            !(lead === 0xE0 && second < 0xA0) &&
            // illegal surrogate codepoint?
            !(lead === 0xED && second >= 0xA0);
        // The third byte is only consumed once the first two are known to be
        // good, so an error in them leaves it for the next loop iteration.
        let third;
        if (wellFormed) {
          third = bytes[pos++];
          wellFormed = !isNotTrailingByte(third);
        }
        if (!wellFormed) {
          pos--;  // hand back whichever byte was read last
          invalid(parsingErrorsAreFatal, pending);
        } else {
          // 4 bits from the leading byte plus 6 bits from each of the two
          // trailing bytes: 16 bits in total.
          const codeUnit =
              ((lead & 0xF) << 12) | ((second & 0x3F) << 6) | (third & 0x3F);
          // Consistency check: the valid range for a 3 byte sequence.
          jspb.asserts.assert(codeUnit >= 0x800 && codeUnit <= 0xFFFF);
          // And utf16 surrogates are disallowed.
          jspb.asserts.assert(
              codeUnit < MIN_SURROGATE || codeUnit > MAX_SURROGATE);
          pending.push(codeUnit);
        }
      }
    } else if (lead <= 0xF4) {
      // Four byte sequence. Leading bytes up to 0xF7 match the 4 byte bit
      // pattern, but everything above 0xF4 is either overlong or exceeds the
      // valid codepoint range.
      if (pos >= limit - 2) {
        // Fewer than three bytes remain.
        invalid(parsingErrorsAreFatal, pending);
      } else {
        const second = bytes[pos++];
        let wellFormed = !isNotTrailingByte(second) &&
            // Inspired by java/com/google/protobuf/Utf8.java.
            // Tricky optimized form of:
            //   overlong? 4 most significant bits must not all be zero
            //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
            //   codepoint larger than the highest code point (U+10FFFF)?
            //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F
            (((lead << 28) + (second - 0x90)) >> 30) === 0;
        // The remaining bytes are consumed one at a time, and only while
        // everything before them was good, so that an earlier error leaves
        // them for the next loop iteration.
        let third;
        let fourth;
        if (wellFormed) {
          third = bytes[pos++];
          wellFormed = !isNotTrailingByte(third);
        }
        if (wellFormed) {
          fourth = bytes[pos++];
          wellFormed = !isNotTrailingByte(fourth);
        }
        if (!wellFormed) {
          pos--;  // hand back whichever byte was read last
          invalid(parsingErrorsAreFatal, pending);
        } else {
          // 3 bits from the leading byte plus 6 bits from each of the three
          // trailing bytes: 21 bits, which does not fit in one 16 bit utf16
          // code unit, so a surrogate pair is produced.
          let codepoint = ((lead & 0x7) << 18) | ((second & 0x3F) << 12) |
              ((third & 0x3F) << 6) | (fourth & 0x3F);
          // Consistency check: the valid range for a 4 byte sequence.
          jspb.asserts.assert(codepoint >= 0x10000 && codepoint <= 0x10FFFF);
          // Surrogate formula from wikipedia:
          // 1. Subtract 0x10000 from the codepoint.
          codepoint -= 0x10000;
          // 2. Split the result into a high and a low 10-bit value.
          // 3. Add 0xD800 to the high value to form the high surrogate.
          // 4. Add 0xDC00 to the low value to form the low surrogate.
          const high = ((codepoint >> 10) & 0x3FF) + MIN_HIGH_SURROGATE;
          const low = (codepoint & 0x3FF) + MIN_LOW_SURROGATE;
          pending.push(high, low);
        }
      }
    } else {
      // The leading byte is too large for utf8.
      invalid(parsingErrorsAreFatal, pending);
    }
    // Flush periodically so that the `apply` call in codeUnitsToString never
    // receives enough arguments to exceed the maximum stack size.
    if (pending.length >= 8192) {
      decoded = codeUnitsToString(decoded, pending);
      pending.length = 0;
    }
  }
  // Ensure we neither overflowed nor underflowed the requested range.
  jspb.asserts.assert(pos === limit, `expected ${pos} === ${limit}`);
  return codeUnitsToString(decoded, pending);
};
| |
| |
/**
 * Cached answer for `isFatalTextDecoderCachableAfterThrowing`; `undefined`
 * means the probe has not run yet.
 * @type {boolean|undefined}
 */
let isFatalTextDecoderCachableAfterThrowing_ =
    // chrome version >= 2020 are not subject to https://crbug.com/910292
    goog.FEATURESET_YEAR >= 2020 ? true : undefined;

/**
 * Reports whether a fatal decoder can keep being reused after one of its
 * decodes has thrown, i.e. whether the browser is free of
 * https://crbug.com/910292. In chrome versions with that bug a single failed
 * decode makes all later decodes on the same decoder throw.
 *
 * The probe runs at most once; its result is cached for later calls.
 * @return {boolean}
 */
function isFatalTextDecoderCachableAfterThrowing(/** !TextDecoder */ decoder) {
  if (isFatalTextDecoderCachableAfterThrowing_ !== undefined) {
    return isFatalTextDecoderCachableAfterThrowing_;
  }

  // In theory no error needs to be provoked here, because this function is
  // only called after a failed decode. However the buggy chrome versions do
  // not corrupt their internal state consistently; it depends on where in the
  // decode stream the error occurred. Based on manual testing, the input
  // below does trigger the bug consistently.
  try {
    // A lonely continuation byte.
    decoder.decode(new Uint8Array([0x80]));
  } catch (e) {
    // Expected: this input is invalid.
  }

  let stillUsable;
  try {
    // 'a' in hex. A healthy decoder must accept this.
    decoder.decode(new Uint8Array([0x61]));
    stillUsable = true;
  } catch (e) {
    // Valid input was rejected, so this chrome version is buggy and the
    // cached decoder has to be flushed whenever a failure occurs.
    stillUsable = false;
  }
  isFatalTextDecoderCachableAfterThrowing_ = stillUsable;
  return isFatalTextDecoderCachableAfterThrowing_;
}
| |
/**
 * Lazily created utf-8 decoder that throws on malformed input.
 * @type {!TextDecoder|undefined}
 */
let fatalDecoderInstance;

/**
 * Returns the shared fatal decoder, creating it on first use (or after the
 * cached instance has been cleared).
 * @return {!TextDecoder}
 */
function getFatalDecoderInstance() {
  if (!fatalDecoderInstance) {
    fatalDecoderInstance = new TextDecoder('utf-8', {fatal: true});
  }
  return fatalDecoderInstance;
}
| |
/**
 * Lazily created utf-8 decoder that does not throw on malformed input.
 * @type {!TextDecoder|undefined}
 */
let nonFatalDecoderInstance;

/**
 * Returns the shared non-fatal decoder, creating it on first use.
 * @return {!TextDecoder}
 */
function getNonFatalDecoderInstance() {
  if (!nonFatalDecoderInstance) {
    nonFatalDecoderInstance = new TextDecoder('utf-8', {fatal: false});
  }
  return nonFatalDecoderInstance;
}
| |
/**
 * Returns the `[offset, end)` portion of `bytes`, skipping the call to
 * `subarray` when the requested range is the whole array.
 *
 * `subarray` tends to be surprisingly slow.
 * @return {!Uint8Array} `bytes` itself when the range covers all of it,
 *     otherwise the result of `bytes.subarray(offset, end)`.
 */
function subarray(
    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ end) {
  const coversEverything = offset === 0 && end === bytes.length;
  if (coversEverything) {
    return bytes;
  }
  return bytes.subarray(offset, end);
}
| |
/**
 * Decodes `length` bytes of `bytes` starting at `offset` using the browser's
 * `TextDecoder`.
 *
 * With `parsingErrorsAreFatal` the fatal decoder is used and its error is
 * rethrown to the caller; otherwise the non-fatal decoder is used.
 * @return {string}
 */
jspb.binary.utf8.textDecoderDecodeUtf8 = function(
    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length,
    /** boolean*/ parsingErrorsAreFatal) {
  /** @type {!TextDecoder} */
  let decoder;
  if (parsingErrorsAreFatal) {
    decoder = getFatalDecoderInstance();
  } else {
    decoder = getNonFatalDecoderInstance();
  }

  const view = subarray(bytes, offset, offset + length);
  try {
    return decoder.decode(view);
  } catch (e) {
    // If this browser's fatal decoder cannot be reused after a failure, drop
    // the cached instance so that the next call constructs a fresh one.
    const mustDiscardDecoder = parsingErrorsAreFatal &&
        !isFatalTextDecoderCachableAfterThrowing(decoder);
    if (mustDiscardDecoder) {
      fatalDecoderInstance = undefined;
    }
    throw e;
  }
};
| |
/**
 * Whether decoding goes through `TextDecoder`: either forced by the compile
 * time flag or detected at load time.
 * @const {boolean}
 */
const useTextDecoderDecode =
    USE_TEXT_ENCODING || typeof TextDecoder !== 'undefined';

/**
 * Decodes `length` bytes of `bytes` starting at `offset` as utf8, using
 * `TextDecoder` when it is available and our polyfill implementation
 * otherwise.
 * @return {string}
 */
jspb.binary.utf8.decodeUtf8 = function(
    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length,
    /** boolean*/ parsingErrorsAreFatal) {
  if (useTextDecoderDecode) {
    return jspb.binary.utf8.textDecoderDecodeUtf8(
        bytes, offset, length, parsingErrorsAreFatal);
  }
  return jspb.binary.utf8.polyfillDecodeUtf8(
      bytes, offset, length, parsingErrorsAreFatal);
};
| |
/**
 * Lazily created shared encoder.
 * @type {!TextEncoder|undefined}
 */
let textEncoderInstance;

/**
 * Encodes `s` as utf8 using the browser's `TextEncoder`.
 *
 * When `rejectUnpairedSurrogates` is set, the string is first validated with
 * `jspb.binary.utf8.checkWellFormed`, which throws on an unpaired surrogate.
 * @return {!Uint8Array}
 */
jspb.binary.utf8.textEncoderEncode = function(
    /** string */ s, /** boolean */ rejectUnpairedSurrogates) {
  if (rejectUnpairedSurrogates) {
    jspb.binary.utf8.checkWellFormed(s);
  }

  let encoder = textEncoderInstance;
  if (!encoder) {
    encoder = textEncoderInstance = new TextEncoder();
  }
  return encoder.encode(s);
};
| |
// isWellFormed landed in major browsers in early 2023, so it is only
// guaranteed to be available from featureset year 2024 on. Before that it is
// feature-detected. See
// http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed
const /** boolean */ HAS_WELL_FORMED_METHOD = goog.FEATURESET_YEAR > 2023 ||
    typeof String.prototype.isWellFormed === 'function';

/**
 * Throws an Error if `text` contains an unpaired utf16 surrogate; otherwise
 * does nothing.
 */
jspb.binary.utf8.checkWellFormed = function(/** string */ text) {
  let hasUnpairedSurrogate;
  if (HAS_WELL_FORMED_METHOD) {
    // Externs don't contain the definition of this function yet, hence the
    // casts.
    // http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed
    const withNativeCheck =
        /** @type {{isWellFormed:function():boolean}} */ (
            /** @type {?} */ (text));
    hasUnpairedSurrogate = !withNativeCheck.isWellFormed();
  } else {
    // Matches a low surrogate that is not preceded by a high surrogate, or a
    // high surrogate that is not followed by a low surrogate.
    hasUnpairedSurrogate =
        /(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/
            .test(text);
  }
  if (hasUnpairedSurrogate) {
    throw new Error('Found an unpaired surrogate');
  }
};
| |
| |
/**
 * Hand-written UTF8 encoder.
 *
 * Encodes the utf16 string `s` as utf8. A properly paired high/low surrogate
 * becomes one 4 byte sequence. An unpaired surrogate either throws (when
 * `rejectUnpairedSurrogates` is true) or is encoded as U+FFFD.
 *
 * The result is produced by the `subarray` helper from a scratch buffer sized
 * for the worst case, so it may be a view on a larger buffer.
 *
 * @return {!Uint8Array}
 */
jspb.binary.utf8.polyfillEncode = function(
    /** string */ s, /** boolean */ rejectUnpairedSurrogates) {
  let bi = 0;
  // The worst case is that every character requires 3 output bytes (a
  // surrogate pair uses 4 bytes for 2 characters), so we allocate for this.
  // This assumes that the buffer will be short lived. Callers can always
  // `slice` if needed.
  const buffer = new Uint8Array(3 * s.length);
  for (let ci = 0; ci < s.length; ci++) {
    let c = s.charCodeAt(ci);
    if (c < 0x80) {
      buffer[bi++] = c;
    } else if (c < 0x800) {
      buffer[bi++] = (c >> 6) | 0xC0;
      buffer[bi++] = (c & 63) | 0x80;
    } else {
      jspb.asserts.assert(c < 65536);
      // Look for surrogates.
      if (c >= MIN_SURROGATE && c <= MAX_SURROGATE) {
        // A high surrogate can only be paired if another code unit follows
        // it, so check that before peeking at the next position.
        if (c <= MAX_HIGH_SURROGATE && ci + 1 < s.length) {
          const c2 = s.charCodeAt(ci + 1);
          if (c2 >= MIN_LOW_SURROGATE && c2 <= MAX_LOW_SURROGATE) {
            // Valid pair: consume the low surrogate as well.
            ci++;
            // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
            const codePoint =
                (c - MIN_SURROGATE) * 0x400 + c2 - MIN_LOW_SURROGATE + 0x10000;
            buffer[bi++] = (codePoint >> 18) | 0xF0;
            buffer[bi++] = ((codePoint >> 12) & 63) | 0x80;
            buffer[bi++] = ((codePoint >> 6) & 63) | 0x80;
            buffer[bi++] = (codePoint & 63) | 0x80;
            continue;
          }
          // Otherwise c2 is not a low surrogate: c is a lone surrogate and
          // c2 is processed on the next loop as an independent character.
        }
        // Reaching this point means c is an unpaired surrogate: a lone low
        // surrogate, or a high surrogate without a low surrogate after it.
        if (rejectUnpairedSurrogates) {
          throw new Error('Found an unpaired surrogate');
        }
        c = 0xFFFD;  // Encode the replacement character instead.
      }
      buffer[bi++] = (c >> 12) | 0xE0;
      buffer[bi++] = ((c >> 6) & 63) | 0x80;
      buffer[bi++] = (c & 63) | 0x80;
    }
  }
  return subarray(buffer, 0, bi);
};
| |
/**
 * Whether encoding goes through `TextEncoder`: either forced by the compile
 * time flag or detected at load time.
 * @const {boolean}
 */
const useTextEncoderEncode =
    USE_TEXT_ENCODING || typeof TextEncoder !== 'undefined';

/**
 * Encodes `string` as utf8, using `TextEncoder` when it is available and our
 * polyfill implementation otherwise.
 *
 * `rejectUnpairedSurrogates` defaults to false and is forwarded unchanged to
 * the selected implementation.
 * @return {!Uint8Array}
 */
jspb.binary.utf8.encodeUtf8 = function(
    /**string*/ string, /** boolean=*/ rejectUnpairedSurrogates = false) {
  jspb.asserts.assertString(string);
  if (useTextEncoderEncode) {
    return jspb.binary.utf8.textEncoderEncode(string, rejectUnpairedSurrogates);
  }
  return jspb.binary.utf8.polyfillEncode(string, rejectUnpairedSurrogates);
};