| #include "string_decoder.h" // NOLINT(build/include_inline) |
| #include "string_decoder-inl.h" |
| |
| #include "env-inl.h" |
| #include "node_buffer.h" |
| #include "string_bytes.h" |
| #include "util.h" |
| |
| using v8::Array; |
| using v8::ArrayBufferView; |
| using v8::Context; |
| using v8::FunctionCallbackInfo; |
| using v8::Integer; |
| using v8::Isolate; |
| using v8::Local; |
| using v8::MaybeLocal; |
| using v8::Object; |
| using v8::String; |
| using v8::Value; |
| |
| namespace node { |
| |
| namespace { |
| |
| MaybeLocal<String> MakeString(Isolate* isolate, |
| const char* data, |
| size_t length, |
| enum encoding encoding) { |
| Local<Value> error; |
| MaybeLocal<Value> ret; |
| if (encoding == UTF8) { |
| return String::NewFromUtf8( |
| isolate, |
| data, |
| v8::NewStringType::kNormal, |
| length); |
| } else { |
| ret = StringBytes::Encode( |
| isolate, |
| data, |
| length, |
| encoding, |
| &error); |
| } |
| |
| if (ret.IsEmpty()) { |
| CHECK(!error.IsEmpty()); |
| isolate->ThrowException(error); |
| } |
| |
| DCHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString()); |
| return ret.FromMaybe(Local<Value>()).As<String>(); |
| } |
| |
| } // anonymous namespace |
| |
| |
| MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate, |
| const char* data, |
| size_t* nread_ptr) { |
| Local<String> prepend, body; |
| |
| size_t nread = *nread_ptr; |
| |
| if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) { |
| // See if we want bytes to finish a character from the previous |
| // chunk; if so, copy the new bytes to the missing bytes buffer |
| // and create a small string from it that is to be prepended to the |
| // main body. |
| if (MissingBytes() > 0) { |
| // There are never more bytes missing than the pre-calculated maximum. |
| CHECK_LE(MissingBytes() + BufferedBytes(), |
| kIncompleteCharactersEnd); |
| if (Encoding() == UTF8) { |
| // For UTF-8, we need special treatment to align with the V8 decoder: |
| // If an incomplete character is found at a chunk boundary, we use |
| // its remainder and pass it to V8 as-is. |
| for (size_t i = 0; i < nread && i < MissingBytes(); ++i) { |
| if ((data[i] & 0xC0) != 0x80) { |
| // This byte is not a continuation byte even though it should have |
| // been one. We stop decoding of the incomplete character at this |
| // point (but still use the rest of the incomplete bytes from this |
| // chunk) and assume that the new, unexpected byte starts a new one. |
| state_[kMissingBytes] = 0; |
| memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i); |
| state_[kBufferedBytes] += i; |
| data += i; |
| nread -= i; |
| break; |
| } |
| } |
| } |
| |
| size_t found_bytes = |
| std::min(nread, static_cast<size_t>(MissingBytes())); |
| memcpy(IncompleteCharacterBuffer() + BufferedBytes(), |
| data, |
| found_bytes); |
| // Adjust the two buffers. |
| data += found_bytes; |
| nread -= found_bytes; |
| |
| state_[kMissingBytes] -= found_bytes; |
| state_[kBufferedBytes] += found_bytes; |
| |
| if (LIKELY(MissingBytes() == 0)) { |
| // If no more bytes are missing, create a small string that we |
| // will later prepend. |
| if (!MakeString(isolate, |
| IncompleteCharacterBuffer(), |
| BufferedBytes(), |
| Encoding()).ToLocal(&prepend)) { |
| return MaybeLocal<String>(); |
| } |
| |
| *nread_ptr += BufferedBytes(); |
| // No more buffered bytes. |
| state_[kBufferedBytes] = 0; |
| } |
| } |
| |
| // It could be that trying to finish the previous chunk already |
| // consumed all data that we received in this chunk. |
| if (UNLIKELY(nread == 0)) { |
| body = !prepend.IsEmpty() ? prepend : String::Empty(isolate); |
| prepend = Local<String>(); |
| } else { |
| // If not, that means is no character left to finish at this point. |
| DCHECK_EQ(MissingBytes(), 0); |
| DCHECK_EQ(BufferedBytes(), 0); |
| |
| // See whether there is a character that we may have to cut off and |
| // finish when receiving the next chunk. |
| if (Encoding() == UTF8 && data[nread - 1] & 0x80) { |
| // This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte. |
| // This means we'll need to figure out where the character to which |
| // the byte belongs begins. |
| for (size_t i = nread - 1; ; --i) { |
| DCHECK_LT(i, nread); |
| state_[kBufferedBytes]++; |
| if ((data[i] & 0xC0) == 0x80) { |
| // This byte does not start a character (a "trailing" byte). |
| if (state_[kBufferedBytes] >= 4 || i == 0) { |
| // We either have more then 4 trailing bytes (which means |
| // the current character would not be inside the range for |
| // valid Unicode, and in particular cannot be represented |
| // through JavaScript's UTF-16-based approach to strings), or the |
| // current buffer does not contain the start of an UTF-8 character |
| // at all. Either way, this is invalid UTF8 and we can just |
| // let the engine's decoder handle it. |
| state_[kBufferedBytes] = 0; |
| break; |
| } |
| } else { |
| // Found the first byte of a UTF-8 character. By looking at the |
| // upper bits we can tell how long the character *should* be. |
| if ((data[i] & 0xE0) == 0xC0) { |
| state_[kMissingBytes] = 2; |
| } else if ((data[i] & 0xF0) == 0xE0) { |
| state_[kMissingBytes] = 3; |
| } else if ((data[i] & 0xF8) == 0xF0) { |
| state_[kMissingBytes] = 4; |
| } else { |
| // This lead byte would indicate a character outside of the |
| // representable range. |
| state_[kBufferedBytes] = 0; |
| break; |
| } |
| |
| if (BufferedBytes() >= MissingBytes()) { |
| // Received more or exactly as many trailing bytes than the lead |
| // character would indicate. In the "==" case, we have valid |
| // data and don't need to slice anything off; |
| // in the ">" case, this is invalid UTF-8 anyway. |
| state_[kMissingBytes] = 0; |
| state_[kBufferedBytes] = 0; |
| } |
| |
| state_[kMissingBytes] -= state_[kBufferedBytes]; |
| break; |
| } |
| } |
| } else if (Encoding() == UCS2) { |
| if ((nread % 2) == 1) { |
| // We got half a codepoint, and need the second byte of it. |
| state_[kBufferedBytes] = 1; |
| state_[kMissingBytes] = 1; |
| } else if ((data[nread - 1] & 0xFC) == 0xD8) { |
| // Half a split UTF-16 character. |
| state_[kBufferedBytes] = 2; |
| state_[kMissingBytes] = 2; |
| } |
| } else if (Encoding() == BASE64) { |
| state_[kBufferedBytes] = nread % 3; |
| if (state_[kBufferedBytes] > 0) |
| state_[kMissingBytes] = 3 - BufferedBytes(); |
| } |
| |
| if (BufferedBytes() > 0) { |
| // Copy the requested number of buffered bytes from the end of the |
| // input into the incomplete character buffer. |
| nread -= BufferedBytes(); |
| *nread_ptr -= BufferedBytes(); |
| memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes()); |
| } |
| |
| if (nread > 0) { |
| if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body)) |
| return MaybeLocal<String>(); |
| } else { |
| body = String::Empty(isolate); |
| } |
| } |
| |
| if (prepend.IsEmpty()) { |
| return body; |
| } else { |
| return String::Concat(isolate, prepend, body); |
| } |
| } else { |
| CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1); |
| return MakeString(isolate, data, nread, Encoding()); |
| } |
| } |
| |
| MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) { |
| if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) { |
| CHECK_EQ(MissingBytes(), 0); |
| CHECK_EQ(BufferedBytes(), 0); |
| } |
| |
| if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) { |
| // Ignore a single trailing byte, like the JS decoder does. |
| state_[kMissingBytes]--; |
| state_[kBufferedBytes]--; |
| } |
| |
| if (BufferedBytes() == 0) |
| return String::Empty(isolate); |
| |
| MaybeLocal<String> ret = |
| MakeString(isolate, |
| IncompleteCharacterBuffer(), |
| BufferedBytes(), |
| Encoding()); |
| |
| state_[kMissingBytes] = 0; |
| state_[kBufferedBytes] = 0; |
| |
| return ret; |
| } |
| |
| namespace { |
| |
| void DecodeData(const FunctionCallbackInfo<Value>& args) { |
| StringDecoder* decoder = |
| reinterpret_cast<StringDecoder*>(Buffer::Data(args[0])); |
| CHECK_NOT_NULL(decoder); |
| |
| CHECK(args[1]->IsArrayBufferView()); |
| ArrayBufferViewContents<char> content(args[1].As<ArrayBufferView>()); |
| size_t length = content.length(); |
| |
| MaybeLocal<String> ret = |
| decoder->DecodeData(args.GetIsolate(), content.data(), &length); |
| if (!ret.IsEmpty()) |
| args.GetReturnValue().Set(ret.ToLocalChecked()); |
| } |
| |
| void FlushData(const FunctionCallbackInfo<Value>& args) { |
| StringDecoder* decoder = |
| reinterpret_cast<StringDecoder*>(Buffer::Data(args[0])); |
| CHECK_NOT_NULL(decoder); |
| MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate()); |
| if (!ret.IsEmpty()) |
| args.GetReturnValue().Set(ret.ToLocalChecked()); |
| } |
| |
| void InitializeStringDecoder(Local<Object> target, |
| Local<Value> unused, |
| Local<Context> context, |
| void* priv) { |
| Environment* env = Environment::GetCurrent(context); |
| Isolate* isolate = env->isolate(); |
| |
| #define SET_DECODER_CONSTANT(name) \ |
| target->Set(context, \ |
| FIXED_ONE_BYTE_STRING(isolate, #name), \ |
| Integer::New(isolate, StringDecoder::name)).FromJust() |
| |
| SET_DECODER_CONSTANT(kIncompleteCharactersStart); |
| SET_DECODER_CONSTANT(kIncompleteCharactersEnd); |
| SET_DECODER_CONSTANT(kMissingBytes); |
| SET_DECODER_CONSTANT(kBufferedBytes); |
| SET_DECODER_CONSTANT(kEncodingField); |
| SET_DECODER_CONSTANT(kNumFields); |
| |
| Local<Array> encodings = Array::New(isolate); |
| #define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \ |
| encodings->Set(context, \ |
| static_cast<int32_t>(cname), \ |
| FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust() |
| ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii"); |
| ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8"); |
| ADD_TO_ENCODINGS_ARRAY(BASE64, "base64"); |
| ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le"); |
| ADD_TO_ENCODINGS_ARRAY(HEX, "hex"); |
| ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer"); |
| ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1"); |
| |
| target->Set(context, |
| FIXED_ONE_BYTE_STRING(isolate, "encodings"), |
| encodings).Check(); |
| |
| target->Set(context, |
| FIXED_ONE_BYTE_STRING(isolate, "kSize"), |
| Integer::New(isolate, sizeof(StringDecoder))).Check(); |
| |
| env->SetMethod(target, "decode", DecodeData); |
| env->SetMethod(target, "flush", FlushData); |
| } |
| |
| } // anonymous namespace |
| |
| } // namespace node |
| |
// Passes node::InitializeStringDecoder to Node's internal module macro under
// the name `string_decoder` (the macro itself is defined elsewhere).
| NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder, |
| node::InitializeStringDecoder) |