// blob: c569375383e8d9d5bc8bcdd96242a5e0beba3fb5 [file] [edit]
#include "encoding_binding.h"
#include "ada.h"
#include "env-inl.h"
#include "node_errors.h"
#include "node_external_reference.h"
#include "simdutf.h"
#include "string_bytes.h"
#include "util.h"
#include "v8.h"
#include <algorithm>
#include <cstdint>
namespace node {
namespace encoding_binding {
using v8::ArrayBuffer;
using v8::BackingStore;
using v8::BackingStoreInitializationMode;
using v8::BackingStoreOnFailureMode;
using v8::Context;
using v8::FunctionCallbackInfo;
using v8::HandleScope;
using v8::Isolate;
using v8::Local;
using v8::Object;
using v8::ObjectTemplate;
using v8::SnapshotCreator;
using v8::String;
using v8::Uint8Array;
using v8::Value;
// Registers encode_into_results_buffer_ with `tracker` under the field name
// "encode_into_results_buffer".
void BindingData::MemoryInfo(MemoryTracker* tracker) const {
  const auto& results_buffer = encode_into_results_buffer_;
  tracker->TrackField("encode_into_results_buffer", results_buffer);
}
// Constructs the binding data for `realm` around `object`.
//
// `info` is null for a fresh binding; in that case the results array is
// exposed on `object` as the "encodeIntoResults" property. When `info` is
// non-null the results buffer is instead rebuilt through its Deserialize()
// call. In both cases the buffer is made weak afterwards.
BindingData::BindingData(Realm* realm,
                         Local<Object> object,
                         InternalFieldInfo* info)
    : SnapshotableObject(realm, object, type_int),
      encode_into_results_buffer_(
          realm->isolate(),
          kEncodeIntoResultsLength,
          MAYBE_FIELD_PTR(info, encode_into_results_buffer)) {
  const bool has_serialized_info = info != nullptr;
  if (has_serialized_info) {
    encode_into_results_buffer_.Deserialize(realm->context());
  } else {
    Local<Context> context = realm->context();
    Local<String> property_name =
        FIXED_ONE_BYTE_STRING(realm->isolate(), "encodeIntoResults");
    object->Set(context, property_name, encode_into_results_buffer_.GetJSArray())
        .Check();
  }
  encode_into_results_buffer_.MakeWeak();
}
// Allocates a fresh InternalFieldInfo, stores the serialized form of the
// results buffer in it, and keeps it in internal_field_info_ until Serialize()
// hands it off. Expects that no info is pending (DCHECK).
bool BindingData::PrepareForSerialization(Local<Context> context,
                                          SnapshotCreator* creator) {
  DCHECK_NULL(internal_field_info_);
  InternalFieldInfo* pending_info =
      InternalFieldInfoBase::New<InternalFieldInfo>(type());
  pending_info->encode_into_results_buffer =
      encode_into_results_buffer_.Serialize(context, creator);
  internal_field_info_ = pending_info;
  // Always true: the reference to the binding from JS land has to be kept.
  return true;
}
// Returns the InternalFieldInfo prepared by PrepareForSerialization() and
// clears the member, so the pointer is handed out exactly once.
InternalFieldInfoBase* BindingData::Serialize(int index) {
  DCHECK_IS_SNAPSHOT_SLOT(index);
  InternalFieldInfo* const handed_off = internal_field_info_;
  internal_field_info_ = nullptr;
  return handed_off;
}
// The following code is adapted from Cloudflare workers.
// Particularly from: https://github.com/cloudflare/workerd/pull/5448
//
// Copyright (c) 2017-2025 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0
namespace {
// Passed as the second template argument to every MaybeStackBuffer used in
// this file (presumably the number of elements it can hold without a heap
// allocation -- confirm against MaybeStackBuffer's definition).
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
// Returns true when `lead` is a UTF-16 high surrogate (0xD800-0xDBFF) and
// `trail` is a low surrogate (0xDC00-0xDFFF), i.e. the two code units together
// encode one supplementary-plane code point.
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  const bool leadIsHigh = lead >= 0xd800 && lead <= 0xdbff;
  const bool trailIsLow = trail >= 0xdc00 && trail <= 0xdfff;
  return leadIsHigh && trailIsLow;
}
// Returns how many UTF-8 bytes a single UTF-16 code unit needs when encoded on
// its own:
//   - 1 byte  for ASCII (< 0x80)
//   - 2 bytes for U+0080..U+07FF
//   - 3 bytes for everything else in the BMP
// A lone surrogate code unit is reported as 3; callers that care about
// surrogate pairs (4 bytes for the pair) must correct for them separately.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // Two-byte UTF-8 sequences carry 11 payload bits, so they reach U+07FF.
  if (c < 0x800) return 2;
  return 3;
}
// Finds the maximum number of input characters (UTF-16 or Latin1) that can be
// encoded into a UTF-8 buffer of the given size.
//
// The challenge is that UTF-8 encoding expands characters by variable amounts:
// - ASCII (< 0x80): 1 byte
// - Code points < 0x800: 2 bytes
// - Other BMP characters: 3 bytes
// - Surrogate pairs (supplementary planes): 4 bytes total
//
// This function uses an adaptive chunking algorithm:
// 1. Process the input in chunks, estimating how many characters will fit
// 2. Calculate the actual UTF-8 length for each chunk using simdutf
// 3. Adjust the expansion factor based on observed encoding ratios
// 4. Fall back to character-by-character processing near the buffer boundary
// 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries
//
// The algorithm starts with a conservative expansion estimate (1.15x) and
// dynamically adjusts based on actual character distribution, making it
// efficient for common ASCII-heavy text while remaining correct for
// multi-byte heavy content.
template <typename Char>
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
size_t pos = 0;
size_t utf8Accumulated = 0;
constexpr size_t CHUNK = 257;
constexpr bool UTF16 = sizeof(Char) == 2;
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
double expansion = 1.15;
while (pos < length && utf8Accumulated < bufferSize) {
size_t remainingInput = length - pos;
size_t spaceRemaining = bufferSize - utf8Accumulated;
DCHECK_GE(expansion, 1.15);
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
if (guaranteedToFit >= remainingInput) {
return length;
}
size_t likelyToFit =
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
size_t fitEstimate =
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
size_t chunkSize = std::min(remainingInput, fitEstimate);
if (chunkSize == 1) break;
CHECK_GT(chunkSize, 1);
size_t chunkUtf8Len;
if constexpr (UTF16) {
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
// available For now, validate and use utf8_length_from_utf16
size_t newPos = pos + chunkSize;
if (newPos < length && isSurrogatePair(data[newPos - 1], data[newPos]))
chunkSize--;
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
} else {
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
}
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
DCHECK_GT(chunkSize, guaranteedToFit);
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
} else {
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
pos += chunkSize;
utf8Accumulated += chunkUtf8Len;
}
}
while (pos < length && utf8Accumulated < bufferSize) {
size_t extra = simpleUtfEncodingLength(data[pos]);
if (utf8Accumulated + extra > bufferSize) break;
pos++;
utf8Accumulated += extra;
}
if (UTF16 && pos != 0 && pos != length &&
isSurrogatePair(data[pos - 1], data[pos])) {
if (utf8Accumulated < bufferSize) {
pos++;
} else {
pos--;
}
}
return pos;
}
} // namespace
// Recreates the BindingData for `holder` in the realm of `context`, passing
// the serialized `info` on to the constructor.
void BindingData::Deserialize(Local<Context> context,
                              Local<Object> holder,
                              int index,
                              InternalFieldInfoBase* info) {
  DCHECK_IS_SNAPSHOT_SLOT(index);
  HandleScope scope(Isolate::GetCurrent());
  // The constructor takes care of recreating the buffer from this info.
  auto* serialized_info = static_cast<InternalFieldInfo*>(info);
  BindingData* restored =
      Realm::GetCurrent(context)->AddBindingData<BindingData>(holder,
                                                              serialized_info);
  CHECK_NOT_NULL(restored);
}
// Encodes the string in args[0] as UTF-8 into the Uint8Array in args[1].
// Nothing is returned through args; the outcome is stored in
// encode_into_results_buffer_ instead: slot 0 receives the number of source
// code units consumed ("read"), slot 1 the number of bytes stored ("written").
void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
CHECK_GE(args.Length(), 2);
CHECK(args[0]->IsString());
CHECK(args[1]->IsUint8Array());
Realm* realm = Realm::GetCurrent(args);
Isolate* isolate = realm->isolate();
BindingData* binding_data = realm->GetBindingData<BindingData>();
Local<String> source = args[0].As<String>();
Local<Uint8Array> dest = args[1].As<Uint8Array>();
Local<ArrayBuffer> buf = dest->Buffer();
// Handle detached buffers - return {read: 0, written: 0}
if (buf->Data() == nullptr) {
binding_data->encode_into_results_buffer_[0] = 0;
binding_data->encode_into_results_buffer_[1] = 0;
return;
}
// Start of the view's bytes inside the underlying ArrayBuffer.
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
size_t dest_length = dest->ByteLength();
size_t read = 0;
size_t written = 0;
// For small strings (length <= 32), use the old V8 path for better
// performance
static constexpr int kSmallStringThreshold = 32;
if (source->Length() <= kSmallStringThreshold) {
// WriteUtf8V2 returns the byte count and fills `read` via its out-parameter.
written = source->WriteUtf8V2(isolate,
write_result,
dest_length,
String::WriteFlags::kReplaceInvalidUtf8,
&read);
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
return;
}
v8::String::ValueView view(isolate, source);
// Every code unit yields at least one UTF-8 byte, so at most dest_length
// code units can ever be consumed.
size_t length_that_fits =
std::min(static_cast<size_t>(view.length()), dest_length);
if (view.is_one_byte()) {
auto data = reinterpret_cast<const char*>(view.data8());
simdutf::result result =
simdutf::validate_ascii_with_errors(data, length_that_fits);
// result.count is used as the length of the leading ASCII run, which is
// already valid UTF-8 and is copied as-is (presumably the index of the first
// non-ASCII byte, or the full length when there is none -- confirm against
// the simdutf documentation).
written = read = result.count;
memcpy(write_result, data, read);
// Step source, destination and both remaining lengths past the copied run.
write_result += read;
data += read;
length_that_fits -= read;
dest_length -= read;
if (length_that_fits != 0 && dest_length != 0) {
// Ask findBestFit how many of the remaining Latin1 characters fit in the
// remaining space before converting them.
if (size_t rest = findBestFit(data, length_that_fits, dest_length)) {
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
written += simdutf::convert_latin1_to_utf8(data, rest, write_result);
read += rest;
}
}
} else {
auto data = reinterpret_cast<const char16_t*>(view.data16());
// Limit conversion to what could fit in destination, avoiding splitting
// a valid surrogate pair at the boundary, which could cause a spurious call
// of simdutf::to_well_formed_utf16()
if (length_that_fits > 0 && length_that_fits < view.length() &&
isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) {
length_that_fits--;
}
// Check if input has unpaired surrogates - if so, convert to well-formed
// first
simdutf::result validation_result =
simdutf::validate_utf16_with_errors(data, length_that_fits);
if (validation_result.error == simdutf::SUCCESS) {
// Valid UTF-16 - use the fast path
read = findBestFit(data, length_that_fits, dest_length);
if (read != 0) {
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
}
} else {
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
// available
// The well-formed copy goes into a scratch buffer; the string's own data is
// only read.
MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
length_that_fits);
simdutf::to_well_formed_utf16(
data, length_that_fits, conversion_buffer.out());
// Now use findBestFit with the well-formed data
read =
findBestFit(conversion_buffer.out(), length_that_fits, dest_length);
if (read != 0) {
DCHECK_LE(
simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
dest_length);
written = simdutf::convert_utf16_to_utf8(
conversion_buffer.out(), read, write_result);
}
}
}
// Sanity check: never more bytes than the destination view holds.
DCHECK_LE(written, dest->ByteLength());
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
}
// Encode a single string to a UTF-8 Uint8Array (not Buffer).
// Used in TextEncoder.prototype.encode.
void BindingData::EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {
Isolate* isolate = args.GetIsolate();
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Local<String> source = args[0].As<String>();
// For small strings, use the V8 path
static constexpr int kSmallStringThreshold = 32;
if (source->Length() <= kSmallStringThreshold) {
size_t length = source->Utf8LengthV2(isolate);
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate,
length,
BackingStoreInitializationMode::kUninitialized,
BackingStoreOnFailureMode::kReturnNull);
if (!bs) [[unlikely]] {
THROW_ERR_MEMORY_ALLOCATION_FAILED(isolate);
return;
}
source->WriteUtf8V2(isolate,
static_cast<char*>(bs->Data()),
bs->MaxByteLength(),
String::WriteFlags::kReplaceInvalidUtf8);
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, length));
return;
}
size_t length = source->Length();
size_t utf8_length = 0;
bool is_one_byte = source->IsOneByte();
if (is_one_byte) {
// One-byte string (Latin1) - copy to buffer first, then process
MaybeStackBuffer<uint8_t, MAX_SIZE_FOR_STACK_ALLOC> latin1_buffer(length);
source->WriteOneByteV2(isolate, 0, length, latin1_buffer.out());
auto data = reinterpret_cast<const char*>(latin1_buffer.out());
// Check if it's pure ASCII - if so, we can just copy
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
if (result.error == simdutf::SUCCESS) {
// Pure ASCII - direct copy
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, length, BackingStoreInitializationMode::kUninitialized);
CHECK(bs);
memcpy(bs->Data(), data, length);
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, length));
return;
}
// Latin1 with non-ASCII characters - need conversion
utf8_length = simdutf::utf8_length_from_latin1(data, length);
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
CHECK(bs);
[[maybe_unused]] size_t written = simdutf::convert_latin1_to_utf8(
data, length, static_cast<char*>(bs->Data()));
DCHECK_EQ(written, utf8_length);
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
return;
}
// Two-byte string (UTF-16) - copy to buffer first
MaybeStackBuffer<uint16_t, MAX_SIZE_FOR_STACK_ALLOC> utf16_buffer(length);
source->WriteV2(isolate, 0, length, utf16_buffer.out());
auto data = reinterpret_cast<char16_t*>(utf16_buffer.out());
// Check for unpaired surrogates
simdutf::result validation_result =
simdutf::validate_utf16_with_errors(data, length);
if (validation_result.error == simdutf::SUCCESS) {
// Valid UTF-16 - use the fast path
utf8_length = simdutf::utf8_length_from_utf16(data, length);
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
CHECK(bs);
[[maybe_unused]] size_t written = simdutf::convert_utf16_to_utf8(
data, length, static_cast<char*>(bs->Data()));
DCHECK_EQ(written, utf8_length);
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
return;
}
// Invalid UTF-16 with unpaired surrogates - convert to well-formed in place
simdutf::to_well_formed_utf16(data, length, data);
utf8_length = simdutf::utf8_length_from_utf16(data, length);
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
CHECK(bs);
[[maybe_unused]] size_t written = simdutf::convert_utf16_to_utf8(
data, length, static_cast<char*>(bs->Data()));
DCHECK_EQ(written, utf8_length);
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
}
// Convert the input into an encoded string
void BindingData::DecodeUTF8(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args); // list, flags
CHECK_GE(args.Length(), 1);
if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
args[0]->IsArrayBufferView())) {
return node::THROW_ERR_INVALID_ARG_TYPE(
env->isolate(),
"The \"list\" argument must be an instance of SharedArrayBuffer, "
"ArrayBuffer or ArrayBufferView.");
}
ArrayBufferViewContents<char> buffer(args[0]);
bool ignore_bom = args[1]->IsTrue();
bool has_fatal = args[2]->IsTrue();
const char* data = buffer.data();
size_t length = buffer.length();
if (!ignore_bom && length >= 3) {
if (memcmp(data, "\xEF\xBB\xBF", 3) == 0) {
data += 3;
length -= 3;
}
}
if (has_fatal) {
// Are we perhaps ASCII? Then we won't have to check for UTF-8
if (!simdutf::validate_ascii_with_errors(data, length).error) {
Local<Value> ret;
if (StringBytes::Encode(env->isolate(), data, length, LATIN1)
.ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
}
return;
}
auto result = simdutf::validate_utf8_with_errors(data, length);
if (result.error) {
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
env->isolate(), "The encoded data was not valid for encoding utf-8");
}
// TODO(chalker): save on utf8 validity recheck in StringBytes::Encode()
}
if (length == 0) return args.GetReturnValue().SetEmptyString();
Local<Value> ret;
if (StringBytes::Encode(env->isolate(), data, length, UTF8).ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
}
}
void BindingData::ToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value input(env->isolate(), args[0]);
auto out = ada::idna::to_ascii(input.ToStringView());
Local<Value> ret;
if (ToV8Value(env->context(), out, env->isolate()).ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
}
}
void BindingData::ToUnicode(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value input(env->isolate(), args[0]);
auto out = ada::idna::to_unicode(input.ToStringView());
Local<Value> ret;
if (ToV8Value(env->context(), out, env->isolate()).ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
}
}
// Installs the binding's native methods on `target`. encodeInto is installed
// with SetMethod; the other four are installed with SetMethodNoSideEffect.
void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
                                             Local<ObjectTemplate> target) {
  Isolate* const iso = isolate_data->isolate();

  SetMethod(iso, target, "encodeInto", EncodeInto);

  SetMethodNoSideEffect(iso, target, "encodeUtf8String", EncodeUtf8String);
  SetMethodNoSideEffect(iso, target, "decodeUTF8", DecodeUTF8);
  SetMethodNoSideEffect(iso, target, "toASCII", ToASCII);
  SetMethodNoSideEffect(iso, target, "toUnicode", ToUnicode);
}
// Attaches a new BindingData to `target` in the realm that owns `context`.
// `unused` and `priv` are not consulted.
void BindingData::CreatePerContextProperties(Local<Object> target,
                                             Local<Value> unused,
                                             Local<Context> context,
                                             void* priv) {
  Realm::GetCurrent(context)->AddBindingData<BindingData>(target);
}
// Registers every native callback exposed by this binding with `registry`.
// The list mirrors the methods installed in CreatePerIsolateProperties().
void BindingData::RegisterTimerExternalReferences(
    ExternalReferenceRegistry* registry) {
  // Encoding entry points.
  registry->Register(EncodeInto);
  registry->Register(EncodeUtf8String);

  // Decoding entry point.
  registry->Register(DecodeUTF8);

  // IDNA helpers.
  registry->Register(ToASCII);
  registry->Register(ToUnicode);
}
} // namespace encoding_binding
} // namespace node
// Binding registration. Each macro below is given the binding name
// `encoding_binding` and one static BindingData entry point defined in this
// file. The macros themselves are defined elsewhere (presumably they hook the
// binding into per-context setup, per-isolate template setup and the
// external-reference table respectively -- confirm against their definitions).
NODE_BINDING_CONTEXT_AWARE_INTERNAL(
encoding_binding,
node::encoding_binding::BindingData::CreatePerContextProperties)
NODE_BINDING_PER_ISOLATE_INIT(
encoding_binding,
node::encoding_binding::BindingData::CreatePerIsolateProperties)
NODE_BINDING_EXTERNAL_REFERENCE(
encoding_binding,
node::encoding_binding::BindingData::RegisterTimerExternalReferences)