| /* |
| * Copyright 2024 WebAssembly Community Group participants |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <optional> |
| #include <ostream> |
| |
| #include "support/string.h" |
| |
| namespace wasm::String { |
| |
| Split::Split(const std::string& input, const NewLineOr& newLineOrDelim) { |
| auto first = input.find("\n", 0); |
| if (first != std::string::npos && first != input.length() - 1) { |
| split(input, "\n"); |
| } else { |
| split(input, newLineOrDelim.delim); |
| } |
| } |
| |
| void Split::split(const std::string& input, const std::string& delim) { |
| size_t lastEnd = 0; |
| while (lastEnd < input.size()) { |
| auto nextDelim = input.find(delim, lastEnd); |
| if (nextDelim == std::string::npos) { |
| nextDelim = input.size(); |
| } |
| (*this).push_back(input.substr(lastEnd, nextDelim - lastEnd)); |
| lastEnd = nextDelim + delim.size(); |
| } |
| needToHandleBracketingOperations = delim != "\n"; |
| } |
| |
| Split handleBracketingOperators(Split split) { |
| if (!split.needToHandleBracketingOperations) { |
| return split; |
| } |
| |
| Split ret; |
| std::string last; |
| int nesting = 0; |
| auto handlePart = [&](std::string part) { |
| if (part.empty()) { |
| return; |
| } |
| for (const char c : part) { |
| if (c == '(' || c == '<' || c == '[' || c == '{') { |
| nesting++; |
| } else if (c == ')' || c == '>' || c == ']' || c == '}') { |
| nesting--; |
| } |
| } |
| if (last.empty()) { |
| last = part; |
| } else { |
| last += ',' + part; |
| } |
| if (nesting == 0) { |
| ret.push_back(last); |
| last.clear(); |
| } |
| }; |
| for (auto& part : split) { |
| handlePart(part); |
| } |
| handlePart(""); |
| if (nesting != 0) { |
| Fatal() << "Asyncify: failed to parse lists"; |
| } |
| return ret; |
| } |
| |
// Returns whether |value| matches |pattern|, where each '*' in the pattern
// matches any (possibly empty) run of characters and every other character
// must match exactly. The whole of |value| must be consumed.
//
// This is an iterative matcher with single-star backtracking: it never copies
// substrings and never indexes past the end of |value|.
bool wildcardMatch(const std::string& pattern, const std::string& value) {
  size_t p = 0;
  size_t v = 0;
  // Position of the most recent '*' in the pattern, and the position in the
  // value from which that star's current match attempt started.
  size_t starInPattern = std::string::npos;
  size_t starInValue = 0;

  while (v < value.size()) {
    if (p < pattern.size() && pattern[p] == '*') {
      // Check for '*' first so that it is always a wildcard, even when the
      // value itself contains a literal '*'. Start by matching nothing.
      starInPattern = p++;
      starInValue = v;
    } else if (p < pattern.size() && pattern[p] == value[v]) {
      ++p;
      ++v;
    } else if (starInPattern != std::string::npos) {
      // Mismatch: let the last star absorb one more character and retry.
      p = starInPattern + 1;
      v = ++starInValue;
    } else {
      return false;
    }
  }

  // The value is exhausted; only stars may remain in the pattern.
  while (p < pattern.size() && pattern[p] == '*') {
    ++p;
  }
  return p == pattern.size();
}
| |
// Returns a copy of |input| with trailing whitespace and trailing NUL
// characters removed. Leading whitespace is left untouched.
std::string trim(const std::string& input) {
  size_t size = input.size();
  while (size > 0) {
    // isspace() has undefined behavior for negative arguments other than EOF,
    // which plain char can produce for bytes >= 0x80, so convert to unsigned
    // char before classifying.
    const auto c = static_cast<unsigned char>(input[size - 1]);
    if (c != '\0' && !isspace(c)) {
      break;
    }
    --size;
  }
  return input.substr(0, size);
}
| |
// Writes |str| to |os| surrounded by double quotes. Tab, newline, carriage
// return, both quote characters and the backslash are written as
// backslash escapes; other bytes in [32, 127) are written as-is; every
// remaining byte is written as a backslash followed by its two hex digits.
// The stream is left in decimal mode after a hex escape has been written.
std::ostream& printEscaped(std::ostream& os, std::string_view str) {
  os << '"';
  for (unsigned char c : str) {
    const char* named = nullptr;
    switch (c) {
      case '\t':
        named = "\\t";
        break;
      case '\n':
        named = "\\n";
        break;
      case '\r':
        named = "\\r";
        break;
      case '"':
        named = "\\\"";
        break;
      case '\'':
        named = "\\'";
        break;
      case '\\':
        named = "\\\\";
        break;
      default:
        break;
    }
    if (named) {
      os << named;
      continue;
    }
    const bool printable = c >= 32 && c < 127;
    if (printable) {
      os << c;
      continue;
    }
    // High nibble, then low nibble, each printed as one hex digit.
    os << std::hex << '\\' << (c / 16) << (c % 16) << std::dec;
  }
  return os << '"';
}
| |
| namespace { |
| |
// Decodes one code point from the front of |str| and advances |str| past the
// bytes it consumed. Returns nullopt, still advancing |str|, for a bad leading
// byte, a bad trailing byte, a truncated sequence (which consumes the rest of
// the string), an overlong encoding, or a value of 0x110000 and above.
// Surrogate code points are not rejected here. An empty |str| yields nullopt
// and is left unchanged.
std::optional<uint32_t> takeWTF8CodePoint(std::string_view& str) {
  if (str.empty()) {
    return std::nullopt;
  }

  const uint8_t lead = str[0];
  size_t numTrailing = 0;
  uint32_t codePoint = 0;
  bool wellFormed = true;
  if ((lead & 0b10000000) == 0b00000000) {
    // 0xxxxxxx
    codePoint = lead;
  } else if ((lead & 0b11100000) == 0b11000000) {
    // 110xxxxx 10xxxxxx
    numTrailing = 1;
    codePoint = (lead & 0b00011111) << 6;
  } else if ((lead & 0b11110000) == 0b11100000) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    numTrailing = 2;
    codePoint = (lead & 0b00001111) << 12;
  } else if ((lead & 0b11111000) == 0b11110000) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    numTrailing = 3;
    codePoint = (lead & 0b00000111) << 18;
  } else {
    // Not a legal leading byte; it is consumed on its own below.
    wellFormed = false;
  }

  if (str.size() <= numTrailing) {
    // The sequence is cut off by the end of the string: consume what is left.
    str.remove_prefix(str.size());
    return std::nullopt;
  }

  for (size_t i = 0; wellFormed && i < numTrailing; ++i) {
    const uint8_t cont = str[1 + i];
    if ((cont & 0b11000000) != 0b10000000) {
      // Not a 10xxxxxx continuation byte.
      wellFormed = false;
      break;
    }
    // Earlier continuation bytes hold more significant bits: shift by 6 for
    // each continuation byte that still follows this one.
    codePoint |= (cont & 0b00111111) << (6 * (numTrailing - i - 1));
  }

  // The full declared length is consumed whether or not decoding succeeded.
  str.remove_prefix(1 + numTrailing);

  if (!wellFormed) {
    return std::nullopt;
  }

  // The number of trailing bytes the shortest encoding of this value uses.
  size_t minimalTrailing;
  if (codePoint < 0x80) {
    minimalTrailing = 0;
  } else if (codePoint < 0x800) {
    minimalTrailing = 1;
  } else if (codePoint < 0x10000) {
    minimalTrailing = 2;
  } else if (codePoint < 0x110000) {
    minimalTrailing = 3;
  } else {
    // Too large to be a code point; can never equal numTrailing.
    minimalTrailing = size_t(-1);
  }
  if (numTrailing != minimalTrailing) {
    // Overlong encoding or overlarge code point.
    return std::nullopt;
  }

  return codePoint;
}
| |
// Reads one 16-bit code unit, stored little-endian, from the front of |str|
// and advances |str| by two bytes. If fewer than two bytes remain, consumes
// whatever is left and returns nullopt.
std::optional<uint16_t> takeWTF16CodeUnit(std::string_view& str) {
  if (str.size() >= 2) {
    const auto lowByte = static_cast<uint8_t>(str[0]);
    const auto highByte = static_cast<uint8_t>(str[1]);
    str.remove_prefix(2);
    return static_cast<uint16_t>(lowByte | (highByte << 8));
  }
  str.remove_prefix(str.size());
  return std::nullopt;
}
| |
| std::optional<uint32_t> takeWTF16CodePoint(std::string_view& str, |
| bool allowWTF = true) { |
| auto u = takeWTF16CodeUnit(str); |
| if (!u) { |
| return std::nullopt; |
| } |
| |
| if (0xD800 <= *u && *u < 0xDC00) { |
| // High surrogate; take the next low surrogate if it exists. |
| auto next = str; |
| auto low = takeWTF16CodeUnit(next); |
| if (low && 0xDC00 <= *low && *low < 0xE000) { |
| str = next; |
| uint16_t highBits = *u - 0xD800; |
| uint16_t lowBits = *low - 0xDC00; |
| return 0x10000 + ((highBits << 10) | lowBits); |
| } else if (!allowWTF) { |
| // Unpaired high surrogate. |
| return std::nullopt; |
| } |
| } else if (!allowWTF && 0xDC00 <= *u && *u < 0xE000) { |
| // Unpaired low surrogate. |
| return std::nullopt; |
| } |
| |
| return *u; |
| } |
| |
// Writes the 16-bit code unit |u| to |os| as two bytes, least significant
// byte first.
void writeWTF16CodeUnit(std::ostream& os, uint16_t u) {
  const uint8_t lowByte = u & 0xFF;
  const uint8_t highByte = u >> 8;
  os << lowByte << highByte;
}
| |
// U+FFFD, the code point the converters in this file write in place of input
// they could not decode.
constexpr uint32_t replacementCharacter{0xFFFD};
| |
| bool doConvertWTF16ToWTF8(std::ostream& os, |
| std::string_view str, |
| bool allowWTF) { |
| bool valid = true; |
| |
| while (str.size()) { |
| auto u = takeWTF16CodePoint(str, allowWTF); |
| if (!u) { |
| valid = false; |
| u = replacementCharacter; |
| } |
| writeWTF8CodePoint(os, *u); |
| } |
| |
| return valid; |
| } |
| |
| } // anonymous namespace |
| |
// Writes the code point |u| to |os| using one to four bytes, choosing the
// shortest form that fits. |u| must be below 0x110000. Surrogate code points
// are written like any other three-byte value. Returns |os|.
std::ostream& writeWTF8CodePoint(std::ostream& os, uint32_t u) {
  assert(u < 0x110000);

  // Writes one continuation byte (10xxxxxx) carrying the six bits of |u| that
  // start at bit |shift|.
  const auto writeContinuation = [&os, u](unsigned shift) {
    os << uint8_t(0b10000000 | ((u >> shift) & 0b00111111));
  };

  if (u < 0x80) {
    // 0xxxxxxx
    os << uint8_t(u);
    return os;
  }
  if (u < 0x800) {
    // 110xxxxx 10xxxxxx
    os << uint8_t(0b11000000 | ((u >> 6) & 0b00011111));
    writeContinuation(0);
    return os;
  }
  if (u < 0x10000) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    os << uint8_t(0b11100000 | ((u >> 12) & 0b00001111));
    writeContinuation(6);
    writeContinuation(0);
    return os;
  }
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  os << uint8_t(0b11110000 | ((u >> 18) & 0b00000111));
  writeContinuation(12);
  writeContinuation(6);
  writeContinuation(0);
  return os;
}
| |
| std::ostream& writeWTF16CodePoint(std::ostream& os, uint32_t u) { |
| assert(u < 0x110000); |
| if (u < 0x10000) { |
| writeWTF16CodeUnit(os, u); |
| } else { |
| // Encode with a surrogate pair. |
| uint16_t high = 0xD800 + ((u - 0x10000) >> 10); |
| uint16_t low = 0xDC00 + ((u - 0x10000) & 0x3FF); |
| writeWTF16CodeUnit(os, high); |
| writeWTF16CodeUnit(os, low); |
| } |
| return os; |
| } |
| |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" |
| |
| bool convertWTF8ToWTF16(std::ostream& os, std::string_view str) { |
| bool valid = true; |
| bool lastWasLeadingSurrogate = false; |
| |
| while (str.size()) { |
| auto u = takeWTF8CodePoint(str); |
| if (!u) { |
| valid = false; |
| u = replacementCharacter; |
| } |
| |
| bool isLeadingSurrogate = 0xD800 <= *u && *u < 0xDC00; |
| bool isTrailingSurrogate = 0xDC00 <= *u && *u < 0xE000; |
| if (lastWasLeadingSurrogate && isTrailingSurrogate) { |
| // Invalid surrogate sequence. |
| valid = false; |
| } |
| lastWasLeadingSurrogate = isLeadingSurrogate; |
| |
| writeWTF16CodePoint(os, *u); |
| } |
| |
| return valid; |
| } |
| |
| #pragma GCC diagnostic pop |
| |
| bool convertWTF16ToWTF8(std::ostream& os, std::string_view str) { |
| return doConvertWTF16ToWTF8(os, str, true); |
| } |
| |
| bool convertUTF16ToUTF8(std::ostream& os, std::string_view str) { |
| return doConvertWTF16ToWTF8(os, str, false); |
| } |
| |
| std::ostream& printEscapedJSON(std::ostream& os, std::string_view str) { |
| os << '"'; |
| while (str.size()) { |
| auto u = *takeWTF16CodePoint(str); |
| |
| // Use escape sequences mandated by the JSON spec. |
| switch (u) { |
| case '"': |
| os << "\\\""; |
| continue; |
| case '\\': |
| os << "\\\\"; |
| continue; |
| case '\b': |
| os << "\\b"; |
| continue; |
| case '\f': |
| os << "\\f"; |
| continue; |
| case '\n': |
| os << "\\n"; |
| continue; |
| case '\r': |
| os << "\\r"; |
| continue; |
| case '\t': |
| os << "\\t"; |
| continue; |
| default: |
| break; |
| } |
| |
| // TODO: To minimize size, consider additionally escaping only other control |
| // characters (u <= 0x1F) and surrogates, emitting everything else directly |
| // assuming a UTF-8 encoding of the JSON text. We don't do this now because |
| // Print.cpp would consider the contents unprintable, messing up our test. |
| bool isNaivelyPrintable = 32 <= u && u < 127; |
| if (isNaivelyPrintable) { |
| assert(u < 0x80 && "need additional logic to emit valid UTF-8"); |
| os << uint8_t(u); |
| continue; |
| } |
| |
| // Escape as '\uXXXX` for code points less than 0x10000 or as a |
| // '\uXXXX\uYYYY' surrogate pair otherwise. |
| auto printEscape = [&os](uint32_t codePoint) { |
| assert(codePoint < 0x10000); |
| os << std::hex << "\\u"; |
| os << ((codePoint & 0xF000) >> 12); |
| os << ((codePoint & 0x0F00) >> 8); |
| os << ((codePoint & 0x00F0) >> 4); |
| os << (codePoint & 0x000F); |
| os << std::dec; |
| }; |
| if (u < 0x10000) { |
| printEscape(u); |
| } else { |
| assert(u <= 0x10FFFF && "unexpectedly high code point"); |
| printEscape(0xD800 + ((u - 0x10000) >> 10)); |
| printEscape(0xDC00 + ((u - 0x10000) & 0x3FF)); |
| } |
| } |
| return os << '"'; |
| } |
| |
| bool isUTF8(std::string_view str) { |
| while (str.size()) { |
| auto u = takeWTF8CodePoint(str); |
| if (!u || (0xD800 <= *u && *u < 0xE000)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| } // namespace wasm::String |