blob: ad397e7a2500fceab0a836966db0e56745bb9c48 [file] [edit]
///////////////////////////////////////////////////////////////////////////////
// //
// Unicode.cpp //
// Copyright (C) Microsoft Corporation. All rights reserved. //
// This file is distributed under the University of Illinois Open Source //
// License. See LICENSE.TXT for details. //
// //
// Provides utility functions to work with Unicode and other encodings. //
// //
///////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32
#include <specstrings.h>
#else
#include <clocale>
#endif
#include "dxc/Support/Global.h"
#include "dxc/Support/Unicode.h"
#include "dxc/Support/WinIncludes.h"
#include <assert.h>
#include <new>
#include <string>
#ifndef _WIN32
// MultiByteToWideChar is a Windows-specific method.
// This is a very simplistic implementation for non-Windows platforms. This
// implementation completely ignores CodePage and dwFlags; the conversion is
// performed by mbstowcs while a ScopedLocale constructed with CP_UTF8 is alive.
//
//   lpMultiByteStr - source string; must not be null.
//   cbMultiByte    - number of bytes to convert, or -1 when the source is
//                    null-terminated and should be processed entirely.
//   lpWideCharStr  - destination buffer; ignored when cchWideChar is 0.
//   cchWideChar    - destination capacity in wide characters, or 0 to query
//                    the required size.
//
// Returns the number of wide characters produced. The count includes the
// terminator when the source range ended with one. Returns 0 after calling
// SetLastError(ERROR_INVALID_PARAMETER) on any failure.
int MultiByteToWideChar(uint32_t /*CodePage*/, uint32_t /*dwFlags*/,
                        const char *lpMultiByteStr, int cbMultiByte,
                        wchar_t *lpWideCharStr, int cchWideChar) {
  // Check for a missing source, invalid sizes or potential overflow.
  if (lpMultiByteStr == nullptr || cbMultiByte == 0 || cbMultiByte < -1 ||
      cbMultiByte == INT32_MAX || cchWideChar < 0 ||
      cchWideChar == INT32_MAX) {
    SetLastError(ERROR_INVALID_PARAMETER);
    return 0;
  }

  // If cbMultiByte is -1, lpMultiByteStr is null-terminated and the entire
  // string (terminator included) should be processed.
  if (cbMultiByte == -1) {
    const size_t srcLen = strlen(lpMultiByteStr);
    // Measure in size_t and bound the result so that neither the +1 here nor
    // the +1 used for the temporary copy below can overflow an int.
    if (srcLen >= static_cast<size_t>(INT32_MAX) - 1) {
      SetLastError(ERROR_INVALID_PARAMETER);
      return 0;
    }
    cbMultiByte = static_cast<int>(srcLen) + 1;
  }

  // If zero is given as the destination size, this function should
  // return the required size (including or excluding the null-terminating
  // character depending on whether the input included the null-terminator).
  // This is the behavior of mbstowcs when the target is null.
  if (cchWideChar == 0) {
    lpWideCharStr = nullptr;
  }

  ScopedLocale utf8_locale_scope(CP_UTF8);

  bool isNullTerminated = false;
  size_t rv;
  if (lpMultiByteStr[cbMultiByte - 1] != '\0') {
    // mbstowcs needs a terminated source, so convert from a terminated copy.
    const size_t copyLen = static_cast<size_t>(cbMultiByte);
    char *srcStr = static_cast<char *>(malloc(copyLen + 1));
    if (srcStr == nullptr) {
      // Allocation failed; report it through the same failure path as the
      // other errors in this function instead of dereferencing null.
      SetLastError(ERROR_INVALID_PARAMETER);
      return 0;
    }
    strncpy(srcStr, lpMultiByteStr, copyLen);
    srcStr[copyLen] = '\0';
    rv = mbstowcs(lpWideCharStr, srcStr, cchWideChar);
    free(srcStr);
  } else {
    rv = mbstowcs(lpWideCharStr, lpMultiByteStr, cchWideChar);
    isNullTerminated = true;
  }

  if (rv == ~(size_t)0) {
    // mbstowcs returns -1 on error.
    SetLastError(ERROR_INVALID_PARAMETER);
    return 0;
  }

  // Return value of mbstowcs (rv) excludes the terminating character.
  // Matching MultiByteToWideChar requires returning the size written including
  // the null terminator if the input was null-terminated, otherwise it
  // returns the size written excluding the null terminator.
  if (isNullTerminated)
    rv += 1;

  // Check for overflow when returning the size.
  if (rv >= INT32_MAX) {
    SetLastError(ERROR_INVALID_PARAMETER);
    return 0; // Overflow error
  }

  return static_cast<int>(rv);
}
// WideCharToMultiByte is a Windows-specific method.
// This is a very simplistic implementation for non-Windows platforms. This
// implementation completely ignores CodePage, dwFlags and lpDefaultChar; the
// conversion is performed by wcstombs while a ScopedLocale constructed with
// CP_UTF8 is alive.
//
//   lpWideCharStr     - source string; must not be null.
//   cchWideChar       - number of wide characters to convert, or -1 when the
//                       source is null-terminated and should be processed
//                       entirely.
//   lpMultiByteStr    - destination buffer; ignored when cbMultiByte is 0.
//   cbMultiByte       - destination capacity in bytes, or 0 to query the
//                       required size.
//   lpUsedDefaultChar - optional; always set to false by this implementation.
//
// Returns the number of bytes produced. The count includes the terminator
// when the source range ended with one. Returns 0 after calling
// SetLastError(ERROR_INVALID_PARAMETER) on any failure.
int WideCharToMultiByte(uint32_t /*CodePage*/, uint32_t /*dwFlags*/,
                        const wchar_t *lpWideCharStr, int cchWideChar,
                        char *lpMultiByteStr, int cbMultiByte,
                        const char * /*lpDefaultChar*/,
                        bool *lpUsedDefaultChar) {
  if (lpUsedDefaultChar) {
    *lpUsedDefaultChar = false;
  }

  // Check for a missing source, invalid sizes or potential overflow.
  if (lpWideCharStr == nullptr || cchWideChar == 0 || cchWideChar < -1 ||
      cchWideChar > (INT32_MAX - 1) || cbMultiByte < 0 ||
      cbMultiByte > (INT32_MAX - 1)) {
    SetLastError(ERROR_INVALID_PARAMETER);
    return 0;
  }

  // If cchWideChar is -1, lpWideCharStr is null-terminated and the entire
  // string (terminator included) should be processed.
  if (cchWideChar == -1) {
    const size_t srcLen = wcslen(lpWideCharStr);
    // Measure in size_t and bound the result so that neither the +1 here nor
    // the +1 used for the temporary copy below can overflow an int.
    if (srcLen >= static_cast<size_t>(INT32_MAX) - 1) {
      SetLastError(ERROR_INVALID_PARAMETER);
      return 0;
    }
    cchWideChar = static_cast<int>(srcLen) + 1;
  }

  // If zero is given as the destination size, this function should
  // return the required size (including or excluding the null-terminating
  // character depending on whether the input included the null-terminator).
  // This is the behavior of wcstombs when the target is null.
  if (cbMultiByte == 0) {
    lpMultiByteStr = nullptr;
  }

  ScopedLocale utf8_locale_scope(CP_UTF8);

  bool isNullTerminated = false;
  size_t rv;
  if (lpWideCharStr[cchWideChar - 1] != L'\0') {
    // wcstombs needs a terminated source, so convert from a terminated copy.
    const size_t copyLen = static_cast<size_t>(cchWideChar);
    wchar_t *srcStr =
        static_cast<wchar_t *>(malloc((copyLen + 1) * sizeof(wchar_t)));
    if (srcStr == nullptr) {
      // Allocation failed; report it through the same failure path as the
      // other errors in this function instead of dereferencing null.
      SetLastError(ERROR_INVALID_PARAMETER);
      return 0;
    }
    wcsncpy(srcStr, lpWideCharStr, copyLen);
    srcStr[copyLen] = L'\0';
    rv = wcstombs(lpMultiByteStr, srcStr, cbMultiByte);
    free(srcStr);
  } else {
    rv = wcstombs(lpMultiByteStr, lpWideCharStr, cbMultiByte);
    isNullTerminated = true;
  }

  if (rv == ~(size_t)0) {
    // wcstombs returns -1 on error.
    SetLastError(ERROR_INVALID_PARAMETER);
    return 0;
  }

  // Return value of wcstombs (rv) excludes the terminating character.
  // Matching WideCharToMultiByte requires returning the size written including
  // the null terminator if the input was null-terminated, otherwise it
  // returns the size written excluding the null terminator.
  if (isNullTerminated)
    rv += 1;

  // Check for overflow when returning the size.
  if (rv >= INT32_MAX) {
    SetLastError(ERROR_INVALID_PARAMETER);
    return 0; // Overflow error
  }

  return static_cast<int>(rv);
}
#endif // _WIN32
namespace Unicode {
// Converts a wide string to the encoding selected by cp.
//
//   text   - source wide string; must not be null.
//   cWide  - number of wide characters to convert, or ~(size_t)0 when text is
//            null-terminated. Any other value must be below INT32_MAX.
//   cp     - code page forwarded to WideCharToMultiByte.
//   flags  - flags forwarded to WideCharToMultiByte.
//   pValue - receives the converted bytes; must not be null. A terminator
//            produced for null-terminated input is stripped from the result.
//   lossy  - optional; receives the used-default-character flag reported by
//            WideCharToMultiByte.
//
// Returns true on success (including zero-length input, which yields an empty
// string) and false on invalid arguments or conversion failure.
bool WideToEncodedString(const wchar_t *text, size_t cWide, DWORD cp,
                         DWORD flags, std::string *pValue, bool *lossy) {
  DXASSERT_NOMSG(cWide == ~(size_t)0 || cWide < INT32_MAX);
  // Zero-length input is deliberately not rejected here: it is handled as a
  // successful special case below.
  if (text == nullptr || pValue == nullptr ||
      !(cWide == ~(size_t)0 || cWide < INT32_MAX))
    return false;

  BOOL usedDefaultChar = false;
  LPBOOL pUsedDefaultChar = (lossy == nullptr) ? nullptr : &usedDefaultChar;
  if (lossy != nullptr)
    *lossy = false;

  // Handle zero-length as a special case; it's a special value to indicate
  // errors in WideCharToMultiByte.
  if (cWide == 0) {
    pValue->resize(0);
    return true;
  }

  // First pass: query the required size.
  int cbUTF8 = ::WideCharToMultiByte(cp, flags, text, static_cast<int>(cWide),
                                     nullptr, 0, nullptr, pUsedDefaultChar);
  if (cbUTF8 <= 0)
    return false;

  // Second pass: convert into the sized buffer.
  pValue->resize(cbUTF8);
  cbUTF8 = ::WideCharToMultiByte(cp, flags, text, static_cast<int>(cWide),
                                 &(*pValue)[0],
                                 static_cast<int>(pValue->size()), nullptr,
                                 pUsedDefaultChar);
  DXASSERT(cbUTF8 > 0, "otherwise contents have changed");
  if (cbUTF8 <= 0) {
    // The assert above may compile away; never fall through to the
    // resize(cbUTF8 - 1) arithmetic below with a failed conversion.
    pValue->resize(0);
    return false;
  }

  // Keep exactly what the second pass reported as written.
  pValue->resize(cbUTF8);

  if ((cWide == ~(size_t)0 || text[cWide - 1] == L'\0') &&
      (*pValue)[pValue->size() - 1] == '\0') {
    // When the input is null-terminated, the output includes the null
    // terminator. Reduce the size by 1 to remove the embedded null terminator
    // inside the string.
    pValue->resize(cbUTF8 - 1);
  }

  if (lossy != nullptr)
    *lossy = usedDefaultChar != 0;
  return true;
}
bool UTF8ToWideString(const char *pUTF8, std::wstring *pWide) {
return UTF8ToWideString(pUTF8, -1, pWide);
}
// Converts UTF-8 text to a wide string.
//
//   pUTF8  - source bytes; may only be null when cbUTF8 is 0.
//   cbUTF8 - number of bytes to convert, or ~(size_t)0 when pUTF8 is
//            null-terminated. Any other value must be below INT32_MAX.
//   pWide  - receives the converted text; must not be null. A terminator
//            produced for null-terminated input is stripped from the result.
//
// Returns true on success (including zero-length input, which yields an empty
// string) and false on invalid arguments or conversion failure.
bool UTF8ToWideString(const char *pUTF8, size_t cbUTF8, std::wstring *pWide) {
  DXASSERT_NOMSG(pWide != nullptr);
  DXASSERT_NOMSG(cbUTF8 == ~(size_t)0 || cbUTF8 < INT32_MAX);
  // The asserts above may compile away, so enforce the same contract at
  // runtime, matching the validation done by WideToEncodedString.
  if (pWide == nullptr || !(cbUTF8 == ~(size_t)0 || cbUTF8 < INT32_MAX))
    return false;

  // Handle zero-length as a special case; it's a special value to indicate
  // errors in MultiByteToWideChar.
  if (cbUTF8 == 0) {
    pWide->resize(0);
    return true;
  }

  if (pUTF8 == nullptr)
    return false;

  // First pass: query the required size.
  int cWide = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8,
                                    static_cast<int>(cbUTF8), nullptr, 0);
  if (cWide <= 0)
    return false;

  // Second pass: convert into the sized buffer.
  pWide->resize(cWide);
  cWide = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8,
                                static_cast<int>(cbUTF8), &(*pWide)[0],
                                static_cast<int>(pWide->size()));
  DXASSERT(cWide > 0, "otherwise contents changed");
  if (cWide <= 0) {
    // The assert above may compile away; never fall through to the
    // resize(cWide - 1) arithmetic below with a failed conversion.
    pWide->resize(0);
    return false;
  }

  // Keep exactly what the second pass reported as written.
  pWide->resize(cWide);

  if ((cbUTF8 == ~(size_t)0 || pUTF8[cbUTF8 - 1] == '\0') &&
      (*pWide)[pWide->size() - 1] == '\0') {
    // When the input is null-terminated, the output includes the null
    // terminator. Reduce the size by 1 to remove the embedded null terminator
    // inside the string.
    pWide->resize(cWide - 1);
  }
  return true;
}
// Converts the null-terminated UTF-8 string pUTF8 to a wide string and
// returns it; throws hlsl::Exception(DXC_E_STRING_ENCODING_FAILED) when the
// conversion reports failure.
std::wstring UTF8ToWideStringOrThrow(const char *pUTF8) {
  std::wstring wide;
  const bool converted = UTF8ToWideString(pUTF8, &wide);
  if (converted) {
    return wide;
  }
  throw hlsl::Exception(DXC_E_STRING_ENCODING_FAILED);
}
// Converts textLen bytes of UTF-8 text to the console encoding in two steps:
// UTF-8 to wide, then wide to console via WideToConsoleString.
// *lossy (when provided) is reset to false before any conversion is tried.
// Returns false if either step fails.
bool UTF8ToConsoleString(const char *text, size_t textLen, std::string *pValue,
                         bool *lossy) {
  DXASSERT_NOMSG(text != nullptr);
  DXASSERT_NOMSG(pValue != nullptr);

  if (lossy != nullptr) {
    *lossy = false;
  }

  std::wstring wideText;
  if (UTF8ToWideString(text, textLen, &wideText)) {
    // Pass the terminator held by c_str() along with the characters.
    const size_t lengthWithTerminator = wideText.length() + 1;
    return WideToConsoleString(wideText.c_str(), lengthWithTerminator, pValue,
                               lossy);
  }
  return false;
}
// Converts UTF-8 text to the console encoding by delegating to the sized
// overload with the all-ones length value.
bool UTF8ToConsoleString(const char *text, std::string *pValue, bool *lossy) {
  const size_t untilTerminator = ~(size_t)0;
  return UTF8ToConsoleString(text, untilTerminator, pValue, lossy);
}
bool WideToConsoleString(const wchar_t *text, size_t textLen,
std::string *pValue, bool *lossy) {
DXASSERT_NOMSG(text != nullptr);
DXASSERT_NOMSG(pValue != nullptr);
UINT cp = GetConsoleOutputCP();
return WideToEncodedString(text, textLen, cp, 0, pValue, lossy);
}
// Converts wide text to the console encoding by delegating to the sized
// overload with the all-ones length value.
bool WideToConsoleString(const wchar_t *text, std::string *pValue,
                         bool *lossy) {
  const size_t untilTerminator = ~(size_t)0;
  return WideToConsoleString(text, untilTerminator, pValue, lossy);
}
// Converts cWide wide characters to UTF-8 via WideToEncodedString using
// CP_UTF8, flags 0, and no lossy out-parameter.
bool WideToUTF8String(const wchar_t *pWide, size_t cWide, std::string *pUTF8) {
  DXASSERT_NOMSG(pWide != nullptr);
  DXASSERT_NOMSG(pUTF8 != nullptr);

  bool *const noLossyReport = nullptr;
  return WideToEncodedString(pWide, cWide, CP_UTF8, 0, pUTF8, noLossyReport);
}
// Converts wide text to UTF-8 via WideToEncodedString using the all-ones
// length value, CP_UTF8, flags 0, and no lossy out-parameter.
bool WideToUTF8String(const wchar_t *pWide, std::string *pUTF8) {
  DXASSERT_NOMSG(pWide != nullptr);
  DXASSERT_NOMSG(pUTF8 != nullptr);

  const size_t untilTerminator = ~(size_t)0;
  bool *const noLossyReport = nullptr;
  return WideToEncodedString(pWide, untilTerminator, CP_UTF8, 0, pUTF8,
                             noLossyReport);
}
// Converts the wide string pWide to UTF-8 and returns it; throws
// hlsl::Exception(DXC_E_STRING_ENCODING_FAILED) when the conversion reports
// failure.
std::string WideToUTF8StringOrThrow(const wchar_t *pWide) {
  std::string utf8;
  const bool converted = WideToUTF8String(pWide, &utf8);
  if (converted) {
    return utf8;
  }
  throw hlsl::Exception(DXC_E_STRING_ENCODING_FAILED);
}
// Converts the null-terminated UTF-8 string pUTF8 into a newly allocated wide
// buffer held by a CComHeapPtr and hands ownership to the caller through
// *ppWide.
//
// *ppWide is set to nullptr first and stays null on failure. Returns false
// when pUTF8 is null, the size query fails, allocation fails, or the
// conversion itself fails.
bool UTF8BufferToWideComHeap(const char *pUTF8, wchar_t **ppWide) throw() {
  *ppWide = nullptr;
  if (pUTF8 == nullptr)
    return false;

  // First pass: query the required size (terminator included, since the
  // length argument is -1).
  int c = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, -1,
                                nullptr, 0);
  if (c <= 0)
    return false;

  CComHeapPtr<wchar_t> p;
  if (!p.Allocate(c))
    return false;

  // Second pass: convert into the allocated buffer.
  const int written = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
                                            pUTF8, -1, p.m_pData, c);
  DXVERIFY_NOMSG(0 < written);
  if (written <= 0) {
    // Do not hand back an unfilled buffer; p is still owned by the smart
    // pointer here because Detach() has not been called.
    return false;
  }

  *ppWide = p.Detach();
  return true;
}
// Converts UTF-8 bytes into a newly allocated, always null-terminated wide
// buffer.
//
//   pUTF8  - source bytes; may only be null when cbUTF8 is 0.
//   cbUTF8 - number of bytes to convert, or -1 when pUTF8 is null-terminated.
//   ppWide - receives a buffer allocated with new[] (nothrow); the caller
//            releases it with delete[]. Set to nullptr on failure.
//   pcWide - receives the number of wchar_t elements allocated, terminator
//            slot included. Set to 0 on failure.
//
// Empty input yields a one-element buffer holding only the terminator.
// Returns false on invalid input, allocation failure or conversion failure.
bool UTF8BufferToWideBuffer(const char *pUTF8, int cbUTF8, wchar_t **ppWide,
                            size_t *pcWide) throw() {
  *ppWide = nullptr;
  *pcWide = 0;

  // A null source is only acceptable for the explicit zero-length case.
  if (pUTF8 == nullptr && cbUTF8 != 0)
    return false;

  if (cbUTF8 == 0 || (cbUTF8 == -1 && *pUTF8 == '\0')) {
    *ppWide = new (std::nothrow) wchar_t[1];
    if (*ppWide == nullptr)
      return false;
    (*ppWide)[0] = L'\0';
    *pcWide = 1;
    return true;
  }

  // First pass: query the required size.
  int c = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, cbUTF8,
                                nullptr, 0);
  if (c <= 0)
    return false;

  // add space for null-terminator if we're not accounting for it
  if (cbUTF8 != -1)
    c += 1;

  wchar_t *p = new (std::nothrow) wchar_t[c];
  if (p == nullptr)
    return false;

  // Second pass: convert into the allocated buffer.
  const int converted =
      ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pUTF8, cbUTF8, p, c);
  DXASSERT(converted > 0, "otherwise contents have changed");
  if (converted <= 0) {
    // The assert may compile away; never return uninitialized memory as a
    // successful result.
    delete[] p;
    return false;
  }

  p[c - 1] = L'\0';
  *ppWide = p;
  *pcWide = c;
  return true;
}
// Converts wide characters into a newly allocated, always null-terminated
// UTF-8 buffer.
//
//   pWide  - source characters; may only be null when cWide is 0.
//   cWide  - number of wide characters to convert, or -1 when pWide is
//            null-terminated.
//   ppUTF8 - receives a buffer allocated with new[] (nothrow); the caller
//            releases it with delete[]. Set to nullptr on failure.
//   pcUTF8 - receives the number of bytes allocated, terminator slot
//            included. Set to 0 on failure.
//
// Empty input yields a one-byte buffer holding only the terminator.
// Returns false on invalid input, allocation failure or conversion failure.
bool WideBufferToUTF8Buffer(const wchar_t *pWide, int cWide, char **ppUTF8,
                            size_t *pcUTF8) throw() {
  *ppUTF8 = nullptr;
  *pcUTF8 = 0;

  // A null source is only acceptable for the explicit zero-length case.
  if (pWide == nullptr && cWide != 0)
    return false;

  if (cWide == 0 || (cWide == -1 && *pWide == L'\0')) {
    *ppUTF8 = new (std::nothrow) char[1];
    if (*ppUTF8 == nullptr)
      return false;
    (*ppUTF8)[0] = '\0';
    *pcUTF8 = 1;
    return true;
  }

  // First pass: query the required size.
  int c1 = ::WideCharToMultiByte(CP_UTF8, // code page
                                 0,       // flags
                                 pWide,   // string to convert
                                 cWide,   // size, in chars, of string to convert
                                 nullptr, // output buffer
                                 0,       // size of output buffer
                                 nullptr, nullptr);
  if (c1 <= 0)
    return false;

  // add space for null-terminator if we're not accounting for it
  if (cWide != -1)
    c1 += 1;

  char *p = new (std::nothrow) char[c1];
  if (p == nullptr)
    return false;

  // Second pass: convert into the allocated buffer.
  const int converted =
      ::WideCharToMultiByte(CP_UTF8, 0, pWide, cWide, p, c1, nullptr, nullptr);
  DXASSERT(converted > 0, "otherwise contents have changed");
  if (converted <= 0) {
    // The assert may compile away; never return uninitialized memory as a
    // successful result.
    delete[] p;
    return false;
  }

  p[c1 - 1] = '\0';
  *ppUTF8 = p;
  *pcUTF8 = c1;
  return true;
}
// Matches pName against pMask, where a mask ending in `star` is a prefix
// pattern and any other mask must equal the name exactly.
//
//   pMask/maskLen - mask characters and their count.
//   pName/nameLen - name characters and their count.
//   star          - the wildcard character recognised at the end of the mask.
//
// An empty mask matches only an empty name, and an empty name is matched only
// by an empty mask. Comparison is a raw memcmp over the characters.
template <typename TChar>
static bool IsStarMatchT(const TChar *pMask, size_t maskLen, const TChar *pName,
                         size_t nameLen, TChar star) {
  // With either side empty, the result is a match only when both are empty.
  if (maskLen == 0 || nameLen == 0) {
    return maskLen == nameLen;
  }

  const bool isPrefixPattern = (pMask[maskLen - 1] == star);
  if (!isPrefixPattern) {
    // Exact match: same length and same characters.
    return nameLen == maskLen &&
           0 == memcmp(pMask, pName, sizeof(TChar) * nameLen);
  }

  // Prefix match: compare everything before the trailing star.
  const size_t prefixLen = maskLen - 1;
  if (prefixLen == 0) {
    // For just '*', everything is a match.
    return true;
  }
  if (prefixLen > nameLen) {
    // Mask is longer than name, can't be a match.
    return false;
  }
  return 0 == memcmp(pMask, pName, sizeof(TChar) * prefixLen);
}
// Narrow-character entry point for IsStarMatchT, using '*' as the wildcard.
bool IsStarMatchUTF8(const char *pMask, size_t maskLen, const char *pName,
                     size_t nameLen) {
  const char wildcard = '*';
  return IsStarMatchT<char>(pMask, maskLen, pName, nameLen, wildcard);
}
// Wide-character entry point for IsStarMatchT, using L'*' as the wildcard.
bool IsStarMatchWide(const wchar_t *pMask, size_t maskLen, const wchar_t *pName,
                     size_t nameLen) {
  const wchar_t wildcard = L'*';
  return IsStarMatchT<wchar_t>(pMask, maskLen, pName, nameLen, wildcard);
}
} // namespace Unicode