| /* Format bytes as hexadecimal */ |
| |
| #include "Python.h" |
| #include "pycore_strhex.h" // _Py_strhex_with_sep() |
| #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency() |
| |
| /* Scalar hexlify: convert len bytes to 2*len hex characters. |
| Uses table lookup via Py_hexdigits for the conversion. */ |
| static inline void |
| _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
| { |
| /* Various optimizations like using math instead of a table lookup, |
| manually unrolling the loop, storing the global table pointer locally, |
| and doing wider dst writes have been tried and benchmarked; all produced |
| nearly identical performance on gcc 15. Using a 256 entry uint16_t |
| table was a bit slower. So we keep our old simple and obvious code. */ |
| for (Py_ssize_t i = 0; i < len; i++) { |
| unsigned char c = src[i]; |
| *dst++ = Py_hexdigits[c >> 4]; |
| *dst++ = Py_hexdigits[c & 0x0f]; |
| } |
| } |
| |
| /* Portable SIMD optimization for hexlify using GCC/Clang vector extensions. |
| Uses __builtin_shufflevector for portable interleave that compiles to |
| native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always], |
| NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags |
| for the target microarch allow it [try -march=native if running 32-bit |
| on an RPi3 or later]). |
| |
| Performance: |
   - For the more common small inputs the speedup varies from 1.1x to 3x.
   - On larger inputs it is up to 11x faster than the scalar code.
| |
| While faster is possible for big data using AVX2 or AVX512, that |
| adds a ton of complication. Who ever really hexes huge data? |
| The 16-64 byte boosts align nicely with md5 - sha512 hexdigests. |
| */ |
| #ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR |
| |
/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - for efficient comparison.
   Using signed comparison generates pcmpgtb on x86-64 instead of
   the slower psubusb+pcmpeqb sequence from unsigned comparison.
   ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));

/* Broadcast one byte value into every lane of a 16-lane vector. */
static inline v16u8
v16u8_splat(unsigned char x)
{
    v16u8 v = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
    return v;
}

static inline v16s8
v16s8_splat(signed char x)
{
    v16s8 v = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
    return v;
}
| |
| /* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration. |
| Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */ |
| static void |
| _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
| { |
| const v16u8 mask_0f = v16u8_splat(0x0f); |
| const v16u8 ascii_0 = v16u8_splat('0'); |
| const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */ |
| const v16s8 nine = v16s8_splat(9); |
| |
| Py_ssize_t i = 0; |
| |
| /* Process 16 bytes at a time */ |
| for (; i + 16 <= len; i += 16, dst += 32) { |
| /* Load 16 bytes (memcpy for safe unaligned access) */ |
| v16u8 data; |
| memcpy(&data, src + i, 16); |
| |
| /* Extract high and low nibbles using vector operators */ |
| v16u8 hi = (data >> 4) & mask_0f; |
| v16u8 lo = data & mask_0f; |
| |
| /* Compare > 9 using signed comparison for efficient codegen. |
| Nibble values 0-15 are safely in signed byte range. |
| This generates pcmpgtb on x86-64, avoiding the slower |
| psubusb+pcmpeqb sequence from unsigned comparison. */ |
| v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine); |
| v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine); |
| |
| /* Convert nibbles to hex ASCII */ |
| hi = hi + ascii_0 + (hi_gt9 & offset); |
| lo = lo + ascii_0 + (lo_gt9 & offset); |
| |
| /* Interleave hi/lo nibbles using portable shufflevector. |
| This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64, |
| or vzip on ARM32. */ |
| v16u8 result0 = __builtin_shufflevector(hi, lo, |
| 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); |
| v16u8 result1 = __builtin_shufflevector(hi, lo, |
| 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); |
| |
| /* Store 32 hex characters */ |
| memcpy(dst, &result0, 16); |
| memcpy(dst + 16, &result1, 16); |
| } |
| |
| /* Scalar fallback for remaining 0-15 bytes */ |
| _Py_hexlify_scalar(src + i, dst, len - i); |
| } |
| |
| #endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */ |
| |
/* Core implementation shared by all the _Py_strhex* entry points.
 *
 * Converts the arglen bytes at argbuf into their two-character lowercase
 * hexadecimal representation, optionally inserting a one-character
 * separator between groups of bytes.
 *
 * argbuf, arglen: input buffer; arglen must be >= 0.
 * sep: NULL for no separator, otherwise a length-1 str or bytes object.
 * bytes_per_sep_group: bytes per separated group; a negative value groups
 *     from the left (start) of the output, a positive value groups from
 *     the right (end).  Forced to 0 when sep is NULL.
 * return_bytes: non-zero to return a bytes object, zero for a 1-byte-kind
 *     (ASCII) str object.
 *
 * Returns a new reference, or NULL with an exception set.
 */
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
                                 PyObject* sep, int bytes_per_sep_group,
                                 const int return_bytes)
{
    assert(arglen >= 0);

    /* Validate sep and extract its single byte/char. */
    Py_UCS1 sep_char = 0;
    if (sep) {
        Py_ssize_t seplen = PyObject_Length((PyObject*)sep);
        if (seplen < 0) {
            return NULL;
        }
        if (seplen != 1) {
            PyErr_SetString(PyExc_ValueError, "sep must be length 1.");
            return NULL;
        }
        if (PyUnicode_Check(sep)) {
            /* Only 1-byte-kind strings can hold a usable separator. */
            if (PyUnicode_KIND(sep) != PyUnicode_1BYTE_KIND) {
                PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
                return NULL;
            }
            sep_char = PyUnicode_READ_CHAR(sep, 0);
        }
        else if (PyBytes_Check(sep)) {
            sep_char = PyBytes_AS_STRING(sep)[0];
        }
        else {
            PyErr_SetString(PyExc_TypeError, "sep must be str or bytes.");
            return NULL;
        }
        /* A str result must remain pure ASCII; a bytes result may carry
           a separator byte with the high bit set. */
        if (sep_char > 127 && !return_bytes) {
            PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
            return NULL;
        }
    }
    else {
        /* No separator object: ignore any requested grouping. */
        bytes_per_sep_group = 0;
    }
    unsigned int abs_bytes_per_sep = _Py_ABS_CAST(unsigned int, bytes_per_sep_group);
    Py_ssize_t resultlen = 0;
    if (bytes_per_sep_group && arglen > 0) {
        /* How many sep characters we'll be inserting. */
        resultlen = (arglen - 1) / abs_bytes_per_sep;
    }
    /* Bounds checking for our Py_ssize_t indices. */
    if (arglen >= PY_SSIZE_T_MAX / 2 - resultlen) {
        return PyErr_NoMemory();
    }
    resultlen += arglen * 2;

    /* A group at least as long as the input means no separator is ever
       inserted; drop back to the ungrouped fast path. */
    if ((size_t)abs_bytes_per_sep >= (size_t)arglen) {
        bytes_per_sep_group = 0;
        abs_bytes_per_sep = 0;
    }

    /* Allocate the result object and get its raw 1-byte buffer. */
    PyObject *retval;
    Py_UCS1 *retbuf;
    if (return_bytes) {
        /* If _PyBytes_FromSize() were public we could avoid malloc+copy. */
        retval = PyBytes_FromStringAndSize(NULL, resultlen);
        if (!retval) {
            return NULL;
        }
        retbuf = (Py_UCS1 *)PyBytes_AS_STRING(retval);
    }
    else {
        /* maxchar 127: output is guaranteed ASCII in this branch. */
        retval = PyUnicode_New(resultlen, 127);
        if (!retval) {
            return NULL;
        }
        retbuf = PyUnicode_1BYTE_DATA(retval);
    }

    /* Hexlify */
    Py_ssize_t i, j;
    unsigned char c;

    if (bytes_per_sep_group == 0) {
        /* No separators: bulk conversion, SIMD when available and the
           input is at least one full vector wide. */
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
        if (arglen >= 16) {
            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
        }
        else
#endif
        {
            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
        }
    }
    else {
        /* The number of complete chunk+sep periods */
        Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
        Py_ssize_t chunk;
        unsigned int k;

        if (bytes_per_sep_group < 0) {
            /* Negative grouping: count groups from the start, so fill the
               output forward; any short group ends up at the end. */
            i = j = 0;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i++];
                    retbuf[j++] = Py_hexdigits[c >> 4];
                    retbuf[j++] = Py_hexdigits[c & 0x0f];
                }
                retbuf[j++] = sep_char;
            }
            /* Remaining bytes after the last separator. */
            while (i < arglen) {
                c = argbuf[i++];
                retbuf[j++] = Py_hexdigits[c >> 4];
                retbuf[j++] = Py_hexdigits[c & 0x0f];
            }
            assert(j == resultlen);
        }
        else {
            /* Positive grouping: count groups from the end, so fill the
               output backward; any short group ends up at the start.
               Digits are written low nibble first because we walk in
               reverse. */
            i = arglen - 1;
            j = resultlen - 1;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i--];
                    retbuf[j--] = Py_hexdigits[c & 0x0f];
                    retbuf[j--] = Py_hexdigits[c >> 4];
                }
                retbuf[j--] = sep_char;
            }
            /* Remaining bytes before the first separator. */
            while (i >= 0) {
                c = argbuf[i--];
                retbuf[j--] = Py_hexdigits[c & 0x0f];
                retbuf[j--] = Py_hexdigits[c >> 4];
            }
            assert(j == -1);
        }
    }

#ifdef Py_DEBUG
    if (!return_bytes) {
        assert(_PyUnicode_CheckConsistency(retval, 1));
    }
#endif

    return retval;
}
| |
| PyObject * _Py_strhex(const char* argbuf, const Py_ssize_t arglen) |
| { |
| return _Py_strhex_impl(argbuf, arglen, NULL, 0, 0); |
| } |
| |
| /* Same as above but returns a bytes() instead of str() to avoid the |
| * need to decode the str() when bytes are needed. */ |
| PyObject* _Py_strhex_bytes(const char* argbuf, const Py_ssize_t arglen) |
| { |
| return _Py_strhex_impl(argbuf, arglen, NULL, 0, 1); |
| } |
| |
| /* These variants include support for a separator between every N bytes: */ |
| |
| PyObject* _Py_strhex_with_sep(const char* argbuf, const Py_ssize_t arglen, |
| PyObject* sep, const int bytes_per_group) |
| { |
| return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 0); |
| } |
| |
| /* Same as above but returns a bytes() instead of str() to avoid the |
| * need to decode the str() when bytes are needed. */ |
| PyObject* _Py_strhex_bytes_with_sep(const char* argbuf, const Py_ssize_t arglen, |
| PyObject* sep, const int bytes_per_group) |
| { |
| return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 1); |
| } |