-
-
Notifications
You must be signed in to change notification settings - Fork 34.2k
Expand file tree
/
Copy pathpystrhex.c
More file actions
280 lines (248 loc) · 9.53 KB
/
pystrhex.c
File metadata and controls
280 lines (248 loc) · 9.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
/* Format bytes as hexadecimal */
#include "Python.h"
#include "pycore_strhex.h" // _Py_strhex_with_sep()
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
/* Scalar hexlify: convert len bytes to 2*len hex characters.
Uses table lookup via Py_hexdigits for the conversion. */
static inline void
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Deliberately simple: math instead of table lookup, manual loop
       unrolling, caching the table pointer locally, and wider dst writes
       were all benchmarked and performed essentially the same on gcc 15
       (a 256-entry uint16_t table was a bit slower), so the obvious
       version wins. */
    const unsigned char *end = src + len;
    while (src < end) {
        unsigned char byte = *src++;
        dst[0] = Py_hexdigits[byte >> 4];
        dst[1] = Py_hexdigits[byte & 0x0f];
        dst += 2;
    }
}
/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
Uses __builtin_shufflevector for portable interleave that compiles to
native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
for the target microarch allow it [try -march=native if running 32-bit
on an RPi3 or later]).
Performance:
- For more common small data it varies between 1.1-3x faster.
- Up to 11x faster on larger data than the scalar code.
While faster is possible for big data using AVX2 or AVX512, that
adds a ton of complication. Who ever really hexes huge data?
The 16-64 byte boosts align nicely with md5 - sha512 hexdigests.
*/
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - for efficient comparison.
Using signed comparison generates pcmpgtb on x86-64 instead of
the slower psubusb+pcmpeqb sequence from unsigned comparison.
ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));
/* Splat a byte value across all 16 lanes */
static inline v16u8
v16u8_splat(unsigned char x)
{
    /* Broadcast x into all 16 unsigned lanes. */
    return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}
/* Signed-lane variant of the splat above; used to build the comparison
   constant for the signed "> 9" test in _Py_hexlify_simd(). */
static inline v16s8
v16s8_splat(signed char x)
{
    return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}
/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
/* SIMD hexlify: write exactly 2*len lowercase hex characters to dst.
   src: input bytes; dst: output buffer with room for 2*len bytes;
   len: number of input bytes (any value >= 0 — the sub-16-byte tail is
   finished by the scalar routine). */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const v16u8 mask_0f = v16u8_splat(0x0f);
    const v16u8 ascii_0 = v16u8_splat('0');
    /* ASCII distance from digit encoding to lowercase letter encoding:
       added only to nibbles whose value exceeds 9. */
    const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
    const v16s8 nine = v16s8_splat(9);
    Py_ssize_t i = 0;
    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 bytes (memcpy for safe unaligned access) */
        v16u8 data;
        memcpy(&data, src + i, 16);
        /* Extract high and low nibbles using vector operators */
        v16u8 hi = (data >> 4) & mask_0f;
        v16u8 lo = data & mask_0f;
        /* Compare > 9 using signed comparison for efficient codegen.
           Nibble values 0-15 are safely in signed byte range.
           This generates pcmpgtb on x86-64, avoiding the slower
           psubusb+pcmpeqb sequence from unsigned comparison.
           Each lane of *_gt9 is all-ones (nibble > 9) or all-zeros. */
        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);
        /* Convert nibbles to hex ASCII: '0'+n for 0-9, and the masked
           offset bumps 10-15 up into 'a'-'f'. */
        hi = hi + ascii_0 + (hi_gt9 & offset);
        lo = lo + ascii_0 + (lo_gt9 & offset);
        /* Interleave hi/lo nibbles using portable shufflevector.
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
           or vzip on ARM32. */
        v16u8 result0 = __builtin_shufflevector(hi, lo,
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 result1 = __builtin_shufflevector(hi, lo,
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
        /* Store 32 hex characters */
        memcpy(dst, &result0, 16);
        memcpy(dst + 16, &result1, 16);
    }
    /* Scalar fallback for remaining 0-15 bytes (dst already advanced past
       the SIMD-written output). */
    _Py_hexlify_scalar(src + i, dst, len - i);
}
#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */
/* Core implementation behind the four public _Py_strhex* entry points.

   Convert arglen bytes from argbuf into their two-character lowercase hex
   representation, optionally inserting a single separator character
   between groups of bytes.

   argbuf, arglen: raw input bytes (arglen must be >= 0).
   sep: NULL for no separator; otherwise a length-1 str or bytes object.
   bytes_per_sep_group: group size between separators; a positive value
       groups counting from the right end of the output, a negative value
       groups from the left; 0 (or sep == NULL) means no grouping.
   return_bytes: nonzero to produce a bytes object, zero for an ASCII str.

   Returns a new reference, or NULL with an exception set. */
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
                                 PyObject* sep, int bytes_per_sep_group,
                                 const int return_bytes)
{
    assert(arglen >= 0);

    /* Validate sep and extract its single character. */
    Py_UCS1 sep_char = 0;
    if (sep) {
        Py_ssize_t seplen = PyObject_Length((PyObject*)sep);
        if (seplen < 0) {
            return NULL;
        }
        if (seplen != 1) {
            PyErr_SetString(PyExc_ValueError, "sep must be length 1.");
            return NULL;
        }
        if (PyUnicode_Check(sep)) {
            /* Only a latin-1 (1-byte kind) str can hold a usable sep;
               wider kinds are necessarily non-ASCII. */
            if (PyUnicode_KIND(sep) != PyUnicode_1BYTE_KIND) {
                PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
                return NULL;
            }
            sep_char = PyUnicode_READ_CHAR(sep, 0);
        }
        else if (PyBytes_Check(sep)) {
            sep_char = PyBytes_AS_STRING(sep)[0];
        }
        else {
            PyErr_SetString(PyExc_TypeError, "sep must be str or bytes.");
            return NULL;
        }
        /* The str result is created with maxchar 127 (see PyUnicode_New
           below), so a non-ASCII separator is only valid for bytes output. */
        if (sep_char > 127 && !return_bytes) {
            PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
            return NULL;
        }
    }
    else {
        bytes_per_sep_group = 0;
    }

    unsigned int abs_bytes_per_sep = _Py_ABS_CAST(unsigned int, bytes_per_sep_group);
    Py_ssize_t resultlen = 0;
    if (bytes_per_sep_group && arglen > 0) {
        /* How many sep characters we'll be inserting. */
        resultlen = (arglen - 1) / abs_bytes_per_sep;
    }
    /* Bounds checking for our Py_ssize_t indices. */
    if (arglen >= PY_SSIZE_T_MAX / 2 - resultlen) {
        return PyErr_NoMemory();
    }
    resultlen += arglen * 2;

    /* A group at least as large as the whole input never emits a
       separator, so take the plain (and faster) no-sep path. */
    if ((size_t)abs_bytes_per_sep >= (size_t)arglen) {
        bytes_per_sep_group = 0;
        abs_bytes_per_sep = 0;
    }

    /* Allocate the result object and point retbuf at its 1-byte buffer. */
    PyObject *retval;
    Py_UCS1 *retbuf;
    if (return_bytes) {
        /* If _PyBytes_FromSize() were public we could avoid malloc+copy. */
        retval = PyBytes_FromStringAndSize(NULL, resultlen);
        if (!retval) {
            return NULL;
        }
        retbuf = (Py_UCS1 *)PyBytes_AS_STRING(retval);
    }
    else {
        /* maxchar 127: the output is pure ASCII. */
        retval = PyUnicode_New(resultlen, 127);
        if (!retval) {
            return NULL;
        }
        retbuf = PyUnicode_1BYTE_DATA(retval);
    }

    /* Hexlify */
    Py_ssize_t i, j;
    unsigned char c;
    if (bytes_per_sep_group == 0) {
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
        if (arglen >= 16) {
            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
        }
        else
#endif
        {
            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
        }
    }
    else {
        /* The number of complete chunk+sep periods */
        Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
        Py_ssize_t chunk;
        unsigned int k;
        if (bytes_per_sep_group < 0) {
            /* Negative group size: count groups from the left, so any
               short remainder group lands at the right end.  Walk the
               buffers forward. */
            i = j = 0;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i++];
                    retbuf[j++] = Py_hexdigits[c >> 4];
                    retbuf[j++] = Py_hexdigits[c & 0x0f];
                }
                retbuf[j++] = sep_char;
            }
            /* Trailing (possibly short) group without a separator. */
            while (i < arglen) {
                c = argbuf[i++];
                retbuf[j++] = Py_hexdigits[c >> 4];
                retbuf[j++] = Py_hexdigits[c & 0x0f];
            }
            assert(j == resultlen);
        }
        else {
            /* Positive group size: count groups from the right, so any
               short remainder group lands at the left end.  Walk the
               buffers backward, emitting low digit before high digit. */
            i = arglen - 1;
            j = resultlen - 1;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i--];
                    retbuf[j--] = Py_hexdigits[c & 0x0f];
                    retbuf[j--] = Py_hexdigits[c >> 4];
                }
                retbuf[j--] = sep_char;
            }
            /* Leading (possibly short) group without a separator. */
            while (i >= 0) {
                c = argbuf[i--];
                retbuf[j--] = Py_hexdigits[c & 0x0f];
                retbuf[j--] = Py_hexdigits[c >> 4];
            }
            assert(j == -1);
        }
    }

#ifdef Py_DEBUG
    if (!return_bytes) {
        assert(_PyUnicode_CheckConsistency(retval, 1));
    }
#endif
    return retval;
}
/* Hexlify argbuf into a new ASCII str object, with no separator. */
PyObject *
_Py_strhex(const char* argbuf, const Py_ssize_t arglen)
{
    return _Py_strhex_impl(argbuf, arglen,
                           /*sep=*/NULL, /*bytes_per_sep_group=*/0,
                           /*return_bytes=*/0);
}
/* Same as above but returns a bytes() instead of str() to avoid the
* need to decode the str() when bytes are needed. */
/* Same as _Py_strhex() but produces a bytes object, avoiding a decode
   round-trip when the caller wants bytes. */
PyObject*
_Py_strhex_bytes(const char* argbuf, const Py_ssize_t arglen)
{
    return _Py_strhex_impl(argbuf, arglen,
                           /*sep=*/NULL, /*bytes_per_sep_group=*/0,
                           /*return_bytes=*/1);
}
/* These variants include support for a separator between every N bytes: */
/* Hexlify into a str, inserting sep between every bytes_per_group
   input bytes (sign of bytes_per_group selects grouping direction). */
PyObject*
_Py_strhex_with_sep(const char* argbuf, const Py_ssize_t arglen,
                    PyObject* sep, const int bytes_per_group)
{
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group,
                           /*return_bytes=*/0);
}
/* Same as above but returns a bytes() instead of str() to avoid the
* need to decode the str() when bytes are needed. */
/* Same as _Py_strhex_with_sep() but produces a bytes object, avoiding a
   decode round-trip when the caller wants bytes. */
PyObject*
_Py_strhex_bytes_with_sep(const char* argbuf, const Py_ssize_t arglen,
                          PyObject* sep, const int bytes_per_group)
{
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group,
                           /*return_bytes=*/1);
}