-
Notifications
You must be signed in to change notification settings - Fork 120
Expand file tree
/
Copy pathstringzilla.c
More file actions
8034 lines (7021 loc) · 318 KB
/
stringzilla.c
File metadata and controls
8034 lines (7021 loc) · 318 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* @file stringzilla.c
* @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
* native Python strings, Apache Arrow collections, and more.
* @author Ash Vardanian
* @date July 10, 2023
* @copyright Copyright (c) 2023
*
* - Doesn't use PyBind11, NanoBind, Boost.Python, or any other high-level libs, only CPython API.
* - To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls.
* - Reimplements all of the `str` functionality in C as a `Str` type.
* - Provides a highly generic `Strs` class for handling collections of strings, Arrow-style or not.
*
* Pandas doesn't provide a C API, and even in the 2.0 the Apache Arrow representation is opt-in, not default.
* PyCapsule protocol in conjunction with @b `__arrow_c_array__` dunder methods can be used to extract strings.
* @see https://arrow.apache.org/docs/python/generated/pyarrow.array.html
*
* This module exports C functions via `PyCapsule` of `PyAPI` for use by other extensions (like `stringzillas-cpus`):
* - `sz_py_export_string_like`.
* - `sz_py_export_strings_as_sequence`.
* - `sz_py_export_strings_as_u32tape`.
* - `sz_py_export_strings_as_u64tape`.
* - `sz_py_replace_strings_allocator`.
*
* Function Naming Convention:
* - `Str_like_*`: Functions that can be called both as module-level functions AND as member methods.
* - `Str_*`: Functions that are member-only methods or have simpler calling conventions.
*/
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
#define NOMINMAX
#include <windows.h>
#else
#include <fcntl.h> // `O_RDONLY`
#include <sys/mman.h> // `mmap`
#include <sys/stat.h> // `stat`
#include <sys/types.h>
#endif
#ifdef _MSC_VER
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#else
#include <limits.h> // `SSIZE_MAX`
#include <unistd.h> // `ssize_t`
#endif
// It seems like some Python versions forget to include a header, so we should:
// https://github.com/ashvardanian/StringZilla/actions/runs/7706636733/job/21002535521
#ifndef SSIZE_MAX
#define SSIZE_MAX (SIZE_MAX / 2)
#endif
// Undefine _POSIX_C_SOURCE to avoid redefinition warning with Python headers
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#include <Python.h> // Core CPython interfaces
#include <errno.h> // `errno`
#include <stdio.h> // `fopen`
#include <stdlib.h> // `rand`, `srand`
#include <time.h> // `time`
#include <stringzilla/stringzilla.h>
/**
* @brief Arrow C Data Interface structure for an array schema.
* @see https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions
*/
// NOTE: this mirrors the ABI-stable layout mandated by the Arrow C Data Interface;
// field order, types, and count must not be changed.
struct ArrowSchema {
    char const *format;   // Arrow-defined format string describing the data type
    char const *name;     // Optional field name, may be NULL
    char const *metadata; // Optional binary key-value metadata, may be NULL
    int64_t flags;        // Bitfield of Arrow schema flags
    int64_t n_children;   // Number of entries in `children`
    struct ArrowSchema **children;         // Child schemas, `n_children` long
    struct ArrowSchema *dictionary;        // Value type if dictionary-encoded, else NULL
    void (*release)(struct ArrowSchema *); // Producer-provided destructor; NULL once released
    void *private_data;                    // Opaque producer state used by `release`
};
/**
* @brief Arrow C Data Interface structure for an array content.
* @see https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions
*/
// NOTE: this mirrors the ABI-stable layout mandated by the Arrow C Data Interface;
// field order, types, and count must not be changed.
struct ArrowArray {
    int64_t length;     // Logical number of elements
    int64_t null_count; // Number of null elements, or -1 if not computed
    int64_t offset;     // Logical offset into the buffers (for sliced arrays)
    int64_t n_buffers;  // Number of entries in `buffers`
    int64_t n_children; // Number of entries in `children`
    void const **buffers;                 // Validity/offsets/data buffers, layout-dependent
    struct ArrowArray **children;         // Child arrays, `n_children` long
    struct ArrowArray *dictionary;        // Dictionary values if dictionary-encoded, else NULL
    void (*release)(struct ArrowArray *); // Producer-provided destructor; NULL once released
    void *private_data;                   // Opaque producer state used by `release`
};
/**
 *  @brief  Table of function pointers exported through a `PyCapsule`, so sibling extensions
 *          (like `stringzillas-cpus`) can call into this module without C-level linking.
 */
typedef struct PyAPI {
    sz_bool_t (*sz_py_export_string_like)(PyObject *, sz_cptr_t *, sz_size_t *);
    sz_bool_t (*sz_py_export_strings_as_sequence)(PyObject *, sz_sequence_t *);
    sz_bool_t (*sz_py_export_strings_as_u32tape)(PyObject *, sz_cptr_t *, sz_u32_t const **, sz_size_t *);
    sz_bool_t (*sz_py_export_strings_as_u64tape)(PyObject *, sz_cptr_t *, sz_u64_t const **, sz_size_t *);
    sz_bool_t (*sz_py_replace_strings_allocator)(PyObject *, sz_memory_allocator_t *);
} PyAPI;
#pragma region Forward Declarations
static PyTypeObject FileType;
static PyTypeObject StrType;
static PyTypeObject StrsType;
static PyTypeObject SplitIteratorType;
static PyTypeObject Utf8SplitLinesIteratorType;
static PyTypeObject Utf8SplitWhitespaceIteratorType;
static PyTypeObject Utf8WordBoundaryIteratorType;
static PyTypeObject Utf8CaseInsensitiveFindIteratorType;
static PyTypeObject HasherType;
static PyTypeObject Sha256Type;
static sz_string_view_t temporary_memory = {NULL, 0};
/**
* @brief Describes an on-disk file mapped into RAM, which is different from Python's
* native `mmap` module, as it exposes the address of the mapping in memory.
*/
typedef struct {
    PyObject ob_base; // Must be first: CPython object header
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
    HANDLE file_handle;    // Underlying Win32 file handle
    HANDLE mapping_handle; // Win32 file-mapping object backing `memory`
#else
    int file_descriptor; // POSIX descriptor of the opened file
#endif
    sz_string_view_t memory; // Address and length of the mapped region
} File;
/**
* @brief Type-punned StringZilla-string, that points to a slice of an existing Python `str`
* or a `File`.
*
* When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime.
 * Its usage in Python would look like:
*
* - Str() # Empty string
* - Str("some-string") # Full-range slice of a Python `str`
* - Str(File("some-path.txt")) # Full-range view of a persisted file
* - Str(File("some-path.txt"), from=0, to=sys.maxsize)
*/
typedef struct {
    PyObject ob_base;        // Must be first: CPython object header
    PyObject *parent;        // Owning object (`str`, `File`, ...) whose refcount preserves `memory`'s lifetime
    sz_string_view_t memory; // Borrowed view into the parent's buffer
} Str;
/**
* @brief String-splitting separator.
*
 * Allows lazy evaluation of the `split` and `rsplit`, and can be used to create a `Strs` object,
 * which might be more memory-friendly than greedily invoking `str.split`.
*/
typedef struct {
    PyObject ob_base;           // Must be first: CPython object header
    PyObject *text_obj;         //< For reference counting
    PyObject *separator_obj;    //< For reference counting
    sz_string_view_t text;      //< View over the text being split
    sz_string_view_t separator; //< View over the separator (needle or character set)
    sz_find_t finder;           //< Search kernel used to locate the next match
    /// @brief How many bytes to skip after each successful find.
    /// Generally equal to `needle_length`, or 1 for character sets.
    sz_size_t match_length;
    /// @brief Should we include the separator in the resulting slices?
    sz_bool_t include_match;
    /// @brief Should we enumerate the slices in normal or reverse order?
    sz_bool_t is_reverse;
    /// @brief Upper limit for the number of splits to report. Monotonically decreases during iteration.
    sz_size_t max_parts;
    /// @brief Indicates that we've already reported the tail of the split, and should return NULL next.
    sz_bool_t reached_tail;
} SplitIterator;
/**
* @brief Iterator for splitting a UTF-8 string by Unicode newline characters.
*
* Uses sz_utf8_find_newline to find newlines, supporting all 7 Unicode newline
* characters plus CRLF sequences.
*
* Termination: when start > end (not start == end, which is a valid state yielding empty segment).
*/
typedef struct {
    PyObject ob_base;       // Must be first: CPython object header
    PyObject *text_obj;     //< For reference counting
    sz_cptr_t start;        //< Current position (start of current segment)
    sz_cptr_t end;          //< End of original text (immutable)
    sz_size_t match_length; //< Length of current segment to yield
    /// @brief Should we include the newline characters in the resulting slices?
    sz_bool_t keepends;
    /// @brief Should we skip empty segments (trailing, leading, consecutive)?
    sz_bool_t skip_empty;
} Utf8SplitLinesIterator;
/**
* @brief Iterator for splitting a UTF-8 string by Unicode whitespace characters.
*
* Uses sz_utf8_find_whitespace to find whitespace, supporting all 25 Unicode
* White_Space characters. N whitespace delimiters yield N+1 segments (including empties).
*/
typedef struct {
    PyObject ob_base;       // Must be first: CPython object header
    PyObject *text_obj;     //< For reference counting
    sz_cptr_t start;        //< Current position in text
    sz_cptr_t end;          //< End of text (immutable)
    sz_size_t match_length; //< Length of current segment to yield
    /// @brief Should we skip empty segments (trailing, leading, consecutive)?
    sz_bool_t skip_empty;
} Utf8SplitWhitespaceIterator;
/**
* @brief Iterator for finding word boundaries in UTF-8 text per Unicode TR29.
*
* Uses sz_utf8_word_find_boundary to find boundaries, supporting all TR29 rules.
* Yields words (text segments between consecutive word boundaries).
*/
typedef struct {
    PyObject ob_base;      // Must be first: CPython object header
    PyObject *text_obj;    //< For reference counting
    sz_cptr_t start;       //< Start of current word
    sz_cptr_t end;         //< End of original text (immutable)
    sz_cptr_t text_start;  //< Start of original text (for reverse iteration)
    /// @brief Should we skip empty segments (consecutive boundaries)?
    sz_bool_t skip_empty;
} Utf8WordBoundaryIterator;
/**
* @brief Iterator that yields all case-insensitive matches of a needle in a haystack.
* Uses `sz_utf8_case_insensitive_find` for Unicode-aware case folding.
*/
typedef struct {
    PyObject ob_base;         // Must be first: CPython object header
    PyObject *haystack_obj;   //< Reference for garbage collection
    PyObject *needle_obj;     //< Reference for garbage collection (needle bytes must remain valid)
    sz_cptr_t current;        //< Current search position in haystack
    sz_cptr_t haystack_end;   //< End boundary of haystack
    sz_string_view_t needle;  //< Needle view (bytes and length)
    /// @brief Reusable metadata for repeated searches with the same needle.
    sz_utf8_case_insensitive_needle_metadata_t metadata;
    /// @brief Whether to allow overlapping matches.
    sz_bool_t include_overlapping;
} Utf8CaseInsensitiveFindIterator;
/**
* @brief Variable length Python object similar to `Tuple[Union[Str, str]]`,
* for faster sorting, shuffling, joins, and lookups.
*/
typedef struct {
    PyObject ob_base; // Must be first: CPython object header
    /// @brief Discriminator for the `data` union below; "views" own nothing,
    ///        "tapes" own both payload and offsets, "fragmented" owns only the spans array.
    enum {
        STRS_U32_TAPE_VIEW = 0,
        STRS_U64_TAPE_VIEW = 1,
        STRS_U32_TAPE = 2,
        STRS_U64_TAPE = 3,
        STRS_FRAGMENTED = 4,
    } layout;
    union {
        /**
         *  U32 tape view - references existing Arrow array data, owns nothing.
         *  The layout is identical to Apache Arrow format: N+1 offsets for N strings.
         *  https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout
         */
        struct u32_tape_view_t {
            sz_size_t count;
            sz_cptr_t data;    // Points to existing data (not owned)
            sz_u32_t *offsets; // Points to existing offsets (not owned)
            PyObject *parent;  // Parent Arrow array or other object
        } u32_tape_view;
        /**
         *  U32 tape - owns both offsets and data with custom allocator.
         */
        struct u32_tape_t {
            sz_size_t count;
            sz_cptr_t data;    // Owned data
            sz_u32_t *offsets; // Owned offsets (N+1 for N strings)
            sz_memory_allocator_t allocator;
        } u32_tape;
        /**
         *  U64 tape view - references existing Arrow array data, owns nothing.
         *  The layout is identical to Apache Arrow format: N+1 offsets for N strings.
         *  https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout
         */
        struct u64_tape_view_t {
            sz_size_t count;
            sz_cptr_t data;    // Points to existing data (not owned)
            sz_u64_t *offsets; // Points to existing offsets (not owned)
            PyObject *parent;  // Parent Arrow array or other object
        } u64_tape_view;
        /**
         *  U64 tape - owns both offsets and data with custom allocator.
         */
        struct u64_tape_t {
            sz_size_t count;
            sz_cptr_t data;    // Owned data
            sz_u64_t *offsets; // Owned offsets (N+1 for N strings)
            sz_memory_allocator_t allocator;
        } u64_tape;
        /**
         *  Reordered subviews - owns only the array of individual spans.
         *  Each span points to data in the parent object.
         */
        struct fragmented_t {
            sz_size_t count;
            sz_string_view_t *spans; // Owned array of spans
            PyObject *parent;        // Parent object (Str, Strs, or other)
            sz_memory_allocator_t allocator;
        } fragmented;
    } data;
} Strs;
#pragma endregion
#pragma region Helpers
/**
 *  @brief  Grows (never shrinks) a shared scratch buffer used for temporary allocations.
 *  @param  size     Minimum number of bytes the caller needs.
 *  @param  existing In/out view over the current scratch buffer; updated on growth.
 *  @return Pointer to a buffer of at least `size` bytes, or NULL with a Python `MemoryError` set.
 */
static sz_ptr_t temporary_memory_allocate(sz_size_t size, sz_string_view_t *existing) {
    if (existing->length < size) {
        // The view's `start` is const-qualified, but this buffer is always heap-owned here,
        // so casting away constness for `realloc` is safe; on failure the old block stays valid.
        sz_ptr_t new_start = (sz_ptr_t)realloc((void *)(sz_ptr_t)existing->start, size);
        if (!new_start) {
            PyErr_Format(PyExc_MemoryError, "Unable to allocate temporary memory");
            return NULL;
        }
        existing->start = new_start;
        existing->length = size;
    }
    return (sz_ptr_t)existing->start;
}
/**
 *  @brief  No-op counterpart to `temporary_memory_allocate`: the scratch buffer is reused
 *          across calls instead of being released per-allocation.
 */
static void temporary_memory_free(sz_ptr_t start, sz_size_t size, sz_string_view_t *existing) {
    (void)start;    // Intentionally unused
    (void)size;     // Intentionally unused
    (void)existing; // Intentionally unused
}
/// @brief  `sz_sequence_t`-style accessor: start pointer of the `i`-th string in a `Strs`.
static sz_cptr_t Strs_get_start_(void const *handle, sz_size_t i) {
    Strs *container = (Strs *)handle;
    if (container->layout == STRS_U32_TAPE) //
        return container->data.u32_tape.data + container->data.u32_tape.offsets[i];
    if (container->layout == STRS_U32_TAPE_VIEW)
        return container->data.u32_tape_view.data + container->data.u32_tape_view.offsets[i];
    if (container->layout == STRS_U64_TAPE) //
        return container->data.u64_tape.data + container->data.u64_tape.offsets[i];
    if (container->layout == STRS_U64_TAPE_VIEW)
        return container->data.u64_tape_view.data + container->data.u64_tape_view.offsets[i];
    if (container->layout == STRS_FRAGMENTED) //
        return container->data.fragmented.spans[i].start;
    return NULL;
}
/// @brief  `sz_sequence_t`-style accessor: byte length of the `i`-th string in a `Strs`.
static sz_size_t Strs_get_length_(void const *handle, sz_size_t i) {
    Strs *container = (Strs *)handle;
    if (container->layout == STRS_U32_TAPE)
        return container->data.u32_tape.offsets[i + 1] - container->data.u32_tape.offsets[i];
    if (container->layout == STRS_U32_TAPE_VIEW)
        return container->data.u32_tape_view.offsets[i + 1] - container->data.u32_tape_view.offsets[i];
    if (container->layout == STRS_U64_TAPE)
        return container->data.u64_tape.offsets[i + 1] - container->data.u64_tape.offsets[i];
    if (container->layout == STRS_U64_TAPE_VIEW)
        return container->data.u64_tape_view.offsets[i + 1] - container->data.u64_tape_view.offsets[i];
    if (container->layout == STRS_FRAGMENTED) //
        return container->data.fragmented.spans[i].length;
    return 0;
}
/// @brief  Reverses `array` of `length` sorted-index entries in-place.
void reverse_offsets(sz_sorted_idx_t *array, sz_size_t length) {
    // Guard: for `length == 0` the unsigned `length - 1` would wrap to `SIZE_MAX`,
    // making the two-pointer loop read far out of bounds.
    if (length < 2) return;
    for (sz_size_t i = 0, j = length - 1; i < j; i++, j--) {
        sz_sorted_idx_t temp = array[i];
        array[i] = array[j];
        array[j] = temp;
    }
}
/// @brief  Reverses `array` of `length` string views in-place.
void reverse_haystacks(sz_string_view_t *array, sz_size_t length) {
    // Guard: for `length == 0` the unsigned `length - 1` would wrap to `SIZE_MAX`,
    // making the two-pointer loop read far out of bounds.
    if (length < 2) return;
    for (sz_size_t i = 0, j = length - 1; i < j; i++, j--) {
        sz_string_view_t temp = array[i];
        array[i] = array[j];
        array[j] = temp;
    }
}
void permute(sz_string_view_t *array, sz_sorted_idx_t *order, sz_size_t length) {
for (sz_size_t i = 0; i < length; ++i) {
if (i == order[i]) continue;
sz_string_view_t temp = array[i];
sz_size_t k = i, j;
while (i != (j = (sz_size_t)order[k])) {
array[k] = array[j];
order[k] = k;
k = j;
}
array[k] = temp;
order[k] = k;
}
}
/**
 *  @brief  Checks whether a Python object is a writable buffer.
 *  @return sz_true_k when the object may be written to; sz_false_k otherwise,
 *          with a `TypeError` set describing why the object is immutable.
 */
SZ_INTERNAL sz_bool_t sz_py_is_mutable(PyObject *object) {
    // `str` and `bytes` are always immutable in CPython.
    if (PyUnicode_Check(object)) {
        PyErr_SetString(PyExc_TypeError, "str objects are immutable (use bytearray instead)");
        return sz_false_k;
    }
    if (PyBytes_Check(object)) {
        PyErr_SetString(PyExc_TypeError, "bytes objects are immutable (use bytearray instead)");
        return sz_false_k;
    }
    // A `memoryview` carries an explicit read-only flag.
    if (PyMemoryView_Check(object)) {
        Py_buffer *buffer = PyMemoryView_GET_BUFFER(object);
        if (buffer->readonly) {
            PyErr_SetString(PyExc_TypeError, "memoryview is read-only");
            return sz_false_k;
        }
        return sz_true_k;
    }
    // Everything else is optimistically considered mutable
    return sz_true_k;
}
/**
 *  @brief  Helper function to export a Python string-like object into a start/length pair.
 *          Accepts `str`, `bytes`, `bytearray`, `Str`, `File`, and contiguous byte-sized `memoryview`s.
 *  @return 1 on success; 0 with a Python exception set on failure.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_string_like(PyObject *object, sz_cptr_t *start, sz_size_t *length) {
    if (PyUnicode_Check(object)) {
        // Handle Python `str` object
        Py_ssize_t signed_length;
        *start = PyUnicode_AsUTF8AndSize(object, &signed_length);
        // `PyUnicode_AsUTF8AndSize` returns NULL (with an exception set) when the string
        // can't be encoded as UTF-8, e.g. for lone surrogates - don't report success then.
        if (!*start) return 0;
        *length = (sz_size_t)signed_length;
        return 1;
    }
    else if (PyBytes_Check(object)) {
        // Handle Python `bytes` object
        // https://docs.python.org/3/c-api/bytes.html
        Py_ssize_t signed_length;
        if (PyBytes_AsStringAndSize(object, (sz_ptr_t *)start, &signed_length) == -1) {
            PyErr_SetString(PyExc_ValueError, "Couldn't access `bytes` buffer internals");
            return 0;
        }
        *length = (sz_size_t)signed_length;
        return 1;
    }
    else if (PyByteArray_Check(object)) {
        // Handle Python mutable `bytearray` object
        // https://docs.python.org/3/c-api/bytearray.html
        *start = PyByteArray_AS_STRING(object);
        *length = PyByteArray_GET_SIZE(object);
        return 1;
    }
    else if (PyObject_TypeCheck(object, &StrType)) {
        Str *str = (Str *)object;
        *start = str->memory.start;
        *length = str->memory.length;
        return 1;
    }
    else if (PyObject_TypeCheck(object, &FileType)) {
        File *file = (File *)object;
        *start = file->memory.start;
        *length = file->memory.length;
        return 1;
    }
    else if (PyMemoryView_Check(object)) {
        // Handle Python `memoryview` object
        // https://docs.python.org/3/c-api/memoryview.html
        // https://docs.python.org/3/c-api/buffer.html#c.Py_buffer
        Py_buffer *view = PyMemoryView_GET_BUFFER(object);
        // Make sure we are dealing with single-byte integral representations
        if (view->itemsize != 1) {
            PyErr_SetString(PyExc_ValueError, "Only single-byte integral types are supported");
            return 0;
        }
        // Let's make sure the data is contiguous.
        // This can be a bit trickier for high-dimensional arrays, but CPython has a built-in function for that.
        // The flag 'C' stands for C-style-contiguous, which means that the last dimension is contiguous.
        // The flag 'F' stands for Fortran-style-contiguous, which means that the first dimension is contiguous.
        // The flag 'A' stands for any-contiguous, which only means there are no gaps between elements.
        // For byte-level processing that's all we need.
        if (!PyBuffer_IsContiguous(view, 'A')) {
            PyErr_SetString(PyExc_ValueError, "The array must be contiguous");
            return 0;
        }
        *start = (sz_cptr_t)view->buf;
        *length = (sz_size_t)view->len;
        return 1;
    }
    else {
        PyErr_SetString(PyExc_TypeError, "Unsupported argument layout");
        return 0;
    }
}
/**
 *  @brief  `sz_sequence_t` callback: start pointer of the `index`-th span of a fragmented `Strs`.
 *  @return NULL with an `IndexError` set when `index` is out of bounds.
 */
sz_cptr_t sz_py_strs_sequence_member_start_if_fragmented(void const *sequence_punned, sz_size_t index) {
    Strs *strs = (Strs *)sequence_punned;
    sz_assert_(strs->layout == STRS_FRAGMENTED && "Expected a reordered Strs layout");
    // `sz_size_t` is unsigned, so only the upper bound needs checking.
    if (index >= strs->data.fragmented.count) {
        PyErr_SetString(PyExc_IndexError, "Index out of bounds");
        return NULL;
    }
    return strs->data.fragmented.spans[index].start;
}
/**
 *  @brief  `sz_sequence_t` callback: byte length of the `index`-th span of a fragmented `Strs`.
 *  @return 0 with an `IndexError` set when `index` is out of bounds.
 */
sz_size_t sz_py_strs_sequence_member_length_if_fragmented(void const *sequence_punned, sz_size_t index) {
    Strs *strs = (Strs *)sequence_punned;
    sz_assert_(strs->layout == STRS_FRAGMENTED && "Expected a reordered Strs layout");
    // `sz_size_t` is unsigned, so only the upper bound needs checking.
    if (index >= strs->data.fragmented.count) {
        PyErr_SetString(PyExc_IndexError, "Index out of bounds");
        return 0;
    }
    return strs->data.fragmented.spans[index].length;
}
/**
 *  @brief  Helper function to export a `Strs` or similar sequence objects into a `sz_sequence_t`.
 *          Only the fragmented layout is exportable this way; tape layouts should be exported
 *          through `sz_py_export_strings_as_u32tape` / `sz_py_export_strings_as_u64tape`.
 *  @return sz_true_k on success; sz_false_k for NULL outputs, non-`Strs` inputs, or tape layouts.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_strings_as_sequence(PyObject *object, sz_sequence_t *sequence) {
    if (!sequence) return sz_false_k;
    if (PyObject_TypeCheck(object, &StrsType)) {
        Strs *strs = (Strs *)object;
        // The assert documents the expectation in debug builds, but it compiles out in
        // release builds - so also fail gracefully at runtime instead of exporting
        // callbacks that would misread the union.
        if (strs->layout != STRS_FRAGMENTED) return sz_false_k;
        sz_assert_(strs->layout == STRS_FRAGMENTED && "View as tapes!");
        sequence->handle = strs;
        sequence->count = strs->data.fragmented.count;
        sequence->get_start = sz_py_strs_sequence_member_start_if_fragmented;
        sequence->get_length = sz_py_strs_sequence_member_length_if_fragmented;
        return sz_true_k;
    }
    return sz_false_k;
}
/**
 *  @brief  Helper function to export a `Strs` object into `sz_sequence_u32tape_t` components.
 *  @return sz_true_k for 32-bit tape or tape-view layouts; sz_false_k otherwise.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_strings_as_u32tape(PyObject *object, sz_cptr_t *data, sz_u32_t const **offsets,
                                                     sz_size_t *count) {
    if (!data || !offsets || !count) return sz_false_k;
    if (!PyObject_TypeCheck(object, &StrsType)) return sz_false_k;
    Strs *strs = (Strs *)object;
    switch (strs->layout) {
    case STRS_U32_TAPE:
        *data = strs->data.u32_tape.data;
        *offsets = strs->data.u32_tape.offsets;
        *count = strs->data.u32_tape.count;
        return sz_true_k;
    case STRS_U32_TAPE_VIEW:
        *data = strs->data.u32_tape_view.data;
        *offsets = strs->data.u32_tape_view.offsets;
        *count = strs->data.u32_tape_view.count;
        return sz_true_k;
    default: return sz_false_k;
    }
}
/**
 *  @brief  Helper function to export a `Strs` object into `sz_sequence_u64tape_t` components.
 *  @return sz_true_k for 64-bit tape or tape-view layouts; sz_false_k otherwise.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_strings_as_u64tape(PyObject *object, sz_cptr_t *data, sz_u64_t const **offsets,
                                                     sz_size_t *count) {
    if (!data || !offsets || !count) return sz_false_k;
    if (!PyObject_TypeCheck(object, &StrsType)) return sz_false_k;
    Strs *strs = (Strs *)object;
    switch (strs->layout) {
    case STRS_U64_TAPE:
        *data = strs->data.u64_tape.data;
        *offsets = strs->data.u64_tape.offsets;
        *count = strs->data.u64_tape.count;
        return sz_true_k;
    case STRS_U64_TAPE_VIEW:
        *data = strs->data.u64_tape_view.data;
        *offsets = strs->data.u64_tape_view.offsets;
        *count = strs->data.u64_tape_view.count;
        return sz_true_k;
    default: return sz_false_k;
    }
}
/**
 *  @brief  Re-homes an owning `STRS_U32_TAPE` onto a different allocator: copies payload and
 *          offsets with `allocator`, then releases the old buffers via `old_allocator`.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the tape is left unchanged).
 */
static sz_bool_t sz_py_replace_u32_tape_allocator(Strs *strs, sz_memory_allocator_t *old_allocator,
                                                  sz_memory_allocator_t *allocator) {
    struct u32_tape_t *data = &strs->data.u32_tape;
    sz_assert_(data->offsets && "Expected offsets to be allocated");
    sz_size_t const string_data_size = (sz_size_t)data->offsets[data->count];
    sz_size_t const offsets_size = (data->count + 1) * sizeof(sz_u32_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator;
    // skip the `memcpy` entirely for empty tapes, as `memcpy(NULL, ..., 0)` is undefined behavior.
    sz_ptr_t new_string_data = NULL;
    if (string_data_size) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, data->data, string_data_size);
    }
    // Copy the offsets array
    sz_u32_t *new_offsets = (sz_u32_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    memcpy(new_offsets, data->offsets, offsets_size);
    // Free old memory with old allocator (tapes always own their data)
    if (data->data) old_allocator->free(data->data, string_data_size, old_allocator->handle);
    old_allocator->free(data->offsets, offsets_size, old_allocator->handle);
    // Update pointers and allocator
    data->data = new_string_data;
    data->offsets = new_offsets;
    data->allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Re-homes an owning `STRS_U64_TAPE` onto a different allocator: copies payload and
 *          offsets with `allocator`, then releases the old buffers via `old_allocator`.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the tape is left unchanged).
 */
static sz_bool_t sz_py_replace_u64_tape_allocator(Strs *strs, sz_memory_allocator_t *old_allocator,
                                                  sz_memory_allocator_t *allocator) {
    struct u64_tape_t *data = &strs->data.u64_tape;
    sz_assert_(data->offsets && "Expected offsets to be allocated");
    sz_size_t const string_data_size = (sz_size_t)data->offsets[data->count];
    sz_size_t const offsets_size = (data->count + 1) * sizeof(sz_u64_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator;
    // skip the `memcpy` entirely for empty tapes, as `memcpy(NULL, ..., 0)` is undefined behavior.
    sz_ptr_t new_string_data = NULL;
    if (string_data_size) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, data->data, string_data_size);
    }
    // Copy the offsets array
    sz_u64_t *new_offsets = (sz_u64_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    memcpy(new_offsets, data->offsets, offsets_size);
    // Free old memory with old allocator (tapes always own their data)
    if (data->data) old_allocator->free(data->data, string_data_size, old_allocator->handle);
    old_allocator->free(data->offsets, offsets_size, old_allocator->handle);
    // Update pointers and allocator
    data->data = new_string_data;
    data->offsets = new_offsets;
    data->allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Converts a `STRS_U32_TAPE_VIEW` into an owning `STRS_U32_TAPE`, copying the
 *          (possibly sliced) payload and re-based offsets with the provided allocator.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the view is left unchanged).
 */
static sz_bool_t sz_py_replace_u32_tape_view_allocator(Strs *strs, sz_memory_allocator_t *allocator) {
    struct u32_tape_view_t *view = &strs->data.u32_tape_view;
    // Cache the count up-front: `view` aliases the `u32_tape` union member overwritten below.
    sz_size_t const count = view->count;
    sz_u32_t const slice_start_offset = view->offsets[0];
    sz_size_t const string_data_size = (sz_size_t)(view->offsets[count] - slice_start_offset);
    sz_size_t const offsets_size = (count + 1) * sizeof(sz_u32_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator
    sz_ptr_t new_string_data = NULL;
    if (string_data_size > 0) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, view->data + slice_start_offset, string_data_size);
    }
    // Copy the offsets, re-basing them so the first offset becomes zero
    sz_u32_t *new_offsets = (sz_u32_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size > 0) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    for (sz_size_t i = 0; i <= count; ++i) new_offsets[i] = view->offsets[i] - slice_start_offset;
    // Release the parent reference only after all fallible work is done
    Py_XDECREF(view->parent);
    // Commit the owning tape layout
    strs->layout = STRS_U32_TAPE;
    strs->data.u32_tape.count = count;
    strs->data.u32_tape.data = new_string_data;
    strs->data.u32_tape.offsets = new_offsets;
    strs->data.u32_tape.allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Converts a `STRS_U64_TAPE_VIEW` into an owning `STRS_U64_TAPE`, copying the
 *          (possibly sliced) payload and re-based offsets with the provided allocator.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the view is left unchanged).
 */
static sz_bool_t sz_py_replace_u64_tape_view_allocator(Strs *strs, sz_memory_allocator_t *allocator) {
    struct u64_tape_view_t *view = &strs->data.u64_tape_view;
    // Cache the count up-front: `view` aliases the `u64_tape` union member overwritten below.
    sz_size_t const count = view->count;
    sz_u64_t const slice_start_offset = view->offsets[0];
    sz_size_t const string_data_size = (sz_size_t)(view->offsets[count] - slice_start_offset);
    sz_size_t const offsets_size = (count + 1) * sizeof(sz_u64_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator
    sz_ptr_t new_string_data = NULL;
    if (string_data_size > 0) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, view->data + slice_start_offset, string_data_size);
    }
    // Copy the offsets, re-basing them so the first offset becomes zero
    sz_u64_t *new_offsets = (sz_u64_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size > 0) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    for (sz_size_t i = 0; i <= count; ++i) new_offsets[i] = view->offsets[i] - slice_start_offset;
    // Release the parent reference only after all fallible work is done
    Py_XDECREF(view->parent);
    // Commit the owning tape layout
    strs->layout = STRS_U64_TAPE;
    strs->data.u64_tape.count = count;
    strs->data.u64_tape.data = new_string_data;
    strs->data.u64_tape.offsets = new_offsets;
    strs->data.u64_tape.allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Consolidates a `STRS_FRAGMENTED` layout into a contiguous tape owned by `allocator`,
 *          choosing 32-bit or 64-bit offsets based on the total payload size.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the object is left unchanged).
 */
static sz_bool_t sz_py_replace_fragmented_allocator(Strs *strs, sz_memory_allocator_t *old_allocator,
                                                    sz_memory_allocator_t *allocator) {
    struct fragmented_t *fragmented = &strs->data.fragmented;
    sz_assert_(fragmented->spans && "Expected spans to be allocated");
    // Cache the count: `fragmented` aliases the tape union members written below.
    sz_size_t const count = fragmented->count;
    // Calculate total size needed for the consolidated tape
    sz_size_t total_bytes = 0;
    for (sz_size_t i = 0; i < count; i++) total_bytes += fragmented->spans[i].length;
    // Choose 32-bit or 64-bit tape based on size
    sz_bool_t const use_64bit = total_bytes >= UINT32_MAX;
    // Handle the all-empty-strings case separately: there is no payload, but we still
    // need N+1 zero offsets - a NULL `offsets` array with a non-zero `count` would be
    // dereferenced by accessors like `Strs_get_start_`.
    if (total_bytes == 0) {
        sz_size_t const offsets_size = (count + 1) * sizeof(sz_u32_t);
        sz_u32_t *empty_offsets = (sz_u32_t *)allocator->allocate(offsets_size, allocator->handle);
        if (!empty_offsets) return sz_false_k;
        memset(empty_offsets, 0, offsets_size);
        old_allocator->free(fragmented->spans, count * sizeof(sz_string_view_t), old_allocator->handle);
        Py_XDECREF(fragmented->parent);
        strs->layout = STRS_U32_TAPE;
        strs->data.u32_tape.count = count;
        strs->data.u32_tape.data = NULL;
        strs->data.u32_tape.offsets = empty_offsets;
        strs->data.u32_tape.allocator = *allocator;
        return sz_true_k;
    }
    // Allocate consolidated data buffer
    sz_ptr_t new_data = (sz_ptr_t)allocator->allocate(total_bytes, allocator->handle);
    if (!new_data) return sz_false_k;
    if (use_64bit) {
        sz_u64_t *new_offsets = (sz_u64_t *)allocator->allocate((count + 1) * sizeof(sz_u64_t), allocator->handle);
        if (!new_offsets) {
            allocator->free(new_data, total_bytes, allocator->handle);
            return sz_false_k;
        }
        // Copy fragments back-to-back, recording running offsets
        sz_size_t current_offset = 0;
        new_offsets[0] = 0;
        for (sz_size_t i = 0; i < count; i++) {
            sz_size_t len = fragmented->spans[i].length;
            if (len > 0) { memcpy(new_data + current_offset, fragmented->spans[i].start, len); }
            current_offset += len;
            new_offsets[i + 1] = current_offset;
        }
        // Free old fragmented data and commit the 64-bit tape layout
        old_allocator->free(fragmented->spans, count * sizeof(sz_string_view_t), old_allocator->handle);
        Py_XDECREF(fragmented->parent);
        strs->layout = STRS_U64_TAPE;
        strs->data.u64_tape.count = count;
        strs->data.u64_tape.data = new_data;
        strs->data.u64_tape.offsets = new_offsets;
        strs->data.u64_tape.allocator = *allocator;
    }
    else {
        sz_u32_t *new_offsets = (sz_u32_t *)allocator->allocate((count + 1) * sizeof(sz_u32_t), allocator->handle);
        if (!new_offsets) {
            allocator->free(new_data, total_bytes, allocator->handle);
            return sz_false_k;
        }
        // Copy fragments back-to-back, recording running offsets
        sz_size_t current_offset = 0;
        new_offsets[0] = 0;
        for (sz_size_t i = 0; i < count; i++) {
            sz_size_t len = fragmented->spans[i].length;
            if (len > 0) { memcpy(new_data + current_offset, fragmented->spans[i].start, len); }
            current_offset += len;
            // `!use_64bit` guarantees `total_bytes < UINT32_MAX`, so partial sums can't overflow this cast
            new_offsets[i + 1] = (sz_u32_t)current_offset;
        }
        // Free old fragmented data and commit the 32-bit tape layout
        old_allocator->free(fragmented->spans, count * sizeof(sz_string_view_t), old_allocator->handle);
        Py_XDECREF(fragmented->parent);
        strs->layout = STRS_U32_TAPE;
        strs->data.u32_tape.count = count;
        strs->data.u32_tape.data = new_data;
        strs->data.u32_tape.offsets = new_offsets;
        strs->data.u32_tape.allocator = *allocator;
    }
    return sz_true_k;
}
/**
* @brief Helper function to replace the memory allocator in a `Strs` object.
* This reallocates existing string data using the new allocator.
*
 * This may change the `Strs` layout:
* - `STRS_U32_TAPE_VIEW` becomes `STRS_U32_TAPE`.
* - `STRS_U64_TAPE_VIEW` becomes `STRS_U64_TAPE`.
* - `STRS_U32_TAPE` remains, if the allocator is different.
* - `STRS_U64_TAPE` remains, if the allocator is different.
* - `STRS_FRAGMENTED` becomes a `STRS_U32_TAPE` or `STRS_U64_TAPE` depending on the content size.
*/
SZ_DYNAMIC sz_bool_t sz_py_replace_strings_allocator(PyObject *object, sz_memory_allocator_t *allocator) {
    // Validate the arguments before touching any state
    if (!object || !allocator) return sz_false_k;
    if (!PyObject_TypeCheck(object, &StrsType)) return sz_false_k;
    Strs *strs = (Strs *)object;

    // Step 1: resolve the allocator currently backing this object's memory
    sz_memory_allocator_t current_allocator;
    switch (strs->layout) {
    case STRS_U32_TAPE: current_allocator = strs->data.u32_tape.allocator; break;
    case STRS_U64_TAPE: current_allocator = strs->data.u64_tape.allocator; break;
    case STRS_FRAGMENTED: current_allocator = strs->data.fragmented.allocator; break;
    case STRS_U32_TAPE_VIEW:
    case STRS_U64_TAPE_VIEW: {
        // Views don't own memory — climb the parent chain to the owning layout
        Strs *owner = strs;
        while (owner && (owner->layout == STRS_U32_TAPE_VIEW || owner->layout == STRS_U64_TAPE_VIEW)) {
            PyObject *parent = owner->layout == STRS_U32_TAPE_VIEW //
                                   ? owner->data.u32_tape_view.parent
                                   : owner->data.u64_tape_view.parent;
            if (!parent || !PyObject_TypeCheck(parent, &StrsType)) break;
            owner = (Strs *)parent;
        }
        // Pull the allocator out of whichever owning layout the chain ended on
        if (owner && owner->layout == STRS_U32_TAPE) { current_allocator = owner->data.u32_tape.allocator; }
        else if (owner && owner->layout == STRS_U64_TAPE) { current_allocator = owner->data.u64_tape.allocator; }
        else if (owner && owner->layout == STRS_FRAGMENTED) { current_allocator = owner->data.fragmented.allocator; }
        else { sz_memory_allocator_init_default(&current_allocator); } // Final fallback
    } break;
    default: sz_memory_allocator_init_default(&current_allocator); break;
    }

    // Step 2: identical allocators mean there is nothing to move
    if (sz_memory_allocator_equal(&current_allocator, allocator)) return sz_true_k;

    // Step 3: delegate the actual migration to the layout-specific helper
    switch (strs->layout) {
    case STRS_U32_TAPE: return sz_py_replace_u32_tape_allocator(strs, &current_allocator, allocator);
    case STRS_U64_TAPE: return sz_py_replace_u64_tape_allocator(strs, &current_allocator, allocator);
    case STRS_U32_TAPE_VIEW: return sz_py_replace_u32_tape_view_allocator(strs, allocator);
    case STRS_U64_TAPE_VIEW: return sz_py_replace_u64_tape_view_allocator(strs, allocator);
    case STRS_FRAGMENTED: return sz_py_replace_fragmented_allocator(strs, &current_allocator, allocator);
    }
    return sz_false_k; // Should never reach here
}
/**
* @brief Helper function to wrap the current exception with a custom prefix message.
 * An example is augmenting the argument parsing error with the name of the variable
* that didn't pass the validation.
*/
/**
 *  @brief  Intended to prefix the currently raised Python exception with @p comment;
 *          presently a stub that leaves the exception state untouched.
 */
void wrap_current_exception(sz_cptr_t comment) {
    // ? Prior to Python 3.12 we need to fetch and restore the exception state using
    // ? `PyErr_Fetch` and `PyErr_Restore` to avoid overwriting the current exception.
    // ? After Python 3.12 we can use `PyErr_GetRaisedException` and `PyErr_SetRaisedException`.
    (void)comment; // Intentionally unused until the wrapping logic is implemented
}
typedef void (*get_string_at_offset_t)(Strs *, Py_ssize_t, Py_ssize_t, PyObject **, sz_cptr_t *, sz_size_t *);
/**
 *  @brief  Resolves the @p i -th string of an owning 32-bit tape into a pointer/length pair.
 *          Apache Arrow convention: string `i` occupies `[offsets[i], offsets[i+1])` within `data`.
 *
 *  @param[in] strs Tape-backed collection that owns the underlying buffer.
 *  @param[in] i Index of the string to resolve; caller guarantees it is in range.
 *  @param[in] count Unused; present only to match the `get_string_at_offset_t` signature.
 *  @param[out] memory_owner Receives the Python object owning the bytes — the tape itself.
 *  @param[out] start Receives a pointer to the string's first byte.
 *  @param[out] length Receives the string's length in bytes.
 */
void str_at_offset_u32_tape(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                            PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Suppress unused-parameter warnings
    sz_u32_t start_offset = strs->data.u32_tape.offsets[i];
    sz_u32_t end_offset = strs->data.u32_tape.offsets[i + 1];
    *start = strs->data.u32_tape.data + start_offset;
    *length = end_offset - start_offset;
    // Tapes own their data; the explicit cast is required because `Strs *` and
    // `PyObject *` are incompatible pointer types in C (implicit assignment is a
    // constraint violation), unlike the view variants that store a `PyObject *` parent.
    *memory_owner = (PyObject *)strs;
}
/**
 *  @brief  Resolves the @p i -th string of a 32-bit tape view into a pointer/length pair.
 *          Apache Arrow convention: string `i` spans `offsets[i]` to `offsets[i + 1]`.
 *          The bytes are owned by the view's parent object, not by @p strs itself.
 */
void str_at_offset_u32_tape_view(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                                 PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Unused; kept to match the shared `get_string_at_offset_t` signature
    sz_u32_t begin = strs->data.u32_tape_view.offsets[i];
    sz_u32_t end = strs->data.u32_tape_view.offsets[i + 1];
    *length = end - begin;
    *start = strs->data.u32_tape_view.data + begin;
    *memory_owner = strs->data.u32_tape_view.parent; // Views borrow from their parent
}
/**
 *  @brief  Resolves the @p i -th string of an owning 64-bit tape into a pointer/length pair.
 *          Apache Arrow convention: string `i` occupies `[offsets[i], offsets[i + 1])` within `data`.
 *
 *  @param[in] strs Tape-backed collection that owns the underlying buffer.
 *  @param[in] i Index of the string to resolve; caller guarantees it is in range.
 *  @param[in] count Unused; present only to match the `get_string_at_offset_t` signature.
 *  @param[out] memory_owner Receives the Python object owning the bytes — the tape itself.
 *  @param[out] start Receives a pointer to the string's first byte.
 *  @param[out] length Receives the string's length in bytes.
 */
void str_at_offset_u64_tape(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                            PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Suppress unused-parameter warnings
    sz_u64_t start_offset = strs->data.u64_tape.offsets[i];
    sz_u64_t end_offset = strs->data.u64_tape.offsets[i + 1];
    *start = strs->data.u64_tape.data + start_offset;
    *length = end_offset - start_offset;
    // Tapes own their data; the explicit cast is required because `Strs *` and
    // `PyObject *` are incompatible pointer types in C (implicit assignment is a
    // constraint violation), unlike the view variants that store a `PyObject *` parent.
    *memory_owner = (PyObject *)strs;
}
/**
 *  @brief  Resolves the @p i -th string of a 64-bit tape view into a pointer/length pair.
 *          Apache Arrow convention: string `i` spans `offsets[i]` to `offsets[i + 1]`.
 *          The bytes are owned by the view's parent object, not by @p strs itself.
 */
void str_at_offset_u64_tape_view(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                                 PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Unused; kept to match the shared `get_string_at_offset_t` signature
    sz_u64_t begin = strs->data.u64_tape_view.offsets[i];
    sz_u64_t end = strs->data.u64_tape_view.offsets[i + 1];
    *length = end - begin;
    *start = strs->data.u64_tape_view.data + begin;
    *memory_owner = strs->data.u64_tape_view.parent; // Views borrow from their parent
}
/**
 *  @brief  Resolves the @p i -th string of a fragmented layout: each entry is an
 *          independent (start, length) span borrowed from the collection's parent object.
 */
void str_at_offset_fragmented(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                              PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Unused; kept to match the shared `get_string_at_offset_t` signature
    *memory_owner = strs->data.fragmented.parent;
    *length = strs->data.fragmented.spans[i].length;
    *start = strs->data.fragmented.spans[i].start;
}
/**
 *  @brief  Maps a `Strs` layout to its string-accessor function.
 *  @return The matching `get_string_at_offset_t`, or `NULL` with a Python
 *          `TypeError` raised when the layout is not recognized.
 */
get_string_at_offset_t str_at_offset_getter(Strs *strs) {
    if (strs->layout == STRS_U32_TAPE) return str_at_offset_u32_tape;
    if (strs->layout == STRS_U32_TAPE_VIEW) return str_at_offset_u32_tape_view;
    if (strs->layout == STRS_U64_TAPE) return str_at_offset_u64_tape;
    if (strs->layout == STRS_U64_TAPE_VIEW) return str_at_offset_u64_tape_view;
    if (strs->layout == STRS_FRAGMENTED) return str_at_offset_fragmented;
    // Unsupported layout
    PyErr_SetString(PyExc_TypeError, "Unsupported layout for conversion");
    return NULL;
}
#pragma endregion
#pragma region Memory Mapping File
static void File_dealloc(File *self) {
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
if (self->memory.start) {