-
Notifications
You must be signed in to change notification settings - Fork 120
Expand file tree
/
Copy pathstringzilla.c
More file actions
8034 lines (7021 loc) · 318 KB
/
stringzilla.c
File metadata and controls
8034 lines (7021 loc) · 318 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* @file stringzilla.c
* @brief Very light-weight CPython wrapper for StringZilla, with support for memory-mapping,
* native Python strings, Apache Arrow collections, and more.
* @author Ash Vardanian
* @date July 10, 2023
* @copyright Copyright (c) 2023
*
* - Doesn't use PyBind11, NanoBind, Boost.Python, or any other high-level libs, only CPython API.
* - To minimize latency this implementation avoids `PyArg_ParseTupleAndKeywords` calls.
* - Reimplements all of the `str` functionality in C as a `Str` type.
* - Provides a highly generic `Strs` class for handling collections of strings, Arrow-style or not.
*
* Pandas doesn't provide a C API, and even in the 2.0 the Apache Arrow representation is opt-in, not default.
* PyCapsule protocol in conjunction with @b `__arrow_c_array__` dunder methods can be used to extract strings.
* @see https://arrow.apache.org/docs/python/generated/pyarrow.array.html
*
* This module exports C functions via `PyCapsule` of `PyAPI` for use by other extensions (like `stringzillas-cpus`):
* - `sz_py_export_string_like`.
* - `sz_py_export_strings_as_sequence`.
* - `sz_py_export_strings_as_u32tape`.
* - `sz_py_export_strings_as_u64tape`.
* - `sz_py_replace_strings_allocator`.
*
* Function Naming Convention:
* - `Str_like_*`: Functions that can be called both as module-level functions AND as member methods.
* - `Str_*`: Functions that are member-only methods or have simpler calling conventions.
*/
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
#define NOMINMAX
#include <windows.h>
#else
#include <fcntl.h> // `O_RDONLY`
#include <sys/mman.h> // `mmap`
#include <sys/stat.h> // `stat`
#include <sys/types.h>
#endif
#ifdef _MSC_VER
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#else
#include <limits.h> // `SSIZE_MAX`
#include <unistd.h> // `ssize_t`
#endif
// It seems like some Python versions forget to include a header, so we should:
// https://github.com/ashvardanian/StringZilla/actions/runs/7706636733/job/21002535521
#ifndef SSIZE_MAX
#define SSIZE_MAX (SIZE_MAX / 2)
#endif
// Undefine _POSIX_C_SOURCE to avoid redefinition warning with Python headers
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#include <Python.h> // Core CPython interfaces
#include <errno.h> // `errno`
#include <stdio.h> // `fopen`
#include <stdlib.h> // `rand`, `srand`
#include <time.h> // `time`
#include <stringzilla/stringzilla.h>
/**
* @brief Arrow C Data Interface structure for an array schema.
* @see https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions
*/
// NOTE: this mirrors the ABI-stable layout mandated by the Arrow C Data Interface;
// field order, types, and count must not be changed.
struct ArrowSchema {
    char const *format;   // Arrow-defined format string describing the data type
    char const *name;     // Optional field name, may be NULL
    char const *metadata; // Optional binary key-value metadata, may be NULL
    int64_t flags;        // Bitfield of Arrow schema flags
    int64_t n_children;   // Number of entries in `children`
    struct ArrowSchema **children;         // Child schemas, `n_children` long
    struct ArrowSchema *dictionary;        // Value type if dictionary-encoded, else NULL
    void (*release)(struct ArrowSchema *); // Producer-provided destructor; NULL once released
    void *private_data;                    // Opaque producer state used by `release`
};
/**
* @brief Arrow C Data Interface structure for an array content.
* @see https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions
*/
// NOTE: this mirrors the ABI-stable layout mandated by the Arrow C Data Interface;
// field order, types, and count must not be changed.
struct ArrowArray {
    int64_t length;     // Logical number of elements
    int64_t null_count; // Number of null elements, or -1 if not computed
    int64_t offset;     // Logical offset into the buffers (for sliced arrays)
    int64_t n_buffers;  // Number of entries in `buffers`
    int64_t n_children; // Number of entries in `children`
    void const **buffers;                 // Validity/offsets/data buffers, layout-dependent
    struct ArrowArray **children;         // Child arrays, `n_children` long
    struct ArrowArray *dictionary;        // Dictionary values if dictionary-encoded, else NULL
    void (*release)(struct ArrowArray *); // Producer-provided destructor; NULL once released
    void *private_data;                   // Opaque producer state used by `release`
};
/**
 *  @brief  Table of function pointers exported through a `PyCapsule`, so sibling extensions
 *          (like `stringzillas-cpus`) can call into this module without C-level linking.
 */
typedef struct PyAPI {
    sz_bool_t (*sz_py_export_string_like)(PyObject *, sz_cptr_t *, sz_size_t *);
    sz_bool_t (*sz_py_export_strings_as_sequence)(PyObject *, sz_sequence_t *);
    sz_bool_t (*sz_py_export_strings_as_u32tape)(PyObject *, sz_cptr_t *, sz_u32_t const **, sz_size_t *);
    sz_bool_t (*sz_py_export_strings_as_u64tape)(PyObject *, sz_cptr_t *, sz_u64_t const **, sz_size_t *);
    sz_bool_t (*sz_py_replace_strings_allocator)(PyObject *, sz_memory_allocator_t *);
} PyAPI;
#pragma region Forward Declarations
static PyTypeObject FileType;
static PyTypeObject StrType;
static PyTypeObject StrsType;
static PyTypeObject SplitIteratorType;
static PyTypeObject Utf8SplitLinesIteratorType;
static PyTypeObject Utf8SplitWhitespaceIteratorType;
static PyTypeObject Utf8WordBoundaryIteratorType;
static PyTypeObject Utf8CaseInsensitiveFindIteratorType;
static PyTypeObject HasherType;
static PyTypeObject Sha256Type;
static sz_string_view_t temporary_memory = {NULL, 0};
/**
* @brief Describes an on-disk file mapped into RAM, which is different from Python's
* native `mmap` module, as it exposes the address of the mapping in memory.
*/
typedef struct {
    PyObject ob_base; // Must be first: CPython object header
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
    HANDLE file_handle;    // Underlying Win32 file handle
    HANDLE mapping_handle; // Win32 file-mapping object backing `memory`
#else
    int file_descriptor; // POSIX descriptor of the opened file
#endif
    sz_string_view_t memory; // Address and length of the mapped region
} File;
/**
* @brief Type-punned StringZilla-string, that points to a slice of an existing Python `str`
* or a `File`.
*
* When a slice is constructed, the `parent` object's reference count is being incremented to preserve lifetime.
 * Its usage in Python would look like:
*
* - Str() # Empty string
* - Str("some-string") # Full-range slice of a Python `str`
* - Str(File("some-path.txt")) # Full-range view of a persisted file
* - Str(File("some-path.txt"), from=0, to=sys.maxsize)
*/
typedef struct {
    PyObject ob_base;        // Must be first: CPython object header
    PyObject *parent;        // Owning object (`str`, `File`, ...) whose refcount preserves `memory`'s lifetime
    sz_string_view_t memory; // Borrowed view into the parent's buffer
} Str;
/**
* @brief String-splitting separator.
*
 * Allows lazy evaluation of the `split` and `rsplit`, and can be used to create a `Strs` object,
 * which might be more memory-friendly than greedily invoking `str.split`.
*/
typedef struct {
    PyObject ob_base;           // Must be first: CPython object header
    PyObject *text_obj;         //< For reference counting
    PyObject *separator_obj;    //< For reference counting
    sz_string_view_t text;      //< View over the text being split
    sz_string_view_t separator; //< View over the separator (needle or character set)
    sz_find_t finder;           //< Search kernel used to locate the next match
    /// @brief How many bytes to skip after each successful find.
    /// Generally equal to `needle_length`, or 1 for character sets.
    sz_size_t match_length;
    /// @brief Should we include the separator in the resulting slices?
    sz_bool_t include_match;
    /// @brief Should we enumerate the slices in normal or reverse order?
    sz_bool_t is_reverse;
    /// @brief Upper limit for the number of splits to report. Monotonically decreases during iteration.
    sz_size_t max_parts;
    /// @brief Indicates that we've already reported the tail of the split, and should return NULL next.
    sz_bool_t reached_tail;
} SplitIterator;
/**
* @brief Iterator for splitting a UTF-8 string by Unicode newline characters.
*
* Uses sz_utf8_find_newline to find newlines, supporting all 7 Unicode newline
* characters plus CRLF sequences.
*
* Termination: when start > end (not start == end, which is a valid state yielding empty segment).
*/
typedef struct {
    PyObject ob_base;       // Must be first: CPython object header
    PyObject *text_obj;     //< For reference counting
    sz_cptr_t start;        //< Current position (start of current segment)
    sz_cptr_t end;          //< End of original text (immutable)
    sz_size_t match_length; //< Length of current segment to yield
    /// @brief Should we include the newline characters in the resulting slices?
    sz_bool_t keepends;
    /// @brief Should we skip empty segments (trailing, leading, consecutive)?
    sz_bool_t skip_empty;
} Utf8SplitLinesIterator;
/**
* @brief Iterator for splitting a UTF-8 string by Unicode whitespace characters.
*
* Uses sz_utf8_find_whitespace to find whitespace, supporting all 25 Unicode
* White_Space characters. N whitespace delimiters yield N+1 segments (including empties).
*/
typedef struct {
    PyObject ob_base;       // Must be first: CPython object header
    PyObject *text_obj;     //< For reference counting
    sz_cptr_t start;        //< Current position in text
    sz_cptr_t end;          //< End of text (immutable)
    sz_size_t match_length; //< Length of current segment to yield
    /// @brief Should we skip empty segments (trailing, leading, consecutive)?
    sz_bool_t skip_empty;
} Utf8SplitWhitespaceIterator;
/**
* @brief Iterator for finding word boundaries in UTF-8 text per Unicode TR29.
*
* Uses sz_utf8_word_find_boundary to find boundaries, supporting all TR29 rules.
* Yields words (text segments between consecutive word boundaries).
*/
typedef struct {
    PyObject ob_base;      // Must be first: CPython object header
    PyObject *text_obj;    //< For reference counting
    sz_cptr_t start;       //< Start of current word
    sz_cptr_t end;         //< End of original text (immutable)
    sz_cptr_t text_start;  //< Start of original text (for reverse iteration)
    /// @brief Should we skip empty segments (consecutive boundaries)?
    sz_bool_t skip_empty;
} Utf8WordBoundaryIterator;
/**
* @brief Iterator that yields all case-insensitive matches of a needle in a haystack.
* Uses `sz_utf8_case_insensitive_find` for Unicode-aware case folding.
*/
typedef struct {
    PyObject ob_base;         // Must be first: CPython object header
    PyObject *haystack_obj;   //< Reference for garbage collection
    PyObject *needle_obj;     //< Reference for garbage collection (needle bytes must remain valid)
    sz_cptr_t current;        //< Current search position in haystack
    sz_cptr_t haystack_end;   //< End boundary of haystack
    sz_string_view_t needle;  //< Needle view (bytes and length)
    /// @brief Reusable metadata for repeated searches with the same needle.
    sz_utf8_case_insensitive_needle_metadata_t metadata;
    /// @brief Whether to allow overlapping matches.
    sz_bool_t include_overlapping;
} Utf8CaseInsensitiveFindIterator;
/**
* @brief Variable length Python object similar to `Tuple[Union[Str, str]]`,
* for faster sorting, shuffling, joins, and lookups.
*/
typedef struct {
    PyObject ob_base; // Must be first: CPython object header
    /// @brief Discriminator for the `data` union below; "views" own nothing,
    ///        "tapes" own both payload and offsets, "fragmented" owns only the spans array.
    enum {
        STRS_U32_TAPE_VIEW = 0,
        STRS_U64_TAPE_VIEW = 1,
        STRS_U32_TAPE = 2,
        STRS_U64_TAPE = 3,
        STRS_FRAGMENTED = 4,
    } layout;
    union {
        /**
         *  U32 tape view - references existing Arrow array data, owns nothing.
         *  The layout is identical to Apache Arrow format: N+1 offsets for N strings.
         *  https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout
         */
        struct u32_tape_view_t {
            sz_size_t count;
            sz_cptr_t data;    // Points to existing data (not owned)
            sz_u32_t *offsets; // Points to existing offsets (not owned)
            PyObject *parent;  // Parent Arrow array or other object
        } u32_tape_view;
        /**
         *  U32 tape - owns both offsets and data with custom allocator.
         */
        struct u32_tape_t {
            sz_size_t count;
            sz_cptr_t data;    // Owned data
            sz_u32_t *offsets; // Owned offsets (N+1 for N strings)
            sz_memory_allocator_t allocator;
        } u32_tape;
        /**
         *  U64 tape view - references existing Arrow array data, owns nothing.
         *  The layout is identical to Apache Arrow format: N+1 offsets for N strings.
         *  https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout
         */
        struct u64_tape_view_t {
            sz_size_t count;
            sz_cptr_t data;    // Points to existing data (not owned)
            sz_u64_t *offsets; // Points to existing offsets (not owned)
            PyObject *parent;  // Parent Arrow array or other object
        } u64_tape_view;
        /**
         *  U64 tape - owns both offsets and data with custom allocator.
         */
        struct u64_tape_t {
            sz_size_t count;
            sz_cptr_t data;    // Owned data
            sz_u64_t *offsets; // Owned offsets (N+1 for N strings)
            sz_memory_allocator_t allocator;
        } u64_tape;
        /**
         *  Reordered subviews - owns only the array of individual spans.
         *  Each span points to data in the parent object.
         */
        struct fragmented_t {
            sz_size_t count;
            sz_string_view_t *spans; // Owned array of spans
            PyObject *parent;        // Parent object (Str, Strs, or other)
            sz_memory_allocator_t allocator;
        } fragmented;
    } data;
} Strs;
#pragma endregion
#pragma region Helpers
/**
 *  @brief  Grows (never shrinks) a shared scratch buffer used for temporary allocations.
 *  @param  size     Minimum number of bytes the caller needs.
 *  @param  existing In/out view over the current scratch buffer; updated on growth.
 *  @return Pointer to a buffer of at least `size` bytes, or NULL with a Python `MemoryError` set.
 */
static sz_ptr_t temporary_memory_allocate(sz_size_t size, sz_string_view_t *existing) {
    if (existing->length < size) {
        // The view's `start` is const-qualified, but this buffer is always heap-owned here,
        // so casting away constness for `realloc` is safe; on failure the old block stays valid.
        sz_ptr_t new_start = (sz_ptr_t)realloc((void *)(sz_ptr_t)existing->start, size);
        if (!new_start) {
            PyErr_Format(PyExc_MemoryError, "Unable to allocate temporary memory");
            return NULL;
        }
        existing->start = new_start;
        existing->length = size;
    }
    return (sz_ptr_t)existing->start;
}
/**
 *  @brief  No-op counterpart to `temporary_memory_allocate`: the scratch buffer is reused
 *          across calls instead of being released per-allocation.
 */
static void temporary_memory_free(sz_ptr_t start, sz_size_t size, sz_string_view_t *existing) {
    (void)start;    // Intentionally unused
    (void)size;     // Intentionally unused
    (void)existing; // Intentionally unused
}
/// @brief  `sz_sequence_t`-style accessor: start pointer of the `i`-th string in a `Strs`.
static sz_cptr_t Strs_get_start_(void const *handle, sz_size_t i) {
    Strs *container = (Strs *)handle;
    if (container->layout == STRS_U32_TAPE) //
        return container->data.u32_tape.data + container->data.u32_tape.offsets[i];
    if (container->layout == STRS_U32_TAPE_VIEW)
        return container->data.u32_tape_view.data + container->data.u32_tape_view.offsets[i];
    if (container->layout == STRS_U64_TAPE) //
        return container->data.u64_tape.data + container->data.u64_tape.offsets[i];
    if (container->layout == STRS_U64_TAPE_VIEW)
        return container->data.u64_tape_view.data + container->data.u64_tape_view.offsets[i];
    if (container->layout == STRS_FRAGMENTED) //
        return container->data.fragmented.spans[i].start;
    return NULL;
}
/// @brief  `sz_sequence_t`-style accessor: byte length of the `i`-th string in a `Strs`.
static sz_size_t Strs_get_length_(void const *handle, sz_size_t i) {
    Strs *container = (Strs *)handle;
    if (container->layout == STRS_U32_TAPE)
        return container->data.u32_tape.offsets[i + 1] - container->data.u32_tape.offsets[i];
    if (container->layout == STRS_U32_TAPE_VIEW)
        return container->data.u32_tape_view.offsets[i + 1] - container->data.u32_tape_view.offsets[i];
    if (container->layout == STRS_U64_TAPE)
        return container->data.u64_tape.offsets[i + 1] - container->data.u64_tape.offsets[i];
    if (container->layout == STRS_U64_TAPE_VIEW)
        return container->data.u64_tape_view.offsets[i + 1] - container->data.u64_tape_view.offsets[i];
    if (container->layout == STRS_FRAGMENTED) //
        return container->data.fragmented.spans[i].length;
    return 0;
}
/// @brief  Reverses `array` of `length` sorted-index entries in-place.
void reverse_offsets(sz_sorted_idx_t *array, sz_size_t length) {
    // Guard: for `length == 0` the unsigned `length - 1` would wrap to `SIZE_MAX`,
    // making the two-pointer loop read far out of bounds.
    if (length < 2) return;
    for (sz_size_t i = 0, j = length - 1; i < j; i++, j--) {
        sz_sorted_idx_t temp = array[i];
        array[i] = array[j];
        array[j] = temp;
    }
}
/// @brief  Reverses `array` of `length` string views in-place.
void reverse_haystacks(sz_string_view_t *array, sz_size_t length) {
    // Guard: for `length == 0` the unsigned `length - 1` would wrap to `SIZE_MAX`,
    // making the two-pointer loop read far out of bounds.
    if (length < 2) return;
    for (sz_size_t i = 0, j = length - 1; i < j; i++, j--) {
        sz_string_view_t temp = array[i];
        array[i] = array[j];
        array[j] = temp;
    }
}
void permute(sz_string_view_t *array, sz_sorted_idx_t *order, sz_size_t length) {
for (sz_size_t i = 0; i < length; ++i) {
if (i == order[i]) continue;
sz_string_view_t temp = array[i];
sz_size_t k = i, j;
while (i != (j = (sz_size_t)order[k])) {
array[k] = array[j];
order[k] = k;
k = j;
}
array[k] = temp;
order[k] = k;
}
}
/**
 *  @brief  Checks whether a Python object is a writable buffer.
 *  @return sz_true_k when the object may be written to; sz_false_k otherwise,
 *          with a `TypeError` set describing why the object is immutable.
 */
SZ_INTERNAL sz_bool_t sz_py_is_mutable(PyObject *object) {
    // `str` and `bytes` are always immutable in CPython.
    if (PyUnicode_Check(object)) {
        PyErr_SetString(PyExc_TypeError, "str objects are immutable (use bytearray instead)");
        return sz_false_k;
    }
    if (PyBytes_Check(object)) {
        PyErr_SetString(PyExc_TypeError, "bytes objects are immutable (use bytearray instead)");
        return sz_false_k;
    }
    // A `memoryview` carries an explicit read-only flag.
    if (PyMemoryView_Check(object)) {
        Py_buffer *buffer = PyMemoryView_GET_BUFFER(object);
        if (buffer->readonly) {
            PyErr_SetString(PyExc_TypeError, "memoryview is read-only");
            return sz_false_k;
        }
        return sz_true_k;
    }
    // Everything else is optimistically considered mutable
    return sz_true_k;
}
/**
 *  @brief  Helper function to export a Python string-like object into a start/length pair.
 *          Accepts `str`, `bytes`, `bytearray`, `Str`, `File`, and contiguous byte-sized `memoryview`s.
 *  @return 1 on success; 0 with a Python exception set on failure.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_string_like(PyObject *object, sz_cptr_t *start, sz_size_t *length) {
    if (PyUnicode_Check(object)) {
        // Handle Python `str` object
        Py_ssize_t signed_length;
        *start = PyUnicode_AsUTF8AndSize(object, &signed_length);
        // `PyUnicode_AsUTF8AndSize` returns NULL (with an exception set) when the string
        // can't be encoded as UTF-8, e.g. for lone surrogates - don't report success then.
        if (!*start) return 0;
        *length = (sz_size_t)signed_length;
        return 1;
    }
    else if (PyBytes_Check(object)) {
        // Handle Python `bytes` object
        // https://docs.python.org/3/c-api/bytes.html
        Py_ssize_t signed_length;
        if (PyBytes_AsStringAndSize(object, (sz_ptr_t *)start, &signed_length) == -1) {
            PyErr_SetString(PyExc_ValueError, "Couldn't access `bytes` buffer internals");
            return 0;
        }
        *length = (sz_size_t)signed_length;
        return 1;
    }
    else if (PyByteArray_Check(object)) {
        // Handle Python mutable `bytearray` object
        // https://docs.python.org/3/c-api/bytearray.html
        *start = PyByteArray_AS_STRING(object);
        *length = PyByteArray_GET_SIZE(object);
        return 1;
    }
    else if (PyObject_TypeCheck(object, &StrType)) {
        Str *str = (Str *)object;
        *start = str->memory.start;
        *length = str->memory.length;
        return 1;
    }
    else if (PyObject_TypeCheck(object, &FileType)) {
        File *file = (File *)object;
        *start = file->memory.start;
        *length = file->memory.length;
        return 1;
    }
    else if (PyMemoryView_Check(object)) {
        // Handle Python `memoryview` object
        // https://docs.python.org/3/c-api/memoryview.html
        // https://docs.python.org/3/c-api/buffer.html#c.Py_buffer
        Py_buffer *view = PyMemoryView_GET_BUFFER(object);
        // Make sure we are dealing with single-byte integral representations
        if (view->itemsize != 1) {
            PyErr_SetString(PyExc_ValueError, "Only single-byte integral types are supported");
            return 0;
        }
        // Let's make sure the data is contiguous.
        // This can be a bit trickier for high-dimensional arrays, but CPython has a built-in function for that.
        // The flag 'C' stands for C-style-contiguous, which means that the last dimension is contiguous.
        // The flag 'F' stands for Fortran-style-contiguous, which means that the first dimension is contiguous.
        // The flag 'A' stands for any-contiguous, which only means there are no gaps between elements.
        // For byte-level processing that's all we need.
        if (!PyBuffer_IsContiguous(view, 'A')) {
            PyErr_SetString(PyExc_ValueError, "The array must be contiguous");
            return 0;
        }
        *start = (sz_cptr_t)view->buf;
        *length = (sz_size_t)view->len;
        return 1;
    }
    else {
        PyErr_SetString(PyExc_TypeError, "Unsupported argument layout");
        return 0;
    }
}
/**
 *  @brief  `sz_sequence_t` callback: start pointer of the `index`-th span of a fragmented `Strs`.
 *  @return NULL with an `IndexError` set when `index` is out of bounds.
 */
sz_cptr_t sz_py_strs_sequence_member_start_if_fragmented(void const *sequence_punned, sz_size_t index) {
    Strs *strs = (Strs *)sequence_punned;
    sz_assert_(strs->layout == STRS_FRAGMENTED && "Expected a reordered Strs layout");
    // `sz_size_t` is unsigned, so only the upper bound needs checking.
    if (index >= strs->data.fragmented.count) {
        PyErr_SetString(PyExc_IndexError, "Index out of bounds");
        return NULL;
    }
    return strs->data.fragmented.spans[index].start;
}
/**
 *  @brief  `sz_sequence_t` callback: byte length of the `index`-th span of a fragmented `Strs`.
 *  @return 0 with an `IndexError` set when `index` is out of bounds.
 */
sz_size_t sz_py_strs_sequence_member_length_if_fragmented(void const *sequence_punned, sz_size_t index) {
    Strs *strs = (Strs *)sequence_punned;
    sz_assert_(strs->layout == STRS_FRAGMENTED && "Expected a reordered Strs layout");
    // `sz_size_t` is unsigned, so only the upper bound needs checking.
    if (index >= strs->data.fragmented.count) {
        PyErr_SetString(PyExc_IndexError, "Index out of bounds");
        return 0;
    }
    return strs->data.fragmented.spans[index].length;
}
/**
 *  @brief  Helper function to export a `Strs` or similar sequence objects into a `sz_sequence_t`.
 *          Only the fragmented layout is exportable this way; tape layouts should be exported
 *          through `sz_py_export_strings_as_u32tape` / `sz_py_export_strings_as_u64tape`.
 *  @return sz_true_k on success; sz_false_k for NULL outputs, non-`Strs` inputs, or tape layouts.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_strings_as_sequence(PyObject *object, sz_sequence_t *sequence) {
    if (!sequence) return sz_false_k;
    if (PyObject_TypeCheck(object, &StrsType)) {
        Strs *strs = (Strs *)object;
        // The assert documents the expectation in debug builds, but it compiles out in
        // release builds - so also fail gracefully at runtime instead of exporting
        // callbacks that would misread the union.
        if (strs->layout != STRS_FRAGMENTED) return sz_false_k;
        sz_assert_(strs->layout == STRS_FRAGMENTED && "View as tapes!");
        sequence->handle = strs;
        sequence->count = strs->data.fragmented.count;
        sequence->get_start = sz_py_strs_sequence_member_start_if_fragmented;
        sequence->get_length = sz_py_strs_sequence_member_length_if_fragmented;
        return sz_true_k;
    }
    return sz_false_k;
}
/**
 *  @brief  Helper function to export a `Strs` object into `sz_sequence_u32tape_t` components.
 *  @return sz_true_k for 32-bit tape or tape-view layouts; sz_false_k otherwise.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_strings_as_u32tape(PyObject *object, sz_cptr_t *data, sz_u32_t const **offsets,
                                                     sz_size_t *count) {
    if (!data || !offsets || !count) return sz_false_k;
    if (!PyObject_TypeCheck(object, &StrsType)) return sz_false_k;
    Strs *strs = (Strs *)object;
    switch (strs->layout) {
    case STRS_U32_TAPE:
        *data = strs->data.u32_tape.data;
        *offsets = strs->data.u32_tape.offsets;
        *count = strs->data.u32_tape.count;
        return sz_true_k;
    case STRS_U32_TAPE_VIEW:
        *data = strs->data.u32_tape_view.data;
        *offsets = strs->data.u32_tape_view.offsets;
        *count = strs->data.u32_tape_view.count;
        return sz_true_k;
    default: return sz_false_k;
    }
}
/**
 *  @brief  Helper function to export a `Strs` object into `sz_sequence_u64tape_t` components.
 *  @return sz_true_k for 64-bit tape or tape-view layouts; sz_false_k otherwise.
 */
SZ_DYNAMIC sz_bool_t sz_py_export_strings_as_u64tape(PyObject *object, sz_cptr_t *data, sz_u64_t const **offsets,
                                                     sz_size_t *count) {
    if (!data || !offsets || !count) return sz_false_k;
    if (!PyObject_TypeCheck(object, &StrsType)) return sz_false_k;
    Strs *strs = (Strs *)object;
    switch (strs->layout) {
    case STRS_U64_TAPE:
        *data = strs->data.u64_tape.data;
        *offsets = strs->data.u64_tape.offsets;
        *count = strs->data.u64_tape.count;
        return sz_true_k;
    case STRS_U64_TAPE_VIEW:
        *data = strs->data.u64_tape_view.data;
        *offsets = strs->data.u64_tape_view.offsets;
        *count = strs->data.u64_tape_view.count;
        return sz_true_k;
    default: return sz_false_k;
    }
}
/**
 *  @brief  Re-homes an owning `STRS_U32_TAPE` onto a different allocator: copies payload and
 *          offsets with `allocator`, then releases the old buffers via `old_allocator`.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the tape is left unchanged).
 */
static sz_bool_t sz_py_replace_u32_tape_allocator(Strs *strs, sz_memory_allocator_t *old_allocator,
                                                  sz_memory_allocator_t *allocator) {
    struct u32_tape_t *data = &strs->data.u32_tape;
    sz_assert_(data->offsets && "Expected offsets to be allocated");
    sz_size_t const string_data_size = (sz_size_t)data->offsets[data->count];
    sz_size_t const offsets_size = (data->count + 1) * sizeof(sz_u32_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator;
    // skip the `memcpy` entirely for empty tapes, as `memcpy(NULL, ..., 0)` is undefined behavior.
    sz_ptr_t new_string_data = NULL;
    if (string_data_size) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, data->data, string_data_size);
    }
    // Copy the offsets array
    sz_u32_t *new_offsets = (sz_u32_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    memcpy(new_offsets, data->offsets, offsets_size);
    // Free old memory with old allocator (tapes always own their data)
    if (data->data) old_allocator->free(data->data, string_data_size, old_allocator->handle);
    old_allocator->free(data->offsets, offsets_size, old_allocator->handle);
    // Update pointers and allocator
    data->data = new_string_data;
    data->offsets = new_offsets;
    data->allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Re-homes an owning `STRS_U64_TAPE` onto a different allocator: copies payload and
 *          offsets with `allocator`, then releases the old buffers via `old_allocator`.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the tape is left unchanged).
 */
static sz_bool_t sz_py_replace_u64_tape_allocator(Strs *strs, sz_memory_allocator_t *old_allocator,
                                                  sz_memory_allocator_t *allocator) {
    struct u64_tape_t *data = &strs->data.u64_tape;
    sz_assert_(data->offsets && "Expected offsets to be allocated");
    sz_size_t const string_data_size = (sz_size_t)data->offsets[data->count];
    sz_size_t const offsets_size = (data->count + 1) * sizeof(sz_u64_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator;
    // skip the `memcpy` entirely for empty tapes, as `memcpy(NULL, ..., 0)` is undefined behavior.
    sz_ptr_t new_string_data = NULL;
    if (string_data_size) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, data->data, string_data_size);
    }
    // Copy the offsets array
    sz_u64_t *new_offsets = (sz_u64_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    memcpy(new_offsets, data->offsets, offsets_size);
    // Free old memory with old allocator (tapes always own their data)
    if (data->data) old_allocator->free(data->data, string_data_size, old_allocator->handle);
    old_allocator->free(data->offsets, offsets_size, old_allocator->handle);
    // Update pointers and allocator
    data->data = new_string_data;
    data->offsets = new_offsets;
    data->allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Converts a `STRS_U32_TAPE_VIEW` into an owning `STRS_U32_TAPE`, copying the
 *          (possibly sliced) payload and re-based offsets with the provided allocator.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the view is left unchanged).
 */
static sz_bool_t sz_py_replace_u32_tape_view_allocator(Strs *strs, sz_memory_allocator_t *allocator) {
    struct u32_tape_view_t *view = &strs->data.u32_tape_view;
    // Cache the count up-front: `view` aliases the `u32_tape` union member overwritten below.
    sz_size_t const count = view->count;
    sz_u32_t const slice_start_offset = view->offsets[0];
    sz_size_t const string_data_size = (sz_size_t)(view->offsets[count] - slice_start_offset);
    sz_size_t const offsets_size = (count + 1) * sizeof(sz_u32_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator
    sz_ptr_t new_string_data = NULL;
    if (string_data_size > 0) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, view->data + slice_start_offset, string_data_size);
    }
    // Copy the offsets, re-basing them so the first offset becomes zero
    sz_u32_t *new_offsets = (sz_u32_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size > 0) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    for (sz_size_t i = 0; i <= count; ++i) new_offsets[i] = view->offsets[i] - slice_start_offset;
    // Release the parent reference only after all fallible work is done
    Py_XDECREF(view->parent);
    // Commit the owning tape layout
    strs->layout = STRS_U32_TAPE;
    strs->data.u32_tape.count = count;
    strs->data.u32_tape.data = new_string_data;
    strs->data.u32_tape.offsets = new_offsets;
    strs->data.u32_tape.allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Converts a `STRS_U64_TAPE_VIEW` into an owning `STRS_U64_TAPE`, copying the
 *          (possibly sliced) payload and re-based offsets with the provided allocator.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the view is left unchanged).
 */
static sz_bool_t sz_py_replace_u64_tape_view_allocator(Strs *strs, sz_memory_allocator_t *allocator) {
    struct u64_tape_view_t *view = &strs->data.u64_tape_view;
    // Cache the count up-front: `view` aliases the `u64_tape` union member overwritten below.
    sz_size_t const count = view->count;
    sz_u64_t const slice_start_offset = view->offsets[0];
    sz_size_t const string_data_size = (sz_size_t)(view->offsets[count] - slice_start_offset);
    sz_size_t const offsets_size = (count + 1) * sizeof(sz_u64_t); // N+1 offsets - never zero
    // Copy the string payload with the new allocator
    sz_ptr_t new_string_data = NULL;
    if (string_data_size > 0) {
        new_string_data = (sz_ptr_t)allocator->allocate(string_data_size, allocator->handle);
        if (!new_string_data) return sz_false_k;
        memcpy(new_string_data, view->data + slice_start_offset, string_data_size);
    }
    // Copy the offsets, re-basing them so the first offset becomes zero
    sz_u64_t *new_offsets = (sz_u64_t *)allocator->allocate(offsets_size, allocator->handle);
    if (!new_offsets) {
        if (string_data_size > 0) allocator->free(new_string_data, string_data_size, allocator->handle);
        return sz_false_k;
    }
    for (sz_size_t i = 0; i <= count; ++i) new_offsets[i] = view->offsets[i] - slice_start_offset;
    // Release the parent reference only after all fallible work is done
    Py_XDECREF(view->parent);
    // Commit the owning tape layout
    strs->layout = STRS_U64_TAPE;
    strs->data.u64_tape.count = count;
    strs->data.u64_tape.data = new_string_data;
    strs->data.u64_tape.offsets = new_offsets;
    strs->data.u64_tape.allocator = *allocator;
    return sz_true_k;
}
/**
 *  @brief  Consolidates a `STRS_FRAGMENTED` layout into a contiguous tape owned by `allocator`,
 *          choosing 32-bit or 64-bit offsets based on the total payload size.
 *  @return sz_true_k on success; sz_false_k on allocation failure (the object is left unchanged).
 */
static sz_bool_t sz_py_replace_fragmented_allocator(Strs *strs, sz_memory_allocator_t *old_allocator,
                                                    sz_memory_allocator_t *allocator) {
    struct fragmented_t *fragmented = &strs->data.fragmented;
    sz_assert_(fragmented->spans && "Expected spans to be allocated");
    // Cache the count: `fragmented` aliases the tape union members written below.
    sz_size_t const count = fragmented->count;
    // Calculate total size needed for the consolidated tape
    sz_size_t total_bytes = 0;
    for (sz_size_t i = 0; i < count; i++) total_bytes += fragmented->spans[i].length;
    // Choose 32-bit or 64-bit tape based on size
    sz_bool_t const use_64bit = total_bytes >= UINT32_MAX;
    // Handle the all-empty-strings case separately: there is no payload, but we still
    // need N+1 zero offsets - a NULL `offsets` array with a non-zero `count` would be
    // dereferenced by accessors like `Strs_get_start_`.
    if (total_bytes == 0) {
        sz_size_t const offsets_size = (count + 1) * sizeof(sz_u32_t);
        sz_u32_t *empty_offsets = (sz_u32_t *)allocator->allocate(offsets_size, allocator->handle);
        if (!empty_offsets) return sz_false_k;
        memset(empty_offsets, 0, offsets_size);
        old_allocator->free(fragmented->spans, count * sizeof(sz_string_view_t), old_allocator->handle);
        Py_XDECREF(fragmented->parent);
        strs->layout = STRS_U32_TAPE;
        strs->data.u32_tape.count = count;
        strs->data.u32_tape.data = NULL;
        strs->data.u32_tape.offsets = empty_offsets;
        strs->data.u32_tape.allocator = *allocator;
        return sz_true_k;
    }
    // Allocate consolidated data buffer
    sz_ptr_t new_data = (sz_ptr_t)allocator->allocate(total_bytes, allocator->handle);
    if (!new_data) return sz_false_k;
    if (use_64bit) {
        sz_u64_t *new_offsets = (sz_u64_t *)allocator->allocate((count + 1) * sizeof(sz_u64_t), allocator->handle);
        if (!new_offsets) {
            allocator->free(new_data, total_bytes, allocator->handle);
            return sz_false_k;
        }
        // Copy fragments back-to-back, recording running offsets
        sz_size_t current_offset = 0;
        new_offsets[0] = 0;
        for (sz_size_t i = 0; i < count; i++) {
            sz_size_t len = fragmented->spans[i].length;
            if (len > 0) { memcpy(new_data + current_offset, fragmented->spans[i].start, len); }
            current_offset += len;
            new_offsets[i + 1] = current_offset;
        }
        // Free old fragmented data and commit the 64-bit tape layout
        old_allocator->free(fragmented->spans, count * sizeof(sz_string_view_t), old_allocator->handle);
        Py_XDECREF(fragmented->parent);
        strs->layout = STRS_U64_TAPE;
        strs->data.u64_tape.count = count;
        strs->data.u64_tape.data = new_data;
        strs->data.u64_tape.offsets = new_offsets;
        strs->data.u64_tape.allocator = *allocator;
    }
    else {
        sz_u32_t *new_offsets = (sz_u32_t *)allocator->allocate((count + 1) * sizeof(sz_u32_t), allocator->handle);
        if (!new_offsets) {
            allocator->free(new_data, total_bytes, allocator->handle);
            return sz_false_k;
        }
        // Copy fragments back-to-back, recording running offsets
        sz_size_t current_offset = 0;
        new_offsets[0] = 0;
        for (sz_size_t i = 0; i < count; i++) {
            sz_size_t len = fragmented->spans[i].length;
            if (len > 0) { memcpy(new_data + current_offset, fragmented->spans[i].start, len); }
            current_offset += len;
            // `!use_64bit` guarantees `total_bytes < UINT32_MAX`, so partial sums can't overflow this cast
            new_offsets[i + 1] = (sz_u32_t)current_offset;
        }
        // Free old fragmented data and commit the 32-bit tape layout
        old_allocator->free(fragmented->spans, count * sizeof(sz_string_view_t), old_allocator->handle);
        Py_XDECREF(fragmented->parent);
        strs->layout = STRS_U32_TAPE;
        strs->data.u32_tape.count = count;
        strs->data.u32_tape.data = new_data;
        strs->data.u32_tape.offsets = new_offsets;
        strs->data.u32_tape.allocator = *allocator;
    }
    return sz_true_k;
}
/**
* @brief Helper function to replace the memory allocator in a `Strs` object.
* This reallocates existing string data using the new allocator.
*
 * This may change the `Strs` layout:
* - `STRS_U32_TAPE_VIEW` becomes `STRS_U32_TAPE`.
* - `STRS_U64_TAPE_VIEW` becomes `STRS_U64_TAPE`.
* - `STRS_U32_TAPE` remains, if the allocator is different.
* - `STRS_U64_TAPE` remains, if the allocator is different.
* - `STRS_FRAGMENTED` becomes a `STRS_U32_TAPE` or `STRS_U64_TAPE` depending on the content size.
*/
SZ_DYNAMIC sz_bool_t sz_py_replace_strings_allocator(PyObject *object, sz_memory_allocator_t *allocator) {
    // Validate the arguments before touching any state
    if (!object || !allocator) return sz_false_k;
    if (!PyObject_TypeCheck(object, &StrsType)) return sz_false_k;
    Strs *strs = (Strs *)object;

    // Step 1: resolve the allocator currently backing this object's memory
    sz_memory_allocator_t current_allocator;
    switch (strs->layout) {
    case STRS_U32_TAPE: current_allocator = strs->data.u32_tape.allocator; break;
    case STRS_U64_TAPE: current_allocator = strs->data.u64_tape.allocator; break;
    case STRS_FRAGMENTED: current_allocator = strs->data.fragmented.allocator; break;
    case STRS_U32_TAPE_VIEW:
    case STRS_U64_TAPE_VIEW: {
        // Views don't own memory — climb the parent chain to the owning layout
        Strs *owner = strs;
        while (owner && (owner->layout == STRS_U32_TAPE_VIEW || owner->layout == STRS_U64_TAPE_VIEW)) {
            PyObject *parent = owner->layout == STRS_U32_TAPE_VIEW //
                                   ? owner->data.u32_tape_view.parent
                                   : owner->data.u64_tape_view.parent;
            if (!parent || !PyObject_TypeCheck(parent, &StrsType)) break;
            owner = (Strs *)parent;
        }
        // Pull the allocator out of whichever owning layout the chain ended on
        if (owner && owner->layout == STRS_U32_TAPE) { current_allocator = owner->data.u32_tape.allocator; }
        else if (owner && owner->layout == STRS_U64_TAPE) { current_allocator = owner->data.u64_tape.allocator; }
        else if (owner && owner->layout == STRS_FRAGMENTED) { current_allocator = owner->data.fragmented.allocator; }
        else { sz_memory_allocator_init_default(&current_allocator); } // Final fallback
    } break;
    default: sz_memory_allocator_init_default(&current_allocator); break;
    }

    // Step 2: identical allocators mean there is nothing to move
    if (sz_memory_allocator_equal(&current_allocator, allocator)) return sz_true_k;

    // Step 3: delegate the actual migration to the layout-specific helper
    switch (strs->layout) {
    case STRS_U32_TAPE: return sz_py_replace_u32_tape_allocator(strs, &current_allocator, allocator);
    case STRS_U64_TAPE: return sz_py_replace_u64_tape_allocator(strs, &current_allocator, allocator);
    case STRS_U32_TAPE_VIEW: return sz_py_replace_u32_tape_view_allocator(strs, allocator);
    case STRS_U64_TAPE_VIEW: return sz_py_replace_u64_tape_view_allocator(strs, allocator);
    case STRS_FRAGMENTED: return sz_py_replace_fragmented_allocator(strs, &current_allocator, allocator);
    }
    return sz_false_k; // Should never reach here
}
/**
* @brief Helper function to wrap the current exception with a custom prefix message.
 * An example is augmenting the argument parsing error with the name of the variable
* that didn't pass the validation.
*/
/**
 *  @brief  Intended to prefix the currently raised Python exception with @p comment;
 *          presently a stub that leaves the exception state untouched.
 */
void wrap_current_exception(sz_cptr_t comment) {
    // ? Prior to Python 3.12 we need to fetch and restore the exception state using
    // ? `PyErr_Fetch` and `PyErr_Restore` to avoid overwriting the current exception.
    // ? After Python 3.12 we can use `PyErr_GetRaisedException` and `PyErr_SetRaisedException`.
    (void)comment; // Intentionally unused until the wrapping logic is implemented
}
typedef void (*get_string_at_offset_t)(Strs *, Py_ssize_t, Py_ssize_t, PyObject **, sz_cptr_t *, sz_size_t *);
/**
 *  @brief  Resolves the @p i -th string of an owning 32-bit tape into a pointer/length pair.
 *          Apache Arrow convention: string `i` occupies `[offsets[i], offsets[i+1])` within `data`.
 *
 *  @param[in] strs Tape-backed collection that owns the underlying buffer.
 *  @param[in] i Index of the string to resolve; caller guarantees it is in range.
 *  @param[in] count Unused; present only to match the `get_string_at_offset_t` signature.
 *  @param[out] memory_owner Receives the Python object owning the bytes — the tape itself.
 *  @param[out] start Receives a pointer to the string's first byte.
 *  @param[out] length Receives the string's length in bytes.
 */
void str_at_offset_u32_tape(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                            PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Suppress unused-parameter warnings
    sz_u32_t start_offset = strs->data.u32_tape.offsets[i];
    sz_u32_t end_offset = strs->data.u32_tape.offsets[i + 1];
    *start = strs->data.u32_tape.data + start_offset;
    *length = end_offset - start_offset;
    // Tapes own their data; the explicit cast is required because `Strs *` and
    // `PyObject *` are incompatible pointer types in C (implicit assignment is a
    // constraint violation), unlike the view variants that store a `PyObject *` parent.
    *memory_owner = (PyObject *)strs;
}
/**
 *  @brief  Resolves the @p i -th string of a 32-bit tape view into a pointer/length pair.
 *          Apache Arrow convention: string `i` spans `offsets[i]` to `offsets[i + 1]`.
 *          The bytes are owned by the view's parent object, not by @p strs itself.
 */
void str_at_offset_u32_tape_view(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                                 PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Unused; kept to match the shared `get_string_at_offset_t` signature
    sz_u32_t begin = strs->data.u32_tape_view.offsets[i];
    sz_u32_t end = strs->data.u32_tape_view.offsets[i + 1];
    *length = end - begin;
    *start = strs->data.u32_tape_view.data + begin;
    *memory_owner = strs->data.u32_tape_view.parent; // Views borrow from their parent
}
/**
 *  @brief  Resolves the @p i -th string of an owning 64-bit tape into a pointer/length pair.
 *          Apache Arrow convention: string `i` occupies `[offsets[i], offsets[i + 1])` within `data`.
 *
 *  @param[in] strs Tape-backed collection that owns the underlying buffer.
 *  @param[in] i Index of the string to resolve; caller guarantees it is in range.
 *  @param[in] count Unused; present only to match the `get_string_at_offset_t` signature.
 *  @param[out] memory_owner Receives the Python object owning the bytes — the tape itself.
 *  @param[out] start Receives a pointer to the string's first byte.
 *  @param[out] length Receives the string's length in bytes.
 */
void str_at_offset_u64_tape(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                            PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Suppress unused-parameter warnings
    sz_u64_t start_offset = strs->data.u64_tape.offsets[i];
    sz_u64_t end_offset = strs->data.u64_tape.offsets[i + 1];
    *start = strs->data.u64_tape.data + start_offset;
    *length = end_offset - start_offset;
    // Tapes own their data; the explicit cast is required because `Strs *` and
    // `PyObject *` are incompatible pointer types in C (implicit assignment is a
    // constraint violation), unlike the view variants that store a `PyObject *` parent.
    *memory_owner = (PyObject *)strs;
}
/**
 *  @brief  Resolves the @p i -th string of a 64-bit tape view into a pointer/length pair.
 *          Apache Arrow convention: string `i` spans `offsets[i]` to `offsets[i + 1]`.
 *          The bytes are owned by the view's parent object, not by @p strs itself.
 */
void str_at_offset_u64_tape_view(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                                 PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Unused; kept to match the shared `get_string_at_offset_t` signature
    sz_u64_t begin = strs->data.u64_tape_view.offsets[i];
    sz_u64_t end = strs->data.u64_tape_view.offsets[i + 1];
    *length = end - begin;
    *start = strs->data.u64_tape_view.data + begin;
    *memory_owner = strs->data.u64_tape_view.parent; // Views borrow from their parent
}
/**
 *  @brief  Resolves the @p i -th string of a fragmented layout: each entry is an
 *          independent (start, length) span borrowed from the collection's parent object.
 */
void str_at_offset_fragmented(Strs *strs, Py_ssize_t i, Py_ssize_t count, //
                              PyObject **memory_owner, sz_cptr_t *start, sz_size_t *length) {
    (void)count; // Unused; kept to match the shared `get_string_at_offset_t` signature
    *memory_owner = strs->data.fragmented.parent;
    *length = strs->data.fragmented.spans[i].length;
    *start = strs->data.fragmented.spans[i].start;
}
/**
 *  @brief  Maps a `Strs` layout to its string-accessor function.
 *  @return The matching `get_string_at_offset_t`, or `NULL` with a Python
 *          `TypeError` raised when the layout is not recognized.
 */
get_string_at_offset_t str_at_offset_getter(Strs *strs) {
    if (strs->layout == STRS_U32_TAPE) return str_at_offset_u32_tape;
    if (strs->layout == STRS_U32_TAPE_VIEW) return str_at_offset_u32_tape_view;
    if (strs->layout == STRS_U64_TAPE) return str_at_offset_u64_tape;
    if (strs->layout == STRS_U64_TAPE_VIEW) return str_at_offset_u64_tape_view;
    if (strs->layout == STRS_FRAGMENTED) return str_at_offset_fragmented;
    // Unsupported layout
    PyErr_SetString(PyExc_TypeError, "Unsupported layout for conversion");
    return NULL;
}
#pragma endregion
#pragma region Memory Mapping File
static void File_dealloc(File *self) {
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
if (self->memory.start) {