forked from IronLanguages/ironpython3
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPythonEncoding.cs
More file actions
1575 lines (1283 loc) · 73.1 KB
/
PythonEncoding.cs
File metadata and controls
1575 lines (1283 loc) · 73.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information.
#nullable enable
using IronPython.Runtime.Exceptions;
using IronPython.Runtime.Operations;
using Microsoft.Scripting.Runtime;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
namespace IronPython.Runtime {
/// <summary>
/// Wrapper class for any well-behaved <see cref="System.Text.Encoding"/> (like any encodings provided by .NET)
/// that allows for encoding/decoding fallbacks operating on byte level.
/// </summary>
/// <remarks>
/// Python encoding/decoding fallbacks (called "error handlers" in Python documentation) can deliver
/// fallback values as strings as well as bytes. .NET fallback mechanism only allows for characters as fallbacks,
/// and only those characters that are deemed valid by a given encoding. So, for instance, lone surrogate escapes,
/// produced by Python's 'surrogateescape' error handler, are not allowed in the standard .NET fallback protocol.
///
/// This class extends the standard .NET fallback protocol to allow fallbacks to provide values normally
/// not allowed by .NET but allowed by Python. It also allows the fallbacks to provide fallback values
/// as a sequence of bytes.
/// <br/>
///
/// Note: Currently, it is not possible to set the fallbacks through assignment to
/// <see cref="EncoderFallback"/> or <see cref="DecoderFallback"/>; the fallbacks have to be provided
/// to the constructor. Also, the fallbacks have to be of type <see cref="PythonEncoderFallback"/>
/// and <see cref="PythonDecoderFallback"/>, which implement the extended fallback protocol.
/// </remarks>
internal class PythonEncoding : Encoding {
// The following two must be different from each other and be pass-through characters for UTF-7
private const char Pass1Marker = '?';
private const char Pass2Marker = '-';
public int CharacterWidth { get; }
public bool IsBigEndian { get; } // meaningful only for wide-char encodings
private Encoding Pass1Encoding { get; }
private Encoding Pass2Encoding { get; }
private PythonEncoder? _residentEncoder;
private PythonDecoder? _residentDecoder;
/// <summary>
/// Creates a wrapper around <paramref name="encoding"/> that uses the given Python-style fallbacks.
/// </summary>
/// <param name="encoding">The encoding to wrap. Clones of it are used for the two internal passes.</param>
/// <param name="encoderFallback">Encoder fallback for pass 1; a clone of it is used for pass 2.</param>
/// <param name="decoderFallback">Decoder fallback for pass 1; a clone of it is used for pass 2.</param>
/// <exception cref="ArgumentNullException">Any of the arguments is null.</exception>
/// <remarks>
/// The fallback instances passed in are modified: their <c>Encoding</c> and <c>IsPass1</c> properties are set here.
/// </remarks>
public PythonEncoding(Encoding encoding, PythonEncoderFallback encoderFallback, PythonDecoderFallback decoderFallback)
: base(0, encoderFallback, decoderFallback) {
if (encoding is null) throw new ArgumentNullException(nameof(encoding));
if (encoderFallback is null) throw new ArgumentNullException(nameof(encoderFallback));
if (decoderFallback is null) throw new ArgumentNullException(nameof(decoderFallback));
// Probe the encoding with a single marker character to learn how many bytes one character occupies
// and, for multi-byte results, which end the significant byte is at.
try {
unsafe {
char* markerSpan = stackalloc char[] { Pass1Marker };
CharacterWidth = encoding.GetByteCount(markerSpan, 1);
// Only widths of 1 to 4 bytes are probed further; for other widths IsBigEndian keeps its default value
if (1 <= CharacterWidth && CharacterWidth <= 4) {
byte* markerBytes = stackalloc byte[CharacterWidth];
encoding.GetBytes(markerSpan, 1, markerBytes, CharacterWidth);
// A zero first byte means the non-zero byte of the marker comes later, i.e. big-endian layout
IsBigEndian = markerBytes[0] == 0;
}
}
} catch (EncoderFallbackException) {
// Q: What encoding cannot encode '?' A: Incomplete charmap.
CharacterWidth = 1;
IsBigEndian = false;
}
// set up pass 1 Encoding, using provided fallback instances
encoderFallback.Encoding = decoderFallback.Encoding = this;
encoderFallback.IsPass1 = decoderFallback.IsPass1 = true;
Pass1Encoding = (Encoding)encoding.Clone();
Pass1Encoding.EncoderFallback = encoderFallback;
Pass1Encoding.DecoderFallback = decoderFallback;
// set up pass 2 Encoding, using clones of provided fallback instances
// (the clones are taken after Encoding was assigned above; IsPass1 is then flipped on the clones only)
encoderFallback = (PythonEncoderFallback)encoderFallback.Clone();
decoderFallback = (PythonDecoderFallback)decoderFallback.Clone();
encoderFallback.IsPass1 = decoderFallback.IsPass1 = false;
Pass2Encoding = (Encoding)encoding.Clone();
Pass2Encoding.EncoderFallback = encoderFallback;
Pass2Encoding.DecoderFallback = decoderFallback;
}
/// <summary>
/// Makes sure <see cref="_residentEncoder"/> is available in a freshly reset state:
/// an existing instance is reset, otherwise a new one is created.
/// </summary>
[MemberNotNull(nameof(_residentEncoder))]
private void PrepareResidentEncoder() {
    if (_residentEncoder is not null) {
        _residentEncoder.Reset();
        return;
    }
    _residentEncoder = new PythonEncoder(this);
}
/// <summary>
/// Makes sure <see cref="_residentDecoder"/> is available in a freshly reset state:
/// an existing instance is reset, otherwise a new one is created.
/// </summary>
[MemberNotNull(nameof(_residentDecoder))]
private void PrepareResidentDecoder() {
    if (_residentDecoder is not null) {
        _residentDecoder.Reset();
        return;
    }
    _residentDecoder = new PythonDecoder(this);
}
/// <summary>
/// Mandatory override: counts the bytes needed to encode a range of a char array.
/// The work is delegated to the (reset) resident encoder with flushing enabled.
/// </summary>
public override int GetByteCount(char[] chars, int index, int count) {
    PrepareResidentEncoder();
    PythonEncoder encoder = _residentEncoder;
    return encoder.GetByteCount(chars, index, count, flush: true);
}
/// <summary>
/// Pointer-based byte count ("NLS workhorse"); delegates to the (reset) resident encoder with flushing enabled.
/// </summary>
public override unsafe int GetByteCount(char* chars, int count) {
    PrepareResidentEncoder();
    PythonEncoder encoder = _residentEncoder;
    return encoder.GetByteCount(chars, count, flush: true);
}
/// <summary>
/// String-based byte count, used by IronPython; delegates to the (reset) resident encoder with flushing enabled.
/// </summary>
public override int GetByteCount(string s) {
    PrepareResidentEncoder();
    PythonEncoder encoder = _residentEncoder;
    return encoder.GetByteCount(s, flush: true);
}
/// <summary>
/// Mandatory override: encodes a range of a char array into <paramref name="bytes"/> starting at <paramref name="byteIndex"/>.
/// The work is delegated to the (reset) resident encoder with flushing enabled.
/// </summary>
public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex) {
    PrepareResidentEncoder();
    PythonEncoder encoder = _residentEncoder;
    return encoder.GetBytes(chars, charIndex, charCount, bytes, byteIndex, flush: true);
}
/// <summary>
/// Pointer-based encoding ("NLS workhorse"); delegates to the (reset) resident encoder with flushing enabled.
/// </summary>
public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) {
    PrepareResidentEncoder();
    PythonEncoder encoder = _residentEncoder;
    return encoder.GetBytes(chars, charCount, bytes, byteCount, flush: true);
}
/// <summary>
/// String-based encoding, used by IronPython; delegates to the (reset) resident encoder with flushing enabled.
/// </summary>
public override int GetBytes(string s, int charIndex, int charCount, byte[] bytes, int byteIndex) {
    PrepareResidentEncoder();
    PythonEncoder encoder = _residentEncoder;
    return encoder.GetBytes(s, charIndex, charCount, bytes, byteIndex, flush: true);
}
/// <summary>
/// Mandatory override: counts the chars produced by decoding a range of a byte array.
/// The work is delegated to the (reset) resident decoder with flushing enabled.
/// </summary>
public override int GetCharCount(byte[] bytes, int index, int count) {
    PrepareResidentDecoder();
    PythonDecoder decoder = _residentDecoder;
    return decoder.GetCharCount(bytes, index, count, flush: true);
}
/// <summary>
/// Pointer-based char count ("NLS workhorse"); delegates to the (reset) resident decoder with flushing enabled.
/// </summary>
public override unsafe int GetCharCount(byte* bytes, int count) {
    PrepareResidentDecoder();
    PythonDecoder decoder = _residentDecoder;
    return decoder.GetCharCount(bytes, count, flush: true);
}
/// <summary>
/// Mandatory override: decodes a range of a byte array into <paramref name="chars"/> starting at <paramref name="charIndex"/>.
/// The work is delegated to the (reset) resident decoder with flushing enabled.
/// </summary>
public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex) {
    PrepareResidentDecoder();
    PythonDecoder decoder = _residentDecoder;
    return decoder.GetChars(bytes, byteIndex, byteCount, chars, charIndex, flush: true);
}
/// <summary>
/// Decodes a range of a Python buffer into a string, used by IronPython;
/// delegates to the (reset) resident decoder.
/// </summary>
public string GetString(IPythonBuffer input, int index, int count) {
    PrepareResidentDecoder();
    PythonDecoder decoder = _residentDecoder;
    return decoder.GetString(input, index, count);
}
/// <summary>Maximum byte count for <paramref name="charCount"/> chars, as reported by the pass 1 encoding.</summary>
public override int GetMaxByteCount(int charCount) {
    return Pass1Encoding.GetMaxByteCount(charCount);
}

/// <summary>Maximum char count for <paramref name="byteCount"/> bytes, as reported by the pass 1 encoding.</summary>
public override int GetMaxCharCount(int byteCount) {
    return Pass1Encoding.GetMaxCharCount(byteCount);
}

/// <summary>Creates a new, independent <see cref="PythonEncoder"/> bound to this encoding.</summary>
public override Encoder GetEncoder() {
    return new PythonEncoder(this);
}

/// <summary>Creates a new, independent <see cref="PythonDecoder"/> bound to this encoding.</summary>
public override Decoder GetDecoder() {
    return new PythonDecoder(this);
}
// Identity and metadata members: forwarded to the pass 1 encoding unless a fixed value is returned.
public override int CodePage { get => Pass1Encoding.CodePage; }
public override int WindowsCodePage { get => Pass1Encoding.WindowsCodePage; }
public override string EncodingName { get => StringOps.GetEncodingName(Pass1Encoding, normalize: false); }
public override string HeaderName { get => Pass1Encoding.HeaderName; }
public override string BodyName { get => Pass1Encoding.BodyName; }
public override string WebName { get => Pass1Encoding.WebName; }

// Browser and mail/news capabilities are always reported as unavailable.
public override bool IsBrowserDisplay { get => false; }
public override bool IsBrowserSave { get => false; }
public override bool IsMailNewsDisplay { get => false; }
public override bool IsMailNewsSave { get => false; }

public override bool IsSingleByte { get => Pass1Encoding.IsSingleByte; }

public override int GetHashCode() {
    return Pass1Encoding.GetHashCode();
}

public override byte[] GetPreamble() {
    return Pass1Encoding.GetPreamble();
}

// Never claims to be normalized, whatever the normalization form.
public override bool IsAlwaysNormalized(NormalizationForm form) {
    return false;
}
// Cached result of the probe below; null until the property is first read.
private static bool? _hasBugCorefx29898;

/// <summary>
/// Tells whether the runtime reports a negative <see cref="DecoderFallbackException.Index"/>
/// when a strict UTF-8 decoder is asked to count the chars of a single invalid byte (0xFF).
/// The probe runs once; the result is cached.
/// </summary>
/// <remarks>NOTE(review): the name presumably refers to dotnet/corefx issue 29898 — confirm.</remarks>
public static bool HasBugCorefx29898 {
    get {
        if (_hasBugCorefx29898.HasValue) {
            return _hasBugCorefx29898.Value;
        }

        bool detected;
        try {
            Encoding strictUtf8 = new UTF8Encoding(false, throwOnInvalidBytes: true);
            strictUtf8.GetCharCount(new byte[] { 255 });
            // No exception at all: the behaviour being probed for is not present
            detected = false;
        } catch (DecoderFallbackException ex) {
            detected = ex.Index < 0;
        }

        _hasBugCorefx29898 = detected;
        return detected;
    }
}
/// <summary>
/// Number of UTF-16 code units for a code point: 2 if <paramref name="rune"/> is above U+FFFF, otherwise 1.
/// </summary>
internal static int GetUtf16SequenceLength(int rune) {
    if (rune <= char.MaxValue) {
        return 1;
    }
    return 2;
}
/// <summary>
/// An immutable integer that carries a second, "initial" value alongside its current value.
/// Converting from <see cref="int"/> sets both values; the <c>&lt;&lt;</c> and <c>+</c> operators
/// change only the current value and carry the initial value over.
/// </summary>
private readonly struct MemInt {
    private readonly int _value;

    private MemInt(int value, int initial) {
        _value = value;
        Initial = initial;
    }

    /// <summary>The value remembered from the last conversion from <see cref="int"/>.</summary>
    public int Initial { get; }

    /// <summary>Reads the current value.</summary>
    public static implicit operator int(MemInt mi) {
        return mi._value;
    }

    /// <summary>Starts afresh: current and initial values are both <paramref name="value"/>.</summary>
    public static implicit operator MemInt(int value) {
        return new MemInt(value, initial: value);
    }

    /// <summary>Assignment preserving the initial value</summary>
    public static MemInt operator <<(MemInt mi, int value) {
        return new MemInt(value, mi.Initial);
    }

    /// <summary>Addition preserving the initial value</summary>
    public static MemInt operator +(MemInt mi, int value) {
        return new MemInt(mi._value + value, mi.Initial);
    }
}
/// <summary>
/// Encoder that forwards all work to an encoder obtained from a private clone of the given encoding.
/// Before the inner encoder is created, the clone's encoder fallback is replaced with a proxy that
/// always hands out one and the same, pre-created fallback buffer, and that proxy is also
/// installed as this encoder's own <see cref="Encoder.Fallback"/>.
/// </summary>
private class ProxyEncoder : Encoder {
    private Encoding _encoding;
    private readonly Encoder _encoder;

    public ProxyEncoder(Encoding encoding) {
        Encoding clone = (Encoding)encoding.Clone();
        EncoderFallback original = clone.EncoderFallback;
        clone.EncoderFallback = new ProxyEncoderFallback(original.CreateFallbackBuffer(), original.MaxCharCount);

        _encoding = clone;
        _encoder = clone.GetEncoder();

        // Same fallback instance on the inner encoder first, then on this proxy
        EncoderFallback proxy = clone.EncoderFallback;
        _encoder.Fallback = proxy;
        Fallback = proxy;
    }

    public override int GetByteCount(char[] chars, int index, int count, bool flush) {
        return _encoder.GetByteCount(chars, index, count, flush);
    }

    public override unsafe int GetByteCount(char* chars, int count, bool flush) {
        return _encoder.GetByteCount(chars, count, flush);
    }

    public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) {
        return _encoder.GetBytes(chars, charIndex, charCount, bytes, byteIndex, flush);
    }

    public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush) {
        return _encoder.GetBytes(chars, charCount, bytes, byteCount, flush);
    }

    public override void Reset() {
        _encoder.Reset();
    }

    /// <summary>
    /// Fallback whose <see cref="CreateFallbackBuffer"/> returns the single buffer it was constructed with.
    /// </summary>
    private class ProxyEncoderFallback : EncoderFallback {
        private readonly EncoderFallbackBuffer _buffer;
        private readonly int _maxCharCount;

        public ProxyEncoderFallback(EncoderFallbackBuffer buffer, int maxCharCount) {
            _buffer = buffer;
            _maxCharCount = maxCharCount;
        }

        public override int MaxCharCount {
            get { return _maxCharCount; }
        }

        public override EncoderFallbackBuffer CreateFallbackBuffer() {
            return _buffer;
        }
    }
}
/// <summary>
/// Encoder for <see cref="PythonEncoding"/>. Encoding is done with the pass 1 encoding; when the fallback
/// delivered bytes (represented by marker characters in the output), a second run with the pass 2 encoding
/// is used to locate the markers and substitute the actual fallback bytes.
/// </summary>
public class PythonEncoder : Encoder {
    private readonly PythonEncoding _parentEncoding;
    private readonly Encoder _pass1encoder;
    private Encoder? _pass2encoder; // created lazily, only when fallback bytes have to be restored

    /// <summary>Creates an encoder bound to <paramref name="parentEncoding"/>; only the pass 1 encoder is created eagerly.</summary>
    public PythonEncoder(PythonEncoding parentEncoding) {
        _parentEncoding = parentEncoding;
        _pass1encoder = GetEncoder(parentEncoding.Pass1Encoding);
    }

    /// <summary>
    /// Obtains an encoder from <paramref name="encoding"/>, wrapping it in a <see cref="ProxyEncoder"/> if the encoder
    /// does not expose the encoding's <see cref="PythonEncoderFallback"/> and is not an <c>EncoderNLS</c>.
    /// </summary>
    private static Encoder GetEncoder(Encoding encoding) {
        Encoder encoder = encoding.GetEncoder();
        if (encoder.Fallback is not PythonEncoderFallback && encoding.EncoderFallback is PythonEncoderFallback) {
            // Non-conformant Encoder implementation, the challenge is to get to the fallback buffer used by such encoder.

            // Possibility 1: _pass1encoder is EncoderNLS (or its subclass).
            // This weirdo (.NET Core only) does not use Fallback and FallbackBuffer properties from its Encoder base class;
            // it redefines them as new properties and uses them instead.
            // Although the new FallbackBuffer is public, it is not easily accessible because the EncoderNLS class is internal.
            // One way of accessing it is by reflection. This will be handled by GetPythonEncoderFallbackBuffer()
            for (Type? et = encoder.GetType(); et is not null && et.FullName != "System.Text.Encoder"; et = et.BaseType) {
                if (et.FullName == "System.Text.EncoderNLS") return encoder;
            }

            // Possibility 2: _pass1encoder is DefaultEncoder or another stateless encoder;
            // This makes sense only if the encoding process of the given encoding is stateless too.
            // This should not be common: because .NET strings are UTF-16, it is practically impossible to have a universally-applicable stateless encoding.
            // However, such encoding may still be useful in some specific cases, like non-incremental encoding
            // or if the input is guaranteed to never contain surrogate pairs.
            // We use ProxyEncoder to access EncoderFallbackBuffer used by such stateless encoder.
            return new ProxyEncoder(encoding);

            // Possibility 3: Some 3rd party non-compliant encoder. Too bad...
        }
        return encoder;
    }

    /// <summary>
    /// Returns the <see cref="PythonEncoderFallbackBuffer"/> used by <paramref name="enc"/>,
    /// or null if <paramref name="enc"/> is null or uses a different kind of fallback buffer.
    /// </summary>
    private static PythonEncoderFallbackBuffer? GetPythonEncoderFallbackBuffer(Encoder? enc) {
        if (enc is null) return null;

        // This should be as simple as enc.FallbackBuffer as PythonEncoderFallbackBuffer
        // but it requires a workaround for a design oddity in System.Text.EncoderNLS on .NET Core
        var fbuf = enc.FallbackBuffer as PythonEncoderFallbackBuffer;
#if NETCOREAPP || NETSTANDARD
        fbuf ??= enc.GetType().GetProperty(nameof(enc.FallbackBuffer))?.GetValue(enc) as PythonEncoderFallbackBuffer;
#endif
        return fbuf;
    }

    // mandatory override of an abstract method
    /// <summary>Counts the bytes needed to encode <paramref name="count"/> chars starting at <paramref name="index"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="chars"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The index/count pair is negative or does not fit in <paramref name="chars"/>.</exception>
    public override int GetByteCount(char[] chars, int index, int count, bool flush) {
        if (chars is null) throw new ArgumentNullException(nameof(chars));
        if (index < 0 || count < 0) throw new ArgumentOutOfRangeException(index < 0 ? nameof(index) : nameof(count));
        if (chars.Length - index < count) throw new ArgumentOutOfRangeException(nameof(chars));

        return this.GetByteCount(chars.AsSpan(index, count), flush);
    }

    // optional override of a virtual method but preferable to the one from the base class
    /// <summary>Counts the bytes needed to encode <paramref name="count"/> chars starting at <paramref name="chars"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="chars"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    public override unsafe int GetByteCount(char* chars, int count, bool flush) {
        if (chars is null) throw new ArgumentNullException(nameof(chars));
        if (count < 0) throw new ArgumentOutOfRangeException(nameof(count));

        var fbuf1 = GetPythonEncoderFallbackBuffer(_pass1encoder);
        // The string copy of the input is only needed by the Python fallback buffer;
        // the null-conditional call skips creating it when there is no such buffer.
        fbuf1?.PrepareIncrement(new string(chars, 0, count), forEncoding: false);
        int numBytes = _pass1encoder.GetByteCount(chars, count, flush);
        fbuf1?.FinalizeIncrement(count, flush);
        return numBytes;
    }

    // not declared in the base class nevertheless still useful in IronPython context, most efficient if input is a string
    /// <summary>Counts the bytes needed to encode the whole string <paramref name="s"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    public int GetByteCount(string s, bool flush) {
        if (s is null) throw new ArgumentNullException(nameof(s));

        var fbuf1 = GetPythonEncoderFallbackBuffer(_pass1encoder);
        fbuf1?.PrepareIncrement(s, forEncoding: false);
        int numBytes = _pass1encoder.GetByteCount(s.AsSpan(), flush);
        fbuf1?.FinalizeIncrement(s.Length, flush);
        return numBytes;
    }

    // mandatory override
    /// <summary>Encodes a range of <paramref name="chars"/> into <paramref name="bytes"/> starting at <paramref name="byteIndex"/>.</summary>
    /// <returns>The number of bytes written.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="chars"/> or <paramref name="bytes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">An index or count is negative or out of bounds.</exception>
    public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) {
        if (chars is null) throw new ArgumentNullException(nameof(chars));
        if (bytes is null) throw new ArgumentNullException(nameof(bytes));
        if (charIndex < 0) throw new ArgumentOutOfRangeException(nameof(charIndex));
        if (charCount < 0) throw new ArgumentOutOfRangeException(nameof(charCount));
        if (chars.Length - charIndex < charCount) throw new ArgumentOutOfRangeException(nameof(chars));
        if (byteIndex < 0 || byteIndex > bytes.Length) throw new ArgumentOutOfRangeException(nameof(byteIndex));

        var s = new string(chars, charIndex, charCount);
        return GetBytes(s, bytes.AsSpan(byteIndex), flush);
    }

    // NLS workhorse
    /// <summary>Encodes <paramref name="charCount"/> chars at <paramref name="chars"/> into the buffer at <paramref name="bytes"/>.</summary>
    /// <returns>The number of bytes written.</returns>
    /// <exception cref="ArgumentNullException">A pointer is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">A count is negative.</exception>
    public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush) {
        if (chars is null) throw new ArgumentNullException(nameof(chars));
        if (bytes is null) throw new ArgumentNullException(nameof(bytes));
        if (charCount < 0) throw new ArgumentOutOfRangeException(nameof(charCount));
        if (byteCount < 0) throw new ArgumentOutOfRangeException(nameof(byteCount));

        var s = new string(chars, 0, charCount);
        return GetBytes(s, new Span<byte>(bytes, byteCount), flush);
    }

    // used by IronPython
    /// <summary>Encodes a range of the string <paramref name="s"/> into <paramref name="bytes"/> starting at <paramref name="byteIndex"/>.</summary>
    /// <returns>The number of bytes written.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> or <paramref name="bytes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">An index or count is negative or out of bounds.</exception>
    public int GetBytes(string s, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) {
        if (s is null) throw new ArgumentNullException(nameof(s));
        if (bytes is null) throw new ArgumentNullException(nameof(bytes));
        if (charIndex < 0) throw new ArgumentOutOfRangeException(nameof(charIndex));
        if (charCount < 0) throw new ArgumentOutOfRangeException(nameof(charCount));
        if (s.Length - charIndex < charCount) throw new ArgumentOutOfRangeException(nameof(s));
        if (byteIndex < 0 || byteIndex > bytes.Length) throw new ArgumentOutOfRangeException(nameof(byteIndex));

        // Avoid a copy when the whole string is to be encoded
        if (charIndex != 0 || charCount != s.Length) {
            s = s.Substring(charIndex, charCount);
        }
        return GetBytes(s, bytes.AsSpan(byteIndex), flush);
    }

    /// <summary>
    /// Common implementation of all GetBytes overloads: encodes <paramref name="data"/> with the pass 1 encoder and,
    /// if needed, runs the pass 2 encoder over the same input to restore the fallback bytes in place.
    /// </summary>
    /// <returns>The number of bytes written by the pass 1 encoder.</returns>
    private int GetBytes(string data, Span<byte> bytes, bool flush) {
        var fbuf1 = GetPythonEncoderFallbackBuffer(_pass1encoder);
        var fbuf2 = GetPythonEncoderFallbackBuffer(_pass2encoder);
        fbuf1?.PrepareIncrement(data, forEncoding: true);

        var chars = data.AsSpan();
        int written = _pass1encoder.GetBytes(chars, bytes, flush);

        // If the final increment and there were no more fallback bytes, the job is done
        if (fbuf1 is null || (flush && fbuf1.IsEmpty && (fbuf2?.IsEmpty ?? true))) {
            fbuf1?.Reset();
            fbuf2?.Reset();
            return written;
        }

        // Lazy creation of _pass2encoder
        if (_pass2encoder is null) {
            _pass2encoder = GetEncoder(_parentEncoding.Pass2Encoding);
            fbuf2 = GetPythonEncoderFallbackBuffer(_pass2encoder);
        }
        // fbuf2 is not null here because fbuf1 is not null and Pass1Encoding and Pass2Encoding are identical clones
        fbuf2!.PrepareIncrement(data, forEncoding: true);

        // Restore original fallback bytes
        var bytes2 = new byte[written];
        _pass2encoder.GetBytes(chars, bytes2, flush);

        // The two passes use different marker characters, so the outputs differ exactly where a marker was emitted
        int cwidth = _parentEncoding.CharacterWidth;
        for (int i = 0; i < written; i++) {
            if (bytes[i] != bytes2[i]) {
                // Overwrite the whole character that contains the differing byte
                int ofs = (i / cwidth) * cwidth;
                for (int p = 0; p < cwidth; p++) {
                    bytes[ofs++] = fbuf2.GetFallbackByte();
                    fbuf1.GetFallbackByte(); // count the byte as consumed in fbuf1 too
                }
                // Continue after the character just replaced (the loop increment accounts for one byte)
                int skip = ofs - i - 1;
                i += skip;
            }
        }

        // Check if all fallback bytes are restored properly
        fbuf1.FinalizeIncrement(data.Length, flush);
        fbuf2.FinalizeIncrement(data.Length, flush);

        return written;
    }

    /// <summary>Resets the pass 1 encoder and, if it has been created, the pass 2 encoder.</summary>
    public override void Reset() {
        _pass1encoder.Reset();
        _pass2encoder?.Reset();
    }
}
/// <summary>
/// Base class for encoder fallbacks that take part in the two-pass protocol of <see cref="PythonEncoding"/>.
/// Instances know the <see cref="PythonEncoding"/> they serve and whether they belong to pass 1, and can be cloned.
/// </summary>
public abstract class PythonEncoderFallback : EncoderFallback, ICloneable {
    private PythonEncoding? _encoding;

    /// <summary>True when this instance serves the first pass.</summary>
    public bool IsPass1 { get; set; }

    /// <summary>The owning encoding; reading it before it has been assigned throws.</summary>
    public PythonEncoding Encoding {
        get {
            if (_encoding is null) {
                throw new NullReferenceException($"Property \"{nameof(Encoding)}\" not initialized before use.");
            }
            return _encoding;
        }
        set {
            _encoding = value;
        }
    }

    /// <summary>Shallow copy of this fallback.</summary>
    public virtual object Clone() {
        return MemberwiseClone();
    }
}
/// <summary>
/// Base class for encoder fallback buffers that can deliver either replacement characters or raw replacement bytes.
/// Replacement bytes are reported to the encoder as marker characters (one marker per
/// <see cref="EncodingCharWidth"/> bytes); the bytes themselves are queued (pass 2 only) and handed out
/// through <see cref="GetFallbackByte"/>.
/// </summary>
protected abstract class PythonEncoderFallbackBuffer : EncoderFallbackBuffer {
private readonly char _marker;
private readonly Queue<byte>? _allFallbackBytes; // collects all fallback bytes for the whole pass, only used during actual encoding, pass 2
private int _fbkByteCnt; // only used during actual encoding; proxy for _allFallbackBytes.Count but valid in pass 1 too
private MemInt _byteCnt; // counts unreported bytes in the buffer from the last fallback; used during both counting and encoding, but counts separately
private ReadOnlyMemory<char> _fallbackChars; // fallback chars (if any) from the last fallback
private int _charCnt; // counts unreported chars in the buffer from the last fallback
private int _fbkNumChars; // number of all (virtual) chars in the buffer from the last fallback; proxy for _fallbackChars.Length but valid for fallback bytes too
// for error reporting
private int _lastRuneUnknown; // in UTF-32; "rune" is an alias for "Unicode codepoint"
private MemInt _lastIndexUnknown;
/// <summary>
/// Sets up the buffer for pass 1 or pass 2; the pass decides the marker character and whether fallback bytes are queued.
/// </summary>
public PythonEncoderFallbackBuffer(bool isPass1, PythonEncoding encoding) {
_marker = isPass1 ? Pass1Marker : Pass2Marker;
_allFallbackBytes = isPass1 ? null : new Queue<byte>();
this.EncodingCharWidth = encoding.CharacterWidth;
this.CodePage = encoding.CodePage;
_lastIndexUnknown = -1;
}
/// <summary>True if the current increment is an actual encoding, false if it is only a byte count.</summary>
protected bool EncodingMode { get; private set; }
protected int EncodingCharWidth { get; }
protected int CodePage { get; }
/// <summary>Input of the current increment; null outside of an increment.</summary>
protected string? Data { get; private set; }
/// <summary>
/// Starts a new increment over <paramref name="data"/>.
/// </summary>
/// <remarks>
/// What happens to the carried-over counters depends on the mode of the PREVIOUS increment:
/// after an encoding increment the current values become the new initial values;
/// after a counting increment the values are rolled back to their initial values.
/// </remarks>
public virtual void PrepareIncrement(string data, bool forEncoding) {
Data = data;
if (EncodingMode) {
// int -> MemInt conversion: current value also becomes the initial value
_byteCnt = (int)_byteCnt;
_lastIndexUnknown = (int)_lastIndexUnknown;
} else {
// discard changes made since the initial value was last set
_byteCnt = _byteCnt.Initial;
_lastIndexUnknown = _lastIndexUnknown.Initial;
}
EncodingMode = forEncoding;
}
/// <summary>
/// Produces the replacement for the code point <paramref name="runeUnknown"/> found at <paramref name="index"/>:
/// either chars (Item1) or bytes (Item2). Returning both non-empty is rejected by the caller in this class.
/// </summary>
public abstract Tuple<ReadOnlyMemory<char>, ReadOnlyMemory<byte>> GetFallbackCharsOrBytes(int runeUnknown, int index);
public override bool Fallback(char charUnknown, int index)
=> FallbackImpl(charUnknown, index);
public override bool Fallback(char charUnknownHigh, char charUnknownLow, int index)
=> FallbackImpl(char.ConvertToUtf32(charUnknownHigh, charUnknownLow), index);
/// <summary>
/// Common implementation of both Fallback overloads.
/// </summary>
/// <returns>True if at least one char (real or marker) is now available through <see cref="GetNextChar"/>.</returns>
private bool FallbackImpl(int runeUnknown, int index) {
if (_charCnt > 0) {
// There are some unread characters from the previous fallback
// InvalidOperationException would be a better choice, but ArgumentException is what .NET fallback buffers throw
if (_lastRuneUnknown <= char.MaxValue) {
throw new ArgumentException($"Recursive fallback not allowed for character '\\u{_lastRuneUnknown:X4}'");
} else {
throw new ArgumentException($"Recursive fallback not allowed for character '\\U{_lastRuneUnknown:X8}'");
}
}
// The design limitation for wide-char encodings is that
// fallback bytes must be char-aligned (to fill in marker chars)
if (_byteCnt > 0) {
// bytes are not char-aligned yet, so the fallback chars must be consecutive
if (index != _lastIndexUnknown + GetUtf16SequenceLength(_lastRuneUnknown)) {
throw PythonOps.UnicodeEncodeError("incomplete input sequence", _lastRuneUnknown, _lastIndexUnknown);
}
}
var fallbackData = GetFallbackCharsOrBytes(runeUnknown, index);
var newFallbackChars = fallbackData.Item1;
var newFallbackBytes = fallbackData.Item2;
if (newFallbackBytes.IsEmpty) {
if (_byteCnt > 0 && !newFallbackChars.IsEmpty) {
// bytes are not char-aligned yet, so the fallback should have produced remaining bytes, not chars
throw PythonOps.UnicodeEncodeError("incomplete fallback sequence", _lastRuneUnknown, _lastIndexUnknown);
}
// use fallback chars, may be none
_fallbackChars = newFallbackChars;
_charCnt = _fbkNumChars = _fallbackChars.Length;
} else {
if (!newFallbackChars.IsEmpty) {
throw new NotSupportedException("Encoding error handler may produce either chars or bytes, not both at the same time.");
}
// use fallback bytes
if (EncodingMode) {
if (_allFallbackBytes is not null) { // pass 2
foreach (byte b in newFallbackBytes.Span) {
_allFallbackBytes.Enqueue(b);
}
}
_fbkByteCnt += newFallbackBytes.Length;
}
_fallbackChars = default; // will report _marker instead
_byteCnt += newFallbackBytes.Length;
// one marker char per complete group of EncodingCharWidth bytes accumulated so far
_charCnt = _fbkNumChars = _byteCnt / EncodingCharWidth;
}
_lastRuneUnknown = runeUnknown;
// MemInt "assignment": sets the current value but keeps the initial value
_lastIndexUnknown <<= index;
return _charCnt > 0;
}
public override int Remaining => _charCnt;
/// <summary>
/// Returns the next pending char: a fallback char if the last fallback produced chars,
/// otherwise a marker standing in for <see cref="EncodingCharWidth"/> fallback bytes; '\0' if nothing is pending.
/// </summary>
public override char GetNextChar() {
if (_charCnt == 0) return '\0';
if (_fallbackChars.IsEmpty) {
_charCnt--;
// MemInt has no subtraction operator, hence the addition of a negative value
_byteCnt += -EncodingCharWidth;
return _marker;
} else {
return _fallbackChars.Span[_fallbackChars.Length - _charCnt--];
}
}
/// <summary>Undoes one <see cref="GetNextChar"/>; returns false if no char has been read from the last fallback.</summary>
public override bool MovePrevious() {
if (_charCnt == _fbkNumChars) return false;
if (_fallbackChars.IsEmpty) {
_byteCnt += EncodingCharWidth;
}
_charCnt++;
return true;
}
/// <summary>
/// Counts one fallback byte as consumed and returns it from the queue;
/// returns 0 if this buffer keeps no queue (pass 1).
/// </summary>
public byte GetFallbackByte() {
_fbkByteCnt--;
return _allFallbackBytes?.Dequeue() ?? 0;
}
/// <summary>
/// True if no chars and no bytes are pending; the count of not yet consumed fallback bytes is only considered in encoding mode.
/// </summary>
public virtual bool IsEmpty => _charCnt == 0 && (_fbkByteCnt == 0 || !EncodingMode) && _byteCnt == 0;
/// <summary>
/// Ends the current increment, whose input was <paramref name="endIndex"/> chars long.
/// Throws a UnicodeEncodeError if data is still pending on flush, or if bytes that do not yet fill a whole
/// character are pending and the character that caused them was not the last one of the input.
/// </summary>
public virtual void FinalizeIncrement(int endIndex, bool flush) {
if (flush && !IsEmpty || _byteCnt > 0 && endIndex != _lastIndexUnknown + GetUtf16SequenceLength(_lastRuneUnknown)) {
throw PythonOps.UnicodeEncodeError($"incomplete input sequence", _lastRuneUnknown, _lastIndexUnknown);
}
Data = null; // release input data for possible collection
_lastIndexUnknown += -endIndex; // prep. for next incremental encoding step
}
/// <summary>Clears all pending data and error-reporting state.</summary>
public override void Reset() {
_allFallbackBytes?.Clear();
_fbkByteCnt = 0;
_byteCnt = 0;
_charCnt = _fbkNumChars = 0;
_fallbackChars = default;
_lastRuneUnknown = '\0';
_lastIndexUnknown = -1;
Data = null;
}
}
/// <summary>
/// Decoder that forwards all work to a decoder obtained from a private clone of the given encoding.
/// Before the inner decoder is created, the clone's decoder fallback is replaced with a proxy that
/// always hands out one and the same, pre-created fallback buffer, and that proxy is also
/// installed as this decoder's own <see cref="Decoder.Fallback"/>.
/// </summary>
private class ProxyDecoder : Decoder {
    private Encoding _encoding;
    private readonly Decoder _decoder;

    public ProxyDecoder(Encoding encoding) {
        Encoding clone = (Encoding)encoding.Clone();
        DecoderFallback original = clone.DecoderFallback;
        clone.DecoderFallback = new ProxyDecoderFallback(original.CreateFallbackBuffer(), original.MaxCharCount);

        _encoding = clone;
        _decoder = clone.GetDecoder();

        // Same fallback instance on the inner decoder first, then on this proxy
        DecoderFallback proxy = clone.DecoderFallback;
        _decoder.Fallback = proxy;
        Fallback = proxy;
    }

    public override int GetCharCount(byte[] bytes, int index, int count) {
        return GetCharCount(bytes, index, count, flush: true);
    }

    public override int GetCharCount(byte[] bytes, int index, int count, bool flush) {
        return _decoder.GetCharCount(bytes, index, count, flush);
    }

    public override unsafe int GetCharCount(byte* bytes, int count, bool flush) {
        return _decoder.GetCharCount(bytes, count, flush);
    }

    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex) {
        return GetChars(bytes, byteIndex, byteCount, chars, charIndex, flush: true);
    }

    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, bool flush) {
        return _decoder.GetChars(bytes, byteIndex, byteCount, chars, charIndex, flush);
    }

    public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount, bool flush) {
        return _decoder.GetChars(bytes, byteCount, chars, charCount, flush);
    }

    public override void Reset() {
        _decoder.Reset();
    }

    /// <summary>
    /// Fallback whose <see cref="CreateFallbackBuffer"/> returns the single buffer it was constructed with.
    /// </summary>
    private class ProxyDecoderFallback : DecoderFallback {
        private readonly DecoderFallbackBuffer _buffer;
        private readonly int _maxCharCount;

        public ProxyDecoderFallback(DecoderFallbackBuffer buffer, int maxCharCount) {
            _buffer = buffer;
            _maxCharCount = maxCharCount;
        }

        public override int MaxCharCount {
            get { return _maxCharCount; }
        }

        public override DecoderFallbackBuffer CreateFallbackBuffer() {
            return _buffer;
        }
    }
}
private class PythonDecoder : Decoder {
private readonly PythonEncoding _parentEncoding;
private readonly Decoder _pass1decoder;
private Decoder? _pass2decoder;
/// <summary>
/// Creates a decoder bound to <paramref name="parentEncoding"/>; only the pass 1 decoder is created here.
/// </summary>
public PythonDecoder(PythonEncoding parentEncoding) {
    _parentEncoding = parentEncoding;
    Encoding pass1 = parentEncoding.Pass1Encoding;
    _pass1decoder = GetDecoder(pass1);
}
/// <summary>
/// Obtains a decoder from <paramref name="encoding"/>. If the encoding uses a <see cref="PythonDecoderFallback"/>
/// but the decoder does not expose one, the decoder is returned as-is only when its type derives from
/// <c>System.Text.DecoderNLS</c>; otherwise a <see cref="ProxyDecoder"/> is returned instead.
/// </summary>
private static Decoder GetDecoder(Encoding encoding) {
    Decoder candidate = encoding.GetDecoder();

    // Nothing to work around if the decoder exposes the Python fallback, or the encoding does not use one
    if (candidate.Fallback is PythonDecoderFallback || encoding.DecoderFallback is not PythonDecoderFallback) {
        return candidate;
    }

    // Non-conformant Decoder implementation, the challenge is to get to the fallback buffer used by such decoder.
    // See notes at PythonEncoder.GetEncoder(...)
    Type? type = candidate.GetType();
    while (type is not null && type.FullName != "System.Text.Decoder") {
        if (type.FullName == "System.Text.DecoderNLS") {
            return candidate;
        }
        type = type.BaseType;
    }

    return new ProxyDecoder(encoding);
}
/// <summary>
/// Returns the <see cref="PythonDecoderFallbackBuffer"/> used by <paramref name="dec"/>,
/// or null if <paramref name="dec"/> is null or uses a different kind of fallback buffer.
/// </summary>
private static PythonDecoderFallbackBuffer? GetPythonDecoderFallbackBuffer(Decoder? dec) {
    if (dec is null) {
        return null;
    }

    // see also PythonEncoder.GetPythonEncoderFallbackBuffer(...)
    if (dec.FallbackBuffer is PythonDecoderFallbackBuffer direct) {
        return direct;
    }

#if NETCOREAPP || NETSTANDARD
    // Look the property up by name on the runtime type of the decoder
    return dec.GetType().GetProperty(nameof(dec.FallbackBuffer))?.GetValue(dec) as PythonDecoderFallbackBuffer;
#else
    return null;
#endif
}
/// <summary>
/// Mandatory override of an abstract method: same as the overload taking <c>flush</c>, with flushing enabled.
/// </summary>
public override int GetCharCount(byte[] bytes, int index, int count) {
    return this.GetCharCount(bytes, index, count, flush: true);
}
/// <summary>
/// Counts the chars produced by decoding <paramref name="count"/> bytes starting at <paramref name="index"/>;
/// forwards the selected range as a span.
/// </summary>
public override int GetCharCount(byte[] bytes, int index, int count, bool flush) {
    var range = bytes.AsSpan(index, count);
    return this.GetCharCount(range, flush);
}
/// <summary>
/// NLS workhorse, used by GetString: counts the chars produced by decoding <paramref name="count"/> bytes
/// with the pass 1 decoder. If that decoder uses a Python fallback buffer, the buffer is prepared for a
/// counting-only increment before, and finalized after, the count.
/// </summary>
public override unsafe int GetCharCount(byte* bytes, int count, bool flush) {
    PythonDecoderFallbackBuffer? buffer = GetPythonDecoderFallbackBuffer(_pass1decoder);

    if (buffer is not null) {
        buffer.PrepareIncrement(forDecoding: false);
    }
    int charCount = _pass1decoder.GetCharCount(bytes, count, flush);
    if (buffer is not null) {
        buffer.FinalizeIncrement(count, flush);
    }

    return charCount;
}
/// <summary>
/// Mandatory override of an abstract method: same as the overload taking <c>flush</c>, with flushing enabled.
/// </summary>
public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex) {
    return this.GetChars(bytes, byteIndex, byteCount, chars, charIndex, flush: true);
}
/// <summary>
/// Decodes <paramref name="byteCount"/> bytes starting at <paramref name="byteIndex"/> into
/// <paramref name="chars"/> starting at <paramref name="charIndex"/>; forwards both ranges as spans.
/// </summary>
public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, bool flush) {
    var source = bytes.AsSpan(byteIndex, byteCount);
    var destination = chars.AsSpan(charIndex);
    return this.GetChars(source, destination, flush);
}
/// <summary>
/// NLS workhorse: validates the raw buffers and forwards them, wrapped in spans, to the span-based overload.
/// </summary>
/// <exception cref="ArgumentNullException">A pointer is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">A count is negative.</exception>
public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount, bool flush) {
    if (bytes is null) {
        throw new ArgumentNullException(nameof(bytes));
    }
    if (chars is null) {
        throw new ArgumentNullException(nameof(chars));
    }
    if (byteCount < 0) {
        throw new ArgumentOutOfRangeException(nameof(byteCount));
    }
    if (charCount < 0) {
        throw new ArgumentOutOfRangeException(nameof(charCount));
    }

    var source = new ReadOnlySpan<byte>(bytes, byteCount);
    var destination = new Span<char>(chars, charCount);
    return GetChars(source, destination, flush);
}
// IronPython workhorse, used by GetString(IBuffer,...)
// Decodes with the pass 1 decoder first. Unless that was a final increment without any pending
// fallback chars, the same bytes are decoded again with the pass 2 decoder into a scratch array;
// every position at which the two results differ is overwritten in the output
// with the next char dequeued from the pass 2 fallback buffer.
#if NETCOREAPP
public override int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars, bool flush) {
#else
public int GetChars(ReadOnlySpan<byte> bytes, Span<char> chars, bool flush) {
#endif
    var pass1Buffer = GetPythonDecoderFallbackBuffer(_pass1decoder);
    var pass2Buffer = GetPythonDecoderFallbackBuffer(_pass2decoder);

    pass1Buffer?.PrepareIncrement(forDecoding: true);
    int? fallbackCountBefore = pass1Buffer?.FallbackCharCount;

    int written = _pass1decoder.GetChars(bytes, chars, flush);

    // Without a Python fallback buffer there is nothing to post-process
    if (pass1Buffer is null) {
        return written;
    }

    // If the final increment and there were no fallback characters, the job is done
    if (flush
        && pass1Buffer.FallbackCharCount == fallbackCountBefore
        && pass1Buffer.IsEmpty
        && (pass2Buffer?.IsEmpty ?? true)) {
        return written;
    }

    // Lazy creation of _pass2decoder
    if (_pass2decoder is null) {
        _pass2decoder = GetDecoder(_parentEncoding.Pass2Encoding);
        pass2Buffer = GetPythonDecoderFallbackBuffer(_pass2decoder);
    }

    // pass2Buffer is not null here because pass1Buffer is not null and Pass1Encoding and Pass2Encoding are identical clones
    pass2Buffer!.Data = pass1Buffer.Data;
    pass2Buffer.PrepareIncrement(forDecoding: true);

    // replace surrogate markers with actual surrogates
    var secondPassChars = new char[written];
    _pass2decoder.GetChars(bytes, secondPassChars, flush);

    int pos = 0;
    while (pos < written) {
        if (chars[pos] != secondPassChars[pos]) {
            chars[pos] = pass2Buffer.GetFallbackChar();
        }
        pos++;
    }

    // Check if all fallback chars are restored properly
    pass1Buffer.FinalizeIncrement(bytes.Length, flush);
    pass2Buffer.FinalizeIncrement(bytes.Length, flush);

    return written;
}
// used by IronPython
// Decodes count bytes of the input buffer, starting at index, into a new string.
public string GetString(IPythonBuffer input, int index, int count) {
    // This allows for UnicodeDecodeError, if occurred, to contain the whole input
    if (GetPythonDecoderFallbackBuffer(_pass1decoder) is { } pass1Buffer) {
        pass1Buffer.Data = Tuple.Create(input, index);
    }

    // NOTE(review): the count is taken from _pass1decoder directly, i.e. without the
    // PrepareIncrement/FinalizeIncrement bracketing of this class's own GetCharCount;
    // presumably so that Data assigned above is still set when GetChars runs - confirm
    var window = input.AsReadOnlySpan().Slice(index, count);
    int charCount = _pass1decoder.GetCharCount(window, flush: true);
    if (charCount == 0) {
        return string.Empty;
    }

    // The state tuple lets the callback slice the input again instead of capturing the span
    var state = Tuple.Create(input, index, count);
    return StringExtensions.Create(charCount, state, (dest, arg) => {
        var src = arg.Item1.AsReadOnlySpan().Slice(arg.Item2, arg.Item3);
        GetChars(src, dest, flush: true);
    });
}
// Resets the pass 1 decoder, and the pass 2 decoder if it has been created already
public override void Reset() {
    _pass1decoder.Reset();

    if (_pass2decoder is { } secondPassDecoder) {
        secondPassDecoder.Reset();
    }
}
}
// Base class of the decoder fallbacks used with PythonEncoding.
// Carries the owning encoding and the pass flag that subclasses hand over
// to the fallback buffers they create.
public abstract class PythonDecoderFallback : DecoderFallback, ICloneable {
    private PythonEncoding? _encoding;

    // The encoding this fallback belongs to.
    // Reading the property before it has been assigned is a usage error of this class and is
    // reported as InvalidOperationException (NullReferenceException is reserved for the runtime).
    public PythonEncoding Encoding {
        get => _encoding ?? throw new InvalidOperationException($"Property \"{nameof(Encoding)}\" not initialized before use.");
        set => _encoding = value;
    }

    // Whether this fallback serves pass 1; forwarded to the constructor of the fallback buffer
    public bool IsPass1 { get; set; }

    // Shallow copy; subclasses may override
    public virtual object Clone() => MemberwiseClone();
}
// Base class of the decoder fallback buffers used with PythonEncoding.
// A subclass supplies the replacement chars for undecodable bytes through GetFallbackChars.
// In decoding mode, replacement chars that contain no surrogate are handed to the decoder as they are.
// In all other cases the decoder receives one marker char (Pass1Marker or Pass2Marker) per replacement char;
// the buffer created for pass 2 additionally queues the real chars, to be fetched with GetFallbackChar.
protected abstract class PythonDecoderFallbackBuffer : DecoderFallbackBuffer {
// char handed to the decoder in place of a fallback char that is not reported directly
private readonly char _marker;
private readonly Queue<char>? _fallbackChars; // collects all fallback chars for the whole pass, only used during actual decoding, pass 2
private int _fbkCnt; // only used during actual decoding; proxy for _fallbackChars.Count but valid in pass 1 too
private MemInt _charCnt; // counts unreported chars from the last fallback; used during both counting and decoding, but counts separately
private int _fbkNumChars; // number of all virtual chars in the buffer from the last fallback
private ReadOnlyMemory<char> _safeFallbackChars; // chars from the last fallback that are safe to report; only used during actual decoding
// isPass1 selects the marker char and whether the queue of real fallback chars exists (it does not for pass 1);
// the character width and the code page are copied from the encoding
public PythonDecoderFallbackBuffer(bool isPass1, PythonEncoding encoding) {
_marker = isPass1 ? Pass1Marker : Pass2Marker;
_fallbackChars = isPass1 ? null : new Queue<char>();
this.EncodingCharWidth = encoding.CharacterWidth;
this.CodePage = encoding.CodePage;
}
// true if the current increment is an actual decoding, false if it is counting; set by PrepareIncrement
protected bool DecodingMode { get; private set; }
// value of encoding.CharacterWidth at construction
protected int EncodingCharWidth { get; }
// value of encoding.CodePage at construction
protected int CodePage { get; }
// Input buffer and start index of the current operation; assigned from outside (see PythonDecoder.GetString
// and PythonDecoder.GetChars), cleared by FinalizeIncrement and Reset
public Tuple<IPythonBuffer, int>? Data { get; set; }
// To be called before every counting or decoding increment.
// Note that the branch is selected by the mode of the PREVIOUS increment; DecodingMode is updated last.
// NOTE(review): MemInt is declared elsewhere; presumably the round trip through int keeps the current count
// after a decoding increment, while .Initial restores the count from before a counting increment - confirm
public virtual void PrepareIncrement(bool forDecoding) {
if (DecodingMode) {
_charCnt = (int)_charCnt;
} else {
_charCnt = _charCnt.Initial;
}
DecodingMode = forDecoding;
}
// Supplied by subclasses: the replacement chars for the given undecodable bytes found at the given index
public abstract ReadOnlyMemory<char> GetFallbackChars(byte[] bytesUnknown, int index);
// Invoked by the decoder for a sequence of undecodable bytes. Obtains the replacement chars from the subclass
// and sets up the chars that GetNextChar will subsequently report. Always returns true.
public override bool Fallback(byte[] bytesUnknown, int index) {
// In counting mode the index is advanced past the unknown bytes if the runtime is flagged with HasBugCorefx29898
// NOTE(review): presumably compensates for a wrong index reported by the affected runtimes - confirm
if (!this.DecodingMode && this.CodePage == 65001 && PythonEncoding.HasBugCorefx29898) { // only for UTF-8
index += bytesUnknown.Length;
}
ReadOnlyMemory<char> newFallbackChars = GetFallbackChars(bytesUnknown, index);
_fbkNumChars = newFallbackChars.Length;
if (DecodingMode && MemoryMarshal.ToEnumerable(newFallbackChars).All(ch => !char.IsSurrogate(ch))) {
// decoding and no surrogates among the replacement chars: GetNextChar will report them directly
_safeFallbackChars = newFallbackChars;
} else {
// counting, or surrogates present: GetNextChar will report markers;
// the real chars are counted and, if this buffer has a queue (pass 2), queued
_safeFallbackChars = default;
_fbkCnt += _fbkNumChars;
if (_fallbackChars is not null) {
var chars = newFallbackChars.Span;
for (int i = 0; i < _fbkNumChars; i++) {
_fallbackChars.Enqueue(chars[i]);
}
}
}
_charCnt = _fbkNumChars;
return true;
}
// number of chars from the last fallback that are still to be reported by GetNextChar
public override int Remaining => _charCnt;
// Reports the next char of the last fallback to the decoder: '\0' if nothing is left,
// the marker if there are no safe chars, the next safe char otherwise
public override char GetNextChar() {
if (_charCnt <= 0) return '\0';
if (_safeFallbackChars.IsEmpty) {
_charCnt--;
return _marker; // unfortunately, returning the actual fallback char here might result in an exception
} else {
// index from the start of the safe chars: total length less the chars still remaining
return _safeFallbackChars.Span[_safeFallbackChars.Length - _charCnt--];
}
}
// Steps back by one reported char; fails if no char of the last fallback has been reported yet
public override bool MovePrevious() {
if (_charCnt >= _fbkNumChars) return false;
_charCnt++;
return true;
}
// Removes and returns the oldest queued real fallback char
// not called for pass1 decoding
public char GetFallbackChar() {
_fbkCnt--;
// _fallbackChars is not null for pass2 decoding
return _fallbackChars!.Dequeue();
}
// true if no real fallback chars are waiting in the queue; always true if there is no queue (pass 1)
public virtual bool IsEmpty => (_fallbackChars?.Count ?? 0) == 0;
// number of fallback chars recorded by Fallback as not directly reportable, less those fetched with GetFallbackChar
public int FallbackCharCount => _fbkCnt;
// To be called after every counting or decoding increment.
// On a final increment (flush) all queued fallback chars must have been fetched already.
// endIndex is not used by this base implementation.
public virtual void FinalizeIncrement(int endIndex, bool flush) {
if (flush && !IsEmpty) {
// If this exception is being thrown, the problem is with the code, not the input sequence.
// Therefore, the exception does not carry any input data.
throw new DecoderFallbackException("internal error");
}
_safeFallbackChars = default;
Data = null; // release input data for possible collection
}
// Discards the queued fallback chars, all counters, the safe chars and the input data; DecodingMode is left as is
public override void Reset() {
_fallbackChars?.Clear();
_fbkCnt = 0;
_fbkNumChars = 0;
_charCnt = 0;
_safeFallbackChars = default;
Data = null;
}
}
}
internal class PythonSurrogateEscapeEncoding : PythonEncoding {
// Defined in PEP 383
// First code unit of the 256-value block (0xdc00-0xdcff) recognized by the surrogateescape encoder fallback;
// the low 8 bits of a code unit in this block are the escaped byte
private const ushort LoneSurrogateBase = 0xdc00;
// Wraps the given encoding, pairing it with a new surrogateescape encoder fallback
// and a new surrogateescape decoder fallback
public PythonSurrogateEscapeEncoding(Encoding encoding)
: base(encoding, new SurrogateEscapeEncoderFallback(), new SurrogateEscapeDecoderFallback()) { }
// Encoder fallback of the 'surrogateescape' error handler; creates SurrogateEscapeEncoderFallbackBuffer instances
public class SurrogateEscapeEncoderFallback : PythonEncoderFallback {
    // The fallback reports a maximum of one char per fallback
    public override int MaxCharCount {
        get { return 1; }
    }

    // The new buffer receives the pass flag and the encoding of this fallback
    public override EncoderFallbackBuffer CreateFallbackBuffer() {
        return new SurrogateEscapeEncoderFallbackBuffer(this.IsPass1, this.Encoding);
    }
}
// Encoder fallback buffer of the 'surrogateescape' error handler:
// turns an escape code unit back into the single byte stored in its low 8 bits.
private class SurrogateEscapeEncoderFallbackBuffer : PythonEncoderFallbackBuffer {
    public SurrogateEscapeEncoderFallbackBuffer(bool isPass1, PythonEncoding encoding)
        : base(isPass1, encoding) { }

    // Returns no fallback chars and one fallback byte for runeUnknown.
    // Raises UnicodeEncodeError if runeUnknown lies outside of the 256 values starting at LoneSurrogateBase,
    // or if the byte it carries is below 128.
    public override Tuple<ReadOnlyMemory<char>, ReadOnlyMemory<byte>> GetFallbackCharsOrBytes(int runeUnknown, int index) {
        bool inEscapeBlock = runeUnknown >= LoneSurrogateBase && runeUnknown <= LoneSurrogateBase + 0xff;
        if (!inEscapeBlock) {
            // EncoderFallbackException(string, char, int) is not accessible here
            throw PythonOps.UnicodeEncodeError(
                $"'surrogateescape' error handler: value not in range(0x{LoneSurrogateBase:x4}, 0x{LoneSurrogateBase+0x100:x4})",
                runeUnknown,
                index
            );
        }

        // Within the block, the offset from the base equals the low 8 bits
        int escapedByte = runeUnknown - LoneSurrogateBase;
        if (escapedByte < 128) {
            throw PythonOps.UnicodeEncodeError(
                "'surrogateescape' error handler: bytes below 128 cannot be smuggled (PEP 383)",
                runeUnknown,
                index
            );
        }

        ReadOnlyMemory<byte> fallbackBytes = new[] { (byte)escapedByte };
        return Tuple.Create(default(ReadOnlyMemory<char>), fallbackBytes);
    }
}
// Decoder fallback of the 'surrogateescape' error handler; creates SurrogateEscapeDecoderFallbackBuffer instances
public class SurrogateEscapeDecoderFallback : PythonDecoderFallback {
    // The maximum number of chars per fallback equals the character width of the encoding
    public override int MaxCharCount {
        get { return this.Encoding.CharacterWidth; }
    }

    // The new buffer receives the pass flag and the encoding of this fallback
    public override DecoderFallbackBuffer CreateFallbackBuffer() {
        return new SurrogateEscapeDecoderFallbackBuffer(this.IsPass1, this.Encoding);
    }
}
private class SurrogateEscapeDecoderFallbackBuffer : PythonDecoderFallbackBuffer {
public SurrogateEscapeDecoderFallbackBuffer(bool isPass1, PythonEncoding encoding)
: base(isPass1, encoding) { }
public override ReadOnlyMemory<char> GetFallbackChars(byte[] bytesUnknown, int index) {
int charNum = bytesUnknown.Length;
char[] fallbackChars = new char[charNum];
for (int i = 0; i < charNum; i++) {
if (bytesUnknown[i] < 128) {
throw new DecoderFallbackException(
"'surrogateescape' error handler: bytes below 128 cannot be smuggled (PEP 383)",
bytesUnknown,
index
);
}