X Tutup
Skip to content

Commit fe91d64

Browse files
authored
Support custom encoding error handler (IronLanguages#914)
* Test replace standard error handlers
* Replace assertEquals with assertEqual in test_codecs
* Use ConcurrentDictionary for codecs error handlers
* Clean up codecs error handler tests
* Replace PythonEncoderFallback with PythonHandlerEncoderFallback
* Add tests for custom encoding error handlers
* Simplify handling of byte counting mode
* Turn RestorableInt into readonly struct MemInt
* Update after review
1 parent 6e75bca commit fe91d64

File tree

6 files changed

+553
-278
lines changed

6 files changed

+553
-278
lines changed

Src/IronPython.Modules/_codecs.cs

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -671,13 +671,18 @@ public bool TryGetByteValue(int c, out byte val) {
671671
}
672672
}
673673

674+
/// <remarks>
675+
/// This implementation is not suitable for incremental encoding.
676+
/// </remarks>
674677
internal class EncodingMapEncoding : Encoding {
675678
private readonly EncodingMap _map;
676679

677680
public EncodingMapEncoding(EncodingMap map) {
678681
_map = map;
679682
}
680683

684+
public override string EncodingName => "charmap";
685+
681686
public override int GetByteCount(char[] chars, int index, int count)
682687
=> GetBytes(chars, index, count, null, 0);
683688

@@ -721,7 +726,7 @@ public override int GetBytes(char[] chars, int charIndex, int charCount, byte[]?
721726
}
722727
}
723728
} catch (EncoderFallbackException) {
724-
throw PythonOps.UnicodeEncodeError("charmap", new string(chars), charIndex, charIndex + 1, "character maps to <undefined>");
729+
throw PythonOps.UnicodeEncodeError(EncodingName, new string(chars), charIndex, charIndex + 1, "character maps to <undefined>");
725730
}
726731
} else {
727732
if (bytes != null) bytes[byteIndex] = val;
@@ -787,6 +792,9 @@ public override int GetMaxCharCount(int byteCount) {
787792
}
788793
}
789794

795+
/// <remarks>
796+
/// This implementation is not suitable for incremental encoding.
797+
/// </remarks>
790798
internal class CharmapEncoding : Encoding {
791799
private readonly IDictionary<object, object> _map;
792800
private int _maxEncodingReplacementLength;
@@ -796,6 +804,8 @@ public CharmapEncoding(IDictionary<object, object> map) {
796804
_map = map;
797805
}
798806

807+
public override string EncodingName => "charmap";
808+
799809
public override int GetByteCount(char[] chars, int index, int count)
800810
=> GetBytes(chars, index, count, null, 0);
801811

@@ -841,7 +851,7 @@ public override int GetBytes(char[] chars, int charIndex, int charCount, byte[]?
841851

842852
}
843853
} catch (EncoderFallbackException) {
844-
throw PythonOps.UnicodeEncodeError("charmap", new string(chars), charIndex, nextIndex, "character maps to <undefined>");
854+
throw PythonOps.UnicodeEncodeError(EncodingName, new string(chars), charIndex, nextIndex, "character maps to <undefined>");
845855
}
846856
charIndex = nextIndex;
847857
} else {

Src/IronPython/Runtime/Operations/PythonOps.cs

Lines changed: 19 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66

77
using System;
88
using System.Collections;
9+
using System.Collections.Concurrent;
910
using System.Collections.Generic;
1011
using System.ComponentModel;
1112
using System.Diagnostics;
@@ -137,24 +138,20 @@ internal static List<object> GetReprInfinite() {
137138

138139
[LightThrowing]
139140
internal static object LookupEncodingError(CodeContext/*!*/ context, string name) {
140-
Dictionary<string, object> errorHandlers = context.LanguageContext.ErrorHandlers;
141-
lock (errorHandlers) {
142-
if (errorHandlers.ContainsKey(name))
143-
return errorHandlers[name];
144-
else
145-
return LightExceptions.Throw(PythonOps.LookupError("unknown error handler name '{0}'", name));
146-
}
141+
ConcurrentDictionary<string, object> errorHandlers = context.LanguageContext.ErrorHandlers;
142+
if (errorHandlers.TryGetValue(name, out object? handler))
143+
return handler;
144+
else
145+
return LightExceptions.Throw(PythonOps.LookupError("unknown error handler name '{0}'", name));
147146
}
148147

149148
internal static void RegisterEncodingError(CodeContext/*!*/ context, string name, object? handler) {
150-
Dictionary<string, object> errorHandlers = context.LanguageContext.ErrorHandlers;
149+
ConcurrentDictionary<string, object> errorHandlers = context.LanguageContext.ErrorHandlers;
151150

152-
lock (errorHandlers) {
153-
if (!PythonOps.IsCallable(context, handler))
154-
throw PythonOps.TypeError("handler must be callable");
151+
if (!PythonOps.IsCallable(context, handler))
152+
throw PythonOps.TypeError("handler must be callable");
155153

156-
errorHandlers[name] = handler;
157-
}
154+
errorHandlers[name] = handler;
158155
}
159156

160157
internal static PythonTuple LookupEncoding(CodeContext/*!*/ context, string encoding) {
@@ -3608,6 +3605,15 @@ public static Exception UnicodeEncodeError(string message, char charUnknownHigh,
36083605
return (EncoderFallbackException)ctor.Invoke(new object[] { message, charUnknownHigh, charUnknownLow, index });
36093606
}
36103607

3608+
internal static Exception UnicodeEncodeError(string message, int runeUnknown, int index) {
3609+
if (runeUnknown <= char.MaxValue) {
3610+
return PythonOps.UnicodeEncodeError(message, (char)runeUnknown, index);
3611+
} else {
3612+
string s = char.ConvertFromUtf32(runeUnknown);
3613+
return PythonOps.UnicodeEncodeError(message, s[0], s[1], index);
3614+
}
3615+
}
3616+
36113617
public static Exception IOError(Exception inner) {
36123618
return OSError(inner.Message, inner);
36133619
}

Src/IronPython/Runtime/Operations/StringOps.cs

Lines changed: 10 additions & 105 deletions
Original file line number | Diff line number | Diff line change
@@ -1798,8 +1798,8 @@ Encoding setFallback(Encoding enc, DecoderFallback fb) {
17981798
case "strict": e = setFallback(e, new ExceptionFallback(e is UTF8Encoding)); break;
17991799
case "replace": e = setFallback(e, ReplacementFallback); break;
18001800
case "ignore": e = setFallback(e, new PythonDecoderFallback(encoding, buffer, start)); break;
1801-
case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e); break;
1802-
case "surrogatepass": e = new PythonSurrogatePassEncoding(e); break;
1801+
case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e, encoding); break;
1802+
case "surrogatepass": e = new PythonSurrogatePassEncoding(e, encoding); break;
18031803
default:
18041804
e = setFallback(e, new PythonDecoderFallback(encoding,
18051805
buffer, start,
@@ -1879,13 +1879,10 @@ static Encoding setFallback(Encoding enc, EncoderFallback fb) {
18791879
case "replace": e = setFallback(e, EncoderFallback.ReplacementFallback); break;
18801880
case "backslashreplace": e = setFallback(e, new BackslashEncoderReplaceFallback()); break;
18811881
case "xmlcharrefreplace": e = setFallback(e, new XmlCharRefEncoderReplaceFallback()); break;
1882-
case "ignore": e = setFallback(e, new PythonEncoderFallback(encoding, s)); break;
1883-
case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e); break;
1884-
case "surrogatepass": e = new PythonSurrogatePassEncoding(e); break;
1885-
default:
1886-
e = setFallback(e, new PythonEncoderFallback(encoding, s,
1887-
() => LightExceptions.CheckAndThrow(PythonOps.LookupEncodingError(context, errors))));
1888-
break;
1882+
case "ignore": e = setFallback(e, new EncoderReplacementFallback(string.Empty)); break;
1883+
case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e, encoding); break;
1884+
case "surrogatepass": e = new PythonSurrogatePassEncoding(e, encoding); break;
1885+
default: e = new PythonErrorHandlerEncoding(context, e, encoding, errors); break;
18891886
}
18901887

18911888
byte[]? preamble = includePreamble ? e.GetPreamble() : null;
@@ -1898,8 +1895,8 @@ static Encoding setFallback(Encoding enc, EncoderFallback fb) {
18981895
}
18991896
e.GetBytes(s, 0, s.Length, bytes, preambleLen);
19001897
} catch (EncoderFallbackException ex) {
1901-
ex.Data["encoding"] = encoding;
1902-
ex.Data["object"] = s;
1898+
if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = encoding;
1899+
if (!ex.Data.Contains("object")) ex.Data["object"] = s;
19031900
throw;
19041901
}
19051902

@@ -2030,8 +2027,8 @@ static CodecsInfo() {
20302027
return d;
20312028
}
20322029

2033-
internal static Dictionary<string, object> MakeErrorHandlersDict() {
2034-
var d = new Dictionary<string, object>();
2030+
internal static ConcurrentDictionary<string, object> MakeErrorHandlersDict() {
2031+
var d = new ConcurrentDictionary<string, object>();
20352032

20362033
d["strict"] = BuiltinFunction.MakeFunction(
20372034
"strict_errors",
@@ -2324,98 +2321,6 @@ public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[]
23242321
/// and int is an index where encoding/decoding should continue.
23252322
/// TODO: returned int is currently ignored, assumed to be equal to end (i.e. the index is not adjusted).
23262323

2327-
private class PythonEncoderFallbackBuffer : EncoderFallbackBuffer {
2328-
private readonly string _encoding;
2329-
private readonly string _strData;
2330-
private readonly object? _function;
2331-
private string? _buffer;
2332-
private int _bufferIndex;
2333-
2334-
public PythonEncoderFallbackBuffer(string encoding, string str, object? function) {
2335-
_encoding = encoding;
2336-
_strData = str;
2337-
_function = function;
2338-
}
2339-
2340-
public override bool Fallback(char charUnknown, int index) {
2341-
return DoPythonFallback(index, 1);
2342-
}
2343-
2344-
public override bool Fallback(char charUnknownHigh, char charUnknownLow, int index) {
2345-
return DoPythonFallback(index, 2);
2346-
}
2347-
2348-
public override char GetNextChar() {
2349-
if (_buffer == null || _bufferIndex >= _buffer.Length) return Char.MinValue;
2350-
2351-
return _buffer[_bufferIndex++];
2352-
}
2353-
2354-
public override bool MovePrevious() {
2355-
if (_bufferIndex > 0) {
2356-
_bufferIndex--;
2357-
return true;
2358-
}
2359-
return false;
2360-
}
2361-
2362-
public override int Remaining {
2363-
get {
2364-
if (_buffer == null) return 0;
2365-
return _buffer.Length - _bufferIndex;
2366-
}
2367-
}
2368-
2369-
public override void Reset() {
2370-
_buffer = null;
2371-
_bufferIndex = 0;
2372-
base.Reset();
2373-
}
2374-
2375-
private bool DoPythonFallback(int index, int length) {
2376-
if (_function != null) {
2377-
// create the exception object to hand to the user-function...
2378-
var exObj = PythonExceptions.CreatePythonThrowable(PythonExceptions.UnicodeEncodeError, _encoding, _strData, index, index + length, "unexpected code byte");
2379-
2380-
// call the user function...
2381-
object? res = PythonCalls.Call(_function, exObj);
2382-
2383-
string replacement = CheckReplacementTuple(res, "encoding", index + length);
2384-
2385-
// finally process the user's request.
2386-
_buffer = replacement;
2387-
_bufferIndex = 0;
2388-
return true;
2389-
}
2390-
2391-
return false;
2392-
}
2393-
}
2394-
2395-
private class PythonEncoderFallback : EncoderFallback {
2396-
private readonly string encoding;
2397-
private readonly string data;
2398-
private readonly Func<object>? lookup;
2399-
private object? function;
2400-
2401-
public PythonEncoderFallback(string encoding, string data, Func<object>? lookup = null) {
2402-
this.encoding = encoding;
2403-
this.data = data;
2404-
this.lookup = lookup;
2405-
}
2406-
2407-
public override EncoderFallbackBuffer CreateFallbackBuffer() {
2408-
if (function == null && lookup != null) {
2409-
function = lookup.Invoke();
2410-
}
2411-
return new PythonEncoderFallbackBuffer(encoding, data, function);
2412-
}
2413-
2414-
public override int MaxCharCount {
2415-
get { return Int32.MaxValue; }
2416-
}
2417-
}
2418-
24192324
private class PythonDecoderFallbackBuffer : DecoderFallbackBuffer {
24202325
private readonly object? _function;
24212326
private readonly string _encoding;

Src/IronPython/Runtime/PythonContext.cs

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
using System;
66
using System.Collections;
7+
using System.Collections.Concurrent;
78
using System.Collections.Generic;
89
using System.Diagnostics;
910
using System.Dynamic;
@@ -59,7 +60,7 @@ public sealed partial class PythonContext : LanguageContext {
5960
private string _initialVersionString;
6061
private PythonModule _clrModule;
6162
private PythonFileManager _fileManager;
62-
private Dictionary<string, object> _errorHandlers;
63+
private ConcurrentDictionary<string, object> _errorHandlers;
6364
private List<object> _searchFunctions;
6465
private Dictionary<object, object> _moduleState;
6566
/// <summary> stored for copyreg module, used for reduce protocol </summary>
@@ -1872,7 +1873,7 @@ public override int ExecuteProgram(SourceUnit/*!*/ program) {
18721873
}
18731874

18741875
/// <summary> Dictionary of error handlers for string codecs. </summary>
1875-
internal Dictionary<string, object> ErrorHandlers {
1876+
internal ConcurrentDictionary<string, object> ErrorHandlers {
18761877
get {
18771878
if (_errorHandlers == null) {
18781879
Interlocked.CompareExchange(ref _errorHandlers, StringOps.CodecsInfo.MakeErrorHandlersDict(), null);

0 commit comments

Comments
 (0)
X Tutup