X Tutup

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information.

#pragma warning disable SYSLIB0001 // UTF-7 code paths are obsolete in .NET 5

using IronPython.Runtime;
using IronPython.Runtime.Operations;
using NUnit.Framework;
using System.Linq;
using System.Text;

namespace IronPythonTest {

    // Unit testing class PythonSurrogateEscapeEncoding
    [TestFixture(Category = "IronPython")]
    public class SurrogateEscapeTest {

        #region Round-trip tests

        // Test 256 bytes sequence
        public class Bytes256Test {
            private byte[] _bytes;

            [SetUp]
            public void SetUp() {
                // every possible byte value, in ascending order
                _bytes = Enumerable.Range(0, 256).Select(c => (byte)c).ToArray();
            }

            [Test] public void Test256WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes);
            [Test] public void Test256WithLatin1() => TestRoundTrip(StringOps.Latin1Encoding, _bytes);
            [Test] public void Test256WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes);
            [Test] public void Test256WithDefault() => TestRoundTrip(Encoding.Default, _bytes);
        }

        // Test decoding/encoding a valid UTF-8 sequence
        public class Utf8Test {
            private byte[] _bytes;

            [SetUp]
            public void SetUp() {
                // 12 bytes of: Питон!!
                _bytes = "\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd!!".AsBytes();
            }

            [Test] public void TestValidUtf8WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes);
            [Test] public void TestValidUtf8WithLatin1() => TestRoundTrip(StringOps.Latin1Encoding, _bytes);
            [Test] public void TestValidUtf8WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes);
            [Test] public void TestValidUtf8WithDefault() => TestRoundTrip(Encoding.Default, _bytes);
        }

        // Test decoding/encoding an invalid UTF-8 sequence
        public class Utf8BrokenTest {
            private byte[] _bytes;

            [SetUp]
            public void SetUp() {
                // 12 bytes: two valid UTF-8 2-byte chars, one non-decodable byte,
                // one UTF-8 2-byte char with a non-decodable byte inserted in between the UTF-8 bytes
                // and final valid UTF-8 2-byte char
                _bytes = "\xd0\x9f\xd0\xb8\x80\xd1\xff\x82\xd0\xbe\xd0\xbd".AsBytes();
            }

            [Test] public void TestBrokenUtf8WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes);
            [Test] public void TestBrokenUtf8WithLatin1() => TestRoundTrip(StringOps.Latin1Encoding, _bytes);
            [Test] public void TestBrokenUtf8WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes);
            [Test] public void TestBrokenUtf8WithDefault() => TestRoundTrip(Encoding.Default, _bytes);
        }

        // Note: UTF-7, UTF-16, and UTF-32 are not round-trip safe in general

        // Wraps enc in a PythonSurrogateEscapeEncoding, decodes bytes and encodes the result back.
        // Each direction is exercised twice (presized-buffer overload and allocating overload);
        // both variants must agree, and the re-encoded bytes must equal the input.
        private static void TestRoundTrip(Encoding enc, byte[] bytes) {
            Encoding penc = new PythonSurrogateEscapeEncoding(enc);

            // decoding: count + fill a buffer vs. the allocating overload
            char[] chars1 = new char[penc.GetCharCount(bytes)];
            penc.GetChars(bytes, 0, bytes.Length, chars1, 0);
            char[] chars2 = penc.GetChars(bytes);
            Assert.That(chars1, Is.EqualTo(chars2));

            // encoding: the allocating overload vs. count + fill a buffer
            byte[] bytes1 = penc.GetBytes(chars1);
            byte[] bytes2 = new byte[penc.GetByteCount(chars1, 0, chars1.Length)];
            penc.GetBytes(chars1, 0, chars1.Length, bytes2, 0);
            Assert.That(bytes1, Is.EqualTo(bytes2));

            // the full round trip must reproduce the original bytes
            Assert.That(bytes, Is.EqualTo(bytes1));
        }

        #endregion

        #region Tests comparing with CPython results

        // Test 256 bytes sequence
        public class CPythonCompare256Tests {
            private byte[] bytes;

            [OneTimeSetUp]
            public void OneTimeSetUp() {
#if NETCOREAPP
                Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            }

            [SetUp]
            public void SetUp() {
                // every possible byte value, in ascending order
                bytes = Enumerable.Range(0, 256).Select(c => (byte)c).ToArray();
            }

            // Compare ASCII handling with CPython results
            [Test]
            public void TestCompare256WithAscii() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.ASCII);
                char[] chars = penc.GetChars(bytes);

                string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff";

                Assert.That(chars, Is.EqualTo(python_chars));
            }

            // Compare Latin-1 handling with CPython results
            [Test]
            public void TestCompare256WithLatin1() {
                Encoding penc = new PythonSurrogateEscapeEncoding(StringOps.Latin1Encoding);
                char[] chars = penc.GetChars(bytes);

                string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

                Assert.That(chars, Is.EqualTo(python_chars));
            }

            // Compare UTF-8 handling with CPython results
            [Test]
            public void TestCompare256WithUtf8() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.UTF8);
                char[] chars = penc.GetChars(bytes);

                string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff";

                Assert.That(chars, Is.EqualTo(python_chars));
            }

            // Compare Windows-1252 (Western European Windows, variant of ISO-8859-1) handling with CPython results
            [Test]
            public void TestCompare256WithWindows1252() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.GetEncoding(1252));
                Assert.That(penc.WebName.ToLowerInvariant(), Is.EqualTo("windows-1252"));
                char[] chars = penc.GetChars(bytes);

                string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f€\udc81‚ƒ„…†‡ˆ‰Š‹Œ\udc8dŽ\udc8f\udc90‘’“”•–—˜™š›œ\udc9džŸ\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

                string encoded = new string(chars);
                Assert.That(encoded.Length, Is.EqualTo(python_chars.Length));

                // A mismatch is tolerated only at the byte values listed below
                for (int i = 0; i < encoded.Length; i++) {
                    if (encoded[i] != python_chars[i]) {
                        // Known differences between Windows and Python (Unicode) implementation of Windows-1252
                        // https://en.wikipedia.org/wiki/Windows-1252
                        Assert.That(new[] { 0x81, 0x8d, 0x8f, 0x90, 0x9d }, Has.Member(i));
                    }
                }
            }

            // Compare ISO-8859-1 (Western European) handling with CPython results
            [Test]
            public void TestCompare256WithIso8859_1() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.GetEncoding(28591));
                Assert.That(penc.WebName, Is.EqualTo("iso-8859-1"));
                char[] chars = penc.GetChars(bytes);

                string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

                string encoded = new string(chars);
                Assert.That(encoded, Is.EqualTo(python_chars));
            }

            // Compare UTF-7 handling with CPython results
            [Test]
            public void TestCompare256WithUtf7() {
                Encoding utf7 = new UTF7Encoding(allowOptionals: true);
                Encoding penc = new PythonSurrogateEscapeEncoding(utf7);

                // The following Python output is produced with python 3.4 but is not correct: it is missing the '+' character
                string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff";

                // Our implementation will refuse (correctly) to decode because the ',' after '+' is not valid thus requires escaping,
                // but escaping of chars under 128 is not allowed.
                Assert.Throws<DecoderFallbackException>(() => penc.GetChars(bytes));

                // Let's try again without the '+'
                bytes = bytes.Where(i => i != (byte)'+').ToArray();
                char[] chars = penc.GetChars(bytes);
                Assert.That(chars, Is.EqualTo(python_chars));

                // Now the encoding part
                byte[] encoded_bytes = penc.GetBytes(chars);
                byte[] expected_bytes = "+AAAAAQACAAMABAAFAAYABwAI-\t\n+AAsADA-\r+AA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAf- !\"#$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[+AFw-]^_`abcdefghijklmnopqrstuvwxyz{|}+AH4Af9yA3IHcgtyD3ITchdyG3IfciNyJ3Irci9yM3I3cjtyP3JDckdyS3JPclNyV3Jbcl9yY3Jncmtyb3Jzcndye3J/coNyh3KLco9yk3KXcptyn3Kjcqdyq3KvcrNyt3K7cr9yw3LHcstyz3LTctdy23LfcuNy53Lrcu9y83L3cvty/3MDcwdzC3MPcxNzF3Mbcx9zI3MncytzL3MzczdzO3M/c0NzR3NLc09zU3NXc1tzX3Njc2dza3Nvc3Nzd3N7c39zg3OHc4tzj3OTc5dzm3Ofc6Nzp3Orc69zs3O3c7tzv3PDc8dzy3PPc9Nz13Pbc99z43Pnc+tz73Pzc/dz+3P8-"
                    .Select(c => (byte)c).ToArray();
                Assert.That(encoded_bytes, Is.EqualTo(expected_bytes));

                // Encoding the given chars with CPython produces the following byte string
                byte[] python_bytes = "+AAAAAQACAAMABAAFAAYABwAI\t\n+AAsADA\r+AA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAf !\"#$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[+AFw]^_`abcdefghijklmnopqrstuvwxyz{|}+AH4Af9yA3IHcgtyD3ITchdyG3IfciNyJ3Irci9yM3I3cjtyP3JDckdyS3JPclNyV3Jbcl9yY3Jncmtyb3Jzcndye3J/coNyh3KLco9yk3KXcptyn3Kjcqdyq3KvcrNyt3K7cr9yw3LHcstyz3LTctdy23LfcuNy53Lrcu9y83L3cvty/3MDcwdzC3MPcxNzF3Mbcx9zI3MncytzL3MzczdzO3M/c0NzR3NLc09zU3NXc1tzX3Njc2dza3Nvc3Nzd3N7c39zg3OHc4tzj3OTc5dzm3Ofc6Nzp3Orc69zs3O3c7tzv3PDc8dzy3PPc9Nz13Pbc99z43Pnc+tz73Pzc/dz+3P8-"
                    .Select(c => (byte)c).ToArray();

                // The sequences expected_bytes and python_bytes are NOT equal: .NET ends encoded blocks (starting with '+') with '-'
                // Terminating encoded blocks with '-' is optional if not ambiguous.
                // CPython doesn't terminate blocks with '-' if not mandatory, resulting in a more compact encoding.
                // However, they both decode to the same text, although, again, CPython's version cannot be decoded using surrogateescape
                char[] dotnet_decoded = penc.GetChars(encoded_bytes);
                char[] python_decoded = utf7.GetChars(python_bytes);
                Assert.That(chars, Is.EqualTo(python_decoded));
                Assert.That(chars, Is.EqualTo(dotnet_decoded));

                // the plain (unwrapped) UTF-7 codec must decode the .NET output to the same text as well
                dotnet_decoded = utf7.GetChars(encoded_bytes);
                Assert.That(chars, Is.EqualTo(dotnet_decoded));
            }

            // Compare UTF-16 handling with CPython results
            [Test]
            public void TestCompare256Utf16() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.Unicode);
                char[] chars = penc.GetChars(bytes);

                // Code points; values above 0xffff are expanded into a surrogate pair below
                char[] python_chars = (new[] {
                        0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e,
                        0x1110, 0x1312, 0x1514, 0x1716, 0x1918, 0x1b1a, 0x1d1c, 0x1f1e,
                        0x2120, 0x2322, 0x2524, 0x2726, 0x2928, 0x2b2a, 0x2d2c, 0x2f2e,
                        0x3130, 0x3332, 0x3534, 0x3736, 0x3938, 0x3b3a, 0x3d3c, 0x3f3e,
                        0x4140, 0x4342, 0x4544, 0x4746, 0x4948, 0x4b4a, 0x4d4c, 0x4f4e,
                        0x5150, 0x5352, 0x5554, 0x5756, 0x5958, 0x5b5a, 0x5d5c, 0x5f5e,
                        0x6160, 0x6362, 0x6564, 0x6766, 0x6968, 0x6b6a, 0x6d6c, 0x6f6e,
                        0x7170, 0x7372, 0x7574, 0x7776, 0x7978, 0x7b7a, 0x7d7c, 0x7f7e,
                        0x8180, 0x8382, 0x8584, 0x8786, 0x8988, 0x8b8a, 0x8d8c, 0x8f8e,
                        0x9190, 0x9392, 0x9594, 0x9796, 0x9998, 0x9b9a, 0x9d9c, 0x9f9e,
                        0xa1a0, 0xa3a2, 0xa5a4, 0xa7a6, 0xa9a8, 0xabaa, 0xadac, 0xafae,
                        0xb1b0, 0xb3b2, 0xb5b4, 0xb7b6, 0xb9b8, 0xbbba, 0xbdbc, 0xbfbe,
                        0xc1c0, 0xc3c2, 0xc5c4, 0xc7c6, 0xc9c8, 0xcbca, 0xcdcc, 0xcfce,
                        0xd1d0, 0xd3d2, 0xd5d4, 0xd7d6, 0xdcd8, 0xdcd9, 0x1069dc, 0xdcde, 0xdcdf,
                        0xe1e0, 0xe3e2, 0xe5e4, 0xe7e6, 0xe9e8, 0xebea, 0xedec, 0xefee,
                        0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6, 0xf9f8, 0xfbfa, 0xfdfc, 0xfffe
                    })
                    .SelectMany(i => i <= 0xffff ? ((char)i).ToString() : char.ConvertFromUtf32(i)).ToArray();
                Assert.That(python_chars, Is.EqualTo(chars));

                // byte[] python_bytes = ??? - CPython fails to encode the string it decoded itself; a bug in CPython?
                byte[] bytes1 = penc.GetBytes(chars);
                Assert.That(bytes, Is.EqualTo(bytes1));
            }
        }

        // Test sequence with surrogates
        public class CPythonCompareSurrogateTests {
            private byte[] bytes;

            [SetUp]
            public void SetUp() {
                // In UTF-16LE: Lone high surrogate (invalid), surrogate pair: high-low (valid), lone low surrogate (invalid)
                bytes = new byte[] { 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf };
            }

            [Test]
            public void TesWithUtf16() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.Unicode);
                char[] chars = penc.GetChars(bytes);

                // Code points; values above 0xffff are expanded into a surrogate pair below
                char[] python_chars = (new[] { 0x0000dcd8, 0x0000dcd9, 0x001069dc, 0x0000dcde, 0x0000dcdf })
                    .SelectMany(i => i <= 0xffff ? ((char)i).ToString() : char.ConvertFromUtf32(i)).ToArray();
                Assert.That(python_chars, Is.EqualTo(chars));

                // byte[] python_bytes = ??? - CPython fails on encoding the string it decoded itself; a bug in CPython?
                byte[] bytes1 = penc.GetBytes(chars);
                Assert.That(bytes, Is.EqualTo(bytes1));
            }

            [Test]
            public void TestWithUtf32() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.UTF32);
                char[] chars = penc.GetChars(bytes);

                // Code points; values above 0xffff are expanded into a surrogate pair below
                char[] python_chars = (new[] { 0x0000dcd8, 0x0000dcd9, 0x0000dcda, 0x0000dcdb, 0x0000dcdc, 0x0000dcdd, 0x0000dcde, 0x0000dcdf })
                    .SelectMany(i => i <= 0xffff ? ((char)i).ToString() : char.ConvertFromUtf32(i)).ToArray();
                Assert.That(python_chars, Is.EqualTo(chars));

                // byte[] python_bytes = ??? - CPython fails on encoding the string it decoded itself; a bug in CPython?
                byte[] bytes1 = penc.GetBytes(chars);
                Assert.That(bytes, Is.EqualTo(bytes1));
            }
        }

        #endregion

        // Test incremental (block-wise) decoding/encoding
        public class IncrementalTests {
            private byte[] _bytes;

            [SetUp]
            public void SetUp() {
                // In UTF-16LE: Lone high surrogate (invalid), surrogate pair: high-low (valid), lone low surrogate (invalid)
                _bytes = new byte[] { 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf };
            }

            [Test]
            public void TestIncrementalWithAscii() {
                // intersperse with ASCII letters
                _bytes = _bytes.SelectMany((b, i) => new[] { (byte)('A' + i), b }).Concat(new[] { (byte)'Z' }).ToArray();

                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.ASCII);
                SurrogateTestHelpers.IncrementalTest(penc, _bytes, roundTrip: true);
            }

            [Test]
            public void TestIncrementalWithLatin1() {
                // intersperse with ASCII letters
                _bytes = _bytes.SelectMany((b, i) => new[] { (byte)('A' + i), b }).Concat(new[] { (byte)'Z' }).ToArray();

                Encoding penc = new PythonSurrogateEscapeEncoding(StringOps.Latin1Encoding);
                SurrogateTestHelpers.IncrementalTest(penc, _bytes, roundTrip: true);
            }

            [Test]
            public void TestIncrementalWithUtf16() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.Unicode);
                SurrogateTestHelpers.IncrementalTest(penc, _bytes, roundTrip: true);
            }

            [Test]
            public void TestIncrementalWithUtf32() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.UTF32);
                SurrogateTestHelpers.IncrementalTest(penc, _bytes, roundTrip: true);
            }

            [Test]
            public void TestIncrementalWithUtf8() {
                // In UTF-8: Lone high surrogate (invalid), surrogate pair: high-low (valid), lone low surrogate (invalid)
                _bytes = new byte[] { 0xed, 0xa7, 0x98, 0xed, 0xaf, 0x9a, 0xed, 0xb7, 0x9c, 0xed, 0xbf, 0x9e };

                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.UTF8);
                SurrogateTestHelpers.IncrementalTest(penc, _bytes, roundTrip: true);
            }
        }

        public class EndiannessTests {
            private byte[] _bytes1, _bytes2;

            // U+0A00 is an unassigned character, U+000A is LF
            [SetUp]
            public void SetUp() {
                _bytes1 = new byte[] { 0x0a, 0x00, 0x00, 0x00 };
                // the same four bytes in reverse order
                _bytes2 = _bytes1.Reverse().ToArray();
            }

            [Test]
            public void TestEndiannessWithUtf16LE() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.Unicode);
                Assert.That("\u000a\u0000", Is.EqualTo(penc.GetChars(_bytes1)));
                Assert.That("\u0000\u0a00", Is.EqualTo(penc.GetChars(_bytes2)));
            }

            [Test]
            public void TestEndiannessWithUtf16BE() {
                Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.BigEndianUnicode);
                Assert.That("\u0a00\u0000", Is.EqualTo(penc.GetChars(_bytes1)));
                Assert.That("\u0000\u000a", Is.EqualTo(penc.GetChars(_bytes2)));
            }

            [Test]
            public void TestEndiannessWithUtf32LE() {
                Encoding penc = new PythonSurrogateEscapeEncoding(new UTF32Encoding(bigEndian: false, byteOrderMark: false));
                Assert.That("\u000a", Is.EqualTo(penc.GetChars(_bytes1)));
                Assert.Throws<DecoderFallbackException>(() => penc.GetChars(_bytes2));
            }

            [Test]
            public void TestEndiannessWithUtf32BE() {
                Encoding penc = new PythonSurrogateEscapeEncoding(new UTF32Encoding(bigEndian: true, byteOrderMark: false));
                Assert.Throws<DecoderFallbackException>(() => penc.GetChars(_bytes1));
                Assert.That("\u000a", Is.EqualTo(penc.GetChars(_bytes2)));
            }
        }

        public class AsciiByteTests {
            private char[] _chars;

            [SetUp]
            public void SetUp() {
                // surrogate escape carrying byte < 128 is not allowed
                _chars = "+++\udc41++".ToCharArray();
            }

            [Test] public void TestAsciiByteWithUtf8() => TestAsciiByte(Encoding.UTF8, 1);
            [Test] public void TestAsciiByteWithUtf16LE() => TestAsciiByte(Encoding.Unicode, 2);
            [Test] public void TestAsciiByteWithUtf16BE() => TestAsciiByte(Encoding.BigEndianUnicode, 2);
            [Test] public void TestAsciiByteWithUtf32LE() => TestAsciiByte(new UTF32Encoding(bigEndian: false, byteOrderMark: false), 4);
            [Test] public void TestAsciiByteWithUtf32BE() => TestAsciiByte(new UTF32Encoding(bigEndian: true, byteOrderMark: false), 4);

            // Checks that encoding _chars with the wrapped codec fails at the escape character (index 3),
            // both in one step and block-wise through an Encoder.
            // charWidth is the number of bytes the codec is expected to produce per '+' character.
            public void TestAsciiByte(Encoding codec, int charWidth) {
                Encoding penc = new PythonSurrogateEscapeEncoding(codec);

                // one-step encoding
                Assert.That(() => penc.GetBytes(_chars),
                    Throws.TypeOf<EncoderFallbackException>()
                    .With.Property("Index").EqualTo(3)
                    .And.Property("CharUnknown").EqualTo(_chars[3]));

                var enc = penc.GetEncoder();
                var bytes = new byte[_chars.Length * charWidth];

                // the first three chars encode cleanly
                Assert.That(enc.GetByteCount(_chars, 0, 3, flush: false), Is.EqualTo(3 * charWidth));
                Assert.That(() => enc.GetBytes(_chars, 0, 3, bytes, 0, flush: false), Throws.Nothing);

                // the next block starts with the offending char, so the reported index is 0
                Assert.That(() => enc.GetBytes(_chars, 3, 2, bytes, 3 * charWidth, flush: false),
                    Throws.TypeOf<EncoderFallbackException>()
                    .With.Property("Index").EqualTo(0)
                    .And.Property("CharUnknown").EqualTo(_chars[3]));

                // after a reset, a block starting at 0 reports index 3 again
                enc.Reset();
                Assert.That(() => enc.GetBytes(_chars, 0, 5, bytes, 3 * charWidth, flush: false),
                    Throws.TypeOf<EncoderFallbackException>()
                    .With.Property("Index").EqualTo(3)
                    .And.Property("CharUnknown").EqualTo(_chars[3]));
            }
        }
    }

    // Unit testing class PythonSurrogatePassEncoding
    [TestFixture(Category = "IronPython")]
    public class SurrogatePassTest {

        public class EncodingTests {

            [Test]
            public void TestAscii() {
                // 'surrogatepass' is supported only for UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE
                // nevertheless, it can be used with other encodings as long as there are no encoding errors
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.ASCII);

                // clean ASCII
                Assert.That(penc.GetBytes("abc"), Is.EqualTo("abc".AsBytes()));

                // Attempting to encode surrogates to ASCII will throw an exception.
                // Note that this is CPython 3.5 behaviour, CPython 3.4 will happily contaminate ASCII with UTF-8 encoded surrogates.

                // lone high surrogate
                Assert.Throws<EncoderFallbackException>(() => penc.GetBytes("\ud810"));

                // lone low surrogate
                Assert.Throws<EncoderFallbackException>(() => penc.GetBytes("\udc0a"));

                // invalid surrogate pair (low, high)
                Assert.Throws<EncoderFallbackException>(() => penc.GetBytes("\ude51\uda2f"));
            }

            [Test]
            public void TestUtf7() {
                // "surrogatepass" is not supported for UTF-7 per se,
                // but UTF-7 is supposed to encode any surrogate characters into its ASCII mangled form
                // without requiring any fallback support
                Encoding penc = new PythonSurrogatePassEncoding(new UTF7Encoding(allowOptionals: true));

                // lone high surrogate
                Assert.That(penc.GetBytes("abc\ud810xyz"), Is.EqualTo("abc+2BA-xyz".AsBytes()));

                // lone low surrogate
                Assert.That(penc.GetBytes("abc\udc0axyz"), Is.EqualTo("abc+3Ao-xyz".AsBytes()));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetBytes("abc\ude51\uda2fxyz"), Is.EqualTo("abc+3lHaLw-xyz".AsBytes()));
            }

            [Test]
            public void TestUtf8() {
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.UTF8);

                // lone high surrogate
                Assert.That(penc.GetBytes("abc\ud810xyz"), Is.EqualTo("abc\xed\xa0\x90xyz".AsBytes()));

                // lone low surrogate
                Assert.That(penc.GetBytes("abc\udc0axyz"), Is.EqualTo("abc\xed\xb0\x8axyz".AsBytes()));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetBytes("abc\ude51\uda2fxyz"), Is.EqualTo("abc\xed\xb9\x91\xed\xa8\xafxyz".AsBytes()));
            }

            [Test]
            public void TestUtf16LE() {
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.Unicode);

                // lone high surrogate
                Assert.That(penc.GetBytes("\ud810"), Is.EqualTo("\x10\xd8".AsBytes()));

                // lone low surrogate
                Assert.That(penc.GetBytes("\udc0a"), Is.EqualTo("\n\xdc".AsBytes()));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetBytes("\ude51\uda2f"), Is.EqualTo("Q\xde/\xda".AsBytes()));
            }

            [Test]
            public void TestUtf16BE() {
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.BigEndianUnicode);

                // lone high surrogate
                Assert.That(penc.GetBytes("\ud810"), Is.EqualTo("\xd8\x10".AsBytes()));

                // lone low surrogate
                Assert.That(penc.GetBytes("\udc0a"), Is.EqualTo("\xdc\n".AsBytes()));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetBytes("\ude51\uda2f"), Is.EqualTo("\xdeQ\xda/".AsBytes()));
            }

            [Test]
            public void TestUtf32LE() {
                Encoding penc = new PythonSurrogatePassEncoding(new UTF32Encoding(bigEndian: false, byteOrderMark: false));

                // lone high surrogate
                Assert.That(penc.GetBytes("\ud810"), Is.EqualTo("\x10\xd8\x00\x00".AsBytes()));

                // lone low surrogate
                Assert.That(penc.GetBytes("\udc0a"), Is.EqualTo("\n\xdc\x00\x00".AsBytes()));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetBytes("\ude51\uda2f"), Is.EqualTo("Q\xde\x00\x00/\xda\x00\x00".AsBytes()));
            }

            [Test]
            public void TestUtf32BE() {
                Encoding penc = new PythonSurrogatePassEncoding(new UTF32Encoding(bigEndian: true, byteOrderMark: false));

                // lone high surrogate
                Assert.That(penc.GetBytes("\ud810"), Is.EqualTo("\x00\x00\xd8\x10".AsBytes()));

                // lone low surrogate
                Assert.That(penc.GetBytes("\udc0a"), Is.EqualTo("\x00\x00\xdc\n".AsBytes()));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetBytes("\ude51\uda2f"), Is.EqualTo("\x00\x00\xdeQ\x00\x00\xda/".AsBytes()));
            }
        }

        public class DecodingTests {

            [Test]
            public void TestAscii() {
                // 'surrogatepass' is supported only for UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE
                // nevertheless, it can be used with other encodings as long as there are no encoding errors
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.ASCII);

                // clean ASCII
                Assert.That(penc.GetChars("abc".AsBytes()), Is.EqualTo("abc"));

                // Attempting to decode surrogates from ASCII will throw an exception.
                // Note that this is CPython 3.5 behaviour, CPython 3.4 will blindly extract UTF-8 encoded surrogates from ASCII.

                // lone high surrogate in UTF-8
                Assert.Throws<DecoderFallbackException>(() => penc.GetChars("\xed\xa0\x90".AsBytes()));

                // lone low surrogate in UTF-8
                Assert.Throws<DecoderFallbackException>(() => penc.GetChars("\xed\xb0\x8a".AsBytes()));

                // invalid surrogate pair (low, high) in UTF-8
                Assert.Throws<DecoderFallbackException>(() => penc.GetChars("\xed\xb9\x91\xed\xa8\xaf".AsBytes()));
            }

            [Test]
            public void TestUtf7() {
                // "surrogatepass" is not supported for UTF-7 per se,
                // but UTF-7 is supposed to decode any surrogate characters from its ASCII mangled form
                // without requiring any fallback support
                Encoding penc = new PythonSurrogatePassEncoding(new UTF7Encoding(allowOptionals: true));

                // lone high surrogate
                Assert.That(penc.GetChars("abc+2BA-xyz".AsBytes()), Is.EqualTo("abc\ud810xyz"));

                // lone low surrogate
                Assert.That(penc.GetChars("abc+3Ao-xyz".AsBytes()), Is.EqualTo("abc\udc0axyz"));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetChars("abc+3lHaLw-xyz".AsBytes()), Is.EqualTo("abc\ude51\uda2fxyz"));
            }

            [Test]
            public void TestUtf8() {
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.UTF8);

                // lone high surrogate
                Assert.That(penc.GetChars("abc\xed\xa0\x90xyz".AsBytes()), Is.EqualTo("abc\ud810xyz"));

                // lone low surrogate
                Assert.That(penc.GetChars("abc\xed\xb0\x8axyz".AsBytes()), Is.EqualTo("abc\udc0axyz"));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetChars("abc\xed\xb9\x91\xed\xa8\xafxyz".AsBytes()), Is.EqualTo("abc\ude51\uda2fxyz"));

                // valid surrogate pair (high, low)
                Assert.That(penc.GetChars("abc\xed\xa8\xaf\xed\xb9\x91xyz".AsBytes()), Is.EqualTo("abc\uda2f\ude51xyz"));

                var chars = new char[9];

                // broken lone high surrogate
                var bytes = "abc\xed-\xa0\x90xyz".AsBytes();
                Assert.That(() => penc.GetChars(bytes),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(3)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                // block-wise: the failure is reported in the second block, with a negative index
                var dec = penc.GetDecoder();
                Assert.That(dec.GetCharCount(bytes, 0, 4, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 4, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 4, 4, flush: false),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(-1)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                // broken in a different way
                bytes = "abc\xed\xa0-\x90xyz".AsBytes();
                Assert.That(() => penc.GetChars(bytes),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(3)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 4, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 4, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 4, 4, flush: false),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(-1)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 5, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 5, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 5, 3, flush: false),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(-2)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                // unfinished surrogate sequence in the middle
                bytes = "abc\xed\xa0xyz".AsBytes();
                Assert.That(() => penc.GetChars(bytes),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(3)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 5, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 5, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 5, 2, flush: false),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(-2)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                // unfinished surrogate sequence at the end
                bytes = "abcxyz\xed\xa0".AsBytes();
                Assert.That(() => penc.GetChars(bytes),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(6)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));

                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 7, flush: false), Is.EqualTo(6));
                Assert.That(dec.GetChars(bytes, 0, 7, chars, 0, flush: false), Is.EqualTo(6));
                Assert.That(() => dec.GetCharCount(bytes, 7, 1, flush: true),
                    Throws.TypeOf<DecoderFallbackException>()
                    .With.Property("Index").EqualTo(-1)
                    .And.Property("BytesUnknown").One.EqualTo(0xed));
            }

            [Test]
            public void TestUtf16LE() {
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.Unicode);

                // lone high surrogate
                Assert.That(penc.GetChars("\x10\xd8".AsBytes()), Is.EqualTo("\ud810"));

                // lone low surrogate
                Assert.That(penc.GetChars("\n\xdc".AsBytes()), Is.EqualTo("\udc0a"));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetChars("Q\xde/\xda".AsBytes()), Is.EqualTo("\ude51\uda2f"));
            }

            [Test]
            public void TestUtf16BE() {
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.BigEndianUnicode);

                // lone high surrogate
                Assert.That(penc.GetChars("\xd8\x10".AsBytes()), Is.EqualTo("\ud810"));

                // lone low surrogate
                Assert.That(penc.GetChars("\xdc\n".AsBytes()), Is.EqualTo("\udc0a"));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetChars("\xdeQ\xda/".AsBytes()), Is.EqualTo("\ude51\uda2f"));
            }

            [Test]
            public void TestUtf32LE() {
                Encoding penc = new PythonSurrogatePassEncoding(new UTF32Encoding(bigEndian: false, byteOrderMark: false));

                // lone high surrogate
                Assert.That(penc.GetChars("\x10\xd8\x00\x00".AsBytes()), Is.EqualTo("\ud810"));

                // lone low surrogate
                Assert.That(penc.GetChars("\n\xdc\x00\x00".AsBytes()), Is.EqualTo("\udc0a"));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetChars("Q\xde\x00\x00/\xda\x00\x00".AsBytes()), Is.EqualTo("\ude51\uda2f"));
            }

            [Test]
            public void TestUtf32BE() {
                Encoding penc = new PythonSurrogatePassEncoding(new UTF32Encoding(bigEndian: true, byteOrderMark: false));

                // lone high surrogate
                Assert.That(penc.GetChars("\x00\x00\xd8\x10".AsBytes()), Is.EqualTo("\ud810"));

                // lone low surrogate
                Assert.That(penc.GetChars("\x00\x00\xdc\n".AsBytes()), Is.EqualTo("\udc0a"));

                // invalid surrogate pair (low, high)
                Assert.That(penc.GetChars("\x00\x00\xdeQ\x00\x00\xda/".AsBytes()), Is.EqualTo("\ude51\uda2f"));
            }
        }

        // Test incremental (block-wise) decoding/encoding
        public class IncrementalTests {

            [Test]
            public void TestIncrementalWithUtf16() {
                // In UTF-16LE: Lone high surrogate (invalid), surrogate pair: high-low (valid), lone low surrogate (invalid)
                var bytes = new byte[] { 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf };

                Encoding penc = new PythonSurrogatePassEncoding(Encoding.Unicode);
                SurrogateTestHelpers.IncrementalTest(penc, bytes, roundTrip: false);
            }

            [Test]
            public void TestIncrementalWithUtf32() {
                var bytes = new byte[] { 0xd8, 0xd9, 0x00, 0x00, 0xda, 0xdb, 0x00, 0x00, 0xdc, 0xdd, 0x00, 0x00, 0xde, 0xdf, 0x00, 0x00 };

                Encoding penc = new PythonSurrogatePassEncoding(Encoding.UTF32);
                SurrogateTestHelpers.IncrementalTest(penc, bytes, roundTrip: false);
            }

            [Test]
            public void TestIncrementalWithUtf8() {
                // In UTF-8: Lone high surrogate (invalid), surrogate pair: high-low (valid), lone low surrogate (invalid)
                var bytes = new byte[] { 0xed, 0xa7, 0x98, 0xed, 0xaf, 0x9a, 0xed, 0xb7, 0x9c, 0xed, 0xbf, 0x9e };

                Encoding penc = new PythonSurrogatePassEncoding(Encoding.UTF8);
                SurrogateTestHelpers.IncrementalTest(penc, bytes, roundTrip: false);
            }
        }
    }

    #region Helper methods

    public static class SurrogateTestHelpers {

        // Builds a byte array by casting every char of s to a byte (one byte per char).
        public static byte[] AsBytes(this string s) => s.Select(c => (byte)c).ToArray();

        // Decodes and encodes in two blocks, trying every possible split position,
        // and checks that the concatenated output equals the one-step result.
        // When roundTrip is true the expected bytes are the input bytes themselves;
        // otherwise they are whatever penc produces from the one-step decoded chars.
        public static void IncrementalTest(Encoding penc, byte[] bytes, bool roundTrip) {
            // Reference for comparisons: chars encoded in one step
            char[] expChars = penc.GetChars(bytes);
            byte[] expBytes = roundTrip ? bytes : penc.GetBytes(expChars);

            for (int splitBytesAt = 0; splitBytesAt <= expBytes.Length; splitBytesAt += 1) {
                // From https://docs.microsoft.com/en-us/dotnet/api/system.text.decoder.getchars?view=netframework-4.5:
                // The application should call GetCharCount on a block of data immediately before calling GetChars on the same block,
                // so that any trailing bytes from the previous block are included in the calculation.
                var dec = penc.GetDecoder();

                // first block: [0, splitBytesAt), not flushed
                char[] chars1 = new char[dec.GetCharCount(expBytes, 0, splitBytesAt, flush: false)];
                dec.GetChars(expBytes, 0, splitBytesAt, chars1, 0, flush: false);

                // second block: the remainder, flushed
                char[] chars2 = new char[dec.GetCharCount(expBytes, splitBytesAt, expBytes.Length - splitBytesAt, flush: true)];
                dec.GetChars(expBytes, splitBytesAt, expBytes.Length - splitBytesAt, chars2, 0, flush: true);

                char[] total_chars = chars1.Concat(chars2).ToArray();
                Assert.That(total_chars, Is.EqualTo(expChars), $"Splitting bytes at {splitBytesAt}");

                for (int splitCharsAt = 0; splitCharsAt <= total_chars.Length; splitCharsAt += 1) {
                    // From https://docs.microsoft.com/en-us/dotnet/api/system.text.encoder.getbytecount?view=netframework-4.5:
                    // The application should call GetByteCount on a block of data immediately before calling GetBytes on the same block,
                    // so that any trailing characters from the previous block are included in the calculation.
                    var enc = penc.GetEncoder();

                    // first block: [0, splitCharsAt), not flushed
                    byte[] bytes1 = new byte[enc.GetByteCount(total_chars, 0, splitCharsAt, flush: false)];
                    enc.GetBytes(total_chars, 0, splitCharsAt, bytes1, 0, flush: false);

                    // second block: the remainder, flushed
                    byte[] bytes2 = new byte[enc.GetByteCount(total_chars, splitCharsAt, total_chars.Length - splitCharsAt, flush: true)];
                    enc.GetBytes(total_chars, splitCharsAt, total_chars.Length - splitCharsAt, bytes2, 0, flush: true);

                    byte[] total_bytes = bytes1.Concat(bytes2).ToArray();
                    Assert.That(total_bytes, Is.EqualTo(expBytes), $"Splitting chars at {splitCharsAt}");
                }
            }
        }
    }

    #endregion
}

X Tutup