Skip to content

Commit 7848ca5

Browse files
BCSharp authored and slozier committed
Implement error handling for (raw-)unicode-escape codec (#694)
* Implement error handling for (raw-)unicode-escape codec
* Enable additional stdlib tests
* Throw NotImplementedException for unsupported cases
1 parent 64403aa commit 7848ca5

5 files changed

Lines changed: 160 additions & 94 deletions

File tree

Src/IronPython.Modules/_codecs.cs

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -179,21 +179,21 @@ public static PythonTuple escape_decode([BytesConversion]IList<byte> data, strin
179179

180180
return PythonTuple.MakeTuple(Bytes.Make(res.ToArray()), data.Count);
181181

182-
LiteralParser.ParseBytesErrorHandler<byte> getErrorHandler(string errors) {
183-
if (errors == null) return null;
182+
static LiteralParser.ParseBytesErrorHandler<byte> getErrorHandler(string errors) {
183+
if (errors == null) return default;
184+
184185
Func<int, IReadOnlyList<byte>> eh = null;
185-
return errorHandler;
186186

187-
IReadOnlyList<byte> errorHandler(IList<byte> data, int start, int end) {
187+
return delegate (IList<byte> data, int start, int end) {
188188
eh ??= errors switch
189189
{
190-
"strict" => idx => throw PythonOps.ValueError("invalid \\x escape at position {0}", idx),
190+
"strict" => idx => throw PythonOps.ValueError(@"invalid \x escape at position {0}", idx),
191191
"replace" => idx => _replacementMarker ??= new[] { (byte)'?' },
192192
"ignore" => idx => null,
193193
_ => idx => throw PythonOps.ValueError("decoding error; unknown error handling code: " + errors),
194194
};
195195
return eh(start);
196-
}
196+
};
197197
}
198198
}
199199
private static byte[] _replacementMarker;
@@ -209,7 +209,7 @@ IReadOnlyList<byte> errorHandler(IList<byte> data, int start, int end) {
209209
case (byte)'\'': buf.Add((byte)'\\'); buf.Add((byte)'\''); break;
210210
default:
211211
if (b < 0x20 || b >= 0x7f) {
212-
buf.AddRange($"\\x{b:x2}".Select(c => (byte)c));
212+
buf.AddRange($"\\x{b:x2}".Select(c => unchecked((byte)c)));
213213
} else {
214214
buf.Add(b);
215215
}

Src/IronPython/Runtime/LiteralParser.cs

Lines changed: 57 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
using System.Collections.Generic;
77
using System.Diagnostics;
88
using System.Globalization;
9+
using System.Linq;
910
using System.Numerics;
1011
using System.Text;
1112

@@ -17,7 +18,9 @@
1718

1819
namespace IronPython.Runtime {
1920
public static class LiteralParser {
20-
public static string ParseString(char[] text, int start, int length, bool isRaw, bool isUniEscape, bool normalizeLineEndings) {
21+
internal delegate IReadOnlyList<char> ParseStringErrorHandler<T>(IList<T> data, int start, int end);
22+
23+
internal static string ParseString(char[] text, int start, int length, bool isRaw, bool isUniEscape, bool normalizeLineEndings) {
2124
Debug.Assert(text != null);
2225
Debug.Assert(start + length <= text.Length);
2326

@@ -28,32 +31,33 @@ public static string ParseString(char[] text, int start, int length, bool isRaw,
2831
return result ?? new string(text, start, length);
2932
}
3033

31-
public static string ParseString(byte[] bytes, int start, int length, bool isRaw) {
34+
internal static string ParseString(byte[] bytes, int start, int length, bool isRaw, ParseStringErrorHandler<byte> errorHandler) {
3235
Debug.Assert(bytes != null);
3336
Debug.Assert(start + length <= bytes.Length);
3437

35-
string result = DoParseString(bytes, start, length, isRaw, isUniEscape: true, normalizeLineEndings: false);
38+
string result = DoParseString(bytes, start, length, isRaw, isUniEscape: true, normalizeLineEndings: false, errorHandler);
3639

3740
return result ?? bytes.MakeString(start, length);
3841
}
3942

40-
private static string DoParseString<T>(T[] data, int start, int length, bool isRaw, bool isUniEscape, bool normalizeLineEndings) where T : IConvertible {
43+
private static string DoParseString<T>(T[] data, int start, int length, bool isRaw, bool isUniEscape, bool normalizeLineEndings, ParseStringErrorHandler<T> errorHandler = default) where T : IConvertible {
44+
Bytes bytesData = null;
4145
StringBuilder buf = null;
4246
int i = start;
4347
int l = start + length;
4448
int val;
4549
while (i < l) {
4650
char ch = data[i++].ToChar(null);
4751
if ((!isRaw || isUniEscape) && ch == '\\') {
48-
StringBuilderInit(ref buf, data, start, i - start - 1, length);
52+
StringBuilderInit(ref buf, data, start, i - 1, length);
4953

5054
if (i >= l) {
5155
if (isRaw) {
5256
buf.Append('\\');
53-
break;
5457
} else {
55-
throw PythonOps.ValueError("Trailing \\ in string");
58+
handleError(i - start - 1, i - start, "\\ at end of string");
5659
}
60+
break;
5761
}
5862
ch = data[i++].ToChar(null);
5963

@@ -62,29 +66,16 @@ private static string DoParseString<T>(T[] data, int start, int length, bool isR
6266
int max = 16;
6367
if (TryParseInt(data, i, len, max, out val, out int consumed)) {
6468
if (val < 0 || val > 0x10ffff) {
65-
throw PythonExceptions.CreateThrowable(
66-
PythonExceptions.UnicodeDecodeError,
67-
isRaw ? "rawunicodeescape" : "unicodeescape",
68-
data is byte[] byteData ? new Bytes(byteData) : Bytes.Empty,
69-
i - start - 2, i - start + consumed,
70-
isRaw ? "\\Uxxxxxxxx out of range" : "illegal Unicode character");
71-
}
72-
73-
if (val < 0x010000) {
69+
handleError(i - start - 2, i - start + consumed, isRaw ? @"\Uxxxxxxxx out of range" : "illegal Unicode character");
70+
} else if (val < 0x010000) {
7471
buf.Append((char)val);
7572
} else {
7673
buf.Append(char.ConvertFromUtf32(val));
7774
}
78-
i += len;
7975
} else {
80-
throw PythonExceptions.CreateThrowable(
81-
PythonExceptions.UnicodeDecodeError,
82-
isRaw ? "rawunicodeescape" : "unicodeescape",
83-
data is byte[] byteData ? new Bytes(byteData) : Bytes.Empty,
84-
i - start - 2,
85-
i - start + consumed,
86-
ch == 'u' ? @"truncated \uXXXX escape" : @"truncated \UXXXXXXXX escape");
76+
handleError(i - start - 2, i - start + consumed, ch == 'u' ? @"truncated \uXXXX escape" : @"truncated \UXXXXXXXX escape");
8777
}
78+
i += consumed;
8879
} else {
8980
if (isRaw) {
9081
buf.Append('\\');
@@ -112,10 +103,12 @@ private static string DoParseString<T>(T[] data, int start, int length, bool isR
112103
continue;
113104
case 'N': {
114105
IronPython.Modules.unicodedata.PerformModuleReload(null, null);
106+
StringBuilder namebuf = new StringBuilder();
107+
bool namestarted = false;
108+
bool namecomplete = false;
115109
if (i < l && data[i].ToChar(null) == '{') {
110+
namestarted = true;
116111
i++;
117-
StringBuilder namebuf = new StringBuilder();
118-
bool namecomplete = false;
119112
while (i < l) {
120113
char namech = data[i++].ToChar(null);
121114
if (namech != '}') {
@@ -125,52 +118,33 @@ private static string DoParseString<T>(T[] data, int start, int length, bool isR
125118
break;
126119
}
127120
}
128-
129-
if (!namecomplete || namebuf.Length == 0)
130-
throw PythonExceptions.CreateThrowable(
131-
PythonExceptions.UnicodeDecodeError,
132-
"unicodeescape",
133-
data is byte[] byteData ? new Bytes(byteData) : Bytes.Empty,
134-
i - start - 3 - namebuf.Length - (namecomplete ? 1 : 0), // 3 for \N{ and 1 for }
135-
i - start - (namecomplete ? 1 : 0), // 1 for }
136-
@"malformed \N character escape");
137-
121+
}
122+
if (!namecomplete || namebuf.Length == 0) {
123+
handleError(i - start - 2 - (namestarted ? 1 : 0) - namebuf.Length - (namecomplete ? 1 : 0), // 2 for \N and 1 for { and 1 for }
124+
i - start - (namecomplete ? 1 : 0), // 1 for }
125+
@"malformed \N character escape");
126+
if (namecomplete) {
127+
buf.Append('}');
128+
}
129+
} else {
138130
try {
139131
string uval = IronPython.Modules.unicodedata.lookup(namebuf.ToString());
140132
buf.Append(uval);
141133
} catch (KeyNotFoundException) {
142-
throw PythonExceptions.CreateThrowable(
143-
PythonExceptions.UnicodeDecodeError,
144-
"unicodeescape",
145-
data is byte[] byteData ? new Bytes(byteData) : Bytes.Empty,
146-
i - start - 4 - namebuf.Length, // 4 for \N{}
147-
i - start,
148-
"unknown Unicode character name");
134+
handleError(i - start - 4 - namebuf.Length, // 4 for \N{}
135+
i - start,
136+
"unknown Unicode character name");
149137
}
150-
151-
} else {
152-
throw PythonExceptions.CreateThrowable(
153-
PythonExceptions.UnicodeDecodeError,
154-
"unicodeescape",
155-
data is byte[] byteData ? new Bytes(byteData) : Bytes.Empty,
156-
i - start - 2, // 2 for \N
157-
i - start,
158-
@"malformed \N character escape");
159138
}
160139
}
161140
continue;
162141
case 'x': //hex
163142
if (!TryParseInt(data, i, 2, 16, out val, out int consumed)) {
164-
throw PythonExceptions.CreateThrowable(
165-
PythonExceptions.UnicodeDecodeError,
166-
"unicodeescape",
167-
data is byte[] byteData ? new Bytes(byteData) : Bytes.Empty,
168-
i - start - 2,
169-
i - start + consumed,
170-
@"truncated \xXX escape");
143+
handleError(i - start - 2, i - start + consumed, @"truncated \xXX escape");
144+
} else {
145+
buf.Append((char)val);
171146
}
172-
buf.Append((char)val);
173-
i += 2;
147+
i += consumed;
174148
continue;
175149
case '0':
176150
case '1':
@@ -200,7 +174,7 @@ private static string DoParseString<T>(T[] data, int start, int length, bool isR
200174
}
201175
}
202176
} else if (ch == '\r' && normalizeLineEndings) {
203-
StringBuilderInit(ref buf, data, start, i - start - 1, length);
177+
StringBuilderInit(ref buf, data, start, i - 1, length);
204178

205179
// normalize line endings
206180
if (i < l && data[i].ToChar(null) == '\n') {
@@ -212,19 +186,35 @@ private static string DoParseString<T>(T[] data, int start, int length, bool isR
212186
}
213187
}
214188
return buf?.ToString();
189+
190+
void handleError(int start, int end, string reason) {
191+
if (bytesData == null) {
192+
var ba = data as byte[];
193+
if (ba == null) throw new NotImplementedException("Error handler for non byte[] data not supported");
194+
bytesData = new Bytes(ba);
195+
}
196+
197+
if (errorHandler == null) {
198+
throw PythonExceptions.CreateThrowable(PythonExceptions.UnicodeDecodeError, isRaw ? "rawunicodeescape" : "unicodeescape", bytesData, start, end, reason);
199+
}
200+
var substitute = errorHandler(data, start, end);
201+
if (substitute != null) {
202+
buf.Append(substitute.ToArray());
203+
}
204+
}
215205
}
216206

217-
private static void StringBuilderInit<T>(ref StringBuilder sb, T[] data, int start, int count, int capacity) {
207+
private static void StringBuilderInit<T>(ref StringBuilder sb, T[] data, int start, int end, int capacity) {
218208
if (sb != null) return;
219209

220210
sb = new StringBuilder(capacity);
221211
switch (data) {
222212
case char[] text:
223-
sb.Append(text, start, count);
213+
sb.Append(text, start, end - start);
224214
break;
225215

226216
case byte[] bytes:
227-
for (int i = start; i < start + count; i++) {
217+
for (int i = start; i < end; i++) {
228218
sb.Append((char)bytes[i]);
229219
}
230220
break;
@@ -236,7 +226,7 @@ private static void StringBuilderInit<T>(ref StringBuilder sb, T[] data, int sta
236226

237227
internal delegate IReadOnlyList<byte> ParseBytesErrorHandler<T>(IList<T> data, int start, int end);
238228

239-
internal static List<byte> ParseBytes<T>(IList<T> data, int start, int length, bool isRaw, bool normalizeLineEndings, ParseBytesErrorHandler<T> errorHandler = null) where T : IConvertible {
229+
internal static List<byte> ParseBytes<T>(IList<T> data, int start, int length, bool isRaw, bool normalizeLineEndings, ParseBytesErrorHandler<T> errorHandler = default) where T : IConvertible {
240230
Debug.Assert(data != null);
241231
Debug.Assert(start + length <= data.Count);
242232

@@ -275,7 +265,7 @@ internal static List<byte> ParseBytes<T>(IList<T> data, int start, int length, b
275265
if (!TryParseInt(data, i, 2, 16, out val, out int consumed)) {
276266
int pos = i - start - 2;
277267
if (errorHandler == null) {
278-
throw PythonOps.ValueError("invalid \\x escape at position {0}", pos);
268+
throw PythonOps.ValueError(@"invalid \x escape at position {0}", pos);
279269
}
280270
var substitute = errorHandler(data, pos, pos + consumed);
281271
if (substitute != null) {

0 commit comments

Comments
 (0)