@@ -1780,6 +1780,9 @@ internal static string RawDecode(CodeContext/*!*/ context, IBufferProtocol data,
17801780 private static DecoderFallback ReplacementFallback = new DecoderReplacementFallback ( "\ufffd " ) ;
17811781
17821782 internal static string DoDecode ( CodeContext context , IPythonBuffer buffer , string ? errors , string encoding , Encoding e , int numBytes = - 1 ) {
1783+ // Precondition: only bytes-like buffers accepted
1784+ Debug . Assert ( buffer . IsCContiguous ( ) ) ;
1785+
17831786 var span = buffer . AsReadOnlySpan ( ) ;
17841787 int start = GetStartingOffset ( span , e ) ;
17851788 int length = ( numBytes >= 0 ? numBytes : span . Length ) - start ;
@@ -1791,40 +1794,35 @@ Encoding setFallback(Encoding enc, DecoderFallback fb) {
17911794 enc . DecoderFallback = fb ;
17921795 return enc ;
17931796 }
1797+ PythonEncoding ? pe = null ; // to avoid downcasting later
17941798 switch ( errors ) {
17951799 case null :
17961800 case "backslashreplace" :
17971801 case "xmlcharrefreplace" :
17981802 case "strict" : e = setFallback ( e , new ExceptionFallback ( e is UTF8Encoding ) ) ; break ;
17991803 case "replace" : e = setFallback ( e , ReplacementFallback ) ; break ;
1800- case "ignore" : e = setFallback ( e , new PythonDecoderFallback ( encoding , buffer , start ) ) ; break ;
1801- case "surrogateescape" : e = new PythonSurrogateEscapeEncoding ( e , encoding ) ; break ;
1802- case "surrogatepass" : e = new PythonSurrogatePassEncoding ( e , encoding ) ; break ;
1803- default :
1804- e = setFallback ( e , new PythonDecoderFallback ( encoding ,
1805- buffer , start ,
1806- ( ) => LightExceptions . CheckAndThrow ( PythonOps . LookupEncodingError ( context , errors ) ) ) ) ;
1807- break ;
1804+ case "ignore" : e = setFallback ( e , new DecoderReplacementFallback ( string . Empty ) ) ; break ;
1805+ case "surrogateescape" : e = pe = new PythonSurrogateEscapeEncoding ( e , encoding ) ; break ;
1806+ case "surrogatepass" : e = pe = new PythonSurrogatePassEncoding ( e , encoding ) ; break ;
1807+ default : e = pe = new PythonErrorHandlerEncoding ( context , e , encoding , errors ) ; break ;
18081808 }
18091809
18101810 string decoded = string . Empty ;
18111811 try {
1812- unsafe {
1813- fixed ( byte * ptr = span . Slice ( start ) ) {
1814- if ( ptr != null ) {
1815- if ( e is UnicodeEscapeEncoding ue ) {
1816- // This overload is not virtual, but the base implementation is inefficient for this encoding
1817- decoded = ue . GetString ( ptr , length ) ;
1818- } else {
1819- decoded = e . GetString ( ptr , length ) ;
1820- }
1821- }
1812+ if ( pe != null ) {
1813+ decoded = pe . GetString ( buffer , start , length ) ;
1814+ } else {
1815+ if ( e is UnicodeEscapeEncoding ue ) {
1816+ // This overload is not virtual, but the base implementation is inefficient for this encoding
1817+ decoded = ue . GetString ( span . Slice ( start , length ) ) ;
1818+ } else {
1819+ decoded = e . GetString ( span . Slice ( start , length ) ) ;
18221820 }
18231821 }
18241822 } catch ( DecoderFallbackException ex ) {
18251823 // augmenting the caught exception instead of creating UnicodeDecodeError to preserve the stack trace
1826- ex . Data [ "encoding" ] = encoding ;
1827- ex . Data [ "object" ] = Bytes . Make ( span . Slice ( start , length ) . ToArray ( ) ) ;
1824+ if ( ! ex . Data . Contains ( "encoding" ) ) ex . Data [ "encoding" ] = encoding ;
1825+ if ( ! ex . Data . Contains ( "object" ) ) ex . Data [ "object" ] = Bytes . Make ( span . Slice ( start , length ) . ToArray ( ) ) ; ;
18281826 throw ;
18291827 }
18301828
@@ -2219,6 +2217,8 @@ private string EscapeEncode(string s, int index, int count) {
22192217 ReprEncode ( s , index , count , isUniEscape : true ) ;
22202218 }
22212219
/// <summary>
/// Human-readable name of this codec: "rawunicodeescape" when the raw
/// variant is selected (<c>_raw</c> is true), "unicodeescape" otherwise.
/// </summary>
public override string EncodingName {
    get {
        if (_raw) {
            return "rawunicodeescape";
        }
        return "unicodeescape";
    }
}
2221+
/// <summary>
/// Returns the length of the escaped form of the whole string
/// <paramref name="s"/>, as produced by <c>EscapeEncode</c>.
/// </summary>
/// <param name="s">The string to measure.</param>
/// <returns>The length of the escaped text.</returns>
public override int GetByteCount(string s) {
    string escaped = EscapeEncode(s, 0, s.Length);
    return escaped.Length;
}
22242224
@@ -2240,10 +2240,12 @@ public override int GetBytes(char[] chars, int charIndex, int charCount, byte[]
/// <summary>
/// Decodes <paramref name="count"/> bytes of <paramref name="bytes"/>, starting at
/// <paramref name="index"/>, by handing them to <c>LiteralParser.ParseString</c>
/// together with the raw-mode flag and the current error handler.
/// </summary>
/// <param name="bytes">Array holding the escaped data.</param>
/// <param name="index">Offset of the first byte to decode.</param>
/// <param name="count">Number of bytes to decode.</param>
/// <returns>The decoded string.</returns>
public override string GetString(byte[] bytes, int index, int count) {
    return LiteralParser.ParseString(bytes, index, count, _raw, GetErrorHandler());
}
22422242
2243- public new unsafe string GetString ( byte * bytes , int byteCount ) {
2244- var data = new ReadOnlySpan < byte > ( bytes , byteCount ) ;
2245- return LiteralParser . ParseString ( data , _raw , GetErrorHandler ( ) ) ;
2246- }
/// <summary>
/// Decodes the bytes in <paramref name="bytes"/> by handing them to
/// <c>LiteralParser.ParseString</c> together with the raw-mode flag and the
/// current error handler.
/// </summary>
/// <param name="bytes">The escaped data to decode.</param>
/// <returns>The decoded string.</returns>
// The 'new' modifier is applied only when compiling for NETCOREAPP; the two
// declarations are otherwise identical and share the body below.
#if NETCOREAPP
public new string GetString(ReadOnlySpan<byte> bytes)
#else
public string GetString(ReadOnlySpan<byte> bytes)
#endif
{
    return LiteralParser.ParseString(bytes, _raw, GetErrorHandler());
}
22472249
/// <summary>
/// Counts the characters that decoding <paramref name="count"/> bytes at
/// <paramref name="bytes"/> would produce. The data is actually parsed and
/// the length of the resulting string is returned.
/// </summary>
/// <param name="bytes">Pointer to the first byte to decode.</param>
/// <param name="count">Number of bytes to decode.</param>
/// <returns>The length of the decoded string.</returns>
public override unsafe int GetCharCount(byte* bytes, int count) {
    var data = new ReadOnlySpan<byte>(bytes, count);
    string parsed = LiteralParser.ParseString(data, _raw, GetErrorHandler());
    return parsed.Length;
}
@@ -2303,127 +2305,6 @@ public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[]
23032305
23042306 #region Unicode Encode/Decode Fallback Support
23052307
2306- /// When an error occurs while encoding or decoding strings, CPython supports several different
2307- /// behaviors; in addition, it supports user-extensible behaviors as well. For the default
2308- /// behavior we're OK - both of us support throwing and replacing. For custom behaviors
2309- /// we define a single fallback for decoding and encoding that calls the Python function to do
2310- /// the replacement.
2311- ///
2312- /// When we do the replacement we call the provided handler with a UnicodeEncodeError or UnicodeDecodeError
2313- /// object which contains:
2314- /// encoding (string, the encoding the user requested)
2315- /// object (the original string or bytes being encoded/decoded)
2316- /// start (the start of the invalid sequence)
2317- /// end (the exclusive end of the invalid sequence)
2318- /// reason (the error message, e.g. 'unexpected code byte'; other messages may exist)
2319- ///
2320- /// The handler returns a tuple of (str, int) where str is the replacement string
2321- /// and int is an index where encoding/decoding should continue.
2322- /// TODO: moving the cursor is not supported yet; the returned int must be equal to end, otherwise NotImplementedException is thrown.
2323-
/// <summary>
/// Decoder fallback buffer that hands undecodable bytes to an optional
/// error-handler callable and then serves the handler's replacement string
/// one character at a time. When no callable was supplied, <see cref="Fallback"/>
/// reports that no replacement is available.
/// </summary>
private class PythonDecoderFallbackBuffer : DecoderFallbackBuffer {
    private readonly object? _function;
    private readonly string _encoding;
    private readonly IPythonBuffer _data;
    private readonly int _offset;
    private Bytes? _byteData;   // snapshot of the input, created on first use
    private string? _buffer;    // replacement text currently being served
    private int _bufferIndex;   // position of the next char within _buffer

    public PythonDecoderFallbackBuffer(string encoding, IPythonBuffer data, int offset, object? callable) {
        _encoding = encoding;
        _data = data;
        _offset = offset;
        _function = callable;
    }

    /// <summary>Number of replacement characters not yet returned.</summary>
    public override int Remaining
        => _buffer == null ? 0 : _buffer.Length - _bufferIndex;

    /// <summary>
    /// Returns the next replacement character, or <c>Char.MinValue</c> once
    /// the replacement is exhausted (or there is none).
    /// </summary>
    public override char GetNextChar() {
        if (_buffer != null && _bufferIndex < _buffer.Length) {
            return _buffer[_bufferIndex++];
        }
        return Char.MinValue;
    }

    /// <summary>Steps back one character; fails when already at the start.</summary>
    public override bool MovePrevious() {
        if (_bufferIndex <= 0) {
            return false;
        }
        _bufferIndex--;
        return true;
    }

    /// <summary>Discards the current replacement and resets the base state.</summary>
    public override void Reset() {
        _buffer = null;
        _bufferIndex = 0;
        base.Reset();
    }

    /// <summary>
    /// Calls the error handler for the undecodable sequence
    /// <paramref name="bytesUnknown"/> located at <paramref name="index"/>.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the handler supplied a replacement; <c>false</c> when
    /// no handler is set.
    /// </returns>
    public override bool Fallback(byte[] bytesUnknown, int index) {
        if (_function == null) {
            return false;
        }

        // Build the exception object handed to the user function; the bytes
        // object is created once and reused across fallbacks.
        _byteData ??= Bytes.Make(_data.AsReadOnlySpan().Slice(_offset).ToArray());
        int end = index + bytesUnknown.Length;
        var exObj = PythonExceptions.CreatePythonThrowable(PythonExceptions.UnicodeDecodeError, _encoding, _byteData, index, end, "unexpected code byte");

        // Invoke the user function and validate what it returned.
        object? res = PythonCalls.Call(_function, exObj);
        _buffer = CheckReplacementTuple(res, "decoding", end);
        _bufferIndex = 0;
        return true;
    }
}
2388-
/// <summary>
/// Decoder fallback that creates <c>PythonDecoderFallbackBuffer</c> instances.
/// The error-handler callable is resolved through the optional
/// <c>lookup</c> delegate the first time a buffer is requested.
/// </summary>
private class PythonDecoderFallback : DecoderFallback {
    private readonly string encoding;
    private readonly IPythonBuffer data;
    private readonly int offset;
    private readonly Func<object>? lookup;
    private object? function;   // stays null until lookup yields a value (or forever when lookup is null)

    public PythonDecoderFallback(string encoding, IPythonBuffer data, int offset, Func<object>? lookup = null) {
        this.encoding = encoding;
        this.data = data;
        this.offset = offset;
        this.lookup = lookup;
    }

    /// <summary>
    /// Creates a new fallback buffer, resolving the handler first if it has
    /// not been resolved yet.
    /// </summary>
    public override DecoderFallbackBuffer CreateFallbackBuffer() {
        function ??= lookup?.Invoke();
        return new PythonDecoderFallbackBuffer(encoding, data, offset, function);
    }

    /// <summary>Not supported; always throws <see cref="NotImplementedException"/>.</summary>
    public override int MaxCharCount => throw new NotImplementedException();
}
2414-
/// <summary>
/// Validates the value returned by an error handler and extracts the
/// replacement string from it.
/// </summary>
/// <param name="res">The object returned by the handler.</param>
/// <param name="encodeOrDecode">Operation name used in error messages.</param>
/// <param name="cursorPos">The only continuation index currently accepted.</param>
/// <returns>The replacement string taken from the first tuple item.</returns>
/// <remarks>
/// Raises the error created by <c>PythonOps.TypeError</c> when <paramref name="res"/>
/// is not a 2-tuple convertible to (str, int), and throws
/// <see cref="NotImplementedException"/> when the returned index differs
/// from <paramref name="cursorPos"/>.
/// </remarks>
private static string CheckReplacementTuple(object? res, string encodeOrDecode, int cursorPos) {
    // Reject anything that is not a well-formed (str, int) pair.
    if (!(res is PythonTuple pair && pair.__len__() == 2
          && Converter.TryConvertToString(pair[0], out string? replacement)
          && Converter.TryConvertToInt32(pair[1], out int newPos))) {
        throw PythonOps.TypeError("{1} error handler must return tuple containing (str, int), got {0}", PythonOps.GetPythonTypeName(res), encodeOrDecode);
    }

    if (newPos != cursorPos) {
        throw new NotImplementedException($"Moving {encodeOrDecode} cursor not implemented yet");
    }

    return replacement;
}
2426-
24272308 private class BackslashEncoderReplaceFallback : EncoderFallback {
24282309 private class BackslashReplaceFallbackBuffer : EncoderFallbackBuffer {
24292310 private List < char > _buffer = new List < char > ( ) ;
@@ -2931,6 +2812,6 @@ internal static void IdentifyUtfEncoding(string encodingName, out int charWidth,
29312812 }
29322813 }
29332814
2934- #endregion
2815+ #endregion
29352816 }
29362817}
0 commit comments