Skip to content

Commit 69558c7

Browse files
committed
Fix encoding fallback error handling and error message formatting
- Raise ArgumentError("too big fallback string") when a fallback function returns a value that cannot be encoded to the destination encoding. Previously, the invalid character was silently dropped.
- Format UTF-8 codepoints as U+XXXX in UndefinedConversionError messages instead of showing the raw byte dump (e.g., "U+FFFD" instead of "\xEF\xBF\xBD"), matching MRI behavior.

These changes fix all 12 failing specs in encode_spec.rb related to fallback option error handling.
1 parent a2b7680 commit 69558c7

2 files changed

Lines changed: 19 additions & 20 deletions

File tree

core/src/main/java/org/jruby/util/io/EncodingUtils.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1456,9 +1456,11 @@ public boolean call(ThreadContext context, IRubyObject fallback, EConv ec) {
14561456
rep = rep.convertToString();
14571457
Encoding repEnc = ((RubyString) rep).getEncoding();
14581458
ByteList repByteList = ((RubyString) rep).getByteList();
1459-
ec.insertOutput(repByteList.getUnsafeBytes(), repByteList.begin(), repByteList.getRealSize(), repEnc.getName());
1459+
int ret = ec.insertOutput(repByteList.getUnsafeBytes(), repByteList.begin(), repByteList.getRealSize(), repEnc.getName());
14601460

1461-
// TODO: check for too-large replacement
1461+
if (ret == -1) {
1462+
throw argumentError(context, "too big fallback string");
1463+
}
14621464
return true;
14631465
}
14641466
return false;
@@ -1691,18 +1693,27 @@ else if (result == EConvResult.UndefinedConversion) {
16911693
int errBytesP = ec.lastError.getErrorBytesP();
16921694
int errorLen = ec.lastError.getErrorBytesLength();
16931695
final byte[] errSource = ec.lastError.getSource();
1694-
if (Arrays.equals(errSource, "UTF-8".getBytes())) {
1695-
// prepare dumped form
1696-
}
16971696

16981697
RubyString bytes = newString(context, new ByteList(errBytes, errBytesP, errorLen - errBytesP));
1699-
RubyString dumped = (RubyString) bytes.dump(context);
1698+
String charRepresentation;
1699+
1700+
// For UTF-8 source, format as U+XXXX codepoint instead of byte dump
1701+
if (Arrays.equals(errSource, "UTF-8".getBytes())) {
1702+
int codepoint = StringSupport.preciseCodePoint(UTF8Encoding.INSTANCE, errBytes, errBytesP, errBytesP + errorLen);
1703+
if (codepoint >= 0) {
1704+
charRepresentation = String.format("U+%04X", codepoint);
1705+
} else {
1706+
charRepresentation = ((RubyString) bytes.dump(context)).toString();
1707+
}
1708+
} else {
1709+
charRepresentation = ((RubyString) bytes.dump(context)).toString();
1710+
}
17001711

17011712
mesg = new StringBuilder();
17021713
if (Arrays.equals(errSource, ec.source) && Arrays.equals(ec.lastError.getDestination(), ec.destination)) {
1703-
mesg.append(dumped).append(" from ").append( new String(errSource) ).append(" to ").append( new String(ec.lastError.getDestination()) );
1714+
mesg.append(charRepresentation).append(" from ").append( new String(errSource) ).append(" to ").append( new String(ec.lastError.getDestination()) );
17041715
} else {
1705-
mesg.append(dumped).append(" to ").append( new String(ec.lastError.getDestination()) ).append(" in conversion from ").append( new String(ec.source) );
1716+
mesg.append(charRepresentation).append(" to ").append( new String(ec.lastError.getDestination()) ).append(" in conversion from ").append( new String(ec.source) );
17061717
for (int i = 0; i < ec.numTranscoders; i++) {
17071718
mesg.append(" to ").append( new String(ec.elements[i].transcoding.transcoder.getDestination()) );
17081719
}

spec/tags/ruby/core/string/encode_tags.txt

Lines changed: 0 additions & 12 deletions
This file was deleted.

0 commit comments

Comments
 (0)