From 350519165ff083ce59181972ee8cf2e243e4ff9d Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Sat, 2 May 2026 15:57:46 +0200 Subject: [PATCH 1/5] quickjs-libc: add TextEncoder and TextDecoder Implements the WHATWG Encoding API's TextEncoder and TextDecoder classes (UTF-8 only, the only encoding the spec actually requires) and installs them on the global object from js_std_add_helpers, alongside `console`, `print`, and `scriptArgs`. TextEncoder: * encode(string?) -> Uint8Array * encodeInto(string, dst) -> { read, written } * encoding -> "utf-8" TextDecoder: * new TextDecoder(label?, { fatal?, ignoreBOM? }) * decode(input?, { stream? }) -> string * encoding / fatal / ignoreBOM accessors decode() handles: * any TypedArray view or ArrayBuffer (BufferSource) as input, * UTF-8 BOM stripping (suppressed by ignoreBOM), * stream mode by saving up to 3 trailing bytes of an incomplete sequence and prepending them on the next call, * fatal mode by throwing TypeError on any encoding error (including a trailing partial sequence in non-stream mode), * non-fatal mode by emitting U+FFFD for each invalid byte, and a single U+FFFD for a truncated trailing sequence in non-stream mode. The label parser accepts the WHATWG list of UTF-8 aliases (case-insensitive, ASCII-whitespace trimmed); other encodings throw RangeError, matching the spec. UTF-8 decoding reuses the existing utf8_decode / utf8_decode_len helpers in cutils.h; the only new UTF-8 logic is a small lead-byte length helper (js_utf8_seq_len) used to detect incomplete trailing sequences. 
--- quickjs-libc.c | 445 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 445 insertions(+) diff --git a/quickjs-libc.c b/quickjs-libc.c index 1fe3cf5c1..ded1d31b5 100644 --- a/quickjs-libc.c +++ b/quickjs-libc.c @@ -197,6 +197,8 @@ typedef struct JSThreadState { #endif // USE_WORKER JSClassID std_file_class_id; JSClassID worker_class_id; + JSClassID text_encoder_class_id; + JSClassID text_decoder_class_id; } JSThreadState; static uint64_t os_pending_signals; @@ -4599,6 +4601,447 @@ static JSValue js_print(JSContext *ctx, JSValueConst this_val, return JS_UNDEFINED; } +/**********************************************************/ +/* WHATWG Encoding: TextEncoder / TextDecoder (UTF-8 only) */ + +typedef struct { + bool fatal; + bool ignore_bom; + /* Once we've decoded any input (or skipped a BOM), we stop treating + a leading U+FEFF as a BOM. Reset on non-stream decode(). */ + bool bom_seen; + /* Up to 3 trailing bytes of an incomplete UTF-8 sequence saved + across stream decode() calls. */ + uint8_t pending[4]; + int pending_len; +} JSTextDecoder; + +static void js_text_decoder_finalizer(JSRuntime *rt, JSValue val) +{ + JSThreadState *ts = js_get_thread_state(rt); + JSTextDecoder *td = JS_GetOpaque(val, ts->text_decoder_class_id); + js_free_rt(rt, td); +} + +static JSClassDef js_text_encoder_class = { + "TextEncoder", +}; + +static JSClassDef js_text_decoder_class = { + "TextDecoder", + .finalizer = js_text_decoder_finalizer, +}; + +/* Lead-byte length of a UTF-8 sequence, or 0 for invalid/continuation. 
*/ +static int js_utf8_seq_len(uint8_t b) +{ + if (b < 0x80) return 1; + if (b < 0xC2) return 0; + if (b < 0xE0) return 2; + if (b < 0xF0) return 3; + if (b < 0xF5) return 4; + return 0; +} + +/* TextEncoder ------------------------------------------------------------ */ + +static JSValue js_text_encoder_constructor(JSContext *ctx, + JSValueConst new_target, + int argc, JSValueConst *argv) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + JSValue proto, obj; + + proto = JS_GetPropertyStr(ctx, new_target, "prototype"); + if (JS_IsException(proto)) + return proto; + obj = JS_NewObjectProtoClass(ctx, proto, ts->text_encoder_class_id); + JS_FreeValue(ctx, proto); + if (JS_IsException(obj)) + return obj; + /* Stateless; opaque is just a brand. */ + JS_SetOpaque(obj, (void *)1); + return obj; +} + +static JSValue js_text_encoder_encode(JSContext *ctx, JSValueConst this_val, + int argc, JSValueConst *argv) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + const char *str; + size_t len; + JSValue ret; + + if (!JS_GetOpaque(this_val, ts->text_encoder_class_id)) + return JS_ThrowTypeError(ctx, "'this' is not a TextEncoder"); + if (argc < 1 || JS_IsUndefined(argv[0])) + return JS_NewUint8ArrayCopy(ctx, NULL, 0); + str = JS_ToCStringLen(ctx, &len, argv[0]); + if (!str) + return JS_EXCEPTION; + ret = JS_NewUint8ArrayCopy(ctx, (const uint8_t *)str, len); + JS_FreeCString(ctx, str); + return ret; +} + +static JSValue js_text_encoder_encode_into(JSContext *ctx, JSValueConst this_val, + int argc, JSValueConst *argv) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + const char *src; + size_t src_len, dst_len; + uint8_t *dst; + int read = 0, written = 0; + const uint8_t *p, *end, *next; + uint32_t cp; + size_t enc_len; + JSValue ret; + + if (!JS_GetOpaque(this_val, ts->text_encoder_class_id)) + return JS_ThrowTypeError(ctx, "'this' is not a TextEncoder"); + if (argc 
< 2) + return JS_ThrowTypeError(ctx, "TextEncoder.encodeInto requires two arguments"); + if (JS_GetTypedArrayType(argv[1]) != JS_TYPED_ARRAY_UINT8) + return JS_ThrowTypeError(ctx, + "TextEncoder.encodeInto: destination must be a Uint8Array"); + dst = JS_GetUint8Array(ctx, &dst_len, argv[1]); + if (!dst) + return JS_EXCEPTION; + src = JS_ToCStringLen(ctx, &src_len, argv[0]); + if (!src) + return JS_EXCEPTION; + + p = (const uint8_t *)src; + end = p + src_len; + while (p < end) { + cp = utf8_decode(p, &next); + enc_len = utf8_encode_len(cp); + if ((size_t)written + enc_len > dst_len) + break; + utf8_encode(dst + written, cp); + written += (int)enc_len; + /* Spec: read counts UTF-16 code units consumed from the input. */ + read += (cp > 0xFFFF) ? 2 : 1; + p = next; + } + JS_FreeCString(ctx, src); + + ret = JS_NewObject(ctx); + if (JS_IsException(ret)) + return ret; + JS_DefinePropertyValueStr(ctx, ret, "read", + JS_NewInt32(ctx, read), JS_PROP_C_W_E); + JS_DefinePropertyValueStr(ctx, ret, "written", + JS_NewInt32(ctx, written), JS_PROP_C_W_E); + return ret; +} + +static JSValue js_text_encoder_get_encoding(JSContext *ctx, JSValueConst this_val) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + if (!JS_GetOpaque(this_val, ts->text_encoder_class_id)) + return JS_ThrowTypeError(ctx, "'this' is not a TextEncoder"); + return JS_NewString(ctx, "utf-8"); +} + +static const JSCFunctionListEntry js_text_encoder_proto_funcs[] = { + JS_PROP_STRING_DEF("[Symbol.toStringTag]", "TextEncoder", JS_PROP_CONFIGURABLE), + JS_CFUNC_DEF("encode", 1, js_text_encoder_encode), + JS_CFUNC_DEF("encodeInto", 2, js_text_encoder_encode_into), + JS_CGETSET_DEF("encoding", js_text_encoder_get_encoding, NULL), +}; + +/* TextDecoder ------------------------------------------------------------ */ + +/* Match a label against the WHATWG list of UTF-8 aliases (case-insensitive, + ASCII-whitespace trimmed). Returns 0 on match, -1 otherwise. 
*/ +static int js_text_decoder_label_is_utf8(const char *label, size_t len) +{ + static const char * const aliases[] = { + "unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", + "utf-8", "utf8", "x-unicode20utf8", + }; + size_t i, j; + while (len > 0 && (*label == ' ' || *label == '\t' || *label == '\n' + || *label == '\r' || *label == '\f')) { + label++; len--; + } + while (len > 0 && (label[len-1] == ' ' || label[len-1] == '\t' + || label[len-1] == '\n' || label[len-1] == '\r' + || label[len-1] == '\f')) { + len--; + } + for (i = 0; i < countof(aliases); i++) { + size_t alen = strlen(aliases[i]); + if (alen != len) continue; + for (j = 0; j < len; j++) { + int c = (unsigned char)label[j]; + if (c >= 'A' && c <= 'Z') c += 32; + if (c != aliases[i][j]) break; + } + if (j == len) return 0; + } + return -1; +} + +static JSValue js_text_decoder_constructor(JSContext *ctx, + JSValueConst new_target, + int argc, JSValueConst *argv) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + JSValue proto, obj; + JSTextDecoder *td; + bool fatal = false, ignore_bom = false; + + if (argc >= 1 && !JS_IsUndefined(argv[0])) { + size_t llen; + const char *label = JS_ToCStringLen(ctx, &llen, argv[0]); + if (!label) + return JS_EXCEPTION; + if (js_text_decoder_label_is_utf8(label, llen) < 0) { + JSValue err = JS_ThrowRangeError(ctx, + "The \"%s\" encoding is not supported", label); + JS_FreeCString(ctx, label); + return err; + } + JS_FreeCString(ctx, label); + } + if (argc >= 2 && JS_IsObject(argv[1])) { + JSValue v = JS_GetPropertyStr(ctx, argv[1], "fatal"); + if (JS_IsException(v)) return v; + fatal = JS_ToBool(ctx, v); + JS_FreeValue(ctx, v); + v = JS_GetPropertyStr(ctx, argv[1], "ignoreBOM"); + if (JS_IsException(v)) return v; + ignore_bom = JS_ToBool(ctx, v); + JS_FreeValue(ctx, v); + } + + proto = JS_GetPropertyStr(ctx, new_target, "prototype"); + if (JS_IsException(proto)) + return proto; + obj = JS_NewObjectProtoClass(ctx, proto, 
ts->text_decoder_class_id); + JS_FreeValue(ctx, proto); + if (JS_IsException(obj)) + return obj; + td = js_mallocz(ctx, sizeof(*td)); + if (!td) { + JS_FreeValue(ctx, obj); + return JS_EXCEPTION; + } + td->fatal = fatal; + td->ignore_bom = ignore_bom; + JS_SetOpaque(obj, td); + return obj; +} + +/* Get the byte view of a BufferSource (ArrayBuffer or any TypedArray view). + On success returns 0 with bytes/len populated; on failure returns -1 + with a TypeError pending. JS_UNDEFINED yields the empty input. */ +static int js_text_decoder_get_bytes(JSContext *ctx, JSValueConst v, + const uint8_t **bytes, size_t *len) +{ + if (JS_IsUndefined(v)) { + *bytes = NULL; *len = 0; + return 0; + } + if (JS_IsArrayBuffer(v)) { + size_t l; + uint8_t *p = JS_GetArrayBuffer(ctx, &l, v); + if (!p) return -1; + *bytes = p; *len = l; + return 0; + } + if (JS_GetTypedArrayType(v) >= 0) { + size_t off, blen, bpe, ablen; + JSValue ab = JS_GetTypedArrayBuffer(ctx, v, &off, &blen, &bpe); + uint8_t *p; + if (JS_IsException(ab)) return -1; + p = JS_GetArrayBuffer(ctx, &ablen, ab); + JS_FreeValue(ctx, ab); + if (!p) return -1; + *bytes = p + off; *len = blen; + return 0; + } + JS_ThrowTypeError(ctx, + "TextDecoder.decode: input must be an ArrayBuffer or TypedArray"); + return -1; +} + +static JSValue js_text_decoder_decode(JSContext *ctx, JSValueConst this_val, + int argc, JSValueConst *argv) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + JSTextDecoder *td; + const uint8_t *src; + size_t src_len; + bool stream = false; + uint8_t *combined = NULL; + uint8_t *out = NULL; + size_t out_len = 0, out_cap; + const uint8_t *p, *p_end, *next; + uint32_t cp; + JSValue ret; + JSValueConst input = argc > 0 ? 
argv[0] : JS_UNDEFINED; + + td = JS_GetOpaque(this_val, ts->text_decoder_class_id); + if (!td) + return JS_ThrowTypeError(ctx, "'this' is not a TextDecoder"); + if (argc >= 2 && JS_IsObject(argv[1])) { + JSValue v = JS_GetPropertyStr(ctx, argv[1], "stream"); + if (JS_IsException(v)) return v; + stream = JS_ToBool(ctx, v); + JS_FreeValue(ctx, v); + } + if (js_text_decoder_get_bytes(ctx, input, &src, &src_len) < 0) + return JS_EXCEPTION; + + if (td->pending_len > 0) { + size_t total = (size_t)td->pending_len + src_len; + combined = js_malloc(ctx, total ? total : 1); + if (!combined) return JS_EXCEPTION; + memcpy(combined, td->pending, td->pending_len); + if (src_len > 0) memcpy(combined + td->pending_len, src, src_len); + src = combined; + src_len = total; + td->pending_len = 0; + } + + /* Worst case output: each byte expands to 3-byte U+FFFD replacement. */ + out_cap = src_len * 3 + 4; + out = js_malloc(ctx, out_cap); + if (!out) { + if (combined) js_free(ctx, combined); + return JS_EXCEPTION; + } + + p = src; + p_end = src + src_len; + while (p < p_end) { + int seq_len = js_utf8_seq_len(*p); + if (seq_len == 0) { + if (td->fatal) goto invalid; + out[out_len++] = 0xEF; out[out_len++] = 0xBF; out[out_len++] = 0xBD; + p++; + continue; + } + if (p + seq_len > p_end) { + /* Incomplete trailing sequence. 
*/ + if (stream) { + int rem = (int)(p_end - p); + memcpy(td->pending, p, rem); + td->pending_len = rem; + p = p_end; + break; + } + if (td->fatal) goto invalid; + out[out_len++] = 0xEF; out[out_len++] = 0xBF; out[out_len++] = 0xBD; + p = p_end; + break; + } + cp = utf8_decode_len(p, p_end - p, &next); + if (cp == 0xFFFD && next == p + 1 && *p >= 0x80) { + if (td->fatal) goto invalid; + out[out_len++] = 0xEF; out[out_len++] = 0xBF; out[out_len++] = 0xBD; + p = next; + continue; + } + if (!td->bom_seen) { + td->bom_seen = true; + if (!td->ignore_bom && cp == 0xFEFF) { + p = next; + continue; + } + } + out_len += utf8_encode(out + out_len, cp); + p = next; + } + + if (!stream) { + td->pending_len = 0; + td->bom_seen = false; + } + ret = JS_NewStringLen(ctx, (const char *)out, out_len); + js_free(ctx, out); + if (combined) js_free(ctx, combined); + return ret; + +invalid: + js_free(ctx, out); + if (combined) js_free(ctx, combined); + return JS_ThrowTypeError(ctx, "The encoded data was not valid"); +} + +static JSValue js_text_decoder_get_encoding(JSContext *ctx, JSValueConst this_val) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + if (!JS_GetOpaque(this_val, ts->text_decoder_class_id)) + return JS_ThrowTypeError(ctx, "'this' is not a TextDecoder"); + return JS_NewString(ctx, "utf-8"); +} + +static JSValue js_text_decoder_get_fatal(JSContext *ctx, JSValueConst this_val) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + JSTextDecoder *td = JS_GetOpaque(this_val, ts->text_decoder_class_id); + if (!td) return JS_ThrowTypeError(ctx, "'this' is not a TextDecoder"); + return JS_NewBool(ctx, td->fatal); +} + +static JSValue js_text_decoder_get_ignore_bom(JSContext *ctx, JSValueConst this_val) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + JSTextDecoder *td = JS_GetOpaque(this_val, ts->text_decoder_class_id); + if (!td) return JS_ThrowTypeError(ctx, 
"'this' is not a TextDecoder"); + return JS_NewBool(ctx, td->ignore_bom); +} + +static const JSCFunctionListEntry js_text_decoder_proto_funcs[] = { + JS_PROP_STRING_DEF("[Symbol.toStringTag]", "TextDecoder", JS_PROP_CONFIGURABLE), + JS_CFUNC_DEF("decode", 1, js_text_decoder_decode), + JS_CGETSET_DEF("encoding", js_text_decoder_get_encoding, NULL), + JS_CGETSET_DEF("fatal", js_text_decoder_get_fatal, NULL), + JS_CGETSET_DEF("ignoreBOM", js_text_decoder_get_ignore_bom, NULL), +}; + +static void js_std_install_text_codecs(JSContext *ctx, JSValue global_obj) +{ + JSRuntime *rt = JS_GetRuntime(ctx); + JSThreadState *ts = js_get_thread_state(rt); + JSValue proto, ctor; + + JS_NewClassID(rt, &ts->text_encoder_class_id); + JS_NewClass(rt, ts->text_encoder_class_id, &js_text_encoder_class); + proto = JS_NewObject(ctx); + JS_SetPropertyFunctionList(ctx, proto, js_text_encoder_proto_funcs, + countof(js_text_encoder_proto_funcs)); + JS_SetClassProto(ctx, ts->text_encoder_class_id, proto); + ctor = JS_NewCFunction2(ctx, js_text_encoder_constructor, "TextEncoder", 0, + JS_CFUNC_constructor, 0); + JS_SetConstructor(ctx, ctor, proto); + JS_SetPropertyStr(ctx, global_obj, "TextEncoder", ctor); + + JS_NewClassID(rt, &ts->text_decoder_class_id); + JS_NewClass(rt, ts->text_decoder_class_id, &js_text_decoder_class); + proto = JS_NewObject(ctx); + JS_SetPropertyFunctionList(ctx, proto, js_text_decoder_proto_funcs, + countof(js_text_decoder_proto_funcs)); + JS_SetClassProto(ctx, ts->text_decoder_class_id, proto); + ctor = JS_NewCFunction2(ctx, js_text_decoder_constructor, "TextDecoder", 2, + JS_CFUNC_constructor, 0); + JS_SetConstructor(ctx, ctor, proto); + JS_SetPropertyStr(ctx, global_obj, "TextDecoder", ctor); +} + void js_std_add_helpers(JSContext *ctx, int argc, char **argv) { JSValue global_obj, console, args; @@ -4624,6 +5067,8 @@ void js_std_add_helpers(JSContext *ctx, int argc, char **argv) JS_SetPropertyStr(ctx, global_obj, "print", JS_NewCFunction(ctx, js_print, "print", 1)); 
+ js_std_install_text_codecs(ctx, global_obj); + JS_FreeValue(ctx, global_obj); } From d1e7882b8074ba99360bb6afff7b945b97b0c2d7 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Thu, 7 May 2026 16:55:16 +0200 Subject: [PATCH 2/5] quickjs-libc: stringify TextEncoder.encodeInto source before validating destination Per the WHATWG Encoding spec, encodeInto's first argument is converted to a USVString before the second is checked for being a Uint8Array, so the source's toString side effects must be observable even when the destination is invalid. Co-Authored-By: Claude Opus 4.7 (1M context) --- quickjs-libc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/quickjs-libc.c b/quickjs-libc.c index ded1d31b5..9c1fb819c 100644 --- a/quickjs-libc.c +++ b/quickjs-libc.c @@ -4704,15 +4704,19 @@ static JSValue js_text_encoder_encode_into(JSContext *ctx, JSValueConst this_val return JS_ThrowTypeError(ctx, "'this' is not a TextEncoder"); if (argc < 2) return JS_ThrowTypeError(ctx, "TextEncoder.encodeInto requires two arguments"); - if (JS_GetTypedArrayType(argv[1]) != JS_TYPED_ARRAY_UINT8) + src = JS_ToCStringLen(ctx, &src_len, argv[0]); + if (!src) + return JS_EXCEPTION; + if (JS_GetTypedArrayType(argv[1]) != JS_TYPED_ARRAY_UINT8) { + JS_FreeCString(ctx, src); return JS_ThrowTypeError(ctx, "TextEncoder.encodeInto: destination must be a Uint8Array"); + } dst = JS_GetUint8Array(ctx, &dst_len, argv[1]); - if (!dst) - return JS_EXCEPTION; - src = JS_ToCStringLen(ctx, &src_len, argv[0]); - if (!src) + if (!dst) { + JS_FreeCString(ctx, src); return JS_EXCEPTION; + } p = (const uint8_t *)src; end = p + src_len; From b4b6c873bb4121eb049a5db1982decdb31c96887 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Thu, 7 May 2026 16:59:30 +0200 Subject: [PATCH 3/5] quickjs-libc: distinguish partial vs invalid trailing UTF-8 in TextDecoder The "incomplete trailing sequence" branch only checked the lead byte's declared length against remaining bytes, so a lead 
followed by an out-of-range continuation (e.g. E0 41, E0 80, F0 80, F4 90) silently dropped the offending byte and emitted a single U+FFFD. Per WHATWG, an out-of-range continuation must produce U+FFFD and be re-read as a fresh lead, yielding e.g. "U+FFFD U+0041" for E0 41 and two U+FFFD for E0 80. Stream mode had the same issue: it would buffer bytes already known to violate the continuation bounds. Add a small helper that returns the first-continuation-byte bounds for each lead (matching utf8_decode's acceptance set) and use it to walk the available bytes; emit eagerly on the first out-of-range byte, and only defer or flush when every available byte is a valid continuation. Co-Authored-By: Claude Opus 4.7 (1M context) --- quickjs-libc.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/quickjs-libc.c b/quickjs-libc.c index 9c1fb819c..94f2aabdc 100644 --- a/quickjs-libc.c +++ b/quickjs-libc.c @@ -4643,6 +4643,17 @@ static int js_utf8_seq_len(uint8_t b) return 0; } +/* Bounds for the first continuation byte after `lead`, matching the + acceptance set of utf8_decode() in cutils.h. Subsequent continuation + bytes are always [0x80, 0xBF]. */ +static void js_utf8_first_cont_bounds(uint8_t lead, uint8_t *lo, uint8_t *hi) +{ + if (lead == 0xE0) { *lo = 0xA0; *hi = 0xBF; } + else if (lead == 0xF0) { *lo = 0x90; *hi = 0xBF; } + else if (lead == 0xF4) { *lo = 0x80; *hi = 0x8F; } + else { *lo = 0x80; *hi = 0xBF; } +} + /* TextEncoder ------------------------------------------------------------ */ static JSValue js_text_encoder_constructor(JSContext *ctx, @@ -4936,11 +4947,34 @@ static JSValue js_text_decoder_decode(JSContext *ctx, JSValueConst this_val, continue; } if (p + seq_len > p_end) { - /* Incomplete trailing sequence. */ + /* Sequence is incomplete by length. 
Check the bytes we do have + against the per-lead continuation bounds: a byte that's out + of range is a known error and must be re-read as a fresh + lead, not buffered. */ + int avail = (int)(p_end - p); + int k = 1; + if (avail >= 2) { + uint8_t lo, hi; + js_utf8_first_cont_bounds(*p, &lo, &hi); + if (p[1] >= lo && p[1] <= hi) { + for (k = 2; k < avail; k++) { + if (p[k] < 0x80 || p[k] > 0xBF) break; + } + } + } + if (k < avail) { + /* p[k] violates the continuation rules: emit one error, + advance past the lead and any valid continuations, and + leave p[k] for the next iteration. */ + if (td->fatal) goto invalid; + out[out_len++] = 0xEF; out[out_len++] = 0xBF; out[out_len++] = 0xBD; + p += k; + continue; + } + /* Truly partial: defer in stream mode, otherwise flush as one error. */ if (stream) { - int rem = (int)(p_end - p); - memcpy(td->pending, p, rem); - td->pending_len = rem; + memcpy(td->pending, p, avail); + td->pending_len = avail; p = p_end; break; } From cd3f535c3ff591e702ad21252759ff30ef1c29f9 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Thu, 7 May 2026 17:03:33 +0200 Subject: [PATCH 4/5] quickjs-libc: replace lone surrogates with U+FFFD in TextEncoder WHATWG Encoding's encode/encodeInto operate on USVStrings: lone surrogates in the input are replaced with U+FFFD before UTF-8 encoding. JS_ToCStringLen, however, keeps lone surrogates and emits them as their 3-byte CESU-8-like encoding (ED A0..BF XX), which is invalid UTF-8 and not what the spec mandates. In encode(), scan the JS_ToCStringLen output for ED A0..BF XX (a triple that valid UTF-8 never produces) and rewrite each occurrence to EF BF BD; the replacement is the same length so the output size is unchanged. The common ASCII/BMP path stays a single allocation+copy. In encodeInto(), the loop already calls utf8_decode per code point; clamp surrogate code points (D800..DFFF) to U+FFFD before re-encoding. 
The read counter naturally still credits 1 UTF-16 code unit for a lone surrogate and 2 for a matched pair. Co-Authored-By: Claude Opus 4.7 (1M context) --- quickjs-libc.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/quickjs-libc.c b/quickjs-libc.c index 94f2aabdc..07a71ba04 100644 --- a/quickjs-libc.c +++ b/quickjs-libc.c @@ -4692,7 +4692,42 @@ static JSValue js_text_encoder_encode(JSContext *ctx, JSValueConst this_val, str = JS_ToCStringLen(ctx, &len, argv[0]); if (!str) return JS_EXCEPTION; - ret = JS_NewUint8ArrayCopy(ctx, (const uint8_t *)str, len); + /* JS_ToCStringLen keeps lone surrogates as their 3-byte CESU-8-like + encoding (ED A0..BF XX). USVString conversion in the WHATWG Encoding + spec replaces them with U+FFFD before UTF-8 encoding. Valid UTF-8 + never produces ED A0..BF, so any such triple comes from a lone + surrogate. The replacement is 3 bytes, so output length is unchanged. */ + { + const uint8_t *s = (const uint8_t *)str; + size_t i; + for (i = 0; i + 2 < len; i++) { + if (s[i] == 0xED && s[i+1] >= 0xA0 && s[i+1] <= 0xBF) + break; + } + if (i + 2 >= len) { + ret = JS_NewUint8ArrayCopy(ctx, s, len); + } else { + uint8_t *buf = js_malloc(ctx, len); + size_t j; + if (!buf) { + JS_FreeCString(ctx, str); + return JS_EXCEPTION; + } + memcpy(buf, s, i); + for (j = i; i < len; ) { + if (i + 2 < len && s[i] == 0xED + && s[i+1] >= 0xA0 && s[i+1] <= 0xBF + && s[i+2] >= 0x80 && s[i+2] <= 0xBF) { + buf[j++] = 0xEF; buf[j++] = 0xBF; buf[j++] = 0xBD; + i += 3; + } else { + buf[j++] = s[i++]; + } + } + ret = JS_NewUint8ArrayCopy(ctx, buf, j); + js_free(ctx, buf); + } + } JS_FreeCString(ctx, str); return ret; } @@ -4733,6 +4768,11 @@ static JSValue js_text_encoder_encode_into(JSContext *ctx, JSValueConst this_val end = p + src_len; while (p < end) { cp = utf8_decode(p, &next); + /* JS_ToCStringLen keeps lone surrogates as ED A0..BF XX, which + utf8_decode happily decodes back to a surrogate code 
point. The + USVString conversion in the spec replaces them with U+FFFD. */ + if (cp >= 0xD800 && cp <= 0xDFFF) + cp = 0xFFFD; enc_len = utf8_encode_len(cp); if ((size_t)written + enc_len > dst_len) break; From 58df657d8dd3381be957f0c0552dcbde8529be20 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Thu, 7 May 2026 17:21:19 +0200 Subject: [PATCH 5/5] quickjs-libc: add tests for TextEncoder/TextDecoder Cover the WHATWG Encoding behaviors the existing implementation aims at: encoder ToString coercion, lone-surrogate replacement, encodeInto's read/written semantics and partial-write rule, decoder label parsing, BOM handling (default/ignoreBOM/middle/split), per-lead continuation bounds and the "incomplete vs invalid trailing" distinction, fatal mode, and stream split/flush behavior. The classes are installed by js_std_add_helpers, which run-test262 does not call, so `make test` (run-test262 in local mode) didn't see them. Expose the install as a public js_std_add_text_codecs(ctx) and call it from JS_NewCustomContext's local-mode setup so the new test runs under the same harness as the rest of tests/. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- quickjs-libc.c | 10 +- quickjs-libc.h | 1 + run-test262.c | 1 + tests/test_text_codec.js | 304 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 313 insertions(+), 3 deletions(-) create mode 100644 tests/test_text_codec.js diff --git a/quickjs-libc.c b/quickjs-libc.c index 07a71ba04..2bb0b1734 100644 --- a/quickjs-libc.c +++ b/quickjs-libc.c @@ -5091,11 +5091,13 @@ static const JSCFunctionListEntry js_text_decoder_proto_funcs[] = { JS_CGETSET_DEF("ignoreBOM", js_text_decoder_get_ignore_bom, NULL), }; -static void js_std_install_text_codecs(JSContext *ctx, JSValue global_obj) +void js_std_add_text_codecs(JSContext *ctx) { JSRuntime *rt = JS_GetRuntime(ctx); JSThreadState *ts = js_get_thread_state(rt); - JSValue proto, ctor; + JSValue global_obj, proto, ctor; + + global_obj = JS_GetGlobalObject(ctx); JS_NewClassID(rt, &ts->text_encoder_class_id); JS_NewClass(rt, ts->text_encoder_class_id, &js_text_encoder_class); @@ -5118,6 +5120,8 @@ static void js_std_install_text_codecs(JSContext *ctx, JSValue global_obj) JS_CFUNC_constructor, 0); JS_SetConstructor(ctx, ctor, proto); JS_SetPropertyStr(ctx, global_obj, "TextDecoder", ctor); + + JS_FreeValue(ctx, global_obj); } void js_std_add_helpers(JSContext *ctx, int argc, char **argv) @@ -5145,7 +5149,7 @@ void js_std_add_helpers(JSContext *ctx, int argc, char **argv) JS_SetPropertyStr(ctx, global_obj, "print", JS_NewCFunction(ctx, js_print, "print", 1)); - js_std_install_text_codecs(ctx, global_obj); + js_std_add_text_codecs(ctx); JS_FreeValue(ctx, global_obj); } diff --git a/quickjs-libc.h b/quickjs-libc.h index fd91a2f68..847c6d58f 100644 --- a/quickjs-libc.h +++ b/quickjs-libc.h @@ -45,6 +45,7 @@ JS_LIBC_EXTERN JSModuleDef *js_init_module_os(JSContext *ctx, JS_LIBC_EXTERN JSModuleDef *js_init_module_bjson(JSContext *ctx, const char *module_name); JS_LIBC_EXTERN void js_std_add_helpers(JSContext *ctx, int argc, char **argv); +JS_LIBC_EXTERN void 
js_std_add_text_codecs(JSContext *ctx); JS_LIBC_EXTERN int js_std_loop(JSContext *ctx); JS_LIBC_EXTERN int js_std_loop_once(JSContext *ctx); JS_LIBC_EXTERN int js_std_poll_io(JSContext *ctx, int timeout_ms); diff --git a/run-test262.c b/run-test262.c index 4c996fb06..ba2f3ff1c 100644 --- a/run-test262.c +++ b/run-test262.c @@ -1729,6 +1729,7 @@ JSContext *JS_NewCustomContext(JSRuntime *rt) js_init_module_std(ctx, "qjs:std"); js_init_module_os(ctx, "qjs:os"); js_init_module_bjson(ctx, "qjs:bjson"); + js_std_add_text_codecs(ctx); obj = JS_GetGlobalObject(ctx); JS_SetPropertyFunctionList(ctx, obj, &qjs_object, 1); JS_FreeValue(ctx, obj); diff --git a/tests/test_text_codec.js b/tests/test_text_codec.js new file mode 100644 index 000000000..ab5710d8e --- /dev/null +++ b/tests/test_text_codec.js @@ -0,0 +1,304 @@ +import { assert, assertThrows, assertArrayEquals } from "./assert.js"; + +function bytes(arr) { return new Uint8Array(arr); } +function arr(u8) { return Array.from(u8); } + +function test_encoder_basic() { + const e = new TextEncoder(); + assert(e.encoding, "utf-8"); + assert(Object.prototype.toString.call(e), "[object TextEncoder]"); + + assertArrayEquals(arr(e.encode()), []); + assertArrayEquals(arr(e.encode(undefined)), []); + assertArrayEquals(arr(e.encode("")), []); + assertArrayEquals(arr(e.encode("hi")), [0x68, 0x69]); + // U+2603 SNOWMAN — 3-byte sequence. + assertArrayEquals(arr(e.encode("☃")), [0xE2, 0x98, 0x83]); + // U+10000 via surrogate pair — 4-byte sequence. + assertArrayEquals(arr(e.encode("𐀀")), [0xF0, 0x90, 0x80, 0x80]); + // ToString coercion. + assertArrayEquals(arr(e.encode(null)), [0x6E, 0x75, 0x6C, 0x6C]); // "null" + assertArrayEquals(arr(e.encode(42)), [0x34, 0x32]); // "42" +} + +function test_encoder_lone_surrogates() { + // USVString conversion: lone surrogates become U+FFFD before encoding. 
+    const e = new TextEncoder();
+    assertArrayEquals(arr(e.encode("\uD800")), [0xEF, 0xBF, 0xBD]);
+    assertArrayEquals(arr(e.encode("\uDFFF")), [0xEF, 0xBF, 0xBD]);
+    assertArrayEquals(arr(e.encode("\uDC00")), [0xEF, 0xBF, 0xBD]);
+    assertArrayEquals(arr(e.encode("a\uD800b")),
+                      [0x61, 0xEF, 0xBF, 0xBD, 0x62]);
+    // Two adjacent lone high surrogates: each replaced independently.
+    assertArrayEquals(arr(e.encode("\uD800\uD800")),
+                      [0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD]);
+    // Reverse-order surrogates (low then high): both lone.
+    assertArrayEquals(arr(e.encode("\uDC00\uD800")),
+                      [0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD]);
+    // Lone high followed by ASCII before a matched pair: only the lone one
+    // is replaced.
+    assertArrayEquals(arr(e.encode("\uD800a😀")),
+                      [0xEF, 0xBF, 0xBD, 0x61, 0xF0, 0x9F, 0x98, 0x80]);
+}
+
+function test_encode_into_basic() {
+    const e = new TextEncoder();
+
+    let dst = new Uint8Array(8);
+    let r = e.encodeInto("hi", dst);
+    assert(r.read, 2);
+    assert(r.written, 2);
+    assertArrayEquals(arr(dst.subarray(0, 2)), [0x68, 0x69]);
+
+    // Surrogate pair: read counts UTF-16 code units (2), written is 4 bytes.
+    dst = new Uint8Array(8);
+    r = e.encodeInto("😀", dst);
+    assert(r.read, 2);
+    assert(r.written, 4);
+    assertArrayEquals(arr(dst.subarray(0, 4)), [0xF0, 0x9F, 0x98, 0x80]);
+
+    // Lone surrogate becomes U+FFFD (3 bytes) but adds only 1 to `read`.
+    dst = new Uint8Array(8);
+    r = e.encodeInto("a\uD800", dst);
+    assert(r.read, 2);
+    assert(r.written, 4);
+    assertArrayEquals(arr(dst.subarray(0, 4)), [0x61, 0xEF, 0xBF, 0xBD]);
+
+    // Empty source: nothing read, nothing written, destination untouched.
+    dst = new Uint8Array(4); dst.fill(0xAA);
+    r = e.encodeInto("", dst);
+    assert(r.read, 0); assert(r.written, 0);
+    assertArrayEquals(arr(dst), [0xAA, 0xAA, 0xAA, 0xAA]);
+
+    // Empty destination.
+    r = e.encodeInto("abc", new Uint8Array(0));
+    assert(r.read, 0); assert(r.written, 0);
+}
+
+function test_encode_into_partial() {
+    const e = new TextEncoder();
+
+    // Destination too small for the next char's full encoding — must NOT
+    // write a partial sequence.
+    let dst = new Uint8Array(2); dst.fill(0xAA);
+    let r = e.encodeInto("☃hi", dst); // snowman is 3 bytes
+    assert(r.read, 0); assert(r.written, 0);
+    assertArrayEquals(arr(dst), [0xAA, 0xAA]);
+
+    // Same for U+FFFD replacement of a lone surrogate (3 bytes).
+    dst = new Uint8Array(2); dst.fill(0xAA);
+    r = e.encodeInto("\uD800X", dst);
+    assert(r.read, 0); assert(r.written, 0);
+    assertArrayEquals(arr(dst), [0xAA, 0xAA]);
+
+    // Some chars fit, then we stop short of an over-large one.
+    dst = new Uint8Array(4); dst.fill(0xAA);
+    r = e.encodeInto("ab☃c", dst);
+    assert(r.read, 2); assert(r.written, 2);
+    assertArrayEquals(arr(dst), [0x61, 0x62, 0xAA, 0xAA]);
+}
+
+function test_encode_into_argument_errors() {
+    const e = new TextEncoder();
+
+    assertThrows(TypeError, () => e.encodeInto());
+    assertThrows(TypeError, () => e.encodeInto("x"));
+    assertThrows(TypeError, () => e.encodeInto("x", "not a buffer"));
+    assertThrows(TypeError, () => e.encodeInto("x", new Int8Array(4)));
+    assertThrows(TypeError, () => e.encodeInto("x", new Uint16Array(4)));
+    assertThrows(TypeError, () => e.encodeInto("x", new Uint8ClampedArray(4)));
+    assertThrows(TypeError, () => e.encodeInto("x", new ArrayBuffer(4)));
+
+    // Source is stringified before destination is validated (spec order).
+    const calls = [];
+    const src = { toString() { calls.push("src"); return "x"; } };
+    assertThrows(TypeError, () => e.encodeInto(src, "not a buffer"));
+    assertArrayEquals(calls, ["src"]);
+}
+
+function test_encoder_brand() {
+    assertThrows(TypeError, () => TextEncoder.prototype.encode.call({}, "x"));
+    assertThrows(TypeError, () =>
+        TextEncoder.prototype.encodeInto.call({}, "x", new Uint8Array(4)));
+    // Calling the constructor without `new`.
+    assertThrows(TypeError, () => TextEncoder());
+}
+
+function test_decoder_basic() {
+    const d = new TextDecoder();
+    assert(d.encoding, "utf-8");
+    assert(d.fatal, false);
+    assert(d.ignoreBOM, false);
+    assert(Object.prototype.toString.call(d), "[object TextDecoder]");
+
+    assert(d.decode(), "");
+    assert(d.decode(undefined), "");
+    assert(d.decode(bytes([])), "");
+    assert(d.decode(bytes([0x68, 0x69])), "hi");
+    assert(d.decode(bytes([0xE2, 0x98, 0x83])), "☃");
+    assert(d.decode(bytes([0xF0, 0x9F, 0x98, 0x80])), "😀"); // U+1F600
+}
+
+function test_decoder_input_types() {
+    const d = new TextDecoder();
+    const data = [0x61, 0x62, 0x63];
+
+    assert(d.decode(new Uint8Array(data)), "abc");
+    assert(d.decode(new Uint8Array(data).buffer), "abc");
+    assert(d.decode(new Int8Array(new Uint8Array(data).buffer)), "abc");
+
+    // Subarray view at an offset must use that view's bytes only.
+    const big = new Uint8Array([0xFF, 0x61, 0x62, 0x63, 0xFF]);
+    assert(d.decode(big.subarray(1, 4)), "abc");
+
+    assertThrows(TypeError, () => d.decode("not a buffer"));
+    assertThrows(TypeError, () => d.decode({}));
+    assertThrows(TypeError, () => d.decode(null));
+    assertThrows(TypeError, () => d.decode(123));
+}
+
+function test_decoder_label() {
+    for (const label of [
+        "utf-8", "UTF-8", "utf8", "UTF8", "Utf-8",
+        " utf-8\t", "\nutf-8\r\f", "\fUTF-8 ",
+        "unicode-1-1-utf-8", "unicode11utf8",
+        "unicode20utf8", "x-unicode20utf8",
+    ]) {
+        assert(new TextDecoder(label).encoding, "utf-8");
+    }
+    for (const label of ["latin1", "iso-8859-1", "utf-16", "windows-1252",
+                         "utf-7", "ascii", ""]) {
+        assertThrows(RangeError, () => new TextDecoder(label));
+    }
+}
+
+function test_decoder_options() {
+    let d = new TextDecoder("utf-8", { fatal: true });
+    assert(d.fatal, true); assert(d.ignoreBOM, false);
+
+    d = new TextDecoder("utf-8", { ignoreBOM: true });
+    assert(d.fatal, false); assert(d.ignoreBOM, true);
+
+    d = new TextDecoder("utf-8", { fatal: true, ignoreBOM: true });
+    assert(d.fatal, true); assert(d.ignoreBOM, true);
+
+    // Truthy/falsy coercion.
+    d = new TextDecoder("utf-8", { fatal: 1, ignoreBOM: 0 });
+    assert(d.fatal, true); assert(d.ignoreBOM, false);
+
+    // Missing options argument: defaults.
+    d = new TextDecoder("utf-8");
+    assert(d.fatal, false); assert(d.ignoreBOM, false);
+}
+
+function test_decoder_bom() {
+    const bom = [0xEF, 0xBB, 0xBF];
+
+    // Default: BOM at start is stripped.
+    let d = new TextDecoder();
+    assert(d.decode(bytes([...bom, 0x68, 0x69])), "hi");
+    // BOM in the middle is kept as U+FEFF (escaped: it is invisible as a literal).
+    assert(d.decode(bytes([0x68, ...bom, 0x69])), "h\uFEFFi");
+    // ignoreBOM=true: leading BOM is kept as U+FEFF.
+    d = new TextDecoder("utf-8", { ignoreBOM: true });
+    assert(d.decode(bytes([...bom, 0x68])), "\uFEFFh");
+    // Decoder state is reset on non-stream call: a fresh BOM is honored.
+    d = new TextDecoder();
+    assert(d.decode(bytes([...bom, 0x61])), "a");
+    assert(d.decode(bytes([...bom, 0x62])), "b");
+    // BOM split across stream calls is still recognized.
+    d = new TextDecoder();
+    assert(d.decode(bytes([0xEF, 0xBB]), { stream: true }), "");
+    assert(d.decode(bytes([0xBF, 0x68])), "h");
+}
+
+function test_decoder_invalid_sequences() {
+    const d = new TextDecoder();
+
+    // Stray continuation byte.
+    assert(d.decode(bytes([0x80])), "�");
+
+    // Lead byte followed by an out-of-range continuation: emit U+FFFD AND
+    // re-process the offending byte.
+    assert(d.decode(bytes([0xE0, 0x41])), "�A");
+    assert(d.decode(bytes([0xE0, 0x80])), "��");
+    assert(d.decode(bytes([0xF0, 0x80])), "��");
+    assert(d.decode(bytes([0xF4, 0x90])), "��");
+    assert(d.decode(bytes([0xF0, 0x90, 0x7F])), "�\x7F");
+
+    // Truly partial sequences (valid prefix, no following byte): single U+FFFD.
+    assert(d.decode(bytes([0xE0])), "�");
+    assert(d.decode(bytes([0xE0, 0xA0])), "�");
+    assert(d.decode(bytes([0xF0, 0x90])), "�");
+    assert(d.decode(bytes([0xF0, 0x90, 0x80])), "�");
+
+    // Bytes that can never start a UTF-8 sequence.
+    assert(d.decode(bytes([0xC0])), "�");
+    assert(d.decode(bytes([0xC1])), "�");
+    assert(d.decode(bytes([0xF5])), "�");
+    assert(d.decode(bytes([0xFF])), "�");
+}
+
+function test_decoder_fatal() {
+    const d = new TextDecoder("utf-8", { fatal: true });
+    assert(d.decode(bytes([0x68, 0x69])), "hi");
+    assertThrows(TypeError, () => d.decode(bytes([0x80])));
+    assertThrows(TypeError, () => d.decode(bytes([0xE0, 0x41])));
+    assertThrows(TypeError, () => d.decode(bytes([0xE0])));
+    assertThrows(TypeError, () => d.decode(bytes([0xC0])));
+
+    // Stream mode with valid partial: pending, no error.
+    const d2 = new TextDecoder("utf-8", { fatal: true });
+    assert(d2.decode(bytes([0xE2, 0x98]), { stream: true }), "");
+    assert(d2.decode(bytes([0x83])), "☃");
+
+    // Stream + flush with partial pending → error on flush.
+    const d3 = new TextDecoder("utf-8", { fatal: true });
+    assert(d3.decode(bytes([0xE2, 0x98]), { stream: true }), "");
+    assertThrows(TypeError, () => d3.decode());
+}
+
+function test_decoder_stream() {
+    // Split a 4-byte sequence at every boundary and reassemble.
+    const seq = [0xF0, 0x9F, 0x98, 0x80]; // U+1F600
+    for (let split = 1; split < 4; split++) {
+        const d = new TextDecoder();
+        let out = d.decode(bytes(seq.slice(0, split)), { stream: true });
+        out += d.decode(bytes(seq.slice(split)));
+        assert(out, "😀");
+    }
+
+    // E0 alone deferred; second call's first byte (0x41) is an invalid
+    // continuation, so we emit U+FFFD eagerly and re-read 0x41 as ASCII.
+    const d = new TextDecoder();
+    assert(d.decode(bytes([0xE0]), { stream: true }), "");
+    assert(d.decode(bytes([0x41])), "�A");
+}
+
+function test_decoder_brand() {
+    assertThrows(TypeError, () => TextDecoder.prototype.decode.call({}));
+    const enc_get =
+        Object.getOwnPropertyDescriptor(TextDecoder.prototype, "encoding").get;
+    assertThrows(TypeError, () => enc_get.call({}));
+    const fatal_get =
+        Object.getOwnPropertyDescriptor(TextDecoder.prototype, "fatal").get;
+    assertThrows(TypeError, () => fatal_get.call({}));
+    // Constructor without `new`.
+    assertThrows(TypeError, () => TextDecoder());
+}
+
+test_encoder_basic();
+test_encoder_lone_surrogates();
+test_encode_into_basic();
+test_encode_into_partial();
+test_encode_into_argument_errors();
+test_encoder_brand();
+test_decoder_basic();
+test_decoder_input_types();
+test_decoder_label();
+test_decoder_options();
+test_decoder_bom();
+test_decoder_invalid_sequences();
+test_decoder_fatal();
+test_decoder_stream();
+test_decoder_brand();