Skip to content

Commit 776d1a4

Browse files
committed
support \p{L} Unicode property escapes (issue #583)
1 parent f5e38b9 commit 776d1a4

3 files changed

Lines changed: 59 additions & 0 deletions

File tree

src/main/java/org/htmlunit/javascript/regexp/RegExpJsToJavaConverter.java

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
import java.util.ArrayDeque;
1818
import java.util.ArrayList;
1919
import java.util.Deque;
20+
import java.util.HashMap;
2021
import java.util.List;
2122

2223
/**
@@ -37,13 +38,20 @@
3738
public class RegExpJsToJavaConverter {
3839

3940
private static final String DIGITS = "0123456789";
41+
private static final HashMap<String, String> UNICODE_ESCAPES;
4042

4143
private Tape tape_;
4244
private boolean insideCharClass_;
4345
private boolean insideRepetition_;
4446
private Deque<Subexpresion> parsingSubexpressions_;
4547
private List<Subexpresion> subexpressions_;
4648

49+
static {
50+
UNICODE_ESCAPES = new HashMap<>();
51+
UNICODE_ESCAPES.put("L", "L");
52+
UNICODE_ESCAPES.put("Letter", "L");
53+
}
54+
4755
/**
4856
* Helper to encapsulate the transformations.
4957
*/
@@ -373,6 +381,34 @@ private void processEscapeSequence() {
373381
return;
374382
}
375383

384+
// Unicode property escape
385+
if ('p' == escapeSequence) {
386+
int next = tape_.read();
387+
if (next > -1) {
388+
if (next == '{') {
389+
final int uPos = tape_.currentPos_;
390+
do {
391+
next = tape_.read();
392+
}
393+
while (next > -1 && next != '}');
394+
if (next == '}') {
395+
final String escape = tape_.tape_.substring(uPos, tape_.currentPos_ - 1);
396+
final String replace = UNICODE_ESCAPES.get(escape);
397+
if (replace != null) {
398+
tape_.tape_.replace(uPos, uPos + escape.length(), replace);
399+
return;
400+
}
401+
}
402+
403+
// unsupported property name or missing '}': back to the old behavior (rewind and drop the backslash before 'p')
404+
tape_.move(uPos - tape_.currentPos_ - 3);
405+
tape_.remove(1);
406+
}
407+
}
408+
409+
return;
410+
}
411+
376412
if ("ACEFGHIJKLMNOPQRTUVXYZaeghijklmpqyz".indexOf(escapeSequence) > -1) {
377413
// no need to escape these chars
378414
tape_.move(-2);

src/test/java/org/htmlunit/javascript/regexp/RegExpJsToJavaConverter2Test.java

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,16 @@ public void validationPatternUnicodeCodePointEscapesFails() throws Exception {
8686
validation("123\\u{1D306}", "123&#x1D307;");
8787
}
8888

89+
/**
90+
* @throws Exception if an error occurs
91+
*/
92+
@Test
93+
@Alerts("true")
94+
public void validationPatternUnicodePropertyEscapeL() throws Exception {
95+
validation("\\p{L}*", "Html");
96+
validation("\\p{L}*", "&#x043C&#x0439&#x0440");
97+
}
98+
8999
private void validation(final String pattern, final String value) throws Exception {
90100
final String html =
91101
"<html><head>\n"

src/test/java/org/htmlunit/javascript/regexp/RegExpJsToJavaConverterTest.java

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -383,4 +383,17 @@ public void unicode() {
383383
assertEquals("\\u{FFFFD", regExpJsToJavaConverter.convert("\\u{FFFFD"));
384384
assertEquals("\\x{FFFFD}\\}", regExpJsToJavaConverter.convert("\\u{FFFFD}}"));
385385
}
386+
387+
/**
388+
* Unicode property escapes.
389+
*/
390+
@Test
391+
public void unicodePropertyEscapes() {
392+
final RegExpJsToJavaConverter regExpJsToJavaConverter = new RegExpJsToJavaConverter();
393+
394+
assertEquals("\\p{L}0-9", regExpJsToJavaConverter.convert("\\p{L}0-9"));
395+
assertEquals("\\p{L}0-9", regExpJsToJavaConverter.convert("\\p{Letter}0-9"));
396+
397+
assertEquals("p\\{html\\}0-9", regExpJsToJavaConverter.convert("\\p{html}0-9"));
398+
}
386399
}

0 commit comments

Comments
 (0)