slre: implement support for ranges in character classes

When trying to use U-Boot's regex facility, it is a rather large
gotcha that [a-z] range syntax is not supported. It doesn't require a
lot of extra code to implement that; we just let the regular parsing
emit the start and end literal symbols as usual, and add a new
"escape" code RANGE.

At match time, for a class such as [a-z], this means the code will
first just see an 'a' and try to match it as a literal, and only then
recognize that it is actually the start of a range and do the
'a' <= ch <= 'z' test.

Of course, this means that a '-' in the middle of a [] pair no longer
matches a literal dash, but I highly doubt anybody relies on
that. Putting the '-' first or last, or escaping it with '\', as in
most other RE engines, continues to work.

Reviewed-by: Simon Glass <sjg@chromium.org>
Signed-off-by: Rasmus Villemoes <ravi@prevas.dk>
This commit is contained in:
Rasmus Villemoes
2025-05-13 10:40:32 +02:00
committed by Tom Rini
parent 4d08883556
commit fe4f211850

View File

@@ -30,7 +30,7 @@
#include <slre.h>
enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT, RANGE};
#ifdef SLRE_TEST
static struct {
@@ -55,7 +55,8 @@ static struct {
{"QUEST", 1, "o"}, /* Match zero or one time, "?" */
{"SPACE", 0, ""}, /* Match whitespace, "\s" */
{"NONSPACE", 0, ""}, /* Match non-space, "\S" */
{"DIGIT", 0, ""} /* Match digit, "\d" */
{"DIGIT", 0, ""}, /* Match digit, "\d" */
{"RANGE", 0, ""}, /* Range separator - */
};
#endif /* SLRE_TEST */
@@ -260,6 +261,15 @@ anyof(struct slre *r, const char **re)
return;
/* NOTREACHED */
break;
case '-':
if (r->data_size == old_data_size || **re == ']') {
/* First or last character, just match - itself. */
store_char_in_data(r, '-');
break;
}
store_char_in_data(r, 0);
store_char_in_data(r, RANGE);
break;
case '\\':
esc = get_escape_char(re);
if ((esc & 0xff) == 0) {
@@ -487,6 +497,14 @@ is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
if (isdigit(ch))
goto match;
break;
case RANGE:
/*
* a-z is represented in the data array as {'a', \0, RANGE, 'z'}
*/
++i;
if (p[i - 3] <= (unsigned char)ch && (unsigned char)ch <= p[i])
goto match;
break;
}
continue;
}