[augeas-devel] augeas: master - Restrict the alphabet of a regexp
David Lutterkort
lutter at fedoraproject.org
Tue Sep 1 18:09:17 UTC 2009
Gitweb: http://git.fedorahosted.org/git/augeas.git?p=augeas.git;a=commitdiff;h=8441a8bdc001e29338ab5756728ae892ca42f264
Commit: 8441a8bdc001e29338ab5756728ae892ca42f264
Parent: f1bc784e6a1c20248b6b7f184c65a9c640911539
Author: David Lutterkort <lutter at redhat.com>
AuthorDate: Sun Dec 14 23:54:53 2008 -0800
Committer: David Lutterkort <lutter at redhat.com>
CommitterDate: Mon Aug 31 14:36:27 2009 -0700
Restrict the alphabet of a regexp
---
src/fa.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++
src/fa.h | 20 ++++++++++++++
src/fa_sym.version | 1 +
tests/fatest.c | 27 +++++++++++++++++++
4 files changed, 119 insertions(+), 0 deletions(-)
diff --git a/src/fa.c b/src/fa.c
index d3edc3c..062c4b3 100644
--- a/src/fa.c
+++ b/src/fa.c
@@ -130,6 +130,10 @@ static void bitset_free(bitset *bs) {
free(bs);
}
+static void bitset_negate(bitset *bs, size_t nbits) {
+ for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
+ bs[i] = ~ bs[i];
+}
/*
* Representation of a parsed regular expression. The regular expression is
@@ -3474,6 +3478,73 @@ int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
return -1;
}
+static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
+ int r1, r2;
+ int result = 0;
+
+ switch(re->type) {
+ case UNION:
+ case CONCAT:
+ r1 = re_restrict_alphabet(re->exp1, from, to);
+ r2 = re_restrict_alphabet(re->exp2, from, to);
+ result = (r1 != 0) ? r1 : r2;
+ break;
+ case CSET:
+ if (re->negate) {
+ re->negate = 0;
+ bitset_negate(re->cset, UCHAR_NUM);
+ }
+ for (int i=from; i <= to; i++)
+ bitset_clr(re->cset, i);
+ break;
+ case CHAR:
+ if (from <= re->c && re->c <= to)
+ result = -1;
+ break;
+ case ITER:
+ result = re_restrict_alphabet(re->exp, from, to);
+ break;
+ case EPSILON:
+ break;
+ default:
+ assert(0);
+ abort();
+ break;
+ }
+ return result;
+}
+
+int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
+ char **newregexp, size_t *newregexp_len,
+ char from, char to) {
+ int result;
+ struct re *re = NULL;
+ struct re_parse parse;
+ struct re_str str;
+
+ *newregexp = NULL;
+ parse.rx = regexp;
+ parse.rend = regexp + regexp_len;
+ parse.error = REG_NOERROR;
+ re = parse_regexp(&parse);
+ if (parse.error != REG_NOERROR)
+ return parse.error;
+
+ result = re_restrict_alphabet(re, from, to);
+ if (result != 0) {
+ result = -2;
+ goto done;
+ }
+
+ MEMZERO(&str, 1);
+ result = re_as_string(re, &str);
+ *newregexp = str.rx;
+ *newregexp_len = str.len;
+ done:
+ re_unref(re);
+ return result;
+}
+
static void print_char(FILE *out, uchar c) {
/* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
Also, a space ' ' is shown as '\s' */
diff --git a/src/fa.h b/src/fa.h
index 08b3087..8b37016 100644
--- a/src/fa.h
+++ b/src/fa.h
@@ -197,6 +197,26 @@ int fa_ambig_example(struct fa *fa1, struct fa *fa2,
* to fail for FA_AS_REGEXP is running out of memory.
*/
int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len);
+
+/* Given the regular expression REGEXP construct a new regular expression
+ * NEWREGEXP that does not match strings containing any of the characters
+ * in the range FROM to TO, with the endpoints included.
+ *
+ * The new regular expression is constructed by removing the range FROM to
+ * TO from all character sets in REGEXP; if any of the characters [FROM,
+ * TO] appear outside a character set in REGEXP, return -2.
+ *
+ * Return 0 if NEWREGEXP was constructed successfully, -1 if an internal
+ * error happened (e.g., an allocation failed) and -2 if NEWREGEXP can not
+ * be constructed because any character in the range [FROM, TO] appears
+ * outside of a character set.
+ *
+ * Return a positive value if REGEXP is not syntactically valid; the value
+ * returned is one of the REG_ERRCODE_T POSIX error codes.
+ */
+int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
+ char **newregexp, size_t *newregexp_len,
+ char from, char to);
#endif
diff --git a/src/fa_sym.version b/src/fa_sym.version
index ab81e8e..a489d86 100644
--- a/src/fa_sym.version
+++ b/src/fa_sym.version
@@ -19,5 +19,6 @@ FA_1.0.0 {
fa_example;
fa_ambig_example;
fa_as_regexp;
+ fa_restrict_alphabet;
local: *;
};
diff --git a/tests/fatest.c b/tests/fatest.c
index e17990e..81b01be 100644
--- a/tests/fatest.c
+++ b/tests/fatest.c
@@ -417,6 +417,8 @@ static void testAsRegexp(CuTest *tc) {
assertFaAsRegexp(tc, "ab|cd");
assertFaAsRegexp(tc, "[a-z]+");
assertFaAsRegexp(tc, "[]a-]+");
+ assertFaAsRegexp(tc, "[^0-9A-Z]");
+ assertFaAsRegexp(tc, "ab|(xy[A-Z0-9])*(uv[^0-9]?)");
assertFaAsRegexp(tc, "[A-CE-GI-LN-QS-Z]");
}
@@ -460,6 +462,30 @@ static void testNul(CuTest *tc) {
CuAssertIntEquals(tc, 0, memcmp(re0, re, re0_len));
}
+static void testRestrictAlphabet(CuTest *tc) {
+ const char *re = "ab|(xy[B-Z0-9])*(uv[^0-9]?)";
+ struct fa *fa_exp = make_good_fa(tc, "((xy[0-9])*)uv[^0-9A-Z]?|ab");
+ struct fa *fa_act = NULL;
+ size_t nre_len;
+ char *nre;
+ int r;
+
+ r = fa_restrict_alphabet(re, strlen(re), &nre, &nre_len, 'A', 'Z');
+ CuAssertIntEquals(tc, 0, r);
+ CuAssertIntEquals(tc, nre_len, strlen(nre));
+ fa_act = make_good_fa(tc, nre);
+ CuAssertTrue(tc, fa_equals(fa_exp, fa_act));
+ free(nre);
+
+ r = fa_restrict_alphabet("HELLO", strlen("HELLO"),
+ &nre, &nre_len, 'A', 'Z');
+ CuAssertIntEquals(tc, -2, r);
+ CuAssertPtrEquals(tc, NULL, nre);
+
+ r = fa_restrict_alphabet("a{2,", strlen("a{2"), &nre, &nre_len, 'A', 'Z');
+ CuAssertIntEquals(tc, REG_EBRACE, r);
+}
+
int main(int argc, char **argv) {
if (argc == 1) {
char *output = NULL;
@@ -480,6 +506,7 @@ int main(int argc, char **argv) {
SUITE_ADD_TEST(suite, testAsRegexpMinus);
SUITE_ADD_TEST(suite, testRangeEnd);
SUITE_ADD_TEST(suite, testNul);
+ SUITE_ADD_TEST(suite, testRestrictAlphabet);
CuSuiteRun(suite);
CuSuiteSummary(suite, &output);
More information about the augeas-devel
mailing list