[augeas-devel] augeas: master - Restrict the alphabet of a regexp

Tue Sep 1 18:09:17 UTC 2009

Gitweb:        http://git.fedorahosted.org/git/augeas.git?p=augeas.git;a=commitdiff;h=8441a8bdc001e29338ab5756728ae892ca42f264
Commit:        8441a8bdc001e29338ab5756728ae892ca42f264
Parent:        f1bc784e6a1c20248b6b7f184c65a9c640911539
Author:        David Lutterkort <lutter at redhat.com>
AuthorDate:    Sun Dec 14 23:54:53 2008 -0800
Committer:     David Lutterkort <lutter at redhat.com>
CommitterDate: Mon Aug 31 14:36:27 2009 -0700

Restrict the alphabet of a regexp

---
 src/fa.c           |   71 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/fa.h           |   20 ++++++++++++++
 src/fa_sym.version |    1 +
 tests/fatest.c     |   27 +++++++++++++++++++
 4 files changed, 119 insertions(+), 0 deletions(-)

diff --git a/src/fa.c b/src/fa.c
index d3edc3c..062c4b3 100644
--- a/src/fa.c
+++ b/src/fa.c
@@ -130,6 +130,10 @@ static void bitset_free(bitset *bs) {
     free(bs);
 }
 
+static void bitset_negate(bitset *bs, size_t nbits) {
+    for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
+        bs[i] = ~ bs[i];
+}
 
 /*
  * Representation of a parsed regular expression. The regular expression is
@@ -3474,6 +3478,73 @@ int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
     return -1;
 }
 
+static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
+    int r1, r2;
+    int result = 0;
+
+    switch(re->type) {
+    case UNION:
+    case CONCAT:
+        r1 = re_restrict_alphabet(re->exp1, from, to);
+        r2 = re_restrict_alphabet(re->exp2, from, to);
+        result = (r1 != 0) ? r1 : r2;
+        break;
+    case CSET:
+        if (re->negate) {
+            re->negate = 0;
+            bitset_negate(re->cset, UCHAR_NUM);
+        }
+        for (int i=from; i <= to; i++)
+            bitset_clr(re->cset, i);
+        break;
+    case CHAR:
+        if (from <= re->c && re->c <= to)
+            result = -1;
+        break;
+    case ITER:
+        result = re_restrict_alphabet(re->exp, from, to);
+        break;
+    case EPSILON:
+        break;
+    default:
+        assert(0);
+        abort();
+        break;
+    }
+    return result;
+}
+
+int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
+                         char **newregexp, size_t *newregexp_len,
+                         char from, char to) {
+    int result;
+    struct re *re = NULL;
+    struct re_parse parse;
+    struct re_str str;
+
+    *newregexp = NULL;
+    parse.rx = regexp;
+    parse.rend = regexp + regexp_len;
+    parse.error = REG_NOERROR;
+    re = parse_regexp(&parse);
+    if (parse.error != REG_NOERROR)
+        return parse.error;
+
+    result = re_restrict_alphabet(re, from, to);
+    if (result != 0) {
+        result = -2;
+        goto done;
+    }
+
+    MEMZERO(&str, 1);
+    result = re_as_string(re, &str);
+    *newregexp = str.rx;
+    *newregexp_len = str.len;
+ done:
+    re_unref(re);
+    return result;
+}
+
 static void print_char(FILE *out, uchar c) {
     /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
        Also, a space ' ' is shown as '\s' */
diff --git a/src/fa.h b/src/fa.h
index 08b3087..8b37016 100644
--- a/src/fa.h
+++ b/src/fa.h
@@ -197,6 +197,26 @@ int fa_ambig_example(struct fa *fa1, struct fa *fa2,
  * to fail for FA_AS_REGEXP is running out of memory.
  */
 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len);
+
+/* Given the regular expression REGEXP construct a new regular expression
+ * NEWREGEXP that does not match strings containing any of the characters
+ * in the range FROM to TO, with the endpoints included.
+ *
+ * The new regular expression is constructed by removing the range FROM to
+ * TO from all character sets in REGEXP; if any of the characters [FROM,
+ * TO] appear outside a character set in REGEXP, return -2.
+ *
+ * Return 0 if NEWREGEXP was constructed successfully, -1 if an internal
+ * error happened (e.g., an allocation failed) and -2 if NEWREGEXP can not
+ * be constructed because any character in the range [FROM, TO] appears
+ * outside of a character set.
+ *
+ * Return a positive value if REGEXP is not syntactically valid; the value
+ * returned is one of the REG_ERRCODE_T POSIX error codes.
+ */
+int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
+                         char **newregexp, size_t *newregexp_len,
+                         char from, char to);
 #endif
 
 
diff --git a/src/fa_sym.version b/src/fa_sym.version
index ab81e8e..a489d86 100644
--- a/src/fa_sym.version
+++ b/src/fa_sym.version
@@ -19,5 +19,6 @@ FA_1.0.0 {
       fa_example;
       fa_ambig_example;
       fa_as_regexp;
+      fa_restrict_alphabet;
     local: *;
 };
diff --git a/tests/fatest.c b/tests/fatest.c
index e17990e..81b01be 100644
--- a/tests/fatest.c
+++ b/tests/fatest.c
@@ -417,6 +417,8 @@ static void testAsRegexp(CuTest *tc) {
     assertFaAsRegexp(tc, "ab|cd");
     assertFaAsRegexp(tc, "[a-z]+");
     assertFaAsRegexp(tc, "[]a-]+");
+    assertFaAsRegexp(tc, "[^0-9A-Z]");
+    assertFaAsRegexp(tc, "ab|(xy[A-Z0-9])*(uv[^0-9]?)");
     assertFaAsRegexp(tc, "[A-CE-GI-LN-QS-Z]");
 }
 
@@ -460,6 +462,30 @@ static void testNul(CuTest *tc) {
     CuAssertIntEquals(tc, 0, memcmp(re0, re, re0_len));
 }
 
+static void testRestrictAlphabet(CuTest *tc) {
+    const char *re = "ab|(xy[B-Z0-9])*(uv[^0-9]?)";
+    struct fa *fa_exp = make_good_fa(tc, "((xy[0-9])*)uv[^0-9A-Z]?|ab");
+    struct fa *fa_act = NULL;
+    size_t nre_len;
+    char *nre;
+    int r;
+
+    r = fa_restrict_alphabet(re, strlen(re), &nre, &nre_len, 'A', 'Z');
+    CuAssertIntEquals(tc, 0, r);
+    CuAssertIntEquals(tc, nre_len, strlen(nre));
+    fa_act = make_good_fa(tc, nre);
+    CuAssertTrue(tc, fa_equals(fa_exp, fa_act));
+    free(nre);
+
+    r = fa_restrict_alphabet("HELLO", strlen("HELLO"),
+                             &nre, &nre_len, 'A', 'Z');
+    CuAssertIntEquals(tc, -2, r);
+    CuAssertPtrEquals(tc, NULL, nre);
+
+    r = fa_restrict_alphabet("a{2,", strlen("a{2"), &nre, &nre_len, 'A', 'Z');
+    CuAssertIntEquals(tc, REG_EBRACE, r);
+}
+
 int main(int argc, char **argv) {
     if (argc == 1) {
         char *output = NULL;
@@ -480,6 +506,7 @@ int main(int argc, char **argv) {
         SUITE_ADD_TEST(suite, testAsRegexpMinus);
         SUITE_ADD_TEST(suite, testRangeEnd);
         SUITE_ADD_TEST(suite, testNul);
+        SUITE_ADD_TEST(suite, testRestrictAlphabet);
 
         CuSuiteRun(suite);
         CuSuiteSummary(suite, &output);