[augeas-devel] [PATCH 2/8] Support for case-insensitive operations in libfa

lutter at redhat.com lutter at redhat.com
Wed Jan 13 18:40:39 UTC 2010


From: David Lutterkort <lutter at redhat.com>

  * src/fa.h (fa_nocase, fa_is_nocase): new functions
  * src/fa.c (struct fa): new field nocase; (fa_is_basic): a total FA has
    two transitions under nocase; (fa_clone): preserve nocase flag;
    (union_in_place, concat_in_place, fa_intersect): if one operand is
    nocase, and the other isn't, expand the nocase FA to a case-sensitive
    one; (totalize): do not add transitions on [A-Z]; (fa_nocase,
    fa_is_nocase): new functions; (case_expand): new function
  * tests/fatest.c (assertExample): also test against a nocase FA;
    (testNoCase): test concat etc. on nocase FA's
---
 src/fa.c           |  145 +++++++++++++++++++++++++++++++++++++++++++++++++---
 src/fa.h           |   13 +++++
 src/fa_sym.version |    5 ++
 tests/fatest.c     |   31 +++++++++++
 4 files changed, 186 insertions(+), 8 deletions(-)

diff --git a/src/fa.c b/src/fa.c
index c8d87fb..11e70a1 100644
--- a/src/fa.c
+++ b/src/fa.c
@@ -61,6 +61,7 @@ struct fa {
     struct state *initial;
     int           deterministic : 1;
     int           minimal : 1;
+    unsigned int  nocase : 1;
 };
 
 /* A state in a finite automaton. Transitions are never shared between
@@ -1835,11 +1836,27 @@ int fa_is_basic(struct fa *fa, unsigned int basic) {
     } else if (basic == FA_EPSILON) {
         return fa->initial->accept && fa->initial->tused == 0;
     } else if (basic == FA_TOTAL) {
-        if (! fa->initial->accept || fa->initial->tused != 1)
+        if (! fa->initial->accept)
             return 0;
-        struct trans *t = fa->initial->trans;
-        return t->to == fa->initial &&
-            t->min == UCHAR_MIN && t->max == UCHAR_MAX;
+        if (fa->nocase) {
+            if (fa->initial->tused != 2)
+                return 0;
+            struct trans *t1 = fa->initial->trans;
+            struct trans *t2 = fa->initial->trans + 1;
+            if (t1->to != fa->initial || t2->to != fa->initial)
+                return 0;
+            if (t2->max != UCHAR_MAX) {
+                t1 = t2;
+                t2 = fa->initial->trans;
+            }
+            return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
+                    t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
+        } else {
+            struct trans *t = fa->initial->trans;
+            return fa->initial->tused == 1 &&
+                t->to == fa->initial &&
+                t->min == UCHAR_MIN && t->max == UCHAR_MAX;
+        }
     }
     return 0;
 }
@@ -1854,6 +1871,7 @@ static struct fa *fa_clone(struct fa *fa) {
 
     result->deterministic = fa->deterministic;
     result->minimal = fa->minimal;
+    result->nocase = fa->nocase;
     list_for_each(s, fa->initial) {
         int i = state_set_push(set, s);
         _E(i < 0);
@@ -1885,12 +1903,21 @@ static struct fa *fa_clone(struct fa *fa) {
     return NULL;
 }
 
+static int case_expand(struct fa *fa);
+
 /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
 ATTRIBUTE_RETURN_CHECK
 static int union_in_place(struct fa *fa1, struct fa **fa2) {
     struct state *s;
     int r;
 
+    if (fa1->nocase != (*fa2)->nocase) {
+        if (case_expand(fa1) < 0)
+            return -1;
+        if (case_expand(*fa2) < 0)
+            return -1;
+    }
+
     s = add_state(fa1, 0);
     if (s == NULL)
         return -1;
@@ -1930,6 +1957,13 @@ ATTRIBUTE_RETURN_CHECK
 static int concat_in_place(struct fa *fa1, struct fa **fa2) {
     int r;
 
+    if (fa1->nocase != (*fa2)->nocase) {
+        if (case_expand(fa1) < 0)
+            return -1;
+        if (case_expand(*fa2) < 0)
+            return -1;
+    }
+
     list_for_each(s, fa1->initial) {
         if (s->accept) {
             s->accept = 0;
@@ -2160,6 +2194,11 @@ struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
     if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
         return fa_make_empty();
 
+    if (fa1->nocase != fa2->nocase) {
+        _F(case_expand(fa1));
+        _F(case_expand(fa2));
+    }
+
     struct fa *fa = fa_make_empty();
     struct state_set *worklist = state_set_init(-1, S_NONE);
     state_triple_hash *newstates = state_triple_init();
@@ -2212,6 +2251,7 @@ struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
         }
     }
     fa->deterministic = fa1->deterministic && fa2->deterministic;
+    fa->nocase = fa1->nocase && fa2->nocase;
  done:
     state_set_free(worklist);
     state_triple_free(newstates);
@@ -2300,22 +2340,39 @@ static int totalize(struct fa *fa) {
     _F(mark_reachable(fa));
     sort_transition_intervals(fa);
 
-    r = add_new_trans(crash, crash, UCHAR_MIN, UCHAR_MAX);
-    if (r < 0)
-        return -1;
+    if (fa->nocase) {
+        r = add_new_trans(crash, crash, UCHAR_MIN, 'A' - 1);
+        if (r < 0)
+            return -1;
+        r = add_new_trans(crash, crash, 'Z' + 1, UCHAR_MAX);
+        if (r < 0)
+            return -1;
+    } else {
+        r = add_new_trans(crash, crash, UCHAR_MIN, UCHAR_MAX);
+        if (r < 0)
+            return -1;
+    }
 
     list_for_each(s, fa->initial) {
         int next = UCHAR_MIN;
         int tused = s->tused;
         for (int i=0; i < tused; i++) {
             uchar min = s->trans[i].min, max = s->trans[i].max;
+            if (fa->nocase) {
+                /* Don't add transitions on [A-Z] into crash */
+                if (isupper(min)) min = 'A';
+                if (isupper(max)) max = 'Z';
+            }
             if (min > next) {
                 r = add_new_trans(s, crash, next, min - 1);
                 if (r < 0)
                     return -1;
             }
-            if (max + 1 > next)
+            if (max + 1 > next) {
                 next = max + 1;
+                if (fa->nocase && isupper(next))
+                    next = 'Z' + 1;
+            }
         }
         if (next <= UCHAR_MAX) {
             r = add_new_trans(s, crash, next, UCHAR_MAX);
@@ -2922,6 +2979,78 @@ int fa_compile(const char *regexp, size_t size, struct fa **fa) {
     return parse.error;
 }
 
+/* We represent a case-insensitive FA by using only transitions on
+ * lower-case letters.
+ */
+int fa_nocase(struct fa *fa) {
+    if (fa == NULL || fa->nocase)
+        return 0;
+
+    fa->nocase = 1;
+    list_for_each(s, fa->initial) {
+        int tused = s->tused;
+        /* For every transition on characters in [A-Z] add a corresponding
+         * transition on [a-z]; remove any portion covering [A-Z] */
+        for (int i=0; i < tused; i++) {
+            struct trans *t = s->trans + i;
+            int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
+            int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
+
+            if (t->min > 'Z' || t->max < 'A')
+                continue;
+            if (t->min >= 'A' && t->max <= 'Z') {
+                t->min = tolower(t->min);
+                t->max = tolower(t->max);
+            } else if (t->max <= 'Z') {
+                /* t->min < 'A' */
+                t->max = 'A' - 1;
+                _F(add_new_trans(s, t->to, lc_min, lc_max));
+            } else {
+                /* t->min < 'A' && t->max > 'Z' */
+                _F(add_new_trans(s, t->to, 'Z' + 1, t->max));
+                s->trans[i].max = 'A' - 1;
+                _F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
+            }
+        }
+    }
+    _F(collect(fa));
+    return 0;
+ error:
+    return -1;
+}
+
+int fa_is_nocase(struct fa *fa) {
+    return fa->nocase;
+}
+
+/* If FA is case-insensitive, turn it into a case-sensitive automaton by
+ * adding transitions on upper-case letters for each existing transition on
+ * lower-case letters */
+static int case_expand(struct fa *fa) {
+    if (! fa->nocase)
+        return 0;
+
+    fa->nocase = 0;
+    list_for_each(s, fa->initial) {
+        int tused = s->tused;
+        /* For every transition on characters in [a-z] add a corresponding
+         * transition on [A-Z] */
+        for (int i=0; i < tused; i++) {
+            struct trans *t = s->trans + i;
+            int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
+            int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
+
+            if (t->min > 'z' || t->max < 'a')
+                continue;
+            _F(add_new_trans(s, t->to, lc_min, lc_max));
+        }
+    }
+    _F(collect(fa));
+    return 0;
+ error:
+    return -1;
+}
+
 /*
  * Regular expression parser
  */
diff --git a/src/fa.h b/src/fa.h
index bf2b341..b9e08cf 100644
--- a/src/fa.h
+++ b/src/fa.h
@@ -67,6 +67,9 @@ extern int fa_minimization_algorithm;
  * On success, FA points to the newly allocated automaton constructed for
  * RE, and the function returns REG_NOERROR. Otherwise, FA is NULL, and the
  * return value indicates the error.
+ *
+ * The FA is case sensitive. Call FA_NOCASE to switch it to
+ * case-insensitive.
  */
 int fa_compile(const char *re, size_t size, struct fa **fa);
 
@@ -228,6 +231,16 @@ int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
  */
 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
                           char **newregexp, size_t *newregexp_len);
+
+/* Modify FA so that it matches ignoring case.
+ *
+ * Returns 0 on success, and -1 if an allocation fails. On failure, the
+ * automaton is not guaranteed to represent anything sensible.
+ */
+int fa_nocase(struct fa *fa);
+
+/* Return 1 if FA matches ignoring case, 0 if matches are case sensitive */
+int fa_is_nocase(struct fa *fa);
 #endif
 
 
diff --git a/src/fa_sym.version b/src/fa_sym.version
index bc8554a..9101bdf 100644
--- a/src/fa_sym.version
+++ b/src/fa_sym.version
@@ -23,3 +23,8 @@ FA_1.0.0 {
       fa_expand_char_ranges;
     local: *;
 };
+
+FA_1.2.0 {
+      fa_nocase;
+      fa_is_nocase;
+} FA_1.0.0;
diff --git a/tests/fatest.c b/tests/fatest.c
index ee2cbc3..e4ece92 100644
--- a/tests/fatest.c
+++ b/tests/fatest.c
@@ -28,6 +28,7 @@
 #include "memory.h"
 #include <stdio.h>
 #include <stdlib.h>
+#include <ctype.h>
 
 #define FA_DOT_DIR "FA_DOT_DIR"
 
@@ -329,6 +330,15 @@ static void assertExample(CuTest *tc, const char *regexp, const char *exp) {
     fa_example(fa, &xmpl, &xmpl_len);
     CuAssertStrEquals(tc, exp, xmpl);
     free(xmpl);
+
+    fa_nocase(fa);
+    char *s = strdup(exp);
+    for (int i = 0; i < strlen(s); i++) s[i] = tolower(s[i]);
+
+    fa_example(fa, &xmpl, &xmpl_len);
+    CuAssertStrEquals(tc, s, xmpl);
+    free(xmpl);
+    free(s);
 }
 
 static void testExample(CuTest *tc) {
@@ -511,6 +521,26 @@ static void testExpandCharRanges(CuTest *tc) {
     CuAssertIntEquals(tc, strlen(nre), nre_len);
 }
 
+static void testNoCase(CuTest *tc) {
+    struct fa *fa1 = make_good_fa(tc, "[a-z0-9]");
+    struct fa *fa2 = make_good_fa(tc, "B");
+    struct fa *fa, *exp;
+    int r;
+
+    fa_nocase(fa1);
+    fa = fa_intersect(fa1, fa2);
+    CuAssertPtrNotNull(tc, fa);
+    r = fa_equals(fa, fa2);
+    fa_free(fa);
+    CuAssertIntEquals(tc, 1, r);
+
+    fa = fa_concat(fa1, fa2);
+    exp = make_good_fa(tc, "[a-zA-Z0-9]B");
+    r = fa_equals(fa, exp);
+    fa_free(fa);
+    CuAssertIntEquals(tc, 1, r);
+}
+
 int main(int argc, char **argv) {
     if (argc == 1) {
         char *output = NULL;
@@ -533,6 +563,7 @@ int main(int argc, char **argv) {
         SUITE_ADD_TEST(suite, testNul);
         SUITE_ADD_TEST(suite, testRestrictAlphabet);
         SUITE_ADD_TEST(suite, testExpandCharRanges);
+        SUITE_ADD_TEST(suite, testNoCase);
 
         CuSuiteRun(suite);
         CuSuiteSummary(suite, &output);
-- 
1.6.5.2




More information about the augeas-devel mailing list