[augeas-devel] augeas: master - Support for case-insensitive operations in libfa
David Lutterkort
lutter at fedoraproject.org
Thu Jan 14 18:19:47 UTC 2010
Gitweb: http://git.fedorahosted.org/git/augeas.git?p=augeas.git;a=commitdiff;h=14590d5ce63a328ff63354134c3a128cdb45a408
Commit: 14590d5ce63a328ff63354134c3a128cdb45a408
Parent: c10145e8d4fb45015cdcd573e4ab774ddd42b677
Author: David Lutterkort <lutter at redhat.com>
AuthorDate: Tue Jan 12 15:05:55 2010 -0800
Committer: David Lutterkort <lutter at redhat.com>
CommitterDate: Wed Jan 13 10:27:34 2010 -0800
Support for case-insensitive operations in libfa
* src/fa.h (fa_nocase, fa_is_nocase): new functions
* src/fa.c (struct fa): new field nocase; (fa_is_basic): a total FA has
two transitions under nocase; (fa_clone): preserve nocase flag;
(union_in_place, concat_in_place, fa_intersect): if one operand is
nocase, and the other isn't, expand the nocase FA to a case-sensitive
one; (totalize): do not add transitions on [A-Z]; (fa_nocase,
fa_is_nocase): new functions; (case_expand): new function
* tests/fatest.c (assertExample): also test against a nocase FA;
(testNoCase): test concat etc. on nocase FA's
---
src/fa.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++---
src/fa.h | 13 +++++
src/fa_sym.version | 5 ++
tests/fatest.c | 31 +++++++++++
4 files changed, 186 insertions(+), 8 deletions(-)
diff --git a/src/fa.c b/src/fa.c
index c8d87fb..11e70a1 100644
--- a/src/fa.c
+++ b/src/fa.c
@@ -61,6 +61,7 @@ struct fa {
struct state *initial;
int deterministic : 1;
int minimal : 1;
+ unsigned int nocase : 1;
};
/* A state in a finite automaton. Transitions are never shared between
@@ -1835,11 +1836,27 @@ int fa_is_basic(struct fa *fa, unsigned int basic) {
} else if (basic == FA_EPSILON) {
return fa->initial->accept && fa->initial->tused == 0;
} else if (basic == FA_TOTAL) {
- if (! fa->initial->accept || fa->initial->tused != 1)
+ if (! fa->initial->accept)
return 0;
- struct trans *t = fa->initial->trans;
- return t->to == fa->initial &&
- t->min == UCHAR_MIN && t->max == UCHAR_MAX;
+ if (fa->nocase) {
+ if (fa->initial->tused != 2)
+ return 0;
+ struct trans *t1 = fa->initial->trans;
+ struct trans *t2 = fa->initial->trans + 1;
+ if (t1->to != fa->initial || t2->to != fa->initial)
+ return 0;
+ if (t2->max != UCHAR_MAX) {
+ t1 = t2;
+ t2 = fa->initial->trans;
+ }
+ return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
+ t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
+ } else {
+ struct trans *t = fa->initial->trans;
+ return fa->initial->tused == 1 &&
+ t->to == fa->initial &&
+ t->min == UCHAR_MIN && t->max == UCHAR_MAX;
+ }
}
return 0;
}
@@ -1854,6 +1871,7 @@ static struct fa *fa_clone(struct fa *fa) {
result->deterministic = fa->deterministic;
result->minimal = fa->minimal;
+ result->nocase = fa->nocase;
list_for_each(s, fa->initial) {
int i = state_set_push(set, s);
_E(i < 0);
@@ -1885,12 +1903,21 @@ static struct fa *fa_clone(struct fa *fa) {
return NULL;
}
+static int case_expand(struct fa *fa);
+
/* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
ATTRIBUTE_RETURN_CHECK
static int union_in_place(struct fa *fa1, struct fa **fa2) {
struct state *s;
int r;
+ if (fa1->nocase != (*fa2)->nocase) {
+ if (case_expand(fa1) < 0)
+ return -1;
+ if (case_expand(*fa2) < 0)
+ return -1;
+ }
+
s = add_state(fa1, 0);
if (s == NULL)
return -1;
@@ -1930,6 +1957,13 @@ ATTRIBUTE_RETURN_CHECK
static int concat_in_place(struct fa *fa1, struct fa **fa2) {
int r;
+ if (fa1->nocase != (*fa2)->nocase) {
+ if (case_expand(fa1) < 0)
+ return -1;
+ if (case_expand(*fa2) < 0)
+ return -1;
+ }
+
list_for_each(s, fa1->initial) {
if (s->accept) {
s->accept = 0;
@@ -2160,6 +2194,11 @@ struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
return fa_make_empty();
+ if (fa1->nocase != fa2->nocase) {
+ _F(case_expand(fa1));
+ _F(case_expand(fa2));
+ }
+
struct fa *fa = fa_make_empty();
struct state_set *worklist = state_set_init(-1, S_NONE);
state_triple_hash *newstates = state_triple_init();
@@ -2212,6 +2251,7 @@ struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
}
}
fa->deterministic = fa1->deterministic && fa2->deterministic;
+ fa->nocase = fa1->nocase && fa2->nocase;
done:
state_set_free(worklist);
state_triple_free(newstates);
@@ -2300,22 +2340,39 @@ static int totalize(struct fa *fa) {
_F(mark_reachable(fa));
sort_transition_intervals(fa);
- r = add_new_trans(crash, crash, UCHAR_MIN, UCHAR_MAX);
- if (r < 0)
- return -1;
+ if (fa->nocase) {
+ r = add_new_trans(crash, crash, UCHAR_MIN, 'A' - 1);
+ if (r < 0)
+ return -1;
+ r = add_new_trans(crash, crash, 'Z' + 1, UCHAR_MAX);
+ if (r < 0)
+ return -1;
+ } else {
+ r = add_new_trans(crash, crash, UCHAR_MIN, UCHAR_MAX);
+ if (r < 0)
+ return -1;
+ }
list_for_each(s, fa->initial) {
int next = UCHAR_MIN;
int tused = s->tused;
for (int i=0; i < tused; i++) {
uchar min = s->trans[i].min, max = s->trans[i].max;
+ if (fa->nocase) {
+ /* Don't add transitions on [A-Z] into crash */
+ if (isupper(min)) min = 'A';
+ if (isupper(max)) max = 'Z';
+ }
if (min > next) {
r = add_new_trans(s, crash, next, min - 1);
if (r < 0)
return -1;
}
- if (max + 1 > next)
+ if (max + 1 > next) {
next = max + 1;
+ if (fa->nocase && isupper(next))
+ next = 'Z' + 1;
+ }
}
if (next <= UCHAR_MAX) {
r = add_new_trans(s, crash, next, UCHAR_MAX);
@@ -2922,6 +2979,78 @@ int fa_compile(const char *regexp, size_t size, struct fa **fa) {
return parse.error;
}
+/* We represent a case-insensitive FA by using only transitions on
+ * lower-case letters.
+ */
+int fa_nocase(struct fa *fa) {
+ if (fa == NULL || fa->nocase)
+ return 0;
+
+ fa->nocase = 1;
+ list_for_each(s, fa->initial) {
+ int tused = s->tused;
+ /* For every transition on characters in [A-Z] add a corresponding
+ * transition on [a-z]; remove any portion covering [A-Z] */
+ for (int i=0; i < tused; i++) {
+ struct trans *t = s->trans + i;
+ int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
+ int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
+
+ if (t->min > 'Z' || t->max < 'A')
+ continue;
+ if (t->min >= 'A' && t->max <= 'Z') {
+ t->min = tolower(t->min);
+ t->max = tolower(t->max);
+ } else if (t->max <= 'Z') {
+ /* t->min < 'A' */
+ t->max = 'A' - 1;
+ _F(add_new_trans(s, t->to, lc_min, lc_max));
+ } else {
+ /* t->min < 'A' && t->max > 'Z' */
+ _F(add_new_trans(s, t->to, 'Z' + 1, t->max));
+ s->trans[i].max = 'A' - 1;
+ _F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
+ }
+ }
+ }
+ _F(collect(fa));
+ return 0;
+ error:
+ return -1;
+}
+
+int fa_is_nocase(struct fa *fa) {
+ return fa->nocase;
+}
+
+/* If FA is case-insensitive, turn it into a case-sensitive automaton by
+ * adding transitions on upper-case letters for each existing transition on
+ * lower-case letters */
+static int case_expand(struct fa *fa) {
+ if (! fa->nocase)
+ return 0;
+
+ fa->nocase = 0;
+ list_for_each(s, fa->initial) {
+ int tused = s->tused;
+ /* For every transition on characters in [a-z] add a corresponding
+ * transition on [A-Z] */
+ for (int i=0; i < tused; i++) {
+ struct trans *t = s->trans + i;
+ int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
+ int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
+
+ if (t->min > 'z' || t->max < 'a')
+ continue;
+ _F(add_new_trans(s, t->to, lc_min, lc_max));
+ }
+ }
+ _F(collect(fa));
+ return 0;
+ error:
+ return -1;
+}
+
/*
* Regular expression parser
*/
diff --git a/src/fa.h b/src/fa.h
index bf2b341..b9e08cf 100644
--- a/src/fa.h
+++ b/src/fa.h
@@ -67,6 +67,9 @@ extern int fa_minimization_algorithm;
* On success, FA points to the newly allocated automaton constructed for
* RE, and the function returns REG_NOERROR. Otherwise, FA is NULL, and the
* return value indicates the error.
+ *
+ * The FA is case sensitive. Call FA_NOCASE to switch it to
+ * case-insensitive.
*/
int fa_compile(const char *re, size_t size, struct fa **fa);
@@ -228,6 +231,16 @@ int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
*/
int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
char **newregexp, size_t *newregexp_len);
+
+/* Modify FA so that it matches ignoring case.
+ *
+ * Returns 0 on success, and -1 if an allocation fails. On failure, the
+ * automaton is not guaranteed to represent anything sensible.
+ */
+int fa_nocase(struct fa *fa);
+
+/* Return 1 if FA matches ignoring case, 0 if matches are case sensitive */
+int fa_is_nocase(struct fa *fa);
#endif
diff --git a/src/fa_sym.version b/src/fa_sym.version
index bc8554a..9101bdf 100644
--- a/src/fa_sym.version
+++ b/src/fa_sym.version
@@ -23,3 +23,8 @@ FA_1.0.0 {
fa_expand_char_ranges;
local: *;
};
+
+FA_1.2.0 {
+ fa_nocase;
+ fa_is_nocase;
+} FA_1.0.0;
diff --git a/tests/fatest.c b/tests/fatest.c
index ee2cbc3..e4ece92 100644
--- a/tests/fatest.c
+++ b/tests/fatest.c
@@ -28,6 +28,7 @@
#include "memory.h"
#include <stdio.h>
#include <stdlib.h>
+#include <ctype.h>
#define FA_DOT_DIR "FA_DOT_DIR"
@@ -329,6 +330,15 @@ static void assertExample(CuTest *tc, const char *regexp, const char *exp) {
fa_example(fa, &xmpl, &xmpl_len);
CuAssertStrEquals(tc, exp, xmpl);
free(xmpl);
+
+ fa_nocase(fa);
+ char *s = strdup(exp);
+ for (int i = 0; i < strlen(s); i++) s[i] = tolower(s[i]);
+
+ fa_example(fa, &xmpl, &xmpl_len);
+ CuAssertStrEquals(tc, s, xmpl);
+ free(xmpl);
+ free(s);
}
static void testExample(CuTest *tc) {
@@ -511,6 +521,26 @@ static void testExpandCharRanges(CuTest *tc) {
CuAssertIntEquals(tc, strlen(nre), nre_len);
}
+static void testNoCase(CuTest *tc) {
+ struct fa *fa1 = make_good_fa(tc, "[a-z0-9]");
+ struct fa *fa2 = make_good_fa(tc, "B");
+ struct fa *fa, *exp;
+ int r;
+
+ fa_nocase(fa1);
+ fa = fa_intersect(fa1, fa2);
+ CuAssertPtrNotNull(tc, fa);
+ r = fa_equals(fa, fa2);
+ fa_free(fa);
+ CuAssertIntEquals(tc, 1, r);
+
+ fa = fa_concat(fa1, fa2);
+ exp = make_good_fa(tc, "[a-zA-Z0-9]B");
+ r = fa_equals(fa, exp);
+ fa_free(fa);
+ CuAssertIntEquals(tc, 1, r);
+}
+
int main(int argc, char **argv) {
if (argc == 1) {
char *output = NULL;
@@ -533,6 +563,7 @@ int main(int argc, char **argv) {
SUITE_ADD_TEST(suite, testNul);
SUITE_ADD_TEST(suite, testRestrictAlphabet);
SUITE_ADD_TEST(suite, testExpandCharRanges);
+ SUITE_ADD_TEST(suite, testNoCase);
CuSuiteRun(suite);
CuSuiteSummary(suite, &output);
More information about the augeas-devel
mailing list