[augeas-devel] [PATCH 4 of 4] Add fa_as_regexp that converts an automaton back to a regexp
David Lutterkort
dlutter at redhat.com
Fri May 9 01:33:52 UTC 2008
# HG changeset patch
# User David Lutterkort <dlutter at redhat.com>
# Date 1210296750 25200
# Node ID 3d2531fc583a9168d3b4d9831e55cd771781ef4c
# Parent 0d183841fd643cf4f57102650dff2653e7062b7c
Add fa_as_regexp that converts an automaton back to a regexp
During conversion from FA to regexp, our FA mutates into a 'generalized
transition graph' where transitions aren't labelled with character
intervals, but with regular expressions.
There are lots of gyrations to keep the generated regexp reasonably short
(and syntactically correct)
diff -r 0d183841fd64 -r 3d2531fc583a src/fa.c
--- a/src/fa.c Thu May 08 18:27:26 2008 -0700
+++ b/src/fa.c Thu May 08 18:32:30 2008 -0700
@@ -76,8 +76,13 @@ struct state {
*/
struct trans {
struct state *to;
- uchar min;
- uchar max;
+ union {
+ struct {
+ uchar min;
+ uchar max;
+ };
+ struct re *re;
+ };
};
@@ -303,6 +308,8 @@ static struct state *add_state(struct fa
}
return s;
}
+
+#define last_trans(s) ((s)->trans + (s)->tused - 1)
#define for_each_trans(t, s) \
for (struct trans *t = (s)->trans; \
@@ -2681,6 +2688,557 @@ static struct re *parse_regexp(const cha
return NULL;
}
+/*
+ * Convert a STRUCT RE to a string. Code is complicated by the fact that
+ * we try to be clever and avoid unneeded parens and concatenation with
+ * epsilon etc.
+ */
+static char *re_as_string(const struct re *re);
+
+static int re_binop_count(enum re_type type, const struct re *re) {
+ assert(type == CONCAT || type == UNION);
+ if (re->type == type) {
+ return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
+ } else {
+ return 1;
+ }
+}
+
+static int re_binop_store(enum re_type type, const struct re *re,
+ const struct re **list) {
+ int pos = 0;
+ if (type == re->type) {
+ pos = re_binop_store(type, re->exp1, list);
+ pos += re_binop_store(type, re->exp2, list + pos);
+ } else {
+ list[0] = re;
+ pos = 1;
+ }
+ return pos;
+}
+
+static char *re_union_as_string(const struct re *re) {
+ assert(re->type == UNION);
+
+ const struct re **res = NULL;
+ char **strings = NULL;
+ int nre = 0, r;
+ char *result = NULL;
+
+ nre = re_binop_count(re->type, re);
+ r = ALLOC_N(res, nre);
+ if (r < 0)
+ goto done;
+
+ re_binop_store(re->type, re, res);
+
+ r = ALLOC_N(strings, nre);
+ if (r < 0)
+ goto done;
+
+ int len = 0;
+ for (int i=0; i < nre; i++) {
+ strings[i] = re_as_string(res[i]);
+ len += strlen(strings[i]);
+ }
+ len += (nre-1) + 1;
+
+ r = ALLOC_N(result, len);
+ if (r < 0)
+ goto done;
+
+ char *p = result;
+ for (int i=0; i < nre; i++) {
+ if (i>0)
+ *p++ = '|';
+ p = stpcpy(p, strings[i]);
+ }
+
+ done:
+ free(res);
+ if (strings != NULL) {
+ for (int i=0; i < nre; i++)
+ free(strings[i]);
+ }
+ free(strings);
+ return result;
+}
+
+ATTRIBUTE_PURE
+static int re_needs_parens_in_concat(const struct re *re) {
+ if (re->type == ITER)
+ re = re->exp;
+
+ return (re->type != CHAR && re->type != CSET);
+}
+
+static char *re_concat_as_string(const struct re *re) {
+ assert(re->type == CONCAT);
+
+ const struct re **res = NULL;
+ char **strings = NULL;
+ int nre = 0, r;
+ char *result = NULL;
+
+ nre = re_binop_count(re->type, re);
+ r = ALLOC_N(res, nre);
+ if (r < 0)
+ goto done;
+ re_binop_store(re->type, re, res);
+
+ r = ALLOC_N(strings, nre);
+ if (r < 0)
+ goto done;
+
+ int len = 0;
+ for (int i=0; i < nre; i++) {
+ if (res[i]->type == EPSILON)
+ continue;
+ strings[i] = re_as_string(res[i]);
+ len += strlen(strings[i]);
+ if (re_needs_parens_in_concat(res[i]))
+ len += 2;
+ }
+ len += 1;
+
+ r = ALLOC_N(result, len);
+ if (r < 0)
+ goto done;
+
+ char *p = result;
+ for (int i=0; i < nre; i++) {
+ if (res[i]->type == EPSILON)
+ continue;
+ if (re_needs_parens_in_concat(res[i]))
+ *p++ = '(';
+ p = stpcpy(p, strings[i]);
+ if (re_needs_parens_in_concat(res[i]))
+ *p++ = ')';
+ }
+
+ done:
+ free(res);
+ if (strings != NULL) {
+ for (int i=0; i < nre; i++)
+ free(strings[i]);
+ }
+ free(strings);
+ return result;
+}
+
+static char *re_cset_as_string(const struct re *re) {
+ const uchar rbrack = ']';
+ const uchar dash = '-';
+ static const char *const empty_set = "[]";
+ static const char *const empty_nset = "[^]";
+ static const char *const total_set = "(.|\n)";
+
+ char *result = NULL, *s;
+ int from, to, negate;
+ size_t set_len, nset_len, len;
+ int set_rbrack, set_dash, nset_rbrack, nset_dash;
+ int r;
+
+ set_len = strlen(empty_set);
+ nset_len = strlen(empty_nset);
+
+ /* See if ']' and '-' will be explicitly included in the character set
+ (SET_RBRACK, SET_DASH) or in the negated character set (NSET_RBRACK,
+ NSET_DASH) As we loop over the character set, we reset these flags
+ if they are in the set/negated set, but not mentioned explicitly
+ */
+ set_rbrack = re->cset[rbrack];
+ set_dash = re->cset[dash];
+ nset_rbrack = !set_rbrack;
+ nset_dash = !set_dash;
+
+ for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
+ int include = re->cset[from];
+ for (to = from;
+ to < UCHAR_MAX && (re->cset[to+1] == include);
+ to++);
+
+ if (to == from && (from == rbrack || from == dash))
+ continue;
+ if (from == rbrack || from == dash)
+ from += 1;
+ if (to == rbrack || to == dash)
+ to -= 1;
+
+ len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
+ if (include) {
+ if (from < rbrack && rbrack < to)
+ set_rbrack = 0;
+ if (from < dash && dash < to)
+ set_dash = 0;
+ set_len += len;
+ } else {
+ if (from < rbrack && rbrack < to)
+ nset_rbrack = 0;
+ if (from < dash && dash < to)
+ nset_dash = 0;
+ nset_len += len;
+ }
+ }
+ set_len += set_rbrack + set_dash;
+ nset_len += nset_rbrack + nset_dash;
+
+ if (nset_len == strlen(empty_nset)) {
+ /* Special case: the set matches every character */
+ return strdup(total_set);
+ }
+
+ if (set_len < nset_len) {
+ negate = 0;
+ r = ALLOC_N(result, set_len + 1);
+ } else {
+ negate = 1;
+ r = ALLOC_N(result, nset_len + 1);
+ }
+ if (r < 0)
+ return NULL;
+
+ s = result;
+ *s++ = '[';
+ if (negate)
+ *s++ = '^';
+ if ((negate && nset_rbrack) || (!negate && set_rbrack))
+ *s++ = rbrack;
+
+ for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
+ while (from <= UCHAR_MAX && re->cset[from] == negate)
+ from += 1;
+ if (from > UCHAR_MAX)
+ break;
+ for (to = from;
+ to < UCHAR_MAX && (re->cset[to+1] == ! negate);
+ to++);
+
+ if (to == from && (from == rbrack || from == dash))
+ continue;
+ if (from == rbrack || from == dash)
+ from += 1;
+ if (to == rbrack || to == dash)
+ to -= 1;
+
+ if (to == from) {
+ *s++ = from;
+ } else if (to == from + 1) {
+ *s++ = from;
+ *s++ = to;
+ } else {
+ *s++ = from;
+ *s++ = '-';
+ *s++ = to;
+ }
+ }
+ if (negate) {
+ if (nset_dash)
+ *s++ = dash;
+ } else {
+ if (set_dash)
+ *s++ = dash;
+ }
+ *s = ']';
+
+ return result;
+}
+
+static char *re_iter_as_string(const struct re *re) {
+ const char *quant = NULL;
+ char *result, *exp;
+ int r;
+
+ exp = re_as_string(re->exp);
+ if (exp == NULL)
+ return NULL;
+
+ if (re->min == 0 && re->max == -1) {
+ quant = "*";
+ } else if (re->min == 1 && re->max == -1) {
+ quant = "+";
+ } else if (re->min == 0 && re->max == 1) {
+ quant = "?";
+ }
+
+ if (re->exp->type == CHAR || re->exp->type == CSET) {
+ if (quant == NULL) {
+ r = asprintf(&result, "%s{%d,%d}", exp, re->min, re->max);
+ } else {
+ r = asprintf(&result, "%s%s", exp, quant);
+ }
+ } else {
+ if (quant == NULL) {
+ r = asprintf(&result, "(%s){%d,%d}", exp, re->min, re->max);
+ } else {
+ r = asprintf(&result, "(%s)%s", exp, quant);
+ }
+ }
+ FREE(exp);
+
+ return (r < 0) ? NULL : result;
+}
+
+static char *re_as_string(const struct re *re) {
+ char *result = NULL;
+ switch(re->type) {
+ case UNION:
+ result = re_union_as_string(re);
+ break;
+ case CONCAT:
+ result = re_concat_as_string(re);
+ break;
+ case CSET:
+ result = re_cset_as_string(re);
+ break;
+ case CHAR:
+ if (re->c != '\\' && strchr(special_chars, re->c) == NULL) {
+ if (ALLOC_N(result, 2) == 0) {
+ result[0] = re->c;
+ }
+ } else {
+ if (ALLOC_N(result, 3) == 0) {
+ result[0] = '\\';
+ result[1] = re->c;
+ }
+ }
+ break;
+ case ITER:
+ result = re_iter_as_string(re);
+ break;
+ case EPSILON:
+ if (ALLOC_N(result, 3) == 0) {
+ strcpy(result, "()");
+ }
+ break;
+ default:
+ assert(0);
+ abort();
+ break;
+ }
+ return result;
+}
+
+static int convert_trans_to_re(struct state *s) {
+ struct re *re = NULL;
+ size_t nto = 1;
+ struct trans *trans = NULL;
+ int r;
+
+ if (s->tused == 0)
+ return 0;
+
+ qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
+ for (int i = 0; i < s->tused - 1; i++) {
+ if (s->trans[i].to != s->trans[i+1].to)
+ nto += 1;
+ }
+ r = ALLOC_N(trans, nto);
+ if (r < 0)
+ goto error;
+
+ struct state *to = s->trans[0].to;
+ int tind = 0;
+ for_each_trans(t, s) {
+ if (t->to != to) {
+ trans[tind].to = to;
+ trans[tind].re = re;
+ tind += 1;
+ re = NULL;
+ to = t->to;
+ }
+ if (re == NULL) {
+ re = make_re_char_set(0);
+ if (re == NULL)
+ goto error;
+ }
+ add_re_char(re, t->min, t->max);
+ }
+ assert(nto == tind + 1);
+ trans[tind].to = to;
+ trans[tind].re = re;
+
+ /* Simplify CSETs with a single char to a CHAR */
+ for (int t=0; t < nto; t++) {
+ int cnt = 0;
+ uchar chr;
+ for (int c = 0; c < UCHAR_NUM; c++) {
+ if (trans[t].re->cset[c]) {
+ cnt += 1;
+ chr = c;
+ }
+ }
+ if (cnt == 1) {
+ re_unref(trans[t].re);
+ trans[t].re = make_re_char(chr);
+ }
+ }
+ free_trans(s);
+ s->trans = trans;
+ s->tused = s->tsize = nto;
+ return 0;
+
+ error:
+ for (int i=0; i < nto; i++)
+ free_re(trans[i].re);
+ free(trans);
+ return -1;
+}
+
+static int add_new_re_trans(struct state *s1, struct state *s2, struct re *re) {
+ int r;
+ r = add_new_trans(s1, s2, 0, 0);
+ if (r < 0)
+ return -1;
+ last_trans(s1)->re = re;
+ return 0;
+}
+
+/* Add the regular expression R1 . LOOP* . R2 to the transition
+ from S1 to S2. */
+static int re_collapse_trans(struct state *s1, struct state *s2,
+ struct re *r1, struct re *loop, struct re *r2) {
+ struct re *re = NULL;
+
+ if (loop->type != EPSILON) {
+ loop = make_re_rep(ref(loop), 0, -1);
+ }
+
+ if (r1->type == EPSILON) {
+ if (loop->type == EPSILON) {
+ re = ref(r2);
+ } else {
+ re = make_re_binop(CONCAT, loop, ref(r2));
+ }
+ } else {
+ if (loop->type == EPSILON) {
+ if (r2->type == EPSILON) {
+ re = ref(r1);
+ } else {
+ re = make_re_binop(CONCAT, ref(r1), ref(r2));
+ }
+ } else {
+ re = make_re_binop(CONCAT, ref(r1), loop);
+ if (re != NULL && r2->type != EPSILON) {
+ re = make_re_binop(CONCAT, re, ref(r2));
+ }
+ }
+ }
+ if (re == NULL)
+ return -1;
+
+ struct trans *t = NULL;
+ for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
+ if (t > last_trans(s1)) {
+ add_new_re_trans(s1, s2, re);
+ } else {
+ if (t->re == NULL) {
+ t->re = re;
+ } else {
+ t->re = make_re_binop(UNION, re, t->re);
+ }
+ }
+ return 0;
+}
+
+/* Convert an FA to a regular expression.
+ * The strategy is the following:
+ * (1) For all states S1 and S2, convert the transitions between them
+ * into one transition whose regexp is a CSET
+ * (2) Add a new initial state INI and a new final state FIN to the automaton,
+ * a transition from INI to the old initial state of FA, and a transition
+ * from all accepting states of FA to FIN; the regexp on those transitions
+ * matches only the empty word
+ * (3) Eliminate states S (except for INI and FIN) one by one:
+ * Let LOOP the regexp for the transition S -> S if it exists, epsilon
+ * otherwise.
+ * For all S1, S2 different from S with S1 -> S -> S2
+ * Let R1 the regexp of S1 -> S
+ * R2 the regexp of S -> S2
+ * R3 the regexp S1 -> S2 (or epsilon if no such transition)
+ * set the regexp on the transition S1 -> S2 to
+ * R1 . (LOOP)* . R2 | R3
+ * (4) The regexp for the whole FA can now be found as the regexp of
+ * the transition INI -> FIN
+ * (5) Convert that STRUCT RE to a string with RE_AS_STRING
+ */
+int fa_as_regexp(struct fa *fa, char **regexp) {
+ int r;
+ struct state *fin = NULL, *ini = NULL;
+ struct re *eps = make_re(EPSILON);
+
+ *regexp = NULL;
+ fa = fa_clone(fa);
+
+ fin = add_state(fa,1);
+ if (fin == NULL)
+ goto error;
+
+ list_for_each(s, fa->initial) {
+ r = convert_trans_to_re(s);
+ if (r < 0)
+ goto error;
+ if (s->accept) {
+ add_new_re_trans(s, fin, ref(eps));
+ }
+ }
+
+ ini = add_state(fa, 0);
+ if (ini == NULL)
+ goto error;
+
+ add_new_re_trans(ini, fa->initial, ref(eps));
+ set_initial(fa, ini);
+
+ list_for_each(s, fa->initial->next) {
+ if (s == fin)
+ continue;
+ /* Eliminate S */
+ struct re *loop = eps;
+ for_each_trans(t, s) {
+ if (t->to == s)
+ loop = t->re;
+ }
+ list_for_each(s1, fa->initial) {
+ if (s == s1)
+ continue;
+ for (int t1 = 0; t1 < s1->tused; t1++) {
+ if (s1->trans[t1].to == s) {
+ for (int t = 0; t < s->tused; t++) {
+ if (s->trans[t].to == s)
+ continue;
+ re_collapse_trans(s1, s->trans[t].to,
+ s1->trans[t1].re,
+ loop,
+ s->trans[t].re);
+ }
+ }
+ }
+ }
+ }
+
+ re_unref(eps);
+
+ for_each_trans(t, fa->initial) {
+ if (t->to == fin) {
+ *regexp = re_as_string(t->re);
+ }
+ }
+
+ list_for_each(s, fa->initial) {
+ for_each_trans(t, s) {
+ unref(t->re, re);
+ }
+ }
+ fa_free(fa);
+
+ return 0;
+ error:
+ fa_free(fa);
+ re_unref(eps);
+ return -1;
+}
+
static void print_char(FILE *out, uchar c) {
/* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
Also, a space ' ' is shown as '\s' */
diff -r 0d183841fd64 -r 3d2531fc583a src/fa.h
--- a/src/fa.h Thu May 08 18:27:26 2008 -0700
+++ b/src/fa.h Thu May 08 18:32:30 2008 -0700
@@ -170,6 +170,18 @@ char *fa_example(fa_t fa);
*/
char *fa_ambig_example(fa_t fa1, fa_t fa2, char **pv, char **v);
+/* Convert the finite automaton FA into a regular expression and set REGEXP
+ * to point to that. When REGEXP is compiled into another automaton, it is
+ * guaranteed that that automaton and FA accept the same language.
+ *
+ * The code tries to be semi-clever about keeping the generated regular
+ * expression short; to guarantee reasonably short regexps, the automaton
+ * should be minimized before passing it to this routine.
+ *
+ * Return 0 on success, and a negative number on failure. The only reason
+ * to fail for FA_AS_REGEXP is running out of memory.
+ */
+int fa_as_regexp(fa_t fa, char **regexp);
#endif
diff -r 0d183841fd64 -r 3d2531fc583a tests/fatest.c
--- a/tests/fatest.c Thu May 08 18:27:26 2008 -0700
+++ b/tests/fatest.c Thu May 08 18:32:30 2008 -0700
@@ -67,6 +67,30 @@ static fa_t mark(fa_t fa) {
return fa;
}
+static void assertAsRegexp(CuTest *tc, fa_t fa) {
+ char *re;
+ fa_t fa1, fa2;
+ fa_t empty = mark(fa_make_basic(FA_EPSILON));
+ int r;
+
+ /* Jump through some hoops to make FA1 a copy of FA */
+ fa1 = mark(fa_concat(fa, empty));
+ /* Minimize FA1, otherwise the regexp returned is enormous for the */
+ /* monster (~ 2MB) and fa_compile becomes incredibly slow */
+ fa_minimize(fa1);
+
+ r = fa_as_regexp(fa1, &re);
+ CuAssertIntEquals(tc, 0, r);
+
+ r = fa_compile(re, &fa2);
+ CuAssertIntEquals(tc, REG_NOERROR, r);
+
+ CuAssertTrue(tc, fa_equals(fa, fa2));
+
+ fa_free(fa2);
+ free(re);
+}
+
static fa_t make_fa(CuTest *tc, const char *regexp, int exp_err) {
fa_t fa;
int r;
@@ -78,6 +102,7 @@ static fa_t make_fa(CuTest *tc, const ch
print_regerror(r, regexp);
CuAssertPtrNotNull(tc, fa);
mark(fa);
+ assertAsRegexp(tc, fa);
} else {
CuAssertPtrEquals(tc, NULL, fa);
}
@@ -345,6 +370,32 @@ static void testAmbig(CuTest *tc) {
assertNotAmbig(tc, "(a|b|c|d|abcd)", "(a|b|c|d|abcd)");
}
+static void assertFaAsRegexp(CuTest *tc, const char *regexp) {
+ char *re;
+ fa_t fa1 = make_good_fa(tc, regexp);
+ fa_t fa2;
+ int r;
+
+ r = fa_as_regexp(fa1, &re);
+ CuAssertIntEquals(tc, 0, r);
+
+ r = fa_compile(re, &fa2);
+ CuAssertIntEquals(tc, REG_NOERROR, r);
+
+ CuAssert(tc, regexp, fa_equals(fa1, fa2));
+
+ fa_free(fa2);
+ free(re);
+}
+
+static void testAsRegexp(CuTest *tc) {
+ assertFaAsRegexp(tc, "a*");
+ assertFaAsRegexp(tc, "abcd");
+ assertFaAsRegexp(tc, "ab|cd");
+ assertFaAsRegexp(tc, "[a-z]+");
+ assertFaAsRegexp(tc, "[]a]+");
+}
+
int main(int argc, char **argv) {
if (argc == 1) {
char *output = NULL;
@@ -361,6 +412,7 @@ int main(int argc, char **argv) {
SUITE_ADD_TEST(suite, testOverlap);
SUITE_ADD_TEST(suite, testExample);
SUITE_ADD_TEST(suite, testAmbig);
+ SUITE_ADD_TEST(suite, testAsRegexp);
CuSuiteRun(suite);
CuSuiteSummary(suite, &output);
@@ -377,7 +429,17 @@ int main(int argc, char **argv) {
print_regerror(r, argv[i]);
} else {
dot(fa);
- printf("Example for %s: %s\n", argv[i], fa_example(fa));
+ char *s = fa_example(fa);
+ printf("Example for %s: %s\n", argv[i], s);
+ free(s);
+ char *re;
+ r = fa_as_regexp(fa, &re);
+ if (r == 0) {
+ printf("/%s/ = /%s/\n", argv[i], re);
+ free(re);
+ } else {
+ printf("/%s/ = ***\n", argv[i]);
+ }
fa_free(fa);
}
}
More information about the augeas-devel
mailing list