[augeas-devel] augeas: master - Earley parser based on Jim/Mandelbaum's transducers
David Lutterkort
lutter at fedoraproject.org
Fri Jan 15 01:31:30 UTC 2010
Gitweb: http://git.fedorahosted.org/git/augeas.git?p=augeas.git;a=commitdiff;h=e1b404516e614ff01bf7e5a86f4481156b069740
Commit: e1b404516e614ff01bf7e5a86f4481156b069740
Parent: 841eb40bd79d878bb7d5a95dc98d6acfd89178cc
Author: David Lutterkort <lutter at redhat.com>
AuthorDate: Fri Oct 9 10:02:18 2009 -0700
Committer: David Lutterkort <lutter at redhat.com>
CommitterDate: Thu Jan 14 14:48:38 2010 -0800
Earley parser based on Jim/Mandelbaum's transducers
---
src/Makefile.am | 2 +-
src/jmt.c | 1796 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
src/jmt.h | 72 +++
3 files changed, 1869 insertions(+), 1 deletions(-)
diff --git a/src/Makefile.am b/src/Makefile.am
index 8b8a524..adfd9bf 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -23,7 +23,7 @@ libaugeas_la_SOURCES = augeas.h augeas.c pathx.c \
memory.h memory.c ref.h ref.c \
syntax.c syntax.h parser.y builtin.c lens.c lens.h regexp.c regexp.h \
transform.h transform.c ast.c get.c put.c list.h \
- info.c info.h errcode.c errcode.h
+ info.c info.h errcode.c errcode.h jmt.h jmt.c
if USE_VERSION_SCRIPT
AUGEAS_VERSION_SCRIPT = $(VERSION_SCRIPT_FLAGS)$(srcdir)/augeas_sym.version
diff --git a/src/jmt.c b/src/jmt.c
new file mode 100644
index 0000000..acc110c
--- /dev/null
+++ b/src/jmt.c
@@ -0,0 +1,1796 @@
+/*
+ * jmt.c: Earley parser for lenses based on Jim/Mandelbaum transducers
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: David Lutterkort <lutter at redhat.com>
+ */
+
+#include <config.h>
+
+#include "jmt.h"
+#include "internal.h"
+#include "memory.h"
+#include "errcode.h"
+
+/* This is an implementation of the Earley parser described in the paper
+ * "Efficient Earley Parsing with Regular Right-hand Sides" by Trever Jim
+ * and Yitzhak Mandelbaum.
+ *
+ * Since we only deal in lenses, they are both terminals and nonterminals:
+ * recursive lenses (those with lens->recursive set) are nonterminals, and
+ * non-recursive lenses are terminals, with the exception of non-recursive
+ * lenses that match the empty word. Lenses are also the semantic actions
+ * attached to a SCAN or COMPLETE during recosntruction of the parse
+ * tree. Our SCAN makes sure that we only ever scan nonempty words - that
+ * means that for nonrecursive lenses which match the empty word (e.g., del
+ * [ \t]* "") we need to make sure we get a COMPLETE when the lens matched
+ * the empty word.
+ *
+ * That is achieved by treating such a lens t as the construct (t|T) with
+ * T := eps.
+ */
+
+/*
+ * Data structures for the Jim/Mandelbaum parser
+ */
+
+#define IND_MAX UINT32_MAX
+
+struct array {
+ size_t elem_size;
+ ind_t used;
+ ind_t size;
+ void *data;
+};
+
+#define array_elem(arr, ind, typ) \
+ (((typ *) (arr).data) + ind)
+
+#define array_for_each(i, arr) \
+ for(ind_t i = 0; i < (arr).used; i++)
+
+#define array_each_elem(elt, arr, typ) \
+ for(typ *elt = (typ *) (arr).data + 0; \
+ elt - (typ *) (arr).data < (arr).used; \
+ elt++)
+
+static void array_init(struct array *arr, size_t elem_size) {
+ MEMZERO(arr, 1);
+ arr->elem_size = elem_size;
+}
+
+ATTRIBUTE_RETURN_CHECK
+static int array_add(struct array *arr, ind_t *ind) {
+ if (arr->used >= arr->size) {
+ int r;
+ ind_t expand = arr->size;
+ if (expand < 8)
+ expand = 8;
+ r = mem_realloc_n(&(arr->data), arr->elem_size, arr->size + expand);
+ if (r < 0)
+ return -1;
+ memset((char *) arr->data + arr->elem_size*arr->size, 0,
+ arr->elem_size * expand);
+ arr->size += expand;
+ }
+ *ind = arr->used;
+ arr->used += 1;
+ return 0;
+}
+
+/* Insert a new entry into the array at index IND. Shift all entries from
+ * IND on back by one. */
+ATTRIBUTE_RETURN_CHECK
+static int array_insert(struct array *arr, ind_t ind) {
+ ind_t last;
+
+ if (array_add(arr, &last) < 0)
+ return -1;
+
+ if (ind >= last)
+ return 0;
+
+ memmove((char *) arr->data + arr->elem_size*(ind+1),
+ (char *) arr->data + arr->elem_size*ind,
+ arr->elem_size * (arr->used - ind - 1));
+ memset((char *) arr->data + arr->elem_size*ind, 0, arr->elem_size);
+ return 0;
+}
+
+static void array_release(struct array *arr) {
+ if (arr != NULL)
+ free(arr->data);
+ arr->used = arr->size = 0;
+}
+
+static void array_remove(struct array *arr, ind_t ind) {
+ char *data = arr->data;
+ memmove(data + ind * arr->elem_size,
+ data + (ind + 1) * arr->elem_size,
+ (arr->used - ind - 1) * arr->elem_size);
+ arr->used -= 1;
+}
+
+ATTRIBUTE_RETURN_CHECK
+static int array_join(struct array *dst, struct array *src) {
+ int r;
+
+ if (dst->elem_size != src->elem_size)
+ return -1;
+
+ r = mem_realloc_n(&(dst->data), dst->elem_size,
+ dst->used + src->used);
+ if (r < 0)
+ return -1;
+
+ memcpy(((char *) dst->data) + dst->used * dst->elem_size,
+ src->data,
+ src->used * src->elem_size);
+ dst->used += src->used;
+ dst->size = dst->used;
+ return 0;
+}
+
+/* Special lens indices - these don't reference lenses, but
+ * some pseudo actions. We stash them at the top of the range
+ * of IND_T, with LENS_MAX giving us the maximal lens index for
+ * a real lens
+ */
+enum trans_op {
+ EPS = IND_MAX,
+ CALL = EPS - 1,
+ LENS_MAX = CALL - 1
+};
+
+struct trans {
+ struct state *to;
+ ind_t lens;
+};
+
+struct state {
+ struct state *next; /* Linked list for memory management */
+ struct array trans; /* Array of struct trans */
+ ind_t nret; /* Number of returned lenses */
+ ind_t *ret; /* The returned lenses */
+ ind_t num; /* Counter for num of new states */
+ unsigned int reachable : 1;
+ unsigned int live : 1;
+};
+
+/* Sets of states used in determinizing the NFA to a DFA */
+struct nfa_state {
+ struct state *state; /* The new state in the DFA */
+ struct array set; /* Set of (struct state *) in the NFA, sorted */
+};
+
+/* For recursive lenses (nonterminals), the mapping of nonterminal to
+ * state. We also store nonrecursive lenses here; in that case, the state
+ * will be NULL */
+struct jmt_lens {
+ struct lens *lens;
+ struct state *state;
+};
+
+/* A Jim/Mandelbaum transducer */
+struct jmt {
+ struct error *error;
+ struct array lenses; /* Array of struct jmt_lens */
+ struct state *start;
+ ind_t lens; /* The start symbol of the grammar */
+ ind_t state_count;
+};
+
+enum item_reason {
+ R_ROOT = 0,
+ R_COMPLETE = 1 << 1,
+ R_PREDICT = 1 << 2,
+ R_SCAN = 1 << 3
+};
+
+/* The reason an item was added for parse reconstruction; can be R_SCAN,
+ * R_COMPLETE, R_PREDICT or R_COMPLETE|R_PREDICT */
+struct link {
+ enum item_reason reason;
+ ind_t lens; /* R_COMPLETE, R_SCAN */
+ ind_t from_set; /* R_COMPLETE, R_PREDICT, R_SCAN */
+ ind_t from_item; /* R_COMPLETE, R_PREDICT, R_SCAN */
+ ind_t to_item; /* R_COMPLETE */
+};
+
+struct item {
+ /* The 'classical' Earley item (state, parent) */
+ struct state *state;
+ ind_t parent;
+ /* Backlinks to why item was added */
+ ind_t nlinks;
+ struct link *links;
+};
+
+struct item_set {
+ struct array items;
+};
+
+struct jmt_parse {
+ struct jmt *jmt;
+ struct error *error;
+ const char *text;
+ ind_t nsets;
+ struct item_set **sets;
+};
+
+#define for_each_item(it, set) \
+ array_each_elem(it, (set)->items, struct item)
+
+#define for_each_trans(t, s) \
+ array_each_elem(t, (s)->trans, struct trans)
+
+#define parse_state(parse, ind) \
+ array_elem(parse->states, ind, struct state)
+
+static struct item *set_item(struct jmt_parse *parse, ind_t set,
+ ind_t item) {
+ ensure(parse->sets[set] != NULL, parse);
+ ensure(item < parse->sets[set]->items.used, parse);
+ return array_elem(parse->sets[set]->items, item, struct item);
+ error:
+ return NULL;
+}
+
+static struct state *item_state(struct jmt_parse *parse, ind_t set,
+ ind_t item) {
+ return set_item(parse, set, item)->state;
+}
+
+static struct trans *state_trans(struct state *state, ind_t t) {
+ return array_elem(state->trans, t, struct trans);
+}
+
+static ind_t item_parent(struct jmt_parse *parse, ind_t set, ind_t item) {
+ return set_item(parse, set, item)->parent;
+}
+
+static bool is_return(const struct state *s) {
+ return s->nret > 0;
+}
+
+static struct lens *lens_of_parse(struct jmt_parse *parse, ind_t lens) {
+ return array_elem(parse->jmt->lenses, lens, struct jmt_lens)->lens;
+}
+
+/*
+ * The parser
+ */
+
+/*
+ * Manipulate the Earley graph. We denote edges in the graph as
+ * [j, (s,i)] -> [k, item_k] => [l, item_l]
+ * to indicate that we are adding item (s,i) to E_j and record that its
+ * child is the item with index item_k in E_k and its sibling is item
+ * item_l in E_l.
+ */
+
+/* Add item (s, k) to E_j. Note that the item was caused by action reason
+ * using lens starting at from_item in E_{from_set}
+ *
+ * [j, (s,k)] -> [from_set, from_item] => [j, to_item]
+ */
+static ind_t parse_add_item(struct jmt_parse *parse, ind_t j,
+ struct state *s, ind_t k,
+ enum item_reason reason, ind_t lens,
+ ind_t from_set,
+ ind_t from_item, ind_t to_item) {
+
+ ensure(from_item == EPS || from_item < parse->sets[from_set]->items.used,
+ parse);
+ ensure(to_item == EPS || to_item < parse->sets[j]->items.used,
+ parse);
+
+ int r;
+ struct item_set *set = parse->sets[j];
+ struct item *item = NULL;
+ ind_t result = IND_MAX;
+
+ if (set == NULL) {
+ r = ALLOC(parse->sets[j]);
+ ERR_NOMEM(r < 0, parse);
+ array_init(&parse->sets[j]->items, sizeof(struct item));
+ set = parse->sets[j];
+ }
+
+ for (ind_t i=0; i < set->items.used; i++) {
+ if (item_state(parse, j, i) == s
+ && item_parent(parse, j, i) == k) {
+ result = i;
+ item = set_item(parse, j, i);
+ break;
+ }
+ }
+
+ if (result == IND_MAX) {
+ r = array_add(&set->items, &result);
+ ERR_NOMEM(r < 0, parse);
+
+ item = set_item(parse, j, result);
+ item->state = s;
+ item->parent = k;
+ }
+
+ for (ind_t i = 0; i < item->nlinks; i++) {
+ struct link *lnk = item->links + i;
+ if (lnk->reason == reason && lnk->lens == lens
+ && lnk->from_set == from_set && lnk->from_item == from_item
+ && lnk->to_item == to_item)
+ return result;
+ }
+
+ r = REALLOC_N(item->links, item->nlinks + 1);
+ ERR_NOMEM(r < 0, parse);
+
+ struct link *lnk = item->links+ item->nlinks;
+ item->nlinks += 1;
+
+ lnk->reason = reason;
+ lnk->lens = lens;
+ lnk->from_set = from_set;
+ lnk->from_item = from_item;
+ lnk->to_item = to_item;
+ error:
+ return result;
+}
+
+/* Add item (s, i) to set E_j and record that it was added because
+ * a parse of nonterminal lens, starting at itemk in E_k, was completed
+ * by item item in E_j
+ *
+ * [j, (s,i)] -> [j, item] => [k, itemk]
+ */
+static void parse_add_complete(struct jmt_parse *parse, ind_t j,
+ struct state *s, ind_t i,
+ ind_t k, ind_t itemk,
+ ind_t lens,
+ ind_t item) {
+ parse_add_item(parse, j, s, i, R_COMPLETE, lens, k, itemk, item);
+}
+
+/* Same as parse_add_complete, but mark the item also as a predict
+ *
+ * [j, (s,i)] -> [j, item] => [k, itemk]
+ */
+static void parse_add_predict_complete(struct jmt_parse *parse, ind_t j,
+ struct state *s, ind_t i,
+ ind_t k, ind_t itemk,
+ ind_t lens,
+ ind_t item) {
+ parse_add_item(parse, j, s, i, R_COMPLETE|R_PREDICT,
+ lens, k, itemk, item);
+}
+
+/* Add item (s, j) to E_j and record that it was added because of a
+ * prediction from item from in E_j
+ */
+static ind_t parse_add_predict(struct jmt_parse *parse, ind_t j,
+ struct state *s, ind_t from) {
+
+ ensure(from < parse->sets[j]->items.used, parse);
+
+ return parse_add_item(parse, j, s, j, R_PREDICT, EPS, j, from, EPS);
+ error:
+ return IND_MAX;
+}
+
+/* Add item (s,i) to E_j and record that it was added because of scanning
+ * with lens starting from item item in E_k.
+ *
+ * [j, (s,i)] -> [k, item]
+ */
+static void parse_add_scan(struct jmt_parse *parse, ind_t j,
+ struct state *s, ind_t i,
+ ind_t lens, ind_t k, ind_t item) {
+ ensure(item < parse->sets[k]->items.used, parse);
+
+ parse_add_item(parse, j, s, i, R_SCAN, lens, k, item, EPS);
+ error:
+ return;
+}
+
+ATTRIBUTE_PURE
+static bool is_complete(const struct link *lnk) {
+ return lnk->reason & R_COMPLETE;
+}
+
+ATTRIBUTE_PURE
+static bool is_predict(const struct link *lnk) {
+ return lnk->reason & R_PREDICT;
+}
+
+ATTRIBUTE_PURE
+static bool is_scan(const struct link *lnk) {
+ return lnk->reason & R_SCAN;
+}
+
+ATTRIBUTE_PURE
+static bool returns(const struct state *s, ind_t l) {
+ for (ind_t i = 0; i < s->nret; i++)
+ if (s->ret[i] == l)
+ return true;
+ return false;
+}
+
+static void state_add_return(struct jmt *jmt, struct state *s, ind_t l) {
+ int r;
+
+ if (s == NULL || returns(s, l))
+ return;
+
+ r = REALLOC_N(s->ret, s->nret + 1);
+ ERR_NOMEM(r < 0, jmt);
+ s->ret[s->nret] = l;
+ s->nret += 1;
+ error:
+ return;
+}
+
+static void state_merge_returns(struct jmt *jmt, struct state *dst,
+ const struct state *src) {
+ for (ind_t l = 0; l < src->nret; l++)
+ state_add_return(jmt, dst, src->ret[l]);
+}
+
+static void nncomplete(struct jmt_parse *parse, ind_t j,
+ struct state *t, ind_t k, ind_t item) {
+
+ for (ind_t itemk = 0; itemk < parse->sets[k]->items.used; itemk++) {
+ struct state *u = item_state(parse, k, itemk);
+ for_each_trans(y, u) {
+ if (returns(t, y->lens)) {
+ ind_t parent = item_parent(parse, k, itemk);
+ parse_add_complete(parse, j,
+ y->to, parent,
+ k, itemk, y->lens, item);
+ }
+ }
+ }
+}
+
+/* NCALLER for (t, i) in E_j, which has index item in E_j, and t -> s a
+ * call in the transducer and s a return. The item (s,j) has index pred in
+ * E_j
+ */
+static void ncaller(struct jmt_parse *parse, ind_t j, ind_t item,
+ struct state *t, ind_t i, struct state *s, ind_t pred) {
+ for_each_trans(u, t) {
+ if (returns(s, u->lens)) {
+ /* [j, (u->to, i)] -> [j, pred] => [j, item] */
+ parse_add_predict_complete(parse, j,
+ u->to, i,
+ j, item, u->lens,
+ pred);
+ }
+ }
+}
+
+/* NCALLEE for (t, parent) in E_j, which has index item in E_j, and t -> s
+ * a call in the transducer and s a return. The item (s,j) has index pred
+ * in E_j
+ */
+static void ncallee(struct jmt_parse *parse, ind_t j, ATTRIBUTE_UNUSED ind_t item,
+ ATTRIBUTE_UNUSED struct state *t, ATTRIBUTE_UNUSED ind_t parent, struct state *s, ind_t pred) {
+ for_each_trans(u, s) {
+ if (returns(s, u->lens)) {
+ /* [j, (u->to, j)] -> [j, item] => [j, item] */
+ parse_add_predict_complete(parse, j,
+ u->to, j,
+ j, pred, u->lens,
+ pred);
+ }
+ }
+}
+
+static struct jmt_parse *parse_init(struct jmt *jmt,
+ const char *text, size_t text_len) {
+ int r;
+ struct jmt_parse *parse;
+
+ r = ALLOC(parse);
+ ERR_NOMEM(r < 0, jmt);
+
+ parse->jmt = jmt;
+ parse->error = jmt->error;
+ parse->text = text;
+ parse->nsets = text_len + 1;
+ r = ALLOC_N(parse->sets, parse->nsets);
+ ERR_NOMEM(r < 0, jmt);
+ return parse;
+ error:
+ if (parse != NULL)
+ free(parse->sets);
+ free(parse);
+ return NULL;
+}
+
+void jmt_free_parse(struct jmt_parse *parse) {
+ if (parse == NULL)
+ return;
+ for (int i=0; i < parse->nsets; i++) {
+ struct item_set *set = parse->sets[i];
+ if (set != NULL) {
+ array_each_elem(x, set->items, struct item)
+ free(x->links);
+ array_release(&set->items);
+ free(set);
+ }
+ }
+ free(parse->sets);
+ free(parse);
+}
+
+static struct state *lens_state(struct jmt *jmt, ind_t l);
+
+static void flens(FILE *fp, ind_t l) {
+ if (l == 0)
+ fprintf(fp, "%c", 'S');
+ else if (l < 'S' - 'A')
+ fprintf(fp, "%c", 'A' + l - 1);
+ else if (l <= 'Z' - 'A')
+ fprintf(fp, "%c", 'A' + l);
+ else
+ fprintf(fp, "%u", l);
+}
+
+static void parse_dot_link(FILE *fp, struct jmt_parse *parse,
+ ind_t k, struct item *x, struct link *lnk) {
+ char *lens_label = NULL;
+ if (is_complete(lnk) || is_scan(lnk)) {
+ struct state *sA = lens_state(parse->jmt, lnk->lens);
+ int r;
+
+ if (sA == NULL)
+ r = xasprintf(&lens_label, "<%d>", lnk->lens);
+ else
+ r = xasprintf(&lens_label, "%d", lnk->lens);
+ }
+ fprintf(fp, " n%d_%d_%d [ label = \"(%d, %d)\"];\n",
+ k, x->state->num, x->parent, x->state->num, x->parent);
+ if (is_complete(lnk)) {
+ struct item *y = set_item(parse, k, lnk->to_item);
+ const char *pred = is_predict(lnk) ? "p" : "";
+ fprintf(fp, " n%d_%d_%d -> n%s%d_%d_%d [ style = dashed ];\n",
+ k, x->state->num, x->parent,
+ pred, k, y->state->num, y->parent);
+ if (is_predict(lnk)) {
+ fprintf(fp, " n%s%d_%d_%d [ label = \"\" ];\n",
+ pred, k, y->state->num, y->parent);
+ fprintf(fp, " n%s%d_%d_%d -> n%d_%d_%d [ style = bold ];\n",
+ pred, k, y->state->num, y->parent,
+ k, y->state->num, y->parent);
+ }
+ y = set_item(parse, lnk->from_set, lnk->from_item);
+ fprintf(fp,
+ " n%d_%d_%d -> n%d_%d_%d [ style = dashed, label = \"",
+ k, x->state->num, x->parent,
+ lnk->from_set, y->state->num, y->parent);
+ flens(fp, lnk->lens);
+ fprintf(fp, "\" ];\n");
+ } else if (is_scan(lnk)) {
+ struct item *y =
+ set_item(parse, lnk->from_set, lnk->from_item);
+ fprintf(fp,
+ " n%d_%d_%d -> n%d_%d_%d [ label = \"",
+ k, x->state->num, x->parent,
+ lnk->from_set, y->state->num, y->parent);
+ for (ind_t i=lnk->from_set; i < k; i++)
+ fprintf(fp, "%c", parse->text[i]);
+ fprintf(fp, "\" ];\n");
+
+ } else if (is_predict(lnk)) {
+ struct item *y =
+ set_item(parse, lnk->from_set, lnk->from_item);
+ fprintf(fp,
+ " n%d_%d_%d -> n%d_%d_%d [ style = bold ];\n",
+ k, x->state->num, x->parent,
+ lnk->from_set, y->state->num, y->parent);
+
+ }
+ free(lens_label);
+}
+
+static void parse_dot(struct jmt_parse *parse, const char *fname) {
+ FILE *fp = debug_fopen("%s", fname);
+ if (fp == NULL)
+ return;
+
+ fprintf(fp, "digraph \"jmt_parse\" {\n");
+ fprintf(fp, " rankdir = RL;\n");
+ for (int k=0; k < parse->nsets; k++) {
+ struct item_set *set = parse->sets[k];
+
+ if (set == NULL)
+ continue;
+
+ fprintf(fp, " subgraph \"cluster_E_%d\" {\n", k);
+ fprintf(fp, " rankdir=RL;\n rank=same;\n");
+ fprintf(fp, " title%d [ label=\"E%d\", shape=plaintext ]\n", k, k);
+ for_each_item(x, set) {
+ for (int i=0; i < x->nlinks; i++) {
+ struct link *lnk = x->links + i;
+ parse_dot_link(fp, parse, k, x, lnk);
+ }
+ }
+ fprintf(fp, "}\n");
+ }
+ fprintf(fp, "}\n");
+ fclose(fp);
+}
+
+struct jmt_parse *
+jmt_parse(struct jmt *jmt, const char *text, size_t text_len)
+{
+ struct jmt_parse *parse = NULL;
+
+ parse = parse_init(jmt, text, text_len);
+ ERR_BAIL(jmt);
+
+ /* INIT */
+ parse_add_item(parse, 0, jmt->start, 0, R_ROOT, EPS, EPS, EPS, EPS);
+ /* NINIT */
+ if (is_return(jmt->start)) {
+ for_each_trans(x, jmt->start) {
+ if (returns(jmt->start, x->lens))
+ parse_add_predict_complete(parse, 0, x->to, 0,
+ 0, 0, x->lens, 0);
+ }
+ }
+
+ for (int j=0; j <= text_len; j++) {
+ struct item_set *set = parse->sets[j];
+ if (set == NULL)
+ continue;
+
+ for (int item=0; item < set->items.used; item++) {
+ struct state *t = item_state(parse, j, item);
+ ind_t i = item_parent(parse, j, item);
+
+ if (is_return(t) && i != j) {
+ /* NNCOMPLETE */
+ nncomplete(parse, j, t, i, item);
+ }
+
+ for_each_trans(x, t) {
+ if (x->lens == CALL) {
+ /* PREDICT */
+ ind_t pred = parse_add_predict(parse, j, x->to, item);
+ ERR_BAIL(parse);
+ if (is_return(x->to)) {
+ /* NCALLER */
+ ncaller(parse, j, item, t, i, x->to, pred);
+ /* NCALLEE */
+ ncallee(parse, j, item, t, i, x->to, pred);
+ }
+ } else {
+ int count;
+ struct lens *lens = lens_of_parse(parse, x->lens);
+ struct state *sA = lens_state(parse->jmt, x->lens);
+ if (! lens->recursive && sA == NULL) {
+ /* SCAN, terminal */
+ // FIXME: We really need to find every k so that
+ // text[j..k] matches lens->ctype, not just one
+ count = regexp_match(lens->ctype, text, text_len, j, NULL);
+ if (count > 0) {
+ parse_add_scan(parse, j+count,
+ x->to, i,
+ x->lens, j, item);
+ }
+ }
+ }
+ }
+ }
+ }
+ if (debugging("cf.jmt.parse"))
+ parse_dot(parse, "jmt_parse.dot");
+ return parse;
+ error:
+ jmt_free_parse(parse);
+ return NULL;
+}
+
+/*
+ * Reconstruction of the parse tree
+ */
+
+static void
+build_nullable(struct jmt_parse *parse, ind_t pos,
+ struct jmt_visitor *visitor, struct lens *lens, int lvl) {
+ if (! lens->recursive) {
+ if (visitor->terminal != NULL) {
+ (*visitor->terminal)(lens, pos, pos, visitor->data);
+ ERR_BAIL(parse);
+ }
+ } else {
+ if (visitor->enter != NULL) {
+ (*visitor->enter)(lens, pos, pos, visitor->data);
+ ERR_BAIL(parse);
+ }
+
+ switch(lens->tag) {
+ case L_REC:
+ build_nullable(parse, pos, visitor, lens->body, lvl+1);
+ break;
+ case L_CONCAT:
+ for (int i=0; i < lens->nchildren; i++)
+ build_nullable(parse, pos, visitor, lens->children[i], lvl+1);
+ break;
+ case L_UNION:
+ for (int i=0; i < lens->nchildren; i++)
+ if (lens->children[i]->ctype_nullable)
+ build_nullable(parse, pos, visitor,
+ lens->children[i], lvl+1);
+ break;
+ case L_SUBTREE:
+ build_nullable(parse, pos, visitor, lens->child, lvl+1);
+ break;
+ case L_STAR:
+ case L_MAYBE:
+ break;
+ default:
+ BUG_ON(true, parse, "Unexpected lens tag %d", lens->tag);
+ }
+
+ if (visitor->exit != NULL) {
+ (*visitor->exit)(lens, pos, pos, visitor->data);
+ ERR_BAIL(parse);
+ }
+ }
+ error:
+ return;
+}
+
+static void build_trace(const char *msg, ind_t start, ind_t end,
+ struct item *x, int lvl) {
+ for (int i=0; i < lvl; i++) putc(' ', stderr);
+ printf("%s %d..%d: (%d, %d) %d %s%s%s\n", msg,
+ start, end, x->state->num,
+ x->parent, x->links->lens,
+ is_complete(x->links) ? "C" : "",
+ is_predict(x->links) ? "P" : "",
+ is_scan(x->links) ? "S" : "");
+}
+
+static int ambig_check(struct jmt_visitor *visitor, struct lens *lens,
+ ind_t k, struct item *x) {
+ ind_t count = 0;
+ ind_t nonpred = 0;
+
+ for (ind_t i=0; i < x->nlinks; i++) {
+ if (!is_predict(x->links + i) || is_complete(x->links + i)) {
+ nonpred = i;
+ count += 1;
+ }
+ }
+
+ if (count > 1) {
+ (*visitor->error)(lens, visitor->data, k,
+ "Ambiguous parse: %d links in state (%d, %d) in E_%d",
+ x->nlinks, x->state->num, x->parent, k);
+ return -1;
+ }
+ if (count == 1 && x->links[0].reason == R_PREDICT) {
+ struct link tmp = x->links[0];
+ x->links[0] = x->links[nonpred];
+ x->links[nonpred] = tmp;
+ }
+ return 0;
+}
+
+static int
+build_tree(struct jmt_parse *parse, ind_t k, ind_t item,
+ struct jmt_visitor *visitor, int lvl) {
+ struct item *x = set_item(parse, k, item);
+ struct lens *lens = lens_of_parse(parse, x->links->lens);
+ ind_t start = x->links->from_set;
+ ind_t end = k;
+ struct item *old_x = x;
+
+ if (start == end) {
+ /* This completion corresponds to a nullable nonterminal
+ * that match epsilon. Reconstruct the full parse tree
+ * for matching epsilon */
+ build_nullable(parse, start, visitor, lens, lvl);
+ return end;
+ }
+
+ if (ambig_check(visitor, lens, k, x) < 0)
+ return end;
+
+ ensure(is_complete(x->links), parse);
+
+ if (debugging("cf.jmt.visit"))
+ build_trace("{", start, end, x, lvl);
+ if (visitor->enter != NULL) {
+ (*visitor->enter)(lens, start, end, visitor->data);
+ ERR_BAIL(parse);
+ }
+
+ /* x is a completion item. (k, x->to_item) is its first child in the
+ * parse tree. (x->from_set, x->from_item) is its next sibling in
+ * right-to-left order in the parse tree (i.e., the order of children
+ * in the parse tree is reversed */
+ if (! is_predict(x->links)) {
+ item = x->links->to_item;
+ x = set_item(parse, k, item);
+ if (ambig_check(visitor, lens, k, x) < 0)
+ return end;
+ ERR_BAIL(parse);
+ while (is_complete(x->links) || is_scan(x->links)) {
+ struct lens *sub = lens_of_parse(parse, x->links->lens);
+ if (sub->recursive) {
+ build_tree(parse, k, item, visitor, lvl+1);
+ ERR_BAIL(parse);
+ } else {
+ if (debugging("cf.jmt.visit"))
+ build_trace("T", x->links->from_set, k, x, lvl+1);
+ if (visitor->terminal != NULL) {
+ (*visitor->terminal)(lens_of_parse(parse, x->links->lens),
+ x->links->from_set, k, visitor->data);
+ ERR_BAIL(parse);
+ }
+ }
+ k = x->links->from_set;
+ item = x->links->from_item;
+ x = set_item(parse, k, item);
+ // FIXME: This is not clean - we are checking if
+ // we would leave because we'd leave the span [start .. end]
+ // That kinda works, but it's a kludge around the trouble we
+ // have with COMPLETE + PREDICT nodes
+ if (x->links->from_set < start)
+ break;
+ if (ambig_check(visitor, lens, k, x) < 0)
+ return end;
+ ERR_BAIL(parse);
+ }
+ }
+ if (debugging("cf.jmt.visit"))
+ build_trace("}", start, end, old_x, lvl);
+ if (visitor->exit != NULL) {
+ (*visitor->exit)(lens, start, end, visitor->data);
+ ERR_BAIL(parse);
+ }
+ error:
+ return end;
+}
+
+void jmt_visit(struct jmt_visitor *visitor, size_t *len) {
+ struct jmt_parse *parse = visitor->parse;
+ ind_t k = parse->nsets - 1; /* Current Earley set */
+ ind_t item;
+ struct item_set *set = parse->sets[k];
+
+ if (set == NULL)
+ goto noparse;
+
+ for (item = 0; item < set->items.used; item++) {
+ struct item *x = set_item(parse, k, item);
+ if (x->parent == 0 && returns(x->state, parse->jmt->lens)) {
+ for (ind_t i = 0; i < x->nlinks; i++) {
+ if (is_complete(x->links + i)) {
+ if (debugging("cf.jmt.visit"))
+ printf("visit: found (%d, %d) in E_%d\n",
+ x->state->num, x->parent, k);
+ goto found;
+ }
+ }
+ }
+ }
+ found:
+ if (item >= parse->sets[k]->items.used)
+ goto noparse;
+ *len = build_tree(parse, k, item, visitor, 0);
+ return;
+ noparse:
+ for (; k > 0; k--)
+ if (parse->sets[k] != NULL) break;
+ *len = k;
+}
+
+/*
+ * Build the automaton
+ */
+
+
+static struct state *make_state(struct jmt *jmt) {
+ struct state *s;
+ int r;
+
+ r = ALLOC(s);
+ ERR_NOMEM(r < 0, jmt);
+ s->num = jmt->state_count++;
+ array_init(&s->trans, sizeof(struct trans));
+ if (jmt->start != NULL)
+ list_cons(jmt->start->next, s);
+ else
+ jmt->start = s;
+ return s;
+ error:
+ return NULL;
+}
+
+static ind_t add_lens(struct jmt *jmt, struct lens *lens) {
+ int r;
+ ind_t l;
+ struct state *sA = NULL;
+ int nullable = 0;
+
+ r = array_add(&jmt->lenses, &l);
+ ERR_NOMEM(r < 0, jmt);
+ ERR_NOMEM(l == IND_MAX, jmt);
+
+ if (! lens->recursive)
+ nullable = regexp_matches_empty(lens->ctype);
+
+ array_elem(jmt->lenses, l, struct jmt_lens)->lens = lens;
+ /* A nonrecursive lens that matches epsilon is both a terminal
+ * and a nonterminal */
+ if (lens->recursive || nullable) {
+ sA = make_state(jmt);
+ ERR_NOMEM(sA == NULL, jmt);
+ array_elem(jmt->lenses, l, struct jmt_lens)->state = sA;
+ if (! lens->recursive) {
+ /* Add lens again, so that l refers to the nonterminal T
+ * for the lens, and l+1 refers to the terminal t for it */
+ ind_t m;
+ r = array_add(&jmt->lenses, &m);
+ ERR_NOMEM(r < 0, jmt);
+ ERR_NOMEM(m == IND_MAX, jmt);
+
+ array_elem(jmt->lenses, m, struct jmt_lens)->lens = lens;
+ }
+ }
+
+ if (debugging("cf.jmt")) {
+ if (sA == NULL) {
+ printf("add_lens: ");
+ print_regexp(stdout, lens->ctype);
+ printf(" %s\n", format_lens(lens));
+ } else {
+ printf("add_lens: ");
+ flens(stdout, l);
+ printf(" %u %s\n", sA->num, format_lens(lens));
+ if (nullable) {
+ printf("add_lens: // %s\n", format_lens(lens));
+ }
+ }
+ }
+
+ return l;
+ error:
+ return IND_MAX;
+}
+
+static struct trans *
+add_new_trans(struct jmt *jmt,
+ struct state *from, struct state *to, ind_t lens) {
+ struct trans *t;
+ ind_t i;
+ int r;
+
+ if (from == NULL || to == NULL)
+ return NULL;
+
+ r = array_add(&from->trans, &i);
+ ERR_NOMEM(r < 0, jmt);
+ t = array_elem(from->trans, i, struct trans);
+ t->to = to;
+ t->lens = lens;
+ return t;
+ error:
+ return NULL;
+}
+
+static struct trans *
+add_eps_trans(struct jmt *jmt, struct state *from, struct state *to) {
+ return add_new_trans(jmt, from, to, EPS);
+}
+
+static struct lens *lens_of_jmt(struct jmt *jmt, ind_t l) {
+ return array_elem(jmt->lenses, l, struct jmt_lens)->lens;
+}
+
+static ind_t lens_index(struct jmt *jmt, struct lens *lens) {
+ array_for_each(i, jmt->lenses)
+ if (lens_of_jmt(jmt, i) == lens)
+ return i;
+ return IND_MAX;
+}
+
+static struct state *lens_state(struct jmt *jmt, ind_t l) {
+ return array_elem(jmt->lenses, l, struct jmt_lens)->state;
+}
+
+static void print_lens_symbol(FILE *fp, struct jmt *jmt, struct lens *lens) {
+ ind_t l = lens_index(jmt, lens);
+ struct state *sA = lens_state(jmt, l);
+
+ if (sA == NULL)
+ print_regexp(fp, lens->ctype);
+ else
+ flens(fp, l);
+}
+
+static void print_grammar(struct jmt *jmt, struct lens *lens) {
+ ind_t l = lens_index(jmt, lens);
+ struct state *sA = lens_state(jmt, l);
+
+ if (sA == NULL || lens->tag == L_REC)
+ return;
+
+ printf(" ");
+ print_lens_symbol(stdout, jmt, lens);
+ printf(" := ");
+
+ if (! lens->recursive) {
+ /* Nullable regexps */
+ print_regexp(stdout, lens->ctype);
+ printf("\n");
+ return;
+ }
+
+ switch (lens->tag) {
+ case L_CONCAT:
+ print_lens_symbol(stdout, jmt, lens->children[0]);
+ for (int i=1; i < lens->nchildren; i++) {
+ printf(" . ");
+ print_lens_symbol(stdout, jmt, lens->children[i]);
+ }
+ printf("\n");
+ for (int i=0; i < lens->nchildren; i++)
+ print_grammar(jmt, lens->children[i]);
+ break;
+ case L_UNION:
+ print_lens_symbol(stdout, jmt, lens->children[0]);
+ for (int i=1; i < lens->nchildren; i++) {
+ printf(" | ");
+ print_lens_symbol(stdout, jmt, lens->children[i]);
+ }
+ printf("\n");
+ for (int i=0; i < lens->nchildren; i++)
+ print_grammar(jmt, lens->children[i]);
+ break;
+ case L_SUBTREE:
+ print_lens_symbol(stdout, jmt, lens->child);
+ printf("\n");
+ print_grammar(jmt, lens->child);
+ break;
+ case L_STAR:
+ print_lens_symbol(stdout, jmt, lens->child);
+ printf("*\n");
+ print_grammar(jmt, lens->child);
+ break;
+ case L_MAYBE:
+ print_lens_symbol(stdout, jmt, lens->child);
+ printf("?\n");
+ print_grammar(jmt, lens->child);
+ break;
+ default:
+ BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag);
+ break;
+ }
+ error:
+ return;
+}
+
+static void print_grammar_top(struct jmt *jmt, struct lens *rec) {
+ printf("Grammar:\n");
+ printf(" ");
+ print_lens_symbol(stdout, jmt, rec);
+ printf(" := ");
+ print_lens_symbol(stdout, jmt, rec->body);
+ printf("\n");
+ print_grammar(jmt, rec->body);
+}
+
+static void index_lenses(struct jmt *jmt, struct lens *lens) {
+ ind_t l;
+
+ l = lens_index(jmt, lens);
+ if (l == IND_MAX) {
+ l = add_lens(jmt, lens);
+ ERR_BAIL(jmt);
+ }
+
+ if (! lens->recursive)
+ return;
+
+ switch (lens->tag) {
+ case L_CONCAT:
+ case L_UNION:
+ for (int i=0; i < lens->nchildren; i++)
+ index_lenses(jmt, lens->children[i]);
+ break;
+ case L_SUBTREE:
+ case L_STAR:
+ case L_MAYBE:
+ index_lenses(jmt, lens->child);
+ break;
+ case L_REC:
+ /* Nothing to do */
+ break;
+ default:
+ BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag);
+ break;
+ }
+ error:
+ return;
+}
+
+static void thompson(struct jmt *jmt, struct lens *lens,
+ struct state **s, struct state **f) {
+ ind_t l = lens_index(jmt, lens);
+ struct state *sA = lens_state(jmt, l);
+ ensure(l < jmt->lenses.used, jmt);
+
+ *s = make_state(jmt);
+ *f = make_state(jmt);
+ ERR_BAIL(jmt);
+
+ if (lens->recursive) {
+ /* A nonterminal */
+ add_new_trans(jmt, *s, *f, l);
+ add_new_trans(jmt, *s, sA, CALL);
+ } else if (sA == NULL) {
+ /* A terminal that never matches epsilon */
+ add_new_trans(jmt, *s, *f, l);
+ } else {
+ /* A terminal that matches epsilon */
+ add_new_trans(jmt, *s, *f, l);
+ add_new_trans(jmt, *s, sA, CALL);
+ add_new_trans(jmt, *s, *f, l+1);
+ }
+ error:
+ return;
+}
+
+static void conv(struct jmt *jmt, struct lens *lens,
+ struct state **s, struct state **e,
+ struct state **f) {
+ ind_t l = lens_index(jmt, lens);
+ ensure(l < jmt->lenses.used, jmt);
+ struct state *sA = lens_state(jmt, l);
+
+ *s = NULL;
+ *e = NULL;
+ *f = NULL;
+
+ if (lens->recursive) {
+ /* A nonterminal */
+ *s = make_state(jmt);
+ *f = make_state(jmt);
+ ERR_BAIL(jmt);
+ add_new_trans(jmt, *s, *f, l);
+ ERR_BAIL(jmt);
+ ensure(sA != NULL, jmt);
+ add_new_trans(jmt, *s, sA, EPS);
+ ERR_BAIL(jmt);
+ } else if (sA == NULL) {
+ /* A terminal that never matches epsilon */
+ *s = make_state(jmt);
+ *f = make_state(jmt);
+ ERR_BAIL(jmt);
+ add_new_trans(jmt, *s, *f, l);
+ ERR_BAIL(jmt);
+ } else {
+ /* A terminal that matches epsilon */
+ *s = make_state(jmt);
+ *f = make_state(jmt);
+ ERR_BAIL(jmt);
+ add_new_trans(jmt, *s, *f, l);
+ add_new_trans(jmt, *s, *f, l+1);
+ add_new_trans(jmt, *s, sA, EPS);
+ ERR_BAIL(jmt);
+ }
+ error:
+ return;
+}
+
+static void conv_concat(struct jmt *jmt, struct lens *lens,
+ struct state **s, struct state **e,
+ struct state **f) {
+ struct state *s2, *f2, *e2;
+
+ conv(jmt, lens->children[0], &s2, &e2, &f2);
+ *s = make_state(jmt);
+ add_new_trans(jmt, *s, s2, EPS);
+
+ for (int i=1; i < lens->nchildren; i++) {
+ struct state *s3, *e3, *f3, *scall, *fcall;
+ conv(jmt, lens->children[i], &s3, &e3, &f3);
+ thompson(jmt, lens->children[i], &scall, &fcall);
+ ERR_BAIL(jmt);
+ add_eps_trans(jmt, f2, scall);
+ add_eps_trans(jmt, e2, s3);
+ *f = make_state(jmt);
+ add_eps_trans(jmt, f3, *f);
+ add_eps_trans(jmt, fcall, *f);
+ *e = make_state(jmt);
+ add_eps_trans(jmt, e3, *e);
+ f2 = *f;
+ e2 = *e;
+ }
+ error:
+ return;
+}
+
+static void conv_union(struct jmt *jmt, struct lens *lens,
+ struct state **s, struct state **e,
+ struct state **f) {
+
+ *s = make_state(jmt);
+ *e = make_state(jmt);
+ *f = make_state(jmt);
+ ERR_BAIL(jmt);
+
+ for (int i = 0; i < lens->nchildren; i++) {
+ struct state *s2, *e2, *f2;
+
+ conv(jmt, lens->children[i], &s2, &e2, &f2);
+ ERR_BAIL(jmt);
+
+ add_eps_trans(jmt, *s, s2);
+ add_eps_trans(jmt, e2, *e);
+ add_eps_trans(jmt, f2, *f);
+ }
+
+ error:
+ return;
+}
+
+static void conv_star(struct jmt *jmt, struct lens *lens,
+ struct state **s, struct state **e,
+ struct state **f) {
+
+ *s = make_state(jmt);
+ *e = make_state(jmt);
+ *f = make_state(jmt);
+ ERR_BAIL(jmt);
+
+ struct state *si, *ei, *fi, *scall, *fcall;
+ conv(jmt, lens->child, &si, &ei, &fi);
+ thompson(jmt, lens->child, &scall, &fcall);
+ ERR_BAIL(jmt);
+
+ add_eps_trans(jmt, *s, si);
+ add_eps_trans(jmt, ei, si);
+ add_eps_trans(jmt, *s, *e);
+ add_eps_trans(jmt, ei, *e);
+ add_eps_trans(jmt, fi, scall);
+ add_eps_trans(jmt, fcall, scall);
+ add_eps_trans(jmt, fi, *f);
+ add_eps_trans(jmt, fcall, *f);
+ ERR_BAIL(jmt);
+
+ error:
+ return;
+}
+
+static void conv_rhs(struct jmt *jmt, ind_t l) {
+ struct lens *lens = lens_of_jmt(jmt, l);
+ struct state *s = NULL, *e = NULL, *f = NULL;
+ struct state *sA = lens_state(jmt, l);
+
+ if (! lens->recursive) {
+ /* Nothing to do for terminals that do not match epsilon */
+ if (sA != NULL)
+ state_add_return(jmt, sA, l);
+ return;
+ }
+
+ /* All other nonterminals/recursive lenses */
+
+ /* Maintain P1 */
+ if (lens->ctype_nullable)
+ state_add_return(jmt, sA, l);
+
+ switch (lens->tag) {
+ case L_REC:
+ conv(jmt, lens->body, &s, &e, &f);
+ break;
+ case L_CONCAT:
+ conv_concat(jmt, lens, &s, &e, &f);
+ break;
+ case L_UNION:
+ conv_union(jmt, lens, &s, &e, &f);
+ break;
+ case L_SUBTREE:
+ conv(jmt, lens->child, &s, &e, &f);
+ break;
+ case L_STAR:
+ conv_star(jmt, lens, &s, &e, &f);
+ break;
+ case L_MAYBE:
+ conv(jmt, lens->child, &s, &e, &f);
+ add_new_trans(jmt, s, e, EPS);
+ break;
+ default:
+ BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag);
+ }
+
+ ensure(sA != NULL, jmt);
+
+ add_eps_trans(jmt, sA, s);
+ state_add_return(jmt, e, l);
+ state_add_return(jmt, f, l);
+
+ error:
+ return;
+}
+
+ATTRIBUTE_RETURN_CHECK
+static int push_state(struct array *worklist, struct state *s) {
+ int r;
+ ind_t ind;
+
+ r = array_add(worklist, &ind);
+ if (r < 0)
+ return -1;
+ *array_elem(*worklist, ind, struct state *) = s;
+ return 0;
+}
+
+static struct state *pop_state(struct array *worklist) {
+ if (worklist->used > 0) {
+ worklist->used -= 1;
+ return *array_elem(*worklist, worklist->used, struct state *);
+ } else {
+ return NULL;
+ }
+}
+
+static void free_state(struct state *s) {
+ if (s == NULL)
+ return;
+ free(s->ret);
+ array_release(&s->trans);
+ free(s);
+}
+
+static void collect(struct jmt *jmt) {
+ struct array worklist;
+ size_t count, removed;
+ int r;
+
+ count = 0;
+ list_for_each(s, jmt->start) {
+ s->live = 0;
+ s->reachable = 0;
+ count += 1;
+ }
+
+ array_init(&worklist, sizeof(struct state *));
+ jmt->start->reachable = 1;
+ for (struct state *s = jmt->start;
+ s != NULL;
+ s = pop_state(&worklist)) {
+ for_each_trans(t, s) {
+ if (! t->to->reachable) {
+ t->to->reachable = 1;
+ r = push_state(&worklist, t->to);
+ ERR_NOMEM(r < 0, jmt);
+ }
+ }
+ }
+
+ list_for_each(s, jmt->start)
+ if (s->reachable && is_return(s))
+ s->live = 1;
+
+ bool changed;
+ do {
+ changed = false;
+ list_for_each(s, jmt->start) {
+ if (! s->live && s->reachable) {
+ for_each_trans(t, s) {
+ if (t->lens != CALL && t->to->live) {
+ s->live = 1;
+ changed = true;
+ break;
+ }
+ }
+ }
+ }
+ } while (changed);
+
+ list_for_each(s, jmt->start) {
+ if (s->live && s->reachable) {
+ for (ind_t i = 0; i < s->trans.used; ) {
+ struct trans *t = state_trans(s, i);
+ if (! (t->to->live && t->to->reachable))
+ array_remove(&s->trans, i);
+ else
+ i += 1;
+ }
+ }
+ }
+
+ removed = 0;
+ for (struct state *s = jmt->start;
+ s->next != NULL; ) {
+ struct state *p = s->next;
+ if (p->live && p->reachable) {
+ s = p;
+ } else {
+ s->next = p->next;
+ free_state(p);
+ removed += 1;
+ }
+ }
+
+ error:
+ array_release(&worklist);
+ return;
+}
+
+static void dedup(struct state *s) {
+ array_for_each(i, s->trans) {
+ struct trans *t = state_trans(s, i);
+ for (ind_t j = i+1; j < s->trans.used;) {
+ struct trans *u = state_trans(s, j);
+ if (t->to == u->to && t->lens == u->lens)
+ array_remove(&s->trans, j);
+ else
+ j += 1;
+ }
+ }
+}
+
+static void unepsilon(struct jmt *jmt) {
+ int r;
+
+ if (debugging("cf.jmt.build"))
+ jmt_dot(jmt, "jmt_10_raw.dot");
+ collect(jmt);
+
+ /* Get rid of epsilon transitions */
+ bool changed;
+ do {
+ changed = false;
+ list_for_each(s, jmt->start) {
+ array_for_each(i , s->trans) {
+ struct trans *t = state_trans(s, i);
+ if (t->lens == EPS) {
+ struct state *to = t->to;
+ array_remove(&s->trans, i);
+ r = array_join(&s->trans, &to->trans);
+ ERR_NOMEM(r < 0, jmt);
+ state_merge_returns(jmt, s, to);
+ dedup(s);
+ changed = true;
+ }
+ }
+ }
+ } while (changed);
+
+ collect(jmt);
+ if (debugging("cf.jmt.build"))
+ jmt_dot(jmt, "jmt_20_uneps.dot");
+ error:
+ return;
+}
+
+static bool is_deterministic(struct jmt *jmt) {
+ list_for_each(s, jmt->start) {
+ array_for_each(i, s->trans) {
+ struct trans *t = state_trans(s, i);
+ for (ind_t j = i+1; j < s->trans.used; j++) {
+ struct trans *u = state_trans(s, j);
+ if (t->lens == u->lens)
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static void free_nfa_state(struct nfa_state *s) {
+ if (s == NULL)
+ return;
+ array_release(&s->set);
+ free(s);
+}
+
+static struct nfa_state *make_nfa_state(struct jmt *jmt) {
+ struct nfa_state *result = NULL;
+ int r;
+
+ r = ALLOC(result);
+ ERR_NOMEM(r < 0, jmt);
+
+ array_init(&result->set, sizeof(struct state *));
+
+ return result;
+ error:
+ FREE(result);
+ return NULL;
+}
+
+static ind_t nfa_state_add(struct jmt *jmt, struct nfa_state *nfa,
+ struct state *s) {
+ ind_t i;
+ int r;
+
+ array_for_each(j, nfa->set) {
+ struct state *q = *array_elem(nfa->set, j, struct state *);
+ if (q == s)
+ return j;
+ }
+
+ /* Keep the list of states sorted */
+ i = nfa->set.used;
+ for (int j=0; j + 1 < nfa->set.used; j++) {
+ if (s < *array_elem(nfa->set, j, struct state *)) {
+ i = j;
+ break;
+ }
+ }
+ r = array_insert(&nfa->set, i);
+ ERR_NOMEM(r < 0, jmt);
+
+ *array_elem(nfa->set, i, struct state *) = s;
+ return i;
+ error:
+ return IND_MAX;
+}
+
+static bool nfa_same_set(struct nfa_state *s1, struct nfa_state *s2) {
+ if (s1->set.used != s2->set.used)
+ return false;
+
+ array_for_each(i, s1->set) {
+ struct state *q1 = *array_elem(s1->set, i, struct state *);
+ struct state *q2 = *array_elem(s2->set, i, struct state *);
+ if (q1 != q2)
+ return false;
+ }
+
+ return true;
+}
+
+static struct nfa_state *nfa_uniq(struct jmt *jmt, struct array *newstates,
+ struct nfa_state *s) {
+ ind_t ind;
+ int r;
+
+ array_each_elem(q, *newstates, struct nfa_state *) {
+ if (nfa_same_set(s, *q)) {
+ if (s == *q)
+ return s;
+ free_nfa_state(s);
+ return *q;
+ }
+ }
+
+ r = array_add(newstates, &ind);
+ ERR_NOMEM(r < 0, jmt);
+ *array_elem(*newstates, ind, struct nfa_state *) = s;
+
+ if (s->state == NULL) {
+ s->state = make_state(jmt);
+ ERR_BAIL(jmt);
+ }
+
+ /* This makes looking at pictures easier */
+ if (s->set.used == 1)
+ s->state->num = (*array_elem(s->set, 0, struct state *))->num;
+
+ return s;
+
+ error:
+ return NULL;
+}
+
+static void det_target(struct jmt *jmt, struct array *newstates,
+ struct nfa_state *nfas, ind_t l) {
+ struct nfa_state *to = NULL;
+
+ array_each_elem(s, nfas->set, struct state *) {
+ for_each_trans(t, *s) {
+ if (t->lens == l) {
+ if (to == NULL) {
+ to = make_nfa_state(jmt);
+ ERR_RET(jmt);
+ }
+ nfa_state_add(jmt, to, t->to);
+ ERR_RET(jmt);
+ }
+ }
+ }
+
+ if (to != NULL) {
+ to = nfa_uniq(jmt, newstates, to);
+ ERR_RET(jmt);
+ add_new_trans(jmt, nfas->state, to->state, l);
+ }
+}
+
+static void determinize(struct jmt *jmt) {
+ struct nfa_state *ini = NULL;
+ struct array *newstates = NULL;
+ int r;
+ ind_t ind, nlenses;
+
+ if (is_deterministic(jmt))
+ return;
+
+ r = ALLOC(newstates);
+ ERR_NOMEM(r < 0, jmt);
+ array_init(newstates, sizeof(struct nfa_state *));
+
+ nlenses = jmt->lenses.used;
+
+ /* The initial state consists of just the start state */
+ ini = make_nfa_state(jmt);
+ ERR_BAIL(jmt);
+ nfa_state_add(jmt, ini, jmt->start);
+ ERR_BAIL(jmt);
+
+ /* Make a new initial state */
+ ini->state = make_state(jmt);
+ ini->state->num = jmt->start->num; /* Makes lokking at pictures easier */
+ ERR_BAIL(jmt);
+ jmt->start->next = ini->state->next;
+ ini->state->next = jmt->start;
+ jmt->start = ini->state;
+
+ r = array_add(newstates, &ind);
+ ERR_NOMEM(r < 0, jmt);
+
+ *array_elem(*newstates, ind, struct nfa_state *) = ini;
+ ini = NULL;
+
+ for (ind_t i = 0; i < newstates->used; i++) {
+ struct nfa_state *nfas = *array_elem(*newstates, i, struct nfa_state *);
+
+ for (int j = 0; j < nfas->set.used; j++) {
+ struct state *s = *array_elem(nfas->set, j, struct state *);
+ state_merge_returns(jmt, nfas->state, s);
+ }
+
+ for (ind_t l = 0; l < nlenses; l++) {
+ det_target(jmt, newstates, nfas, l);
+ ERR_BAIL(jmt);
+ }
+ det_target(jmt, newstates, nfas, CALL);
+ ERR_BAIL(jmt);
+ }
+
+ collect(jmt);
+
+ done:
+ if (newstates) {
+ array_each_elem(s, *newstates, struct nfa_state *) {
+ free_nfa_state(*s);
+ }
+ array_release(newstates);
+ FREE(newstates);
+ }
+ free_nfa_state(ini);
+ return;
+ error:
+ goto done;
+}
+
+struct jmt *jmt_build(struct lens *lens) {
+ struct jmt *jmt = NULL;
+ int r;
+ ind_t l;
+
+ ensure(lens->tag == L_REC && lens->rec_internal, lens->info);
+
+ r = ALLOC(jmt);
+ ERR_NOMEM(r < 0, lens->info);
+
+ jmt->error = lens->info->error;
+ array_init(&jmt->lenses, sizeof(struct jmt_lens));
+
+ l = add_lens(jmt, lens);
+ ERR_BAIL(jmt);
+ ensure(l == 0, lens->info);
+
+ index_lenses(jmt, lens->body);
+
+ if (debugging("cf.jmt"))
+ print_grammar_top(jmt, lens);
+
+ for (ind_t i=0; i < jmt->lenses.used; i++) {
+ conv_rhs(jmt, i);
+ ERR_BAIL(jmt);
+ }
+
+ unepsilon(jmt);
+ ERR_BAIL(jmt);
+
+ determinize(jmt);
+ ERR_BAIL(jmt);
+
+ if (debugging("cf.jmt.build"))
+ jmt_dot(jmt, "jmt_30_dfa.dot");
+
+ return jmt;
+ error:
+ jmt_free(jmt);
+ return NULL;
+}
+
+void jmt_free(struct jmt *jmt) {
+ if (jmt == NULL)
+ return;
+ array_release(&jmt->lenses);
+ struct state *s = jmt->start;
+ while (s != NULL) {
+ struct state *del = s;
+ s = del->next;
+ free_state(del);
+ }
+ free(jmt);
+}
+
+void jmt_dot(struct jmt *jmt, const char *fname) {
+ FILE *fp = debug_fopen("%s", fname);
+ if (fp == NULL)
+ return;
+
+ fprintf(fp, "digraph \"jmt\" {\n");
+ fprintf(fp, " rankdir = LR;\n");
+ list_for_each(s, jmt->start) {
+ if (is_return(s)) {
+ fprintf(fp, " %u [ shape = doublecircle, label = \"%u (",
+ s->num, s->num);
+ flens(fp, s->ret[0]);
+ for (ind_t i = 1; i < s->nret; i++) {
+ fprintf(fp, ", ");
+ flens(fp, s->ret[i]);
+ }
+ fprintf(fp, ")\" ];\n");
+ }
+ for_each_trans(t, s) {
+ fprintf(fp, " %u -> %u ", s->num, t->to->num);
+ if (t->lens == EPS)
+ fprintf(fp, ";\n");
+ else if (t->lens == CALL)
+ fprintf(fp, "[ label = \"call\" ];\n");
+ else if (lens_state(jmt, t->lens) == NULL) {
+ struct lens *lens = lens_of_jmt(jmt, t->lens);
+ fprintf(fp, "[ label = \"");
+ print_regexp(fp, lens->ctype);
+ fprintf(fp, "\" ];\n");
+ } else {
+ fprintf(fp, "[ label = \"");
+ flens(fp, t->lens);
+ fprintf(fp, "\" ];\n");
+ }
+ }
+ }
+ fprintf(fp, "}\n");
+ fclose(fp);
+}
+
+/*
+ * Local variables:
+ * indent-tabs-mode: nil
+ * c-indent-level: 4
+ * c-basic-offset: 4
+ * tab-width: 4
+ * End:
+ */
diff --git a/src/jmt.h b/src/jmt.h
new file mode 100644
index 0000000..fc344a7
--- /dev/null
+++ b/src/jmt.h
@@ -0,0 +1,72 @@
+/*
+ * jmt.h: Earley parser for lenses based on Jim/Mandelbaum transducers
+ *
+ * Copyright (C) 2009 Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: David Lutterkort <lutter at redhat.com>
+ */
+
+#ifndef EARLEY_H_
+#define EARLEY_H_
+
+#include <stdlib.h>
+#include <stdint.h>
+#include "lens.h"
+
+struct jmt;
+struct jmt_parse;
+
+typedef uint32_t ind_t;
+
+struct lens;
+
+typedef void (*jmt_traverser)(struct lens *l, size_t start, size_t end,
+ void *data);
+
+typedef void (*jmt_error)(struct lens *lens, void *data, size_t pos,
+ const char *format, ...);
+
+struct jmt_visitor {
+ struct jmt_parse *parse;
+ jmt_traverser terminal;
+ jmt_traverser enter;
+ jmt_traverser exit;
+ jmt_error error;
+ void *data;
+};
+
+struct jmt *jmt_build(struct lens *l);
+
+struct jmt_parse *jmt_parse(struct jmt *jmt, const char *text, size_t text_len);
+
+void jmt_free_parse(struct jmt_parse *);
+
+void jmt_visit(struct jmt_visitor *visitor, size_t *len);
+
+void jmt_free(struct jmt *jmt);
+
+void jmt_dot(struct jmt *jmt, const char *fname);
+#endif
+
+/*
+ * Local variables:
+ * indent-tabs-mode: nil
+ * c-indent-level: 4
+ * c-basic-offset: 4
+ * tab-width: 4
+ * End:
+ */
More information about the augeas-devel
mailing list