[augeas-devel] [PATCH 2 of 2] Revamp the dict data structure

David Lutterkort dlutter at redhat.com
Sun Aug 10 04:51:52 UTC 2008


5 files changed, 232 insertions(+), 112 deletions(-)
src/Makefile.am |    2 
src/ast.c       |  225 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
src/get.c       |   75 ------------------
src/lens.h      |   18 ----
src/put.c       |   24 -----


# HG changeset patch
# User David Lutterkort <dlutter at redhat.com>
# Date 1218343815 25200
# Node ID 4c27b26fe4e256f0da5c15a56d638ec00385a428
# Parent  63fa252bd6f16b6b70f63abbf17f33131e8cdd34
Revamp the dict data structure

Dicts were simple linked lists of key/value pairs, and appending and
searching on those lists caused serious slowdown when the dicts became
large, for example when writing an /etc/hosts file with > 2000 lines.

Dicts are now organized as arrays that are kept sorted by the key, and
lookup is done through binary search. Though this is still quadratic in
theory, it behaves linearly in practice even for a very large /etc/hosts
file (~ 60k lines); when keys appear in order (e.g. when they are generated
by a sequence) or when all entries fall under one key, adding an entry to a
dict is O(log n).

diff -r 63fa252bd6f1 -r 4c27b26fe4e2 src/Makefile.am
--- a/src/Makefile.am	Sat Aug 09 21:15:04 2008 -0700
+++ b/src/Makefile.am	Sat Aug 09 21:50:15 2008 -0700
@@ -23,7 +23,7 @@
 	internal.h internal.c \
 	memory.h memory.c ref.h \
         syntax.c syntax.h parser.y builtin.c lens.c lens.h regexp.c \
-	transform.c get.c put.c list.h
+	transform.c ast.c get.c put.c list.h
 libaugeas_la_LDFLAGS = -Wl,--version-script=$(srcdir)/augeas_sym.version \
         -version-info $(LIBAUGEAS_VERSION_INFO)
 libaugeas_la_LIBADD = liblexer.la libfa.la $(GNULIB)
diff -r 63fa252bd6f1 -r 4c27b26fe4e2 src/ast.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ast.c	Sat Aug 09 21:50:15 2008 -0700
@@ -0,0 +1,225 @@
+/*
+ * ast.c: Support routines for put/get
+ *
+ * Copyright (C) 2008 Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ *
+ * Author: David Lutterkort <dlutter at redhat.com>
+ */
+
+#include <config.h>
+#include <stdint.h>
+
+#include "internal.h"
+#include "memory.h"
+#include "lens.h"
+
+/* A dictionary that maps key to a list of (skel, dict) */
+struct dict_entry {
+    struct dict_entry *next;
+    struct skel *skel;
+    struct dict *dict;
+};
+
+/* Associates a KEY with a list of skel/dict pairs.
+
+   Dicts are used in two phases: first they are constructed, through
+   repeated calls to dict_append. In the second phase, items are looked up
+   and removed from the dict.
+
+   During construction, MARK points to the end of the list ENTRY. Once
+   construction is done, MARK points to the head of that list, and ENTRY is
+   moved towards the tail everytime an item is looked up.
+*/
+struct dict_node {
+    char *key;
+    struct dict_entry *entry; /* This will change as entries are looked up */
+    struct dict_entry *mark;  /* Pointer to initial entry, will never change */
+};
+
+/* Nodes are kept sorted by their key, with NULL being smaller than any
+   string */
+struct dict {
+    struct dict_node **nodes;
+    uint32_t          size : 24;
+    uint32_t          used : 24;
+    uint32_t          marked : 1;
+};
+
+static const int dict_initial_size = 2;
+static const int dict_max_expansion = 128;
+static const uint32_t dict_max_size = (1<<24) - 1;
+
+struct dict *make_dict(char *key, struct skel *skel, struct dict *subdict) {
+    struct dict *dict = NULL;
+    if (ALLOC(dict) < 0)
+        goto error;
+    if (ALLOC_N(dict->nodes, dict_initial_size) < 0)
+        goto error;
+    if (ALLOC(dict->nodes[0]) < 0)
+        goto error;
+    if (ALLOC(dict->nodes[0]->entry) < 0)
+        goto error;
+
+    dict->size = dict_initial_size;
+    dict->used = 1;
+    dict->nodes[0]->key = key;
+    dict->nodes[0]->entry->skel = skel;
+    dict->nodes[0]->entry->dict = subdict;
+    dict->nodes[0]->mark = dict->nodes[0]->entry;
+
+    return dict;
+ error:
+    if (dict->nodes) {
+        if (dict->nodes[0])
+            FREE(dict->nodes[0]->entry);
+        FREE(dict->nodes[0]);
+    }
+    FREE(dict->nodes);
+    FREE(dict);
+    return NULL;
+}
+
+void free_dict(struct dict *dict) {
+    if (dict == NULL)
+        return;
+
+    for (int i=0; i < dict->used; i++) {
+        struct dict_node *node = dict->nodes[i];
+        if (! dict->marked)
+            node->mark = node->entry;
+        while (node->mark != NULL) {
+            struct dict_entry *del = node->mark;
+            node->mark = del->next;
+            free_skel(del->skel);
+            free_dict(del->dict);
+            free(del);
+        }
+        free(node->key);
+        FREE(node);
+    }
+    FREE(dict->nodes);
+    FREE(dict);
+}
+
+/* Return the position of KEY in DICT as an integer between 0 and
+   DICT->USED.  If KEY is not in DICT, return a negative number P such that
+   -(P + 1) is the position at which KEY must be inserted to keep the keys
+   of the nodes in DICT sorted.
+*/
+static int dict_pos(struct dict *dict, const char *key) {
+    if (key == NULL) {
+        return (dict->nodes[0]->key == NULL) ? 0 : -1;
+    }
+
+    int l = dict->nodes[0]->key == NULL ? 1 : 0;
+    int h = dict->used;
+    while (l < h) {
+        int m = (l + h)/2;
+        int cmp = strcmp(dict->nodes[m]->key, key);
+        if (cmp > 0)
+            h = m;
+        else if (cmp < 0)
+            l = m + 1;
+        else
+            return m;
+    }
+    return -(l + 1);
+}
+
+static int dict_expand(struct dict *dict) {
+    uint32_t size = dict->size;
+
+    if (size == dict_max_size)
+        return -1;
+    if (size > dict_max_expansion)
+        size += dict_max_expansion;
+    else
+        size *= 2;
+    if (size > dict_max_size)
+        size = dict_max_size;
+    dict->size = size;
+    return REALLOC_N(dict->nodes, dict->size);
+}
+
+int dict_append(struct dict **dict, struct dict *d2) {
+    if (d2 == NULL)
+        return 0;
+
+    if (*dict == NULL) {
+        *dict = d2;
+        return 0;
+    }
+
+    struct dict *d1 = *dict;
+    for (int i2 = 0; i2 < d2->used; i2++) {
+        struct dict_node *n2 = d2->nodes[i2];
+        int i1 = dict_pos(d1, n2->key);
+        if (i1 < 0) {
+            i1 = - i1 - 1;
+            if (d1->size == d1->used) {
+                if (dict_expand(d1) < 0)
+                    return -1;
+            }
+            memmove(d1->nodes + i1 + 1, d1->nodes + i1,
+                    sizeof(*d1->nodes) * (d1->used - i1));
+            d1->nodes[i1] = n2;
+            d1->used += 1;
+        } else {
+            struct dict_node *n1 = d1->nodes[i1];
+            list_tail_cons(n1->entry, n1->mark, n2->entry);
+            FREE(n2->key);
+            FREE(n2);
+        }
+    }
+    FREE(d2->nodes);
+    FREE(d2);
+    return 0;
+}
+
+void dict_lookup(const char *key, struct dict *dict,
+                 struct skel **skel, struct dict **subdict) {
+    *skel = NULL;
+    *subdict = NULL;
+    if (dict != NULL) {
+        if (! dict->marked) {
+            for (int i=0; i < dict->used; i++) {
+                dict->nodes[i]->mark = dict->nodes[i]->entry;
+            }
+            dict->marked = 1;
+        }
+        int p = dict_pos(dict, key);
+        if (p >= 0) {
+            struct dict_node *node = dict->nodes[p];
+            if (node->entry != NULL) {
+                *skel = node->entry->skel;
+                *subdict = node->entry->dict;
+                node->entry = node->entry->next;
+            }
+        }
+    }
+}
+
+
+
+/*
+ * Local variables:
+ *  indent-tabs-mode: nil
+ *  c-indent-level: 4
+ *  c-basic-offset: 4
+ *  tab-width: 4
+ * End:
+ */
diff -r 63fa252bd6f1 -r 4c27b26fe4e2 src/get.c
--- a/src/get.c	Sat Aug 09 21:15:04 2008 -0700
+++ b/src/get.c	Sat Aug 09 21:50:15 2008 -0700
@@ -114,34 +114,6 @@
     free(skel);
 }
 
-static struct dict *make_dict(char *key,
-                              struct skel *skel, struct dict *subdict) {
-    struct dict *dict;
-    CALLOC(dict, 1);
-    CALLOC(dict->entry, 1);
-    dict->key = key;
-    dict->entry->skel = skel;
-    dict->entry->dict = subdict;
-    dict->mark = dict->entry;
-    return dict;
-}
-
-void free_dict(struct dict *dict) {
-    while (dict != NULL) {
-        struct dict *next = dict->next;
-        while (dict->mark != NULL) {
-            struct dict_entry *del = dict->mark;
-            dict->mark = del->next;
-            free_skel(del->skel);
-            free_dict(del->dict);
-            free(del);
-        }
-        free(dict->key);
-        free(dict);
-        dict = next;
-    }
-}
-
 static void print_skel(struct skel *skel);
 static void print_skel_list(struct skel *skels, const char *beg,
                             const char *sep, const char *end) {
@@ -197,53 +169,6 @@
     }
 }
 #endif
-
-static void dict_append(struct dict **dict, struct dict *d2) {
-    struct dict *d1 = *dict;
-
-    if (d1 == NULL) {
-        *dict = d2;
-        return;
-    }
-
-    struct dict *e2 = d2;
-#ifdef DICT_DUMP
-    printf("DICT_APPEND\n");
-    print_dict(d1, 0);
-    printf("AND\n");
-    print_dict(d2, 0);
-#endif
-    while (e2 != NULL) {
-        struct dict *e1;
-        for (e1=d1; e1 != NULL; e1 = e1->next) {
-            if (e1->key == NULL) {
-                if (e2->key == NULL)
-                    break;
-            } else {
-                if (e2->key != NULL && STREQ(e1->key, e2->key))
-                    break;
-            }
-        }
-        if (e1 == NULL) {
-            struct dict *last = e2;
-            e2 = e2->next;
-            last->next = NULL;
-            list_append(d1, last);
-        } else {
-            struct dict *del = e2;
-            list_append(e1->entry, e2->entry);
-            e2 = e2->next;
-            free(del->key);
-            free(del);
-        }
-    }
-#ifdef DICT_DUMP
-    printf("YIELDS\n");
-    print_dict(d1, 0);
-    printf("END\n");
-#endif
-    *dict = d1;
-}
 
 static void get_expected_error(struct state *state, struct lens *l) {
     char *word, *p, *pat;
diff -r 63fa252bd6f1 -r 4c27b26fe4e2 src/lens.h
--- a/src/lens.h	Sat Aug 09 21:15:04 2008 -0700
+++ b/src/lens.h	Sat Aug 09 21:50:15 2008 -0700
@@ -95,20 +95,6 @@
     /* Also tag == L_SUBTREE, with no data in the union */
 };
 
-/* A dictionary that maps key to a list of (skel, dict) */
-struct dict_entry {
-    struct dict_entry *next;
-    struct skel *skel;
-    struct dict *dict;
-};
-
-struct dict {
-    struct dict *next;
-    char        *key;
-    struct dict_entry *entry; /* This will change as entries are looked up */
-    struct dict_entry *mark;  /* Pointer to initial entry, will never change */
-};
-
 struct lns_error {
     struct lens  *lens;
     int           pos;        /* Errors from get/parse */
@@ -116,6 +102,10 @@
     char         *message;
 };
 
+struct dict *make_dict(char *key, struct skel *skel, struct dict *subdict);
+void dict_lookup(const char *key, struct dict *dict,
+                 struct skel **skel, struct dict **subdict);
+int dict_append(struct dict **dict, struct dict *d2);
 void free_skel(struct skel *skel);
 void free_dict(struct dict *dict);
 void free_lns_error(struct lns_error *err);
diff -r 63fa252bd6f1 -r 4c27b26fe4e2 src/put.c
--- a/src/put.c	Sat Aug 09 21:15:04 2008 -0700
+++ b/src/put.c	Sat Aug 09 21:50:15 2008 -0700
@@ -268,22 +268,6 @@
     return (count == split->end - split->start);
 }
 
-static struct dict_entry *dict_lookup(const char *key, struct dict *dict) {
-    struct dict *d;
-    if (key == NULL) {
-        for (d=dict; d != NULL && d->key != NULL; d = d->next);
-    } else {
-        for (d=dict; d != NULL && (d->key == NULL || STRNEQ(key, d->key));
-             d = d->next);
-    }
-    if (d == NULL)
-        return NULL;
-    struct dict_entry *result = d->entry;
-    if (result != NULL)
-        d->entry = result->next;
-    return result;
-}
-
 /* Print TEXT to OUT, translating common escapes like \n */
 static void print_escaped_chars(FILE *out, const char *text) {
     for (const char *c = text; *c != '\0'; c++) {
@@ -406,7 +390,6 @@
     size_t oldpathlen = strlen(state->path);
 
     struct tree *tree = state->split->tree;
-    struct dict_entry *entry = dict_lookup(tree->label, state->dict);
     struct split *split = NULL;
 
     state->key = tree->label;
@@ -416,13 +399,10 @@
     split = make_split(tree->children);
     set_split(state, split);
 
-    if (entry == NULL) {
-        state->skel = NULL;
-        state->dict = NULL;
+    dict_lookup(tree->label, state->dict, &state->skel, &state->dict);
+    if (state->skel == NULL) {
         create_lens(lens->child, state);
     } else {
-        state->skel = entry->skel;
-        state->dict = entry->dict;
         put_lens(lens->child, state);
     }
     assert(state->error != NULL || state->split->next == NULL);




More information about the augeas-devel mailing list