[augeas-devel] augeas: master - Remove character ranges from regexps if we are not in the C locale

David Lutterkort lutter at fedoraproject.org
Tue Nov 10 17:24:35 UTC 2009


Gitweb:        http://git.fedorahosted.org/git/augeas.git?p=augeas.git;a=commitdiff;h=349f936949ef1d2e02b1ee9f1906be762e1a53f9
Commit:        349f936949ef1d2e02b1ee9f1906be762e1a53f9
Parent:        f763d964827bdea804df4fe68e2ecb5c01819edd
Author:        David Lutterkort <lutter at redhat.com>
AuthorDate:    Fri Oct 23 15:02:24 2009 +0200
Committer:     David Lutterkort <lutter at redhat.com>
CommitterDate: Tue Nov 10 09:22:09 2009 -0800

Remove character ranges from regexps if we are not in the C locale

Since re_compile_pattern uses the current locale when expanding character
ranges like [a-z], we need to be careful that such ranges are expanded in
the C locale.

  * configure.ac: check for uselocale
  * src/internal.[ch] (regexp_c_locale): new function
  * src/lens.c (digits_string): spell out the digits
  * src/lens.c (restrict_regexp): expand char ranges in restricted regexp
  * src/lexer.l (regexp_literal): run regexp literals through
    regexp_c_locale
  * src/regexp.c (regexp_escape): compress character ranges for printing
  * src/regexp.c (regexp_minus): expand character ranges

Fixes ticket #35 for systems that do not have uselocale (for systems with
uselocale, the fix is commit 07b6a880)
---
 src/internal.c |   36 ++++++++++++++++++++++++++++++++++++
 src/internal.h |   10 ++++++++++
 src/lens.c     |    8 +++++++-
 src/lexer.l    |   14 +++++++++++++-
 src/regexp.c   |   23 ++++++++++++++++++++++-
 5 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/src/internal.c b/src/internal.c
index 3ac5350..318c79a 100644
--- a/src/internal.c
+++ b/src/internal.c
@@ -25,9 +25,11 @@
 #include <ctype.h>
 #include <stdio.h>
 #include <stdarg.h>
+#include <locale.h>
 
 #include "internal.h"
 #include "memory.h"
+#include "fa.h"
 
 #ifndef MIN
 # define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -402,6 +404,40 @@ void calc_line_ofs(const char *text, size_t pos, size_t *line, size_t *ofs) {
     }
 }
 
+#if HAVE_USELOCALE
+int regexp_c_locale(ATTRIBUTE_UNUSED char **u, ATTRIBUTE_UNUSED size_t *len) {
+    /* On systems with uselocale, we are ok, since we make sure that we
+     * switch to the "C" locale any time we enter through the public API;
+     * neither argument is touched and "no expansion needed" is reported */
+    return 0;
+}
+#else
+int regexp_c_locale(char **u, size_t *len) {
+    /* Without uselocale, expand ranges here; S remembers the input string */
+    int r;
+    char *s = *u;
+    size_t s_len, u_len;
+    if (len == NULL) {
+        len = &u_len;           /* no length supplied: use a local scratch */
+        s_len = strlen(s);
+    } else {
+        s_len = *len;
+    }
+    r = fa_expand_char_ranges(s, s_len, u, len);
+    if (r != 0) {               /* any failure: hand back the input as-is */
+        *u = s;
+        *len = s_len;
+    }
+    if (r < 0)
+        return -1;
+    /* Syntax errors will be caught when the result is compiled */
+    if (r > 0)
+        return 0;
+    free(s);                    /* r == 0: the input string is released */
+    return 1;
+}
+#endif
+
 /*
  * Local variables:
  *  indent-tabs-mode: nil
diff --git a/src/internal.h b/src/internal.h
index 55aa551..fe5b716 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -271,6 +271,16 @@ int xasprintf(char **strp, const char *format, ...);
 /* Calculate line and column number of character POS in TEXT */
 void calc_line_ofs(const char *text, size_t pos, size_t *line, size_t *ofs);
 
+/* Take the first *LEN characters of the regexp *U (all of *U if LEN is
+ * NULL) and expand any character ranges in it. If expansion is needed,
+ * *U and *LEN are set to the expanded regexp and its length, and the old
+ * string is freed; otherwise, or if an error happens, *U is unchanged.
+ *
+ * Return 0 if expansion is not necessary, -1 if an error occurs, and 1 if
+ * expansion was needed.
+ */
+int regexp_c_locale(char **u, size_t *len);
+
 /* Struct: augeas
  * The data structure representing a connection to Augeas. */
 struct augeas {
diff --git a/src/lens.c b/src/lens.c
index 9a9b6e2..d4a8b4b 100644
--- a/src/lens.c
+++ b/src/lens.c
@@ -50,7 +50,7 @@ static const char *const tags[] = {
 };
 
 static const struct string digits_string = {
-    .ref = REF_MAX, .str = (char *) "[0-9]+"
+    .ref = REF_MAX, .str = (char *) "[0123456789]+"
 };
 static const struct string *const digits_pat = &digits_string;
 
@@ -312,6 +312,12 @@ static struct regexp *restrict_regexp(struct regexp *r) {
     if (ret != 0)
         return NULL;
 
+    ret = regexp_c_locale(&nre, &nre_len);
+    if (ret < 0) {
+        free(nre);
+        return NULL;
+    }
+
     r = make_regexp(r->info, nre);
     if (regexp_compile(r) != 0)
         abort();
diff --git a/src/lexer.l b/src/lexer.l
index a62d2c7..a690f07 100644
--- a/src/lexer.l
+++ b/src/lexer.l
@@ -52,6 +52,18 @@ static void loc_update(YYLTYPE *yylloc, const char *s, int len) {
     }
   }
 }
+
+static char *regexp_literal(const char *s, int len) {
+  /* Unescape the literal, then expand its character ranges if needed */
+  char *u = unescape(s, len);
+  size_t u_len;
+  if (u == NULL)                /* must precede strlen(u) */
+    return NULL;
+  u_len = strlen(u);
+  /* Best effort: on error U is left unchanged and is still returned */
+  regexp_c_locale(&u, &u_len);
+  return u;
+}
 %}
 
 DIGIT [0-9]
@@ -84,7 +96,7 @@ ARROW  ->
 
   \/([^/]|\\\/)*\/ {
                loc_update(yylloc, yytext, yyleng);
-               yylval->string = unescape(yytext+1, yyleng-2);
+               yylval->string = regexp_literal(yytext+1, yyleng-2);
                return REGEXP;
   }
 
diff --git a/src/regexp.c b/src/regexp.c
index 3748de7..795e6f0 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -34,7 +34,25 @@ static const struct string empty_pattern_string = {
 static const struct string *const empty_pattern = &empty_pattern_string;
 
 char *regexp_escape(const struct regexp *r) {
-    char *pat = escape(r->pattern->str, -1);
+    char *pat = NULL;
+
+#if !HAVE_USELOCALE
+    char *nre = NULL;
+    int ret;
+    size_t nre_len;
+
+    /* Use a range with from > to to force conversion of ranges into
+     * short form */
+    ret = fa_restrict_alphabet(r->pattern->str, strlen(r->pattern->str),
+                               &nre, &nre_len, 2, 1);
+    if (ret == 0) {
+        pat = escape(nre, nre_len);
+        free(nre);
+    }
+#endif
+
+    if (pat == NULL)
+        pat = escape(r->pattern->str, -1);
 
     if (pat == NULL)
         return NULL;
@@ -233,6 +251,9 @@ regexp_minus(struct info *info, struct regexp *r1, struct regexp *r2) {
         goto error;
     }
 
+    if (regexp_c_locale(&s, NULL) < 0)
+        goto error;
+
     result = make_regexp(info, s);
     s = NULL;
 




More information about the augeas-devel mailing list