[Libguestfs] [PATCH v2 1/3] common: Add a lightweight OCaml binding for PCRE.

Richard W.M. Jones rjones at redhat.com
Tue Aug 1 15:54:44 UTC 2017


This uses the gnulib TLS macros.
---
 .gitignore                  |   2 +
 Makefile.am                 |   1 +
 bootstrap                   |   1 +
 common/mlpcre/Makefile.am   | 142 +++++++++++++++++++++++++++++
 common/mlpcre/PCRE.ml       |  32 +++++++
 common/mlpcre/PCRE.mli      |  76 ++++++++++++++++
 common/mlpcre/dummy.c       |   2 +
 common/mlpcre/pcre-c.c      | 216 ++++++++++++++++++++++++++++++++++++++++++++
 common/mlpcre/pcre_tests.ml |  86 ++++++++++++++++++
 configure.ac                |   1 +
 m4/.gitignore               |   1 +
 11 files changed, 560 insertions(+)

diff --git a/.gitignore b/.gitignore
index 4699933d3..0e7a649f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -123,6 +123,8 @@ Makefile.in
 /common/errnostring/errnostring-gperf.gperf
 /common/errnostring/errnostring.h
 /common/miniexpect/miniexpect.3
+/common/mlpcre/.depend
+/common/mlpcre/pcre_tests
 /common/mlprogress/.depend
 /common/mlstdutils/.depend
 /common/mlstdutils/bytes.ml
diff --git a/Makefile.am b/Makefile.am
index 84b00393d..fb4c99db5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -156,6 +156,7 @@ SUBDIRS += csharp
 # OCaml tools.  Note 'common/ml*', 'mllib' and 'customize' contain
 # shared code used by other OCaml tools, so these must come first.
 if HAVE_OCAML
+SUBDIRS += common/mlpcre
 SUBDIRS += common/mlprogress
 SUBDIRS += common/mlvisit
 SUBDIRS += common/mlxml
diff --git a/bootstrap b/bootstrap
index 77a95a25b..4e3d4bc51 100755
--- a/bootstrap
+++ b/bootstrap
@@ -95,6 +95,7 @@ symlinkat
 sys_select
 sys_types
 sys_wait
+tls
 vasprintf
 vc-list-files
 warnings
diff --git a/common/mlpcre/Makefile.am b/common/mlpcre/Makefile.am
new file mode 100644
index 000000000..aa638cd94
--- /dev/null
+++ b/common/mlpcre/Makefile.am
@@ -0,0 +1,142 @@
+# Bindings for Perl-compatible Regular Expressions.
+# Copyright (C) 2017 Red Hat Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+include $(top_srcdir)/subdir-rules.mk
+
+EXTRA_DIST = \
+	$(SOURCES_MLI) \
+	$(SOURCES_ML) \
+	$(SOURCES_C) \
+	pcre_tests.ml
+
+SOURCES_MLI = \
+	PCRE.mli
+
+SOURCES_ML = \
+	PCRE.ml
+
+SOURCES_C = \
+	pcre-c.c
+
+if HAVE_OCAML
+
+# We pretend that we're building a C library.  automake handles the
+# compilation of the C sources for us.  At the end we take the C
+# objects and OCaml objects and link them into the OCaml library.
+# This C library is never used.
+
+noinst_LIBRARIES = libmlpcre.a
+
+if !HAVE_OCAMLOPT
+MLPCRE_CMA = mlpcre.cma
+else
+MLPCRE_CMA = mlpcre.cmxa
+endif
+
+noinst_DATA = $(MLPCRE_CMA)
+
+libmlpcre_a_SOURCES = $(SOURCES_C)
+libmlpcre_a_CPPFLAGS = \
+	-I. \
+	-I$(top_builddir) \
+	-I$(top_srcdir)/gnulib/lib -I$(top_builddir)/gnulib/lib \
+	-I$(top_srcdir)/common/utils -I$(top_builddir)/common/utils \
+	-I$(shell $(OCAMLC) -where)
+libmlpcre_a_CFLAGS = \
+	$(WARN_CFLAGS) $(WERROR_CFLAGS) \
+	-fPIC
+
+BOBJECTS = $(SOURCES_ML:.ml=.cmo)
+XOBJECTS = $(BOBJECTS:.cmo=.cmx)
+
+# -I $(top_builddir)/lib/.libs is a hack which forces corresponding -L
+# option to be passed to gcc, so we don't try linking against an
+# installed copy of libguestfs.
+OCAMLPACKAGES = \
+	-I $(top_builddir)/gnulib/lib/.libs \
+	-I $(top_builddir)/common/utils/.libs \
+	-I $(builddir)
+OCAMLPACKAGES_TESTS = $(MLPCRE_CMA)
+
+OCAMLFLAGS = $(OCAML_FLAGS) $(OCAML_WARN_ERROR)
+
+if !HAVE_OCAMLOPT
+OBJECTS = $(BOBJECTS)
+else
+OBJECTS = $(XOBJECTS)
+endif
+
+libmlpcre_a_DEPENDENCIES = $(OBJECTS)
+
+$(MLPCRE_CMA): $(OBJECTS) libmlpcre.a
+	$(OCAMLFIND) mklib $(OCAMLPACKAGES) \
+	    $(OBJECTS) $(libmlpcre_a_OBJECTS) -cclib -lpcre -o mlpcre
+
+# Tests.
+
+pcre_tests_SOURCES = dummy.c
+pcre_tests_BOBJECTS = pcre_tests.cmo
+pcre_tests_XOBJECTS = $(pcre_tests_BOBJECTS:.cmo=.cmx)
+
+# Can't call the following as <test>_OBJECTS because automake gets confused.
+if !HAVE_OCAMLOPT
+pcre_tests_THEOBJECTS = $(pcre_tests_BOBJECTS)
+pcre_tests.cmo: OCAMLPACKAGES += $(OCAMLPACKAGES_TESTS)
+else
+pcre_tests_THEOBJECTS = $(pcre_tests_XOBJECTS)
+pcre_tests.cmx: OCAMLPACKAGES += $(OCAMLPACKAGES_TESTS)
+endif
+
+OCAMLLINKFLAGS = $(LINK_CUSTOM_OCAMLC_ONLY)
+
+pcre_tests_DEPENDENCIES = \
+	$(pcre_tests_THEOBJECTS) \
+	$(MLPCRE_CMA) \
+	$(top_srcdir)/ocaml-link.sh
+pcre_tests_LINK = \
+	$(top_srcdir)/ocaml-link.sh \
+	  -cclib '-lutils -lpcre -lgnu' -- \
+	  $(OCAMLFIND) $(BEST) $(OCAMLFLAGS) $(OCAMLLINKFLAGS) \
+	  $(OCAMLPACKAGES) $(OCAMLPACKAGES_TESTS) \
+	  $(pcre_tests_THEOBJECTS) -o $@
+
+TESTS_ENVIRONMENT = $(top_builddir)/run --test
+LOG_COMPILER = $(VG)
+
+check_PROGRAMS = pcre_tests
+TESTS = pcre_tests
+
+check-valgrind:
+	$(MAKE) VG="@VG@" check
+
+# Dependencies.
+depend: .depend
+
+.depend: $(wildcard $(abs_srcdir)/*.mli) $(wildcard $(abs_srcdir)/*.ml)
+	rm -f $@ $@-t
+	$(OCAMLFIND) ocamldep -I ../../ocaml -I $(abs_srcdir) $^ | \
+	  $(SED) 's/ *$$//' | \
+	  $(SED) -e :a -e '/ *\\$$/N; s/ *\\\n */ /; ta' | \
+	  $(SED) -e 's,$(abs_srcdir)/,$(builddir)/,g' | \
+	  sort > $@-t
+	mv $@-t $@
+
+-include .depend
+
+endif
+
+.PHONY: depend docs
diff --git a/common/mlpcre/PCRE.ml b/common/mlpcre/PCRE.ml
new file mode 100644
index 000000000..94eea4b34
--- /dev/null
+++ b/common/mlpcre/PCRE.ml
@@ -0,0 +1,32 @@
+(* Bindings for Perl-compatible Regular Expressions.
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+(* Lightweight bindings for the PCRE library. *)
+
+exception Error of string * int
+
+type regexp
+
+external compile : string -> regexp = "guestfs_int_pcre_compile"
+
+external matches : regexp -> string -> bool = "guestfs_int_pcre_matches"
+
+external sub : int -> string = "guestfs_int_pcre_sub"
+
+let () =
+  Callback.register_exception "PCRE.Error" (Error ("", 0))
diff --git a/common/mlpcre/PCRE.mli b/common/mlpcre/PCRE.mli
new file mode 100644
index 000000000..33d131238
--- /dev/null
+++ b/common/mlpcre/PCRE.mli
@@ -0,0 +1,76 @@
+(* Bindings for Perl-compatible Regular Expressions.
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+(** Lightweight bindings for the PCRE library.
+
+    Note this is {i not} Markus Mottl's ocaml-pcre, and doesn't
+    work like that library.
+
+    To match a regular expression:
+
+{v
+let re = PCRE.compile "(a+)b"
+...
+
+if PCRE.matches re "ccaaaabb" then (
+  let whole = PCRE.sub 0 in (* returns "aaaab" *)
+  let first = PCRE.sub 1 in (* returns "aaaa" *)
+  ...
+)
+v}
+
+    Note that there is implicit global state stored between the
+    call to {!matches} and {!sub}.  This is stored in thread
+    local storage so it is safe provided there are no other calls
+    to {!matches} in the same thread.
+*)
+
+exception Error of string * int
+(** PCRE error raised by various functions.
+
+    The string is the printable error message.  The integer is one
+    of the negative [PCRE_*] error codes (see pcreapi(3)), but may
+    be 0 if there was no error code information. *)
+
+type regexp
+(** The type of a compiled regular expression. *)
+
+val compile : string -> regexp
+(** Compile a regular expression.  This can raise {!Error}. *)
+
+val matches : regexp -> string -> bool
+(** Test whether the regular expression matches the string.  This
+    returns true if the regexp matches or false otherwise.
+
+    This also saves any matched substrings in thread-local storage
+    until either the next call to {!matches} in the current thread
+    or the thread/program exits.  You can call {!sub} to return
+    these substrings.
+
+    This can raise {!Error} if PCRE returns an error. *)
+
+val sub : int -> string
+(** Return the nth substring (capture) matched by the previous call
+    to {!matches} in the current thread.
+
+    If [n == 0] it returns the whole matching part of the string.
+
+    If [n >= 1] it returns the nth substring.
+
+    If there was no nth substring then this raises [Not_found].
+    This can also raise {!Error} for other PCRE-related errors. *)
diff --git a/common/mlpcre/dummy.c b/common/mlpcre/dummy.c
new file mode 100644
index 000000000..ebab6198c
--- /dev/null
+++ b/common/mlpcre/dummy.c
@@ -0,0 +1,2 @@
+/* Dummy source, to be used for OCaml-based tools with no C sources. */
+enum { foo = 1 };
diff --git a/common/mlpcre/pcre-c.c b/common/mlpcre/pcre-c.c
new file mode 100644
index 000000000..efad25d44
--- /dev/null
+++ b/common/mlpcre/pcre-c.c
@@ -0,0 +1,216 @@
+/* Bindings for Perl-compatible Regular Expressions.
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+#include <pcre.h>
+
+#include <caml/alloc.h>
+#include <caml/callback.h>
+#include <caml/custom.h>
+#include <caml/fail.h>
+#include <caml/memory.h>
+#include <caml/mlvalues.h>
+
+#include "cleanups.h"
+
+#include "glthread/tls.h"
+
+#pragma GCC diagnostic ignored "-Wmissing-prototypes"
+
+/* Data on the most recent match is stored in this thread-local
+ * variable.  It is freed either by the next call to PCRE.matches or
+ * by (clean) thread exit.
+ */
+static gl_tls_key_t last_match;
+
+struct last_match {
+  char *subject;                /* subject string */
+  int *vec;
+  int r;                        /* value returned by pcre_exec */
+};
+
+static void
+free_last_match (struct last_match *data)
+{
+  if (data) {
+    free (data->subject);
+    free (data->vec);
+    free (data);
+  }
+}
+
+static void init (void) __attribute__((constructor));
+
+static void
+init (void)
+{
+  gl_tls_key_init (last_match, (void (*) (void *))free_last_match);
+}
+
+static void
+raise_error (const char *msg, int errcode)
+{
+  value *exn = caml_named_value ("PCRE.Error");
+  value args[2];
+
+  args[0] = caml_copy_string (msg);
+  args[1] = Val_int (errcode);
+  caml_raise_with_args (*exn, 2, args);
+}
+
+/* Wrap and unwrap pcre regular expression handles, with a finalizer. */
+#define Regexp_val(rv) (*(pcre **)Data_custom_val(rv))
+
+static void
+regexp_finalize (value rev)
+{
+  pcre *re = Regexp_val (rev);
+  if (re) pcre_free (re);
+}
+
+static struct custom_operations custom_operations = {
+  (char *) "pcre_custom_operations",
+  regexp_finalize,
+  custom_compare_default,
+  custom_hash_default,
+  custom_serialize_default,
+  custom_deserialize_default
+};
+
+static value
+Val_regexp (pcre *re)
+{
+  CAMLparam0 ();
+  CAMLlocal1 (rv);
+
+  rv = caml_alloc_custom (&custom_operations, sizeof (pcre *), 0, 1);
+  Regexp_val (rv) = re;
+
+  CAMLreturn (rv);
+}
+
+value
+guestfs_int_pcre_compile (value pattv)
+{
+  CAMLparam1 (pattv);
+  pcre *re;
+  int errcode = 0;
+  const char *err;
+  int offset;
+
+  re = pcre_compile2 (String_val (pattv), 0, &errcode, &err, &offset, NULL);
+  if (re == NULL)
+    raise_error (err, errcode);
+
+  CAMLreturn (Val_regexp (re));
+}
+
+value
+guestfs_int_pcre_matches (value rev, value strv)
+{
+  CAMLparam2 (rev, strv);
+  pcre *re = Regexp_val (rev);
+  struct last_match *m, *oldm;
+  size_t len = caml_string_length (strv);
+  int capcount, r;
+  int veclen;
+
+  /* Calculate maximum number of substrings, and hence the vector
+   * length required.
+   */
+  r = pcre_fullinfo (re, NULL, PCRE_INFO_CAPTURECOUNT, (int *) &capcount);
+  /* I believe that errors should never occur because of OCaml
+   * type safety, so we should abort here.  If this ever happens
+   * we will need to look at it again.
+   */
+  assert (r == 0);
+  veclen = 3 * (1 + capcount);
+
+  m = calloc (1, sizeof *m);
+  if (m == NULL)
+    caml_raise_out_of_memory ();
+
+  /* We will need the original subject string when fetching
+   * substrings, so take a copy.
+   */
+  m->subject = malloc (len+1);
+  if (m->subject == NULL) {
+    free_last_match (m);
+    caml_raise_out_of_memory ();
+  }
+  memcpy (m->subject, String_val (strv), len+1);
+
+  m->vec = malloc (veclen * sizeof (int));
+  if (m->vec == NULL) {
+    free_last_match (m);
+    caml_raise_out_of_memory ();
+  }
+
+  m->r = pcre_exec (re, NULL, m->subject, len, 0, 0, m->vec, veclen);
+  if (m->r < 0 && m->r != PCRE_ERROR_NOMATCH) {
+    free_last_match (m);
+    /* The C code in lib/match.c ignores errors. */
+    raise_error ("pcre_exec", m->r);
+  }
+
+  /* This error would indicate that pcre_exec ran out of space in the
+   * vector.  However if we are calculating the size of the vector
+   * correctly above, then this should never happen.
+   */
+  assert (m->r != 0);
+
+  /* Replace the old TLS match data. */
+  oldm = gl_tls_get (last_match);
+  free_last_match (oldm);
+  gl_tls_set (last_match, m);
+
+  CAMLreturn (m->r == PCRE_ERROR_NOMATCH ? Val_false : Val_true);
+}
+
+value
+guestfs_int_pcre_sub (value nv)
+{
+  CAMLparam1 (nv);
+  CAMLlocal1 (strv);
+  int len;
+  CLEANUP_FREE char *str = NULL;
+  struct last_match *m = gl_tls_get (last_match);
+
+  if (m == NULL)
+    raise_error ("PCRE.sub called without calling PCRE.matches", 0);
+
+  len = pcre_get_substring (m->subject, m->vec, m->r, Int_val (nv),
+                            (const char **) &str);
+
+  if (len == PCRE_ERROR_NOSUBSTRING)
+    caml_raise_not_found ();
+
+  if (len < 0)
+    raise_error ("pcre_get_substring", len);
+
+  strv = caml_alloc_string (len);
+  memcpy (String_val (strv), str, len);
+  CAMLreturn (strv);
+}
diff --git a/common/mlpcre/pcre_tests.ml b/common/mlpcre/pcre_tests.ml
new file mode 100644
index 000000000..e5214eab8
--- /dev/null
+++ b/common/mlpcre/pcre_tests.ml
@@ -0,0 +1,86 @@
+(* Test bindings for Perl-compatible Regular Expressions.
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *)
+
+open Printf
+
+let compile patt =
+  eprintf "PCRE.compile %s\n%!" patt;
+  PCRE.compile patt
+
+let matches re str =
+  eprintf "PCRE.matches %s ->%!" str;
+  let r = PCRE.matches re str in
+  eprintf " %b\n%!" r;
+  r
+
+let sub i =
+  eprintf "PCRE.sub %d ->%!" i;
+  let r = PCRE.sub i in
+  eprintf " %s\n%!" r;
+  r
+
+let () =
+  try
+    let re0 = compile "a+b" in
+    let re1 = compile "(a+)b" in
+    let re2 = compile "(a+)(b*)" in
+
+    assert (matches re0 "ccaaabbbb" = true);
+    assert (sub 0 = "aaab");
+
+    assert (matches re0 "aaa" = false);
+
+    assert (matches re0 "xyz" = false);
+
+    assert (matches re0 "aaabc" = true);
+    assert (sub 0 = "aaab");
+
+    assert (matches re1 "ccaaabb" = true);
+    assert (sub 1 = "aaa");
+    assert (sub 0 = "aaab");
+
+    assert (matches re2 "ccabbc" = true);
+    assert (sub 1 = "a");
+    assert (sub 2 = "bb");
+    assert (sub 0 = "abb");
+
+    assert (matches re2 "ccac" = true);
+    assert (sub 1 = "a");
+    assert (sub 2 = "");
+    assert (sub 0 = "a")
+  with
+  | Not_found ->
+     failwith "one of the PCRE.sub functions unexpectedly raised Not_found"
+  | PCRE.Error (msg, code) ->
+     failwith (sprintf "PCRE error: %s (PCRE error code %d)" msg code)
+
+(* Compile some bad regexps and check that an exception is thrown.
+ * It would be nice to check the error message is right but
+ * that involves dealing with language and future changes of
+ * PCRE error codes.
+ *)
+let () =
+  List.iter (
+    fun patt ->
+      let msg, code =
+        try ignore (PCRE.compile patt); assert false
+        with PCRE.Error (m, c) -> m, c in
+      eprintf "patt: %s -> exception: %s (%d)\n%!" patt msg code
+  ) [ "("; ")"; "+"; "*"; "(abc" ]
+
+let () = Gc.compact ()
diff --git a/configure.ac b/configure.ac
index 3cf9d0a28..651b48b3b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -196,6 +196,7 @@ AC_CONFIG_FILES([Makefile
                  common/errnostring/Makefile
                  common/edit/Makefile
                  common/miniexpect/Makefile
+                 common/mlpcre/Makefile
                  common/mlprogress/Makefile
                  common/mlstdutils/Makefile
                  common/mlstdutils/guestfs_config.ml
diff --git a/m4/.gitignore b/m4/.gitignore
index 07960ed7b..a84b22e5c 100644
--- a/m4/.gitignore
+++ b/m4/.gitignore
@@ -248,6 +248,7 @@
 /thread.m4
 /time_h.m4
 /timespec.m4
+/tls.m4
 /ttyname_r.m4
 /ulonglong.m4
 /ungetc.m4
-- 
2.13.2




More information about the Libguestfs mailing list