[Libguestfs] [PATCH v3 06/22] common/mlpcre: Add split and nsplit functions.

Richard W.M. Jones rjones at redhat.com
Fri Sep 22 07:36:07 UTC 2017


These work like our String.split and String.nsplit functions.
---
 common/mlpcre/PCRE.ml       | 33 +++++++++++++++++++++++++++++++++
 common/mlpcre/PCRE.mli      | 19 +++++++++++++++++++
 common/mlpcre/pcre_tests.ml | 29 +++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)

diff --git a/common/mlpcre/PCRE.ml b/common/mlpcre/PCRE.ml
index 753e247e4..b054928f9 100644
--- a/common/mlpcre/PCRE.ml
+++ b/common/mlpcre/PCRE.ml
@@ -52,5 +52,38 @@ let rec replace ?(global = false) patt subst subj =
     xs ^ subst ^ zs
   )
 
+(* Split [subj] at the first match of [patt]: returns the text before and
+ * after the match, or [(subj, "")] if [patt] does not match. *)
+let rec split patt subj =
+  if not (matches patt subj) then
+    subj, ""
+  else (
+    (* [subi 0] gives the offsets of the matched part "yyyy":
+     * "xxxxyyyyzzzzzzzzzzzzz"
+     *      ^   ^
+     *      i1  i2
+     *)
+    let i1, i2 = subi 0 in
+    let xs = String.sub subj 0 i1 (* "xxxx", part before the match *) in
+    let zs = String.sub subj i2 (String.length subj - i2) (* after *) in
+    xs, zs
+  )
+
+(* Split [subj] on every match of [patt]; the result has at most [max]
+ * elements ([max = 0]: no limit).  A negative [max] is an error. *)
+and nsplit ?(max = 0) patt subj =
+  if max < 0 then
+    invalid_arg "PCRE.nsplit: max parameter should not be negative";
+
+  (* Limit reached, or no match: the rest of the string is the last item. *)
+  if max = 1 || not (matches patt subj) then
+    [subj]
+  else (
+    let s1, s2 = split patt subj in
+    (* An empty match at offset 0 consumes nothing; stop rather than loop. *)
+    if String.length s2 = String.length subj then [subj]
+    else s1 :: nsplit ~max:(if max = 0 then 0 else max - 1) patt s2
+  )
+
 let () =
   Callback.register_exception "PCRE.Error" (Error ("", 0))
diff --git a/common/mlpcre/PCRE.mli b/common/mlpcre/PCRE.mli
index fcf6fd25e..eacb6fd90 100644
--- a/common/mlpcre/PCRE.mli
+++ b/common/mlpcre/PCRE.mli
@@ -110,3 +110,22 @@ val replace : ?global:bool -> regexp -> string -> string -> string
 
     Note that this function does not allow backreferences.
     Any captures in [patt] are ignored. *)
+
+val split : regexp -> string -> string * string
+val nsplit : ?max:int -> regexp -> string -> string list
+(** [split patt subj] splits the string at the first occurrence
+    of the regular expression [patt], returning the parts of the
+    string before and after the match (the matching part is not
+    returned).  If the pattern does not match then the whole
+    input is returned in the first string, and the second string
+    is empty.
+
+    [nsplit patt subj] is the same but the string is split
+    on every occurrence of [patt].  Note that if the pattern
+    matches at the beginning or end of the string, then an
+    empty string element will be returned at the beginning or
+    end of the list.
+
+    [?max] is the maximum length of the returned list (default [0],
+    meaning no limit); the final element is the remainder of the string.
+    @raise Invalid_argument if [max] is negative. *)
diff --git a/common/mlpcre/pcre_tests.ml b/common/mlpcre/pcre_tests.ml
index 9d42914b9..346019c40 100644
--- a/common/mlpcre/pcre_tests.ml
+++ b/common/mlpcre/pcre_tests.ml
@@ -42,6 +42,20 @@ let replace ?(global = false) patt subst subj =
   eprintf " %s\n%!" r;
   r
 
+let split patt subj =
+  (* Trace the call and its result on stderr, then pass the pair through. *)
+  eprintf "PCRE.split <patt> %s ->%!" subj;
+  let (before, after) as parts = PCRE.split patt subj in
+  eprintf " (%s, %s)\n%!" before after; parts
+
+(* Traced wrapper: logs each PCRE.nsplit call and its result on stderr. *)
+let nsplit ?(max = 0) patt subj =
+  let max_label = if max = 0 then "" else sprintf " ~max:%d" max in
+  eprintf "PCRE.nsplit%s <patt> %s ->%!" max_label subj;
+  let parts = PCRE.nsplit ~max patt subj in
+  eprintf " [%s]\n%!" (String.concat "; " parts);
+  parts
+
 let sub i =
   eprintf "PCRE.sub %d ->%!" i;
   let r = PCRE.sub i in
@@ -60,6 +74,7 @@ let () =
     let re1 = compile "(a+)b" in
     let re2 = compile "(a+)(b*)" in
     let re3 = compile ~caseless:true "[^a-z0-9_]" in
+    let ws = compile "\\s+" in
 
     assert (matches re0 "ccaaabbbb" = true);
     assert (sub 0 = "aaab");
@@ -101,6 +116,20 @@ let () =
     assert (replace ~global:true re3 "-" "this is a\xc2\xa3FUNNY.name?"
             (* = "this-is-a-FUNNY-name-" if UTF-8 worked *)
             = "this-is-a--FUNNY-name-");
+
+    (* This also tests PCRE.split since that is used by nsplit. *)
+    assert (nsplit ~max:1 ws "a b c" = [ "a b c" ]);
+    assert (nsplit ~max:2 ws "a b c" = [ "a"; "b c" ]);
+    assert (nsplit ~max:3 ws "a b c" = [ "a"; "b"; "c" ]);
+    assert (nsplit ~max:10 ws "a b c" = [ "a"; "b"; "c" ]);
+    assert (nsplit ws "the cat sat   on \t\t  the mat." =
+              [ "the"; "cat"; "sat"; "on"; "the"; "mat." ]);
+    assert (nsplit ~max:5 ws "the cat sat   on \t\t  the mat." =
+              [ "the"; "cat"; "sat"; "on"; "the mat." ]);
+    assert (nsplit ws " the " = [ ""; "the"; "" ]);
+    assert (nsplit ws "the " = [ "the"; "" ]);
+    assert (nsplit ws " the" = [ ""; "the" ]);
+    assert (nsplit ws "    \t  the" = [ ""; "the" ]);
   with
   | Not_found ->
      failwith "one of the PCRE.sub functions unexpectedly raised Not_found"
-- 
2.13.2




More information about the Libguestfs mailing list