Richard W.M. Jones
2017-Sep-20 14:50 UTC
[Libguestfs] [PATCH 0/4] Replace some uses of the Str module with PCRE.
Str is a pretty ugly regexp module. Let's try to replace it with PCRE. This series of commits goes some small way towards that eventual goal. - - - I wonder if there was a deep reason why we had this? let unix2dos s String.concat "\r\n" (Str.split_delim (Str.regexp_string "\n") s) I replaced it with what I think should be (nearly) equivalent: let unix2dos s String.concat "\r\n" (String.nsplit "\n" s) Rich.
Richard W.M. Jones
2017-Sep-20 14:50 UTC
[Libguestfs] [PATCH 1/4] common/mlpcre: Raise Invalid_argument if PCRE.sub n parameter is negative.
--- common/mlpcre/pcre-c.c | 7 +++++-- common/mlpcre/pcre_tests.ml | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/common/mlpcre/pcre-c.c b/common/mlpcre/pcre-c.c index 6fae0e6f4..da9b50d34 100644 --- a/common/mlpcre/pcre-c.c +++ b/common/mlpcre/pcre-c.c @@ -201,6 +201,7 @@ value guestfs_int_pcre_sub (value nv) { CAMLparam1 (nv); + int n = Int_val (nv); CAMLlocal1 (strv); int len; CLEANUP_FREE char *str = NULL; @@ -209,8 +210,10 @@ guestfs_int_pcre_sub (value nv) if (m == NULL) raise_pcre_error ("PCRE.sub called without calling PCRE.matches", 0); - len = pcre_get_substring (m->subject, m->vec, m->r, Int_val (nv), - (const char **) &str); + if (n < 0) + caml_invalid_argument ("PCRE.sub: n must be >= 0"); + + len = pcre_get_substring (m->subject, m->vec, m->r, n, (const char **) &str); if (len == PCRE_ERROR_NOSUBSTRING) caml_raise_not_found (); diff --git a/common/mlpcre/pcre_tests.ml b/common/mlpcre/pcre_tests.ml index e5214eab8..2b18f462f 100644 --- a/common/mlpcre/pcre_tests.ml +++ b/common/mlpcre/pcre_tests.ml @@ -69,6 +69,13 @@ let () | PCRE.Error (msg, code) -> failwith (sprintf "PCRE error: %s (PCRE error code %d)" msg code) +(* Run some out of range [sub] calls to ensure an exception is thrown. *) +let () + let re2 = compile "(a+)(b*)" in + ignore (matches re2 "ccac"); + (try ignore (sub 3) with Not_found -> ()); + (try ignore (sub (-1)) with Invalid_argument _ -> ()) + (* Compile some bad regexps and check that an exception is thrown. * It would be nice to check the error message is right but * that involves dealing with language and future changes of -- 2.13.2
Richard W.M. Jones
2017-Sep-20 14:50 UTC
[Libguestfs] [PATCH 2/4] common/mlpcre: Add PCRE.subi to return indexes instead of the substring.
--- common/mlpcre/PCRE.ml | 3 +-- common/mlpcre/PCRE.mli | 14 ++++++++++++++ common/mlpcre/pcre-c.c | 27 +++++++++++++++++++++++++++ common/mlpcre/pcre_tests.ml | 19 ++++++++++++++++--- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/common/mlpcre/PCRE.ml b/common/mlpcre/PCRE.ml index 94eea4b34..5269d41f8 100644 --- a/common/mlpcre/PCRE.ml +++ b/common/mlpcre/PCRE.ml @@ -23,10 +23,9 @@ exception Error of string * int type regexp external compile : string -> regexp = "guestfs_int_pcre_compile" - external matches : regexp -> string -> bool = "guestfs_int_pcre_matches" - external sub : int -> string = "guestfs_int_pcre_sub" +external subi : int -> int * int = "guestfs_int_pcre_subi" let () Callback.register_exception "PCRE.Error" (Error ("", 0)) diff --git a/common/mlpcre/PCRE.mli b/common/mlpcre/PCRE.mli index 331a50a9a..02f16d19d 100644 --- a/common/mlpcre/PCRE.mli +++ b/common/mlpcre/PCRE.mli @@ -77,3 +77,17 @@ val sub : int -> string If there was no nth substring then this raises [Not_found]. This can also raise {!Error} for other PCRE-related errors. *) + +val subi : int -> int * int +(** Return the nth substring (capture) matched by the previous call + to {!matches} in the current thread. + + This is the same as {!sub} but instead of copying the + substring out, it returns the indexes into the original string + of the first character of the substring and the first + character after the substring. + + (See pcreapi(3) section "How pcre_exec() returns captured substrings" + for exact details). + + If there was no nth substring then this raises [Not_found]. *) diff --git a/common/mlpcre/pcre-c.c b/common/mlpcre/pcre-c.c index da9b50d34..15775dad0 100644 --- a/common/mlpcre/pcre-c.c +++ b/common/mlpcre/pcre-c.c @@ -225,3 +225,30 @@ guestfs_int_pcre_sub (value nv) memcpy (String_val (strv), str, len); CAMLreturn (strv); } + +value +guestfs_int_pcre_subi (value nv) +{ + CAMLparam1 (nv); + int n = Int_val (nv); + CAMLlocal1 (rv); + struct last_match *m = gl_tls_get (last_match); + + if (m == NULL) + raise_pcre_error ("PCRE.subi called without calling PCRE.matches", 0); + + if (n < 0) + caml_invalid_argument ("PCRE.subi: n must be >= 0"); + + /* eg if there are 2 captures, m->r == 3, and valid values of n are + * 0, 1 or 2. + */ + if (n >= m->r) + caml_raise_not_found (); + + rv = caml_alloc (2, 0); + Store_field (rv, 0, Val_int (m->vec[n*2])); + Store_field (rv, 1, Val_int (m->vec[n*2+1])); + + CAMLreturn (rv); +} diff --git a/common/mlpcre/pcre_tests.ml b/common/mlpcre/pcre_tests.ml index 2b18f462f..316a4348e 100644 --- a/common/mlpcre/pcre_tests.ml +++ b/common/mlpcre/pcre_tests.ml @@ -34,6 +34,12 @@ let sub i eprintf " %s\n%!" r; r +let subi i + eprintf "PCRE.subi %d ->%!" i; + let i1, i2 = PCRE.subi i in + eprintf " (%d, %d)\n%!" i1 i2; + (i1, i2) + let () try let re0 = compile "a+b" in @@ -62,19 +68,26 @@ let () assert (matches re2 "ccac" = true); assert (sub 1 = "a"); assert (sub 2 = ""); - assert (sub 0 = "a") + assert (sub 0 = "a"); + assert (subi 0 = (2, 3)); + assert (subi 1 = (2, 3)); + assert (subi 2 = (3, 3)) with | Not_found -> failwith "one of the PCRE.sub functions unexpectedly raised Not_found" | PCRE.Error (msg, code) -> failwith (sprintf "PCRE error: %s (PCRE error code %d)" msg code) -(* Run some out of range [sub] calls to ensure an exception is thrown. *) +(* Run some out of range [sub] and [subi] calls to ensure an exception + * is thrown. + *) let () let re2 = compile "(a+)(b*)" in ignore (matches re2 "ccac"); (try ignore (sub 3) with Not_found -> ()); - (try ignore (sub (-1)) with Invalid_argument _ -> ()) + (try ignore (sub (-1)) with Invalid_argument _ -> ()); + (try ignore (subi 3) with Not_found -> ()); + (try ignore (subi (-1)) with Invalid_argument _ -> ()) (* Compile some bad regexps and check that an exception is thrown. * It would be nice to check the error message is right but -- 2.13.2
Richard W.M. Jones
2017-Sep-20 14:50 UTC
[Libguestfs] [PATCH 3/4] common/mlpcre: Add PCRE.replace function.
Similar to Perl s/// but lacks backreferences. --- common/mlpcre/PCRE.ml | 25 +++++++++++++++++++++++++ common/mlpcre/PCRE.mli | 13 +++++++++++++ common/mlpcre/pcre_tests.ml | 23 ++++++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/common/mlpcre/PCRE.ml b/common/mlpcre/PCRE.ml index 5269d41f8..d80021f8c 100644 --- a/common/mlpcre/PCRE.ml +++ b/common/mlpcre/PCRE.ml @@ -27,5 +27,30 @@ external matches : regexp -> string -> bool = "guestfs_int_pcre_matches" external sub : int -> string = "guestfs_int_pcre_sub" external subi : int -> int * int = "guestfs_int_pcre_subi" +let rec replace ?(global = false) patt subst subj + if not (matches patt subj) then + (* return original string unchanged if patt doesn't match *) + subj + else ( + (* If patt matches "yyyy" in the original string then we have + * the following situation, where "xxxx" is the part of the + * original string before the match, and "zzzz..." is the + * part after the match: + * "xxxxyyyyzzzzzzzzzzzzz" + * ^ ^ + * i1 i2 + *) + let i1, i2 = subi 0 in + let xs = String.sub subj 0 i1 (* "xxxx", part before the match *) in + let zs = String.sub subj i2 (String.length subj - i2) (* rest *) in + + (* If the global flag was set, we want to continue substitutions + * in the rest of the string. + *) + let zs = if global then replace ~global patt subst zs else zs in + + xs ^ subst ^ zs + ) + let () Callback.register_exception "PCRE.Error" (Error ("", 0)) diff --git a/common/mlpcre/PCRE.mli b/common/mlpcre/PCRE.mli index 02f16d19d..634cc600c 100644 --- a/common/mlpcre/PCRE.mli +++ b/common/mlpcre/PCRE.mli @@ -91,3 +91,16 @@ val subi : int -> int * int for exact details). If there was no nth substring then this raises [Not_found]. *) + +val replace : ?global:bool -> regexp -> string -> string -> string +(** [replace ?global patt subst subj] performs a search and replace + on the subject string ([subj]). Where [patt] matches the + string, [subst] is substituted. This works similarly to the + Perl function [s///]. + + The [?global] flag defaults to false, so only the first + instance of [patt] in the string is replaced. If set to true + then every instance of [patt] in the string is replaced. + + Note that this function does not allow backreferences. + Any captures in [patt] are ignored. *) diff --git a/common/mlpcre/pcre_tests.ml b/common/mlpcre/pcre_tests.ml index 316a4348e..b5f712d20 100644 --- a/common/mlpcre/pcre_tests.ml +++ b/common/mlpcre/pcre_tests.ml @@ -28,6 +28,12 @@ let matches re str eprintf " %b\n%!" r; r +let replace ?(global = false) patt subst subj + eprintf "PCRE.replace global:%b <patt> %s %s ->%!" global subst subj; + let r = PCRE.replace ~global patt subst subj in + eprintf " %s\n%!" r; + r + let sub i eprintf "PCRE.sub %d ->%!" i; let r = PCRE.sub i in @@ -45,6 +51,7 @@ let () let re0 = compile "a+b" in let re1 = compile "(a+)b" in let re2 = compile "(a+)(b*)" in + let re3 = compile "[^A-Za-z0-9_]" in assert (matches re0 "ccaaabbbb" = true); assert (sub 0 = "aaab"); @@ -71,7 +78,21 @@ let () assert (sub 0 = "a"); assert (subi 0 = (2, 3)); assert (subi 1 = (2, 3)); - assert (subi 2 = (3, 3)) + assert (subi 2 = (3, 3)); + + assert (replace re0 "dd" "abcabcaabccca" = "ddcabcaabccca"); + assert (replace ~global:true re0 "dd" "abcabcaabccca" = "ddcddcddccca"); + + (* This example copies a usage from customize/firstboot.ml + * "\xc2\xa3" is utf-8 for the GBP sign. Ideally PCRE would + * recognize that this is a single character, however doing that + * would involve passing the PCRE_UTF8 flag when compiling + * patterns, and that could be problematic if PCRE was built + * without Unicode support (XXX). + *) + assert (replace ~global:true re3 "-" "this is a\xc2\xa3funny.name?" + (* = "this-is-a-funny-name-" if UTF-8 worked *) + = "this-is-a--funny-name-"); with | Not_found -> failwith "one of the PCRE.sub functions unexpectedly raised Not_found" -- 2.13.2
Richard W.M. Jones
2017-Sep-20 14:50 UTC
[Libguestfs] [PATCH 4/4] customize: Remove use of Str module from virt-customize code.
--- customize/Makefile.am | 3 +++ customize/firstboot.ml | 6 +++--- customize/ssh_key.ml | 7 +++---- sysprep/Makefile.am | 3 +++ v2v/Makefile.am | 4 ++++ 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/customize/Makefile.am b/customize/Makefile.am index 18f4dce44..ff2c2e2d0 100644 --- a/customize/Makefile.am +++ b/customize/Makefile.am @@ -126,6 +126,7 @@ OCAMLPACKAGES = \ -I $(top_builddir)/ocaml \ -I $(top_builddir)/common/mlstdutils \ -I $(top_builddir)/common/mlutils \ + -I $(top_builddir)/common/mlpcre \ -I $(top_builddir)/mllib \ -I $(builddir) if HAVE_OCAML_PKG_GETTEXT @@ -155,6 +156,7 @@ endif OCAMLLINKFLAGS = \ mlstdutils.$(MLARCHIVE) \ mlguestfs.$(MLARCHIVE) \ + mlpcre.$(MLARCHIVE) \ mlcutils.$(MLARCHIVE) \ mllib.$(MLARCHIVE) \ customize.$(MLARCHIVE) \ @@ -176,6 +178,7 @@ virt_customize_DEPENDENCIES = \ $(CUSTOMIZE_THEOBJECTS) \ $(CUSTOMIZE_CMA) \ ../common/mlutils/mlcutils.$(MLARCHIVE) \ + ../common/mlpcre/mlpcre.$(MLARCHIVE) \ ../mllib/mllib.$(MLARCHIVE) virt_customize_LINK = \ $(top_srcdir)/ocaml-link.sh -cclib '$(OCAMLCLIBS)' -- \ diff --git a/customize/firstboot.ml b/customize/firstboot.ml index b07bda001..3ac669e5b 100644 --- a/customize/firstboot.ml +++ b/customize/firstboot.ml @@ -25,12 +25,12 @@ open Common_gettext.Gettext open Regedit let unix2dos s - String.concat "\r\n" (Str.split_delim (Str.regexp_string "\n") s) + String.concat "\r\n" (String.nsplit "\n" s) let sanitize_name - let rex = Str.regexp "[^A-Za-z0-9_]" in + let rex = PCRE.compile "[^A-Za-z0-9_]" in fun n -> - let n = Str.global_replace rex "-" n in + let n = PCRE.replace ~global:true rex "-" n in let len = String.length n and max = 60 in if len >= max then String.sub n 0 max else n diff --git a/customize/ssh_key.ml b/customize/ssh_key.ml index 185536d1d..da0e7d90c 100644 --- a/customize/ssh_key.ml +++ b/customize/ssh_key.ml @@ -47,8 +47,8 @@ and parse_selector_list orig_arg = function (* Find the local [on the host] user's SSH public key. See * ssh-copy-id(1) default_ID_file for rationale. *) -let pubkey_re = Str.regexp "^id.*\\.pub$" -let pubkey_ignore_re = Str.regexp ".*-cert\\.pub$" +let pubkey_re = PCRE.compile "^id.*\\.pub$" +let pubkey_ignore_re = PCRE.compile ".*-cert\\.pub$" let local_user_ssh_pubkey () let home_dir @@ -60,8 +60,7 @@ let local_user_ssh_pubkey () let files = Array.to_list files in let files = List.filter ( fun file -> - Str.string_match pubkey_re file 0 && - not (Str.string_match pubkey_ignore_re file 0) + PCRE.matches pubkey_re file && not (PCRE.matches pubkey_ignore_re file) ) files in if files = [] then error (f_"ssh-inject: no public key file found in %s") ssh_dir; diff --git a/sysprep/Makefile.am b/sysprep/Makefile.am index 0acd1ddbf..5c20a437b 100644 --- a/sysprep/Makefile.am +++ b/sysprep/Makefile.am @@ -117,6 +117,7 @@ OCAMLPACKAGES = \ -I $(top_builddir)/common/visit/.libs \ -I $(top_builddir)/common/mlstdutils \ -I $(top_builddir)/common/mlutils \ + -I $(top_builddir)/common/mlpcre \ -I $(top_builddir)/common/mlvisit \ -I $(top_builddir)/mllib \ -I $(top_builddir)/customize @@ -145,6 +146,7 @@ endif OCAMLLINKFLAGS = \ mlstdutils.$(MLARCHIVE) \ mlguestfs.$(MLARCHIVE) \ + mlpcre.$(MLARCHIVE) \ mlcutils.$(MLARCHIVE) \ mllib.$(MLARCHIVE) \ mlvisit.$(MLARCHIVE) \ @@ -155,6 +157,7 @@ virt_sysprep_DEPENDENCIES = \ $(OBJECTS) \ ../common/mlstdutils/mlstdutils.$(MLARCHIVE) \ ../common/mlutils/mlcutils.$(MLARCHIVE) \ + ../common/mlpcre/mlpcre.$(MLARCHIVE) \ ../mllib/mllib.$(MLARCHIVE) \ ../customize/customize.$(MLARCHIVE) \ $(top_srcdir)/ocaml-link.sh diff --git a/v2v/Makefile.am b/v2v/Makefile.am index 0aafc5725..3a38b3a98 100644 --- a/v2v/Makefile.am +++ b/v2v/Makefile.am @@ -151,6 +151,7 @@ OCAMLPACKAGES = \ -I $(top_builddir)/ocaml \ -I $(top_builddir)/common/mlstdutils \ -I $(top_builddir)/common/mlutils \ + -I $(top_builddir)/common/mlpcre \ -I $(top_builddir)/common/mlxml \ -I $(top_builddir)/mllib \ -I $(top_builddir)/customize @@ -177,6 +178,7 @@ endif OCAMLLINKFLAGS = \ mlstdutils.$(MLARCHIVE) \ mlguestfs.$(MLARCHIVE) \ + mlpcre.$(MLARCHIVE) \ mlxml.$(MLARCHIVE) \ mlcutils.$(MLARCHIVE) \ mllib.$(MLARCHIVE) \ @@ -218,6 +220,7 @@ virt_v2v_copy_to_local_DEPENDENCIES = \ $(COPY_TO_LOCAL_OBJECTS) \ ../common/mlstdutils/mlstdutils.$(MLARCHIVE) \ ../common/mlxml/mlxml.$(MLARCHIVE) \ + ../common/mlpcre/mlpcre.$(MLARCHIVE) \ ../common/mlutils/mlcutils.$(MLARCHIVE) \ ../mllib/mllib.$(MLARCHIVE) \ $(top_srcdir)/ocaml-link.sh @@ -508,6 +511,7 @@ v2v_unit_tests_DEPENDENCIES = \ $(v2v_unit_tests_THEOBJECTS) \ ../common/mlstdutils/mlstdutils.$(MLARCHIVE) \ ../common/mlxml/mlxml.$(MLARCHIVE) \ + ../common/mlpcre/mlpcre.$(MLARCHIVE) \ ../common/mlutils/mlcutils.$(MLARCHIVE) \ ../mllib/mllib.$(MLARCHIVE) \ $(top_srcdir)/ocaml-link.sh -- 2.13.2
Apparently Analagous Threads
- [PATCH v2 00/18] Replace many more uses of the Str module with PCRE.
- [PATCH v3 00/22] Replace almost all uses of the Str module with PCRE.
- [PATCH v2 0/3] v2v: add -o json output mode
- [PATCH 0/2] Add lightweight bindings for PCRE.
- [PATCH v2 0/3] common: Add a lightweight OCaml binding for PCRE.