thr3ads.net - Libguestfs - Re: [Libguestfs] [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed [Nov 2016]

If this information is useful, please help other people find it:
Share via:

Pino Toscano

2016-Nov-21 15:21 UTC

Re: [Libguestfs] [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed

On Saturday, 12 November 2016 16:37:52 CET Tomáš Golembiovský
wrote:> We don't have to always extract all files from the OVA archive. The
OVA,
> as defined in the standard, is plain tar. We can work directly over the
> tar archive if we use correct 'offset' and 'size' options
when defining
> the backing file for QEMU.
> 
> This leads to improvements in speed and puts much lower requirement on
> available disk space.
> 
> Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> ---
>  v2v/input_ova.ml | 177
+++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 154 insertions(+), 23 deletions(-)
> 
> diff --git a/v2v/input_ova.ml b/v2v/input_ova.ml
> index f76fe82..7a3e27b 100644
> --- a/v2v/input_ova.ml
> +++ b/v2v/input_ova.ml
> @@ -39,17 +39,23 @@ object
>  
>    method source () >  
> -    let untar ?(format = "") file outdir > -      let cmd = [
"tar"; sprintf "-x%sf" format; file; "-C"; outdir
] in
> +    (* Untar part or all files from tar archive. If [path] is specified it
is
> +     * a path in the tar archive.
> +     *)
> +    let untar ?(format = "") ?(path) file outdir No need for parenthesis around "path".
Also, tar supports extracting more files at the same time, so I'd
change "path" into "paths", and just call this once later on
...
> +      let cmd > +        [ "tar"; sprintf "-x%sf"
format; file; "-C"; outdir ]
> +        @ (match path with None -> [] | Some p -> [p])
> +      in
>        if run_command cmd <> 0 then
>          error (f_"error unpacking %s, see earlier error
messages") ova in
>  
>      (* Extract ova file. *)
> -    let exploded > +    let exploded, partial >        (* The spec
allows a directory to be specified as an ova.  This
>         * is also pretty convenient.
>         *)
> -      if is_directory ova then ova
> +      if is_directory ova then ova, false
>        else (
>          let uncompress_head zcat file >            let cmd = sprintf
"%s %s" zcat (quote file) in
> @@ -67,11 +73,53 @@ object
>  
>            tmpfile in
>  
> +        (* Untar only ovf and manifest from the archive *)
> +        let untar_partial file outdir I'd rename this as "untar_metadata", as it seems a better fitting
name.
YMMV.
> +          let files > +            external_command (sprintf "tar
-tf %s" (Filename.quote file)) in
> +          List.iter (fun f ->
> +            if Filename.check_suffix f ".ovf" ||
> +              Filename.check_suffix f ".mf" then
> +                untar ~path:f file outdir
> +          ) files in
... i.e. here: this can be changed to:

  let files     filter_map (
      fun f ->
        if Filename.check_suffix f ".ovf" ||
           Filename.check_suffix f ".mf" then
          Some f
        else None
    ) files in
  untar ~paths:files file outdir
> +        let qemu_img_version () > +          let cmd = "qemu-img
--version" in
> +          let lines = external_command cmd in
> +          match lines with
> +          | [] -> error ("'qemu-img --version' returned no
output")
> +          | line :: _ ->
> +              let rex = Str.regexp "qemu-img version
\\([0-9]+\\)\\.\\([0-9]+\\)" in
> +              if Str.string_match rex line 0 then (
> +                try
> +                  int_of_string (Str.matched_group 1 line),
> +                  int_of_string (Str.matched_group 2 line)
> +                with Failure _ ->
> +                  warning (f_"warning: failed to read qemu-img
version! Line: %S")
> +                  line;
"warning" is already printed by the warning function; also I'd
simplify
the message as:

  warning (f_"failed to parse the version of qemu-img (%S), assuming
0.9")
    line
> +                  0, 9
> +              ) else (
> +                warning (f_"warning: failed to read qemu-img version!
Line: %S")
> +                  line;
Ditto.
> +                0, 9
> +              )
> +        in
> +
>          match detect_file_type ova with
>          | `Tar ->
>            (* Normal ovas are tar file (not compressed). *)
> -          untar ova tmpdir;
> -          tmpdir
> +          let qmajor, qminor = qemu_img_version () in
> +          if qmajor > 2 || (qmajor == 2 && qminor >= 8) then
(
> +            (* If QEMU is recent enough we don't have to extract
everything. We
> +             * can access disks inside the tar archive.
> +             *)
> +            untar_partial ova tmpdir;
> +            tmpdir, true
> +          ) else (
> +            untar ova tmpdir;
> +            tmpdir, false
> +          )
> +
>          | `Zip ->
>            (* However, although not permitted by the spec, people ship
>             * zip files as ova too.
> @@ -81,7 +129,7 @@ object
>              [ "-j"; "-d"; tmpdir; ova ] in
>            if run_command cmd <> 0 then
>              error (f_"error unpacking %s, see earlier error
messages") ova;
> -          tmpdir
> +          tmpdir, false
>          | (`GZip|`XZ) as format ->
>            let zcat, tar_fmt >              match format with
> @@ -94,7 +142,7 @@ object
>            (match tmpfiletype with
>            | `Tar ->
>              untar ~format:tar_fmt ova tmpdir;
> -            tmpdir
> +            tmpdir, false
>            | `Zip | `GZip | `XZ | `Unknown ->
>              error (f_"%s: unsupported file format\n\nFormats which we
currently understand for '-i ova' are: tar (uncompressed, compress with
gzip or xz), zip") ova
>            )
> @@ -135,6 +183,59 @@ object
>        loop [dir]
>      in
>  
> +    (* Find file in [tar] archive and return at which byte it starts and
how
> +     * long it is.
> +     *)
> +    let find_file_in_tar tar filename > +      let lines =
external_command (stringify_args [ "tar"; "tRvf"; tar ]) in
stringify_args is meant to be used for displaying, rather than passing
it back as shell command. Please just build a command string as done in
the rest of the places.
> +      let rec loop lines > +        match lines with
> +        | [] -> raise Not_found
> +        | line :: lines -> (
> +          (* Lines have the form:
> +           * block <offset>: <perms>
<owner>/<group> <size> <mdate> <mtime>
<file>
> +           *)
> +          let rex = Str.regexp "^block \\([0-9]+\\): \\([^ \\t]+\\)
\\([^ \\t]+\\) +\\([^ \\t]+\\) \\([^ \\t]+\\) \\([^ \\t]+\\) \\(.+\\)$" in
Whoa how unreadable :/ The alternative would be using String.nsplit,
but it would play badly with filenames with spaces inside the archive.
Maybe a faster alternative could be:
a) find the 6th empty space, and pick the substring of all the rest
   of the string after it
b) filter the lines by picking only the wanted file, checking using (a)
c) once a single line is found, use String.nsplit and parse only parts
   1 (removing the ':' suffix) and 4

It feels slightly more complex, but this way we can avoid running the
regex until we find the right filename.
> +          if Str.string_match rex line 0 then (
> +            let offset = Str.matched_group 1 line in
> +            let size = Str.matched_group 4 line in
> +            let fname = Str.matched_group 7 line in
> +
> +            let offset > +              try Int64.of_string offset
> +              with Failure _ ->
> +                error (f_"invalid offset returned by tar: %S")
offset in
> +
> +            let size > +              try Int64.of_string size
> +              with Failure _ ->
> +                error (f_"invalid size returned by tar: %S")
size in
> +
> +            if fname = filename then
> +              (* Note: Offset is actualy block number and there is a
single
> +               * block with tar header at the beginning of the file. So
skip
> +               * the header and convert the block number to bytes before
> +               * returning.
> +               *)
> +              (offset +^ 1L) *^ 512L, size
> +            else loop lines
> +          ) else
> +            error (f_"failed to parse line returned by tar: %S")
line
> +        )
> +      in
> +      loop lines
> +    in
> +
> +    let subfolder folder parent > +      if String.is_prefix folder
(parent // "") then
> +        let len = String.length parent in
> +        String.sub folder (len+1) (String.length folder-len-1)
> +      else if folder = parent then
> +        ""
I'd invert the two checks (first the = one, then is_prefix).
> +      else
> +        assert false
> +    in
> +
>      (* Search for the ovf file. *)
>      let ovf = find_files exploded ".ovf" in
>      let ovf > @@ -152,6 +253,7 @@ object
>        fun mf ->
>          debug "processing manifest %s" mf;
>          let mf_folder = Filename.dirname mf in
> +        let mf_subfolder = subfolder mf_folder exploded in
>          let chan = open_in mf in
>          let rec loop () >            let line = input_line chan in
> @@ -160,7 +262,11 @@ object
>              let disk = Str.matched_group 2 line in
>              let expected = Str.matched_group 3 line in
>              let csum = Checksums.of_string mode expected in
> -            try Checksums.verify_checksum csum (mf_folder // disk)
> +            try
> +              if partial then
> +                Checksums.verify_checksum csum ~tar:ova (mf_subfolder //
disk)
> +              else
> +                Checksums.verify_checksum csum (mf_folder // disk)
>              with Checksums.Mismatched_checksum (_, actual) ->
>                error (f_"checksum of disk %s does not match manifest
%s (actual %s(%s) = %s, expected %s(%s) = %s)")
>                  disk mf mode disk actual mode disk expected;
> @@ -283,25 +389,50 @@ object
>              | Some "gzip" -> true
>              | Some s -> error (f_"unsupported comprression in OVF:
%s") s in
>  
> -          let filename = if compressed then (
> -            let new_filename = tmpdir // String.random8 () ^
".vmdk" in
> -            let cmd > -              sprintf "zcat %s >
%s" (quote ovf_folder // filename) (quote new_filename) in
> -            if shell_command cmd <> 0 then
> -              error (f_"error uncompressing %s, see earlier error
messages")
> -                filename;
> -            new_filename
> -          )
> -          else
> -            ovf_folder // filename
> +          let filename, partial > +            if compressed then (
> +              if partial then
> +                untar ~path:((subfolder ovf_folder exploded) // filename)
> +                  ova tmpdir;
> +              let new_filename = tmpdir // String.random8 () ^
".vmdk" in
> +              let cmd > +                sprintf "zcat %s >
%s" (quote ovf_folder // filename) (quote new_filename) in
> +              if shell_command cmd <> 0 then
> +                error (f_"error uncompressing %s, see earlier error
messages")
> +                  filename;
I guess parts of this block could be shared/factored out with the same
bits used earlier.

Thanks,
-- 
Pino Toscano

Tomáš Golembiovský

2016-Nov-23 15:16 UTC

head link

Re: [Libguestfs] [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed

On Mon, 21 Nov 2016 16:21:02 +0100
Pino Toscano <ptoscano@redhat.com> wrote:
> On Saturday, 12 November 2016 16:37:52 CET Tomáš Golembiovský wrote:
> > We don't have to always extract all files from the OVA archive.
The OVA,
> > as defined in the standard, is plain tar. We can work directly over
the
> > tar archive if we use correct 'offset' and 'size'
options when defining
> > the backing file for QEMU.
> > 
> > This leads to improvements in speed and puts much lower requirement on
> > available disk space.
> > 
> > Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> > ---
> >  v2v/input_ova.ml | 177
+++++++++++++++++++++++++++++++++++++++++++++++--------
> >  1 file changed, 154 insertions(+), 23 deletions(-)
> > 
> > diff --git a/v2v/input_ova.ml b/v2v/input_ova.ml
> > index f76fe82..7a3e27b 100644
> > --- a/v2v/input_ova.ml
> > +++ b/v2v/input_ova.ml
> > @@ -39,17 +39,23 @@ object
> >  
> >    method source () > >  
> > -    let untar ?(format = "") file outdir > > -     
let cmd = [ "tar"; sprintf "-x%sf" format; file;
"-C"; outdir ] in
> > +    (* Untar part or all files from tar archive. If [path] is
specified it is
> > +     * a path in the tar archive.
> > +     *)
> > +    let untar ?(format = "") ?(path) file outdir =  
> 
> No need for parenthesis around "path".
> Also, tar supports extracting more files at the same time, so I'd
> change "path" into "paths", and just call this once
later on ...
You mean pass an array/list instead of single string? Yeah, why not.
> 
> > +      let cmd > > +        [ "tar"; sprintf
"-x%sf" format; file; "-C"; outdir ]
> > +        @ (match path with None -> [] | Some p -> [p])
> > +      in
> >        if run_command cmd <> 0 then
> >          error (f_"error unpacking %s, see earlier error
messages") ova in
> >  
> >      (* Extract ova file. *)
> > -    let exploded > > +    let exploded, partial > >      
(* The spec allows a directory to be specified as an ova.  This
> >         * is also pretty convenient.
> >         *)
> > -      if is_directory ova then ova
> > +      if is_directory ova then ova, false
> >        else (
> >          let uncompress_head zcat file > >            let cmd =
sprintf "%s %s" zcat (quote file) in
> > @@ -67,11 +73,53 @@ object
> >  
> >            tmpfile in
> >  
> > +        (* Untar only ovf and manifest from the archive *)
> > +        let untar_partial file outdir =  
> 
> I'd rename this as "untar_metadata", as it seems a better
fitting name.
> YMMV.
Agreed.
> > +        let qemu_img_version () > > +          let cmd =
"qemu-img --version" in
> > +          let lines = external_command cmd in
> > +          match lines with
> > +          | [] -> error ("'qemu-img --version'
returned no output")
> > +          | line :: _ ->
> > +              let rex = Str.regexp "qemu-img version
\\([0-9]+\\)\\.\\([0-9]+\\)" in
> > +              if Str.string_match rex line 0 then (
> > +                try
> > +                  int_of_string (Str.matched_group 1 line),
> > +                  int_of_string (Str.matched_group 2 line)
> > +                with Failure _ ->
> > +                  warning (f_"warning: failed to read qemu-img
version! Line: %S")
> > +                  line;  
> 
> "warning" is already printed by the warning function; also
I'd simplify
> the message as:
> 
>   warning (f_"failed to parse the version of qemu-img (%S), assuming
0.9")
>     line
Whops.

> > +      let rec loop lines > > +        match lines with
> > +        | [] -> raise Not_found
> > +        | line :: lines -> (
> > +          (* Lines have the form:
> > +           * block <offset>: <perms>
<owner>/<group> <size> <mdate> <mtime>
<file>
> > +           *)
> > +          let rex = Str.regexp "^block \\([0-9]+\\): \\([^
\\t]+\\) \\([^ \\t]+\\) +\\([^ \\t]+\\) \\([^ \\t]+\\) \\([^ \\t]+\\)
\\(.+\\)$" in
> 
> Whoa how unreadable :/ The alternative would be using String.nsplit,
> but it would play badly with filenames with spaces inside the archive.
> Maybe a faster alternative could be:
> a) find the 6th empty space, and pick the substring of all the rest
>    of the string after it
> b) filter the lines by picking only the wanted file, checking using (a)
> c) once a single line is found, use String.nsplit and parse only parts
>    1 (removing the ':' suffix) and 4
> 
> It feels slightly more complex, but this way we can avoid running the
> regex until we find the right filename.
The problem is there are multiple spaces used to align the column with
file sizes. Hence the complicated regexp. :(


-- 
Tomáš Golembiovský <tgolembi@redhat.com>

Pino Toscano

2016-Nov-24 15:47 UTC

head link

Re: [Libguestfs] [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed

On Wednesday, 23 November 2016 16:16:44 CET Tomáš Golembiovský
wrote:> On Mon, 21 Nov 2016 16:21:02 +0100
> Pino Toscano <ptoscano@redhat.com> wrote:
> 
> > On Saturday, 12 November 2016 16:37:52 CET Tomáš Golembiovský wrote:
> > > We don't have to always extract all files from the OVA
archive. The OVA,
> > > as defined in the standard, is plain tar. We can work directly
over the
> > > tar archive if we use correct 'offset' and 'size'
options when defining
> > > the backing file for QEMU.
> > > 
> > > This leads to improvements in speed and puts much lower
requirement on
> > > available disk space.
> > > 
> > > Signed-off-by: Tomáš Golembiovský <tgolembi@redhat.com>
> > > ---
> > >  v2v/input_ova.ml | 177
+++++++++++++++++++++++++++++++++++++++++++++++--------
> > >  1 file changed, 154 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/v2v/input_ova.ml b/v2v/input_ova.ml
> > > index f76fe82..7a3e27b 100644
> > > --- a/v2v/input_ova.ml
> > > +++ b/v2v/input_ova.ml
> > > @@ -39,17 +39,23 @@ object
> > >  
> > >    method source () > > >  
> > > -    let untar ?(format = "") file outdir > >
> -      let cmd = [ "tar"; sprintf "-x%sf" format; file;
"-C"; outdir ] in
> > > +    (* Untar part or all files from tar archive. If [path] is
specified it is
> > > +     * a path in the tar archive.
> > > +     *)
> > > +    let untar ?(format = "") ?(path) file outdir =  
> > 
> > No need for parenthesis around "path".
> > Also, tar supports extracting more files at the same time, so I'd
> > change "path" into "paths", and just call this
once later on ...
> 
> You mean pass an array/list instead of single string? Yeah, why not.
Right.
> > > +      let rec loop lines > > > +        match lines
with
> > > +        | [] -> raise Not_found
> > > +        | line :: lines -> (
> > > +          (* Lines have the form:
> > > +           * block <offset>: <perms>
<owner>/<group> <size> <mdate> <mtime>
<file>
> > > +           *)
> > > +          let rex = Str.regexp "^block \\([0-9]+\\): \\([^
\\t]+\\) \\([^ \\t]+\\) +\\([^ \\t]+\\) \\([^ \\t]+\\) \\([^ \\t]+\\)
\\(.+\\)$" in
> > 
> > Whoa how unreadable :/ The alternative would be using String.nsplit,
> > but it would play badly with filenames with spaces inside the archive.
> > Maybe a faster alternative could be:
> > a) find the 6th empty space, and pick the substring of all the rest
> >    of the string after it
> > b) filter the lines by picking only the wanted file, checking using
(a)
> > c) once a single line is found, use String.nsplit and parse only parts
> >    1 (removing the ':' suffix) and 4
> > 
> > It feels slightly more complex, but this way we can avoid running the
> > regex until we find the right filename.
> 
> The problem is there are multiple spaces used to align the column with
> file sizes. Hence the complicated regexp. :(
I see... in any case, I'd prefer to avoid running the regex on every
line if possible. An idea could be improve our String.nsplit to ignore
empty fields, and thus
  String.nsplit " " "foo  bar" -> ["foo";
"bar"]

-- 
Pino Toscano

Reasonably Related Threads

Search for more possibly parallel threads

Libguestfs - Nov 2016 - Re: [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed

Re: [Libguestfs] [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed

Re: [Libguestfs] [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed

Re: [Libguestfs] [PATCH v2 4/5] v2v: ova: don't extract files from OVA if it's not needed

Reasonably Related Threads