Richard W.M. Jones
2018-Nov-02 14:23 UTC
[Libguestfs] [PATCH REPOST] Introduce a wrapper around xmlParseURI.
Previously posted: https://www.redhat.com/archives/libguestfs/2017-December/msg00046.html Rich.
Richard W.M. Jones
2018-Nov-02 14:23 UTC
[Libguestfs] [PATCH REPOST] Introduce a wrapper around xmlParseURI.
We only use xmlParseURI to parse our own "homebrew" URIs, for example the ones used by guestfish --add or virt-v2v. Unfortunately xmlParseURI cannot handle URIs with spaces or other non-RFC-compliant characters so simple commands like these fail: $ guestfish -a 'ssh://example.com/virtual machine.img' guestfish: --add: could not parse URI 'ssh://example.com/virtual machine.img' $ guestfish -a 'ssh://example.com/バーチャルマシン.img' guestfish: --add: could not parse URI 'ssh://example.com/バーチャルマシン.img' This is a usability problem. However since these are not expected to be generic RFC-compliant URIs we can perform the required percent-escaping ourselves instead of demanding that the user does this. Note that the wrapper function should not be used on real URLs or libvirt URLs. --- common/mlxml/Makefile.am | 1 + common/mlxml/xml-c.c | 45 +++++++-- common/mlxml/xml.ml | 1 + common/mlxml/xml.mli | 4 + common/options/uri.c | 5 +- common/utils/Makefile.am | 2 + common/utils/libxml2-utils.c | 178 ++++++++++++++++++++++++++++++++++ common/utils/libxml2-utils.h | 27 ++++++ docs/C_SOURCE_FILES | 2 + v2v/input_vmx.ml | 8 +- v2v/virt-v2v-input-vmware.pod | 5 +- 11 files changed, 260 insertions(+), 18 deletions(-) diff --git a/common/mlxml/Makefile.am b/common/mlxml/Makefile.am index 7f36b743a..95915f80c 100644 --- a/common/mlxml/Makefile.am +++ b/common/mlxml/Makefile.am @@ -53,6 +53,7 @@ libmlxml_a_CPPFLAGS = \ -I. \ -I$(top_builddir) \ -I$(top_srcdir)/gnulib/lib -I$(top_builddir)/gnulib/lib \ + -I$(top_srcdir)/common/utils -I$(top_builddir)/common/utils \ -I$(shell $(OCAMLC) -where) libmlxml_a_CFLAGS = \ $(WARN_CFLAGS) $(WERROR_CFLAGS) \ diff --git a/common/mlxml/xml-c.c b/common/mlxml/xml-c.c index 3ebecb25e..6dcdb5ccb 100644 --- a/common/mlxml/xml-c.c +++ b/common/mlxml/xml-c.c @@ -27,17 +27,21 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <errno.h> #include <caml/alloc.h> #include <caml/custom.h> #include <caml/fail.h> #include <caml/memory.h> #include <caml/mlvalues.h> +#include <caml/unixsupport.h> #include <libxml/xpath.h> #include <libxml/xpathInternals.h> #include <libxml/uri.h> +#include "libxml2-utils.h" + #pragma GCC diagnostic ignored "-Wmissing-prototypes" /* xmlDocPtr type */ @@ -426,16 +430,11 @@ mllib_xml_doc_get_root_element (value docv) } } -value -mllib_xml_parse_uri (value strv) +static value +Val_uri (xmlURIPtr uri) { - CAMLparam1 (strv); + CAMLparam0 (); CAMLlocal3 (rv, sv, ov); - xmlURIPtr uri; - - uri = xmlParseURI (String_val (strv)); - if (uri == NULL) - caml_invalid_argument ("parse_uri: unable to parse URI"); rv = caml_alloc_tuple (9); @@ -514,7 +513,37 @@ mllib_xml_parse_uri (value strv) else ov = Val_int (0); Store_field (rv, 8, ov); + CAMLreturn (rv); +} + +value +mllib_xml_parse_uri (value strv) +{ + CAMLparam1 (strv); + CAMLlocal1 (rv); + xmlURIPtr uri; + + uri = xmlParseURI (String_val (strv)); + if (uri == NULL) + caml_invalid_argument ("parse_uri: unable to parse URI"); + + rv = Val_uri (uri); xmlFreeURI (uri); + CAMLreturn (rv); +} + +value +mllib_xml_parse_nonstandard_uri (value strv) +{ + CAMLparam1 (strv); + CAMLlocal1 (rv); + xmlURIPtr uri; + uri = guestfs_int_parse_nonstandard_uri (String_val (strv)); + if (uri == NULL) + unix_error (errno, (char *) "Xml.parse_uri", strv); + + rv = Val_uri (uri); + xmlFreeURI (uri); CAMLreturn (rv); } diff --git a/common/mlxml/xml.ml b/common/mlxml/xml.ml index 5b5c09c00..faeea35ee 100644 --- a/common/mlxml/xml.ml +++ b/common/mlxml/xml.ml @@ -162,3 +162,4 @@ type uri = { } external parse_uri : string -> uri = "mllib_xml_parse_uri" +external parse_nonstandard_uri : string -> uri = "mllib_xml_parse_nonstandard_uri" diff --git a/common/mlxml/xml.mli b/common/mlxml/xml.mli index f561bd673..73c2fdd4b 100644 --- a/common/mlxml/xml.mli +++ b/common/mlxml/xml.mli @@ -115,3 +115,7 @@ val parse_uri : string -> uri Note this is different from the {!URI} module which is specialized for parsing the [-a] parameter on the command line. This function exposes the full [xmlParseURI] interface. *) + +val parse_nonstandard_uri : string -> uri +(** Similar to {!parse_uri} but only for use with our non-standard + URIs. See [guestfs_int_parse_nonstandard_uri] in [common/utils]. *) diff --git a/common/options/uri.c b/common/options/uri.c index ac36bccb2..88a5f0560 100644 --- a/common/options/uri.c +++ b/common/options/uri.c @@ -38,6 +38,7 @@ #include "guestfs.h" #include "guestfs-utils.h" +#include "libxml2-utils.h" #include "uri.h" static int is_uri (const char *arg); @@ -114,9 +115,9 @@ parse (const char *arg, char **path_ret, char **protocol_ret, CLEANUP_FREE char *socket = NULL; char *path; - uri = xmlParseURI (arg); + uri = guestfs_int_parse_nonstandard_uri (arg); if (!uri) { - fprintf (stderr, _("%s: --add: could not parse URI ‘%s’\n"), + fprintf (stderr, _("%s: --add: could not parse URI ‘%s’: %m\n"), getprogname (), arg); return -1; } diff --git a/common/utils/Makefile.am b/common/utils/Makefile.am index a0154d85f..4ff9efcbd 100644 --- a/common/utils/Makefile.am +++ b/common/utils/Makefile.am @@ -26,6 +26,8 @@ libutils_la_SOURCES = \ gnulib-cleanups.c \ guestfs-utils.h \ libxml2-cleanups.c \ + libxml2-utils.c \ + libxml2-utils.h \ libxml2-writer-macros.h \ utils.c libutils_la_CPPFLAGS = \ diff --git a/common/utils/libxml2-utils.c b/common/utils/libxml2-utils.c new file mode 100644 index 000000000..8a05aa5b1 --- /dev/null +++ b/common/utils/libxml2-utils.c @@ -0,0 +1,178 @@ +/* libguestfs + * Copyright (C) 2017 Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * Utility functions using libxml2. + * + * These functions these I<must not> call internal library functions + * such as C<safe_*>, C<error> or C<perrorf>, or any C<guestfs_int_*>. + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <locale.h> +#include <langinfo.h> +#include <iconv.h> + +#include <libxml/uri.h> + +#include "c-ctype.h" + +/* NB: MUST NOT include "guestfs-internal.h". */ +#include "guestfs.h" +#include "guestfs-utils.h" +#include "libxml2-utils.h" + +static char *local_string_to_utf8 (/* const */ char *input); + +/** + * This is a wrapper around C<xmlParseURI>. That function cannot + * handle spaces and some non-ASCII characters found in URIs. This + * wrapper URI-encodes those before calling C<xmlParseURI> and returns + * the URI structure. + * + * This function should B<only> be called for the URIs that libguestfs + * has invented, for things like guestfish I<--add> and virt-v2v. + * + * For real URIs or libvirt URIs this may cause corruption in corner + * cases. (See L<https://news.ycombinator.com/item?id=11673058> + * describing some of the complexity involved in dealing with real + * URI). + * + * On error, returns C<NULL> and sets C<errno> appropriately. + * + * Caller must call C<xmlFreeURI> on the returned structure or use the + * C<CLEANUP_XMLFREEURI> cleanup macro. + */ +xmlURIPtr +guestfs_int_parse_nonstandard_uri (const char *arg) +{ + CLEANUP_FREE char *uri = NULL; + CLEANUP_FREE char *escaped_uri = NULL; + static const char hexdigit[] = "0123456789abcdef"; + size_t i, j, len; + xmlURIPtr ret; + + /* Convert the string to UTF-8. */ + uri = local_string_to_utf8 ((char *) arg); + if (uri == NULL) + return NULL; + + /* Since we know the URI is in well-formed UTF-8 we can iterate over + * the bytes to do the escaping. The output of this will never be + * more than 3 times larger (each byte might be rewritten as ‘%XX’). + */ + len = strlen (uri); + escaped_uri = malloc (3*len + 1); + if (escaped_uri == NULL) + return NULL; + + for (i = j = 0; i < strlen (uri); ++i) { + /* See RFC 3986 appendix A. Note this leaves existing %-encoded + * escapes alone. + */ + if (c_isalnum (uri[i]) || + strchr ("%-._~:/?#[]@!$&'()*+,;=", uri[i]) != NULL) + escaped_uri[j++] = uri[i]; + else { + escaped_uri[j++] = '%'; + escaped_uri[j++] = hexdigit [(((unsigned char) uri[i]) >> 4) & 0xf]; + escaped_uri[j++] = hexdigit [((unsigned char) uri[i]) & 0xf]; + } + } + escaped_uri[j++] = '\0'; + + /* libxml2 xmlParseURI does not reliably set errno, so it's likely + * best to ignore whatever errno is returned and overwrite it with + * EINVAL. + */ + ret = xmlParseURI (escaped_uri); + if (ret == NULL) { + errno = EINVAL; + return NULL; + } + + return ret; +} + +/* Would be const, but the interface to iconv is not const-correct on + * all platforms. The input string is not touched. + */ +static char * +local_string_to_utf8 (/* const */ char *input) +{ + iconv_t ic; + size_t len, inlen, outlen, outalloc, r, prev; + int err; + char *out, *inp, *outp; + + /* Convert from input locale to UTF-8. */ + ic = iconv_open ("UTF-8", nl_langinfo (CODESET)); + if (ic == (iconv_t) -1) + return NULL; + + len = strlen (input); + outalloc = len; /* Initial guess. */ + + again: + inlen = len; + outlen = outalloc; + out = malloc (outlen + 1); + if (out == NULL) { + err = errno; + iconv_close (ic); + errno = err; + return NULL; + } + inp = input; + outp = out; + + r = iconv (ic, (ICONV_CONST char **) &inp, &inlen, &outp, &outlen); + if (r == (size_t) -1) { + if (errno == E2BIG) { + err = errno; + prev = outalloc; + /* Try again with a larger output buffer. */ + free (out); + outalloc *= 2; + if (outalloc < prev) { + iconv_close (ic); + errno = err; + return NULL; + } + goto again; + } + else { + /* Else some other conversion failure, eg. EILSEQ, EINVAL. */ + err = errno; + iconv_close (ic); + free (out); + errno = err; + return NULL; + } + } + + *outp = '\0'; + iconv_close (ic); + + return out; +} diff --git a/common/utils/libxml2-utils.h b/common/utils/libxml2-utils.h new file mode 100644 index 000000000..d9916ea58 --- /dev/null +++ b/common/utils/libxml2-utils.h @@ -0,0 +1,27 @@ +/* libguestfs + * Copyright (C) 2017 Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef GUESTFS_LIBXML2_UTILS_H_ +#define GUESTFS_LIBXML2_UTILS_H_ + +#include <libxml/uri.h> + +/* libxml2-utils.c */ +extern xmlURIPtr guestfs_int_parse_nonstandard_uri (const char *uri); + +#endif /* GUESTFS_LIBXML2_UTILS_H_ */ diff --git a/docs/C_SOURCE_FILES b/docs/C_SOURCE_FILES index 7f1c60b30..853484e0f 100644 --- a/docs/C_SOURCE_FILES +++ b/docs/C_SOURCE_FILES @@ -62,6 +62,8 @@ common/utils/cleanups.h common/utils/gnulib-cleanups.c common/utils/guestfs-utils.h common/utils/libxml2-cleanups.c +common/utils/libxml2-utils.c +common/utils/libxml2-utils.h common/utils/libxml2-writer-macros.h common/utils/utils.c common/visit/visit.c diff --git a/v2v/input_vmx.ml b/v2v/input_vmx.ml index b169b2537..30649b33b 100644 --- a/v2v/input_vmx.ml +++ b/v2v/input_vmx.ml @@ -38,11 +38,11 @@ type vmx_source let vmx_source_of_arg input_transport arg match input_transport, arg with | None, arg -> File arg - | Some `SSH, arg -> + | Some `SSH, uri -> let uri - try Xml.parse_uri arg - with Invalid_argument _ -> - error (f_"remote vmx ‘%s’ could not be parsed as a URI") arg in + try Xml.parse_nonstandard_uri uri + with exn -> + error (f_"could not parse URI: %s") (Printexc.to_string exn) in if uri.Xml.uri_scheme <> None && uri.Xml.uri_scheme <> Some "ssh" then error (f_"vmx URI start with ‘ssh://...’"); if uri.Xml.uri_server = None then diff --git a/v2v/virt-v2v-input-vmware.pod b/v2v/virt-v2v-input-vmware.pod index 636652e50..f73e1fb55 100644 --- a/v2v/virt-v2v-input-vmware.pod +++ b/v2v/virt-v2v-input-vmware.pod @@ -152,10 +152,7 @@ authorized_keys. When using the SSH input transport you must specify a remote C<ssh://...> URI pointing to the VMX file. A typical URI looks like: - ssh://root@esxi.example.com/vmfs/volumes/datastore1/my%20guest/my%20guest.vmx - -Any space must be escaped with C<%20> and other non-ASCII characters -may also need to be URI-escaped. + ssh://root@esxi.example.com/vmfs/volumes/datastore1/my guest/my guest.vmx The username is not required if it is the same as your local username. -- 2.19.0.rc0
Pino Toscano
2018-Nov-30 12:35 UTC
Re: [Libguestfs] [PATCH REPOST] Introduce a wrapper around xmlParseURI.
On Friday, 2 November 2018 15:23:07 CET Richard W.M. Jones wrote:> We only use xmlParseURI to parse our own "homebrew" URIs, for example > the ones used by guestfish --add or virt-v2v. Unfortunately > xmlParseURI cannot handle URIs with spaces or other non-RFC-compliant > characters so simple commands like these fail: > > $ guestfish -a 'ssh://example.com/virtual machine.img' > guestfish: --add: could not parse URI 'ssh://example.com/virtual machine.img' > > $ guestfish -a 'ssh://example.com/バーチャルマシン.img' > guestfish: --add: could not parse URI 'ssh://example.com/バーチャルマシン.img' > > This is a usability problem. However since these are not expected to > be generic RFC-compliant URIs we can perform the required > percent-escaping ourselves instead of demanding that the user does > this. > > Note that the wrapper function should not be used on real URLs or > libvirt URLs. > ---I do not think this is a good idea at all. First of all, converting the URI to UTF-8 is a bad idea, since that that is not the encoding of the URI. Second, it does a search&replace on the whole string, just skipping some characters however not considering the various parts of an URI. Also, this will break well-formed URIs that use e.g. Punycode. In the end, users must provide compliant URIs anyway, so letting them always do the proper job seems the better option to me. Yes, I know it is not the best option for users manually invoking the tools, but certainly less problematic than dealing with all the possible issues of partially-encoded-URIs. This is also explained by the ycombinator link mentioned in a comment, and how this is a mess in e.g. modern web browsers. Let's not get into this mess, and just stay with the simple, and effective solution: always require compliant URIs. -- Pino Toscano
Seemingly Similar Threads
- [PATCH] Introduce a wrapper around xmlParseURI.
- [PATCH] Introduce a wrapper around xmlParseURI.
- [PATCH REPOST] Introduce a wrapper around xmlParseURI.
- [PATCH v3 0/2] inspect: basic UTF-8 encoding for rpm
- [PATCH v3 1/2] common: extract UTF-8 conversion function