Cédric Bosdonnat
2018-Feb-28 08:59 UTC
[Libguestfs] [PATCH v3 0/2] inspect: basic UTF-8 encoding for rpm
Diff to v2: * inlined local_string_to_utf8 Cédric Bosdonnat (2): common: extract UTF-8 conversion function inspector: rpm summary and description may not be utf-8 common/utils/guestfs-utils.h | 11 +++++ common/utils/libxml2-utils.c | 69 +-------------------------- common/utils/utils.c | 64 +++++++++++++++++++++++++ inspector/expected-fedora.img.xml | 4 ++ lib/inspect-apps.c | 30 ++++++++++-- test-data/phony-guests/fedora-packages.db.txt | 4 +- 6 files changed, 108 insertions(+), 74 deletions(-) -- 2.16.1
Cédric Bosdonnat
2018-Feb-28 08:59 UTC
[Libguestfs] [PATCH v3 1/2] common: extract UTF-8 conversion function
libxml2-utils.c local_string_to_utf8() function could easily be reused in other places. This commit extracts it with a new parameter to allow giving the encoding of the input string and publishes it in guestfs-utils.h as guestfs_int_string_to_utf8() --- common/utils/guestfs-utils.h | 11 +++++++ common/utils/libxml2-utils.c | 69 +------------------------------------------- common/utils/utils.c | 64 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 68 deletions(-) diff --git a/common/utils/guestfs-utils.h b/common/utils/guestfs-utils.h index 90e7c3dd9..86da693bc 100644 --- a/common/utils/guestfs-utils.h +++ b/common/utils/guestfs-utils.h @@ -33,6 +33,7 @@ #define GUESTFS_UTILS_H_ #include <stdbool.h> +#include <langinfo.h> #include "guestfs-internal-all.h" #include "cleanups.h" @@ -70,6 +71,16 @@ extern int guestfs_int_is_fifo (int64_t mode); extern int guestfs_int_is_lnk (int64_t mode); extern int guestfs_int_is_sock (int64_t mode); extern char *guestfs_int_full_path (const char *dir, const char *name); +extern char *guestfs_int_string_to_utf8 (/* const */ char *input, const char *encoding); + +/* Would be const, but the interface to iconv is not const-correct on + * all platforms. The input string is not touched. + */ +static inline char * +guestfs_int_local_string_to_utf8 (/* const */ char *input) +{ + return guestfs_int_string_to_utf8 (input, nl_langinfo (CODESET)); +} /* Not all language bindings know how to deal with Pointer arguments. * Those that don't will use this macro which complains noisily and diff --git a/common/utils/libxml2-utils.c b/common/utils/libxml2-utils.c index 8a05aa5b1..a71db30dd 100644 --- a/common/utils/libxml2-utils.c +++ b/common/utils/libxml2-utils.c @@ -30,8 +30,6 @@ #include <string.h> #include <errno.h> #include <locale.h> -#include <langinfo.h> -#include <iconv.h> #include <libxml/uri.h> @@ -42,8 +40,6 @@ #include "guestfs-utils.h" #include "libxml2-utils.h" -static char *local_string_to_utf8 (/* const */ char *input); - /** * This is a wrapper around C<xmlParseURI>. That function cannot * handle spaces and some non-ASCII characters found in URIs. This @@ -73,7 +69,7 @@ guestfs_int_parse_nonstandard_uri (const char *arg) xmlURIPtr ret; /* Convert the string to UTF-8. */ - uri = local_string_to_utf8 ((char *) arg); + uri = guestfs_int_local_string_to_utf8 ((char *) arg); if (uri == NULL) return NULL; @@ -113,66 +109,3 @@ guestfs_int_parse_nonstandard_uri (const char *arg) return ret; } - -/* Would be const, but the interface to iconv is not const-correct on - * all platforms. The input string is not touched. - */ -static char * -local_string_to_utf8 (/* const */ char *input) -{ - iconv_t ic; - size_t len, inlen, outlen, outalloc, r, prev; - int err; - char *out, *inp, *outp; - - /* Convert from input locale to UTF-8. */ - ic = iconv_open ("UTF-8", nl_langinfo (CODESET)); - if (ic == (iconv_t) -1) - return NULL; - - len = strlen (input); - outalloc = len; /* Initial guess. */ - - again: - inlen = len; - outlen = outalloc; - out = malloc (outlen + 1); - if (out == NULL) { - err = errno; - iconv_close (ic); - errno = err; - return NULL; - } - inp = input; - outp = out; - - r = iconv (ic, (ICONV_CONST char **) &inp, &inlen, &outp, &outlen); - if (r == (size_t) -1) { - if (errno == E2BIG) { - err = errno; - prev = outalloc; - /* Try again with a larger output buffer. */ - free (out); - outalloc *= 2; - if (outalloc < prev) { - iconv_close (ic); - errno = err; - return NULL; - } - goto again; - } - else { - /* Else some other conversion failure, eg. EILSEQ, EINVAL. */ - err = errno; - iconv_close (ic); - free (out); - errno = err; - return NULL; - } - } - - *outp = '\0'; - iconv_close (ic); - - return out; -} diff --git a/common/utils/utils.c b/common/utils/utils.c index 22af62b0f..faef7c089 100644 --- a/common/utils/utils.c +++ b/common/utils/utils.c @@ -35,6 +35,7 @@ #include <sys/types.h> #include <sys/wait.h> #include <libintl.h> +#include <iconv.h> /* NB: MUST NOT require linking to gnulib, because that will break the * Python 'sdist' which includes a copy of this file. It's OK to @@ -733,3 +734,66 @@ guestfs_int_full_path (const char *dir, const char *name) return path; } + +/* Would be const, but the interface to iconv is not const-correct on + * all platforms. The input string is not touched. + */ +char * +guestfs_int_string_to_utf8 (/* const */ char *input, const char *encoding) +{ + iconv_t ic; + size_t len, inlen, outlen, outalloc, r, prev; + int err; + char *out, *inp, *outp; + + /* Convert from input encoding to UTF-8. */ + ic = iconv_open ("UTF-8", encoding); + if (ic == (iconv_t) -1) + return NULL; + + len = strlen (input); + outalloc = len; /* Initial guess. */ + + again: + inlen = len; + outlen = outalloc; + out = malloc (outlen + 1); + if (out == NULL) { + err = errno; + iconv_close (ic); + errno = err; + return NULL; + } + inp = input; + outp = out; + + r = iconv (ic, (ICONV_CONST char **) &inp, &inlen, &outp, &outlen); + if (r == (size_t) -1) { + if (errno == E2BIG) { + err = errno; + prev = outalloc; + /* Try again with a larger output buffer. */ + free (out); + outalloc *= 2; + if (outalloc < prev) { + iconv_close (ic); + errno = err; + return NULL; + } + goto again; + } + else { + /* Else some other conversion failure, eg. EILSEQ, EINVAL. */ + err = errno; + iconv_close (ic); + free (out); + errno = err; + return NULL; + } + } + + *outp = '\0'; + iconv_close (ic); + + return out; +} -- 2.16.1
Cédric Bosdonnat
2018-Feb-28 08:59 UTC
[Libguestfs] [PATCH v3 2/2] inspector: rpm summary and description may not be utf-8
The application inspection code assumes the data in the RPM database are encoded in UTF-8. However this is not always the case. As a basic workaround, try to parse the string to UTF-8 and if that fails, try converting it from latin-1. --- inspector/expected-fedora.img.xml | 4 ++++ lib/inspect-apps.c | 30 +++++++++++++++++++++++---- test-data/phony-guests/fedora-packages.db.txt | 4 ++-- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/inspector/expected-fedora.img.xml b/inspector/expected-fedora.img.xml index df6060a73..c29f9770e 100644 --- a/inspector/expected-fedora.img.xml +++ b/inspector/expected-fedora.img.xml @@ -34,12 +34,16 @@ <version>1.0</version> <release>1.fc14</release> <arch>x86_64</arch> + <summary>summary with ö</summary> + <description>description with ö</description> </application> <application> <name>test2</name> <version>2.0</version> <release>2.fc14</release> <arch>x86_64</arch> + <summary>summary with ö</summary> + <description>description with ö</description> </application> <application> <name>test3</name> diff --git a/lib/inspect-apps.c b/lib/inspect-apps.c index f0cf16b38..fdea85188 100644 --- a/lib/inspect-apps.c +++ b/lib/inspect-apps.c @@ -22,6 +22,7 @@ #include <stdlib.h> #include <unistd.h> #include <string.h> +#include <iconv.h> #ifdef HAVE_ENDIAN_H #include <endian.h> @@ -43,6 +44,7 @@ #include "guestfs.h" #include "guestfs-internal.h" #include "guestfs-internal-actions.h" +#include "guestfs-utils.h" #include "structs-cleanups.h" #ifdef DB_DUMP @@ -251,7 +253,7 @@ get_rpm_header_tag (guestfs_h *g, const unsigned char *header_start, /* This function parses the RPM header structure to pull out various * tag strings (version, release, arch, etc.). For more detail on the * header format, see: - * http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html#S2-RPM-FILE-FORMAT-HEADER + * http://rpm.org/devel_doc/file_format.html#24-header-format */ /* The minimum header size that makes sense here is 24 bytes. Four @@ -301,6 +303,20 @@ struct read_package_data { struct guestfs_application2_list *apps; }; +static char * +to_utf8 (guestfs_h *g, char *input) +{ + char *out = NULL; + + out = guestfs_int_string_to_utf8 (input, "UTF-8"); + if (!out) { + out = guestfs_int_string_to_utf8 (input, "ISO-8859-1"); + perrorf (g, "Not an UTF-8 or latin-1 string: '%s'", input); + } + + return out; +} + static int read_package (guestfs_h *g, const unsigned char *key, size_t keylen, @@ -311,7 +327,7 @@ read_package (guestfs_h *g, struct rpm_name nkey, *entry; CLEANUP_FREE char *version = NULL, *release = NULL, *epoch_str = NULL, *arch = NULL, *url = NULL, *summary = NULL, - *description = NULL; + *description = NULL, *summary_raw = NULL, *description_raw = NULL; int32_t epoch; /* This function reads one (key, value) pair from the Packages @@ -342,8 +358,14 @@ read_package (guestfs_h *g, epoch_str = get_rpm_header_tag (g, value, valuelen, RPMTAG_EPOCH, 'i'); arch = get_rpm_header_tag (g, value, valuelen, RPMTAG_ARCH, 's'); url = get_rpm_header_tag (g, value, valuelen, RPMTAG_URL, 's'); - summary = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's'); - description = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's'); + summary_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's'); + description_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's'); + + /* Try (not too hard) to get UTF-8 */ + if (summary_raw) + summary = to_utf8 (g, summary_raw); + if (description_raw) + description = to_utf8 (g, description_raw); /* The epoch is stored as big-endian integer. */ if (epoch_str) diff --git a/test-data/phony-guests/fedora-packages.db.txt b/test-data/phony-guests/fedora-packages.db.txt index f16a5aa76..927d6eb5f 100644 --- a/test-data/phony-guests/fedora-packages.db.txt +++ b/test-data/phony-guests/fedora-packages.db.txt @@ -5,9 +5,9 @@ h_nelem=3 db_pagesize=4096 HEADER=END \01\00\00\00 - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\001.0\001.fc14\00x86_64\00 + \00\00\00\05\00\00\00\33\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\21\00\00\00\001.0\001.fc14\00x86_64\00summary with \f6\00description with \f6\00 \02\00\00\00 - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\002.0\002.fc14\00x86_64\00 + \00\00\00\05\00\00\00\35\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\22\00\00\00\002.0\002.fc14\00x86_64\00summary with \c3\b6\00description with \c3\b6\00 \03\00\00\00 \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\003.0\003.fc14\00x86_64\00 DATA=END -- 2.16.1
Apparently Analagous Threads
- [PATCH] Introduce a wrapper around xmlParseURI.
- [PATCH REPOST] Introduce a wrapper around xmlParseURI.
- Fwd: [PATCH hivex] non-ASCII characters in node names
- [PATCH v3 0/2] inspect: basic UTF-8 encoding for rpm
- Re: [PATCH] Add a cache for iconv_t handles to hive_t