Cédric Bosdonnat
2018-Feb-14 17:40 UTC
[Libguestfs] [PATCH] inspector: rpm summary and description may not be utf-8
The application inspection code assumes the data in the RPM database are encoded in UTF-8. However this is not always the case. As a basic workaround, try to parse the string to UTF-8 and if that fails, try converting it from latin-1. --- inspector/expected-fedora.img.xml | 4 ++ lib/inspect-apps.c | 75 +++++++++++++++++++++++++-- test-data/phony-guests/fedora-packages.db.txt | 4 +- 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/inspector/expected-fedora.img.xml b/inspector/expected-fedora.img.xml index 8d40e8cb7..ffefce177 100644 --- a/inspector/expected-fedora.img.xml +++ b/inspector/expected-fedora.img.xml @@ -33,12 +33,16 @@ <version>1.0</version> <release>1.fc14</release> <arch>x86_64</arch> + <summary>summary with ö</summary> + <description>description with ö</description> </application> <application> <name>test2</name> <version>2.0</version> <release>2.fc14</release> <arch>x86_64</arch> + <summary>summary with ö</summary> + <description>description with ö</description> </application> <application> <name>test3</name> diff --git a/lib/inspect-apps.c b/lib/inspect-apps.c index f0cf16b38..5adfabfe6 100644 --- a/lib/inspect-apps.c +++ b/lib/inspect-apps.c @@ -22,6 +22,7 @@ #include <stdlib.h> #include <unistd.h> #include <string.h> +#include <iconv.h> #ifdef HAVE_ENDIAN_H #include <endian.h> @@ -251,7 +252,7 @@ get_rpm_header_tag (guestfs_h *g, const unsigned char *header_start, /* This function parses the RPM header structure to pull out various * tag strings (version, release, arch, etc.). For more detail on the * header format, see: - * http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html#S2-RPM-FILE-FORMAT-HEADER + * http://rpm.org/devel_doc/file_format.html#24-header-format */ /* The minimum header size that makes sense here is 24 bytes. 
Four @@ -301,6 +302,66 @@ struct read_package_data { struct guestfs_application2_list *apps; }; +static char * +to_utf8 (guestfs_h *g, char *input) +{ + iconv_t cd_utf8_utf8 = (iconv_t)(-1); + iconv_t cd_utf8_latin1 = (iconv_t)(-1); + size_t in_left, out_left, res; + char *in_ptr; + char *out_ptr; + char *output = NULL; + char *result = NULL; + + cd_utf8_utf8 = iconv_open("UTF-8", "UTF-8"); + if (cd_utf8_utf8 == (iconv_t)(-1)) { + perrorf(g, "No iconv UTF-8 encoding"); + goto cleanup; + } + + in_ptr = input; + in_left = strlen(input) + 1; + out_left = in_left * 4; + output = safe_malloc(g, out_left); + out_ptr = output; + + res = iconv(cd_utf8_utf8, &in_ptr, &in_left, &out_ptr, &out_left); + if (res == (size_t)(-1)) { + if (errno == E2BIG) { + perrorf(g, "iconv: '%s', buffer length: %lu", input, strlen(input) * 4); + goto cleanup; + } + + /* Try latin-1 encoding */ + cd_utf8_latin1 = iconv_open("UTF-8", "ISO-8859-1"); + if (cd_utf8_latin1 == (iconv_t)(-1)) { + perrorf(g, "No iconv ISO-8859-1 encoding"); + goto cleanup; + } + + in_ptr = input; + in_left = strlen(input) + 1; + out_left = in_left * 4; + out_ptr = output; + + res = iconv(cd_utf8_latin1, &in_ptr, &in_left, &out_ptr, &out_left); + if (res == (size_t)(-1)) { + perrorf(g, "Failed to parse latin-1: '%s'", input); + goto cleanup; + } + } + + result = output; + + cleanup: + iconv_close(cd_utf8_utf8); + iconv_close(cd_utf8_latin1); + if (!result) + free(output); + + return result; +} + static int read_package (guestfs_h *g, const unsigned char *key, size_t keylen, @@ -311,7 +372,7 @@ read_package (guestfs_h *g, struct rpm_name nkey, *entry; CLEANUP_FREE char *version = NULL, *release = NULL, *epoch_str = NULL, *arch = NULL, *url = NULL, *summary = NULL, - *description = NULL; + *description = NULL, *summary_raw = NULL, *description_raw = NULL; int32_t epoch; /* This function reads one (key, value) pair from the Packages @@ -342,8 +403,14 @@ read_package (guestfs_h *g, epoch_str = get_rpm_header_tag (g, value, 
valuelen, RPMTAG_EPOCH, 'i'); arch = get_rpm_header_tag (g, value, valuelen, RPMTAG_ARCH, 's'); url = get_rpm_header_tag (g, value, valuelen, RPMTAG_URL, 's'); - summary = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's'); - description = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's'); + summary_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's'); + description_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's'); + + /* Try (not too hard) to get UTF-8 */ + if (summary_raw) + summary = to_utf8(g, summary_raw); + if (description_raw) + description = to_utf8(g, description_raw); /* The epoch is stored as big-endian integer. */ if (epoch_str) diff --git a/test-data/phony-guests/fedora-packages.db.txt b/test-data/phony-guests/fedora-packages.db.txt index f16a5aa76..927d6eb5f 100644 --- a/test-data/phony-guests/fedora-packages.db.txt +++ b/test-data/phony-guests/fedora-packages.db.txt @@ -5,9 +5,9 @@ h_nelem=3 db_pagesize=4096 HEADER=END \01\00\00\00 - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\001.0\001.fc14\00x86_64\00 + \00\00\00\05\00\00\00\33\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\21\00\00\00\001.0\001.fc14\00x86_64\00summary with \f6\00description with \f6\00 \02\00\00\00 - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\002.0\002.fc14\00x86_64\00 + 
\00\00\00\05\00\00\00\35\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\22\00\00\00\002.0\002.fc14\00x86_64\00summary with \c3\b6\00description with \c3\b6\00 \03\00\00\00 \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\003.0\003.fc14\00x86_64\00 DATA=END -- 2.16.1
Richard W.M. Jones
2018-Feb-15 14:48 UTC
Re: [Libguestfs] [PATCH] inspector: rpm summary and description may not be utf-8
On Wed, Feb 14, 2018 at 06:40:44PM +0100, Cédric Bosdonnat wrote:
> +static char *
> +to_utf8 (guestfs_h *g, char *input)

What you might want to do to simplify this is to factor out the function local_string_to_utf8 from common/utils/libxml2-utils.c.

First patch would move local_string_to_utf8 into common/utils/utils.c (making it non-static).

Unfortunately there's a small difference in error handling, because your function is calling perrorf, and functions in common/utils/utils.c are not allowed to use the guestfs handle. However it's not a huge problem, you'll just have to call perrorf() from the caller in this file, ie here:

> + /* Try (not too hard) to get UTF-8 */
> + if (summary_raw)
> + summary = to_utf8(g, summary_raw);
                    ^^^ space
> + if (description_raw)
> + description = to_utf8(g, description_raw);
                        ^^^ space

The rest of the patch looks fine to me.

Rich.

--
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
Fedora Windows cross-compiler. Compile Windows programs, test, and build Windows installers. Over 100 libraries supported.
http://fedoraproject.org/wiki/MinGW
Reasonably Related Threads
- [PATCH v2 2/2] inspector: rpm summary and description may not be utf-8
- [PATCH v3 2/2] inspector: rpm summary and description may not be utf-8
- [PATCH v2 0/2] inspect: basic UTF-8 encoding for rpm
- echo cancellation on Blackfin DSK
- [PATCH v3 0/2] inspect: basic UTF-8 encoding for rpm