Cédric Bosdonnat
2018-Feb-28 08:59 UTC
[Libguestfs] [PATCH v3 0/2] inspect: basic UTF-8 encoding for rpm
Diff to v2:
* inlined local_string_to_utf8

Cédric Bosdonnat (2):
common: extract UTF-8 conversion function
inspector: rpm summary and description may not be utf-8

common/utils/guestfs-utils.h | 11 +++++
common/utils/libxml2-utils.c | 69 +--------------------------
common/utils/utils.c | 64 +++++++++++++++++++++++++
inspector/expected-fedora.img.xml | 4 ++
lib/inspect-apps.c | 30 ++++++++++--
test-data/phony-guests/fedora-packages.db.txt | 4 +-
6 files changed, 108 insertions(+), 74 deletions(-)
--
2.16.1
Cédric Bosdonnat
2018-Feb-28 08:59 UTC
[Libguestfs] [PATCH v3 1/2] common: extract UTF-8 conversion function
The local_string_to_utf8() function in libxml2-utils.c could easily be
reused in other places. This commit extracts it, adds a new parameter
giving the encoding of the input string, and publishes it in
guestfs-utils.h as guestfs_int_string_to_utf8().
---
common/utils/guestfs-utils.h | 11 +++++++
common/utils/libxml2-utils.c | 69 +-------------------------------------------
common/utils/utils.c | 64 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 76 insertions(+), 68 deletions(-)
diff --git a/common/utils/guestfs-utils.h b/common/utils/guestfs-utils.h
index 90e7c3dd9..86da693bc 100644
--- a/common/utils/guestfs-utils.h
+++ b/common/utils/guestfs-utils.h
@@ -33,6 +33,7 @@
#define GUESTFS_UTILS_H_
#include <stdbool.h>
+#include <langinfo.h>
#include "guestfs-internal-all.h"
#include "cleanups.h"
@@ -70,6 +71,16 @@ extern int guestfs_int_is_fifo (int64_t mode);
extern int guestfs_int_is_lnk (int64_t mode);
extern int guestfs_int_is_sock (int64_t mode);
extern char *guestfs_int_full_path (const char *dir, const char *name);
+extern char *guestfs_int_string_to_utf8 (/* const */ char *input, const char
*encoding);
+
+/* Would be const, but the interface to iconv is not const-correct on
+ * all platforms. The input string is not touched.
+ */
+static inline char *
+guestfs_int_local_string_to_utf8 (/* const */ char *input)
+{
+ return guestfs_int_string_to_utf8 (input, nl_langinfo (CODESET));
+}
/* Not all language bindings know how to deal with Pointer arguments.
* Those that don't will use this macro which complains noisily and
diff --git a/common/utils/libxml2-utils.c b/common/utils/libxml2-utils.c
index 8a05aa5b1..a71db30dd 100644
--- a/common/utils/libxml2-utils.c
+++ b/common/utils/libxml2-utils.c
@@ -30,8 +30,6 @@
#include <string.h>
#include <errno.h>
#include <locale.h>
-#include <langinfo.h>
-#include <iconv.h>
#include <libxml/uri.h>
@@ -42,8 +40,6 @@
#include "guestfs-utils.h"
#include "libxml2-utils.h"
-static char *local_string_to_utf8 (/* const */ char *input);
-
/**
* This is a wrapper around C<xmlParseURI>. That function cannot
* handle spaces and some non-ASCII characters found in URIs. This
@@ -73,7 +69,7 @@ guestfs_int_parse_nonstandard_uri (const char *arg)
xmlURIPtr ret;
/* Convert the string to UTF-8. */
- uri = local_string_to_utf8 ((char *) arg);
+ uri = guestfs_int_local_string_to_utf8 ((char *) arg);
if (uri == NULL)
return NULL;
@@ -113,66 +109,3 @@ guestfs_int_parse_nonstandard_uri (const char *arg)
return ret;
}
-
-/* Would be const, but the interface to iconv is not const-correct on
- * all platforms. The input string is not touched.
- */
-static char *
-local_string_to_utf8 (/* const */ char *input)
-{
- iconv_t ic;
- size_t len, inlen, outlen, outalloc, r, prev;
- int err;
- char *out, *inp, *outp;
-
- /* Convert from input locale to UTF-8. */
- ic = iconv_open ("UTF-8", nl_langinfo (CODESET));
- if (ic == (iconv_t) -1)
- return NULL;
-
- len = strlen (input);
- outalloc = len; /* Initial guess. */
-
- again:
- inlen = len;
- outlen = outalloc;
- out = malloc (outlen + 1);
- if (out == NULL) {
- err = errno;
- iconv_close (ic);
- errno = err;
- return NULL;
- }
- inp = input;
- outp = out;
-
- r = iconv (ic, (ICONV_CONST char **) &inp, &inlen, &outp,
&outlen);
- if (r == (size_t) -1) {
- if (errno == E2BIG) {
- err = errno;
- prev = outalloc;
- /* Try again with a larger output buffer. */
- free (out);
- outalloc *= 2;
- if (outalloc < prev) {
- iconv_close (ic);
- errno = err;
- return NULL;
- }
- goto again;
- }
- else {
- /* Else some other conversion failure, eg. EILSEQ, EINVAL. */
- err = errno;
- iconv_close (ic);
- free (out);
- errno = err;
- return NULL;
- }
- }
-
- *outp = '\0';
- iconv_close (ic);
-
- return out;
-}
diff --git a/common/utils/utils.c b/common/utils/utils.c
index 22af62b0f..faef7c089 100644
--- a/common/utils/utils.c
+++ b/common/utils/utils.c
@@ -35,6 +35,7 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <libintl.h>
+#include <iconv.h>
/* NB: MUST NOT require linking to gnulib, because that will break the
* Python 'sdist' which includes a copy of this file. It's OK to
@@ -733,3 +734,66 @@ guestfs_int_full_path (const char *dir, const char *name)
return path;
}
+
+/* Would be const, but the interface to iconv is not const-correct on
+ * all platforms. The input string is not touched.
+ */
+char *
+guestfs_int_string_to_utf8 (/* const */ char *input, const char *encoding)
+{
+ iconv_t ic;
+ size_t len, inlen, outlen, outalloc, r, prev;
+ int err;
+ char *out, *inp, *outp;
+
+ /* Convert from input encoding to UTF-8. */
+ ic = iconv_open ("UTF-8", encoding);
+ if (ic == (iconv_t) -1)
+ return NULL;
+
+ len = strlen (input);
+ outalloc = len; /* Initial guess. */
+
+ again:
+ inlen = len;
+ outlen = outalloc;
+ out = malloc (outlen + 1);
+ if (out == NULL) {
+ err = errno;
+ iconv_close (ic);
+ errno = err;
+ return NULL;
+ }
+ inp = input;
+ outp = out;
+
+ r = iconv (ic, (ICONV_CONST char **) &inp, &inlen, &outp,
&outlen);
+ if (r == (size_t) -1) {
+ if (errno == E2BIG) {
+ err = errno;
+ prev = outalloc;
+ /* Try again with a larger output buffer. */
+ free (out);
+ outalloc *= 2;
+ if (outalloc < prev) {
+ iconv_close (ic);
+ errno = err;
+ return NULL;
+ }
+ goto again;
+ }
+ else {
+ /* Else some other conversion failure, eg. EILSEQ, EINVAL. */
+ err = errno;
+ iconv_close (ic);
+ free (out);
+ errno = err;
+ return NULL;
+ }
+ }
+
+ *outp = '\0';
+ iconv_close (ic);
+
+ return out;
+}
--
2.16.1
Cédric Bosdonnat
2018-Feb-28 08:59 UTC
[Libguestfs] [PATCH v3 2/2] inspector: rpm summary and description may not be utf-8
The application inspection code assumes the data in the RPM database
are encoded in UTF-8. However this is not always the case.
As a basic workaround, try to parse the string as UTF-8 and, if that
fails, try converting it from latin-1.
---
inspector/expected-fedora.img.xml | 4 ++++
lib/inspect-apps.c | 30 +++++++++++++++++++++++----
test-data/phony-guests/fedora-packages.db.txt | 4 ++--
3 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/inspector/expected-fedora.img.xml
b/inspector/expected-fedora.img.xml
index df6060a73..c29f9770e 100644
--- a/inspector/expected-fedora.img.xml
+++ b/inspector/expected-fedora.img.xml
@@ -34,12 +34,16 @@
<version>1.0</version>
<release>1.fc14</release>
<arch>x86_64</arch>
+ <summary>summary with ö</summary>
+ <description>description with ö</description>
</application>
<application>
<name>test2</name>
<version>2.0</version>
<release>2.fc14</release>
<arch>x86_64</arch>
+ <summary>summary with ö</summary>
+ <description>description with ö</description>
</application>
<application>
<name>test3</name>
diff --git a/lib/inspect-apps.c b/lib/inspect-apps.c
index f0cf16b38..fdea85188 100644
--- a/lib/inspect-apps.c
+++ b/lib/inspect-apps.c
@@ -22,6 +22,7 @@
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#include <iconv.h>
#ifdef HAVE_ENDIAN_H
#include <endian.h>
@@ -43,6 +44,7 @@
#include "guestfs.h"
#include "guestfs-internal.h"
#include "guestfs-internal-actions.h"
+#include "guestfs-utils.h"
#include "structs-cleanups.h"
#ifdef DB_DUMP
@@ -251,7 +253,7 @@ get_rpm_header_tag (guestfs_h *g, const unsigned char
*header_start,
/* This function parses the RPM header structure to pull out various
* tag strings (version, release, arch, etc.). For more detail on the
* header format, see:
- *
http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html#S2-RPM-FILE-FORMAT-HEADER
+ * http://rpm.org/devel_doc/file_format.html#24-header-format
*/
/* The minimum header size that makes sense here is 24 bytes. Four
@@ -301,6 +303,20 @@ struct read_package_data {
struct guestfs_application2_list *apps;
};
+/* Return a UTF-8 copy of input as produced by
+ * guestfs_int_string_to_utf8, or NULL on failure. The string is first
+ * converted as UTF-8 (which validates it); if that fails it is
+ * converted from ISO-8859-1 (latin-1) instead. perrorf is called only
+ * when both conversions fail, so a successful latin-1 fallback does not
+ * raise an error on the handle.
+ */
+static char *
+to_utf8 (guestfs_h *g, char *input)
+{
+ char *out;
+
+ out = guestfs_int_string_to_utf8 (input, "UTF-8");
+ if (!out) {
+ out = guestfs_int_string_to_utf8 (input, "ISO-8859-1");
+ /* Only report an error if the latin-1 fallback failed as well. */
+ if (!out)
+ perrorf (g, "Not an UTF-8 or latin-1 string: '%s'", input);
+ }
+
+ return out;
+}
+
static int
read_package (guestfs_h *g,
const unsigned char *key, size_t keylen,
@@ -311,7 +327,7 @@ read_package (guestfs_h *g,
struct rpm_name nkey, *entry;
CLEANUP_FREE char *version = NULL, *release = NULL,
*epoch_str = NULL, *arch = NULL, *url = NULL, *summary = NULL,
- *description = NULL;
+ *description = NULL, *summary_raw = NULL, *description_raw = NULL;
int32_t epoch;
/* This function reads one (key, value) pair from the Packages
@@ -342,8 +358,14 @@ read_package (guestfs_h *g,
epoch_str = get_rpm_header_tag (g, value, valuelen, RPMTAG_EPOCH,
'i');
arch = get_rpm_header_tag (g, value, valuelen, RPMTAG_ARCH, 's');
url = get_rpm_header_tag (g, value, valuelen, RPMTAG_URL, 's');
- summary = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY,
's');
- description = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION,
's');
+ summary_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY,
's');
+ description_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION,
's');
+
+ /* Try (not too hard) to get UTF-8 */
+ if (summary_raw)
+ summary = to_utf8 (g, summary_raw);
+ if (description_raw)
+ description = to_utf8 (g, description_raw);
/* The epoch is stored as big-endian integer. */
if (epoch_str)
diff --git a/test-data/phony-guests/fedora-packages.db.txt
b/test-data/phony-guests/fedora-packages.db.txt
index f16a5aa76..927d6eb5f 100644
--- a/test-data/phony-guests/fedora-packages.db.txt
+++ b/test-data/phony-guests/fedora-packages.db.txt
@@ -5,9 +5,9 @@ h_nelem=3
db_pagesize=4096
HEADER=END
\01\00\00\00
-
\00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\001.0\001.fc14\00x86_64\00
+
\00\00\00\05\00\00\00\33\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\21\00\00\00\001.0\001.fc14\00x86_64\00summary
with \f6\00description with \f6\00
\02\00\00\00
-
\00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\002.0\002.fc14\00x86_64\00
+
\00\00\00\05\00\00\00\35\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\22\00\00\00\002.0\002.fc14\00x86_64\00summary
with \c3\b6\00description with \c3\b6\00
\03\00\00\00
\00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\003.0\003.fc14\00x86_64\00
DATA=END
--
2.16.1
Maybe Matching Threads
- [PATCH v2 0/2] inspect: basic UTF-8 encoding for rpm
- [PATCH] inspector: rpm summary and description may not be utf-8
- [PATCH] Introduce a wrapper around xmlParseURI.
- [PATCH REPOST] Introduce a wrapper around xmlParseURI.
- [PATCH] Add a cache for iconv_t handles to hive_t