Cédric Bosdonnat
2018-Feb-15  17:06 UTC
[Libguestfs] [PATCH v2 0/2] inspect: basic UTF-8 encoding for rpm
This needs Richard's patch: https://www.redhat.com/archives/libguestfs/2018-February/msg00099.html Diff to v1: * factorized the UTF-8 conversion functions * small style fixes Cédric Bosdonnat (2): common: extract UTF-8 conversion function inspector: rpm summary and description may not be utf-8 common/utils/guestfs-utils.h | 1 + common/utils/libxml2-utils.c | 57 +----------------------- common/utils/utils.c | 64 +++++++++++++++++++++++++++ inspector/expected-fedora.img.xml | 4 ++ lib/inspect-apps.c | 30 +++++++++++-- test-data/phony-guests/fedora-packages.db.txt | 4 +- 6 files changed, 98 insertions(+), 62 deletions(-) -- 2.16.1
Cédric Bosdonnat
2018-Feb-15  17:06 UTC
[Libguestfs] [PATCH v2 1/2] common: extract UTF-8 conversion function
libxml2-utils.c local_string_to_utf8() function could easily be reused
in other places. This commit extracts it with a new parameter to allow
giving the encoding of the input string and publishes it in
guestfs-utils.h as guestfs_int_string_to_utf8()
---
 common/utils/guestfs-utils.h |  1 +
 common/utils/libxml2-utils.c | 57 +--------------------------------------
 common/utils/utils.c         | 64 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 56 deletions(-)
diff --git a/common/utils/guestfs-utils.h b/common/utils/guestfs-utils.h
index 90e7c3dd9..68e704a2c 100644
--- a/common/utils/guestfs-utils.h
+++ b/common/utils/guestfs-utils.h
@@ -70,6 +70,7 @@ extern int guestfs_int_is_fifo (int64_t mode);
 extern int guestfs_int_is_lnk (int64_t mode);
 extern int guestfs_int_is_sock (int64_t mode);
 extern char *guestfs_int_full_path (const char *dir, const char *name);
+extern char * guestfs_int_string_to_utf8 (/* const */ char *input, const char
*encoding);
 
 /* Not all language bindings know how to deal with Pointer arguments.
  * Those that don't will use this macro which complains noisily and
diff --git a/common/utils/libxml2-utils.c b/common/utils/libxml2-utils.c
index 8a05aa5b1..f83bb177e 100644
--- a/common/utils/libxml2-utils.c
+++ b/common/utils/libxml2-utils.c
@@ -31,7 +31,6 @@
 #include <errno.h>
 #include <locale.h>
 #include <langinfo.h>
-#include <iconv.h>
 
 #include <libxml/uri.h>
 
@@ -120,59 +119,5 @@ guestfs_int_parse_nonstandard_uri (const char *arg)
 static char *
 local_string_to_utf8 (/* const */ char *input)
 {
-  iconv_t ic;
-  size_t len, inlen, outlen, outalloc, r, prev;
-  int err;
-  char *out, *inp, *outp;
-
-  /* Convert from input locale to UTF-8. */
-  ic = iconv_open ("UTF-8", nl_langinfo (CODESET));
-  if (ic == (iconv_t) -1)
-    return NULL;
-
-  len = strlen (input);
-  outalloc = len;               /* Initial guess. */
-
- again:
-  inlen = len;
-  outlen = outalloc;
-  out = malloc (outlen + 1);
-  if (out == NULL) {
-    err = errno;
-    iconv_close (ic);
-    errno = err;
-    return NULL;
-  }
-  inp = input;
-  outp = out;
-
-  r = iconv (ic, (ICONV_CONST char **) &inp, &inlen, &outp,
&outlen);
-  if (r == (size_t) -1) {
-    if (errno == E2BIG) {
-      err = errno;
-      prev = outalloc;
-      /* Try again with a larger output buffer. */
-      free (out);
-      outalloc *= 2;
-      if (outalloc < prev) {
-        iconv_close (ic);
-        errno = err;
-        return NULL;
-      }
-      goto again;
-    }
-    else {
-      /* Else some other conversion failure, eg. EILSEQ, EINVAL. */
-      err = errno;
-      iconv_close (ic);
-      free (out);
-      errno = err;
-      return NULL;
-    }
-  }
-
-  *outp = '\0';
-  iconv_close (ic);
-
-  return out;
+  return guestfs_int_string_to_utf8 (input, nl_langinfo (CODESET));
 }
diff --git a/common/utils/utils.c b/common/utils/utils.c
index 22af62b0f..faef7c089 100644
--- a/common/utils/utils.c
+++ b/common/utils/utils.c
@@ -35,6 +35,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <libintl.h>
+#include <iconv.h>
 
 /* NB: MUST NOT require linking to gnulib, because that will break the
  * Python 'sdist' which includes a copy of this file.  It's OK to
@@ -733,3 +734,66 @@ guestfs_int_full_path (const char *dir, const char *name)
 
   return path;
 }
+
+/* Would be const, but the interface to iconv is not const-correct on
+ * all platforms.  The input string is not touched.
+ */
+char *
+guestfs_int_string_to_utf8 (/* const */ char *input, const char *encoding)
+{
+  iconv_t ic;
+  size_t len, inlen, outlen, outalloc, r, prev;
+  int err;
+  char *out, *inp, *outp;
+
+  /* Convert from input encoding to UTF-8. */
+  ic = iconv_open ("UTF-8", encoding);
+  if (ic == (iconv_t) -1)
+    return NULL;
+
+  len = strlen (input);
+  outalloc = len;               /* Initial guess. */
+
+ again:
+  inlen = len;
+  outlen = outalloc;
+  out = malloc (outlen + 1);
+  if (out == NULL) {
+    err = errno;
+    iconv_close (ic);
+    errno = err;
+    return NULL;
+  }
+  inp = input;
+  outp = out;
+
+  r = iconv (ic, (ICONV_CONST char **) &inp, &inlen, &outp,
&outlen);
+  if (r == (size_t) -1) {
+    if (errno == E2BIG) {
+      err = errno;
+      prev = outalloc;
+      /* Try again with a larger output buffer. */
+      free (out);
+      outalloc *= 2;
+      if (outalloc < prev) {
+        iconv_close (ic);
+        errno = err;
+        return NULL;
+      }
+      goto again;
+    }
+    else {
+      /* Else some other conversion failure, eg. EILSEQ, EINVAL. */
+      err = errno;
+      iconv_close (ic);
+      free (out);
+      errno = err;
+      return NULL;
+    }
+  }
+
+  *outp = '\0';
+  iconv_close (ic);
+
+  return out;
+}
-- 
2.16.1
Cédric Bosdonnat
2018-Feb-15  17:06 UTC
[Libguestfs] [PATCH v2 2/2] inspector: rpm summary and description may not be utf-8
The application inspection code assumes the data in the RPM database
are encoded in UTF-8. However this is not always the case.
As a basic workaround, try to parse the string to UTF-8 and if that
fails, try converting it from latin-1.
---
 inspector/expected-fedora.img.xml             |  4 ++++
 lib/inspect-apps.c                            | 30 +++++++++++++++++++++++----
 test-data/phony-guests/fedora-packages.db.txt |  4 ++--
 3 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/inspector/expected-fedora.img.xml
b/inspector/expected-fedora.img.xml
index 8d40e8cb7..ffefce177 100644
--- a/inspector/expected-fedora.img.xml
+++ b/inspector/expected-fedora.img.xml
@@ -33,12 +33,16 @@
         <version>1.0</version>
         <release>1.fc14</release>
         <arch>x86_64</arch>
+        <summary>summary with ö</summary>
+        <description>description with ö</description>
       </application>
       <application>
         <name>test2</name>
         <version>2.0</version>
         <release>2.fc14</release>
         <arch>x86_64</arch>
+        <summary>summary with ö</summary>
+        <description>description with ö</description>
       </application>
       <application>
         <name>test3</name>
diff --git a/lib/inspect-apps.c b/lib/inspect-apps.c
index f0cf16b38..fdea85188 100644
--- a/lib/inspect-apps.c
+++ b/lib/inspect-apps.c
@@ -22,6 +22,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
+#include <iconv.h>
 
 #ifdef HAVE_ENDIAN_H
 #include <endian.h>
@@ -43,6 +44,7 @@
 #include "guestfs.h"
 #include "guestfs-internal.h"
 #include "guestfs-internal-actions.h"
+#include "guestfs-utils.h"
 #include "structs-cleanups.h"
 
 #ifdef DB_DUMP
@@ -251,7 +253,7 @@ get_rpm_header_tag (guestfs_h *g, const unsigned char
*header_start,
   /* This function parses the RPM header structure to pull out various
    * tag strings (version, release, arch, etc.).  For more detail on the
    * header format, see:
-   *
http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html#S2-RPM-FILE-FORMAT-HEADER
+   * http://rpm.org/devel_doc/file_format.html#24-header-format
    */
 
   /* The minimum header size that makes sense here is 24 bytes.  Four
@@ -301,6 +303,20 @@ struct read_package_data {
   struct guestfs_application2_list *apps;
 };
 
+static char *
+to_utf8 (guestfs_h *g, char *input)
+{
+  char *out = NULL;
+
+  out = guestfs_int_string_to_utf8 (input, "UTF-8");
+  if (!out) {
+    out = guestfs_int_string_to_utf8 (input, "ISO-8859-1");
+    perrorf (g, "Not an UTF-8 or latin-1 string: '%s'",
input);
+  }
+
+  return out;
+}
+
 static int
 read_package (guestfs_h *g,
               const unsigned char *key, size_t keylen,
@@ -311,7 +327,7 @@ read_package (guestfs_h *g,
   struct rpm_name nkey, *entry;
   CLEANUP_FREE char *version = NULL, *release = NULL,
     *epoch_str = NULL, *arch = NULL, *url = NULL, *summary = NULL,
-    *description = NULL;
+    *description = NULL, *summary_raw = NULL, *description_raw = NULL;
   int32_t epoch;
 
   /* This function reads one (key, value) pair from the Packages
@@ -342,8 +358,14 @@ read_package (guestfs_h *g,
   epoch_str = get_rpm_header_tag (g, value, valuelen, RPMTAG_EPOCH,
'i');
   arch = get_rpm_header_tag (g, value, valuelen, RPMTAG_ARCH, 's');
   url = get_rpm_header_tag (g, value, valuelen, RPMTAG_URL, 's');
-  summary = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY,
's');
-  description = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION,
's');
+  summary_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY,
's');
+  description_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION,
's');
+
+  /* Try (not too hard) to get UTF-8 */
+  if (summary_raw)
+    summary = to_utf8 (g, summary_raw);
+  if (description_raw)
+    description = to_utf8 (g, description_raw);
 
   /* The epoch is stored as big-endian integer. */
   if (epoch_str)
diff --git a/test-data/phony-guests/fedora-packages.db.txt
b/test-data/phony-guests/fedora-packages.db.txt
index f16a5aa76..927d6eb5f 100644
--- a/test-data/phony-guests/fedora-packages.db.txt
+++ b/test-data/phony-guests/fedora-packages.db.txt
@@ -5,9 +5,9 @@ h_nelem=3
 db_pagesize=4096
 HEADER=END
  \01\00\00\00
-
\00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\001.0\001.fc14\00x86_64\00
+
\00\00\00\05\00\00\00\33\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\21\00\00\00\001.0\001.fc14\00x86_64\00summary
with \f6\00description with \f6\00
  \02\00\00\00
-
\00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\002.0\002.fc14\00x86_64\00
+
\00\00\00\05\00\00\00\35\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\22\00\00\00\002.0\002.fc14\00x86_64\00summary
with \c3\b6\00description with \c3\b6\00
  \03\00\00\00
 
\00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\003.0\003.fc14\00x86_64\00
 DATA=END
-- 
2.16.1
Richard W.M. Jones
2018-Feb-20  09:50 UTC
Re: [Libguestfs] [PATCH v2 2/2] inspector: rpm summary and description may not be utf-8
On Thu, Feb 15, 2018 at 06:06:06PM +0100, Cédric Bosdonnat wrote:> The application inspection code assumes the data in the RPM database > are encoded in UTF-8. However this is not always the case. > > As a basic workaround, try to parse the string to UTF-8 and if that > fails, try converting it from latin-1. > --- > inspector/expected-fedora.img.xml | 4 ++++ > lib/inspect-apps.c | 30 +++++++++++++++++++++++---- > test-data/phony-guests/fedora-packages.db.txt | 4 ++-- > 3 files changed, 32 insertions(+), 6 deletions(-) > > diff --git a/inspector/expected-fedora.img.xml b/inspector/expected-fedora.img.xml > index 8d40e8cb7..ffefce177 100644 > --- a/inspector/expected-fedora.img.xml > +++ b/inspector/expected-fedora.img.xml > @@ -33,12 +33,16 @@ > <version>1.0</version> > <release>1.fc14</release> > <arch>x86_64</arch> > + <summary>summary with ö</summary> > + <description>description with ö</description> > </application> > <application> > <name>test2</name> > <version>2.0</version> > <release>2.fc14</release> > <arch>x86_64</arch> > + <summary>summary with ö</summary> > + <description>description with ö</description> > </application> > <application> > <name>test3</name> > diff --git a/lib/inspect-apps.c b/lib/inspect-apps.c > index f0cf16b38..fdea85188 100644 > --- a/lib/inspect-apps.c > +++ b/lib/inspect-apps.c > @@ -22,6 +22,7 @@ > #include <stdlib.h> > #include <unistd.h> > #include <string.h> > +#include <iconv.h> > > #ifdef HAVE_ENDIAN_H > #include <endian.h> > @@ -43,6 +44,7 @@ > #include "guestfs.h" > #include "guestfs-internal.h" > #include "guestfs-internal-actions.h" > +#include "guestfs-utils.h" > #include "structs-cleanups.h" > > #ifdef DB_DUMP > @@ -251,7 +253,7 @@ get_rpm_header_tag (guestfs_h *g, const unsigned char *header_start, > /* This function parses the RPM header structure to pull out various > * tag strings (version, release, arch, etc.). For more detail on the > * header format, see: > - * http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html#S2-RPM-FILE-FORMAT-HEADER > + * http://rpm.org/devel_doc/file_format.html#24-header-format > */ > > /* The minimum header size that makes sense here is 24 bytes. Four > @@ -301,6 +303,20 @@ struct read_package_data { > struct guestfs_application2_list *apps; > }; > > +static char * > +to_utf8 (guestfs_h *g, char *input) > +{ > + char *out = NULL; > + > + out = guestfs_int_string_to_utf8 (input, "UTF-8"); > + if (!out) { > + out = guestfs_int_string_to_utf8 (input, "ISO-8859-1"); > + perrorf (g, "Not an UTF-8 or latin-1 string: '%s'", input); > + } > + > + return out; > +} > + > static int > read_package (guestfs_h *g, > const unsigned char *key, size_t keylen, > @@ -311,7 +327,7 @@ read_package (guestfs_h *g, > struct rpm_name nkey, *entry; > CLEANUP_FREE char *version = NULL, *release = NULL, > *epoch_str = NULL, *arch = NULL, *url = NULL, *summary = NULL, > - *description = NULL; > + *description = NULL, *summary_raw = NULL, *description_raw = NULL; > int32_t epoch; > > /* This function reads one (key, value) pair from the Packages > @@ -342,8 +358,14 @@ read_package (guestfs_h *g, > epoch_str = get_rpm_header_tag (g, value, valuelen, RPMTAG_EPOCH, 'i'); > arch = get_rpm_header_tag (g, value, valuelen, RPMTAG_ARCH, 's'); > url = get_rpm_header_tag (g, value, valuelen, RPMTAG_URL, 's'); > - summary = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's'); > - description = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's'); > + summary_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's'); > + description_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's'); > + > + /* Try (not too hard) to get UTF-8 */ > + if (summary_raw) > + summary = to_utf8 (g, summary_raw); > + if (description_raw) > + description = to_utf8 (g, description_raw); > > /* The epoch is stored as big-endian integer. */ > if (epoch_str) > diff --git a/test-data/phony-guests/fedora-packages.db.txt b/test-data/phony-guests/fedora-packages.db.txt > index f16a5aa76..927d6eb5f 100644 > --- a/test-data/phony-guests/fedora-packages.db.txt > +++ b/test-data/phony-guests/fedora-packages.db.txt > @@ -5,9 +5,9 @@ h_nelem=3 > db_pagesize=4096 > HEADER=END > \01\00\00\00 > - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\001.0\001.fc14\00x86_64\00 > + \00\00\00\05\00\00\00\33\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\21\00\00\00\001.0\001.fc14\00x86_64\00summary with \f6\00description with \f6\00 > \02\00\00\00 > - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\002.0\002.fc14\00x86_64\00 > + \00\00\00\05\00\00\00\35\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\22\00\00\00\002.0\002.fc14\00x86_64\00summary with \c3\b6\00description with \c3\b6\00 > \03\00\00\00 > \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\003.0\003.fc14\00x86_64\00 > DATA=ENDThis seems fine now, although I would have preferred local_string_to_utf8 to be inlined since it is only used in a single place. However my patch needs to be reviewed first. Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com Fedora Windows cross-compiler. Compile Windows programs, test, and build Windows installers. Over 100 libraries supported. http://fedoraproject.org/wiki/MinGW
Seemingly Similar Threads
- [PATCH v3 0/2] inspect: basic UTF-8 encoding for rpm
- [PATCH] inspector: rpm summary and description may not be utf-8
- [PATCH v2 2/2] inspector: rpm summary and description may not be utf-8
- [PATCH v3 2/2] inspector: rpm summary and description may not be utf-8
- [PATCH 1/3] inspect: add source and summary to internal add_application