[Libguestfs] [PATCH] inspector: rpm summary and description may not be utf-8

Cédric Bosdonnat cbosdonnat at suse.com
Wed Feb 14 17:40:44 UTC 2018


The application inspection code assumes the data in the RPM database
are encoded in UTF-8. However, this is not always the case.

As a basic workaround, try to parse the string as UTF-8 and, if that
fails, convert it from Latin-1 (ISO-8859-1).
---
 inspector/expected-fedora.img.xml             |  4 ++
 lib/inspect-apps.c                            | 75 +++++++++++++++++++++++++--
 test-data/phony-guests/fedora-packages.db.txt |  4 +-
 3 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/inspector/expected-fedora.img.xml b/inspector/expected-fedora.img.xml
index 8d40e8cb7..ffefce177 100644
--- a/inspector/expected-fedora.img.xml
+++ b/inspector/expected-fedora.img.xml
@@ -33,12 +33,16 @@
         <version>1.0</version>
         <release>1.fc14</release>
         <arch>x86_64</arch>
+        <summary>summary with ö</summary>
+        <description>description with ö</description>
       </application>
       <application>
         <name>test2</name>
         <version>2.0</version>
         <release>2.fc14</release>
         <arch>x86_64</arch>
+        <summary>summary with ö</summary>
+        <description>description with ö</description>
       </application>
       <application>
         <name>test3</name>
diff --git a/lib/inspect-apps.c b/lib/inspect-apps.c
index f0cf16b38..5adfabfe6 100644
--- a/lib/inspect-apps.c
+++ b/lib/inspect-apps.c
@@ -22,6 +22,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
+#include <iconv.h>
 
 #ifdef HAVE_ENDIAN_H
 #include <endian.h>
@@ -251,7 +252,7 @@ get_rpm_header_tag (guestfs_h *g, const unsigned char *header_start,
   /* This function parses the RPM header structure to pull out various
    * tag strings (version, release, arch, etc.).  For more detail on the
    * header format, see:
-   * http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html#S2-RPM-FILE-FORMAT-HEADER
+   * http://rpm.org/devel_doc/file_format.html#24-header-format
    */
 
   /* The minimum header size that makes sense here is 24 bytes.  Four
@@ -301,6 +302,66 @@ struct read_package_data {
   struct guestfs_application2_list *apps;
 };
 
+/* Return INPUT as a newly allocated UTF-8 string which the caller must free.
+ * INPUT is first validated as UTF-8 and, failing that, converted from
+ * ISO-8859-1.  On error, report it through perrorf and return NULL.
+ */
+static char *
+to_utf8 (guestfs_h *g, char *input)
+{
+  iconv_t cd_utf8_utf8 = (iconv_t)(-1);
+  iconv_t cd_utf8_latin1 = (iconv_t)(-1);
+  /* The terminating NUL is converted too, so the output is terminated. */
+  const size_t in_size = strlen(input) + 1;
+  const size_t out_size = in_size * 4;
+  size_t in_left = in_size, out_left = out_size, res;
+  char *in_ptr = input, *out_ptr;
+  char *output = NULL, *result = NULL;
+
+  cd_utf8_utf8 = iconv_open("UTF-8", "UTF-8");
+  if (cd_utf8_utf8 == (iconv_t)(-1)) {
+    perrorf(g, "No iconv UTF-8 encoding");
+    goto cleanup;
+  }
+
+  out_ptr = output = safe_malloc(g, out_size);
+  res = iconv(cd_utf8_utf8, &in_ptr, &in_left, &out_ptr, &out_left);
+  if (res == (size_t)(-1)) {
+    if (errno == E2BIG) {
+      perrorf(g, "iconv: '%s', buffer length: %zu", input, out_size);
+      goto cleanup;
+    }
+    /* Try latin-1 encoding */
+    cd_utf8_latin1 = iconv_open("UTF-8", "ISO-8859-1");
+    if (cd_utf8_latin1 == (iconv_t)(-1)) {
+      perrorf(g, "No iconv ISO-8859-1 encoding");
+      goto cleanup;
+    }
+    /* Start over: the failed attempt may have moved cursors and counters. */
+    in_ptr = input;
+    in_left = in_size;
+    out_ptr = output;
+    out_left = out_size;
+    res = iconv(cd_utf8_latin1, &in_ptr, &in_left, &out_ptr, &out_left);
+    if (res == (size_t)(-1)) {
+      perrorf(g, "Failed to parse latin-1: '%s'", input);
+      goto cleanup;
+    }
+  }
+
+  result = output;
+
+ cleanup:
+  /* Only close descriptors that were opened; (iconv_t)(-1) is not one. */
+  if (cd_utf8_utf8 != (iconv_t)(-1))
+    iconv_close(cd_utf8_utf8);
+  if (cd_utf8_latin1 != (iconv_t)(-1))
+    iconv_close(cd_utf8_latin1);
+  if (!result)
+    free(output);
+  return result;
+}
+
 static int
 read_package (guestfs_h *g,
               const unsigned char *key, size_t keylen,
@@ -311,7 +372,7 @@ read_package (guestfs_h *g,
   struct rpm_name nkey, *entry;
   CLEANUP_FREE char *version = NULL, *release = NULL,
     *epoch_str = NULL, *arch = NULL, *url = NULL, *summary = NULL,
-    *description = NULL;
+    *description = NULL, *summary_raw = NULL, *description_raw = NULL;
   int32_t epoch;
 
   /* This function reads one (key, value) pair from the Packages
@@ -342,8 +403,14 @@ read_package (guestfs_h *g,
   epoch_str = get_rpm_header_tag (g, value, valuelen, RPMTAG_EPOCH, 'i');
   arch = get_rpm_header_tag (g, value, valuelen, RPMTAG_ARCH, 's');
   url = get_rpm_header_tag (g, value, valuelen, RPMTAG_URL, 's');
-  summary = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's');
-  description = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's');
+  summary_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's');
+  description_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's');
+
+  /* Try (not too hard) to get UTF-8 */
+  if (summary_raw)
+    summary = to_utf8(g, summary_raw);
+  if (description_raw)
+    description = to_utf8(g, description_raw);
 
   /* The epoch is stored as big-endian integer. */
   if (epoch_str)
diff --git a/test-data/phony-guests/fedora-packages.db.txt b/test-data/phony-guests/fedora-packages.db.txt
index f16a5aa76..927d6eb5f 100644
--- a/test-data/phony-guests/fedora-packages.db.txt
+++ b/test-data/phony-guests/fedora-packages.db.txt
@@ -5,9 +5,9 @@ h_nelem=3
 db_pagesize=4096
 HEADER=END
  \01\00\00\00
- \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\001.0\001.fc14\00x86_64\00
+ \00\00\00\05\00\00\00\33\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\21\00\00\00\001.0\001.fc14\00x86_64\00summary with \f6\00description with \f6\00
  \02\00\00\00
- \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\002.0\002.fc14\00x86_64\00
+ \00\00\00\05\00\00\00\35\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\22\00\00\00\002.0\002.fc14\00x86_64\00summary with \c3\b6\00description with \c3\b6\00
  \03\00\00\00
  \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\003.0\003.fc14\00x86_64\00
 DATA=END
-- 
2.16.1




More information about the Libguestfs mailing list