[Libguestfs] [PATCH v2 2/2] inspector: rpm summary and description may not be utf-8

Richard W.M. Jones rjones at redhat.com
Tue Feb 20 09:50:20 UTC 2018


On Thu, Feb 15, 2018 at 06:06:06PM +0100, Cédric Bosdonnat wrote:
> The application inspection code assumes the data in the RPM database
> are encoded in UTF-8. However this is not always the case.
> 
> As a basic workaround, try to parse the string to UTF-8 and if that
> fails, try converting it from latin-1.
> ---
>  inspector/expected-fedora.img.xml             |  4 ++++
>  lib/inspect-apps.c                            | 30 +++++++++++++++++++++++----
>  test-data/phony-guests/fedora-packages.db.txt |  4 ++--
>  3 files changed, 32 insertions(+), 6 deletions(-)
> 
> diff --git a/inspector/expected-fedora.img.xml b/inspector/expected-fedora.img.xml
> index 8d40e8cb7..ffefce177 100644
> --- a/inspector/expected-fedora.img.xml
> +++ b/inspector/expected-fedora.img.xml
> @@ -33,12 +33,16 @@
>          <version>1.0</version>
>          <release>1.fc14</release>
>          <arch>x86_64</arch>
> +        <summary>summary with ö</summary>
> +        <description>description with ö</description>
>        </application>
>        <application>
>          <name>test2</name>
>          <version>2.0</version>
>          <release>2.fc14</release>
>          <arch>x86_64</arch>
> +        <summary>summary with ö</summary>
> +        <description>description with ö</description>
>        </application>
>        <application>
>          <name>test3</name>
> diff --git a/lib/inspect-apps.c b/lib/inspect-apps.c
> index f0cf16b38..fdea85188 100644
> --- a/lib/inspect-apps.c
> +++ b/lib/inspect-apps.c
> @@ -22,6 +22,7 @@
>  #include <stdlib.h>
>  #include <unistd.h>
>  #include <string.h>
> +#include <iconv.h>
>  
>  #ifdef HAVE_ENDIAN_H
>  #include <endian.h>
> @@ -43,6 +44,7 @@
>  #include "guestfs.h"
>  #include "guestfs-internal.h"
>  #include "guestfs-internal-actions.h"
> +#include "guestfs-utils.h"
>  #include "structs-cleanups.h"
>  
>  #ifdef DB_DUMP
> @@ -251,7 +253,7 @@ get_rpm_header_tag (guestfs_h *g, const unsigned char *header_start,
>    /* This function parses the RPM header structure to pull out various
>     * tag strings (version, release, arch, etc.).  For more detail on the
>     * header format, see:
> -   * http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html#S2-RPM-FILE-FORMAT-HEADER
> +   * http://rpm.org/devel_doc/file_format.html#24-header-format
>     */
>  
>    /* The minimum header size that makes sense here is 24 bytes.  Four
> @@ -301,6 +303,20 @@ struct read_package_data {
>    struct guestfs_application2_list *apps;
>  };
>  
> +static char *
> +to_utf8 (guestfs_h *g, char *input)
> +{
> +  char *out = NULL;
> +
> +  out = guestfs_int_string_to_utf8 (input, "UTF-8");
> +  if (!out) {
> +    out = guestfs_int_string_to_utf8 (input, "ISO-8859-1");
> +    perrorf (g, "Not an UTF-8 or latin-1 string: '%s'", input);
> +  }
> +
> +  return out;
> +}
> +
>  static int
>  read_package (guestfs_h *g,
>                const unsigned char *key, size_t keylen,
> @@ -311,7 +327,7 @@ read_package (guestfs_h *g,
>    struct rpm_name nkey, *entry;
>    CLEANUP_FREE char *version = NULL, *release = NULL,
>      *epoch_str = NULL, *arch = NULL, *url = NULL, *summary = NULL,
> -    *description = NULL;
> +    *description = NULL, *summary_raw = NULL, *description_raw = NULL;
>    int32_t epoch;
>  
>    /* This function reads one (key, value) pair from the Packages
> @@ -342,8 +358,14 @@ read_package (guestfs_h *g,
>    epoch_str = get_rpm_header_tag (g, value, valuelen, RPMTAG_EPOCH, 'i');
>    arch = get_rpm_header_tag (g, value, valuelen, RPMTAG_ARCH, 's');
>    url = get_rpm_header_tag (g, value, valuelen, RPMTAG_URL, 's');
> -  summary = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's');
> -  description = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's');
> +  summary_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_SUMMARY, 's');
> +  description_raw = get_rpm_header_tag (g, value, valuelen, RPMTAG_DESCRIPTION, 's');
> +
> +  /* Try (not too hard) to get UTF-8 */
> +  if (summary_raw)
> +    summary = to_utf8 (g, summary_raw);
> +  if (description_raw)
> +    description = to_utf8 (g, description_raw);
>  
>    /* The epoch is stored as big-endian integer. */
>    if (epoch_str)
> diff --git a/test-data/phony-guests/fedora-packages.db.txt b/test-data/phony-guests/fedora-packages.db.txt
> index f16a5aa76..927d6eb5f 100644
> --- a/test-data/phony-guests/fedora-packages.db.txt
> +++ b/test-data/phony-guests/fedora-packages.db.txt
> @@ -5,9 +5,9 @@ h_nelem=3
>  db_pagesize=4096
>  HEADER=END
>   \01\00\00\00
> - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\001.0\001.fc14\00x86_64\00
> + \00\00\00\05\00\00\00\33\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\21\00\00\00\001.0\001.fc14\00x86_64\00summary with \f6\00description with \f6\00
>   \02\00\00\00
> - \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\002.0\002.fc14\00x86_64\00
> + \00\00\00\05\00\00\00\35\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\00\00\00\03\ec\00\00\00\00\00\00\00\12\00\00\00\00\00\00\03\ed\00\00\00\00\00\00\00\22\00\00\00\002.0\002.fc14\00x86_64\00summary with \c3\b6\00description with \c3\b6\00
>   \03\00\00\00
>   \00\00\00\03\00\00\00\11\00\00\03\e9\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\ea\00\00\00\00\00\00\00\04\00\00\00\00\00\00\03\fe\00\00\00\00\00\00\00\0b\00\00\00\003.0\003.fc14\00x86_64\00
>  DATA=END

This seems fine now, although I would have preferred
local_string_to_utf8 to be inlined since it is only used in a single
place.

However my patch needs to be reviewed first.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
Fedora Windows cross-compiler. Compile Windows programs, test, and
build Windows installers. Over 100 libraries supported.
http://fedoraproject.org/wiki/MinGW




More information about the Libguestfs mailing list