[Libguestfs] Fwd: [PATCH hivex] non-ASCII characters in node names

Richard W.M. Jones rjones at redhat.com
Thu Jul 22 12:00:11 UTC 2010


Hilko, forwarding this to the mailing list.  Please post patches
over there.

Rich.

----- Forwarded message  -----

Date: Wed, 21 Jul 2010 17:09:53 +0200
From: Hilko Bengen
Subject: patch: non-ASCII characters in node names

Hi Richard,

I was a little bit surprised when a colleague claimed that key and value
names in the registry could contain non-ASCII characters.

I created keys and values with the following names:

* "asdf"
* "äöü" (common in German, can be represented in Windows-1252, Latin1,
  Latin9)
* the Euro sign (can be represented in Windows-1252, Latin9)
* the international currency symbol (can be represented in Windows-1252,
  Latin1)

From looking at the nodes/values, I have come to the conclusion that
Windows first checks whether all characters in a string can be represented in
the Latin1 encoding. If that fails, UTF-16 is used. A bit in the "flags"
field is used to indicate the character encoding.

I have implemented and briefly tested read support for those names in
the patch below. If that patch is acceptable, I'll do write support
tomorrow.

-Hilko


----- End forwarded message -----

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
virt-top is 'top' for virtual machines.  Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://et.redhat.com/~rjones/virt-top
-------------- next part --------------
diff --git a/lib/hivex.c b/lib/hivex.c
index 13d7556..bcee0ec 100644
--- a/lib/hivex.c
+++ b/lib/hivex.c
@@ -62,6 +62,7 @@
 #define HIVEX_MAX_ALLOCATION  1000000
 
 static char *windows_utf16_to_utf8 (/* const */ char *input, size_t len);
+static char *windows_latin1_to_utf8 (/* const */ char *input, size_t len);
 static size_t utf16_string_len_in_bytes (const char *str);
 static size_t utf16_string_len_in_bytes_max (const char *str, size_t len);
 
@@ -177,7 +178,8 @@ block_len (hive_h *h, size_t blkoff, int *used)
 struct ntreg_nk_record {
   int32_t seg_len;              /* length (always -ve because used) */
   char id[2];                   /* "nk" */
-  uint16_t flags;
+  uint16_t flags;               /* bit 5 set: latin1
+                                   bit 5 clr: UTF-16 */
   char timestamp[8];
   uint32_t unknown1;
   uint32_t parent;              /* offset of owner/parent */
@@ -571,11 +573,6 @@ hivex_node_name (hive_h *h, hive_node_h node)
 
   struct ntreg_nk_record *nk = (struct ntreg_nk_record *) (h->addr + node);
 
-  /* AFAIK the node name is always plain ASCII, so no conversion
-   * to UTF-8 is necessary.  However we do need to nul-terminate
-   * the string.
-   */
-
   /* nk->name_len is unsigned, 16 bit, so this is safe ...  However
    * we have to make sure the length doesn't exceed the block length.
    */
@@ -589,11 +586,12 @@ hivex_node_name (hive_h *h, hive_node_h node)
     return NULL;
   }
 
-  char *ret = malloc (len + 1);
-  if (ret == NULL)
-    return NULL;
-  memcpy (ret, nk->name, len);
-  ret[len] = '\0';
+  char *ret;
+  if (le16toh(nk->flags) & 0x20) {
+    ret = windows_latin1_to_utf8(nk->name, len);
+  } else {
+    ret = windows_utf16_to_utf8(nk->name, len);
+  }
   return ret;
 }
 
@@ -1113,6 +1111,7 @@ hivex_node_get_value (hive_h *h, hive_node_h node, const char *key)
 char *
 hivex_value_key (hive_h *h, hive_value_h value)
 {
+  iconv_t ic;
   if (!IS_VALID_BLOCK (h, value) || !BLOCK_ID_EQ (h, value, "vk")) {
     errno = EINVAL;
     return 0;
@@ -1120,10 +1119,6 @@ hivex_value_key (hive_h *h, hive_value_h value)
 
   struct ntreg_vk_record *vk = (struct ntreg_vk_record *) (h->addr + value);
 
-  /* AFAIK the key is always plain ASCII, so no conversion to UTF-8 is
-   * necessary.  However we do need to nul-terminate the string.
-   */
-
   /* vk->name_len is unsigned, 16 bit, so this is safe ...  However
    * we have to make sure the length doesn't exceed the block length.
    */
@@ -1137,11 +1132,14 @@ hivex_value_key (hive_h *h, hive_value_h value)
     return NULL;
   }
 
-  char *ret = malloc (len + 1);
-  if (ret == NULL)
-    return NULL;
-  memcpy (ret, vk->name, len);
-  ret[len] = '\0';
+  char *ret;
+  if (le16toh(vk->flags) & 0x01) {
+    ret = windows_latin1_to_utf8(vk->name, len);
+  } else {
+    ret = windows_utf16_to_utf8(vk->name, len);
+  }
+  if (!ret)
+    errno = EILSEQ;
   return ret;
 }
 
@@ -1250,58 +1248,70 @@ hivex_value_value (hive_h *h, hive_value_h value,
 }
 
 static char *
-windows_utf16_to_utf8 (/* const */ char *input, size_t len)
+iconv_wrapper (iconv_t ic, char *input, size_t len)
 {
-  iconv_t ic = iconv_open ("UTF-8", "UTF-16");
-  if (ic == (iconv_t) -1)
-    return NULL;
-
-  /* iconv(3) has an insane interface ... */
-
-  /* Mostly UTF-8 will be smaller, so this is a good initial guess. */
   size_t outalloc = len;
-
- again:;
-  size_t inlen = len;
-  size_t outlen = outalloc;
-  char *out = malloc (outlen + 1);
-  if (out == NULL) {
-    int err = errno;
-    iconv_close (ic);
-    errno = err;
-    return NULL;
-  }
-  char *inp = input;
-  char *outp = out;
-
-  size_t r = iconv (ic, &inp, &inlen, &outp, &outlen);
-  if (r == (size_t) -1) {
-    if (errno == E2BIG) {
+  for(;;) {
+    size_t inlen = len;
+    size_t outlen = outalloc;
+    char *out = malloc (outlen + 1);
+    if (out == NULL) {
       int err = errno;
-      size_t prev = outalloc;
-      /* Try again with a larger output buffer. */
-      free (out);
-      outalloc *= 2;
-      if (outalloc < prev) {
-        iconv_close (ic);
+      errno = err;
+      return NULL;
+    }
+    char *inp = input;
+    char *outp = out;
+
+    size_t r = iconv (ic, &inp, &inlen, &outp, &outlen);
+    if (r == (size_t) -1) {
+      if (errno == E2BIG) {
+        int err = errno;
+        size_t prev = outalloc;
+        /* Try again with a larger output buffer. */
+        free (out);
+        outalloc *= 2;
+        if (outalloc < prev) {
+          errno = err;
+          return NULL;
+        }
+        continue;
+      }
+      else {
+        /* Else some conversion failure, eg. EILSEQ, EINVAL. */
+        int err = errno;
+        free (out);
         errno = err;
         return NULL;
       }
-      goto again;
-    }
-    else {
-      /* Else some conversion failure, eg. EILSEQ, EINVAL. */
-      int err = errno;
-      iconv_close (ic);
-      free (out);
-      errno = err;
-      return NULL;
     }
+    *outp = '\0';
+    return out;
   }
+}
 
-  *outp = '\0';
+static char *
+windows_latin1_to_utf8 (char *input, size_t len)
+{
+  iconv_t ic = iconv_open ("UTF-8", "ISO-8859-1");
+  if (ic == (iconv_t) -1)
+    return NULL;
+
+  /* In the most common case, there are only ASCII characters. */
+  char * out = iconv_wrapper (ic, input, len);
   iconv_close (ic);
+  return out;
+}
+
+static char *
+windows_utf16_to_utf8 (/* const */ char *input, size_t len)
+{
+  iconv_t ic = iconv_open ("UTF-8", "UTF-16");
+  if (ic == (iconv_t) -1)
+    return NULL;
 
+  char * out = iconv_wrapper(ic, input, len);
+  iconv_close (ic);
   return out;
 }


More information about the Libguestfs mailing list