[Libguestfs] Fwd: [PATCH hivex] non-ASCII characters in node names
Richard W.M. Jones
rjones at redhat.com
Thu Jul 22 12:00:11 UTC 2010
Hilko, forwarding this to the mailing list. Please post patches
over there.
Rich.
----- Forwarded message -----
Date: Wed, 21 Jul 2010 17:09:53 +0200
From: Hilko Bengen
Subject: patch: non-ASCII characters in node names
Hi Richard,
I was a little bit surprised when a colleague claimed that key and value
names in the registry could contain non-ASCII characters.
I created keys and values with the following names:
* "asdf"
* "äöü" (common in German, can be represented in Windows-1252, Latin1,
Latin9)
* the Euro sign (can be represented in Windows-1252, Latin9)
* the international currency symbol (can be represented in Windows-1252,
Latin1)
From looking at the nodes/values, I have come to the conclusion that
Windows first checks whether all characters in a string can be represented in
the Latin1 encoding. If that fails, UTF-16 is used. A bit in the "flags"
field is used to indicate the character encoding.
I have implemented and briefly tested read support for those names in
the patch below. If that patch is acceptable, I'll do write support
tomorrow.
-Hilko
----- End forwarded message -----
--
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
virt-top is 'top' for virtual machines. Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://et.redhat.com/~rjones/virt-top
-------------- next part --------------
diff --git a/lib/hivex.c b/lib/hivex.c
index 13d7556..bcee0ec 100644
--- a/lib/hivex.c
+++ b/lib/hivex.c
@@ -62,6 +62,7 @@
#define HIVEX_MAX_ALLOCATION 1000000
static char *windows_utf16_to_utf8 (/* const */ char *input, size_t len);
+static char *windows_latin1_to_utf8 (/* const */ char *input, size_t len);
static size_t utf16_string_len_in_bytes (const char *str);
static size_t utf16_string_len_in_bytes_max (const char *str, size_t len);
@@ -177,7 +178,8 @@ block_len (hive_h *h, size_t blkoff, int *used)
struct ntreg_nk_record {
int32_t seg_len; /* length (always -ve because used) */
char id[2]; /* "nk" */
- uint16_t flags;
+ uint16_t flags; /* bit 5 set: latin1
+ bit 5 clr: UTF-16 */
char timestamp[8];
uint32_t unknown1;
uint32_t parent; /* offset of owner/parent */
@@ -571,11 +573,6 @@ hivex_node_name (hive_h *h, hive_node_h node)
struct ntreg_nk_record *nk = (struct ntreg_nk_record *) (h->addr + node);
- /* AFAIK the node name is always plain ASCII, so no conversion
- * to UTF-8 is necessary. However we do need to nul-terminate
- * the string.
- */
-
/* nk->name_len is unsigned, 16 bit, so this is safe ... However
* we have to make sure the length doesn't exceed the block length.
*/
@@ -589,11 +586,12 @@ hivex_node_name (hive_h *h, hive_node_h node)
return NULL;
}
- char *ret = malloc (len + 1);
- if (ret == NULL)
- return NULL;
- memcpy (ret, nk->name, len);
- ret[len] = '\0';
+ char *ret;
+ if (le16toh(nk->flags) & 0x20) {
+ ret = windows_latin1_to_utf8(nk->name, len);
+ } else {
+ ret = windows_utf16_to_utf8(nk->name, len);
+ }
return ret;
}
@@ -1113,6 +1111,7 @@ hivex_node_get_value (hive_h *h, hive_node_h node, const char *key)
char *
hivex_value_key (hive_h *h, hive_value_h value)
{
+ iconv_t ic;
if (!IS_VALID_BLOCK (h, value) || !BLOCK_ID_EQ (h, value, "vk")) {
errno = EINVAL;
return 0;
@@ -1120,10 +1119,6 @@ hivex_value_key (hive_h *h, hive_value_h value)
struct ntreg_vk_record *vk = (struct ntreg_vk_record *) (h->addr + value);
- /* AFAIK the key is always plain ASCII, so no conversion to UTF-8 is
- * necessary. However we do need to nul-terminate the string.
- */
-
/* vk->name_len is unsigned, 16 bit, so this is safe ... However
* we have to make sure the length doesn't exceed the block length.
*/
@@ -1137,11 +1132,14 @@ hivex_value_key (hive_h *h, hive_value_h value)
return NULL;
}
- char *ret = malloc (len + 1);
- if (ret == NULL)
- return NULL;
- memcpy (ret, vk->name, len);
- ret[len] = '\0';
+ char *ret;
+ if (le16toh(vk->flags) & 0x01) {
+ ret = windows_latin1_to_utf8(vk->name, len);
+ } else {
+ ret = windows_utf16_to_utf8(vk->name, len);
+ }
+ if (!ret)
+ errno = EILSEQ;
return ret;
}
@@ -1250,58 +1248,70 @@ hivex_value_value (hive_h *h, hive_value_h value,
}
static char *
-windows_utf16_to_utf8 (/* const */ char *input, size_t len)
+iconv_wrapper (iconv_t ic, char *input, size_t len)
{
- iconv_t ic = iconv_open ("UTF-8", "UTF-16");
- if (ic == (iconv_t) -1)
- return NULL;
-
- /* iconv(3) has an insane interface ... */
-
- /* Mostly UTF-8 will be smaller, so this is a good initial guess. */
size_t outalloc = len;
-
- again:;
- size_t inlen = len;
- size_t outlen = outalloc;
- char *out = malloc (outlen + 1);
- if (out == NULL) {
- int err = errno;
- iconv_close (ic);
- errno = err;
- return NULL;
- }
- char *inp = input;
- char *outp = out;
-
- size_t r = iconv (ic, &inp, &inlen, &outp, &outlen);
- if (r == (size_t) -1) {
- if (errno == E2BIG) {
+ for(;;) {
+ size_t inlen = len;
+ size_t outlen = outalloc;
+ char *out = malloc (outlen + 1);
+ if (out == NULL) {
int err = errno;
- size_t prev = outalloc;
- /* Try again with a larger output buffer. */
- free (out);
- outalloc *= 2;
- if (outalloc < prev) {
- iconv_close (ic);
+ errno = err;
+ return NULL;
+ }
+ char *inp = input;
+ char *outp = out;
+
+ size_t r = iconv (ic, &inp, &inlen, &outp, &outlen);
+ if (r == (size_t) -1) {
+ if (errno == E2BIG) {
+ int err = errno;
+ size_t prev = outalloc;
+ /* Try again with a larger output buffer. */
+ free (out);
+ outalloc *= 2;
+ if (outalloc < prev) {
+ errno = err;
+ return NULL;
+ }
+ continue;
+ }
+ else {
+ /* Else some conversion failure, eg. EILSEQ, EINVAL. */
+ int err = errno;
+ free (out);
errno = err;
return NULL;
}
- goto again;
- }
- else {
- /* Else some conversion failure, eg. EILSEQ, EINVAL. */
- int err = errno;
- iconv_close (ic);
- free (out);
- errno = err;
- return NULL;
}
+ *outp = '\0';
+ return out;
}
+}
- *outp = '\0';
+static char *
+windows_latin1_to_utf8 (char *input, size_t len)
+{
+ iconv_t ic = iconv_open ("UTF-8", "ISO-8859-1");
+ if (ic == (iconv_t) -1)
+ return NULL;
+
+ /* In the most common case, there are only ASCII characters. */
+ char * out = iconv_wrapper (ic, input, len);
iconv_close (ic);
+ return out;
+}
+
+static char *
+windows_utf16_to_utf8 (/* const */ char *input, size_t len)
+{
+ iconv_t ic = iconv_open ("UTF-8", "UTF-16");
+ if (ic == (iconv_t) -1)
+ return NULL;
+ char * out = iconv_wrapper(ic, input, len);
+ iconv_close (ic);
return out;
}
More information about the Libguestfs
mailing list