Richard W.M. Jones
2010-Jul-22 12:00 UTC
[Libguestfs] Fwd: [PATCH hivex] non-ASCII characters in node names
Hilko, forwarding this to the mailing list. Please post patches over there. Rich. ----- Forwarded message ----- Date: Wed, 21 Jul 2010 17:09:53 +0200 From: Hilko Bengen Subject: patch: non-ASCII characters in node names Hi Richard, I was a little bit surprised when a colleague claimed that key and value names in the registry could contain non-ASCII characters. I created keys and values with the following names: * "asdf" * "???" (common in German, can be represented in Windows-1252, Latin1, Latin9) * the Euro sign (can be represented in Windows-1252, Latin9) * the international currency symbol (can be represented in Windows-1252, Latin1) From looking at the nodes/values, I have come to the conclusion that Windows first checks whether all characters in a string can be represented in the Latin1 encoding. If that fails, UTF-16 is used. A bit in the "flags" field is used to indicate the character encoding. I have implemented and briefly tested read support for those names in the patch below. If that patch is acceptable, I'll do write support tomorrow. -Hilko ----- End forwarded message ----- -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones virt-top is 'top' for virtual machines. Tiny program with many powerful monitoring features, net stats, disk stats, logging, etc. 
http://et.redhat.com/~rjones/virt-top -------------- next part -------------- diff --git a/lib/hivex.c b/lib/hivex.c index 13d7556..bcee0ec 100644 --- a/lib/hivex.c +++ b/lib/hivex.c @@ -62,6 +62,7 @@ #define HIVEX_MAX_ALLOCATION 1000000 static char *windows_utf16_to_utf8 (/* const */ char *input, size_t len); +static char *windows_latin1_to_utf8 (/* const */ char *input, size_t len); static size_t utf16_string_len_in_bytes (const char *str); static size_t utf16_string_len_in_bytes_max (const char *str, size_t len); @@ -177,7 +178,8 @@ block_len (hive_h *h, size_t blkoff, int *used) struct ntreg_nk_record { int32_t seg_len; /* length (always -ve because used) */ char id[2]; /* "nk" */ - uint16_t flags; + uint16_t flags; /* bit 5 set: latin1 + bit 5 clr: UTF-16 */ char timestamp[8]; uint32_t unknown1; uint32_t parent; /* offset of owner/parent */ @@ -571,11 +573,6 @@ hivex_node_name (hive_h *h, hive_node_h node) struct ntreg_nk_record *nk = (struct ntreg_nk_record *) (h->addr + node); - /* AFAIK the node name is always plain ASCII, so no conversion - * to UTF-8 is necessary. However we do need to nul-terminate - * the string. - */ - /* nk->name_len is unsigned, 16 bit, so this is safe ... However * we have to make sure the length doesn't exceed the block length. 
*/ @@ -589,11 +586,12 @@ hivex_node_name (hive_h *h, hive_node_h node) return NULL; } - char *ret = malloc (len + 1); - if (ret == NULL) - return NULL; - memcpy (ret, nk->name, len); - ret[len] = '\0'; + char *ret; + if (le16toh(nk->flags) & 0x20) { + ret = windows_latin1_to_utf8(nk->name, len); + } else { + ret = windows_utf16_to_utf8(nk->name, len); + } return ret; } @@ -1113,6 +1111,7 @@ hivex_node_get_value (hive_h *h, hive_node_h node, const char *key) char * hivex_value_key (hive_h *h, hive_value_h value) { + iconv_t ic; if (!IS_VALID_BLOCK (h, value) || !BLOCK_ID_EQ (h, value, "vk")) { errno = EINVAL; return 0; @@ -1120,10 +1119,6 @@ hivex_value_key (hive_h *h, hive_value_h value) struct ntreg_vk_record *vk = (struct ntreg_vk_record *) (h->addr + value); - /* AFAIK the key is always plain ASCII, so no conversion to UTF-8 is - * necessary. However we do need to nul-terminate the string. - */ - /* vk->name_len is unsigned, 16 bit, so this is safe ... However * we have to make sure the length doesn't exceed the block length. */ @@ -1137,11 +1132,14 @@ hivex_value_key (hive_h *h, hive_value_h value) return NULL; } - char *ret = malloc (len + 1); - if (ret == NULL) - return NULL; - memcpy (ret, vk->name, len); - ret[len] = '\0'; + char *ret; + if (le16toh(vk->flags) & 0x01) { + ret = windows_latin1_to_utf8(vk->name, len); + } else { + ret = windows_utf16_to_utf8(vk->name, len); + } + if (!ret) + errno = EILSEQ; return ret; } @@ -1250,58 +1248,70 @@ hivex_value_value (hive_h *h, hive_value_h value, } static char * -windows_utf16_to_utf8 (/* const */ char *input, size_t len) +iconv_wrapper (iconv_t ic, char *input, size_t len) { - iconv_t ic = iconv_open ("UTF-8", "UTF-16"); - if (ic == (iconv_t) -1) - return NULL; - - /* iconv(3) has an insane interface ... */ - - /* Mostly UTF-8 will be smaller, so this is a good initial guess. 
*/ size_t outalloc = len; - - again:; - size_t inlen = len; - size_t outlen = outalloc; - char *out = malloc (outlen + 1); - if (out == NULL) { - int err = errno; - iconv_close (ic); - errno = err; - return NULL; - } - char *inp = input; - char *outp = out; - - size_t r = iconv (ic, &inp, &inlen, &outp, &outlen); - if (r == (size_t) -1) { - if (errno == E2BIG) { + for(;;) { + size_t inlen = len; + size_t outlen = outalloc; + char *out = malloc (outlen + 1); + if (out == NULL) { int err = errno; - size_t prev = outalloc; - /* Try again with a larger output buffer. */ - free (out); - outalloc *= 2; - if (outalloc < prev) { - iconv_close (ic); + errno = err; + return NULL; + } + char *inp = input; + char *outp = out; + + size_t r = iconv (ic, &inp, &inlen, &outp, &outlen); + if (r == (size_t) -1) { + if (errno == E2BIG) { + int err = errno; + size_t prev = outalloc; + /* Try again with a larger output buffer. */ + free (out); + outalloc *= 2; + if (outalloc < prev) { + errno = err; + return NULL; + } + continue; + } + else { + /* Else some conversion failure, eg. EILSEQ, EINVAL. */ + int err = errno; + free (out); errno = err; return NULL; } - goto again; - } - else { - /* Else some conversion failure, eg. EILSEQ, EINVAL. */ - int err = errno; - iconv_close (ic); - free (out); - errno = err; - return NULL; } + *outp = '\0'; + return out; } +} - *outp = '\0'; +static char * +windows_latin1_to_utf8 (char *input, size_t len) +{ + iconv_t ic = iconv_open ("UTF-8", "ISO-8859-1"); + if (ic == (iconv_t) -1) + return NULL; + + /* In the most common case, there are only ASCII characters. */ + char * out = iconv_wrapper (ic, input, len); iconv_close (ic); + return out; +} + +static char * +windows_utf16_to_utf8 (/* const */ char *input, size_t len) +{ + iconv_t ic = iconv_open ("UTF-8", "UTF-16"); + if (ic == (iconv_t) -1) + return NULL; + char * out = iconv_wrapper(ic, input, len); + iconv_close (ic); return out; }
Seemingly Similar Threads
- Re: [PATCH] Add a cache for iconv_t handles to hive_t
- [PATCH] Add a cache for iconv_t handles to hive_t
- [PATCH] Add a cache for iconv_t handles to hive_t
- [PATCH v3 1/2] common: extract UTF-8 conversion function
- [PATCH 1/3] lib: Further generalize iconv wrapper function.