Couple more thoughts about iconv,
(1) The 'embedded nul' error is thrown by mkCharLenCE, after the real
conversion is complete. The converted string exists in memory, though
not in a form that R can currently represent as a STRSXP. Hence the
error when passed to mkCharLenCE.
(2) The patch I submitted needed some PROTECTion, it's fixed in the
patch below. Also, in one place I did not alter CHAR(x) where x may be a
RAWSXP, because that's the pointer type required, the alternative is
(const char*) RAW(x). Should this alternative be used?
(3) I don't see why the iconv 'sub' argument shouldn't work with
raw
objects also, but I think this means the 'sub' argument should also
accept raw objects.
(4) Suppose iconv fails. If x were a STRSXP, the result is NA_STRING.
Should failure of iconv on a RAWSXP return R_NilValue?
-Matt
Index: src/library/base/R/New-Internal.R
==================================================================---
src/library/base/R/New-Internal.R (revision 52328)
+++ src/library/base/R/New-Internal.R (working copy)
@@ -239,7 +239,7 @@
iconv <- function(x, from = "", to = "", sub = NA, mark
= TRUE)
{
- if(!is.character(x)) x <- as.character(x)
+ if(!is.character(x) && !is.raw(x)) x <- as.character(x)
.Internal(iconv(x, from, to, as.character(sub), mark))
}
Index: src/main/sysutils.c
==================================================================---
src/main/sysutils.c (revision 52328)
+++ src/main/sysutils.c (working copy)
@@ -548,16 +548,17 @@
int mark;
const char *from, *to;
Rboolean isLatin1 = FALSE, isUTF8 = FALSE;
+ Rboolean isRawx = (TYPEOF(x) == RAWSXP);
- if(TYPEOF(x) != STRSXP)
- error(_("'x' must be a character vector"));
+ if(TYPEOF(x) != STRSXP && !isRawx)
+ error(_("'x' must be a character vector or raw"));
if(!isString(CADR(args)) || length(CADR(args)) != 1)
error(_("invalid '%s' argument"), "from");
if(!isString(CADDR(args)) || length(CADDR(args)) != 1)
error(_("invalid '%s' argument"), "to");
if(!isString(CADDDR(args)) || length(CADDDR(args)) != 1)
error(_("invalid '%s' argument"), "sub");
- if(STRING_ELT(CADDDR(args), 0) == NA_STRING) sub = NULL;
+ if(STRING_ELT(CADDDR(args), 0) == NA_STRING || isRawx) sub = NULL;
else sub = translateChar(STRING_ELT(CADDDR(args), 0));
mark = asLogical(CAD4R(args));
if(mark == NA_LOGICAL)
@@ -584,7 +585,7 @@
PROTECT(ans = duplicate(x));
R_AllocStringBuffer(0, &cbuff); /* 0 -> default */
for(i = 0; i < LENGTH(x); i++) {
- si = STRING_ELT(x, i);
+ si = isRawx ? x : STRING_ELT(x, i);
top_of_loop:
inbuf = CHAR(si); inb = LENGTH(si);
outbuf = cbuff.data; outb = cbuff.bufsize - 1;
@@ -622,7 +623,7 @@
goto next_char;
}
- if(res != -1 && inb == 0) {
+ if(res != -1 && inb == 0 && !isRawx) {
cetype_t ienc = CE_NATIVE;
nout = cbuff.bufsize - 1 - outb;
@@ -632,7 +633,13 @@
}
SET_STRING_ELT(ans, i, mkCharLenCE(cbuff.data, nout, ienc));
}
- else SET_STRING_ELT(ans, i, NA_STRING);
+ else if(!isRawx) SET_STRING_ELT(ans, i, NA_STRING);
+ else {
+ nout = cbuff.bufsize - 1 - outb;
+ UNPROTECT(1);
+ PROTECT(ans = allocVector(RAWSXP, nout));
+ memcpy(RAW(ans), cbuff.data, nout);
+ }
}
Riconv_close(obj);
R_FreeStringBuffer(&cbuff);
On Sat, 2010-06-19 at 16:53 -0400, Matt Shotwell wrote:> R community,
>
> As you may know, R's iconv doesn't work well converting to and from
> encodings that allow embedded nulls. For example
>
> > iconv("foo", to="UTF-16")
> Error in iconv("foo", to = "UTF-16") :
> embedded nul in string: '\xff\xfef\0o\0o\0'
>
> However, I don't believe embedded nulls are at issue here, but rather
> that R's iconv doesn't accept objects of type RAWSXP. The iconv
> mechanism, after all, operates on encoded binary data, and not
> necessarily null terminated C strings. I'd like to submit a very small
> patch (12 lines w/o documentation) that allows R's iconv to operate on
> raw objects, while not interfering or affecting the behavior of iconv on
> character vectors. To keep this message terse, I've put additional
> discussion, description of what the patch does, and examples here:
> http://biostatmatt.com/archives/456
>
> Also, here is a link to the patch file:
> http://biostatmatt.com/R/R-devel-iconv-0.0.patch
>
> If this change is adopted, I'd be happy to submit a documentation patch
> also.
>
> -Matt
>
> Index: src/library/base/R/New-Internal.R
> ==================================================================> ---
src/library/base/R/New-Internal.R (revision 52328)
> +++ src/library/base/R/New-Internal.R (working copy)
> @@ -239,7 +239,7 @@
>
> iconv <- function(x, from = "", to = "", sub = NA,
mark = TRUE)
> {
> - if(!is.character(x)) x <- as.character(x)
> + if(!is.character(x) && !is.raw(x)) x <- as.character(x)
> .Internal(iconv(x, from, to, as.character(sub), mark))
> }
>
> Index: src/main/sysutils.c
> ==================================================================> ---
src/main/sysutils.c (revision 52328)
> +++ src/main/sysutils.c (working copy)
> @@ -548,16 +548,17 @@
> int mark;
> const char *from, *to;
> Rboolean isLatin1 = FALSE, isUTF8 = FALSE;
> + Rboolean isRawx = (TYPEOF(x) == RAWSXP);
>
> - if(TYPEOF(x) != STRSXP)
> - error(_("'x' must be a character vector"));
> + if(TYPEOF(x) != STRSXP && !isRawx)
> + error(_("'x' must be a character vector or raw"));
> if(!isString(CADR(args)) || length(CADR(args)) != 1)
> error(_("invalid '%s' argument"),
"from");
> if(!isString(CADDR(args)) || length(CADDR(args)) != 1)
> error(_("invalid '%s' argument"), "to");
> if(!isString(CADDDR(args)) || length(CADDDR(args)) != 1)
> error(_("invalid '%s' argument"), "sub");
> - if(STRING_ELT(CADDDR(args), 0) == NA_STRING) sub = NULL;
> + if(STRING_ELT(CADDDR(args), 0) == NA_STRING || isRawx) sub = NULL;
> else sub = translateChar(STRING_ELT(CADDDR(args), 0));
> mark = asLogical(CAD4R(args));
> if(mark == NA_LOGICAL)
> @@ -584,7 +585,7 @@
> PROTECT(ans = duplicate(x));
> R_AllocStringBuffer(0, &cbuff); /* 0 -> default */
> for(i = 0; i < LENGTH(x); i++) {
> - si = STRING_ELT(x, i);
> + si = isRawx ? x : STRING_ELT(x, i);
> top_of_loop:
> inbuf = CHAR(si); inb = LENGTH(si);
> outbuf = cbuff.data; outb = cbuff.bufsize - 1;
> @@ -622,7 +623,7 @@
> goto next_char;
> }
>
> - if(res != -1 && inb == 0) {
> + if(res != -1 && inb == 0 && !isRawx) {
> cetype_t ienc = CE_NATIVE;
>
> nout = cbuff.bufsize - 1 - outb;
> @@ -632,7 +633,12 @@
> }
> SET_STRING_ELT(ans, i, mkCharLenCE(cbuff.data, nout, ienc));
> }
> - else SET_STRING_ELT(ans, i, NA_STRING);
> + else if(!isRawx) SET_STRING_ELT(ans, i, NA_STRING);
> + else {
> + nout = cbuff.bufsize - 1 - outb;
> + ans = allocVector(RAWSXP, nout);
> + memcpy(RAW(ans), cbuff.data, nout);
> + }
> }
> Riconv_close(obj);
> R_FreeStringBuffer(&cbuff);
>
--
Matthew S. Shotwell
Graduate Student
Division of Biostatistics and Epidemiology
Medical University of South Carolina
http://biostatmatt.com