Displaying 8 results from an estimated 8 matches for "utf8clen".
2017 Mar 19
2
[PATCH] Improve utf8clen and remove utf8_table4
Given a char `c' which should be the start byte of a utf8 character,
the utf8clen function returns the byte length of the utf8 character.
Before this patch, the utf8clen function would return either:
* 1 if `c' was an ascii character or a utf8 continuation byte
* An int in the range [2, 6] indicating the byte length of the utf8
character
With this patch, the utf...
2015 Mar 02
2
Errors on Windows with grep(fixed=TRUE) on UTF-8 strings
On Windows, grep(fixed=TRUE) throws errors with some UTF-8 strings.
Here's an example (must be run on Windows to reproduce the error):
Sys.setlocale("LC_CTYPE", "chinese")
y <- rawToChar(as.raw(c(0xe6, 0xb8, 0x97)))
Encoding(y) <- "UTF-8"
y
# [1] "渗"
grep("\n", y, fixed = TRUE)
# Error in grep("\n", y, fixed = TRUE) : invalid
2015 Mar 04
0
Errors on Windows with grep(fixed=TRUE) on UTF-8 strings
...&& use_UTF8) {
int ib, used;
- mbs_init(&mb_st);
for (ib = 0, i = 0; ib <= len-plen; i++) {
if (strncmp(pat, target+ib, plen) == 0) {
if (next != NULL) *next = ib + plen;
return i;
}
- used = (int) Mbrtowc(NULL, target+ib, MB_CUR_MAX, &mb_st);
+ used = utf8clen(target[ib]);
if (used <= 0) break;
ib += used;
}
- } else if (!useBytes && use_UTF8) {
+ } else if (!useBytes && mbcslocale) { /* skip along by chars */
+ mbstate_t mb_st;
int ib, used;
+ mbs_init(&mb_st);
for (ib = 0, i = 0; ib <= len-plen; i++) {...
2018 Mar 29
2
Possible `substr` bug in UTF-8 Corner Case
..."character".
Index: src/main/character.c
===================================================================
--- src/main/character.c    (revision 74482)
+++ src/main/character.c    (working copy)
@@ -283,7 +283,7 @@
    for (i = 0; i < so && str < end; i++) {
        int used = utf8clen(*str);
        if (i < sa - 1) { str += used; continue; }
-        for (j = 0; j < used; j++) *buf++ = *str++;
+        for (j = 0; j < used && str < end; j++) *buf++ = *str++;
    }
     } else if (ienc == CE_LATIN1 || ienc == CE_BYTES) {
    for (str += (sa - 1), i = sa; i <...
2011 Aug 04
1
slightly speeding up readChar()
...& !useBytes) {
int i, clen;
char *p, *q;
p = buf = (char *) R_alloc(MB_CUR_MAX*len+1, sizeof(char));
memset(buf, 0, MB_CUR_MAX*len+1);
for(i = 0; i < len; i++) {
q = p;
m = con->read(p, sizeof(char), 1, con);
if(!m) { if(i == 0) return R_NilValue; else break;}
clen = utf8clen(*p++);
if(clen > 1) {
m = con->read(p, sizeof(char), clen - 1, con);
if(m < clen - 1) error(_("invalid UTF-8 input in readChar()"));
p += clen - 1;
/* NB: this only checks validity of multi-byte characters */
if((int)mbrtowc(NULL, q, clen, NULL) < 0)...
2018 Mar 29
0
Possible `substr` bug in UTF-8 Corner Case
...in/character.c
> ===================================================================
> --- src/main/character.c    (revision 74482)
> +++ src/main/character.c    (working copy)
> @@ -283,7 +283,7 @@
>     for (i = 0; i < so && str < end; i++) {
>         int used = utf8clen(*str);
>         if (i < sa - 1) { str += used; continue; }
> -        for (j = 0; j < used; j++) *buf++ = *str++;
> +        for (j = 0; j < used && str < end; j++) *buf++ = *str++;
>     }
>      } else if (ienc == CE_LATIN1 || ienc == CE_BYTES) {
>     f...
2005 Jul 20
1
(PR#8017) build of REventLoop package crashes with 2.1 due
...PORT_MBCS)
> < #include <wchar.h>
> < #endif
> <
> < /* main/util.c */
> < void UNIMPLEMENTED_TYPE(char *s, SEXP x);
> < void UNIMPLEMENTED_TYPEt(char *s, SEXPTYPE t);
> < Rboolean utf8strIsASCII(char *str);
> < #ifdef SUPPORT_MBCS
> < int utf8clen(char c);
> < #define mbs_init(x) memset(x, 0, sizeof(mbstate_t))
> < size_t Mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps);
> < void mbcsToLatin1(char *in, char *out);
> < Rboolean mbcsValid(char *str);
> < char *Rf_strchr(const char *s, int c);
> <...
2005 Jul 19
0
build of REventLoop package crashes with 2.1 due to syntax error in Defn.h (PR#8017)
...defined(HAVE_WCHAR_H) && defined(SUPPORT_MBCS)
< #include <wchar.h>
< #endif
<
< /* main/util.c */
< void UNIMPLEMENTED_TYPE(char *s, SEXP x);
< void UNIMPLEMENTED_TYPEt(char *s, SEXPTYPE t);
< Rboolean utf8strIsASCII(char *str);
< #ifdef SUPPORT_MBCS
< int utf8clen(char c);
< #define mbs_init(x) memset(x, 0, sizeof(mbstate_t))
< size_t Mbrtowc(wchar_t *wc, const char *s, size_t n, mbstate_t *ps);
< void mbcsToLatin1(char *in, char *out);
< Rboolean mbcsValid(char *str);
< char *Rf_strchr(const char *s, int c);
< char *Rf_strrchr(const char *...