From 61f567378a983f7dad5adad3493d4e059888b39e Mon Sep 17 00:00:00 2001 From: David Lawrence Ramsey Date: Thu, 21 Jul 2005 22:12:03 +0000 Subject: [PATCH] display invalid multibyte sequences as Unicode FFFD, take 2; also clean up the character-parsing functions git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2909 35c25a1d-7b9e-4130-9fde-d3aeb78583b8 --- ChangeLog | 13 +++++++++ src/chars.c | 80 ++++++++++++++++++++++++++++------------------------- src/proto.h | 5 ++++ src/winio.c | 21 ++++---------- 4 files changed, 67 insertions(+), 52 deletions(-) diff --git a/ChangeLog b/ChangeLog index 23620865..ee241247 100644 --- a/ChangeLog +++ b/ChangeLog @@ -78,6 +78,16 @@ CVS code - out of the prompt, and that magichistory is properly updated when we change it and then move up. New function history_reset(); changes to nanogetstr(). (DLR) + - Various character-handling cleanups. If we get an invalid + multibyte sequence, treat it as Unicode FFFD (Replacement + Character), unless we're determining if it's a control + character or searching for a match to it. Also, remove + unneeded variables and checks when parsing multibyte + sequences. Changes to is_alnum_mbchar(), is_blank_mbchar(), + is_cntrl_mbchar(), is_punct_mbchar(), control_mbrep(), + mbwidth(), make_mbchar(), parse_mbchar(), mbstrncasecmp(), + mbstrcasestr(), mbrevstrcasestr(), mbstrchr(), and + display_string(). (DLR) - chars.c: mbstrchr() - Don't count matches between valid and invalid multibyte @@ -147,6 +157,9 @@ CVS code - HAVE_SNPRINTF. (DLR) - Remove TOP from the topmidnone enum, and rename it centernone. (DLR) + proto.h: + - Add declarations for bad_mbchar and bad_mbchar_len, so that we + can use them in display_string() as well as chars.c. 
(DLR) - rcfile.c: nregcomp() - Return TRUE when the compilation succeeds and FALSE otherwise, diff --git a/src/chars.c b/src/chars.c index ea80c2fa..1e1dce57 100644 --- a/src/chars.c +++ b/src/chars.c @@ -37,6 +37,14 @@ #ifdef HAVE_WCTYPE_H #include <wctype.h> #endif + +static const wchar_t bad_wchar = 0xFFFD; + /* If we get an invalid multibyte sequence, we treat it as + * Unicode FFFD (Replacement Character), unless we're + * determining if it's a control character or searching for a + * match to it. */ +const char *bad_mbchar = "\xEF\xBF\xBD"; +const int bad_mbchar_len = 3; #endif #ifndef HAVE_ISBLANK @@ -70,11 +78,10 @@ bool is_alnum_mbchar(const char *c) #ifdef ENABLE_UTF8 if (ISSET(USE_UTF8)) { wchar_t wc; - int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); - if (c_mb_len <= 0) { + if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc(NULL, NULL, 0); - wc = (unsigned char)*c; + wc = bad_wchar; } return iswalnum(wc); @@ -91,11 +98,10 @@ bool is_blank_mbchar(const char *c) #ifdef ENABLE_UTF8 if (ISSET(USE_UTF8)) { wchar_t wc; - int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); - if (c_mb_len <= 0) { + if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc(NULL, NULL, 0); - wc = (unsigned char)*c; + wc = bad_wchar; } return iswblank(wc); @@ -132,9 +138,8 @@ bool is_cntrl_mbchar(const char *c) #ifdef ENABLE_UTF8 if (ISSET(USE_UTF8)) { wchar_t wc; - int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); - if (c_mb_len <= 0) { + if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc(NULL, NULL, 0); wc = (unsigned char)*c; } @@ -155,9 +160,9 @@ bool is_punct_mbchar(const char *c) wchar_t wc; int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); - if (c_mb_len <= 0) { + if (c_mb_len < 0) { mbtowc(NULL, NULL, 0); - wc = (unsigned char)*c; + wc = bad_wchar; } return iswpunct(wc); @@ -215,16 +220,17 @@ char *control_mbrep(const char *c, char *crep, int *crep_len) if (ISSET(USE_UTF8)) { wchar_t wc; - if (mbtowc(&wc, c, MB_CUR_MAX) <= 0) { + if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc(NULL, NULL, 0); - wc = (unsigned char)*c; - } - - *crep_len = 
wctomb(crep, control_wrep(wc)); - - if (*crep_len <= 0) { - wctomb(NULL, 0); - *crep_len = 0; + crep = (char *)bad_mbchar; + *crep_len = bad_mbchar_len; + } else { + *crep_len = wctomb(crep, control_wrep(wc)); + + if (*crep_len < 0) { + wctomb(NULL, 0); + *crep_len = 0; + } } } else { #endif @@ -245,11 +251,11 @@ int mbwidth(const char *c) #ifdef ENABLE_UTF8 if (ISSET(USE_UTF8)) { wchar_t wc; - int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX), width; + int width; - if (c_mb_len <= 0) { + if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { mbtowc(NULL, NULL, 0); - wc = (unsigned char)*c; + wc = bad_wchar; } width = wcwidth(wc); @@ -289,7 +295,7 @@ char *make_mbchar(int chr, int *chr_mb_len) chr_mb = charalloc(MB_CUR_MAX); *chr_mb_len = wctomb(chr_mb, chr); - if (*chr_mb_len <= 0) { + if (*chr_mb_len < 0) { wctomb(NULL, 0); *chr_mb_len = 0; } @@ -324,15 +330,15 @@ int parse_mbchar(const char *buf, char *chr, bool *bad_chr, size_t /* Get the number of bytes in the multibyte character. */ buf_mb_len = mblen(buf, MB_CUR_MAX); - /* If buf contains a null byte or an invalid multibyte - * character, set bad_chr to TRUE (if it contains the latter) - * and interpret buf's first byte. */ - if (buf_mb_len <= 0) { + /* If buf contains an invalid multibyte character, set bad_chr + * to TRUE and interpret buf's first byte. */ + if (buf_mb_len < 0) { mblen(NULL, 0); - if (buf_mb_len < 0 && bad_chr != NULL) + if (bad_chr != NULL) *bad_chr = TRUE; buf_mb_len = 1; - } + } else if (buf_mb_len == 0) + buf_mb_len++; /* Save the multibyte character in chr. 
*/ if (chr != NULL) { @@ -480,7 +486,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n) s1_mb_len = parse_mbchar(s1, s1_mb, NULL, NULL); - if (mbtowc(&ws1, s1_mb, s1_mb_len) <= 0) { + if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) { mbtowc(NULL, NULL, 0); ws1 = (unsigned char)*s1_mb; bad_s1_mb = TRUE; @@ -488,7 +494,7 @@ int mbstrncasecmp(const char *s1, const char *s2, size_t n) s2_mb_len = parse_mbchar(s2, s2_mb, NULL, NULL); - if (mbtowc(&ws2, s2_mb, s2_mb_len) <= 0) { + if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) { mbtowc(NULL, NULL, 0); ws2 = (unsigned char)*s2_mb; bad_s2_mb = TRUE; @@ -554,7 +560,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle) r_mb_len = parse_mbchar(r, r_mb, NULL, NULL); - if (mbtowc(&wr, r_mb, r_mb_len) <= 0) { + if (mbtowc(&wr, r_mb, r_mb_len) < 0) { mbtowc(NULL, NULL, 0); wr = (unsigned char)*r; bad_r_mb = TRUE; @@ -562,7 +568,7 @@ const char *mbstrcasestr(const char *haystack, const char *needle) q_mb_len = parse_mbchar(q, q_mb, NULL, NULL); - if (mbtowc(&wq, q_mb, q_mb_len) <= 0) { + if (mbtowc(&wq, q_mb, q_mb_len) < 0) { mbtowc(NULL, NULL, 0); wq = (unsigned char)*q; bad_q_mb = TRUE; @@ -660,7 +666,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle, r_mb_len = parse_mbchar(r, r_mb, NULL, NULL); - if (mbtowc(&wr, r_mb, r_mb_len) <= 0) { + if (mbtowc(&wr, r_mb, r_mb_len) < 0) { mbtowc(NULL, NULL, 0); wr = (unsigned char)*r; bad_r_mb = TRUE; @@ -668,7 +674,7 @@ const char *mbrevstrcasestr(const char *haystack, const char *needle, q_mb_len = parse_mbchar(q, q_mb, NULL, NULL); - if (mbtowc(&wq, q_mb, q_mb_len) <= 0) { + if (mbtowc(&wq, q_mb, q_mb_len) < 0) { mbtowc(NULL, NULL, 0); wq = (unsigned char)*q; bad_q_mb = TRUE; @@ -766,7 +772,7 @@ char *mbstrchr(const char *s, char *c) wchar_t ws, wc; int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX); - if (c_mb_len <= 0) { + if (c_mb_len < 0) { mbtowc(NULL, NULL, 0); wc = (unsigned char)*c; bad_c_mb = TRUE; @@ -775,7 +781,7 @@ char *mbstrchr(const char 
*s, char *c) while (*s != '\0') { int s_mb_len = parse_mbchar(s, s_mb, NULL, NULL); - if (mbtowc(&ws, s_mb, s_mb_len) <= 0) { + if (mbtowc(&ws, s_mb, s_mb_len) < 0) { mbtowc(NULL, NULL, 0); ws = (unsigned char)*s; bad_s_mb = TRUE; diff --git a/src/proto.h b/src/proto.h index ad442189..098af1a8 100644 --- a/src/proto.h +++ b/src/proto.h @@ -138,6 +138,11 @@ extern bool curses_ended; extern char *homedir; +#ifdef ENABLE_UTF8 +extern const char *bad_mbchar; +extern const int bad_mbchar_len; +#endif + /* The functions we want available. */ /* Public functions in chars.c. */ diff --git a/src/winio.c b/src/winio.c index f0089cc6..9c0b9dc9 100644 --- a/src/winio.c +++ b/src/winio.c @@ -2365,8 +2365,8 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool start_col++; } /* If buf contains a control character, interpret it. If buf - * contains an invalid multibyte control character, interpret - * it as though it's a normal control character.*/ + * contains an invalid multibyte control character, display it + * as such.*/ } else if (is_cntrl_mbchar(buf_mb)) { char *ctrl_buf_mb = charalloc(mb_cur_max()); int ctrl_buf_mb_len, i; @@ -2402,21 +2402,12 @@ char *display_string(const char *buf, size_t start_col, size_t len, bool #ifdef ENABLE_UTF8 /* If buf contains an invalid multibyte non-control - * character, interpret it as though it's a normal - * non-control character. */ + * character, display it as such. */ if (ISSET(USE_UTF8) && bad_char) { - char *bad_buf_mb; - int bad_buf_mb_len; + for (i = 0; i < bad_mbchar_len; i++) + converted[index++] = bad_mbchar[i]; - bad_buf_mb = make_mbchar((unsigned char)*buf_mb, - &bad_buf_mb_len); - - for (i = 0; i < bad_buf_mb_len; i++) - converted[index++] = bad_buf_mb[i]; - - start_col += mbwidth(bad_buf_mb); - - free(bad_buf_mb); + start_col += mbwidth(bad_mbchar); } else { #endif for (i = 0; i < buf_mb_len; i++) -- 2.39.5