From: David Lawrence Ramsey Date: Thu, 4 Aug 2005 20:24:26 +0000 (+0000) Subject: add better handling of invalid Unicode, plus a few miscellaneous minor X-Git-Tag: v1.3.9~73 X-Git-Url: https://git.wh0rd.org/?a=commitdiff_plain;h=8c55d21bd63b5925dbfd0bf77e4071a89a29e43a;p=nano.git add better handling of invalid Unicode, plus a few miscellaneous minor fixes git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2973 35c25a1d-7b9e-4130-9fde-d3aeb78583b8 --- diff --git a/ChangeLog b/ChangeLog index 2a5c604e..a79a4fca 100644 --- a/ChangeLog +++ b/ChangeLog @@ -137,9 +137,15 @@ CVS code - - color.c: - Remove unneeded fcntl.h include. (DLR) - chars.c: + control_rep(), control_mbrep() + - Assert that the multibyte character passed in is a control + character if it's valid. (DLR) mbrep() - New function, the equivalent of control_mbrep() for non-control characters. (DLR) + - Treat the Unicode characters D800-DFFF and FFFE-FFFF as + invalid, since the C library's multibyte functions don't seem + to. (DLR) parse_mbchar() - Remove now-unneeded bad_chr parameter. (DLR) mbstrchr() @@ -263,10 +269,13 @@ CVS code - as wc does. (DLR) - winio.c: get_word_kbinput() - - Don't allow the input word to be between hexadecimal D800 to - DFFF or hexadecimal FFFE to FFFD, as they are invalid Unicode - characters; rename variables word and word_digits to uni and - uni_digits; and rename to get_unicode_kbinput(). (DLR) + - Multiply the entered digits by hexadecimal numbers instead of + decimal numbers for clarity, rename to get_unicode_kbinput(), + and rename variables word and word_digits to uni and + uni_digits. (DLR) + parse_verbatim_kbinput() + - Rename variables word_mb and word_mb_len to uni_mb and + uni_mb_len. (DLR) display_string() - Instead of using parse_mbchar()'s bad_chr parameter, use mbrep() to get the representation of a bad character. (DLR) diff --git a/src/chars.c b/src/chars.c index dbcd721a..dff80d84 100644 --- a/src/chars.c +++ b/src/chars.c @@ -184,6 +184,8 @@ bool is_word_mbchar(const char *c, bool allow_punct) * is (c + 64). We return that character. */ char control_rep(char c) { + assert(is_cntrl_char(c)); + /* Treat newlines embedded in a line as encoded nulls. */ if (c == '\n') return '@'; @@ -198,6 +200,8 @@ char control_rep(char c) * where ch is (c + 64). We return that wide character. */ wchar_t control_wrep(wchar_t wc) { + assert(is_cntrl_wchar(wc)); + /* Treat newlines embedded in a line as encoded nulls. */ if (wc == '\n') return '@'; @@ -251,7 +255,10 @@ char *mbrep(const char *c, char *crep, int *crep_len) if (ISSET(USE_UTF8)) { wchar_t wc; - if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { + /* Unicode D800-DFFF and FFFE-FFFF are invalid, even though + * they're parsed properly. */ + if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || ((0xD800 <= wc && wc <= + 0xDFFF) || (0XFFFE <= wc && wc <= 0xFFFF))) { mbtowc(NULL, NULL, 0); crep = (char *)bad_mbchar; *crep_len = bad_mbchar_len; diff --git a/src/winio.c b/src/winio.c index cbf2a13f..a48ca48a 100644 --- a/src/winio.c +++ b/src/winio.c @@ -1232,8 +1232,8 @@ int get_byte_kbinput(int kbinput } /* Translate a Unicode sequence: turn a four-digit hexadecimal number - * from 0000 to D7FF or E000 to FFFD (case-insensitive) into its - * corresponding multibyte value. */ + * from 0000 to FFFF(case-insensitive) into its corresponding multibyte + * value. */ int get_unicode_kbinput(int kbinput #ifndef NANO_SMALL , bool reset @@ -1273,11 +1273,9 @@ int get_unicode_kbinput(int kbinput case 2: /* Two digits: add the digit we got to the 0x100's position * of the Unicode sequence holder. */ - if (('0' <= kbinput && kbinput <= '7') || (uni != 0xD000 && - '8' <= kbinput && kbinput <= '9')) + if ('0' <= kbinput && kbinput <= '9') uni += (kbinput - '0') * 0x100; - else if (uni != 0xd000 && 'a' <= tolower(kbinput) && - tolower(kbinput) <= 'f') + else if ('a' <= tolower(kbinput) && tolower(kbinput) <= 'f') uni += (tolower(kbinput) + 10 - 'a') * 0x100; else /* If the character we got isn't a hexadecimal digit, or @@ -1305,9 +1303,8 @@ int get_unicode_kbinput(int kbinput if ('0' <= kbinput && kbinput <= '9') { uni += (kbinput - '0'); retval = uni; - } else if (('a' <= tolower(kbinput) && - tolower(kbinput) <= 'd') || (uni != 0xFFF0 && 'e' <= - tolower(kbinput) && tolower(kbinput) <= 'f')) { + } else if ('a' <= tolower(kbinput) && tolower(kbinput) <= + 'f') { uni += (tolower(kbinput) + 10 - 'a'); retval = uni; } else @@ -1418,13 +1415,13 @@ int *get_verbatim_kbinput(WINDOW *win, size_t *kbinput_len) * that, leave the input as-is. */ int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len) { - int *kbinput, word, *retval; + int *kbinput, uni, *retval; /* Read in the first keystroke. */ while ((kbinput = get_input(win, 1)) == NULL); /* Check whether the first keystroke is a hexadecimal digit. */ - word = get_unicode_kbinput(*kbinput + uni = get_unicode_kbinput(*kbinput #ifndef NANO_SMALL , FALSE #endif @@ -1432,36 +1429,36 @@ int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len) /* If the first keystroke isn't a hexadecimal digit, put back the * first keystroke. */ - if (word != ERR) + if (uni != ERR) unget_input(kbinput, 1); /* Otherwise, read in keystrokes until we have a complete word * sequence, and put back the corresponding word value. */ else { - char *word_mb; - int word_mb_len, *seq, i; + char *uni_mb; + int uni_mb_len, *seq, i; - while (word == ERR) { + while (uni == ERR) { while ((kbinput = get_input(win, 1)) == NULL); - word = get_unicode_kbinput(*kbinput + uni = get_unicode_kbinput(*kbinput #ifndef NANO_SMALL , FALSE #endif ); } - /* Put back the multibyte equivalent of the word value. */ - word_mb = make_mbchar(word, &word_mb_len); + /* Put back the multibyte equivalent of the Unicode value. */ + uni_mb = make_mbchar(uni, &uni_mb_len); - seq = (int *)nmalloc(word_mb_len * sizeof(int)); + seq = (int *)nmalloc(uni_mb_len * sizeof(int)); - for (i = 0; i < word_mb_len; i++) - seq[i] = (unsigned char)word_mb[i]; + for (i = 0; i < uni_mb_len; i++) + seq[i] = (unsigned char)uni_mb[i]; - unget_input(seq, word_mb_len); + unget_input(seq, uni_mb_len); free(seq); - free(word_mb); + free(uni_mb); } /* Get the complete sequence, and save the characters in it as the