add better handling of invalid Unicode, plus a few miscellaneous minor

author David Lawrence Ramsey <pooka109@gmail.com>

Thu, 4 Aug 2005 20:24:26 +0000 (20:24 +0000)

committer David Lawrence Ramsey <pooka109@gmail.com>

Thu, 4 Aug 2005 20:24:26 +0000 (20:24 +0000)
author David Lawrence Ramsey <pooka109@gmail.com>
Thu, 4 Aug 2005 20:24:26 +0000 (20:24 +0000)
committer David Lawrence Ramsey <pooka109@gmail.com>
Thu, 4 Aug 2005 20:24:26 +0000 (20:24 +0000)
diff --git a/ChangeLog b/ChangeLog

index 2a5c604eeb9b3b1d2d5cb14535977d7ac6586aa7..a79a4fca4e8ff0a0da11c5b36c4d6651e7bf33df 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -137,9 +137,15 @@ CVS code -
  - color.c:
         - Remove unneeded fcntl.h include. (DLR)
  - chars.c:
+  control_rep(), control_mbrep()
+       - Assert that the multibyte character passed in is a control
+         character if it's valid. (DLR)
    mbrep()
         - New function, the equivalent of control_mbrep() for non-control
           characters. (DLR)
+       - Treat the Unicode characters D800-DFFF and FFFE-FFFF as
+         invalid, since the C library's multibyte functions don't seem
+         to. (DLR)
    parse_mbchar()
         - Remove now-unneeded bad_chr parameter. (DLR)
    mbstrchr()
@@ -263,10 +269,13 @@ CVS code -
           as wc does. (DLR)
  - winio.c:
    get_word_kbinput()
-       - Don't allow the input word to be between hexadecimal D800 to
-         DFFF or hexadecimal FFFE to FFFD, as they are invalid Unicode
-         characters; rename variables word and word_digits to uni and
-         uni_digits; and rename to get_unicode_kbinput(). (DLR)
+       - Multiply the entered digits by hexadecimal numbers instead of
+         decimal numbers for clarity, rename to get_unicode_kbinput(),
+         and rename variables word and word_digits to uni and
+         uni_digits. (DLR)
+  parse_verbatim_kbinput()
+       - Rename variables word_mb and word_mb_len to uni_mb and
+         uni_mb_len. (DLR)
    display_string()
         - Instead of using parse_mbchar()'s bad_chr parameter, use
           mbrep() to get the representation of a bad character. (DLR)
diff --git a/src/chars.c b/src/chars.c

index dbcd721a6989c65a261d872a42d5a6a37e9cde5d..dff80d843448eafa06daab288031c065f1e2a3da 100644 (file)
--- a/src/chars.c
+++ b/src/chars.c
@@ -184,6 +184,8 @@ bool is_word_mbchar(const char *c, bool allow_punct)
   * is (c + 64).  We return that character. */
  char control_rep(char c)
  {
+    assert(is_cntrl_char(c));
+
      /* Treat newlines embedded in a line as encoded nulls. */
      if (c == '\n')
         return '@';
@@ -198,6 +200,8 @@ char control_rep(char c)
   * where ch is (c + 64).  We return that wide character. */
  wchar_t control_wrep(wchar_t wc)
  {
+    assert(is_cntrl_wchar(wc));
+
      /* Treat newlines embedded in a line as encoded nulls. */
      if (wc == '\n')
         return '@';
@@ -251,7 +255,10 @@ char *mbrep(const char *c, char *crep, int *crep_len)
      if (ISSET(USE_UTF8)) {
         wchar_t wc;
  
-       if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
+       /* Unicode D800-DFFF and FFFE-FFFF are invalid, even though
+        * they're parsed properly. */
+       if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || ((0xD800 <= wc && wc <=
+               0xDFFF) || (0XFFFE <= wc && wc <= 0xFFFF))) {
             mbtowc(NULL, NULL, 0);
             crep = (char *)bad_mbchar;
             *crep_len = bad_mbchar_len;
diff --git a/src/winio.c b/src/winio.c

index cbf2a13fde7323f710b0f7aa226d63a706d0b3bd..a48ca48ab480682d6d13ea46b0f8cc759cca6f33 100644 (file)
--- a/src/winio.c
+++ b/src/winio.c
@@ -1232,8 +1232,8 @@ int get_byte_kbinput(int kbinput
  }
  
  /* Translate a Unicode sequence: turn a four-digit hexadecimal number
- * from 0000 to D7FF or E000 to FFFD (case-insensitive) into its
- * corresponding multibyte value. */
+ * from 0000 to FFFF (case-insensitive) into its corresponding multibyte
+ * value. */
  int get_unicode_kbinput(int kbinput
  #ifndef NANO_SMALL
         , bool reset
@@ -1273,11 +1273,9 @@ int get_unicode_kbinput(int kbinput
         case 2:
             /* Two digits: add the digit we got to the 0x100's position
              * of the Unicode sequence holder. */
-           if (('0' <= kbinput && kbinput <= '7') || (uni != 0xD000 &&
-               '8' <= kbinput && kbinput <= '9'))
+           if ('0' <= kbinput && kbinput <= '9')
                 uni += (kbinput - '0') * 0x100;
-           else if (uni != 0xd000 && 'a' <= tolower(kbinput) &&
-               tolower(kbinput) <= 'f')
+           else if ('a' <= tolower(kbinput) && tolower(kbinput) <= 'f')
                 uni += (tolower(kbinput) + 10 - 'a') * 0x100;
             else
                 /* If the character we got isn't a hexadecimal digit, or
@@ -1305,9 +1303,8 @@ int get_unicode_kbinput(int kbinput
             if ('0' <= kbinput && kbinput <= '9') {
                 uni += (kbinput - '0');
                 retval = uni;
-           } else if (('a' <= tolower(kbinput) &&
-               tolower(kbinput) <= 'd') || (uni != 0xFFF0 && 'e' <=
-               tolower(kbinput) && tolower(kbinput) <= 'f')) {
+           } else if ('a' <= tolower(kbinput) && tolower(kbinput) <=
+               'f') {
                 uni += (tolower(kbinput) + 10 - 'a');
                 retval = uni;
             } else
@@ -1418,13 +1415,13 @@ int *get_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
   * that, leave the input as-is. */ 
  int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
  {
-    int *kbinput, word, *retval;
+    int *kbinput, uni, *retval;
  
      /* Read in the first keystroke. */
      while ((kbinput = get_input(win, 1)) == NULL);
  
      /* Check whether the first keystroke is a hexadecimal digit. */
-    word = get_unicode_kbinput(*kbinput
+    uni = get_unicode_kbinput(*kbinput
  #ifndef NANO_SMALL
         , FALSE
  #endif
@@ -1432,36 +1429,36 @@ int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
  
      /* If the first keystroke isn't a hexadecimal digit, put back the
       * first keystroke. */
-    if (word != ERR)
+    if (uni != ERR)
         unget_input(kbinput, 1);
      /* Otherwise, read in keystrokes until we have a complete word
       * sequence, and put back the corresponding word value. */
      else {
-       char *word_mb;
-       int word_mb_len, *seq, i;
+       char *uni_mb;
+       int uni_mb_len, *seq, i;
  
-       while (word == ERR) {
+       while (uni == ERR) {
             while ((kbinput = get_input(win, 1)) == NULL);
  
-           word = get_unicode_kbinput(*kbinput
+           uni = get_unicode_kbinput(*kbinput
  #ifndef NANO_SMALL
                 , FALSE
  #endif
                 );
         }
  
-       /* Put back the multibyte equivalent of the word value. */
-       word_mb = make_mbchar(word, &word_mb_len);
+       /* Put back the multibyte equivalent of the Unicode value. */
+       uni_mb = make_mbchar(uni, &uni_mb_len);
  
-       seq = (int *)nmalloc(word_mb_len * sizeof(int));
+       seq = (int *)nmalloc(uni_mb_len * sizeof(int));
  
-       for (i = 0; i < word_mb_len; i++)
-           seq[i] = (unsigned char)word_mb[i];
+       for (i = 0; i < uni_mb_len; i++)
+           seq[i] = (unsigned char)uni_mb[i];
  
-       unget_input(seq, word_mb_len);
+       unget_input(seq, uni_mb_len);
  
         free(seq);
-       free(word_mb);
+       free(uni_mb);
      }
  
      /* Get the complete sequence, and save the characters in it as the
author	David Lawrence Ramsey <pooka109@gmail.com>
	Thu, 4 Aug 2005 20:24:26 +0000 (20:24 +0000)
committer	David Lawrence Ramsey <pooka109@gmail.com>
	Thu, 4 Aug 2005 20:24:26 +0000 (20:24 +0000)
ChangeLog		patch \| blob \| history
src/chars.c		patch \| blob \| history
src/winio.c		patch \| blob \| history