even better handling of invalid Unicode characters

author David Lawrence Ramsey <pooka109@gmail.com>

Fri, 5 Aug 2005 03:14:29 +0000 (03:14 +0000)

committer David Lawrence Ramsey <pooka109@gmail.com>

Fri, 5 Aug 2005 03:14:29 +0000 (03:14 +0000)
author David Lawrence Ramsey <pooka109@gmail.com>
Fri, 5 Aug 2005 03:14:29 +0000 (03:14 +0000)
committer David Lawrence Ramsey <pooka109@gmail.com>
Fri, 5 Aug 2005 03:14:29 +0000 (03:14 +0000)
diff --git a/ChangeLog b/ChangeLog

index 2321bc7afce69e8486ec73b1e4e2c8dbccf1885c..70764eff9922117fb19b38777a8313f16a09eb52 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -134,6 +134,10 @@ CVS code -
           get_key_buffer(), check_statusblank(), nanogetstr(),
           titlebar(), statusbar(), bottombars(), edit_refresh(),
           do_yesno(), and do_help(). (DLR)
+       - Treat the Unicode characters D800-DFFF and FFFE-FFFF as
+         invalid, since the C library's multibyte functions don't seem
+         to.  New function is_valid_unicode(); changes to mbrep() and
+         make_mbchar(). (DLR)
  - color.c:
         - Remove unneeded fcntl.h include. (DLR)
  - chars.c:
@@ -143,13 +147,6 @@ CVS code -
    mbrep()
         - New function, the equivalent of control_mbrep() for non-control
           characters. (DLR)
-       - Treat the Unicode characters D800-DFFF and FFFE-FFFF as
-         invalid, since the C library's multibyte functions don't seem
-         to. (DLR)
-  make_mbchar()
-       - Treat the Unicode characters D800-DFFF and FFFE-FFFF as
-         invalid, since the C library's multibyte functions don't seem
-         to. (DLR)
    parse_mbchar()
         - Remove now-unneeded bad_chr parameter. (DLR)
    mbstrchr()
diff --git a/src/chars.c b/src/chars.c

index f21b9d6f4886131655d338720ff7991b9d7b72f5..8703d9cc1fae756afa536f5a8a43f6a06e5e426c 100644 (file)
--- a/src/chars.c
+++ b/src/chars.c
@@ -255,10 +255,8 @@ char *mbrep(const char *c, char *crep, int *crep_len)
      if (ISSET(USE_UTF8)) {
         wchar_t wc;
  
-       /* Unicode D800-DFFF and FFFE-FFFF are invalid, even though
-        * they're parsed properly. */
-       if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || ((0xD800 <= wc && wc <=
-               0xDFFF) || (0XFFFE <= wc && wc <= 0xFFFF))) {
+       /* Reject invalid Unicode characters. */
+       if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wc)) {
             mbtowc(NULL, NULL, 0);
             crep = (char *)bad_mbchar;
             *crep_len = bad_mbchar_len;
@@ -331,12 +329,10 @@ char *make_mbchar(int chr, int *chr_mb_len)
  #ifdef ENABLE_UTF8
      if (ISSET(USE_UTF8)) {
         chr_mb = charalloc(MB_CUR_MAX);
-       *chr_mb_len = wctomb(chr_mb, chr);
+       *chr_mb_len = wctomb(chr_mb, (wchar_t)chr);
  
-       /* Unicode D800-DFFF and FFFE-FFFF are invalid, even though
-        * they're parsed properly. */
-       if (*chr_mb_len < 0 || ((0xD800 <= chr && chr <= 0xDFFF) ||
-               (0XFFFE <= chr && chr <= 0xFFFF))) {
+       /* Reject invalid Unicode characters. */
+       if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
             wctomb(NULL, 0);
             *chr_mb_len = 0;
         }
@@ -887,6 +883,16 @@ bool has_blank_mbchars(const char *s)
  #endif /* ENABLE_NANORC */
  #endif /* !DISABLE_JUSTIFY */
  
+#ifdef ENABLE_UTF8
+/* Return TRUE if wc is valid Unicode (i.e., it's not negative or in the
+ * ranges D800-DFFF or FFFE-FFFF), and FALSE otherwise. */
+bool is_valid_unicode(wchar_t wc)
+{
+    return (0 <= wc && (wc <= 0xD7FF || 0xE000 <= wc) && (wc !=
+       0xFFFE && wc != 0xFFFF));
+}
+#endif
+
  #ifdef ENABLE_NANORC
  /* Check if the string s is a valid multibyte string.  Return TRUE if it
   * is, and FALSE otherwise. */
diff --git a/src/proto.h b/src/proto.h

index d3d5ee7cb1cf2706cd452b94f4837508930708d2..86ef80eedafb92bbf8a7f4e36bbefa79c2df9b50 100644 (file)
--- a/src/proto.h
+++ b/src/proto.h
@@ -197,6 +197,9 @@ bool has_blank_chars(const char *s);
  bool has_blank_mbchars(const char *s);
  #endif
  #endif
+#ifdef ENABLE_UTF8
+bool is_valid_unicode(wchar_t wc);
+#endif
  #ifdef ENABLE_NANORC
  bool is_valid_mbstring(const char *s);
  #endif
author	David Lawrence Ramsey <pooka109@gmail.com>
	Fri, 5 Aug 2005 03:14:29 +0000 (03:14 +0000)
committer	David Lawrence Ramsey <pooka109@gmail.com>
	Fri, 5 Aug 2005 03:14:29 +0000 (03:14 +0000)
ChangeLog		patch \| blob \| history
src/chars.c		patch \| blob \| history
src/proto.h		patch \| blob \| history