detect words more accurately by taking punctuation into account, and convert all word-detecting functions to use the same wrapper function

author David Lawrence Ramsey <pooka109@gmail.com>

Mon, 13 Jun 2005 02:40:04 +0000 (02:40 +0000)

committer David Lawrence Ramsey <pooka109@gmail.com>

Mon, 13 Jun 2005 02:40:04 +0000 (02:40 +0000)
author David Lawrence Ramsey <pooka109@gmail.com>
Mon, 13 Jun 2005 02:40:04 +0000 (02:40 +0000)
committer David Lawrence Ramsey <pooka109@gmail.com>
Mon, 13 Jun 2005 02:40:04 +0000 (02:40 +0000)
diff --git a/ChangeLog b/ChangeLog

index 1ce72e2882800af728ca4c90bb9a44d924474260..afe72d5b74840086ee5f2951a291c399f1117a00 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -65,6 +65,12 @@ CVS code -
           this is disabled when NANO_SMALL is defined.  New functions
           do_word_count() and do_next_word_void(); changes to
           shortcut_init() and do_next_word(). (DLR)
+       - Detect words more accurately by taking punctuation into
+         account, and convert all word-detecting functions to use the
+         same wrapper function for ease of maintenance.  New functions
+         is_punct_mbchar() and is_word_mbchar(); changes to
+         do_next_word(), do_prev_word(), is_whole_word(),
+         do_statusbar_next_word(), and do_statusbar_prev_word(). (DLR)
  - chars.c:
    make_mbstring()
         - Change erroneous ENABLE_EXTRA #ifdef to NANO_EXTRA to fix a
@@ -233,6 +239,7 @@ CVS code -
           Weinehall)
         - Don't refer to the built-in file browser as crappy anymore.
           (DLR)
+       - Check for iswpunct(). (DLR)
  - doc/faq.html:
         - Update the question about the FAQ to mention the current
           maintainer. (DLR)
diff --git a/configure.ac b/configure.ac

index d10581ebf0f3c32618c2a19715ae4c002bbf0f57..ccbc9c36378d5cc911b7dc499c8a7daec91c72f1 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -399,7 +399,7 @@ dnl Checks for functions.
  AC_CHECK_FUNCS(snprintf vsnprintf isblank strcasecmp strncasecmp strcasestr strnlen getline getdelim)
  
  if test x$enable_utf8 != xno; then
-    AC_CHECK_FUNCS(iswalnum mblen mbtowc wctomb wcwidth iswspace iswblank)
+    AC_CHECK_FUNCS(iswalnum iswblank iswpunct iswspace mblen mbtowc wctomb wcwidth)
  fi
  
  if test x$ac_cv_func_snprintf = xno || test x$ac_cv_func_vsnprintf = xno; then
@@ -472,12 +472,13 @@ fi
  if test x$enable_utf8 != xno && \
     test x$CURSES_LIB_WIDE = xyes && \
     test x$ac_cv_func_iswalnum = xyes && \
+   test x$ac_cv_func_iswpunct = xyes && \
+   (test x$ac_cv_func_iswblank = xyes || test x$ac_cv_func_iswspace = xyes) && \
     test x$ac_cv_func_mblen = xyes && \
     test x$ac_cv_func_mbtowc = xyes && \
     test x$ac_cv_func_wctomb = xyes && \
-   test x$ac_cv_func_wcwidth = xyes && \
-   (test x$ac_cv_func_iswspace = xyes || test x$ac_cv_func_iswblank = xyes); then
-       AC_DEFINE(NANO_WIDE, 1, [Define this if your system has sufficient wide character support (a wide curses library, iswalnum(), iswspace() or iswblank(), mblen(), mbtowc(), wctomb(), and wcwidth()).])
+   test x$ac_cv_func_wcwidth = xyes; then
+       AC_DEFINE(NANO_WIDE, 1, [Define this if your system has sufficient wide character support (a wide curses library, iswalnum(), iswpunct(), iswblank() or iswspace(), mblen(), mbtowc(), wctomb(), and wcwidth()).])
  else
      if test x$enable_utf8 = xyes; then
         AC_MSG_ERROR([
diff --git a/src/chars.c b/src/chars.c

index 880b034fc11135b525a990e1f05ede115ccf8751..57e1a71ccc65fbbc46c370f28704fbc4d8991655 100644 (file)
--- a/src/chars.c
+++ b/src/chars.c
@@ -146,6 +146,35 @@ bool is_cntrl_mbchar(const char *c)
         return is_cntrl_char((unsigned char)*c);
  }
  
+/* This function is equivalent to ispunct() for multibyte characters. */
+bool is_punct_mbchar(const char *c)
+{
+    assert(c != NULL);
+
+#ifdef NANO_WIDE
+    if (!ISSET(NO_UTF8)) {
+       wchar_t wc;
+       int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);
+
+       if (c_mb_len <= 0) {
+           mbtowc(NULL, NULL, 0);
+           wc = (unsigned char)*c;
+       }
+
+       return iswpunct(wc);
+    } else
+#endif
+       return ispunct((unsigned char)*c);
+}
+
+/* This function returns TRUE for a multibyte character found in a word
+ * (currently only an alphanumeric or punctuation character) and FALSE
+ * otherwise. */
+bool is_word_mbchar(const char *c)
+{
+    return is_alnum_mbchar(c) || is_punct_mbchar(c);
+}
+
  /* c is a control character.  It displays as ^@, ^?, or ^[ch], where ch
   * is c + 64.  We return that character. */
  char control_rep(char c)
diff --git a/src/nano.c b/src/nano.c

index e49a24dfdb82f85beb37e1c2b96e43fdff26e3c9..fb3f523fd8c34399e64ccfb365b56ff8a05c2cdd 100644 (file)
--- a/src/nano.c
+++ b/src/nano.c
@@ -1468,7 +1468,7 @@ bool do_next_word(bool allow_update)
  
         /* If we've found it, stop moving forward through the current
          * line. */
-       if (!is_alnum_mbchar(char_mb))
+       if (!is_word_mbchar(char_mb))
             break;
         /* If we haven't found it, then we've started on a word, so set
          * started_on_word to TRUE. */
@@ -1489,7 +1489,7 @@ bool do_next_word(bool allow_update)
  
             /* If we've found it, stop moving forward through the
              * current line. */
-           if (is_alnum_mbchar(char_mb))
+           if (is_word_mbchar(char_mb))
                 break;
  
             current_x += char_mb_len;
@@ -1546,7 +1546,7 @@ void do_prev_word(void)
  
         /* If we've found it, stop moving backward through the current
          * line. */
-       if (!is_alnum_mbchar(char_mb))
+       if (!is_word_mbchar(char_mb))
             break;
  
         if (current_x == 0)
@@ -1569,7 +1569,7 @@ void do_prev_word(void)
  
             /* If we've found it, stop moving backward through the
              * current line. */
-           if (is_alnum_mbchar(char_mb))
+           if (is_word_mbchar(char_mb))
                 break;
  
             if (current_x == 0)
@@ -1608,7 +1608,7 @@ void do_prev_word(void)
  
             /* If we've found it, stop moving backward through the
              * current line. */
-           if (!is_alnum_mbchar(char_mb))
+           if (!is_word_mbchar(char_mb))
                 break;
  
             if (current_x == 0)
diff --git a/src/proto.h b/src/proto.h

index f180a8c118566b4994c09e942a409dad76fdbb04..7ce99ec4c58d4dc3f900ce8450e472dd2e09945f 100644 (file)
--- a/src/proto.h
+++ b/src/proto.h
@@ -173,6 +173,8 @@ bool is_cntrl_char(int c);
  bool is_cntrl_wchar(wint_t wc);
  #endif
  bool is_cntrl_mbchar(const char *c);
+bool is_punct_mbchar(const char *c);
+bool is_word_mbchar(const char *c);
  char control_rep(char c);
  #ifdef NANO_WIDE
  wchar_t control_wrep(wchar_t c);
diff --git a/src/search.c b/src/search.c

index 45ec4f6fb63f6728069861151a610cd8c03fd55c..e739e9976efa4d5bea61be8a49a52a93856c3af1 100644 (file)
--- a/src/search.c
+++ b/src/search.c
@@ -273,11 +273,11 @@ bool is_whole_word(size_t pos, const char *buf, const char *word)
      parse_mbchar(buf + word_end, r, NULL, NULL);
  
      /* If we're at the beginning of the line or the character before the
-     * word isn't an alphanumeric character, and if we're at the end of
-     * the line or the character after the word isn't an alphanumeric
-     * character, we have a whole word. */
-    retval = (pos == 0 || !is_alnum_mbchar(p)) &&
-       (word_end == strlen(buf) || !is_alnum_mbchar(r));
+     * word isn't a "word" character, and if we're at the end of the
+     * line or the character after the word isn't a "word" character, we
+     * have a whole word. */
+    retval = (pos == 0 || !is_word_mbchar(p)) &&
+       (word_end == strlen(buf) || !is_word_mbchar(r));
  
      free(p);
      free(r);
diff --git a/src/winio.c b/src/winio.c

index 85307bad99ac3b8bf5ad7b8baaee3171b95e97a5..8f2d690b67e5e01f19f6551da642afde4dcfdd4a 100644 (file)
--- a/src/winio.c
+++ b/src/winio.c
@@ -1917,7 +1917,7 @@ void do_statusbar_next_word(void)
  
         /* If we've found it, stop moving forward through the current
          * line. */
-       if (!is_alnum_mbchar(char_mb))
+       if (!is_word_mbchar(char_mb))
             break;
  
         statusbar_x += char_mb_len;
@@ -1933,7 +1933,7 @@ void do_statusbar_next_word(void)
  
         /* If we've found it, stop moving forward through the current
          * line. */
-       if (is_alnum_mbchar(char_mb))
+       if (is_word_mbchar(char_mb))
             break;
  
         statusbar_x += char_mb_len;
@@ -1960,7 +1960,7 @@ void do_statusbar_prev_word(void)
  
         /* If we've found it, stop moving backward through the current
          * line. */
-       if (!is_alnum_mbchar(char_mb))
+       if (!is_word_mbchar(char_mb))
             break;
  
         if (statusbar_x == 0)
@@ -1982,7 +1982,7 @@ void do_statusbar_prev_word(void)
  
         /* If we've found it, stop moving backward through the current
          * line. */
-       if (is_alnum_mbchar(char_mb))
+       if (is_word_mbchar(char_mb))
             break;
  
         if (statusbar_x == 0)
@@ -2005,7 +2005,7 @@ void do_statusbar_prev_word(void)
  
             /* If we've found it, stop moving backward through the
              * current line. */
-           if (!is_alnum_mbchar(char_mb))
+           if (!is_word_mbchar(char_mb))
                 break;
  
             if (statusbar_x == 0)
author	David Lawrence Ramsey <pooka109@gmail.com>
	Mon, 13 Jun 2005 02:40:04 +0000 (02:40 +0000)
committer	David Lawrence Ramsey <pooka109@gmail.com>
	Mon, 13 Jun 2005 02:40:04 +0000 (02:40 +0000)
ChangeLog		patch \| blob \| history
configure.ac		patch \| blob \| history
src/chars.c		patch \| blob \| history
src/nano.c		patch \| blob \| history
src/proto.h		patch \| blob \| history
src/search.c		patch \| blob \| history
src/winio.c		patch \| blob \| history