Commit 1a50945f authored by Eli Zaretskii

Improve [:alpha:] and [:alnum:] for multibyte characters (Bug#19878)

 src/character.c (alphabeticp, decimalnump): New functions.
 src/character.h (alphabeticp, decimalnump): Add prototypes.
 src/regex.c (ISALNUM, ISALPHA): Check Unicode character properties
 for multibyte characters by calling alphabeticp and decimalnump.
 (BIT_ALPHA, BIT_ALNUM): New bit masks.
 (re_wctype_to_bit): Return them when the class is RECC_ALPHA or
 RECC_ALNUM.
 (re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.

 doc/lispref/searching.texi (Char Classes): Update the documentation of
 [:alpha:] and [:alnum:].

 etc/NEWS: Mention the changes in [:alpha:] and [:alnum:].
parent 31ecbf8d
2015-02-28 Eli Zaretskii <eliz@gnu.org>
* searching.texi (Char Classes): Update the documentation of
[:alpha:] and [:alnum:]. (Bug#19878)
2015-02-27 Eli Zaretskii <eliz@gnu.org>
* os.texi (Startup Summary):
......
......@@ -541,11 +541,15 @@ and what they mean:
@item [:ascii:]
This matches any @acronym{ASCII} character (codes 0--127).
@item [:alnum:]
This matches any letter or digit. (At present, for multibyte
characters, it matches anything that has word syntax.)
This matches any letter or digit. For multibyte characters, it
matches characters whose Unicode @samp{general-category} property
(@pxref{Character Properties}) indicates they are alphabetic or
decimal number characters.
@item [:alpha:]
This matches any letter. (At present, for multibyte characters, it
matches anything that has word syntax.)
This matches any letter. For multibyte characters, it matches
characters whose Unicode @samp{general-category} property
(@pxref{Character Properties}) indicates they are alphabetic
characters.
@item [:blank:]
This matches space and tab only.
@item [:cntrl:]
......
......@@ -612,6 +612,12 @@ when signaling a file error. For example, it now reports "Permission
denied" instead of "permission denied". The old behavior was problematic
in languages like German where downcasing rules depend on grammar.
+++
** The character classes [:alpha:] and [:alnum:] in regular expressions
now match multibyte characters using Unicode character properties.
If you want the old behavior where they matched any character with
word syntax, use `\sw' instead.
* Lisp Changes in Emacs 25.1
......
2015-02-28 Eli Zaretskii <eliz@gnu.org>
* character.c (alphabeticp, decimalnump): New functions.
* character.h (alphabeticp, decimalnump): Add prototypes.
* regex.c (ISALNUM, ISALPHA): Check Unicode character properties
for multibyte characters by calling alphabeticp and decimalnump.
(BIT_ALPHA, BIT_ALNUM): New bit masks.
(re_wctype_to_bit): Return them when the class is RECC_ALPHA or
RECC_ALNUM.
(re_match_2_internal): Call ISALPHA and ISALNUM when appropriate.
(Bug#19878)
2015-02-27 Jan Djärv <jan.h.d@swipnet.se>
* xterm.h (x_real_pos_and_offsets): Take outer_border as arg also.
......
......@@ -984,6 +984,48 @@ character is not ASCII nor 8-bit character, an error is signaled. */)
#ifdef emacs
/* Return 'true' if C is an alphabetic character as defined by its
   Unicode properties.  Return 'false' if the entry for C in
   Vunicode_category_table is not an integer.  */
bool
alphabeticp (int c)
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);

  /* With no integer category there is nothing to test.  Answer
     'false' explicitly instead of falling off the end of a non-void
     function, which would hand the caller an indeterminate value.  */
  if (! INTEGERP (category))
    return false;

  unicode_category_t gen_cat = XINT (category);

  /* See UTS #18.  There are additional characters that should be
     here, those designated as Other_uppercase, Other_lowercase,
     and Other_alphabetic; FIXME.  */
  return (gen_cat == UNICODE_CATEGORY_Lu
	  || gen_cat == UNICODE_CATEGORY_Ll
	  || gen_cat == UNICODE_CATEGORY_Lt
	  || gen_cat == UNICODE_CATEGORY_Lm
	  || gen_cat == UNICODE_CATEGORY_Lo
	  || gen_cat == UNICODE_CATEGORY_Mn
	  || gen_cat == UNICODE_CATEGORY_Mc
	  || gen_cat == UNICODE_CATEGORY_Me
	  || gen_cat == UNICODE_CATEGORY_Nl);
}
/* Return 'true' if C is a decimal-number character as defined by its
   Unicode properties.  Return 'false' if the entry for C in
   Vunicode_category_table is not an integer.  */
bool
decimalnump (int c)
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);

  /* With no integer category there is nothing to test.  Answer
     'false' explicitly instead of falling off the end of a non-void
     function, which would hand the caller an indeterminate value.  */
  if (! INTEGERP (category))
    return false;

  unicode_category_t gen_cat = XINT (category);

  /* See UTS #18.  */
  return gen_cat == UNICODE_CATEGORY_Nd;
}
void
syms_of_character (void)
{
......
......@@ -660,6 +660,9 @@ extern ptrdiff_t lisp_string_width (Lisp_Object, ptrdiff_t,
extern Lisp_Object Vchar_unify_table;
extern Lisp_Object string_escape_byte8 (Lisp_Object);
extern bool alphabeticp (int);
extern bool decimalnump (int);
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \
(XCDR (XVECTOR (Vtranslation_table_vector)->contents[(id)]))
......
......@@ -324,12 +324,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
? (((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z') \
|| ((c) >= '0' && (c) <= '9')) \
: SYNTAX (c) == Sword)
: (alphabeticp (c) || decimalnump (c)))
# define ISALPHA(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z')) \
: SYNTAX (c) == Sword)
: alphabeticp (c))
# define ISLOWER(c) lowercasep (c)
......@@ -1872,6 +1872,8 @@ struct range_table_work_area
#define BIT_SPACE 0x8
#define BIT_UPPER 0x10
#define BIT_MULTIBYTE 0x20
#define BIT_ALPHA 0x40
#define BIT_ALNUM 0x80
/* Set the bit for character C in a list. */
......@@ -2072,7 +2074,9 @@ re_wctype_to_bit (re_wctype_t cc)
{
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM;
case RECC_WORD: return BIT_WORD;
case RECC_LOWER: return BIT_LOWER;
case RECC_UPPER: return BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
......@@ -2930,7 +2934,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
#endif /* emacs */
/* In most cases the matching rule for char classes
only uses the syntax table for multibyte chars,
so that the content of the syntax-table it is not
so that the content of the syntax-table is not
hardcoded in the range_table. SPACE and WORD are
the two exceptions. */
if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
......@@ -2945,7 +2949,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax,
p = class_beg;
SET_LIST_BIT ('[');
/* Because the `:' may starts the range, we
/* Because the `:' may start the range, we
can't simply set bit and repeat the loop.
Instead, just set it to C and handle below. */
c = ':';
......@@ -5513,7 +5517,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1,
| (class_bits & BIT_PUNCT && ISPUNCT (c))
| (class_bits & BIT_SPACE && ISSPACE (c))
| (class_bits & BIT_UPPER && ISUPPER (c))
| (class_bits & BIT_WORD && ISWORD (c)))
| (class_bits & BIT_WORD && ISWORD (c))
| (class_bits & BIT_ALPHA && ISALPHA (c))
| (class_bits & BIT_ALNUM && ISALNUM (c)))
not = !not;
else
CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment