Commit 6c284c6b authored by Eli Zaretskii

Make [:print:] support non-ASCII characters correctly

* src/regex.c (ISPRINT): Call 'printablep' for multibyte characters.
(BIT_PRINT): New bit mask.
(re_wctype_to_bit): Return BIT_PRINT for RECC_PRINT.
* src/character.c (printablep): New function.
* src/character.h (printablep): Add prototype.

* lisp/emacs-lisp/rx.el (rx): Doc fix: document the new behavior
of 'print', 'alnum', and 'alphabetic'.

* doc/lispref/searching.texi (Char Classes): Document the new
behavior of [:print:].

* etc/NEWS: Mention the new behavior of [:print:].
parent 8802474a
...@@ -569,8 +569,11 @@ This matches any multibyte character (@pxref{Text Representations}). ...@@ -569,8 +569,11 @@ This matches any multibyte character (@pxref{Text Representations}).
@item [:nonascii:] @item [:nonascii:]
This matches any non-@acronym{ASCII} character. This matches any non-@acronym{ASCII} character.
@item [:print:] @item [:print:]
This matches printing characters---everything except @acronym{ASCII} control This matches printing characters---everything except @acronym{ASCII}
characters and the delete character. and non-@acronym{ASCII} control characters (including the delete
character), surrogates, and codepoints unassigned by Unicode, as
indicated by the Unicode @samp{general-category} property
(@pxref{Character Properties}).
@item [:punct:] @item [:punct:]
This matches any punctuation character. (At present, for multibyte This matches any punctuation character. (At present, for multibyte
characters, it matches anything that has non-word syntax.) characters, it matches anything that has non-word syntax.)
......
...@@ -628,6 +628,14 @@ notifications, if Emacs is compiled with file notification support. ...@@ -628,6 +628,14 @@ notifications, if Emacs is compiled with file notification support.
--- ---
*** gulp.el *** gulp.el
+++
** The character class [:print:] in regular expressions
no longer matches every multibyte character. Instead, Emacs now
consults the Unicode character properties to determine which
characters are printable. In particular, surrogates and unassigned
codepoints are now rejected by this class. If you want the old
behavior, use [:multibyte:] instead.
* New Modes and Packages in Emacs 25.1 * New Modes and Packages in Emacs 25.1
......
...@@ -969,16 +969,16 @@ CHAR ...@@ -969,16 +969,16 @@ CHAR
space, and DEL. space, and DEL.
`printing', `print' `printing', `print'
matches printing characters--everything except ASCII control chars matches printing characters--everything except ASCII and non-ASCII
and DEL. control characters, surrogates, and codepoints unassigned by Unicode.
`alphanumeric', `alnum' `alphanumeric', `alnum'
matches letters and digits. (But at present, for multibyte characters, matches alphabetic characters and digits. (For multibyte characters,
it matches anything that has word syntax.) it matches according to Unicode character properties.)
`letter', `alphabetic', `alpha' `letter', `alphabetic', `alpha'
matches letters. (But at present, for multibyte characters, matches alphabetic characters. (For multibyte characters,
it matches anything that has word syntax.) it matches according to Unicode character properties.)
`ascii' `ascii'
matches ASCII (unibyte) characters. matches ASCII (unibyte) characters.
......
...@@ -1022,6 +1022,22 @@ decimalnump (int c) ...@@ -1022,6 +1022,22 @@ decimalnump (int c)
return gen_cat == UNICODE_CATEGORY_Nd; return gen_cat == UNICODE_CATEGORY_Nd;
} }
/* Report whether C counts as printable according to its Unicode
   general category.  C is rejected when its entry in
   Vunicode_category_table is not an integer, or when that category is
   control (Cc), surrogate (Cs), or unassigned (Cn).  */
bool
printablep (int c)
{
  Lisp_Object entry = CHAR_TABLE_REF (Vunicode_category_table, c);

  /* No integer category recorded for C: treat it as not printable.  */
  if (! INTEGERP (entry))
    return false;

  EMACS_INT general = XINT (entry);

  /* See UTS #18.  */
  if (general == UNICODE_CATEGORY_Cc) /* control */
    return false;
  if (general == UNICODE_CATEGORY_Cs) /* surrogate */
    return false;
  if (general == UNICODE_CATEGORY_Cn) /* unassigned */
    return false;

  return true;
}
void void
syms_of_character (void) syms_of_character (void)
{ {
......
...@@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object); ...@@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object);
extern bool alphabeticp (int); extern bool alphabeticp (int);
extern bool decimalnump (int); extern bool decimalnump (int);
extern bool printablep (int);
/* Return a translation table of id number ID. */ /* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \ #define GET_TRANSLATION_TABLE(id) \
......
...@@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; ...@@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
: 1) : printablep (c))
# define ISALNUM(c) (IS_REAL_ASCII (c) \ # define ISALNUM(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \ ? (((c) >= 'a' && (c) <= 'z') \
...@@ -1865,7 +1865,8 @@ struct range_table_work_area ...@@ -1865,7 +1865,8 @@ struct range_table_work_area
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
/* Bits used to implement the multibyte-part of the various character classes /* Bits used to implement the multibyte-part of the various character classes
such as [:alnum:] in a charset's range table. */ such as [:alnum:] in a charset's range table. The code currently assumes
that only the low 16 bits are used. */
#define BIT_WORD 0x1 #define BIT_WORD 0x1
#define BIT_LOWER 0x2 #define BIT_LOWER 0x2
#define BIT_PUNCT 0x4 #define BIT_PUNCT 0x4
...@@ -1874,6 +1875,7 @@ struct range_table_work_area ...@@ -1874,6 +1875,7 @@ struct range_table_work_area
#define BIT_MULTIBYTE 0x20 #define BIT_MULTIBYTE 0x20
#define BIT_ALPHA 0x40 #define BIT_ALPHA 0x40
#define BIT_ALNUM 0x80 #define BIT_ALNUM 0x80
#define BIT_PRINT 0x100
/* Set the bit for character C in a list. */ /* Set the bit for character C in a list. */
...@@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc) ...@@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc)
{ {
switch (cc) switch (cc)
{ {
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: case RECC_NONASCII: case RECC_GRAPH:
case RECC_MULTIBYTE: return BIT_MULTIBYTE; case RECC_MULTIBYTE: return BIT_MULTIBYTE;
case RECC_ALPHA: return BIT_ALPHA; case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM; case RECC_ALNUM: return BIT_ALNUM;
...@@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc) ...@@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc)
case RECC_UPPER: return BIT_UPPER; case RECC_UPPER: return BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT; case RECC_PUNCT: return BIT_PUNCT;
case RECC_SPACE: return BIT_SPACE; case RECC_SPACE: return BIT_SPACE;
case RECC_PRINT: return BIT_PRINT;
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
default: default:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment