Commit de7d5f36 authored by Philipp Stephani's avatar Philipp Stephani Committed by Paul Eggert

Implement named character escapes, similar to Perl

* lread.c (init_character_names): New function.
(read_escape): Read Perl-style named character escape sequences.
(syms_of_lread): Initialize new variable 'character_names'.
* test/src/lread-tests.el (lread-char-empty-name): Add test file
for src/lread.c.
parent 7621a521
......@@ -43,6 +43,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
#include "systime.h"
#include "termhooks.h"
#include "blockinput.h"
#include <c-ctype.h>
#ifdef MSDOS
#include "msdos.h"
......@@ -2149,6 +2150,36 @@ grow_read_buffer (void)
MAX_MULTIBYTE_LENGTH, -1, 1);
}
/* Hash table that maps Unicode character names to code points. */
static Lisp_Object character_names;
/* Length of the longest Unicode character name, in bytes. */
static ptrdiff_t max_character_name_length;
/* Initializes `character_names' and `max_character_name_length'.
Called by `read_escape'. */
void init_character_names ()
{
character_names = CALLN (Fmake_hash_table,
QCtest, Qequal,
/* Currently around 100,000 Unicode
characters are defined. */
QCsize, make_natnum (100000));
const Lisp_Object get_property =
Fsymbol_function (intern_c_string ("get-char-code-property"));
ptrdiff_t length = 0;
for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
{
const Lisp_Object code = make_natnum (i);
const Lisp_Object name = call2 (get_property, code, Qname);
if (NILP (name)) continue;
CHECK_STRING (name);
length = max (length, SBYTES (name));
Fputhash (name, code, character_names);
}
max_character_name_length = length;
}
/* Read a \-escape sequence, assuming we already read the `\'.
If the escape sequence forces unibyte, return eight-bit char. */
......@@ -2356,6 +2387,68 @@ read_escape (Lisp_Object readcharfun, bool stringp)
return i;
}
case 'N':
/* Named character. */
{
c = READCHAR;
if (c != '{')
invalid_syntax ("Expected opening brace after \\N");
if (NILP (character_names))
init_character_names ();
USE_SAFE_ALLOCA;
char *name = SAFE_ALLOCA (max_character_name_length + 1);
bool whitespace = false;
ptrdiff_t length = 0;
while (true)
{
c = READCHAR;
if (c < 0)
end_of_file_error ();
if (c == '}')
break;
if (! c_isascii (c))
xsignal1 (Qinvalid_read_syntax,
CALLN (Fformat,
build_pure_c_string ("Non-ASCII character U+%04X"
" in character name"),
make_natnum (c)));
/* We treat multiple adjacent whitespace characters as a
single space character. This makes it easier to use
character names in e.g. multi-line strings. */
if (c_isspace (c))
{
if (! whitespace)
{
whitespace = true;
name[length++] = ' ';
}
}
else
{
whitespace = false;
name[length++] = c;
}
if (length >= max_character_name_length)
invalid_syntax ("Character name too long");
}
if (length == 0)
invalid_syntax ("Empty character name");
name[length] = 0;
const Lisp_Object lisp_name = make_unibyte_string (name, length);
const Lisp_Object code =
(length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
/* Code point as U+N, where N is between 1 and 8 hexadecimal
digits. */
string_to_number (name + 2, 16, false) :
Fgethash (lisp_name, character_names, Qnil);
SAFE_FREE ();
if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
xsignal1 (Qinvalid_read_syntax,
CALLN (Fformat,
build_pure_c_string ("\\N{%s}"), lisp_name));
return XINT (code);
}
default:
return c;
}
......@@ -4744,4 +4837,7 @@ that are loaded before your customizations are read! */);
DEFSYM (Qweakness, "weakness");
DEFSYM (Qrehash_size, "rehash-size");
DEFSYM (Qrehash_threshold, "rehash-threshold");
character_names = Qnil;
staticpro (&character_names);
}
;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*-
;; Copyright (C) 2016 Google Inc.
;; Author: Philipp Stephani <phst@google.com>
;; This file is part of GNU Emacs.
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; Unit tests for code in src/lread.c.
;;; Code:
(ert-deftest lread-char-number ()
(should (equal ?\N{U+A817} #xA817)))
(ert-deftest lread-char-name ()
(should (equal ?\N{SYLOTI NAGRI LETTER
DHO}
#xA817)))
(ert-deftest lread-char-invalid-number ()
(should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax))
(ert-deftest lread-char-invalid-name ()
(should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax)
(ert-deftest lread-char-non-ascii-name ()
(should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")) 'invalid-read-syntax)
(ert-deftest lread-char-empty-name ()
(should-error (read "?\\N{}")) 'invalid-read-syntax)
(ert-deftest lread-string-char-number ()
(should (equal "a\N{U+A817}b" "a\uA817b")))
(ert-deftest lread-string-char-name ()
(should (equal "a\N{SYLOTI NAGRI LETTER DHO}b" "a\uA817b")))
;;; lread-tests.el ends here
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment