Commit 4538a5e3 authored by Michal Nazarewicz

Refactor regex character class parsing in [:name:]

The re_wctype function is used in three separate places, and in all of
those places it is surrounded by almost identical code that extracts
the name from [:name:].  Furthermore, re_wctype requires a
NUL-terminated string, so the name of the character class is copied to
a temporary buffer.

The code duplication and unnecessary memory copying can be avoided by
pushing the responsibility of parsing the whole [:name:] sequence to
the function.

Furthermore, because the function now does the parsing itself, it knows
the length of the character class name.  It can use that length to skip
some string comparisons entirely and to use a constant-length memcmp
instead of strcmp, which has to check for NUL bytes.

* src/regex.c (re_wctype): Delete function.  Replace it with:
(re_wctype_parse): New function which parses a whole [:name:] string
and returns a RECC_* constant or -1 if the string is not of [:name:]
format.
(regex_compile): Use re_wctype_parse.
* src/syntax.c (skip_chars): Use re_wctype_parse.
parent e7257061
This diff is collapsed.
......@@ -585,25 +585,13 @@ extern void regfree (regex_t *__preg);
/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
# include <wchar.h>
# include <wctype.h>
#endif
#if WIDE_CHAR_SUPPORT
/* The GNU C library provides support for user-defined character classes
and the functions from ISO C amendment 1. */
# ifdef CHARCLASS_NAME_MAX
# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
# else
/* This shouldn't happen but some implementation might still have this
problem. Use a reasonable default value. */
# define CHAR_CLASS_MAX_LENGTH 256
# endif
typedef wctype_t re_wctype_t;
typedef wchar_t re_wchar_t;
# define re_wctype wctype
# define re_iswctype iswctype
# define re_wctype_to_bit(cc) 0
#else
# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
# ifndef emacs
# define btowc(c) c
# endif
......@@ -621,7 +609,7 @@ typedef enum { RECC_ERROR = 0,
} re_wctype_t;
extern char re_iswctype (int ch, re_wctype_t cc);
extern re_wctype_t re_wctype (const unsigned char* str);
extern re_wctype_t re_wctype_parse (const unsigned char **strp, unsigned limit);
typedef int re_wchar_t;
......
......@@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
/* At first setup fastmap. */
while (i_byte < size_byte)
{
c = str[i_byte++];
if (handle_iso_classes && c == '['
&& i_byte < size_byte
&& str[i_byte] == ':')
if (handle_iso_classes)
{
const unsigned char *class_beg = str + i_byte + 1;
const unsigned char *class_end = class_beg;
const unsigned char *class_limit = str + size_byte - 2;
/* Leave room for the null. */
unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
re_wctype_t cc;
if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
while (class_end < class_limit
&& *class_end >= 'a' && *class_end <= 'z')
class_end++;
if (class_end == class_beg
|| *class_end != ':' || class_end[1] != ']')
goto not_a_class_name;
memcpy (class_name, class_beg, class_end - class_beg);
class_name[class_end - class_beg] = 0;
cc = re_wctype (class_name);
const unsigned char *ch = str + i_byte;
re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
if (cc == 0)
error ("Invalid ISO C character class");
iso_classes = Fcons (make_number (cc), iso_classes);
i_byte = class_end + 2 - str;
continue;
if (cc != -1)
{
iso_classes = Fcons (make_number (cc), iso_classes);
i_byte = ch - str;
continue;
}
}
not_a_class_name:
c = str[i_byte++];
if (c == '\\')
{
if (i_byte == size_byte)
......@@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim,
while (i_byte < size_byte)
{
int leading_code = str[i_byte];
c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
i_byte += len;
if (handle_iso_classes && c == '['
&& i_byte < size_byte
&& STRING_CHAR (str + i_byte) == ':')
if (handle_iso_classes)
{
const unsigned char *class_beg = str + i_byte + 1;
const unsigned char *class_end = class_beg;
const unsigned char *class_limit = str + size_byte - 2;
/* Leave room for the null. */
unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1];
re_wctype_t cc;
if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH)
class_limit = class_beg + CHAR_CLASS_MAX_LENGTH;
while (class_end < class_limit
&& *class_end >= 'a' && *class_end <= 'z')
class_end++;
if (class_end == class_beg
|| *class_end != ':' || class_end[1] != ']')
goto not_a_class_name_multibyte;
memcpy (class_name, class_beg, class_end - class_beg);
class_name[class_end - class_beg] = 0;
cc = re_wctype (class_name);
const unsigned char *ch = str + i_byte;
re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte);
if (cc == 0)
error ("Invalid ISO C character class");
iso_classes = Fcons (make_number (cc), iso_classes);
i_byte = class_end + 2 - str;
continue;
if (cc != -1)
{
iso_classes = Fcons (make_number (cc), iso_classes);
i_byte = ch - str;
continue;
}
}
not_a_class_name_multibyte:
if (c == '\\')
if (leading_code== '\\')
{
if (i_byte == size_byte)
if (++i_byte == size_byte)
break;
leading_code = str[i_byte];
c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
i_byte += len;
}
c = STRING_CHAR_AND_LENGTH (str + i_byte, len);
i_byte += len;
/* Treat `-' as range character only if another character
follows. */
if (i_byte + 1 < size_byte
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment