regex-emacs.c 147 KB
Newer Older
1
/* Emacs regular expression matching and search
Karl Berry's avatar
Karl Berry committed
2

Paul Eggert's avatar
Paul Eggert committed
3
   Copyright (C) 1993-2019 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
4

5 6
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
Miles Bader's avatar
Miles Bader committed
7
   the Free Software Foundation; either version 3, or (at your option)
8 9 10 11
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Juanma Barranquero's avatar
Juanma Barranquero committed
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 14 15
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17

18
/* TODO:
19
   - structure the opcode space into opcode+flag.
20
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21
     need to modify the compiled regexp so that re_search can be reentrant.
22
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
23
     rather than at run-time, so that re_search can be reentrant.
24
*/
25

26
#include <config.h>
27

28
#include "regex-emacs.h"
29

30
#include <stdlib.h>
31

32 33 34 35
#include "character.h"
#include "buffer.h"
#include "syntax.h"
#include "category.h"
36

37
/* Maximum number of duplicates an interval can allow.  Some systems
38 39 40
   define this in other header files, but we want our value, so remove
   any previous define.  Repeat counts are stored in opcodes as 2-byte
   unsigned integers.  */
41 42
#ifdef RE_DUP_MAX
# undef RE_DUP_MAX
43
#endif
44 45
#define RE_DUP_MAX (0xffff)

46
/* Make syntax table lookup grant data in gl_state.  */
47
#define SYNTAX(c) syntax_property (c, 1)
48

49
/* Convert the pointer to the char to BEG-based offset from the start.  */
50
#define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
51
/* Strings are 0-indexed, buffers are 1-indexed; pun on the boolean
52
   result to get the right base index.  */
53
#define POS_AS_IN_BUFFER(p)                                    \
54
  ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
55

56 57 58
#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
#define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
#define RE_STRING_CHAR(p, multibyte) \
59
  (multibyte ? STRING_CHAR (p) : *(p))
60
#define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
61
  (multibyte ? STRING_CHAR_AND_LENGTH (p, len) : ((len) = 1, *(p)))
62

63
#define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
64

65
#define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
66

67 68 69
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
70
#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)			     \
71
  do {									     \
72
    if (target_multibyte)						     \
73 74
      {									     \
	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
75 76 77
	re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1);   \
	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp))		     \
	  continue;							     \
78
	c = STRING_CHAR (dtemp);					     \
79 80 81 82
      }									     \
    else								     \
      {									     \
	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
83
	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
84
      }									     \
85
  } while (false)
86

87 88
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
89
#define GET_CHAR_AFTER(c, p, len)		\
90
  do {						\
91
    if (target_multibyte)			\
92
      (c) = STRING_CHAR_AND_LENGTH (p, len);	\
93 94
    else					\
      {						\
95
	(c) = *p;				\
96
	len = 1;				\
97
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
98
      }						\
99
   } while (false)
100

101
/* 1 if C is an ASCII character.  */
102
#define IS_REAL_ASCII(c) ((c) < 0200)
103

104
/* 1 if C is a unibyte character.  */
105
#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
106

107
/* The Emacs definitions should not be directly affected by locales.  */
108

109
/* In Emacs, these are only used for single-byte characters.  */
110 111 112
#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
#define ISCNTRL(c) ((c) < ' ')
#define ISXDIGIT(c) (0 <= char_hexdigit (c))
113 114 115

/* The rest must handle multibyte characters.  */

116
#define ISBLANK(c) (IS_REAL_ASCII (c)			\
117 118 119
                     ? ((c) == ' ' || (c) == '\t')      \
                     : blankp (c))

120
#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
121
		     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)	\
122
		     : graphicp (c))
123

124
#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
125
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
126
		     : printablep (c))
127

128
#define ISALNUM(c) (IS_REAL_ASCII (c)			\
129 130 131
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
132
		    : alphanumericp (c))
133

134
#define ISALPHA(c) (IS_REAL_ASCII (c)			\
135 136
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
137
		    : alphabeticp (c))
138

139
#define ISLOWER(c) lowercasep (c)
140

141
#define ISPUNCT(c) (IS_REAL_ASCII (c)				\
142 143
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
144 145
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
146 147
		    : SYNTAX (c) != Sword)

148
#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
149

150
#define ISUPPER(c) uppercasep (c)
151

152
#define ISWORD(c) (SYNTAX (c) == Sword)
153

154
/* Use alloca instead of malloc.  This is because using malloc in
155 156
   re_search* or re_match* could cause memory leaks when C-g is used
   in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
157
   via 'record_xmalloc' which uses 'unwind_protect' to ensure the
158 159 160
   memory is freed even in case of non-local exits); also, malloc is
   slower and causes storage fragmentation.  On the other hand, malloc
   is more portable, and easier to debug.
161

162
   Because we sometimes use alloca, some routines have to be macros,
163
   not functions -- 'alloca'-allocated space disappears at the end of the
164 165
   function it is called in.  */

166 167 168
/* This may be adjusted in main(), if the stack is successfully grown.  */
ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
/* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca.  */
169 170
#define REGEX_USE_SAFE_ALLOCA					       \
  USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
171

172
/* Assumes a 'char *destination' variable.  */
173 174
#define REGEX_REALLOCATE(source, osize, nsize)				\
  (destination = SAFE_ALLOCA (nsize),					\
175
   memcpy (destination, source, osize))
176

177 178
/* True if 'size1' is non-NULL and PTR is pointing anywhere inside
   'string1' or just past its end.  This works if PTR is NULL, which is
179
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
180
#define FIRST_STRING_P(ptr)					\
181 182
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

183
#define BYTEWIDTH 8 /* In bits.  */
184

185 186 187
/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

188 189
static void re_compile_fastmap (struct re_pattern_buffer *);
static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
190 191
				     re_char *string1, ptrdiff_t size1,
				     re_char *string2, ptrdiff_t size2,
192
				     ptrdiff_t pos,
Paul Eggert's avatar
Paul Eggert committed
193
				     struct re_registers *regs,
194
				     ptrdiff_t stop);
195 196

/* These are the command codes that appear in compiled regular
197
   expressions.  Some opcodes are followed by argument bytes.  A
198 199 200 201 202 203 204
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

205
  /* Succeed right away--no more backtracking.  */
206 207
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
208
	/* Followed by one byte giving n, then by n literal bytes.  */
209 210
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
211
	/* Matches any (more or less) character.  */
212 213
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
214 215 216 217 218
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
219 220 221 222 223
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
224
		   See RANGE_TABLE_WORK_BITS below.
225
	       2 bytes, the number of pairs that follow (upto 32767)
226
	       pairs, each 2 multibyte characters,
227
		   each multibyte character represented as 3 bytes.  */
228 229
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
230
	/* Same parameters as charset, but match any character that is
231
	   not one of those specified.  */
232 233
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
234 235 236
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
237
	   field.  */
238 239
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
240 241
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
242
	   number, in the range 0 to one less than 're_nsub' in the
243
	   pattern buffer.  */
244 245
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
246
	/* Match a duplicate of something remembered. Followed by one
247
	   byte containing the register number.  */
248 249
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
250
	/* Fail unless at beginning of line.  */
251 252
  begline,

253
	/* Fail unless at end of line.  */
254 255
  endline,

256
	/* Succeeds if at beginning of buffer.  */
257 258
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
259
	/* Analogously, for end of buffer/string.  */
260
  endbuf,
261

Richard M. Stallman's avatar
Richard M. Stallman committed
262
	/* Followed by two byte relative address to which to jump.  */
263
  jump,
264

Richard M. Stallman's avatar
Richard M. Stallman committed
265
	/* Followed by two-byte relative address of place to resume at
Juanma Barranquero's avatar
Juanma Barranquero committed
266
	   in case of failure.  */
267
  on_failure_jump,
268

Richard M. Stallman's avatar
Richard M. Stallman committed
269 270
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
271
  on_failure_keep_string_jump,
272

273
	/* Just like 'on_failure_jump', except that it checks that we
274 275 276 277
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

278
	/* Just like 'on_failure_jump_loop', except that it checks for
279 280
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
281
	   by a 'no_op'.  */
282 283
  on_failure_jump_nastyloop,

284
	/* A smart 'on_failure_jump' used for greedy * and + operators.
Juanma Barranquero's avatar
Juanma Barranquero committed
285
	   It analyzes the loop before which it is put and if the
286
	   loop does not require backtracking, it changes itself to
287 288 289
	   'on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into 'on_failure_jump'.
	   It assumes that it is pointing to just past a 'jump'.  */
290
  on_failure_jump_smart,
291

Richard M. Stallman's avatar
Richard M. Stallman committed
292
	/* Followed by two-byte relative address and two-byte number n.
293 294 295
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
296 297
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
298 299
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
300 301
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
302
	/* Set the following two-byte relative address to the
Juanma Barranquero's avatar
Juanma Barranquero committed
303
	   subsequent two-byte number.  The address *includes* the two
Richard M. Stallman's avatar
Richard M. Stallman committed
304
	   bytes of number.  */
305 306 307 308 309 310
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
Juanma Barranquero's avatar
Juanma Barranquero committed
311
  notwordbound,	/* Succeeds if not at a word boundary.  */
312

313 314 315
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

316
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
317
	   a byte which contains a syntax code, e.g., Sword.  */
318 319 320
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
321
  notsyntaxspec,
322

323
  at_dot,	/* Succeeds if at point.  */
324 325

  /* Matches any character whose category-set contains the specified
Juanma Barranquero's avatar
Juanma Barranquero committed
326 327
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
328 329 330 331 332 333
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
334 335 336 337 338 339 340 341 342 343
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
344
  } while (false)
345 346 347 348 349 350 351 352 353

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
354
  } while (false)
355 356 357 358 359

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
360
  ((destination) = extract_number (source))
361

362 363
static int
extract_number (re_char *source)
364
{
365 366
  signed char leading_byte = source[1];
  return leading_byte * 256 + source[0];
367 368 369 370 371 372
}

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
373
  ((destination) = extract_number_and_incr (&source))
374

375 376
static int
extract_number_and_incr (re_char **source)
377
{
378
  int num = extract_number (*source);
379
  *source += 2;
380
  return num;
381 382
}

383 384
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Juanma Barranquero's avatar
Juanma Barranquero committed
385
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
386 387 388 389 390 391 392

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
393
  } while (false)
394 395

/* Put into DESTINATION a character stored in three contiguous bytes
Juanma Barranquero's avatar
Juanma Barranquero committed
396
   starting at SOURCE.  */
397 398 399 400 401 402

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
403
  } while (false)
404 405 406 407 408 409 410 411 412


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
413
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 (((p)[1] & 0x80) != 0)
414 415 416

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
417
   stored.  '2 +' means to skip re_opcode_t and size of bitmap,
418 419 420 421 422 423 424
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
425 426

/* Return the address of end of RANGE_TABLE.  COUNT is number of
427 428
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  '* 2'
   is start of range and end of range.  '* 3' is size of each start
429 430 431 432
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

433 434
/* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
   (if the variable regex_emacs_debug is positive).  */
435

436
#ifdef REGEX_EMACS_DEBUG
437

438
/* Use standard I/O for debugging.  */
439
# include <stdio.h>
440

441
static int regex_emacs_debug = -100000;
442

443
# define DEBUG_STATEMENT(e) e
444
# define DEBUG_PRINT(...) if (regex_emacs_debug > 0) printf (__VA_ARGS__)
445
# define DEBUG_COMPILES_ARGUMENTS
446
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
447
  if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
448
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
449
  if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
450 451 452 453


/* Print the fastmap in human-readable form.  */

454 455
static void
print_fastmap (char *fastmap)
456
{
457 458
  bool was_a_range = false;
  int i = 0;
459

460 461 462 463
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
464
	  was_a_range = false;
Richard M. Stallman's avatar
Richard M. Stallman committed
465 466 467
	  putchar (i - 1);
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
468
	      was_a_range = true;
Richard M. Stallman's avatar
Richard M. Stallman committed
469 470
	      i++;
	    }
471
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
472 473 474 475 476
	    {
	      printf ("-");
	      putchar (i - 1);
	    }
	}
477
    }
478
  putchar ('\n');
479 480 481 482 483 484
}


/* Print a compiled pattern string in human-readable form, starting at
   the START pointer into it and ending just before the pointer END.  */

485 486
static void
print_partial_compiled_pattern (re_char *start, re_char *end)
487 488
{
  int mcnt, mcnt2;
489 490
  re_char *p = start;
  re_char *pend = end;
491 492 493

  if (start == NULL)
    {
494
      fprintf (stderr, "(null)\n");
495 496
      return;
    }
497

498 499 500
  /* Loop over pattern commands.  */
  while (p < pend)
    {
501
      fprintf (stderr, "%td:\t", p - start);
502 503 504

      switch ((re_opcode_t) *p++)
	{
Richard M. Stallman's avatar
Richard M. Stallman committed
505
	case no_op:
506
	  fprintf (stderr, "/no_op");
Richard M. Stallman's avatar
Richard M. Stallman committed
507
	  break;
508

509
	case succeed:
510
	  fprintf (stderr, "/succeed");
511 512
	  break;

513 514
	case exactn:
	  mcnt = *p++;
515
	  fprintf (stderr, "/exactn/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
516
	  do
517
	    {
518
	      fprintf (stderr, "/%c", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
519 520 521
	    }
	  while (--mcnt);
	  break;
522 523

	case start_memory:
524
	  fprintf (stderr, "/start_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
525
	  break;
526 527

	case stop_memory:
528
	  fprintf (stderr, "/stop_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
529
	  break;
530 531

	case duplicate:
532
	  fprintf (stderr, "/duplicate/%d", *p++);
533 534 535
	  break;

	case anychar:
536
	  fprintf (stderr, "/anychar");
537 538 539
	  break;

	case charset:
Richard M. Stallman's avatar
Richard M. Stallman committed
540 541
	case charset_not:
	  {
542 543
	    int c, last = -100;
	    bool in_range = false;
544
	    int length = CHARSET_BITMAP_SIZE (p - 1);
545
	    bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
546

547
	    fprintf (stderr, "/charset [%s",
Kenichi Handa's avatar
Kenichi Handa committed
548
		     (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
549

Kenichi Handa's avatar
Kenichi Handa committed
550 551
	    if (p + *p >= pend)
	      fprintf (stderr, " !extends past end of pattern! ");
552

Richard M. Stallman's avatar
Richard M. Stallman committed
553
	    for (c = 0; c < 256; c++)
554
	      if (c / 8 < length
555 556 557 558 559
		  && (p[1 + (c/8)] & (1 << (c % 8))))
		{
		  /* Are we starting a range?  */
		  if (last + 1 == c && ! in_range)
		    {
560
		      fprintf (stderr, "-");
561
		      in_range = true;
562 563 564
		    }
		  /* Have we broken a range?  */
		  else if (last + 1 != c && in_range)
565
		    {
566
		      fprintf (stderr, "%c", last);
567
		      in_range = false;
568
		    }
569

570
		  if (! in_range)
571
		    fprintf (stderr, "%c", c);
572 573

		  last = c;
Richard M. Stallman's avatar
Richard M. Stallman committed
574
	      }
575 576

	    if (in_range)
577
	      fprintf (stderr, "%c", last);
578

579
	    fprintf (stderr, "]");
580

581
	    p += 1 + length;
582 583

	    if (has_range_table)
584 585
	      {
		int count;
586
		fprintf (stderr, "has-range-table");
587 588 589 590 591 592

		/* ??? Should print the range table; for now, just skip it.  */
		p += 2;		/* skip range table bits */
		EXTRACT_NUMBER_AND_INCR (count, p);
		p = CHARSET_RANGE_TABLE_END (p, count);
	      }
593 594 595 596
	  }
	  break;

	case begline:
597
	  fprintf (stderr, "/begline");
Richard M. Stallman's avatar
Richard M. Stallman committed
598
	  break;
599 600

	case endline:
601
	  fprintf (stderr, "/endline");
Richard M. Stallman's avatar
Richard M. Stallman committed
602
	  break;
603 604

	case on_failure_jump:
605 606
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
607
	  break;
608 609

	case on_failure_keep_string_jump:
610 611 612
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_keep_string_jump to %td",
		   p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
613
	  break;
614

615
	case on_failure_jump_nastyloop:
616 617 618
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_nastyloop to %td",
		   p + mcnt - start);
619 620
	  break;

621
	case on_failure_jump_loop:
622 623 624
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_loop to %td",
		   p + mcnt - start);
625 626
	  break;

627
	case on_failure_jump_smart:
628 629 630
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_smart to %td",
		   p + mcnt - start);
631 632
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
633
	case jump:
634 635
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/jump to %td", p + mcnt - start);
636 637
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
638
	case succeed_n:
639 640 641 642
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/succeed_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
643
	  break;
644

Richard M. Stallman's avatar
Richard M. Stallman committed
645
	case jump_n:
646 647 648 649
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/jump_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
650
	  break;
651

Richard M. Stallman's avatar
Richard M. Stallman committed
652
	case set_number_at:
653 654 655 656
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/set_number_at location %td to %d",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
657
	  break;
658

Richard M. Stallman's avatar
Richard M. Stallman committed
659
	case wordbound:
660
	  fprintf (stderr, "/wordbound");
661 662 663
	  break;

	case notwordbound:
664
	  fprintf (stderr, "/notwordbound");
Richard M. Stallman's avatar
Richard M. Stallman committed
665
	  break;
666 667

	case wordbeg:
668
	  fprintf (stderr, "/wordbeg");
669
	  break;
670

671
	case wordend:
672
	  fprintf (stderr, "/wordend");
673
	  break;
674

675
	case symbeg:
676
	  fprintf (stderr, "/symbeg");
677 678 679
	  break;

	case symend:
680
	  fprintf (stderr, "/symend");
681
	  break;
682

683
	case syntaxspec:
684
	  fprintf (stderr, "/syntaxspec");
685
	  mcnt = *p++;
686
	  fprintf (stderr, "/%d", mcnt);
687 688 689
	  break;

	case notsyntaxspec:
690
	  fprintf (stderr, "/notsyntaxspec");
691
	  mcnt = *p++;
692
	  fprintf (stderr, "/%d", mcnt);
693 694
	  break;

695
	case at_dot:
696
	  fprintf (stderr, "/at_dot");
Richard M. Stallman's avatar
Richard M. Stallman committed
697
	  break;
698

699
	case categoryspec:
700
	  fprintf (stderr, "/categoryspec");
701
	  mcnt = *p++;
702
	  fprintf (stderr, "/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
703
	  break;
704

705
	case notcategoryspec:
706
	  fprintf (stderr, "/notcategoryspec");
707
	  mcnt = *p++;
708
	  fprintf (stderr, "/%d", mcnt);
709 710 711
	  break;

	case begbuf:
712
	  fprintf (stderr, "/begbuf");
Richard M. Stallman's avatar
Richard M. Stallman committed
713
	  break;
714 715

	case endbuf:
716
	  fprintf (stderr, "/endbuf");
Richard M. Stallman's avatar
Richard M. Stallman committed
717
	  break;
718

Richard M. Stallman's avatar
Richard M. Stallman committed
719
	default:
720
	  fprintf (stderr, "?%d", *(p-1));
721 722
	}

723
      fprintf (stderr, "\n");
724 725
    }

726
  fprintf (stderr, "%td:\tend of pattern.\n", p - start);
727 728 729
}


730 731
static void
print_compiled_pattern (struct re_pattern_buffer *bufp)
732
{
733
  re_char *buffer = bufp->buffer;
734 735

  print_partial_compiled_pattern (buffer, buffer + bufp->used);
736
  printf ("%tu bytes used/%tu bytes allocated.\n",
737
	  bufp->used, bufp->allocated);
738 739 740 741 742 743 744

  if (bufp->fastmap_accurate && bufp->fastmap)
    {
      printf ("fastmap: ");
      print_fastmap (bufp->fastmap);
    }

745
  printf ("re_nsub: %tu\t", bufp->re_nsub);
746 747
  printf ("regs_alloc: %d\t", bufp->regs_allocated);
  printf ("can_be_null: %d\t", bufp->can_be_null);
748
  fflush (stdout);
749 750 751 752
  /* Perhaps we should print the translate table?  */
}


753
static void
754 755
print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
		     re_char *string2, ptrdiff_t size2)
756 757 758 759 760 761
{
  if (where == NULL)
    printf ("(null)");
  else
    {
      if (FIRST_STRING_P (where))
Richard M. Stallman's avatar
Richard M. Stallman committed
762
	{
763
	  fwrite_unlocked (where, 1, string1 + size1 - where, stdout);
Richard M. Stallman's avatar
Richard M. Stallman committed
764 765
	  where = string2;
	}
766

767
      fwrite_unlocked (where, 1, string2 + size2 - where, stdout);
768 769 770
    }
}

771
#else /* not REGEX_EMACS_DEBUG */
772

773
# define DEBUG_STATEMENT(e)
Paul Eggert's avatar
Paul Eggert committed
774
# define DEBUG_PRINT(...)
775 776
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
777

778
#endif /* not REGEX_EMACS_DEBUG */
779

780
typedef enum
781
{
782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807
  REG_NOERROR = 0,	/* Success.  */
  REG_NOMATCH,		/* Didn't find a match (for regexec).  */

  /* POSIX regcomp return error codes.  (In the order listed in the
     standard.)  An older version of this code supported the POSIX
     API; this version continues to use these names internally.  */
  REG_BADPAT,		/* Invalid pattern.  */
  REG_ECOLLATE,		/* Not implemented.  */
  REG_ECTYPE,		/* Invalid character class name.  */
  REG_EESCAPE,		/* Trailing backslash.  */
  REG_ESUBREG,		/* Invalid back reference.  */
  REG_EBRACK,		/* Unmatched left bracket.  */
  REG_EPAREN,		/* Parenthesis imbalance.  */
  REG_EBRACE,		/* Unmatched \{.  */
  REG_BADBR,		/* Invalid contents of \{\}.  */
  REG_ERANGE,		/* Invalid range end.  */
  REG_ESPACE,		/* Ran out of memory.  */
  REG_BADRPT,		/* No preceding re for repetition op.  */

  /* Error codes we've added.  */
  REG_EEND,		/* Premature end.  */
  REG_ESIZE,		/* Compiled pattern bigger than 2^16 bytes.  */
  REG_ERPAREN,		/* Unmatched ) or \); not returned from regcomp.  */
  REG_ERANGEX,		/* Range striding over charsets.  */
  REG_ESIZEBR           /* n or m too big in \{n,m\} */
} reg_errcode_t;
808 809

static const char *re_error_msgid[] =
810
  {
811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829
   [REG_NOERROR] = "Success",
   [REG_NOMATCH] = "No match",
   [REG_BADPAT] = "Invalid regular expression",
   [REG_ECOLLATE] = "Invalid collation character",
   [REG_ECTYPE] = "Invalid character class name",
   [REG_EESCAPE] = "Trailing backslash",
   [REG_ESUBREG] = "Invalid back reference",
   [REG_EBRACK] = "Unmatched [ or [^",
   [REG_EPAREN] = "Unmatched ( or \\(",
   [REG_EBRACE] = "Unmatched \\{",
   [REG_BADBR] = "Invalid content of \\{\\}",
   [REG_ERANGE] = "Invalid range end",
   [REG_ESPACE] = "Memory exhausted",
   [REG_BADRPT] = "Invalid preceding regular expression",
   [REG_EEND] = "Premature end of regular expression",
   [REG_ESIZE] = "Regular expression too big",
   [REG_ERPAREN] = "Unmatched ) or \\)",
   [REG_ERANGEX ] = "Range striding over charsets",
   [REG_ESIZEBR ] = "Invalid content of \\{\\}",
830 831
  };

832 833
/* For 'regs_allocated'.  */
enum { REGS_UNALLOCATED, REGS_REALLOCATE, REGS_FIXED };
834

835 836
/* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
   're_match_2' returns information about at least this many registers
837
   the first time a 'regs' structure is passed.  */
838
enum { RE_NREGS = 30 };
839

840 841 842 843 844
/* The searching and matching functions allocate memory for the
   failure stack and registers.  Otherwise searching and matching
   routines would have much smaller memory resources at their
   disposal, and therefore might fail to handle complex regexps.  */

845 846
/* Failure stack declarations and macros; both re_compile_fastmap and
   re_match_2 use a failure stack.  These have to be macros because of
847
   SAFE_ALLOCA.  */
848

849

850
/* Approximate number of failure points for which to initially allocate space
851 852
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  */
853
#define INIT_FAILURE_ALLOC 20
854 855

/* Roughly the maximum number of failure points on the stack.  Would be
856
   exactly that if failure always used TYPICAL_FAILURE_SIZE items.
857
   This is a variable only so users of regex can assign to it; we never
858 859
   change it ourselves.  We always multiply it by TYPICAL_FAILURE_SIZE
   before using it, so it should probably be a byte-count instead.  */
860
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
861 862 863
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
864
ptrdiff_t emacs_re_max_failures = 40000;
865 866 867

union fail_stack_elt
{
868
  re_char *pointer;
869
  intptr_t integer;
870 871 872 873 874 875 876
};

typedef union fail_stack_elt fail_stack_elt_t;

typedef struct
{
  fail_stack_elt_t *stack;
877 878 879
  ptrdiff_t size;
  ptrdiff_t avail;	/* Offset of next open position.  */
  ptrdiff_t frame;	/* Offset of the cur constructed frame.  */
880 881
} fail_stack_type;

882
#define FAIL_STACK_EMPTY()     (fail_stack.frame == 0)
883 884


885
/* Define macros to initialize and free the failure stack.  */
886

887
#define INIT_FAIL_STACK()						\
888
  do {									\
889
    fail_stack.stack =							\
890 891
      SAFE_ALLOCA (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE		\
		   * sizeof (fail_stack_elt_t));			\
892 893
    fail_stack.size = INIT_FAILURE_ALLOC;				\
    fail_stack.avail = 0;						\
894
    fail_stack.frame = 0;						\
895
  } while (false)
896 897


898
/* Double the size of FAIL_STACK, up to a limit
899
   which allows approximately 'emacs_re_max_failures' items.
900 901

   Return 1 if succeeds, and 0 if either ran out of memory
902 903
   allocating space for it or it was already too large.

904
   REGEX_REALLOCATE requires 'destination' be declared.   */
Roland McGrath's avatar