regex-emacs.c 149 KB
Newer Older
1
/* Emacs regular expression matching and search
Karl Berry's avatar
Karl Berry committed
2

Paul Eggert's avatar
Paul Eggert committed
3
   Copyright (C) 1993-2020 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
4

5 6
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
Miles Bader's avatar
Miles Bader committed
7
   the Free Software Foundation; either version 3, or (at your option)
8 9 10 11
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Juanma Barranquero's avatar
Juanma Barranquero committed
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 14 15
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17

18
/* TODO:
19
   - structure the opcode space into opcode+flag.
20
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21
     need to modify the compiled regexp so that re_search can be reentrant.
22
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
23
     rather than at run-time, so that re_search can be reentrant.
24
*/
25

26
#include <config.h>
27

28
#include "regex-emacs.h"
29

30
#include <stdlib.h>
31

32 33 34 35
#include "character.h"
#include "buffer.h"
#include "syntax.h"
#include "category.h"
36

37
/* Maximum number of duplicates an interval can allow.  Some systems
38 39 40
   define this in other header files, but we want our value, so remove
   any previous define.  Repeat counts are stored in opcodes as 2-byte
   unsigned integers.  */
41 42
#ifdef RE_DUP_MAX
# undef RE_DUP_MAX
43
#endif
44 45
#define RE_DUP_MAX (0xffff)

46
/* Make syntax table lookup grant data in gl_state.  */
47
#define SYNTAX(c) syntax_property (c, 1)
48

49
/* Convert the pointer to the char to BEG-based offset from the start.  */
50
#define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
51
/* Strings are 0-indexed, buffers are 1-indexed; pun on the boolean
52
   result to get the right base index.  */
53
#define POS_AS_IN_BUFFER(p)                                    \
54
  ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
55

56 57 58
#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
#define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
#define RE_STRING_CHAR(p, multibyte) \
59
  (multibyte ? STRING_CHAR (p) : *(p))
60
#define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
61
  (multibyte ? string_char_and_length (p, &(len)) : ((len) = 1, *(p)))
62

63
#define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
64

65
#define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
66

67 68 69
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
70
#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)			     \
71
  do {									     \
72
    if (target_multibyte)						     \
73 74
      {									     \
	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
75 76 77
	re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1);   \
	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp))		     \
	  continue;							     \
78
	c = STRING_CHAR (dtemp);					     \
79 80 81 82
      }									     \
    else								     \
      {									     \
	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
83
	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
84
      }									     \
85
  } while (false)
86

87 88
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
89
#define GET_CHAR_AFTER(c, p, len)		\
90
  do {						\
91
    if (target_multibyte)			\
92
      (c) = string_char_and_length (p, &(len));	\
93 94
    else					\
      {						\
95
	(c) = *p;				\
96
	len = 1;				\
97
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
98
      }						\
99
   } while (false)
100

101
/* 1 if C is an ASCII character.  */
102
#define IS_REAL_ASCII(c) ((c) < 0200)
103

104
/* 1 if C is a unibyte character.  */
105
#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
106

107
/* The Emacs definitions should not be directly affected by locales.  */
108

109
/* In Emacs, these are only used for single-byte characters.  */
110 111 112
#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
#define ISCNTRL(c) ((c) < ' ')
#define ISXDIGIT(c) (0 <= char_hexdigit (c))
113 114 115

/* The rest must handle multibyte characters.  */

116
#define ISBLANK(c) (IS_REAL_ASCII (c)			\
117 118 119
                     ? ((c) == ' ' || (c) == '\t')      \
                     : blankp (c))

120
#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
121
		     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)	\
122
		     : graphicp (c))
123

124
#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
125
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
126
		     : printablep (c))
127

128
#define ISALNUM(c) (IS_REAL_ASCII (c)			\
129 130 131
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
132
		    : alphanumericp (c))
133

134
#define ISALPHA(c) (IS_REAL_ASCII (c)			\
135 136
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
137
		    : alphabeticp (c))
138

139
#define ISLOWER(c) lowercasep (c)
140

141
#define ISPUNCT(c) (IS_REAL_ASCII (c)				\
142 143
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
144 145
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
146 147
		    : SYNTAX (c) != Sword)

148
#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
149

150
#define ISUPPER(c) uppercasep (c)
151

152
#define ISWORD(c) (SYNTAX (c) == Sword)
153

154
/* Use alloca instead of malloc.  This is because using malloc in
155 156
   re_search* or re_match* could cause memory leaks when C-g is used
   in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
157
   via 'record_xmalloc' which uses 'unwind_protect' to ensure the
158 159 160
   memory is freed even in case of non-local exits); also, malloc is
   slower and causes storage fragmentation.  On the other hand, malloc
   is more portable, and easier to debug.
161

162
   Because we sometimes use alloca, some routines have to be macros,
163
   not functions -- 'alloca'-allocated space disappears at the end of the
164 165
   function it is called in.  */

166 167 168
/* This may be adjusted in main(), if the stack is successfully grown.  */
ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
/* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca.  */
169 170
#define REGEX_USE_SAFE_ALLOCA					       \
  USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
171

172
/* Assumes a 'char *destination' variable.  */
173 174
#define REGEX_REALLOCATE(source, osize, nsize)				\
  (destination = SAFE_ALLOCA (nsize),					\
175
   memcpy (destination, source, osize))
176

177 178
/* True if 'size1' is non-NULL and PTR is pointing anywhere inside
   'string1' or just past its end.  This works if PTR is NULL, which is
179
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
180
#define FIRST_STRING_P(ptr)					\
181 182
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

183
#define BYTEWIDTH 8 /* In bits.  */
184

185 186 187
/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

188 189
static void re_compile_fastmap (struct re_pattern_buffer *);
static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
190 191
				     re_char *string1, ptrdiff_t size1,
				     re_char *string2, ptrdiff_t size2,
192
				     ptrdiff_t pos,
Paul Eggert's avatar
Paul Eggert committed
193
				     struct re_registers *regs,
194
				     ptrdiff_t stop);
195 196

/* These are the command codes that appear in compiled regular
197
   expressions.  Some opcodes are followed by argument bytes.  A
198 199 200 201 202 203 204
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

205
  /* Succeed right away--no more backtracking.  */
206 207
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
208
	/* Followed by one byte giving n, then by n literal bytes.  */
209 210
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
211
	/* Matches any (more or less) character.  */
212 213
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
214 215 216 217 218
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
219 220 221 222 223
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
224
		   See RANGE_TABLE_WORK_BITS below.
225
	       2 bytes, the number of pairs that follow (upto 32767)
226
	       pairs, each 2 multibyte characters,
227
		   each multibyte character represented as 3 bytes.  */
228 229
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
230
	/* Same parameters as charset, but match any character that is
231
	   not one of those specified.  */
232 233
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
234 235 236
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
237
	   field.  */
238 239
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
240 241
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
242
	   number, in the range 0 to one less than 're_nsub' in the
243
	   pattern buffer.  */
244 245
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
246
	/* Match a duplicate of something remembered. Followed by one
247
	   byte containing the register number.  */
248 249
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
250
	/* Fail unless at beginning of line.  */
251 252
  begline,

253
	/* Fail unless at end of line.  */
254 255
  endline,

256
	/* Succeeds if at beginning of buffer.  */
257 258
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
259
	/* Analogously, for end of buffer/string.  */
260
  endbuf,
261

Richard M. Stallman's avatar
Richard M. Stallman committed
262
	/* Followed by two byte relative address to which to jump.  */
263
  jump,
264

Richard M. Stallman's avatar
Richard M. Stallman committed
265
	/* Followed by two-byte relative address of place to resume at
Juanma Barranquero's avatar
Juanma Barranquero committed
266
	   in case of failure.  */
267
  on_failure_jump,
268

Richard M. Stallman's avatar
Richard M. Stallman committed
269 270
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
271
  on_failure_keep_string_jump,
272

273
	/* Just like 'on_failure_jump', except that it checks that we
274 275 276 277
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

278
	/* Just like 'on_failure_jump_loop', except that it checks for
279 280
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
281
	   by a 'no_op'.  */
282 283
  on_failure_jump_nastyloop,

284
	/* A smart 'on_failure_jump' used for greedy * and + operators.
Juanma Barranquero's avatar
Juanma Barranquero committed
285
	   It analyzes the loop before which it is put and if the
286
	   loop does not require backtracking, it changes itself to
287 288 289
	   'on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into 'on_failure_jump'.
	   It assumes that it is pointing to just past a 'jump'.  */
290
  on_failure_jump_smart,
291

Richard M. Stallman's avatar
Richard M. Stallman committed
292
	/* Followed by two-byte relative address and two-byte number n.
293 294 295
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
296 297
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
298 299
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
300 301
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
302
	/* Set the following two-byte relative address to the
Juanma Barranquero's avatar
Juanma Barranquero committed
303
	   subsequent two-byte number.  The address *includes* the two
Richard M. Stallman's avatar
Richard M. Stallman committed
304
	   bytes of number.  */
305 306 307 308 309 310
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
Juanma Barranquero's avatar
Juanma Barranquero committed
311
  notwordbound,	/* Succeeds if not at a word boundary.  */
312

313 314 315
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

316
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
317
	   a byte which contains a syntax code, e.g., Sword.  */
318 319 320
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
321
  notsyntaxspec,
322

323
  at_dot,	/* Succeeds if at point.  */
324 325

  /* Matches any character whose category-set contains the specified
Juanma Barranquero's avatar
Juanma Barranquero committed
326 327
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
328 329 330 331 332 333
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
334 335 336 337 338 339 340 341 342 343
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
344
  } while (false)
345 346 347 348 349 350 351 352 353

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
354
  } while (false)
355 356 357 358 359

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
360
  ((destination) = extract_number (source))
361

362 363
static int
extract_number (re_char *source)
364
{
365 366
  signed char leading_byte = source[1];
  return leading_byte * 256 + source[0];
367 368 369 370 371 372
}

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
373
  ((destination) = extract_number_and_incr (&source))
374

375 376
static int
extract_number_and_incr (re_char **source)
377
{
378
  int num = extract_number (*source);
379
  *source += 2;
380
  return num;
381 382
}

383 384
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Juanma Barranquero's avatar
Juanma Barranquero committed
385
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
386 387 388 389 390 391 392

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
393
  } while (false)
394 395

/* Put into DESTINATION a character stored in three contiguous bytes
Juanma Barranquero's avatar
Juanma Barranquero committed
396
   starting at SOURCE.  */
397 398 399 400 401 402

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
403
  } while (false)
404 405 406 407 408 409 410 411 412


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
413
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 (((p)[1] & 0x80) != 0)
414 415 416

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
417
   stored.  '2 +' means to skip re_opcode_t and size of bitmap,
418 419 420 421 422 423 424
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
425 426

/* Return the address of end of RANGE_TABLE.  COUNT is number of
427 428
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  '* 2'
   is start of range and end of range.  '* 3' is size of each start
429 430 431 432
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

433 434
/* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
   (if the variable regex_emacs_debug is positive).  */
435

436
#ifdef REGEX_EMACS_DEBUG
437

438
/* Use standard I/O for debugging.  */
439
# include "sysstdio.h"
440

441
static int regex_emacs_debug = -100000;
442

443
# define DEBUG_STATEMENT(e) e
444 445
# define DEBUG_PRINT(...)                                       \
  if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
446
# define DEBUG_COMPILES_ARGUMENTS
447
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
448
  if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
449
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
450
  if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
451

452 453 454 455
static void
debug_putchar (int c)
{
  if (c >= 32 && c <= 126)
456
    putc (c, stderr);
457
  else
458 459 460 461
    {
      unsigned int uc = c;
      fprintf (stderr, "{%02x}", uc);
    }
462
}
463 464 465

/* Print the fastmap in human-readable form.  */

466 467
static void
print_fastmap (char *fastmap)
468
{
469 470
  bool was_a_range = false;
  int i = 0;
471

472 473 474 475
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
476
	  was_a_range = false;
477
	  debug_putchar (i - 1);
Richard M. Stallman's avatar
Richard M. Stallman committed
478 479
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
480
	      was_a_range = true;
Richard M. Stallman's avatar
Richard M. Stallman committed
481 482
	      i++;
	    }
483
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
484
	    {
485
	      debug_putchar ('-');
486
	      debug_putchar (i - 1);
Richard M. Stallman's avatar
Richard M. Stallman committed
487 488
	    }
	}
489
    }
490
  putc ('\n', stderr);
491 492 493 494 495 496
}


/* Print a compiled pattern string in human-readable form, starting at
   the START pointer into it and ending just before the pointer END.  */

497 498
static void
print_partial_compiled_pattern (re_char *start, re_char *end)
499 500
{
  int mcnt, mcnt2;
501 502
  re_char *p = start;
  re_char *pend = end;
503 504 505

  if (start == NULL)
    {
506
      fputs ("(null)\n", stderr);
507 508
      return;
    }
509

510 511 512
  /* Loop over pattern commands.  */
  while (p < pend)
    {
513
      fprintf (stderr, "%td:\t", p - start);
514 515 516

      switch ((re_opcode_t) *p++)
	{
Richard M. Stallman's avatar
Richard M. Stallman committed
517
	case no_op:
518
	  fputs ("/no_op", stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
519
	  break;
520

521
	case succeed:
522
	  fputs ("/succeed", stderr);
523 524
	  break;

525 526
	case exactn:
	  mcnt = *p++;
527
	  fprintf (stderr, "/exactn/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
528
	  do
529
	    {
530
	      debug_putchar ('/');
531
	      debug_putchar (*p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
532 533 534
	    }
	  while (--mcnt);
	  break;
535 536

	case start_memory:
537
	  fprintf (stderr, "/start_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
538
	  break;
539 540

	case stop_memory:
541
	  fprintf (stderr, "/stop_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
542
	  break;
543 544

	case duplicate:
545
	  fprintf (stderr, "/duplicate/%d", *p++);
546 547 548
	  break;

	case anychar:
549
	  fputs ("/anychar", stderr);
550 551 552
	  break;

	case charset:
Richard M. Stallman's avatar
Richard M. Stallman committed
553 554
	case charset_not:
	  {
555 556
	    int c, last = -100;
	    bool in_range = false;
557
	    int length = CHARSET_BITMAP_SIZE (p - 1);
558
	    bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
559

560
	    fprintf (stderr, "/charset [%s",
Kenichi Handa's avatar
Kenichi Handa committed
561
		     (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
562

Kenichi Handa's avatar
Kenichi Handa committed
563
	    if (p + *p >= pend)
564
	      fputs (" !extends past end of pattern! ", stderr);
565

Richard M. Stallman's avatar
Richard M. Stallman committed
566
	    for (c = 0; c < 256; c++)
567
	      if (c / 8 < length
568 569 570 571 572
		  && (p[1 + (c/8)] & (1 << (c % 8))))
		{
		  /* Are we starting a range?  */
		  if (last + 1 == c && ! in_range)
		    {
573
		      debug_putchar ('-');
574
		      in_range = true;
575 576 577
		    }
		  /* Have we broken a range?  */
		  else if (last + 1 != c && in_range)
578
		    {
579
		      debug_putchar (last);
580
		      in_range = false;
581
		    }
582

583
		  if (! in_range)
584
		    debug_putchar (c);
585 586

		  last = c;
Richard M. Stallman's avatar
Richard M. Stallman committed
587
	      }
588 589

	    if (in_range)
590
	      debug_putchar (last);
591

592
	    debug_putchar (']');
593

594
	    p += 1 + length;
595 596

	    if (has_range_table)
597 598
	      {
		int count;
599
		fputs ("has-range-table", stderr);
600 601 602 603 604 605

		/* ??? Should print the range table; for now, just skip it.  */
		p += 2;		/* skip range table bits */
		EXTRACT_NUMBER_AND_INCR (count, p);
		p = CHARSET_RANGE_TABLE_END (p, count);
	      }
606 607 608 609
	  }
	  break;

	case begline:
610
	  fputs ("/begline", stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
611
	  break;
612 613

	case endline:
614
	  fputs ("/endline", stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
615
	  break;
616 617

	case on_failure_jump:
618 619
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
620
	  break;
621 622

	case on_failure_keep_string_jump:
623 624 625
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_keep_string_jump to %td",
		   p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
626
	  break;
627

628
	case on_failure_jump_nastyloop:
629 630 631
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_nastyloop to %td",
		   p + mcnt - start);
632 633
	  break;

634
	case on_failure_jump_loop:
635 636 637
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_loop to %td",
		   p + mcnt - start);
638 639
	  break;

640
	case on_failure_jump_smart:
641 642 643
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_smart to %td",
		   p + mcnt - start);
644 645
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
646
	case jump:
647 648
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/jump to %td", p + mcnt - start);
649 650
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
651
	case succeed_n:
652 653 654 655
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/succeed_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
656
	  break;
657

Richard M. Stallman's avatar
Richard M. Stallman committed
658
	case jump_n:
659 660 661 662
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/jump_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
663
	  break;
664

Richard M. Stallman's avatar
Richard M. Stallman committed
665
	case set_number_at:
666 667 668 669
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/set_number_at location %td to %d",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
670
	  break;
671

Richard M. Stallman's avatar
Richard M. Stallman committed
672
	case wordbound:
673
	  fputs ("/wordbound", stderr);
674 675 676
	  break;

	case notwordbound:
677
	  fputs ("/notwordbound", stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
678
	  break;
679 680

	case wordbeg:
681
	  fputs ("/wordbeg", stderr);
682
	  break;
683

684
	case wordend:
685
	  fputs ("/wordend", stderr);
686
	  break;
687

688
	case symbeg:
689
	  fputs ("/symbeg", stderr);
690 691 692
	  break;

	case symend:
693
	  fputs ("/symend", stderr);
694
	  break;
695

696
	case syntaxspec:
697
	  fputs ("/syntaxspec", stderr);
698
	  mcnt = *p++;
699
	  fprintf (stderr, "/%d", mcnt);
700 701 702
	  break;

	case notsyntaxspec:
703
	  fputs ("/notsyntaxspec", stderr);
704
	  mcnt = *p++;
705
	  fprintf (stderr, "/%d", mcnt);
706 707
	  break;

708
	case at_dot:
709
	  fputs ("/at_dot", stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
710
	  break;
711

712
	case categoryspec:
713
	  fputs ("/categoryspec", stderr);
714
	  mcnt = *p++;
715
	  fprintf (stderr, "/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
716
	  break;
717

718
	case notcategoryspec:
719
	  fputs ("/notcategoryspec", stderr);
720
	  mcnt = *p++;
721
	  fprintf (stderr, "/%d", mcnt);
722 723 724
	  break;

	case begbuf:
725
	  fputs ("/begbuf", stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
726
	  break;
727 728

	case endbuf:
729
	  fputs ("/endbuf", stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
730
	  break;
731

Richard M. Stallman's avatar
Richard M. Stallman committed
732
	default:
733
	  fprintf (stderr, "?%d", *(p-1));
734 735
	}

736
      putc ('\n', stderr);
737 738
    }

739
  fprintf (stderr, "%td:\tend of pattern.\n", p - start);
740 741 742
}


743 744
static void
print_compiled_pattern (struct re_pattern_buffer *bufp)
745
{
746
  re_char *buffer = bufp->buffer;
747 748

  print_partial_compiled_pattern (buffer, buffer + bufp->used);
749
  fprintf (stderr, "%td bytes used/%td bytes allocated.\n",
750
           bufp->used, bufp->allocated);
751 752 753

  if (bufp->fastmap_accurate && bufp->fastmap)
    {
754
      fputs ("fastmap: ", stderr);
755 756 757
      print_fastmap (bufp->fastmap);
    }

758
  fprintf (stderr, "re_nsub: %td\t", bufp->re_nsub);
759
  fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
760
  fprintf (stderr, "can_be_null: %d\n", bufp->can_be_null);
761 762 763 764
  /* Perhaps we should print the translate table?  */
}


765
static void
766 767
print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
		     re_char *string2, ptrdiff_t size2)
768 769
{
  if (where == NULL)
770
    fputs ("(null)", stderr);
771 772
  else
    {
773
      int i;
774
      if (FIRST_STRING_P (where))
Richard M. Stallman's avatar
Richard M. Stallman committed
775
	{
776 777
	  for (i = 0; i < string1 + size1 - where; i++)
	    debug_putchar (where[i]);
Richard M. Stallman's avatar
Richard M. Stallman committed
778 779
	  where = string2;
	}
780

781 782
      for (i = 0; i < string2 + size2 - where; i++)
        debug_putchar (where[i]);
783 784 785
    }
}

786
#else /* not REGEX_EMACS_DEBUG */
787

788
# define DEBUG_STATEMENT(e)
Paul Eggert's avatar
Paul Eggert committed
789
# define DEBUG_PRINT(...)
790 791
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
792

793
#endif /* not REGEX_EMACS_DEBUG */
794

795
typedef enum
796
{
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
  REG_NOERROR = 0,	/* Success.  */
  REG_NOMATCH,		/* Didn't find a match (for regexec).  */

  /* POSIX regcomp return error codes.  (In the order listed in the
     standard.)  An older version of this code supported the POSIX
     API; this version continues to use these names internally.  */
  REG_BADPAT,		/* Invalid pattern.  */
  REG_ECOLLATE,		/* Not implemented.  */
  REG_ECTYPE,		/* Invalid character class name.  */
  REG_EESCAPE,		/* Trailing backslash.  */
  REG_ESUBREG,		/* Invalid back reference.  */
  REG_EBRACK,		/* Unmatched left bracket.  */
  REG_EPAREN,		/* Parenthesis imbalance.  */
  REG_EBRACE,		/* Unmatched \{.  */
  REG_BADBR,		/* Invalid contents of \{\}.  */
  REG_ERANGE,		/* Invalid range end.  */
  REG_ESPACE,		/* Ran out of memory.  */
  REG_BADRPT,		/* No preceding re for repetition op.  */

  /* Error codes we've added.  */
  REG_EEND,		/* Premature end.  */
  REG_ESIZE,		/* Compiled pattern bigger than 2^16 bytes.  */
  REG_ERPAREN,		/* Unmatched ) or \); not returned from regcomp.  */
  REG_ERANGEX,		/* Range striding over charsets.  */
821
  REG_ESIZEBR           /* n or m too big in \{n,m\} */
822
} reg_errcode_t;
823 824

static const char *re_error_msgid[] =
825
  {
826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844
   [REG_NOERROR] = "Success",
   [REG_NOMATCH] = "No match",
   [REG_BADPAT] = "Invalid regular expression",
   [REG_ECOLLATE] = "Invalid collation character",
   [REG_ECTYPE] = "Invalid character class name",
   [REG_EESCAPE] = "Trailing backslash",
   [REG_ESUBREG] = "Invalid back reference",
   [REG_EBRACK] = "Unmatched [ or [^",
   [REG_EPAREN] = "Unmatched ( or \\(",
   [REG_EBRACE] = "Unmatched \\{",
   [REG_BADBR] = "Invalid content of \\{\\}",
   [REG_ERANGE] = "Invalid range end",
   [REG_ESPACE] = "Memory exhausted",
   [REG_BADRPT] = "Invalid preceding regular expression",
   [REG_EEND] = "Premature end of regular expression",
   [REG_ESIZE] = "Regular expression too big",
   [REG_ERPAREN] = "Unmatched ) or \\)",
   [REG_ERANGEX ] = "Range striding over charsets",
   [REG_ESIZEBR ] = "Invalid content of \\{\\}",
845 846
  };

847 848
/* For 'regs_allocated'.  */
enum { REGS_UNALLOCATED, REGS_REALLOCATE, REGS_FIXED };
849

850 851
/* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
   're_match_2' returns information about at least this many registers
852
   the first time a 'regs' structure is passed.  */
853
enum { RE_NREGS = 30 };
854

855 856 857 858 859
/* The searching and matching functions allocate memory for the
   failure stack and registers.  Otherwise searching and matching
   routines would have much smaller memory resources at their
   disposal, and therefore might fail to handle complex regexps.  */

860 861
/* Failure stack declarations and macros; both re_compile_fastmap and
   re_match_2 use a failure stack.  These have to be macros because of
862
   SAFE_ALLOCA.  */
863

864

865
/* Approximate number of failure points for which to initially allocate space
866 867
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  */
868
#define INIT_FAILURE_ALLOC 20
869 870

/* Roughly the maximum number of failure points on the stack.  Would be
871
   exactly that if failure always used TYPICAL_FAILURE_SIZE items.
872
   This is a variable only so users of regex can assign to it; we never
873 874
   change it ourselves.  We always multiply it by TYPICAL_FAILURE_SIZE
   before using it, so it should probably be a byte-count instead.  */
875
/* Note that 4400 was enough to cause a crash on Alpha OSF/1,
876 877 878
   whose default stack limit is 2mb.  In order for a larger
   value to work reliably, you have to try to make it accord
   with the process stack limit.  */
879
ptrdiff_t emacs_re_max_failures = 40000;
880 881 882

union fail_stack_elt
{
883
  re_char *pointer;
884
  intptr_t integer;
885 886 887 888 889 890 891
};

typedef union fail_stack_elt fail_stack_elt_t;

typedef struct