regex-emacs.c 147 KB
Newer Older
1
/* Emacs regular expression matching and search
Karl Berry's avatar
Karl Berry committed
2

Paul Eggert's avatar
Paul Eggert committed
3
   Copyright (C) 1993-2019 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
4

5 6
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
Miles Bader's avatar
Miles Bader committed
7
   the Free Software Foundation; either version 3, or (at your option)
8 9 10 11
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Juanma Barranquero's avatar
Juanma Barranquero committed
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 14 15
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17

18
/* TODO:
19
   - structure the opcode space into opcode+flag.
20
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21
     need to modify the compiled regexp so that re_search can be reentrant.
22
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
23
     rather than at run-time, so that re_search can be reentrant.
24
*/
25

26
#include <config.h>
27

28
#include "regex-emacs.h"
29

30
#include <stdlib.h>
31

32 33 34 35
#include "character.h"
#include "buffer.h"
#include "syntax.h"
#include "category.h"
36

37
/* Maximum number of duplicates an interval can allow.  Some systems
38 39 40
   define this in other header files, but we want our value, so remove
   any previous define.  Repeat counts are stored in opcodes as 2-byte
   unsigned integers.  */
41 42
#ifdef RE_DUP_MAX
# undef RE_DUP_MAX
43
#endif
44 45
#define RE_DUP_MAX (0xffff)

46
/* Make syntax table lookup grant data in gl_state.  */
47
#define SYNTAX(c) syntax_property (c, 1)
48

49
/* Convert the pointer to the char to BEG-based offset from the start.  */
50
#define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
51
/* Strings are 0-indexed, buffers are 1-indexed; pun on the boolean
52
   result to get the right base index.  */
53
#define POS_AS_IN_BUFFER(p)                                    \
54
  ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
55

56 57 58
#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
#define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
#define RE_STRING_CHAR(p, multibyte) \
59
  (multibyte ? STRING_CHAR (p) : *(p))
60
#define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
61
  (multibyte ? STRING_CHAR_AND_LENGTH (p, len) : ((len) = 1, *(p)))
62

63
#define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
64

65
#define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
66

67 68 69
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
70
#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)			     \
71
  do {									     \
72
    if (target_multibyte)						     \
73 74
      {									     \
	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
75 76 77
	re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1);   \
	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp))		     \
	  continue;							     \
78
	c = STRING_CHAR (dtemp);					     \
79 80 81 82
      }									     \
    else								     \
      {									     \
	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
83
	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
84
      }									     \
85
  } while (false)
86

87 88
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
89
#define GET_CHAR_AFTER(c, p, len)		\
90
  do {						\
91
    if (target_multibyte)			\
92
      (c) = STRING_CHAR_AND_LENGTH (p, len);	\
93 94
    else					\
      {						\
95
	(c) = *p;				\
96
	len = 1;				\
97
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
98
      }						\
99
   } while (false)
100

101
/* 1 if C is an ASCII character.  */
102
#define IS_REAL_ASCII(c) ((c) < 0200)
103

104
/* 1 if C is a unibyte character.  */
105
#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
106

107
/* The Emacs definitions should not be directly affected by locales.  */
108

109
/* In Emacs, these are only used for single-byte characters.  */
110 111 112
#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
#define ISCNTRL(c) ((c) < ' ')
#define ISXDIGIT(c) (0 <= char_hexdigit (c))
113 114 115

/* The rest must handle multibyte characters.  */

116
#define ISBLANK(c) (IS_REAL_ASCII (c)			\
117 118 119
                     ? ((c) == ' ' || (c) == '\t')      \
                     : blankp (c))

120
#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
121
		     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)	\
122
		     : graphicp (c))
123

124
#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
125
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
126
		     : printablep (c))
127

128
#define ISALNUM(c) (IS_REAL_ASCII (c)			\
129 130 131
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
132
		    : alphanumericp (c))
133

134
#define ISALPHA(c) (IS_REAL_ASCII (c)			\
135 136
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
137
		    : alphabeticp (c))
138

139
#define ISLOWER(c) lowercasep (c)
140

141
#define ISPUNCT(c) (IS_REAL_ASCII (c)				\
142 143
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
144 145
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
146 147
		    : SYNTAX (c) != Sword)

148
#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
149

150
#define ISUPPER(c) uppercasep (c)
151

152
#define ISWORD(c) (SYNTAX (c) == Sword)
153

154
/* Use alloca instead of malloc.  This is because using malloc in
155 156
   re_search* or re_match* could cause memory leaks when C-g is used
   in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
157
   via 'record_xmalloc' which uses 'unwind_protect' to ensure the
158 159 160
   memory is freed even in case of non-local exits); also, malloc is
   slower and causes storage fragmentation.  On the other hand, malloc
   is more portable, and easier to debug.
161

162
   Because we sometimes use alloca, some routines have to be macros,
163
   not functions -- 'alloca'-allocated space disappears at the end of the
164 165
   function it is called in.  */

166 167 168
/* This may be adjusted in main(), if the stack is successfully grown.  */
ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
/* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca.  */
169 170
#define REGEX_USE_SAFE_ALLOCA					       \
  USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
171

172
/* Assumes a 'char *destination' variable.  */
173 174
#define REGEX_REALLOCATE(source, osize, nsize)				\
  (destination = SAFE_ALLOCA (nsize),					\
175
   memcpy (destination, source, osize))
176

177 178
/* True if 'size1' is non-NULL and PTR is pointing anywhere inside
   'string1' or just past its end.  This works if PTR is NULL, which is
179
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
180
#define FIRST_STRING_P(ptr)					\
181 182
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

183
#define BYTEWIDTH 8 /* In bits.  */
184

185 186 187
/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

188 189
static void re_compile_fastmap (struct re_pattern_buffer *);
static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
190 191
				     re_char *string1, ptrdiff_t size1,
				     re_char *string2, ptrdiff_t size2,
192
				     ptrdiff_t pos,
Paul Eggert's avatar
Paul Eggert committed
193
				     struct re_registers *regs,
194
				     ptrdiff_t stop);
195 196

/* These are the command codes that appear in compiled regular
197
   expressions.  Some opcodes are followed by argument bytes.  A
198 199 200 201 202 203 204
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

205
  /* Succeed right away--no more backtracking.  */
206 207
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
208
	/* Followed by one byte giving n, then by n literal bytes.  */
209 210
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
211
	/* Matches any (more or less) character.  */
212 213
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
214 215 216 217 218
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
219 220 221 222 223
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
224
		   See RANGE_TABLE_WORK_BITS below.
225
	       2 bytes, the number of pairs that follow (upto 32767)
226
	       pairs, each 2 multibyte characters,
227
		   each multibyte character represented as 3 bytes.  */
228 229
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
230
	/* Same parameters as charset, but match any character that is
231
	   not one of those specified.  */
232 233
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
234 235 236
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
237
	   field.  */
238 239
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
240 241
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
242
	   number, in the range 0 to one less than 're_nsub' in the
243
	   pattern buffer.  */
244 245
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
246
	/* Match a duplicate of something remembered. Followed by one
247
	   byte containing the register number.  */
248 249
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
250
	/* Fail unless at beginning of line.  */
251 252
  begline,

253
	/* Fail unless at end of line.  */
254 255
  endline,

256
	/* Succeeds if at beginning of buffer.  */
257 258
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
259
	/* Analogously, for end of buffer/string.  */
260
  endbuf,
261

Richard M. Stallman's avatar
Richard M. Stallman committed
262
	/* Followed by two byte relative address to which to jump.  */
263
  jump,
264

Richard M. Stallman's avatar
Richard M. Stallman committed
265
	/* Followed by two-byte relative address of place to resume at
Juanma Barranquero's avatar
Juanma Barranquero committed
266
	   in case of failure.  */
267
  on_failure_jump,
268

Richard M. Stallman's avatar
Richard M. Stallman committed
269 270
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
271
  on_failure_keep_string_jump,
272

273
	/* Just like 'on_failure_jump', except that it checks that we
274 275 276 277
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

278
	/* Just like 'on_failure_jump_loop', except that it checks for
279 280
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
281
	   by a 'no_op'.  */
282 283
  on_failure_jump_nastyloop,

284
	/* A smart 'on_failure_jump' used for greedy * and + operators.
Juanma Barranquero's avatar
Juanma Barranquero committed
285
	   It analyzes the loop before which it is put and if the
286
	   loop does not require backtracking, it changes itself to
287 288 289
	   'on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into 'on_failure_jump'.
	   It assumes that it is pointing to just past a 'jump'.  */
290
  on_failure_jump_smart,
291

Richard M. Stallman's avatar
Richard M. Stallman committed
292
	/* Followed by two-byte relative address and two-byte number n.
293 294 295
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
296 297
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
298 299
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
300 301
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
302
	/* Set the following two-byte relative address to the
Juanma Barranquero's avatar
Juanma Barranquero committed
303
	   subsequent two-byte number.  The address *includes* the two
Richard M. Stallman's avatar
Richard M. Stallman committed
304
	   bytes of number.  */
305 306 307 308 309 310
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
Juanma Barranquero's avatar
Juanma Barranquero committed
311
  notwordbound,	/* Succeeds if not at a word boundary.  */
312

313 314 315
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

316
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
317
	   a byte which contains a syntax code, e.g., Sword.  */
318 319 320
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
321
  notsyntaxspec,
322

323
  at_dot,	/* Succeeds if at point.  */
324 325

  /* Matches any character whose category-set contains the specified
Juanma Barranquero's avatar
Juanma Barranquero committed
326 327
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
328 329 330 331 332 333
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
334 335 336 337 338 339 340 341 342 343
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
344
  } while (false)
345 346 347 348 349 350 351 352 353

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
354
  } while (false)
355 356 357 358 359

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
360
  ((destination) = extract_number (source))
361

362 363
static int
extract_number (re_char *source)
364
{
365 366
  signed char leading_byte = source[1];
  return leading_byte * 256 + source[0];
367 368 369 370 371 372
}

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
373
  ((destination) = extract_number_and_incr (&source))
374

375 376
static int
extract_number_and_incr (re_char **source)
377
{
378
  int num = extract_number (*source);
379
  *source += 2;
380
  return num;
381 382
}

383 384
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Juanma Barranquero's avatar
Juanma Barranquero committed
385
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
386 387 388 389 390 391 392

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
393
  } while (false)
394 395

/* Put into DESTINATION a character stored in three contiguous bytes
Juanma Barranquero's avatar
Juanma Barranquero committed
396
   starting at SOURCE.  */
397 398 399 400 401 402

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
403
  } while (false)
404 405 406 407 408 409 410 411 412


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
413
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 (((p)[1] & 0x80) != 0)
414 415 416

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
417
   stored.  '2 +' means to skip re_opcode_t and size of bitmap,
418 419 420 421 422 423 424
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
425 426

/* Return the address of end of RANGE_TABLE.  COUNT is number of
427 428
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  '* 2'
   is start of range and end of range.  '* 3' is size of each start
429 430 431 432
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

433 434
/* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
   (if the variable regex_emacs_debug is positive).  */
435

436
#ifdef REGEX_EMACS_DEBUG
437

438
/* Use standard I/O for debugging.  */
439
# include <stdio.h>
440

441
static int regex_emacs_debug = -100000;
442

443
# define DEBUG_STATEMENT(e) e
444 445
# define DEBUG_PRINT(...)                                       \
  if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
446
# define DEBUG_COMPILES_ARGUMENTS
447
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
448
  if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
449
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
450
  if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
451 452 453 454


/* Print the fastmap in human-readable form.  */

455 456
static void
print_fastmap (char *fastmap)
457
{
458 459
  bool was_a_range = false;
  int i = 0;
460

461 462 463 464
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
465
	  was_a_range = false;
466
	  fputc (i - 1, stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
467 468
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
469
	      was_a_range = true;
Richard M. Stallman's avatar
Richard M. Stallman committed
470 471
	      i++;
	    }
472
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
473
	    {
474 475
	      fprintf (stderr, "-");
	      fputc (i - 1, stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
476 477
	    }
	}
478
    }
479
  fputc ('\n', stderr);
480 481 482 483 484 485
}


/* Print a compiled pattern string in human-readable form, starting at
   the START pointer into it and ending just before the pointer END.  */

486 487
static void
print_partial_compiled_pattern (re_char *start, re_char *end)
488 489
{
  int mcnt, mcnt2;
490 491
  re_char *p = start;
  re_char *pend = end;
492 493 494

  if (start == NULL)
    {
495
      fprintf (stderr, "(null)\n");
496 497
      return;
    }
498

499 500 501
  /* Loop over pattern commands.  */
  while (p < pend)
    {
502
      fprintf (stderr, "%td:\t", p - start);
503 504 505

      switch ((re_opcode_t) *p++)
	{
Richard M. Stallman's avatar
Richard M. Stallman committed
506
	case no_op:
507
	  fprintf (stderr, "/no_op");
Richard M. Stallman's avatar
Richard M. Stallman committed
508
	  break;
509

510
	case succeed:
511
	  fprintf (stderr, "/succeed");
512 513
	  break;

514 515
	case exactn:
	  mcnt = *p++;
516
	  fprintf (stderr, "/exactn/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
517
	  do
518
	    {
519
	      fprintf (stderr, "/%c", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
520 521 522
	    }
	  while (--mcnt);
	  break;
523 524

	case start_memory:
525
	  fprintf (stderr, "/start_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
526
	  break;
527 528

	case stop_memory:
529
	  fprintf (stderr, "/stop_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
530
	  break;
531 532

	case duplicate:
533
	  fprintf (stderr, "/duplicate/%d", *p++);
534 535 536
	  break;

	case anychar:
537
	  fprintf (stderr, "/anychar");
538 539 540
	  break;

	case charset:
Richard M. Stallman's avatar
Richard M. Stallman committed
541 542
	case charset_not:
	  {
543 544
	    int c, last = -100;
	    bool in_range = false;
545
	    int length = CHARSET_BITMAP_SIZE (p - 1);
546
	    bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
547

548
	    fprintf (stderr, "/charset [%s",
Kenichi Handa's avatar
Kenichi Handa committed
549
		     (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
550

Kenichi Handa's avatar
Kenichi Handa committed
551 552
	    if (p + *p >= pend)
	      fprintf (stderr, " !extends past end of pattern! ");
553

Richard M. Stallman's avatar
Richard M. Stallman committed
554
	    for (c = 0; c < 256; c++)
555
	      if (c / 8 < length
556 557 558 559 560
		  && (p[1 + (c/8)] & (1 << (c % 8))))
		{
		  /* Are we starting a range?  */
		  if (last + 1 == c && ! in_range)
		    {
561
		      fprintf (stderr, "-");
562
		      in_range = true;
563 564 565
		    }
		  /* Have we broken a range?  */
		  else if (last + 1 != c && in_range)
566
		    {
567
		      fprintf (stderr, "%c", last);
568
		      in_range = false;
569
		    }
570

571
		  if (! in_range)
572
		    fprintf (stderr, "%c", c);
573 574

		  last = c;
Richard M. Stallman's avatar
Richard M. Stallman committed
575
	      }
576 577

	    if (in_range)
578
	      fprintf (stderr, "%c", last);
579

580
	    fprintf (stderr, "]");
581

582
	    p += 1 + length;
583 584

	    if (has_range_table)
585 586
	      {
		int count;
587
		fprintf (stderr, "has-range-table");
588 589 590 591 592 593

		/* ??? Should print the range table; for now, just skip it.  */
		p += 2;		/* skip range table bits */
		EXTRACT_NUMBER_AND_INCR (count, p);
		p = CHARSET_RANGE_TABLE_END (p, count);
	      }
594 595 596 597
	  }
	  break;

	case begline:
598
	  fprintf (stderr, "/begline");
Richard M. Stallman's avatar
Richard M. Stallman committed
599
	  break;
600 601

	case endline:
602
	  fprintf (stderr, "/endline");
Richard M. Stallman's avatar
Richard M. Stallman committed
603
	  break;
604 605

	case on_failure_jump:
606 607
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
608
	  break;
609 610

	case on_failure_keep_string_jump:
611 612 613
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_keep_string_jump to %td",
		   p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
614
	  break;
615

616
	case on_failure_jump_nastyloop:
617 618 619
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_nastyloop to %td",
		   p + mcnt - start);
620 621
	  break;

622
	case on_failure_jump_loop:
623 624 625
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_loop to %td",
		   p + mcnt - start);
626 627
	  break;

628
	case on_failure_jump_smart:
629 630 631
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_smart to %td",
		   p + mcnt - start);
632 633
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
634
	case jump:
635 636
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/jump to %td", p + mcnt - start);
637 638
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
639
	case succeed_n:
640 641 642 643
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/succeed_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
644
	  break;
645

Richard M. Stallman's avatar
Richard M. Stallman committed
646
	case jump_n:
647 648 649 650
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/jump_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
651
	  break;
652

Richard M. Stallman's avatar
Richard M. Stallman committed
653
	case set_number_at:
654 655 656 657
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/set_number_at location %td to %d",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
658
	  break;
659

Richard M. Stallman's avatar
Richard M. Stallman committed
660
	case wordbound:
661
	  fprintf (stderr, "/wordbound");
662 663 664
	  break;

	case notwordbound:
665
	  fprintf (stderr, "/notwordbound");
Richard M. Stallman's avatar
Richard M. Stallman committed
666
	  break;
667 668

	case wordbeg:
669
	  fprintf (stderr, "/wordbeg");
670
	  break;
671

672
	case wordend:
673
	  fprintf (stderr, "/wordend");
674
	  break;
675

676
	case symbeg:
677
	  fprintf (stderr, "/symbeg");
678 679 680
	  break;

	case symend:
681
	  fprintf (stderr, "/symend");
682
	  break;
683

684
	case syntaxspec:
685
	  fprintf (stderr, "/syntaxspec");
686
	  mcnt = *p++;
687
	  fprintf (stderr, "/%d", mcnt);
688 689 690
	  break;

	case notsyntaxspec:
691
	  fprintf (stderr, "/notsyntaxspec");
692
	  mcnt = *p++;
693
	  fprintf (stderr, "/%d", mcnt);
694 695
	  break;

696
	case at_dot:
697
	  fprintf (stderr, "/at_dot");
Richard M. Stallman's avatar
Richard M. Stallman committed
698
	  break;
699

700
	case categoryspec:
701
	  fprintf (stderr, "/categoryspec");
702
	  mcnt = *p++;
703
	  fprintf (stderr, "/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
704
	  break;
705

706
	case notcategoryspec:
707
	  fprintf (stderr, "/notcategoryspec");
708
	  mcnt = *p++;
709
	  fprintf (stderr, "/%d", mcnt);
710 711 712
	  break;

	case begbuf:
713
	  fprintf (stderr, "/begbuf");
Richard M. Stallman's avatar
Richard M. Stallman committed
714
	  break;
715 716

	case endbuf:
717
	  fprintf (stderr, "/endbuf");
Richard M. Stallman's avatar
Richard M. Stallman committed
718
	  break;
719

Richard M. Stallman's avatar
Richard M. Stallman committed
720
	default:
721
	  fprintf (stderr, "?%d", *(p-1));
722 723
	}

724
      fprintf (stderr, "\n");
725 726
    }

727
  fprintf (stderr, "%td:\tend of pattern.\n", p - start);
728 729 730
}


731 732
static void
print_compiled_pattern (struct re_pattern_buffer *bufp)
733
{
734
  re_char *buffer = bufp->buffer;
735 736

  print_partial_compiled_pattern (buffer, buffer + bufp->used);
737 738
  fprintf (stderr, "%tu bytes used/%tu bytes allocated.\n",
           bufp->used, bufp->allocated);
739 740 741

  if (bufp->fastmap_accurate && bufp->fastmap)
    {
742
      fprintf (stderr, "fastmap: ");
743 744 745
      print_fastmap (bufp->fastmap);
    }

746 747 748 749
  fprintf (stderr, "re_nsub: %tu\t", bufp->re_nsub);
  fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
  fprintf (stderr, "can_be_null: %d\t", bufp->can_be_null);
  fflush (stderr);
750 751 752 753
  /* Perhaps we should print the translate table?  */
}


754
static void
755 756
print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
		     re_char *string2, ptrdiff_t size2)
757 758
{
  if (where == NULL)
759
    fprintf (stderr, "(null)");
760 761 762
  else
    {
      if (FIRST_STRING_P (where))
Richard M. Stallman's avatar
Richard M. Stallman committed
763
	{
764
	  fwrite_unlocked (where, 1, string1 + size1 - where, stderr);
Richard M. Stallman's avatar
Richard M. Stallman committed
765 766
	  where = string2;
	}
767

768
      fwrite_unlocked (where, 1, string2 + size2 - where, stderr);
769 770 771
    }
}

772
#else /* not REGEX_EMACS_DEBUG */
773

774
# define DEBUG_STATEMENT(e)
Paul Eggert's avatar
Paul Eggert committed
775
# define DEBUG_PRINT(...)
776 777
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
778

779
#endif /* not REGEX_EMACS_DEBUG */
780

781
typedef enum
782
{
783 784 785 786 787