regex-emacs.c 148 KB
Newer Older
1
/* Emacs regular expression matching and search
Karl Berry's avatar
Karl Berry committed
2

Paul Eggert's avatar
Paul Eggert committed
3
   Copyright (C) 1993-2019 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
4

5 6
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
Miles Bader's avatar
Miles Bader committed
7
   the Free Software Foundation; either version 3, or (at your option)
8 9 10 11
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Juanma Barranquero's avatar
Juanma Barranquero committed
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 14 15
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17

18
/* TODO:
19
   - structure the opcode space into opcode+flag.
20
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
21
     need to modify the compiled regexp so that re_search can be reentrant.
22
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
23
     rather than at run-time, so that re_search can be reentrant.
24
*/
25

26
#include <config.h>
27

28
#include "regex-emacs.h"
29

30
#include <stdlib.h>
31

32 33 34 35
#include "character.h"
#include "buffer.h"
#include "syntax.h"
#include "category.h"
36

37
/* Maximum number of duplicates an interval can allow.  Some systems
38 39 40
   define this in other header files, but we want our value, so remove
   any previous define.  Repeat counts are stored in opcodes as 2-byte
   unsigned integers.  */
41 42
#ifdef RE_DUP_MAX
# undef RE_DUP_MAX
43
#endif
44 45
#define RE_DUP_MAX (0xffff)

46
/* Make syntax table lookup grant data in gl_state.  */
47
#define SYNTAX(c) syntax_property (c, 1)
48

49
/* Convert the pointer to the char to BEG-based offset from the start.  */
50
#define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
51
/* Strings are 0-indexed, buffers are 1-indexed; pun on the boolean
52
   result to get the right base index.  */
53
#define POS_AS_IN_BUFFER(p)                                    \
54
  ((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
55

56 57 58
#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
#define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
#define RE_STRING_CHAR(p, multibyte) \
59
  (multibyte ? STRING_CHAR (p) : *(p))
60
#define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
61
  (multibyte ? STRING_CHAR_AND_LENGTH (p, len) : ((len) = 1, *(p)))
62

63
#define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
64

65
#define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
66

67 68 69
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
70
#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)			     \
71
  do {									     \
72
    if (target_multibyte)						     \
73 74
      {									     \
	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
75 76 77
	re_char *dlimit = (p) > (str2) && (p) <= (end2) ? (str2) : (str1);   \
	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp))		     \
	  continue;							     \
78
	c = STRING_CHAR (dtemp);					     \
79 80 81 82
      }									     \
    else								     \
      {									     \
	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
83
	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
84
      }									     \
85
  } while (false)
86

87 88
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
89
#define GET_CHAR_AFTER(c, p, len)		\
90
  do {						\
91
    if (target_multibyte)			\
92
      (c) = STRING_CHAR_AND_LENGTH (p, len);	\
93 94
    else					\
      {						\
95
	(c) = *p;				\
96
	len = 1;				\
97
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
98
      }						\
99
   } while (false)
100

101
/* 1 if C is an ASCII character.  */
102
#define IS_REAL_ASCII(c) ((c) < 0200)
103

104
/* 1 if C is a unibyte character.  */
105
#define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
106

107
/* The Emacs definitions should not be directly affected by locales.  */
108

109
/* In Emacs, these are only used for single-byte characters.  */
110 111 112
#define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
#define ISCNTRL(c) ((c) < ' ')
#define ISXDIGIT(c) (0 <= char_hexdigit (c))
113 114 115

/* The rest must handle multibyte characters.  */

116
#define ISBLANK(c) (IS_REAL_ASCII (c)			\
117 118 119
                     ? ((c) == ' ' || (c) == '\t')      \
                     : blankp (c))

120
#define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
121
		     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)	\
122
		     : graphicp (c))
123

124
#define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
125
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
126
		     : printablep (c))
127

128
#define ISALNUM(c) (IS_REAL_ASCII (c)			\
129 130 131
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
132
		    : alphanumericp (c))
133

134
#define ISALPHA(c) (IS_REAL_ASCII (c)			\
135 136
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
137
		    : alphabeticp (c))
138

139
#define ISLOWER(c) lowercasep (c)
140

141
#define ISPUNCT(c) (IS_REAL_ASCII (c)				\
142 143
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
144 145
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
146 147
		    : SYNTAX (c) != Sword)

148
#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
149

150
#define ISUPPER(c) uppercasep (c)
151

152
#define ISWORD(c) (SYNTAX (c) == Sword)
153

154
/* Use alloca instead of malloc.  This is because using malloc in
155 156
   re_search* or re_match* could cause memory leaks when C-g is used
   in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
157
   via 'record_xmalloc' which uses 'unwind_protect' to ensure the
158 159 160
   memory is freed even in case of non-local exits); also, malloc is
   slower and causes storage fragmentation.  On the other hand, malloc
   is more portable, and easier to debug.
161

162
   Because we sometimes use alloca, some routines have to be macros,
163
   not functions -- 'alloca'-allocated space disappears at the end of the
164 165
   function it is called in.  */

166 167 168
/* This may be adjusted in main(), if the stack is successfully grown.  */
ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
/* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca.  */
169 170
#define REGEX_USE_SAFE_ALLOCA					       \
  USE_SAFE_ALLOCA; sa_avail = emacs_re_safe_alloca
171

172
/* Assumes a 'char *destination' variable.  */
173 174
#define REGEX_REALLOCATE(source, osize, nsize)				\
  (destination = SAFE_ALLOCA (nsize),					\
175
   memcpy (destination, source, osize))
176

177 178
/* True if 'size1' is non-NULL and PTR is pointing anywhere inside
   'string1' or just past its end.  This works if PTR is NULL, which is
179
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
180
#define FIRST_STRING_P(ptr)					\
181 182
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

183
#define BYTEWIDTH 8 /* In bits.  */
184

185 186 187
/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

188 189
static void re_compile_fastmap (struct re_pattern_buffer *);
static ptrdiff_t re_match_2_internal (struct re_pattern_buffer *bufp,
190 191
				     re_char *string1, ptrdiff_t size1,
				     re_char *string2, ptrdiff_t size2,
192
				     ptrdiff_t pos,
Paul Eggert's avatar
Paul Eggert committed
193
				     struct re_registers *regs,
194
				     ptrdiff_t stop);
195 196

/* These are the command codes that appear in compiled regular
197
   expressions.  Some opcodes are followed by argument bytes.  A
198 199 200 201 202 203 204
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

205
  /* Succeed right away--no more backtracking.  */
206 207
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
208
	/* Followed by one byte giving n, then by n literal bytes.  */
209 210
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
211
	/* Matches any (more or less) character.  */
212 213
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
214 215 216 217 218
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
219 220 221 222 223
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
224
		   See RANGE_TABLE_WORK_BITS below.
225
	       2 bytes, the number of pairs that follow (upto 32767)
226
	       pairs, each 2 multibyte characters,
227
		   each multibyte character represented as 3 bytes.  */
228 229
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
230
	/* Same parameters as charset, but match any character that is
231
	   not one of those specified.  */
232 233
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
234 235 236
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
237
	   field.  */
238 239
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
240 241
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
242
	   number, in the range 0 to one less than 're_nsub' in the
243
	   pattern buffer.  */
244 245
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
246
	/* Match a duplicate of something remembered. Followed by one
247
	   byte containing the register number.  */
248 249
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
250
	/* Fail unless at beginning of line.  */
251 252
  begline,

253
	/* Fail unless at end of line.  */
254 255
  endline,

256
	/* Succeeds if at beginning of buffer.  */
257 258
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
259
	/* Analogously, for end of buffer/string.  */
260
  endbuf,
261

Richard M. Stallman's avatar
Richard M. Stallman committed
262
	/* Followed by two byte relative address to which to jump.  */
263
  jump,
264

Richard M. Stallman's avatar
Richard M. Stallman committed
265
	/* Followed by two-byte relative address of place to resume at
Juanma Barranquero's avatar
Juanma Barranquero committed
266
	   in case of failure.  */
267
  on_failure_jump,
268

Richard M. Stallman's avatar
Richard M. Stallman committed
269 270
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
271
  on_failure_keep_string_jump,
272

273
	/* Just like 'on_failure_jump', except that it checks that we
274 275 276 277
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

278
	/* Just like 'on_failure_jump_loop', except that it checks for
279 280
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
281
	   by a 'no_op'.  */
282 283
  on_failure_jump_nastyloop,

284
	/* A smart 'on_failure_jump' used for greedy * and + operators.
Juanma Barranquero's avatar
Juanma Barranquero committed
285
	   It analyzes the loop before which it is put and if the
286
	   loop does not require backtracking, it changes itself to
287 288 289
	   'on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into 'on_failure_jump'.
	   It assumes that it is pointing to just past a 'jump'.  */
290
  on_failure_jump_smart,
291

Richard M. Stallman's avatar
Richard M. Stallman committed
292
	/* Followed by two-byte relative address and two-byte number n.
293 294 295
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
296 297
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
298 299
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
300 301
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
302
	/* Set the following two-byte relative address to the
Juanma Barranquero's avatar
Juanma Barranquero committed
303
	   subsequent two-byte number.  The address *includes* the two
Richard M. Stallman's avatar
Richard M. Stallman committed
304
	   bytes of number.  */
305 306 307 308 309 310
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
Juanma Barranquero's avatar
Juanma Barranquero committed
311
  notwordbound,	/* Succeeds if not at a word boundary.  */
312

313 314 315
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

316
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
317
	   a byte which contains a syntax code, e.g., Sword.  */
318 319 320
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
321
  notsyntaxspec,
322

323
  at_dot,	/* Succeeds if at point.  */
324 325

  /* Matches any character whose category-set contains the specified
Juanma Barranquero's avatar
Juanma Barranquero committed
326 327
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
328 329 330 331 332 333
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
334 335 336 337 338 339 340 341 342 343
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
344
  } while (false)
345 346 347 348 349 350 351 352 353

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
354
  } while (false)
355 356 357 358 359

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
360
  ((destination) = extract_number (source))
361

362 363
static int
extract_number (re_char *source)
364
{
365 366
  signed char leading_byte = source[1];
  return leading_byte * 256 + source[0];
367 368 369 370 371 372
}

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
373
  ((destination) = extract_number_and_incr (&source))
374

375 376
static int
extract_number_and_incr (re_char **source)
377
{
378
  int num = extract_number (*source);
379
  *source += 2;
380
  return num;
381 382
}

383 384
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Juanma Barranquero's avatar
Juanma Barranquero committed
385
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
386 387 388 389 390 391 392

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
393
  } while (false)
394 395

/* Put into DESTINATION a character stored in three contiguous bytes
Juanma Barranquero's avatar
Juanma Barranquero committed
396
   starting at SOURCE.  */
397 398 399 400 401 402

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
403
  } while (false)
404 405 406 407 408 409 410 411 412


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
413
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 (((p)[1] & 0x80) != 0)
414 415 416

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
417
   stored.  '2 +' means to skip re_opcode_t and size of bitmap,
418 419 420 421 422 423 424
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
425 426

/* Return the address of end of RANGE_TABLE.  COUNT is number of
427 428
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  '* 2'
   is start of range and end of range.  '* 3' is size of each start
429 430 431 432
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

433 434
/* If REGEX_EMACS_DEBUG is defined, print many voluminous messages
   (if the variable regex_emacs_debug is positive).  */
435

436
#ifdef REGEX_EMACS_DEBUG
437

438
/* Use standard I/O for debugging.  */
439
# include <stdio.h>
440

441
static int regex_emacs_debug = -100000;
442

443
# define DEBUG_STATEMENT(e) e
444 445
# define DEBUG_PRINT(...)                                       \
  if (regex_emacs_debug > 0) fprintf (stderr, __VA_ARGS__)
446
# define DEBUG_COMPILES_ARGUMENTS
447
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
448
  if (regex_emacs_debug > 0) print_partial_compiled_pattern (s, e)
449
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
450
  if (regex_emacs_debug > 0) print_double_string (w, s1, sz1, s2, sz2)
451

452 453 454 455 456 457 458 459
static void
debug_putchar (int c)
{
  if (c >= 32 && c <= 126)
    fputc (c, stderr);
  else
    fprintf (stderr, "{%02x}", c);
}
460 461 462

/* Print the fastmap in human-readable form.  */

463 464
static void
print_fastmap (char *fastmap)
465
{
466 467
  bool was_a_range = false;
  int i = 0;
468

469 470 471 472
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
473
	  was_a_range = false;
474
	  debug_putchar (i - 1);
Richard M. Stallman's avatar
Richard M. Stallman committed
475 476
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
477
	      was_a_range = true;
Richard M. Stallman's avatar
Richard M. Stallman committed
478 479
	      i++;
	    }
480
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
481
	    {
482
	      fprintf (stderr, "-");
483
	      debug_putchar (i - 1);
Richard M. Stallman's avatar
Richard M. Stallman committed
484 485
	    }
	}
486
    }
487
  fputc ('\n', stderr);
488 489 490 491 492 493
}


/* Print a compiled pattern string in human-readable form, starting at
   the START pointer into it and ending just before the pointer END.  */

494 495
static void
print_partial_compiled_pattern (re_char *start, re_char *end)
496 497
{
  int mcnt, mcnt2;
498 499
  re_char *p = start;
  re_char *pend = end;
500 501 502

  if (start == NULL)
    {
503
      fprintf (stderr, "(null)\n");
504 505
      return;
    }
506

507 508 509
  /* Loop over pattern commands.  */
  while (p < pend)
    {
510
      fprintf (stderr, "%td:\t", p - start);
511 512 513

      switch ((re_opcode_t) *p++)
	{
Richard M. Stallman's avatar
Richard M. Stallman committed
514
	case no_op:
515
	  fprintf (stderr, "/no_op");
Richard M. Stallman's avatar
Richard M. Stallman committed
516
	  break;
517

518
	case succeed:
519
	  fprintf (stderr, "/succeed");
520 521
	  break;

522 523
	case exactn:
	  mcnt = *p++;
524
	  fprintf (stderr, "/exactn/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
525
	  do
526
	    {
527 528
	      fprintf (stderr, "/");
	      debug_putchar (*p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
529 530 531
	    }
	  while (--mcnt);
	  break;
532 533

	case start_memory:
534
	  fprintf (stderr, "/start_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
535
	  break;
536 537

	case stop_memory:
538
	  fprintf (stderr, "/stop_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
539
	  break;
540 541

	case duplicate:
542
	  fprintf (stderr, "/duplicate/%d", *p++);
543 544 545
	  break;

	case anychar:
546
	  fprintf (stderr, "/anychar");
547 548 549
	  break;

	case charset:
Richard M. Stallman's avatar
Richard M. Stallman committed
550 551
	case charset_not:
	  {
552 553
	    int c, last = -100;
	    bool in_range = false;
554
	    int length = CHARSET_BITMAP_SIZE (p - 1);
555
	    bool has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
556

557
	    fprintf (stderr, "/charset [%s",
Kenichi Handa's avatar
Kenichi Handa committed
558
		     (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
559

Kenichi Handa's avatar
Kenichi Handa committed
560 561
	    if (p + *p >= pend)
	      fprintf (stderr, " !extends past end of pattern! ");
562

Richard M. Stallman's avatar
Richard M. Stallman committed
563
	    for (c = 0; c < 256; c++)
564
	      if (c / 8 < length
565 566 567 568 569
		  && (p[1 + (c/8)] & (1 << (c % 8))))
		{
		  /* Are we starting a range?  */
		  if (last + 1 == c && ! in_range)
		    {
570
		      fprintf (stderr, "-");
571
		      in_range = true;
572 573 574
		    }
		  /* Have we broken a range?  */
		  else if (last + 1 != c && in_range)
575
		    {
576
		      debug_putchar (last);
577
		      in_range = false;
578
		    }
579

580
		  if (! in_range)
581
		    debug_putchar (c);
582 583

		  last = c;
Richard M. Stallman's avatar
Richard M. Stallman committed
584
	      }
585 586

	    if (in_range)
587
	      debug_putchar (last);
588

589
	    fprintf (stderr, "]");
590

591
	    p += 1 + length;
592 593

	    if (has_range_table)
594 595
	      {
		int count;
596
		fprintf (stderr, "has-range-table");
597 598 599 600 601 602

		/* ??? Should print the range table; for now, just skip it.  */
		p += 2;		/* skip range table bits */
		EXTRACT_NUMBER_AND_INCR (count, p);
		p = CHARSET_RANGE_TABLE_END (p, count);
	      }
603 604 605 606
	  }
	  break;

	case begline:
607
	  fprintf (stderr, "/begline");
Richard M. Stallman's avatar
Richard M. Stallman committed
608
	  break;
609 610

	case endline:
611
	  fprintf (stderr, "/endline");
Richard M. Stallman's avatar
Richard M. Stallman committed
612
	  break;
613 614

	case on_failure_jump:
615 616
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump to %td", p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
617
	  break;
618 619

	case on_failure_keep_string_jump:
620 621 622
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_keep_string_jump to %td",
		   p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
623
	  break;
624

625
	case on_failure_jump_nastyloop:
626 627 628
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_nastyloop to %td",
		   p + mcnt - start);
629 630
	  break;

631
	case on_failure_jump_loop:
632 633 634
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_loop to %td",
		   p + mcnt - start);
635 636
	  break;

637
	case on_failure_jump_smart:
638 639 640
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/on_failure_jump_smart to %td",
		   p + mcnt - start);
641 642
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
643
	case jump:
644 645
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  fprintf (stderr, "/jump to %td", p + mcnt - start);
646 647
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
648
	case succeed_n:
649 650 651 652
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/succeed_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
653
	  break;
654

Richard M. Stallman's avatar
Richard M. Stallman committed
655
	case jump_n:
656 657 658 659
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/jump_n to %td, %d times",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
660
	  break;
661

Richard M. Stallman's avatar
Richard M. Stallman committed
662
	case set_number_at:
663 664 665 666
	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
	  EXTRACT_NUMBER_AND_INCR (mcnt2, p);
	  fprintf (stderr, "/set_number_at location %td to %d",
		   p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
667
	  break;
668

Richard M. Stallman's avatar
Richard M. Stallman committed
669
	case wordbound:
670
	  fprintf (stderr, "/wordbound");
671 672 673
	  break;

	case notwordbound:
674
	  fprintf (stderr, "/notwordbound");
Richard M. Stallman's avatar
Richard M. Stallman committed
675
	  break;
676 677

	case wordbeg:
678
	  fprintf (stderr, "/wordbeg");
679
	  break;
680

681
	case wordend:
682
	  fprintf (stderr, "/wordend");
683
	  break;
684

685
	case symbeg:
686
	  fprintf (stderr, "/symbeg");
687 688 689
	  break;

	case symend:
690
	  fprintf (stderr, "/symend");
691
	  break;
692

693
	case syntaxspec:
694
	  fprintf (stderr, "/syntaxspec");
695
	  mcnt = *p++;
696
	  fprintf (stderr, "/%d", mcnt);
697 698 699
	  break;

	case notsyntaxspec:
700
	  fprintf (stderr, "/notsyntaxspec");
701
	  mcnt = *p++;
702
	  fprintf (stderr, "/%d", mcnt);
703 704
	  break;

705
	case at_dot:
706
	  fprintf (stderr, "/at_dot");
Richard M. Stallman's avatar
Richard M. Stallman committed
707
	  break;
708

709
	case categoryspec:
710
	  fprintf (stderr, "/categoryspec");
711
	  mcnt = *p++;
712
	  fprintf (stderr, "/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
713
	  break;
714

715
	case notcategoryspec:
716
	  fprintf (stderr, "/notcategoryspec");
717
	  mcnt = *p++;
718
	  fprintf (stderr, "/%d", mcnt);
719 720 721
	  break;

	case begbuf:
722
	  fprintf (stderr, "/begbuf");
Richard M. Stallman's avatar
Richard M. Stallman committed
723
	  break;
724 725

	case endbuf:
726
	  fprintf (stderr, "/endbuf");
Richard M. Stallman's avatar
Richard M. Stallman committed
727
	  break;
728

Richard M. Stallman's avatar
Richard M. Stallman committed
729
	default:
730
	  fprintf (stderr, "?%d", *(p-1));
731 732
	}

733
      fprintf (stderr, "\n");
734 735
    }

736
  fprintf (stderr, "%td:\tend of pattern.\n", p - start);
737 738 739
}


740 741
static void
print_compiled_pattern (struct re_pattern_buffer *bufp)
742
{
743
  re_char *buffer = bufp->buffer;
744 745

  print_partial_compiled_pattern (buffer, buffer + bufp->used);
746 747
  fprintf (stderr, "%tu bytes used/%tu bytes allocated.\n",
           bufp->used, bufp->allocated);
748 749 750

  if (bufp->fastmap_accurate && bufp->fastmap)
    {
751
      fprintf (stderr, "fastmap: ");
752 753 754
      print_fastmap (bufp->fastmap);
    }

755 756 757 758
  fprintf (stderr, "re_nsub: %tu\t", bufp->re_nsub);
  fprintf (stderr, "regs_alloc: %d\t", bufp->regs_allocated);
  fprintf (stderr, "can_be_null: %d\t", bufp->can_be_null);
  fflush (stderr);
759 760 761 762
  /* Perhaps we should print the translate table?  */
}


763
static void
764 765
print_double_string (re_char *where, re_char *string1, ptrdiff_t size1,
		     re_char *string2, ptrdiff_t size2)
766 767
{
  if (where == NULL)
768
    fprintf (stderr, "(null)");
769 770
  else
    {
771
      int i;
772
      if (FIRST_STRING_P (where))
Richard M. Stallman's avatar
Richard M. Stallman committed
773
	{
774 775
	  for (i = 0; i < string1 + size1 - where; i++)
	    debug_putchar (where[i]);
Richard M. Stallman's avatar
Richard M. Stallman committed
776 777
	  where = string2;
	}
778

779 780
      for (i = 0; i < string2 + size2 - where; i++)
        debug_putchar (where[i]);
781 782 783
    }
}

784
#else /* not REGEX_EMACS_DEBUG */
785

786
# define DEBUG_STATEMENT(e)
Paul Eggert's avatar
Paul Eggert committed
787
# define DEBUG_PRINT(...)
788 789
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)