regex.c 189 KB
Newer Older
Richard M. Stallman's avatar
Richard M. Stallman committed
1
/* Extended regular expression matching and search library, version
2
   0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
Karl Berry's avatar
Karl Berry committed
3 4
   internationalization features.)

Paul Eggert's avatar
Paul Eggert committed
5
   Copyright (C) 1993-2019 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
6

7 8
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
Miles Bader's avatar
Miles Bader committed
9
   the Free Software Foundation; either version 3, or (at your option)
10 11 12 13
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Juanma Barranquero's avatar
Juanma Barranquero committed
14
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 16 17
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
18
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
19

20
/* TODO:
21
   - structure the opcode space into opcode+flag.
22
   - merge with glibc's regex.[ch].
23
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
24 25 26
     need to modify the compiled regexp so that re_match can be reentrant.
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
     rather than at run-time, so that re_match can be reentrant.
27
*/
28

29
/* AIX requires this to be the first thing in the file.  */
30
#if defined _AIX && !defined REGEX_MALLOC
31 32 33
  #pragma alloca
#endif

34 35
/* Ignore some GCC warnings for now.  This section should go away
   once the Emacs and Gnulib regex code is merged.  */
36
#if 4 < __GNUC__ + (5 <= __GNUC_MINOR__) || defined __clang__
37 38 39 40 41 42 43 44 45
# pragma GCC diagnostic ignored "-Wstrict-overflow"
# ifndef emacs
#  pragma GCC diagnostic ignored "-Wunused-function"
#  pragma GCC diagnostic ignored "-Wunused-macros"
#  pragma GCC diagnostic ignored "-Wunused-result"
#  pragma GCC diagnostic ignored "-Wunused-variable"
# endif
#endif

46
#if 4 < __GNUC__ + (6 <= __GNUC_MINOR__) && ! defined __clang__
47 48 49
# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#endif

50
#include <config.h>
51

52
#include <stddef.h>
Paul Eggert's avatar
Paul Eggert committed
53
#include <stdlib.h>
54 55

#ifdef emacs
56 57 58
/* We need this for `regex.h', and perhaps for the Emacs include files.  */
# include <sys/types.h>
#endif
59

60 61
/* Whether to use ISO C Amendment 1 wide char functions.
   Those should not be used for Emacs since it uses its own.  */
62 63 64
#if defined _LIBC
#define WIDE_CHAR_SUPPORT 1
#else
65
#define WIDE_CHAR_SUPPORT \
66 67
	(HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
#endif
68

Paul Eggert's avatar
Paul Eggert committed
69
/* For platform which support the ISO C amendment 1 functionality we
70
   support user defined character classes.  */
71
#if WIDE_CHAR_SUPPORT
72 73 74 75 76
/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
# include <wchar.h>
# include <wctype.h>
#endif

77 78 79 80 81
#ifdef _LIBC
/* We have to keep the namespace clean.  */
# define regfree(preg) __regfree (preg)
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
82
# define regerror(err_code, preg, errbuf, errbuf_size) \
Juanma Barranquero's avatar
Juanma Barranquero committed
83
	__regerror (err_code, preg, errbuf, errbuf_size)
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
# define re_set_registers(bu, re, nu, st, en) \
	__re_set_registers (bu, re, nu, st, en)
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
# define re_match(bufp, string, size, pos, regs) \
	__re_match (bufp, string, size, pos, regs)
# define re_search(bufp, string, size, startpos, range, regs) \
	__re_search (bufp, string, size, startpos, range, regs)
# define re_compile_pattern(pattern, length, bufp) \
	__re_compile_pattern (pattern, length, bufp)
# define re_set_syntax(syntax) __re_set_syntax (syntax)
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)

99 100 101 102 103
/* Make sure we call libc's function even if the user overrides them.  */
# define btowc __btowc
# define iswctype __iswctype
# define wctype __wctype

104 105 106 107 108 109 110 111 112 113
# define WEAK_ALIAS(a,b) weak_alias (a, b)

/* We are also using some library internals.  */
# include <locale/localeinfo.h>
# include <locale/elem-hash.h>
# include <langinfo.h>
#else
# define WEAK_ALIAS(a,b)
#endif

114
/* This is for other GNU distributions with internationalized messages.  */
115
#if HAVE_LIBINTL_H || defined _LIBC
116 117 118 119 120
# include <libintl.h>
#else
# define gettext(msgid) (msgid)
#endif

121 122 123
#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
   strings.  */
124
# define gettext_noop(String) String
125 126
#endif

127 128 129 130
/* The `emacs' switch turns on certain matching commands
   that make sense only in Emacs. */
#ifdef emacs

131
# include "lisp.h"
132
# include "character.h"
133
# include "buffer.h"
134

135 136
# include "syntax.h"
# include "category.h"
137

138 139 140
/* Make syntax table lookup grant data in gl_state.  */
# define SYNTAX(c) syntax_property (c, 1)

141 142 143
# ifdef malloc
#  undef malloc
# endif
144
# define malloc xmalloc
145 146 147
# ifdef realloc
#  undef realloc
# endif
148
# define realloc xrealloc
149 150 151
# ifdef free
#  undef free
# endif
152
# define free xfree
153

Juanma Barranquero's avatar
Juanma Barranquero committed
154
/* Converts the pointer to the char to BEG-based offset from the start.  */
155
# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
156 157
/* Strings are 0-indexed, buffers are 1-indexed; we pun on the boolean
   result to get the right base index.  */
158 159 160
# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))

# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
161
# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
162 163 164 165
# define RE_STRING_CHAR(p, multibyte) \
  (multibyte ? (STRING_CHAR (p)) : (*(p)))
# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
  (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
166

167
# define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
168

169
# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
170

171 172 173
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
174 175
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)		     \
  do {									     \
176
    if (target_multibyte)						     \
177 178 179 180
      {									     \
	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
	re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));		     \
181
	c = STRING_CHAR (dtemp);					     \
182 183 184 185
      }									     \
    else								     \
      {									     \
	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
186
	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
187
      }									     \
188 189
  } while (0)

190 191 192 193
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
# define GET_CHAR_AFTER(c, p, len)		\
  do {						\
194
    if (target_multibyte)			\
195
      (c) = STRING_CHAR_AND_LENGTH (p, len);	\
196 197
    else					\
      {						\
198
	(c) = *p;				\
199
	len = 1;				\
200
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
201
      }						\
Kenichi Handa's avatar
Kenichi Handa committed
202
   } while (0)
Stefan Monnier's avatar
Stefan Monnier committed
203

204 205 206 207 208
#else  /* not emacs */

/* If we are not linking with Emacs proper,
   we can't use the relocating allocator
   even if config.h says that we can.  */
209
# undef REL_ALLOC
210

Paul Eggert's avatar
Paul Eggert committed
211
# include <unistd.h>
212

213 214
/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */

215
static void *
216
xmalloc (size_t size)
217
{
218
  void *val = malloc (size);
219 220
  if (!val && size)
    {
221
      write (STDERR_FILENO, "virtual memory exhausted\n", 25);
222 223 224 225 226
      exit (1);
    }
  return val;
}

227
static void *
228
xrealloc (void *block, size_t size)
229
{
230
  void *val;
231 232 233
  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  if (! block)
234
    val = malloc (size);
235
  else
236
    val = realloc (block, size);
237 238
  if (!val && size)
    {
239
      write (STDERR_FILENO, "virtual memory exhausted\n", 25);
240 241 242 243 244
      exit (1);
    }
  return val;
}

245 246 247 248 249 250 251 252 253
# ifdef malloc
#  undef malloc
# endif
# define malloc xmalloc
# ifdef realloc
#  undef realloc
# endif
# define realloc xrealloc

254
# include <stdbool.h>
Paul Eggert's avatar
Paul Eggert committed
255
# include <string.h>
256 257 258

/* Define the syntax stuff for \<, \>, etc.  */

259
/* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
260
enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
261

262
/* Dummy macros for non-Emacs environments.  */
263 264
# define MAX_MULTIBYTE_LENGTH 1
# define RE_MULTIBYTE_P(x) 0
265
# define RE_TARGET_MULTIBYTE_P(x) 0
266
# define WORD_BOUNDARY_P(c1, c2) (0)
267
# define BYTES_BY_CHAR_HEAD(p) (1)
268
# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
269 270
# define STRING_CHAR(p) (*(p))
# define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
271
# define CHAR_STRING(c, s) (*(s) = (c), 1)
272 273
# define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
# define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
274 275
# define RE_CHAR_TO_MULTIBYTE(c) (c)
# define RE_CHAR_TO_UNIBYTE(c) (c)
276
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
277
  (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
278 279
# define GET_CHAR_AFTER(c, p, len)	\
  (c = *p, len = 1)
280
# define CHAR_BYTE8_P(c) (0)
281
# define CHAR_LEADING_CODE(c) (c)
Kenichi Handa's avatar
Kenichi Handa committed
282

283
#endif /* not emacs */
Stefan Monnier's avatar
Stefan Monnier committed
284 285

#ifndef RE_TRANSLATE
286 287
# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
# define RE_TRANSLATE_P(TBL) (TBL)
Stefan Monnier's avatar
Stefan Monnier committed
288
#endif
289 290 291 292

/* Get the interface, including the syntax bits.  */
#include "regex.h"

293 294
/* isalpha etc. are used for the character classes.  */
#include <ctype.h>
295

296
#ifdef emacs
297

298
/* 1 if C is an ASCII character.  */
299
# define IS_REAL_ASCII(c) ((c) < 0200)
300

301
/* 1 if C is a unibyte character.  */
302
# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
303

304
/* The Emacs definitions should not be directly affected by locales.  */
305

306
/* In Emacs, these are only used for single-byte characters.  */
307 308
# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
# define ISCNTRL(c) ((c) < ' ')
309
# define ISXDIGIT(c) (0 <= char_hexdigit (c))
310 311 312

/* The rest must handle multibyte characters.  */

313 314 315 316
# define ISBLANK(c) (IS_REAL_ASCII (c)                  \
                     ? ((c) == ' ' || (c) == '\t')      \
                     : blankp (c))

317
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
318
		     ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240)	\
319
		     : graphicp (c))
320

321
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
322
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
323
		     : printablep (c))
324

325
# define ISALNUM(c) (IS_REAL_ASCII (c)			\
326 327 328
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
329
		    : alphanumericp (c))
330

331
# define ISALPHA(c) (IS_REAL_ASCII (c)			\
332 333
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
334
		    : alphabeticp (c))
335

336
# define ISLOWER(c) lowercasep (c)
337

338
# define ISPUNCT(c) (IS_REAL_ASCII (c)				\
339 340
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
341 342
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
343 344
		    : SYNTAX (c) != Sword)

345
# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
346

347
# define ISUPPER(c) uppercasep (c)
348

349
# define ISWORD(c) (SYNTAX (c) == Sword)
350 351 352

#else /* not emacs */

353
/* 1 if C is an ASCII character.  */
354
# define IS_REAL_ASCII(c) ((c) < 0200)
355 356

/* This distinction is not meaningful, except in Emacs.  */
357 358 359
# define ISUNIBYTE(c) 1

# ifdef isblank
360
#  define ISBLANK(c) isblank (c)
361 362 363 364
# else
#  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
# endif
# ifdef isgraph
365
#  define ISGRAPH(c) isgraph (c)
366
# else
367
#  define ISGRAPH(c) (isprint (c) && !isspace (c))
368 369
# endif

370
/* Solaris defines ISPRINT so we must undefine it first.  */
371
# undef ISPRINT
372 373 374 375 376 377 378 379 380 381
# define ISPRINT(c) isprint (c)
# define ISDIGIT(c) isdigit (c)
# define ISALNUM(c) isalnum (c)
# define ISALPHA(c) isalpha (c)
# define ISCNTRL(c) iscntrl (c)
# define ISLOWER(c) islower (c)
# define ISPUNCT(c) ispunct (c)
# define ISSPACE(c) isspace (c)
# define ISUPPER(c) isupper (c)
# define ISXDIGIT(c) isxdigit (c)
382

Juanma Barranquero's avatar
Juanma Barranquero committed
383
# define ISWORD(c) ISALPHA (c)
384

385
# ifdef _tolower
Juanma Barranquero's avatar
Juanma Barranquero committed
386
#  define TOLOWER(c) _tolower (c)
387
# else
Juanma Barranquero's avatar
Juanma Barranquero committed
388
#  define TOLOWER(c) tolower (c)
389 390 391 392 393
# endif

/* How many characters in the character set.  */
# define CHAR_SET_SIZE 256

394
# ifdef SYNTAX_TABLE
395

396
extern char *re_syntax_table;
397

398 399 400 401 402
# else /* not SYNTAX_TABLE */

static char re_syntax_table[CHAR_SET_SIZE];

static void
403
init_syntax_once (void)
404 405 406 407 408 409 410
{
   register int c;
   static int done = 0;

   if (done)
     return;

411
   memset (re_syntax_table, 0, sizeof re_syntax_table);
412

413 414 415
   for (c = 0; c < CHAR_SET_SIZE; ++c)
     if (ISALNUM (c))
	re_syntax_table[c] = Sword;
416

417
   re_syntax_table['_'] = Ssymbol;
418

419 420 421 422
   done = 1;
}

# endif /* not SYNTAX_TABLE */
423

424 425
# define SYNTAX(c) re_syntax_table[(c)]

426 427
#endif /* not emacs */

Paul Eggert's avatar
Paul Eggert committed
428
#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
429 430 431

/* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
   use `alloca' instead of `malloc'.  This is because using malloc in
432 433 434 435 436 437
   re_search* or re_match* could cause memory leaks when C-g is used
   in Emacs (note that SAFE_ALLOCA could also call malloc, but does so
   via `record_xmalloc' which uses `unwind_protect' to ensure the
   memory is freed even in case of non-local exits); also, malloc is
   slower and causes storage fragmentation.  On the other hand, malloc
   is more portable, and easier to debug.
438

439 440 441 442 443 444
   Because we sometimes use alloca, some routines have to be macros,
   not functions -- `alloca'-allocated space disappears at the end of the
   function it is called in.  */

#ifdef REGEX_MALLOC

445 446 447
# define REGEX_ALLOCATE malloc
# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
# define REGEX_FREE free
448 449 450

#else /* not REGEX_MALLOC  */

451
# ifdef emacs
452 453 454 455 456 457 458
/* This may be adjusted in main(), if the stack is successfully grown.  */
ptrdiff_t emacs_re_safe_alloca = MAX_ALLOCA;
/* Like USE_SAFE_ALLOCA, but use emacs_re_safe_alloca.  */
#  define REGEX_USE_SAFE_ALLOCA                                        \
  ptrdiff_t sa_avail = emacs_re_safe_alloca;                           \
  ptrdiff_t sa_count = SPECPDL_INDEX (); bool sa_must_free = false

459 460 461
#  define REGEX_SAFE_FREE() SAFE_FREE ()
#  define REGEX_ALLOCATE SAFE_ALLOCA
# else
Paul Eggert's avatar
Paul Eggert committed
462
#  include <alloca.h>
463 464
#  define REGEX_ALLOCATE alloca
# endif
465 466

/* Assumes a `char *destination' variable.  */
467
# define REGEX_REALLOCATE(source, osize, nsize)				\
468
  (destination = REGEX_ALLOCATE (nsize),				\
469
   memcpy (destination, source, osize))
470 471

/* No need to do anything to free, after alloca.  */
472
# define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
473 474 475

#endif /* not REGEX_MALLOC */

476 477 478 479 480
#ifndef REGEX_USE_SAFE_ALLOCA
# define REGEX_USE_SAFE_ALLOCA ((void) 0)
# define REGEX_SAFE_FREE() ((void) 0)
#endif

481 482
/* Define how to allocate the failure stack.  */

483
#if defined REL_ALLOC && defined REGEX_MALLOC
484

485
# define REGEX_ALLOCATE_STACK(size)				\
486
  r_alloc (&failure_stack_ptr, (size))
487
# define REGEX_REALLOCATE_STACK(source, osize, nsize)		\
488
  r_re_alloc (&failure_stack_ptr, (nsize))
489
# define REGEX_FREE_STACK(ptr)					\
490 491
  r_alloc_free (&failure_stack_ptr)

492
#else /* not using relocating allocator */
493

494 495 496
# define REGEX_ALLOCATE_STACK(size) REGEX_ALLOCATE (size)
# define REGEX_REALLOCATE_STACK(source, o, n) REGEX_REALLOCATE (source, o, n)
# define REGEX_FREE_STACK(ptr) REGEX_FREE (ptr)
497

498
#endif /* not using relocating allocator */
499 500 501 502 503


/* True if `size1' is non-NULL and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
504
#define FIRST_STRING_P(ptr)					\
505 506 507 508 509 510 511
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

/* (Re)Allocate N items of type T using malloc, or fail.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))

512
#define BYTEWIDTH 8 /* In bits.  */
513

Paul Eggert's avatar
Paul Eggert committed
514
#ifndef emacs
515 516 517 518 519
# undef max
# undef min
# define max(a, b) ((a) > (b) ? (a) : (b))
# define min(a, b) ((a) < (b) ? (a) : (b))
#endif
520

521
/* Type of source-pattern and string chars.  */
522 523
#ifdef _MSC_VER
typedef unsigned char re_char;
Paul Eggert's avatar
Paul Eggert committed
524
typedef const re_char const_re_char;
525
#else
526
typedef const unsigned char re_char;
Paul Eggert's avatar
Paul Eggert committed
527
typedef re_char const_re_char;
528
#endif
529

530 531
typedef char boolean;

Paul Eggert's avatar
Paul Eggert committed
532 533 534 535 536 537
static regoff_t re_match_2_internal (struct re_pattern_buffer *bufp,
				     re_char *string1, size_t size1,
				     re_char *string2, size_t size2,
				     ssize_t pos,
				     struct re_registers *regs,
				     ssize_t stop);
538 539

/* These are the command codes that appear in compiled regular
540
   expressions.  Some opcodes are followed by argument bytes.  A
541 542 543 544 545 546 547
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

548
  /* Succeed right away--no more backtracking.  */
549 550
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
551
	/* Followed by one byte giving n, then by n literal bytes.  */
552 553
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
554
	/* Matches any (more or less) character.  */
555 556
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
557 558 559 560 561
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
562 563 564 565 566
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
567
		   See RANGE_TABLE_WORK_BITS below.
568
	       2 bytes, the number of pairs that follow (upto 32767)
569
	       pairs, each 2 multibyte characters,
570
		   each multibyte character represented as 3 bytes.  */
571 572
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
573
	/* Same parameters as charset, but match any character that is
574
	   not one of those specified.  */
575 576
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
577 578 579
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
580
	   field.  */
581 582
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
583 584 585
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
	   number, in the range 0 to one less than `re_nsub' in the
586
	   pattern buffer.  */
587 588
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
589
	/* Match a duplicate of something remembered. Followed by one
590
	   byte containing the register number.  */
591 592
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
593
	/* Fail unless at beginning of line.  */
594 595
  begline,

596
	/* Fail unless at end of line.  */
597 598
  endline,

Richard M. Stallman's avatar
Richard M. Stallman committed
599 600
	/* Succeeds if at beginning of buffer (if emacs) or at beginning
	   of string to be matched (if not).  */
601 602
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
603
	/* Analogously, for end of buffer/string.  */
604
  endbuf,
605

Richard M. Stallman's avatar
Richard M. Stallman committed
606
	/* Followed by two byte relative address to which to jump.  */
607
  jump,
608

Richard M. Stallman's avatar
Richard M. Stallman committed
609
	/* Followed by two-byte relative address of place to resume at
Juanma Barranquero's avatar
Juanma Barranquero committed
610
	   in case of failure.  */
611
  on_failure_jump,
612

Richard M. Stallman's avatar
Richard M. Stallman committed
613 614
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
615
  on_failure_keep_string_jump,
616

617 618 619 620 621
	/* Just like `on_failure_jump', except that it checks that we
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

622 623 624 625 626 627
	/* Just like `on_failure_jump_loop', except that it checks for
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
	   by a `no_op'.  */
  on_failure_jump_nastyloop,

628
	/* A smart `on_failure_jump' used for greedy * and + operators.
Juanma Barranquero's avatar
Juanma Barranquero committed
629
	   It analyzes the loop before which it is put and if the
630
	   loop does not require backtracking, it changes itself to
Stefan Monnier's avatar
Stefan Monnier committed
631 632 633
	   `on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into `on_failure_jump'.
	   It assumes that it is pointing to just past a `jump'.  */
634
  on_failure_jump_smart,
635

Richard M. Stallman's avatar
Richard M. Stallman committed
636
	/* Followed by two-byte relative address and two-byte number n.
637 638 639
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
640 641
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
642 643
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
644 645
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
646
	/* Set the following two-byte relative address to the
Juanma Barranquero's avatar
Juanma Barranquero committed
647
	   subsequent two-byte number.  The address *includes* the two
Richard M. Stallman's avatar
Richard M. Stallman committed
648
	   bytes of number.  */
649 650 651 652 653 654
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
Juanma Barranquero's avatar
Juanma Barranquero committed
655
  notwordbound,	/* Succeeds if not at a word boundary.  */
656

657 658 659
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

660
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
661
	   a byte which contains a syntax code, e.g., Sword.  */
662 663 664
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
665 666 667
  notsyntaxspec

#ifdef emacs
668
  , at_dot,	/* Succeeds if at point.  */
669 670

  /* Matches any character whose category-set contains the specified
Juanma Barranquero's avatar
Juanma Barranquero committed
671 672
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
673 674 675 676 677 678
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705
#endif /* emacs */
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
  } while (0)

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
706
  ((destination) = extract_number (source))
707

708 709
static int
extract_number (re_char *source)
710
{
711 712
  unsigned leading_byte = SIGN_EXTEND_CHAR (source[1]);
  return (leading_byte << 8) + source[0];
713 714 715 716 717 718
}

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
719
  ((destination) = extract_number_and_incr (&source))
720

721 722
static int
extract_number_and_incr (re_char **source)
723
{
724
  int num = extract_number (*source);
725
  *source += 2;
726
  return num;
727 728
}

729 730
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Juanma Barranquero's avatar
Juanma Barranquero committed
731
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
732 733 734 735 736 737 738 739 740 741

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
  } while (0)

/* Put into DESTINATION a character stored in three contiguous bytes
Juanma Barranquero's avatar
Juanma Barranquero committed
742
   starting at SOURCE.  */
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
  } while (0)


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
759
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 ((p)[1] & 0x80)
760 761 762

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
763 764 765 766
   stored.  `2 +' means to skip re_opcode_t and size of bitmap,
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

767
#ifdef emacs
768 769 770 771
/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
772
#endif
773 774

/* Return the address of end of RANGE_TABLE.  COUNT is number of
Juanma Barranquero's avatar
Juanma Barranquero committed
775 776
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  `* 2'
   is start of range and end of range.  `* 3' is size of each start
777 778 779 780
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

781 782 783 784
/* If DEBUG is defined, Regex prints many voluminous messages about what
   it is doing (if the variable `debug' is nonzero).  If linked with the
   main program in `iregex.c', you can enter patterns and strings
   interactively.  And if linked with the main program in `main.c' and
785
   the other test files, you can run the already-written tests.  */
786 787 788 789

#ifdef DEBUG

/* We use standard I/O for debugging.  */
790
# include <stdio.h>
791 792

/* It is useful to test things that ``must'' be true when debugging.  */
793
# include <assert.h>
794

795
static int debug = -100000;
796

797
# define DEBUG_STATEMENT(e) e
798 799
# define DEBUG_PRINT(...) if (debug > 0) printf (__VA_ARGS__)
# define DEBUG_COMPILES_ARGUMENTS
800
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
801
  if (debug > 0) print_partial_compiled_pattern (s, e)
802
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
803
  if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
804 805 806 807


/* Print the fastmap in human-readable form.  */

808 809
static void
print_fastmap (char *fastmap)
810 811
{
  unsigned was_a_range = 0;
812 813
  unsigned i = 0;

814 815 816 817 818
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
	  was_a_range = 0;
Richard M. Stallman's avatar
Richard M. Stallman committed
819 820 821 822 823 824
	  putchar (i - 1);
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
	      was_a_range = 1;
	      i++;
	    }
825
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
826 827 828 829 830
	    {
	      printf ("-");
	      putchar (i - 1);
	    }
	}
831
    }
832
  putchar ('\n');
833 834 835 836 837 838
}


/* Print a compiled pattern string in human-readable form, starting at
   the START pointer into it and ending just before the pointer END.  */

839 840
static void
print_partial_compiled_pattern (re_char *start, re_char *end)
841 842
{
  int mcnt, mcnt2;
843 844
  re_char *p = start;
  re_char *pend = end;
845 846 847

  if (start == NULL)
    {
848
      fprintf (stderr, "(null)\n");
849 850
      return;
    }
851

852 853 854
  /* Loop over pattern commands.  */
  while (p < pend)
    {
855
      fprintf (stderr, "%td:\t", p - start);
856 857 858

      switch ((re_opcode_t) *p++)
	{
Richard M. Stallman's avatar
Richard M. Stallman committed
859
	case no_op:
860
	  fprintf (stderr, "/no_op");
Richard M. Stallman's avatar
Richard M. Stallman committed
861
	  break;
862

863
	case succeed:
864
	  fprintf (stderr, "/succeed");
865 866
	  break;

867 868
	case exactn:
	  mcnt = *p++;
869
	  fprintf (stderr, "/exactn/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
870
	  do
871
	    {
872
	      fprintf (stderr, "/%c", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
873 874 875
	    }
	  while (--mcnt);
	  break;
876 877

	case start_memory:
878
	  fprintf (stderr, "/start_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
879
	  break;
880 881

	case stop_memory:
882
	  fprintf (stderr, "/stop_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
883
	  break;
884 885

	case duplicate:
886
	  fprintf (stderr, "/duplicate/%d", *p++);
887 888 889
	  break;

	case anychar:
890
	  fprintf (stderr, "/anychar");
891 892 893
	  break;

	case charset:
Richard M. Stallman's avatar
Richard M. Stallman committed
894 895 896
	case charset_not:
	  {
	    register int c, last = -100;
897
	    register int in_range = 0;
898 899
	    int length = CHARSET_BITMAP_SIZE (p - 1);
	    int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
900

901
	    fprintf (stderr, "/charset [%s",
Kenichi Handa's avatar
Kenichi Handa committed
902
		     (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
903

Kenichi Handa's avatar
Kenichi Handa committed
904 905
	    if (p + *p >= pend)
	      fprintf (stderr, " !extends past end of pattern! ");
906

Richard M. Stallman's avatar
Richard M. Stallman committed
907
	    for (c = 0; c < 256; c++)
908
	      if (c / 8 < length
909 910 911 912 913
		  && (p[1 + (c/8)] & (1 << (c % 8))))
		{
		  /* Are we starting a range?  */
		  if (last + 1 == c && ! in_range)
		    {