regex.c 186 KB
Newer Older
Richard M. Stallman's avatar
Richard M. Stallman committed
1
/* Extended regular expression matching and search library, version
2
   0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
Karl Berry's avatar
Karl Berry committed
3 4
   internationalization features.)

5 6
   Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
                 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
7

8 9 10 11 12 13 14
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Richard M. Stallman's avatar
Richard M. Stallman committed
15
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
16 17 18 19
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
Lute Kamstra's avatar
Lute Kamstra committed
20
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
Richard M. Stallman's avatar
Richard M. Stallman committed
21
   USA.	 */
22

23
/* TODO:
24
   - structure the opcode space into opcode+flag.
25
   - merge with glibc's regex.[ch].
26
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
27 28 29
     need to modify the compiled regexp so that re_match can be reentrant.
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
     rather than at run-time, so that re_match can be reentrant.
30
*/
31

32
/* AIX requires this to be the first thing in the file. */
33
#if defined _AIX && !defined REGEX_MALLOC
34 35 36 37
  #pragma alloca
#endif

#ifdef HAVE_CONFIG_H
38
# include <config.h>
39 40
#endif

41 42 43 44 45 46
#if defined STDC_HEADERS && !defined emacs
# include <stddef.h>
#else
/* We need this for `regex.h', and perhaps for the Emacs include files.  */
# include <sys/types.h>
#endif
47

48 49
/* Whether to use ISO C Amendment 1 wide char functions.
   Those should not be used for Emacs since it uses its own.  */
50 51 52
#if defined _LIBC
#define WIDE_CHAR_SUPPORT 1
#else
53
#define WIDE_CHAR_SUPPORT \
54 55
	(HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
#endif
56 57 58

/* For platform which support the ISO C amendement 1 functionality we
   support user defined character classes.  */
59
#if WIDE_CHAR_SUPPORT
60 61 62 63 64
/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
# include <wchar.h>
# include <wctype.h>
#endif

65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
#ifdef _LIBC
/* We have to keep the namespace clean.  */
# define regfree(preg) __regfree (preg)
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
# define regerror(errcode, preg, errbuf, errbuf_size) \
	__regerror(errcode, preg, errbuf, errbuf_size)
# define re_set_registers(bu, re, nu, st, en) \
	__re_set_registers (bu, re, nu, st, en)
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
# define re_match(bufp, string, size, pos, regs) \
	__re_match (bufp, string, size, pos, regs)
# define re_search(bufp, string, size, startpos, range, regs) \
	__re_search (bufp, string, size, startpos, range, regs)
# define re_compile_pattern(pattern, length, bufp) \
	__re_compile_pattern (pattern, length, bufp)
# define re_set_syntax(syntax) __re_set_syntax (syntax)
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)

87 88 89 90 91
/* Make sure we call libc's function even if the user overrides them.  */
# define btowc __btowc
# define iswctype __iswctype
# define wctype __wctype

92 93 94 95 96 97 98 99 100 101
# define WEAK_ALIAS(a,b) weak_alias (a, b)

/* We are also using some library internals.  */
# include <locale/localeinfo.h>
# include <locale/elem-hash.h>
# include <langinfo.h>
#else
# define WEAK_ALIAS(a,b)
#endif

102
/* This is for other GNU distributions with internationalized messages.  */
103
#if HAVE_LIBINTL_H || defined _LIBC
104 105 106 107 108
# include <libintl.h>
#else
# define gettext(msgid) (msgid)
#endif

109 110 111
#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
   strings.  */
112
# define gettext_noop(String) String
113 114
#endif

115 116 117 118
/* The `emacs' switch turns on certain matching commands
   that make sense only in Emacs. */
#ifdef emacs

119 120
# include "lisp.h"
# include "buffer.h"
121 122

/* Make syntax table lookup grant data in gl_state.  */
123
# define SYNTAX_ENTRY_VIA_PROPERTY
124

125 126 127
# include "syntax.h"
# include "charset.h"
# include "category.h"
128

129 130 131
# ifdef malloc
#  undef malloc
# endif
132
# define malloc xmalloc
133 134 135
# ifdef realloc
#  undef realloc
# endif
136
# define realloc xrealloc
137 138 139
# ifdef free
#  undef free
# endif
140
# define free xfree
141

142 143 144 145 146 147
/* Converts the pointer to the char to BEG-based offset from the start.	 */
# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))

# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
# define RE_STRING_CHAR(p, s) \
Stefan Monnier's avatar
Stefan Monnier committed
148
  (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
149
# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
150 151 152 153 154
  (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))

/* Set C a (possibly multibyte) character before P.  P points into a
   string which is the virtual concatenation of STR1 (which ends at
   END1) or STR2 (which ends at END2).  */
155
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)		\
156 157 158
  do {									\
    if (multibyte)							\
       {						    		\
159 160
	 re_char *dtemp = (p) == (str2) ? (end1) : (p);		    	\
	 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
161 162 163
	 re_char *d0 = dtemp;						\
	 PREV_CHAR_BOUNDARY (d0, dlimit);				\
	 c = STRING_CHAR (d0, dtemp - d0);				\
164 165 166 167 168
       }						    		\
     else						    		\
       (c = ((p) == (str2) ? (end1) : (p))[-1]);			\
  } while (0)

Stefan Monnier's avatar
Stefan Monnier committed
169

170 171 172 173 174
#else  /* not emacs */

/* If we are not linking with Emacs proper,
   we can't use the relocating allocator
   even if config.h says that we can.  */
175
# undef REL_ALLOC
176

177 178 179
# if defined STDC_HEADERS || defined _LIBC
#  include <stdlib.h>
# else
180 181
char *malloc ();
char *realloc ();
182
# endif
183

184
/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
185
   If nothing else has been done, use the method below.  */
186 187 188 189 190 191 192
# ifdef INHIBIT_STRING_HEADER
#  if !(defined HAVE_BZERO && defined HAVE_BCOPY)
#   if !defined bzero && !defined bcopy
#    undef INHIBIT_STRING_HEADER
#   endif
#  endif
# endif
193

194
/* This is the normal way of making sure we have memcpy, memcmp and bzero.
195 196
   This is used in most programs--a few other programs avoid this
   by defining INHIBIT_STRING_HEADER.  */
197 198 199 200
# ifndef INHIBIT_STRING_HEADER
#  if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
#   include <string.h>
#   ifndef bzero
201 202 203 204 205
#    ifndef _LIBC
#     define bzero(s, n)	(memset (s, '\0', n), (s))
#    else
#     define bzero(s, n)	__bzero (s, n)
#    endif
206 207 208
#   endif
#  else
#   include <strings.h>
209 210 211 212 213 214
#   ifndef memcmp
#    define memcmp(s1, s2, n)	bcmp (s1, s2, n)
#   endif
#   ifndef memcpy
#    define memcpy(d, s, n)	(bcopy (s, d, n), (d))
#   endif
215 216
#  endif
# endif
217 218 219

/* Define the syntax stuff for \<, \>, etc.  */

220
/* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
221
enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
222

223 224 225 226 227
# ifdef SWITCH_ENUM_BUG
#  define SWITCH_ENUM_CAST(x) ((int)(x))
# else
#  define SWITCH_ENUM_CAST(x) (x)
# endif
228

229
/* Dummy macros for non-Emacs environments.  */
230 231 232 233 234 235 236 237 238 239
# define BASE_LEADING_CODE_P(c) (0)
# define CHAR_CHARSET(c) 0
# define CHARSET_LEADING_CODE_BASE(c) 0
# define MAX_MULTIBYTE_LENGTH 1
# define RE_MULTIBYTE_P(x) 0
# define WORD_BOUNDARY_P(c1, c2) (0)
# define CHAR_HEAD_P(p) (1)
# define SINGLE_BYTE_CHAR_P(c) (1)
# define SAME_CHARSET_P(c1, c2) (1)
# define MULTIBYTE_FORM_LENGTH(p, s) (1)
240
# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
241 242 243 244 245 246
# define STRING_CHAR(p, s) (*(p))
# define RE_STRING_CHAR STRING_CHAR
# define CHAR_STRING(c, s) (*(s) = (c), 1)
# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
247
  (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
248
# define MAKE_CHAR(charset, c1, c2) (c1)
249
#endif /* not emacs */
Stefan Monnier's avatar
Stefan Monnier committed
250 251

#ifndef RE_TRANSLATE
252 253
# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
# define RE_TRANSLATE_P(TBL) (TBL)
Stefan Monnier's avatar
Stefan Monnier committed
254
#endif
255 256 257 258

/* Get the interface, including the syntax bits.  */
#include "regex.h"

259 260
/* isalpha etc. are used for the character classes.  */
#include <ctype.h>
261

262
#ifdef emacs
263

264
/* 1 if C is an ASCII character.  */
265
# define IS_REAL_ASCII(c) ((c) < 0200)
266

267
/* 1 if C is a unibyte character.  */
268
# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
269

270
/* The Emacs definitions should not be directly affected by locales.  */
271

272
/* In Emacs, these are only used for single-byte characters.  */
273 274 275
# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
# define ISCNTRL(c) ((c) < ' ')
# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9')		\
276 277
		     || ((c) >= 'a' && (c) <= 'f')	\
		     || ((c) >= 'A' && (c) <= 'F'))
278 279

/* This is only used for single-byte characters.  */
280
# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
281 282 283

/* The rest must handle multibyte characters.  */

284
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
285
		    ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237)	\
286 287
		    : 1)

288
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
289
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
290 291
		    : 1)

292
# define ISALNUM(c) (IS_REAL_ASCII (c)			\
293 294 295
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
296 297
		    : SYNTAX (c) == Sword)

298
# define ISALPHA(c) (IS_REAL_ASCII (c)			\
299 300
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
301 302
		    : SYNTAX (c) == Sword)

303
# define ISLOWER(c) (LOWERCASEP (c))
304

305
# define ISPUNCT(c) (IS_REAL_ASCII (c)				\
306 307
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
308 309
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
310 311
		    : SYNTAX (c) != Sword)

312
# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
313

314
# define ISUPPER(c) (UPPERCASEP (c))
315

316
# define ISWORD(c) (SYNTAX (c) == Sword)
317 318 319

#else /* not emacs */

320 321 322 323 324
/* Jim Meyering writes:

   "... Some ctype macros are valid only for character codes that
   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
   using /bin/cc or gcc but without giving an ansi option).  So, all
325
   ctype uses should be through macros like ISPRINT...  If
326 327 328
   STDC_HEADERS is defined, then autoconf has verified that the ctype
   macros don't need to be guarded with references to isascii. ...
   Defining isascii to 1 should let any compiler worth its salt
329 330
   eliminate the && through constant folding."
   Solaris defines some of these symbols so we must undefine them first.  */
331

332
# undef ISASCII
333 334 335 336 337
# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
#  define ISASCII(c) 1
# else
#  define ISASCII(c) isascii(c)
# endif
338 339

/* 1 if C is an ASCII character.  */
340
# define IS_REAL_ASCII(c) ((c) < 0200)
341 342

/* This distinction is not meaningful, except in Emacs.  */
343 344 345 346 347 348 349 350 351 352 353 354 355
# define ISUNIBYTE(c) 1

# ifdef isblank
#  define ISBLANK(c) (ISASCII (c) && isblank (c))
# else
#  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
# endif
# ifdef isgraph
#  define ISGRAPH(c) (ISASCII (c) && isgraph (c))
# else
#  define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
# endif

356
# undef ISPRINT
357 358 359 360 361 362 363 364 365 366 367 368 369
# define ISPRINT(c) (ISASCII (c) && isprint (c))
# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
# define ISALNUM(c) (ISASCII (c) && isalnum (c))
# define ISALPHA(c) (ISASCII (c) && isalpha (c))
# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
# define ISLOWER(c) (ISASCII (c) && islower (c))
# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
# define ISSPACE(c) (ISASCII (c) && isspace (c))
# define ISUPPER(c) (ISASCII (c) && isupper (c))
# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))

# define ISWORD(c) ISALPHA(c)

370 371 372 373 374 375 376 377 378
# ifdef _tolower
#  define TOLOWER(c) _tolower(c)
# else
#  define TOLOWER(c) tolower(c)
# endif

/* How many characters in the character set.  */
# define CHAR_SET_SIZE 256

379
# ifdef SYNTAX_TABLE
380

381
extern char *re_syntax_table;
382

383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
# else /* not SYNTAX_TABLE */

static char re_syntax_table[CHAR_SET_SIZE];

static void
init_syntax_once ()
{
   register int c;
   static int done = 0;

   if (done)
     return;

   bzero (re_syntax_table, sizeof re_syntax_table);

398 399 400
   for (c = 0; c < CHAR_SET_SIZE; ++c)
     if (ISALNUM (c))
	re_syntax_table[c] = Sword;
401

402
   re_syntax_table['_'] = Ssymbol;
403

404 405 406 407
   done = 1;
}

# endif /* not SYNTAX_TABLE */
408

409 410
# define SYNTAX(c) re_syntax_table[(c)]

411 412
#endif /* not emacs */

413
#ifndef NULL
414
# define NULL (void *)0
415 416 417 418 419
#endif

/* We remove any previous definition of `SIGN_EXTEND_CHAR',
   since ours (we hope) works properly with all combinations of
   machines, compilers, `char' and `unsigned char' argument types.
420
   (Per Bothner suggested the basic approach.)  */
421 422
#undef SIGN_EXTEND_CHAR
#if __STDC__
423
# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
424 425
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
426
# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
427 428 429 430 431 432
#endif

/* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
   use `alloca' instead of `malloc'.  This is because using malloc in
   re_search* or re_match* could cause memory leaks when C-g is used in
   Emacs; also, malloc is slower and causes storage fragmentation.  On
433 434
   the other hand, malloc is more portable, and easier to debug.

435 436 437 438 439 440
   Because we sometimes use alloca, some routines have to be macros,
   not functions -- `alloca'-allocated space disappears at the end of the
   function it is called in.  */

#ifdef REGEX_MALLOC

441 442 443
# define REGEX_ALLOCATE malloc
# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
# define REGEX_FREE free
444 445 446 447

#else /* not REGEX_MALLOC  */

/* Emacs already defines alloca, sometimes.  */
448
# ifndef alloca
449 450

/* Make alloca work the best possible way.  */
451 452 453 454 455 456 457
#  ifdef __GNUC__
#   define alloca __builtin_alloca
#  else /* not __GNUC__ */
#   if HAVE_ALLOCA_H
#    include <alloca.h>
#   endif /* HAVE_ALLOCA_H */
#  endif /* not __GNUC__ */
458

459
# endif /* not alloca */
460

461
# define REGEX_ALLOCATE alloca
462 463

/* Assumes a `char *destination' variable.  */
464
# define REGEX_REALLOCATE(source, osize, nsize)				\
465
  (destination = (char *) alloca (nsize),				\
466
   memcpy (destination, source, osize))
467 468

/* No need to do anything to free, after alloca.  */
469
# define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
470 471 472 473 474

#endif /* not REGEX_MALLOC */

/* Define how to allocate the failure stack.  */

475
#if defined REL_ALLOC && defined REGEX_MALLOC
476

477
# define REGEX_ALLOCATE_STACK(size)				\
478
  r_alloc (&failure_stack_ptr, (size))
479
# define REGEX_REALLOCATE_STACK(source, osize, nsize)		\
480
  r_re_alloc (&failure_stack_ptr, (nsize))
481
# define REGEX_FREE_STACK(ptr)					\
482 483
  r_alloc_free (&failure_stack_ptr)

484
#else /* not using relocating allocator */
485

486
# ifdef REGEX_MALLOC
487

488 489 490
#  define REGEX_ALLOCATE_STACK malloc
#  define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
#  define REGEX_FREE_STACK free
491

492
# else /* not REGEX_MALLOC */
493

494
#  define REGEX_ALLOCATE_STACK alloca
495

496
#  define REGEX_REALLOCATE_STACK(source, osize, nsize)			\
497
   REGEX_REALLOCATE (source, osize, nsize)
Richard M. Stallman's avatar
Richard M. Stallman committed
498
/* No need to explicitly free anything.	 */
499
#  define REGEX_FREE_STACK(arg) ((void)0)
500

501
# endif /* not REGEX_MALLOC */
502
#endif /* not using relocating allocator */
503 504 505 506 507


/* True if `size1' is non-NULL and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
508
#define FIRST_STRING_P(ptr)					\
509 510 511 512 513 514 515 516 517
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

/* (Re)Allocate N items of type T using malloc, or fail.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
#define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))

518
#define BYTEWIDTH 8 /* In bits.  */
519 520 521 522 523 524 525 526 527 528 529 530

#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))

#undef MAX
#undef MIN
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

typedef char boolean;
#define false 0
#define true 1

531 532 533 534 535 536
static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
					re_char *string1, int size1,
					re_char *string2, int size2,
					int pos,
					struct re_registers *regs,
					int stop));
537 538

/* These are the command codes that appear in compiled regular
539
   expressions.  Some opcodes are followed by argument bytes.  A
540 541 542 543 544 545 546
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

547
  /* Succeed right away--no more backtracking.  */
548 549
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
550
	/* Followed by one byte giving n, then by n literal bytes.  */
551 552
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
553
	/* Matches any (more or less) character.  */
554 555
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
556 557 558 559 560
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
561 562 563 564 565
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
566
		   See RANGE_TABLE_WORK_BITS below.
567
	       2 bytes, the number of pairs that follow (upto 32767)
568
	       pairs, each 2 multibyte characters,
569
		   each multibyte character represented as 3 bytes.  */
570 571
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
572
	/* Same parameters as charset, but match any character that is
573
	   not one of those specified.  */
574 575
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
576 577 578
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
579
	   field.  */
580 581
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
582 583 584
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
	   number, in the range 0 to one less than `re_nsub' in the
585
	   pattern buffer.  */
586 587
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
588
	/* Match a duplicate of something remembered. Followed by one
589
	   byte containing the register number.  */
590 591
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
592
	/* Fail unless at beginning of line.  */
593 594
  begline,

595
	/* Fail unless at end of line.  */
596 597
  endline,

Richard M. Stallman's avatar
Richard M. Stallman committed
598 599
	/* Succeeds if at beginning of buffer (if emacs) or at beginning
	   of string to be matched (if not).  */
600 601
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
602
	/* Analogously, for end of buffer/string.  */
603
  endbuf,
604

Richard M. Stallman's avatar
Richard M. Stallman committed
605
	/* Followed by two byte relative address to which to jump.  */
606
  jump,
607

Richard M. Stallman's avatar
Richard M. Stallman committed
608 609
	/* Followed by two-byte relative address of place to resume at
	   in case of failure.	*/
610
  on_failure_jump,
611

Richard M. Stallman's avatar
Richard M. Stallman committed
612 613
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
614
  on_failure_keep_string_jump,
615

616 617 618 619 620
	/* Just like `on_failure_jump', except that it checks that we
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

621 622 623 624 625 626
	/* Just like `on_failure_jump_loop', except that it checks for
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
	   by a `no_op'.  */
  on_failure_jump_nastyloop,

627
	/* A smart `on_failure_jump' used for greedy * and + operators.
628 629
	   It analyses the loop before which it is put and if the
	   loop does not require backtracking, it changes itself to
Stefan Monnier's avatar
Stefan Monnier committed
630 631 632
	   `on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into `on_failure_jump'.
	   It assumes that it is pointing to just past a `jump'.  */
633
  on_failure_jump_smart,
634

Richard M. Stallman's avatar
Richard M. Stallman committed
635
	/* Followed by two-byte relative address and two-byte number n.
636 637 638
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
639 640
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
641 642
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
643 644
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
645 646 647
	/* Set the following two-byte relative address to the
	   subsequent two-byte number.	The address *includes* the two
	   bytes of number.  */
648 649 650 651 652 653
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
654
  notwordbound,	/* Succeeds if not at a word boundary.	*/
655

656 657 658
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

659
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
660
	   a byte which contains a syntax code, e.g., Sword.  */
661 662 663
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
664 665 666 667 668 669
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */
670 671

  /* Matches any character whose category-set contains the specified
Richard M. Stallman's avatar
Richard M. Stallman committed
672 673
     category.	The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).	*/
674 675 676 677 678 679
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712
#endif /* emacs */
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
  } while (0)

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8;		\
  } while (0)

#ifdef DEBUG
713
static void extract_number _RE_ARGS ((int *dest, re_char *source));
714 715 716
static void
extract_number (dest, source)
    int *dest;
717
    re_char *source;
718
{
719
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
720 721 722 723
  *dest = *source & 0377;
  *dest += temp << 8;
}

724
# ifndef EXTRACT_MACROS /* To debug the macros.  */
725 726 727
#  undef EXTRACT_NUMBER
#  define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
# endif /* not EXTRACT_MACROS */
728 729 730 731 732 733 734 735 736

#endif /* DEBUG */

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
  do {									\
    EXTRACT_NUMBER (destination, source);				\
Richard M. Stallman's avatar
Richard M. Stallman committed
737
    (source) += 2;							\
738 739 740
  } while (0)

#ifdef DEBUG
741 742
static void extract_number_and_incr _RE_ARGS ((int *destination,
					       re_char **source));
743 744 745
static void
extract_number_and_incr (destination, source)
    int *destination;
746
    re_char **source;
747
{
748 749 750 751
  extract_number (destination, *source);
  *source += 2;
}

752 753 754
# ifndef EXTRACT_MACROS
#  undef EXTRACT_NUMBER_AND_INCR
#  define EXTRACT_NUMBER_AND_INCR(dest, src) \
755
  extract_number_and_incr (&dest, &src)
756
# endif /* not EXTRACT_MACROS */
757 758 759

#endif /* DEBUG */

760 761
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Richard M. Stallman's avatar
Richard M. Stallman committed
762
   character is stored.	 Therefore, DESTINATION must be an lvalue.  */
763 764 765 766 767 768 769 770 771 772

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
  } while (0)

/* Put into DESTINATION a character stored in three contiguous bytes
Richard M. Stallman's avatar
Richard M. Stallman committed
773
   starting at SOURCE.	*/
774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
  } while (0)


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
790
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 ((p)[1] & 0x80)
791 792 793

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
794 795 796 797 798 799 800 801
   stored.  `2 +' means to skip re_opcode_t and size of bitmap,
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
802 803 804 805 806 807 808

/* Test if C is listed in the bitmap of charset P.  */
#define CHARSET_LOOKUP_BITMAP(p, c)				\
  ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH			\
   && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))

/* Return the address of end of RANGE_TABLE.  COUNT is number of
Richard M. Stallman's avatar
Richard M. Stallman committed
809 810
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.	 `* 2'
   is start of range and end of range.	`* 3' is size of each start
811 812 813 814
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

Richard M. Stallman's avatar
Richard M. Stallman committed
815
/* Test if C is in RANGE_TABLE.	 A flag NOT is negated if C is in.
816 817 818 819
   COUNT is number of ranges in RANGE_TABLE.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count)	\
  do									\
    {									\
820 821 822
      re_wchar_t range_start, range_end;				\
      re_char *p;							\
      re_char *range_table_end						\
823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845
	= CHARSET_RANGE_TABLE_END ((range_table), (count));		\
									\
      for (p = (range_table); p < range_table_end; p += 2 * 3)		\
	{								\
	  EXTRACT_CHARACTER (range_start, p);				\
	  EXTRACT_CHARACTER (range_end, p + 3);				\
									\
	  if (range_start <= (c) && (c) <= range_end)			\
	    {								\
	      (not) = !(not);						\
	      break;							\
	    }								\
	}								\
    }									\
  while (0)

/* Test if C is in range table of CHARSET.  The flag NOT is negated if
   C is listed in it.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset)			\
  do									\
    {									\
      /* Number of ranges in range table. */				\
      int count;							\
846 847
      re_char *range_table = CHARSET_RANGE_TABLE (charset);		\
      									\
848 849 850 851 852
      EXTRACT_NUMBER_AND_INCR (count, range_table);			\
      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count);	\
    }									\
  while (0)

853 854 855 856
/* If DEBUG is defined, Regex prints many voluminous messages about what
   it is doing (if the variable `debug' is nonzero).  If linked with the
   main program in `iregex.c', you can enter patterns and strings
   interactively.  And if linked with the main program in `main.c' and
857
   the other test files, you can run the already-written tests.  */
858 859 860 861

#ifdef DEBUG

/* We use standard I/O for debugging.  */
862
# include <stdio.h>
863 864

/* It is useful to test things that ``must'' be true when debugging.  */
865
# include <assert.h>
866

867
static int debug = -100000;
868

869 870 871 872 873 874
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
875
  if (debug > 0) print_partial_compiled_pattern (s, e)
876
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
877
  if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
878 879 880 881 882 883 884 885 886


/* Print the fastmap in human-readable form.  */

void
print_fastmap (fastmap)
    char *fastmap;
{
  unsigned was_a_range = 0;
887 888
  unsigned i = 0;

889 890 891 892 893
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
	  was_a_range = 0;
Richard M. Stallman's avatar
Richard M. Stallman committed
894 895 896 897 898 899
	  putchar (i - 1);
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
	      was_a_range = 1;
	      i++;
	    }
900
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
901 902 903 904 905
	    {
	      printf ("-");
	      putchar (i - 1);
	    }
	}
906
    }
907
  putchar ('\n');
908 909 910 911 912 913 914 915
}


/* Print a compiled pattern string in human-readable form, starting at
   the START pointer into it and ending just before the pointer END.  */

void
print_partial_compiled_pattern (start, end)
916 917
    re_char *start;
    re_char *end;
918 919
{
  int mcnt, mcnt2;
920 921
  re_char *p = start;
  re_char *pend = end;
922 923 924

  if (start == NULL)
    {
925
      fprintf (stderr, "(null)\n");
926 927
      return;
    }
928

929 930 931
  /* Loop over pattern commands.  */
  while (p < pend)
    {
932
      fprintf (stderr, "%d:\t", p - start);
933 934 935

      switch ((re_opcode_t) *p++)
	{
Richard M. Stallman's avatar
Richard M. Stallman committed
936
	case no_op:
937
	  fprintf (stderr, "/no_op");
Richard M. Stallman's avatar
Richard M. Stallman committed
938
	  break;
939

940
	case succeed:
941
	  fprintf (stderr, "/succeed");
942 943
	  break;

944 945
	case exactn:
	  mcnt = *p++;
946
	  fprintf (stderr, "/exactn/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
947
	  do
948
	    {
949
	      fprintf (stderr, "/%c", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
950 951 952
	    }
	  while (--mcnt);
	  break;
953 954

	case start_memory:
955
	  fprintf (stderr, "/start_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
956
	  break;
957 958

	case stop_memory:
959
	  fprintf (stderr, "/stop_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
960
	  break;
961 962

	case duplicate:
963
	  fprintf (stderr, "/duplicate/%d", *p++);
964 965 966
	  break;

	case anychar:
967
	  fprintf (stderr, "/anychar");
968 969 970
	  break;

	case charset:
Richard M. Stallman's avatar
Richard M. Stallman committed
971 972 973
	case charset_not:
	  {
	    register int c, last = -100;
974
	    register int in_range = 0;
975 976
	    int length = CHARSET_BITMAP_SIZE (p - 1);
	    int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
977

978
	    fprintf (stderr, "/charset [%s",
979
		     (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
980

981 982
	    if (p + *p >= pend)
	      fprintf (stderr, " !extends past end of pattern! ");
983

Richard M. Stallman's avatar
Richard M. Stallman committed
984
	    for (c = 0; c < 256; c++)
985
	      if (c / 8 < length
986 987 988 989 990
		  && (p[1 + (c/8)] & (1 << (c % 8))))
		{
		  /* Are we starting a range?  */
		  if (last + 1 == c && ! in_range)
		    {
991
		      fprintf (stderr, "-");
992 993 994 995
		      in_range = 1;
		    }
		  /* Have we broken a range?  */
		  else if (last + 1 != c && in_range)
996
		    {
997
		      fprintf (stderr, "%c", last);
998 999
		      in_range = 0;
		    }
1000

1001
		  if (! in_range)
1002
		    fprintf (stderr, "%c", c);
1003 1004

		  last = c;
Richard M. Stallman's avatar
Richard M. Stallman committed
1005
	      }
1006 1007

	    if (in_range)
1008
	      fprintf (stderr, "%c", last);
1009

1010
	    fprintf (stderr, "]");
1011

1012
	    p += 1 + length;
1013 1014

	    if (has_range_table)
1015 1016
	      {
		int count;
1017
		fprintf (stderr, "has-range-table");
1018 1019 1020 1021 1022 1023

		/* ??? Should print the range table; for now, just skip it.  */
		p += 2;		/* skip range table bits */
		EXTRACT_NUMBER_AND_INCR (count, p);
		p = CHARSET_RANGE_TABLE_END (p, count);
	      }
1024 1025 1026 1027
	  }
	  break;

	case begline:
1028
	  fprintf (stderr, "/begline");
Richard M. Stallman's avatar
Richard M. Stallman committed
1029
	  break;
1030 1031

	case endline:
1032
	  fprintf (stderr, "/endline");
Richard M. Stallman's avatar
Richard M. Stallman committed
1033
	  break;
1034 1035

	case on_failure_jump:
Richard M. Stallman's avatar
Richard M. Stallman committed
1036
	  extract_number_and_incr (&mcnt, &p);
1037
	  fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
1038
	  break;
1039 1040

	case on_failure_keep_string_jump:
Richard M. Stallman's avatar
Richard M. Stallman committed
1041
	  extract_number_and_incr (&mcnt, &p);
1042
	  fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
Richard M. Stallman's avatar
Richard M. Stallman committed
1043
	  break;
1044

1045 1046
	case on_failure_jump_nastyloop:
	  extract_number_and_incr (&mcnt, &p);
1047
	  fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
1048 1049
	  break;

1050
	case on_failure_jump_loop:
1051
	  extract_number_and_incr (&mcnt, &p);
1052
	  fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
1053 1054
	  break;

1055
	case on_failure_jump_smart:
1056
	  extract_number_and_incr (&mcnt, &p);
1057
	  fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
1058 1059
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
1060
	case jump:
1061
	  extract_number_and_incr (&mcnt, &p);
1062
	  fprintf (stderr, "/jump to %d", p + mcnt - start);
1063 1064
	  break;

Richard M. Stallman's avatar
Richard M. Stallman committed
1065 1066 1067
	case succeed_n:
	  extract_number_and_incr (&mcnt, &p);
	  extract_number_and_incr (&mcnt2, &p);
1068
	  fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
1069
	  break;
1070

Richard M. Stallman's avatar
Richard M. Stallman committed
1071 1072 1073
	case jump_n:
	  extract_number_and_incr (&mcnt, &p);
	  extract_number_and_incr (&mcnt2, &p);
1074
	  fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
1075
	  break;
1076

Richard M. Stallman's avatar
Richard M. Stallman committed
1077 1078 1079
	case set_number_at:
	  extract_number_and_incr (&mcnt, &p);
	  extract_number_and_incr (&mcnt2, &p);
1080
	  fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
Richard M. Stallman's avatar
Richard M. Stallman committed
1081
	  break;
1082

Richard M. Stallman's avatar
Richard M. Stallman committed
1083
	case wordbound:
1084
	  fprintf (stderr, "/wordbound");
1085 1086 1087
	  break;

	case notwordbound:
1088
	  fprintf (stderr, "/notwordbound");
Richard M. Stallman's avatar
Richard M. Stallman committed
1089
	  break;
1090 1091

	case wordbeg:
1092
	  fprintf (stderr, "/wordbeg");
1093
	  break;
1094

1095
	case wordend:
1096
	  fprintf (stderr, "/wordend");
1097
	  break;
1098

1099
	case symbeg:
1100
	  fprintf (stderr, "/symbeg");
1101 1102 1103
	  break;

	case symend:
1104
	  fprintf (stderr, "/symend");
1105 1106
	  break;

1107
	case syntaxspec:
1108
	  fprintf (stderr, "/syntaxspec");
1109
	  mcnt = *p++;
1110
	  fprintf (stderr, "/%d", mcnt);
1111 1112 1113
	  break;

	case notsyntaxspec:
1114
	  fprintf (stderr, "/notsyntaxspec");
1115
	  mcnt = *p++;
1116
	  fprintf (stderr, "/%d", mcnt);
1117 1118
	  break;

1119
# ifdef emacs