regex.c 195 KB
Newer Older
Richard M. Stallman's avatar
Richard M. Stallman committed
1
/* Extended regular expression matching and search library, version
2
   0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
Karl Berry's avatar
Karl Berry committed
3 4
   internationalization features.)

5
   Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
Stefan Monnier's avatar
Stefan Monnier committed
6
                 2002, 2003, 2004, 2005, 2006, 2007, 2008
Miles Bader's avatar
Miles Bader committed
7
                 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
8

9 10
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
Miles Bader's avatar
Miles Bader committed
11
   the Free Software Foundation; either version 3, or (at your option)
12 13 14 15
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Juanma Barranquero's avatar
Juanma Barranquero committed
16
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 18 19 20
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
Lute Kamstra's avatar
Lute Kamstra committed
21
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
Juanma Barranquero's avatar
Juanma Barranquero committed
22
   USA.  */
23

24
/* TODO:
25
   - structure the opcode space into opcode+flag.
26
   - merge with glibc's regex.[ch].
27
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
28 29 30
     need to modify the compiled regexp so that re_match can be reentrant.
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
     rather than at run-time, so that re_match can be reentrant.
31
*/
32

33
/* AIX requires this to be the first thing in the file. */
34
#if defined _AIX && !defined REGEX_MALLOC
35 36 37 38
  #pragma alloca
#endif

#ifdef HAVE_CONFIG_H
39
# include <config.h>
40 41
#endif

42 43 44 45 46 47
#if defined STDC_HEADERS && !defined emacs
# include <stddef.h>
#else
/* We need this for `regex.h', and perhaps for the Emacs include files.  */
# include <sys/types.h>
#endif
48

49 50
/* Whether to use ISO C Amendment 1 wide char functions.
   Those should not be used for Emacs since it uses its own.  */
51 52 53
#if defined _LIBC
#define WIDE_CHAR_SUPPORT 1
#else
54
#define WIDE_CHAR_SUPPORT \
55 56
	(HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
#endif
57 58 59

/* For platform which support the ISO C amendement 1 functionality we
   support user defined character classes.  */
60
#if WIDE_CHAR_SUPPORT
61 62 63 64 65
/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
# include <wchar.h>
# include <wctype.h>
#endif

66 67 68 69 70
#ifdef _LIBC
/* We have to keep the namespace clean.  */
# define regfree(preg) __regfree (preg)
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
71 72
# define regerror(err_code, preg, errbuf, errbuf_size) \
	__regerror(err_code, preg, errbuf, errbuf_size)
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
# define re_set_registers(bu, re, nu, st, en) \
	__re_set_registers (bu, re, nu, st, en)
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
# define re_match(bufp, string, size, pos, regs) \
	__re_match (bufp, string, size, pos, regs)
# define re_search(bufp, string, size, startpos, range, regs) \
	__re_search (bufp, string, size, startpos, range, regs)
# define re_compile_pattern(pattern, length, bufp) \
	__re_compile_pattern (pattern, length, bufp)
# define re_set_syntax(syntax) __re_set_syntax (syntax)
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)

88 89 90 91 92
/* Make sure we call libc's function even if the user overrides them.  */
# define btowc __btowc
# define iswctype __iswctype
# define wctype __wctype

93 94 95 96 97 98 99 100 101 102
# define WEAK_ALIAS(a,b) weak_alias (a, b)

/* We are also using some library internals.  */
# include <locale/localeinfo.h>
# include <locale/elem-hash.h>
# include <langinfo.h>
#else
# define WEAK_ALIAS(a,b)
#endif

103
/* This is for other GNU distributions with internationalized messages.  */
104
#if HAVE_LIBINTL_H || defined _LIBC
105 106 107 108 109
# include <libintl.h>
#else
# define gettext(msgid) (msgid)
#endif

110 111 112
#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
   strings.  */
113
# define gettext_noop(String) String
114 115
#endif

116 117 118 119
/* The `emacs' switch turns on certain matching commands
   that make sense only in Emacs. */
#ifdef emacs

120 121
# include "lisp.h"
# include "buffer.h"
122 123

/* Make syntax table lookup grant data in gl_state.  */
124
# define SYNTAX_ENTRY_VIA_PROPERTY
125

126
# include "syntax.h"
127
# include "character.h"
128
# include "category.h"
129

130 131 132
# ifdef malloc
#  undef malloc
# endif
133
# define malloc xmalloc
134 135 136
# ifdef realloc
#  undef realloc
# endif
137
# define realloc xrealloc
138 139 140
# ifdef free
#  undef free
# endif
141
# define free xfree
142

Juanma Barranquero's avatar
Juanma Barranquero committed
143
/* Converts the pointer to the char to BEG-based offset from the start.  */
144 145 146 147
# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))

# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
148
# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
149
# define RE_STRING_CHAR(p, s, multibyte) \
Stefan Monnier's avatar
Stefan Monnier committed
150
  (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
151
# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
152 153
  (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))

154 155 156 157 158 159 160
# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]

# define RE_CHAR_TO_UNIBYTE(c)			\
  (ASCII_CHAR_P (c) ? (c)			\
   : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c)	\
   : multibyte_char_to_unibyte_safe (c))

161 162 163
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
164 165
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)		     \
  do {									     \
166
    if (target_multibyte)						     \
167 168 169 170 171 172 173 174 175
      {									     \
	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
	re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));		     \
	c = STRING_CHAR (dtemp, (p) - dtemp);				     \
      }									     \
    else								     \
      {									     \
	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
176
	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
177
      }									     \
178 179
  } while (0)

180 181 182 183
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
# define GET_CHAR_AFTER(c, p, len)		\
  do {						\
184
    if (target_multibyte)			\
185
      (c) = STRING_CHAR_AND_LENGTH (p, 0, len);	\
186 187
    else					\
      {						\
188
	(c) = *p;				\
189
	len = 1;				\
190
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
191
      }						\
Kenichi Handa's avatar
Kenichi Handa committed
192
   } while (0)
Stefan Monnier's avatar
Stefan Monnier committed
193

194 195 196 197 198
#else  /* not emacs */

/* If we are not linking with Emacs proper,
   we can't use the relocating allocator
   even if config.h says that we can.  */
199
# undef REL_ALLOC
200

201 202 203
# if defined STDC_HEADERS || defined _LIBC
#  include <stdlib.h>
# else
204 205
char *malloc ();
char *realloc ();
206
# endif
207

208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */

void *
xmalloc (size)
     size_t size;
{
  register void *val;
  val = (void *) malloc (size);
  if (!val && size)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}

void *
xrealloc (block, size)
     void *block;
     size_t size;
{
  register void *val;
  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  if (! block)
    val = (void *) malloc (size);
  else
    val = (void *) realloc (block, size);
  if (!val && size)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}

244 245 246 247 248 249 250 251 252
# ifdef malloc
#  undef malloc
# endif
# define malloc xmalloc
# ifdef realloc
#  undef realloc
# endif
# define realloc xrealloc

253
/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
254
   If nothing else has been done, use the method below.  */
255 256 257 258 259 260 261
# ifdef INHIBIT_STRING_HEADER
#  if !(defined HAVE_BZERO && defined HAVE_BCOPY)
#   if !defined bzero && !defined bcopy
#    undef INHIBIT_STRING_HEADER
#   endif
#  endif
# endif
262

263
/* This is the normal way of making sure we have memcpy, memcmp and bzero.
264 265
   This is used in most programs--a few other programs avoid this
   by defining INHIBIT_STRING_HEADER.  */
266 267 268 269
# ifndef INHIBIT_STRING_HEADER
#  if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
#   include <string.h>
#   ifndef bzero
270 271 272 273 274
#    ifndef _LIBC
#     define bzero(s, n)	(memset (s, '\0', n), (s))
#    else
#     define bzero(s, n)	__bzero (s, n)
#    endif
275 276 277
#   endif
#  else
#   include <strings.h>
278 279 280 281 282 283
#   ifndef memcmp
#    define memcmp(s1, s2, n)	bcmp (s1, s2, n)
#   endif
#   ifndef memcpy
#    define memcpy(d, s, n)	(bcopy (s, d, n), (d))
#   endif
284 285
#  endif
# endif
286 287 288

/* Define the syntax stuff for \<, \>, etc.  */

289
/* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
290
enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
291

292
#  define SWITCH_ENUM_CAST(x) (x)
293

294
/* Dummy macros for non-Emacs environments.  */
295 296 297 298 299
# define BASE_LEADING_CODE_P(c) (0)
# define CHAR_CHARSET(c) 0
# define CHARSET_LEADING_CODE_BASE(c) 0
# define MAX_MULTIBYTE_LENGTH 1
# define RE_MULTIBYTE_P(x) 0
300
# define RE_TARGET_MULTIBYTE_P(x) 0
301 302 303 304 305
# define WORD_BOUNDARY_P(c1, c2) (0)
# define CHAR_HEAD_P(p) (1)
# define SINGLE_BYTE_CHAR_P(c) (1)
# define SAME_CHARSET_P(c1, c2) (1)
# define MULTIBYTE_FORM_LENGTH(p, s) (1)
306
# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
307
# define STRING_CHAR(p, s) (*(p))
308
# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
309 310
# define CHAR_STRING(c, s) (*(s) = (c), 1)
# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
311
# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) STRING_CHAR_AND_LENGTH ((p), (s), (len))
312 313
# define RE_CHAR_TO_MULTIBYTE(c) (c)
# define RE_CHAR_TO_UNIBYTE(c) (c)
314
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
315
  (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
316 317
# define GET_CHAR_AFTER(c, p, len)	\
  (c = *p, len = 1)
318
# define MAKE_CHAR(charset, c1, c2) (c1)
319 320
# define BYTE8_TO_CHAR(c) (c)
# define CHAR_BYTE8_P(c) (0)
321
# define CHAR_LEADING_CODE(c) (c)
Kenichi Handa's avatar
Kenichi Handa committed
322

323
#endif /* not emacs */
Stefan Monnier's avatar
Stefan Monnier committed
324 325

#ifndef RE_TRANSLATE
326 327
# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
# define RE_TRANSLATE_P(TBL) (TBL)
Stefan Monnier's avatar
Stefan Monnier committed
328
#endif
329 330 331 332

/* Get the interface, including the syntax bits.  */
#include "regex.h"

333 334
/* isalpha etc. are used for the character classes.  */
#include <ctype.h>
335

336
#ifdef emacs
337

338
/* 1 if C is an ASCII character.  */
339
# define IS_REAL_ASCII(c) ((c) < 0200)
340

341
/* 1 if C is a unibyte character.  */
342
# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
343

344
/* The Emacs definitions should not be directly affected by locales.  */
345

346
/* In Emacs, these are only used for single-byte characters.  */
347 348 349
# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
# define ISCNTRL(c) ((c) < ' ')
# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9')		\
350 351
		     || ((c) >= 'a' && (c) <= 'f')	\
		     || ((c) >= 'A' && (c) <= 'F'))
352 353

/* This is only used for single-byte characters.  */
354
# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
355 356 357

/* The rest must handle multibyte characters.  */

358
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
359
		    ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237)	\
360 361
		    : 1)

362
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
363
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
364 365
		    : 1)

366
# define ISALNUM(c) (IS_REAL_ASCII (c)			\
367 368 369
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
370 371
		    : SYNTAX (c) == Sword)

372
# define ISALPHA(c) (IS_REAL_ASCII (c)			\
373 374
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
375 376
		    : SYNTAX (c) == Sword)

377
# define ISLOWER(c) (LOWERCASEP (c))
378

379
# define ISPUNCT(c) (IS_REAL_ASCII (c)				\
380 381
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
382 383
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
384 385
		    : SYNTAX (c) != Sword)

386
# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
387

388
# define ISUPPER(c) (UPPERCASEP (c))
389

390
# define ISWORD(c) (SYNTAX (c) == Sword)
391 392 393

#else /* not emacs */

394 395 396 397 398
/* Jim Meyering writes:

   "... Some ctype macros are valid only for character codes that
   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
   using /bin/cc or gcc but without giving an ansi option).  So, all
399
   ctype uses should be through macros like ISPRINT...  If
400 401 402
   STDC_HEADERS is defined, then autoconf has verified that the ctype
   macros don't need to be guarded with references to isascii. ...
   Defining isascii to 1 should let any compiler worth its salt
403 404
   eliminate the && through constant folding."
   Solaris defines some of these symbols so we must undefine them first.  */
405

406
# undef ISASCII
407 408 409 410 411
# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
#  define ISASCII(c) 1
# else
#  define ISASCII(c) isascii(c)
# endif
412 413

/* 1 if C is an ASCII character.  */
414
# define IS_REAL_ASCII(c) ((c) < 0200)
415 416

/* This distinction is not meaningful, except in Emacs.  */
417 418 419 420 421 422 423 424 425 426 427 428 429
# define ISUNIBYTE(c) 1

# ifdef isblank
#  define ISBLANK(c) (ISASCII (c) && isblank (c))
# else
#  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
# endif
# ifdef isgraph
#  define ISGRAPH(c) (ISASCII (c) && isgraph (c))
# else
#  define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
# endif

430
# undef ISPRINT
431 432 433 434 435 436 437 438 439 440 441 442 443
# define ISPRINT(c) (ISASCII (c) && isprint (c))
# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
# define ISALNUM(c) (ISASCII (c) && isalnum (c))
# define ISALPHA(c) (ISASCII (c) && isalpha (c))
# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
# define ISLOWER(c) (ISASCII (c) && islower (c))
# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
# define ISSPACE(c) (ISASCII (c) && isspace (c))
# define ISUPPER(c) (ISASCII (c) && isupper (c))
# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))

# define ISWORD(c) ISALPHA(c)

444 445 446 447 448 449 450 451 452
# ifdef _tolower
#  define TOLOWER(c) _tolower(c)
# else
#  define TOLOWER(c) tolower(c)
# endif

/* How many characters in the character set.  */
# define CHAR_SET_SIZE 256

453
# ifdef SYNTAX_TABLE
454

455
extern char *re_syntax_table;
456

457 458 459 460 461 462 463 464 465 466 467 468 469 470 471
# else /* not SYNTAX_TABLE */

static char re_syntax_table[CHAR_SET_SIZE];

static void
init_syntax_once ()
{
   register int c;
   static int done = 0;

   if (done)
     return;

   bzero (re_syntax_table, sizeof re_syntax_table);

472 473 474
   for (c = 0; c < CHAR_SET_SIZE; ++c)
     if (ISALNUM (c))
	re_syntax_table[c] = Sword;
475

476
   re_syntax_table['_'] = Ssymbol;
477

478 479 480 481
   done = 1;
}

# endif /* not SYNTAX_TABLE */
482

483 484
# define SYNTAX(c) re_syntax_table[(c)]

485 486
#endif /* not emacs */

487
#ifndef NULL
488
# define NULL (void *)0
489 490 491 492 493
#endif

/* We remove any previous definition of `SIGN_EXTEND_CHAR',
   since ours (we hope) works properly with all combinations of
   machines, compilers, `char' and `unsigned char' argument types.
494
   (Per Bothner suggested the basic approach.)  */
495 496
#undef SIGN_EXTEND_CHAR
#if __STDC__
497
# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
498 499
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
500
# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
501 502 503 504 505 506
#endif

/* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
   use `alloca' instead of `malloc'.  This is because using malloc in
   re_search* or re_match* could cause memory leaks when C-g is used in
   Emacs; also, malloc is slower and causes storage fragmentation.  On
507 508
   the other hand, malloc is more portable, and easier to debug.

509 510 511 512 513 514
   Because we sometimes use alloca, some routines have to be macros,
   not functions -- `alloca'-allocated space disappears at the end of the
   function it is called in.  */

#ifdef REGEX_MALLOC

515 516 517
# define REGEX_ALLOCATE malloc
# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
# define REGEX_FREE free
518 519 520 521

#else /* not REGEX_MALLOC  */

/* Emacs already defines alloca, sometimes.  */
522
# ifndef alloca
523 524

/* Make alloca work the best possible way.  */
525 526 527
#  ifdef __GNUC__
#   define alloca __builtin_alloca
#  else /* not __GNUC__ */
528
#   ifdef HAVE_ALLOCA_H
529 530 531
#    include <alloca.h>
#   endif /* HAVE_ALLOCA_H */
#  endif /* not __GNUC__ */
532

533
# endif /* not alloca */
534

535
# define REGEX_ALLOCATE alloca
536 537

/* Assumes a `char *destination' variable.  */
538
# define REGEX_REALLOCATE(source, osize, nsize)				\
539
  (destination = (char *) alloca (nsize),				\
540
   memcpy (destination, source, osize))
541 542

/* No need to do anything to free, after alloca.  */
543
# define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
544 545 546 547 548

#endif /* not REGEX_MALLOC */

/* Define how to allocate the failure stack.  */

549
#if defined REL_ALLOC && defined REGEX_MALLOC
550

551
# define REGEX_ALLOCATE_STACK(size)				\
552
  r_alloc (&failure_stack_ptr, (size))
553
# define REGEX_REALLOCATE_STACK(source, osize, nsize)		\
554
  r_re_alloc (&failure_stack_ptr, (nsize))
555
# define REGEX_FREE_STACK(ptr)					\
556 557
  r_alloc_free (&failure_stack_ptr)

558
#else /* not using relocating allocator */
559

560
# ifdef REGEX_MALLOC
561

562 563 564
#  define REGEX_ALLOCATE_STACK malloc
#  define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
#  define REGEX_FREE_STACK free
565

566
# else /* not REGEX_MALLOC */
567

568
#  define REGEX_ALLOCATE_STACK alloca
569

570
#  define REGEX_REALLOCATE_STACK(source, osize, nsize)			\
571
   REGEX_REALLOCATE (source, osize, nsize)
Juanma Barranquero's avatar
Juanma Barranquero committed
572
/* No need to explicitly free anything.  */
573
#  define REGEX_FREE_STACK(arg) ((void)0)
574

575
# endif /* not REGEX_MALLOC */
576
#endif /* not using relocating allocator */
577 578 579 580 581


/* True if `size1' is non-NULL and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
582
#define FIRST_STRING_P(ptr)					\
583 584 585 586 587 588 589 590 591
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

/* (Re)Allocate N items of type T using malloc, or fail.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
#define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))

592
#define BYTEWIDTH 8 /* In bits.  */
593 594 595 596 597 598 599 600

#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))

#undef MAX
#undef MIN
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

601 602 603
/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

604 605 606 607
typedef char boolean;
#define false 0
#define true 1

608 609 610 611 612 613
static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
					re_char *string1, int size1,
					re_char *string2, int size2,
					int pos,
					struct re_registers *regs,
					int stop));
614 615

/* These are the command codes that appear in compiled regular
616
   expressions.  Some opcodes are followed by argument bytes.  A
617 618 619 620 621 622 623
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

624
  /* Succeed right away--no more backtracking.  */
625 626
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
627
	/* Followed by one byte giving n, then by n literal bytes.  */
628 629
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
630
	/* Matches any (more or less) character.  */
631 632
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
633 634 635 636 637
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
638 639 640 641 642
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
643
		   See RANGE_TABLE_WORK_BITS below.
644
	       2 bytes, the number of pairs that follow (upto 32767)
645
	       pairs, each 2 multibyte characters,
646
		   each multibyte character represented as 3 bytes.  */
647 648
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
649
	/* Same parameters as charset, but match any character that is
650
	   not one of those specified.  */
651 652
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
653 654 655
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
656
	   field.  */
657 658
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
659 660 661
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
	   number, in the range 0 to one less than `re_nsub' in the
662
	   pattern buffer.  */
663 664
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
665
	/* Match a duplicate of something remembered. Followed by one
666
	   byte containing the register number.  */
667 668
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
669
	/* Fail unless at beginning of line.  */
670 671
  begline,

672
	/* Fail unless at end of line.  */
673 674
  endline,

Richard M. Stallman's avatar
Richard M. Stallman committed
675 676
	/* Succeeds if at beginning of buffer (if emacs) or at beginning
	   of string to be matched (if not).  */
677 678
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
679
	/* Analogously, for end of buffer/string.  */
680
  endbuf,
681

Richard M. Stallman's avatar
Richard M. Stallman committed
682
	/* Followed by two byte relative address to which to jump.  */
683
  jump,
684

Richard M. Stallman's avatar
Richard M. Stallman committed
685
	/* Followed by two-byte relative address of place to resume at
Juanma Barranquero's avatar
Juanma Barranquero committed
686
	   in case of failure.  */
687
  on_failure_jump,
688

Richard M. Stallman's avatar
Richard M. Stallman committed
689 690
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
691
  on_failure_keep_string_jump,
692

693 694 695 696 697
	/* Just like `on_failure_jump', except that it checks that we
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

698 699 700 701 702 703
	/* Just like `on_failure_jump_loop', except that it checks for
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
	   by a `no_op'.  */
  on_failure_jump_nastyloop,

704
	/* A smart `on_failure_jump' used for greedy * and + operators.
705 706
	   It analyses the loop before which it is put and if the
	   loop does not require backtracking, it changes itself to
Stefan Monnier's avatar
Stefan Monnier committed
707 708 709
	   `on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into `on_failure_jump'.
	   It assumes that it is pointing to just past a `jump'.  */
710
  on_failure_jump_smart,
711

Richard M. Stallman's avatar
Richard M. Stallman committed
712
	/* Followed by two-byte relative address and two-byte number n.
713 714 715
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
716 717
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
718 719
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
720 721
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
722
	/* Set the following two-byte relative address to the
Juanma Barranquero's avatar
Juanma Barranquero committed
723
	   subsequent two-byte number.  The address *includes* the two
Richard M. Stallman's avatar
Richard M. Stallman committed
724
	   bytes of number.  */
725 726 727 728 729 730
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
Juanma Barranquero's avatar
Juanma Barranquero committed
731
  notwordbound,	/* Succeeds if not at a word boundary.  */
732

733 734 735
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

736
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
737
	   a byte which contains a syntax code, e.g., Sword.  */
738 739 740
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
741 742 743 744 745 746
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */
747 748

  /* Matches any character whose category-set contains the specified
Juanma Barranquero's avatar
Juanma Barranquero committed
749 750
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
751 752 753 754 755 756
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789
#endif /* emacs */
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
  } while (0)

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8;		\
  } while (0)

#ifdef DEBUG
790
static void extract_number _RE_ARGS ((int *dest, re_char *source));
791 792 793
static void
extract_number (dest, source)
    int *dest;
794
    re_char *source;
795
{
796
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
797 798 799 800
  *dest = *source & 0377;
  *dest += temp << 8;
}

801
# ifndef EXTRACT_MACROS /* To debug the macros.  */
802 803 804
#  undef EXTRACT_NUMBER
#  define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
# endif /* not EXTRACT_MACROS */
805 806 807 808 809 810 811 812 813

#endif /* DEBUG */

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
  do {									\
    EXTRACT_NUMBER (destination, source);				\
Richard M. Stallman's avatar
Richard M. Stallman committed
814
    (source) += 2;							\
815 816 817
  } while (0)

#ifdef DEBUG
818 819
static void extract_number_and_incr _RE_ARGS ((int *destination,
					       re_char **source));
820 821 822
static void
extract_number_and_incr (destination, source)
    int *destination;
823
    re_char **source;
824
{
825 826 827 828
  extract_number (destination, *source);
  *source += 2;
}

829 830 831
# ifndef EXTRACT_MACROS
#  undef EXTRACT_NUMBER_AND_INCR
#  define EXTRACT_NUMBER_AND_INCR(dest, src) \
832
  extract_number_and_incr (&dest, &src)
833
# endif /* not EXTRACT_MACROS */
834 835 836

#endif /* DEBUG */

837 838
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Juanma Barranquero's avatar
Juanma Barranquero committed
839
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
840 841 842 843 844 845 846 847 848 849

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
  } while (0)

/* Put into DESTINATION a character stored in three contiguous bytes
Juanma Barranquero's avatar
Juanma Barranquero committed
850
   starting at SOURCE.  */
851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
  } while (0)


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
867
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 ((p)[1] & 0x80)
868 869 870

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
871 872 873 874 875 876 877 878
   stored.  `2 +' means to skip re_opcode_t and size of bitmap,
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
879 880 881 882 883 884 885

/* Test if C is listed in the bitmap of charset P.  */
#define CHARSET_LOOKUP_BITMAP(p, c)				\
  ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH			\
   && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))

/* Return the address of end of RANGE_TABLE.  COUNT is number of
Juanma Barranquero's avatar
Juanma Barranquero committed
886 887
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  `* 2'
   is start of range and end of range.  `* 3' is size of each start
888 889 890 891
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

Juanma Barranquero's avatar
Juanma Barranquero committed
892
/* Test if C is in RANGE_TABLE.  A flag NOT is negated if C is in.
893 894 895 896
   COUNT is number of ranges in RANGE_TABLE.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count)	\
  do									\
    {									\
897 898 899
      re_wchar_t range_start, range_end;				\
      re_char *p;							\
      re_char *range_table_end						\
900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922
	= CHARSET_RANGE_TABLE_END ((range_table), (count));		\
									\
      for (p = (range_table); p < range_table_end; p += 2 * 3)		\
	{								\
	  EXTRACT_CHARACTER (range_start, p);				\
	  EXTRACT_CHARACTER (range_end, p + 3);				\
									\
	  if (range_start <= (c) && (c) <= range_end)			\
	    {								\
	      (not) = !(not);						\
	      break;							\
	    }								\
	}								\
    }									\
  while (0)

/* Test if C is in range table of CHARSET.  The flag NOT is negated if
   C is listed in it.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset)			\
  do									\
    {									\
      /* Number of ranges in range table. */				\
      int count;							\
923 924
      re_char *range_table = CHARSET_RANGE_TABLE (charset);		\
      									\
925 926 927 928 929
      EXTRACT_NUMBER_AND_INCR (count, range_table);			\
      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count);	\
    }									\
  while (0)

930 931 932 933
/* If DEBUG is defined, Regex prints many voluminous messages about what
   it is doing (if the variable `debug' is nonzero).  If linked with the
   main program in `iregex.c', you can enter patterns and strings
   interactively.  And if linked with the main program in `main.c' and
934
   the other test files, you can run the already-written tests.  */
935 936 937 938

#ifdef DEBUG

/* We use standard I/O for debugging.  */
939
# include <stdio.h>
940 941

/* It is useful to test things that ``must'' be true when debugging.  */
942
# include <assert.h>
943

944
static int debug = -100000;
945

946 947 948 949 950 951
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
952
  if (debug > 0) print_partial_compiled_pattern (s, e)
953
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
954
  if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
955 956 957 958 959 960 961 962 963


/* Print the fastmap in human-readable form.  */

void
print_fastmap (fastmap)
    char *fastmap;
{
  unsigned was_a_range = 0;
964 965
  unsigned i = 0;

966 967 968 969 970
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
	  was_a_range = 0;
Richard M. Stallman's avatar
Richard M. Stallman committed
971 972 973 974 975 976
	  putchar (i - 1);
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
	      was_a_range = 1;
	      i++;
	    }
977
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
978 979 980 981 982
	    {
	      printf ("-");
	      putchar (i - 1);
	    }
	}