regex.c 195 KB
Newer Older
Richard M. Stallman's avatar
Richard M. Stallman committed
1
/* Extended regular expression matching and search library, version
2
   0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
Karl Berry's avatar
Karl Berry committed
3 4
   internationalization features.)

5
   Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
Glenn Morris's avatar
Glenn Morris committed
6
                 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
Miles Bader's avatar
Miles Bader committed
7
                 Free Software Foundation, Inc.
Karl Berry's avatar
Karl Berry committed
8

9 10
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
Miles Bader's avatar
Miles Bader committed
11
   the Free Software Foundation; either version 3, or (at your option)
12 13 14 15
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Juanma Barranquero's avatar
Juanma Barranquero committed
16
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 18 19 20
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
Lute Kamstra's avatar
Lute Kamstra committed
21
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
Juanma Barranquero's avatar
Juanma Barranquero committed
22
   USA.  */
23

24
/* TODO:
25
   - structure the opcode space into opcode+flag.
26
   - merge with glibc's regex.[ch].
27
   - replace (succeed_n + jump_n + set_number_at) with something that doesn't
28 29 30
     need to modify the compiled regexp so that re_match can be reentrant.
   - get rid of on_failure_jump_smart by doing the optimization in re_comp
     rather than at run-time, so that re_match can be reentrant.
31
*/
32

33
/* AIX requires this to be the first thing in the file. */
34
#if defined _AIX && !defined REGEX_MALLOC
35 36 37 38
  #pragma alloca
#endif

#ifdef HAVE_CONFIG_H
39
# include <config.h>
40 41
#endif

42 43 44 45 46 47
#if defined STDC_HEADERS && !defined emacs
# include <stddef.h>
#else
/* We need this for `regex.h', and perhaps for the Emacs include files.  */
# include <sys/types.h>
#endif
48

49 50
/* Whether to use ISO C Amendment 1 wide char functions.
   Those should not be used for Emacs since it uses its own.  */
51 52 53
#if defined _LIBC
#define WIDE_CHAR_SUPPORT 1
#else
54
#define WIDE_CHAR_SUPPORT \
55 56
	(HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
#endif
57 58 59

/* For platform which support the ISO C amendement 1 functionality we
   support user defined character classes.  */
60
#if WIDE_CHAR_SUPPORT
61 62 63 64 65
/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>.  */
# include <wchar.h>
# include <wctype.h>
#endif

66 67 68 69 70
#ifdef _LIBC
/* We have to keep the namespace clean.  */
# define regfree(preg) __regfree (preg)
# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
71 72
# define regerror(err_code, preg, errbuf, errbuf_size) \
	__regerror(err_code, preg, errbuf, errbuf_size)
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
# define re_set_registers(bu, re, nu, st, en) \
	__re_set_registers (bu, re, nu, st, en)
# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
# define re_match(bufp, string, size, pos, regs) \
	__re_match (bufp, string, size, pos, regs)
# define re_search(bufp, string, size, startpos, range, regs) \
	__re_search (bufp, string, size, startpos, range, regs)
# define re_compile_pattern(pattern, length, bufp) \
	__re_compile_pattern (pattern, length, bufp)
# define re_set_syntax(syntax) __re_set_syntax (syntax)
# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)

88 89 90 91 92
/* Make sure we call libc's function even if the user overrides them.  */
# define btowc __btowc
# define iswctype __iswctype
# define wctype __wctype

93 94 95 96 97 98 99 100 101 102
# define WEAK_ALIAS(a,b) weak_alias (a, b)

/* We are also using some library internals.  */
# include <locale/localeinfo.h>
# include <locale/elem-hash.h>
# include <langinfo.h>
#else
# define WEAK_ALIAS(a,b)
#endif

103
/* This is for other GNU distributions with internationalized messages.  */
104
#if HAVE_LIBINTL_H || defined _LIBC
105 106 107 108 109
# include <libintl.h>
#else
# define gettext(msgid) (msgid)
#endif

110 111 112
#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
   strings.  */
113
# define gettext_noop(String) String
114 115
#endif

116 117 118 119
/* The `emacs' switch turns on certain matching commands
   that make sense only in Emacs. */
#ifdef emacs

120 121
# include "lisp.h"
# include "buffer.h"
122 123

/* Make syntax table lookup grant data in gl_state.  */
124
# define SYNTAX_ENTRY_VIA_PROPERTY
125

126
# include "syntax.h"
127
# include "character.h"
128
# include "category.h"
129

130 131 132
# ifdef malloc
#  undef malloc
# endif
133
# define malloc xmalloc
134 135 136
# ifdef realloc
#  undef realloc
# endif
137
# define realloc xrealloc
138 139 140
# ifdef free
#  undef free
# endif
141
# define free xfree
142

Juanma Barranquero's avatar
Juanma Barranquero committed
143
/* Converts the pointer to the char to BEG-based offset from the start.  */
144 145 146 147
# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))

# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
148
# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
149
# define RE_STRING_CHAR(p, s, multibyte) \
Stefan Monnier's avatar
Stefan Monnier committed
150
  (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
151
# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
152 153
  (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))

154 155
# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]

156
# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
157

158 159 160
/* Set C a (possibly converted to multibyte) character before P.  P
   points into a string which is the virtual concatenation of STR1
   (which ends at END1) or STR2 (which ends at END2).  */
161 162
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)		     \
  do {									     \
163
    if (target_multibyte)						     \
164 165 166 167 168 169 170 171 172
      {									     \
	re_char *dtemp = (p) == (str2) ? (end1) : (p);			     \
	re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
	while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));		     \
	c = STRING_CHAR (dtemp, (p) - dtemp);				     \
      }									     \
    else								     \
      {									     \
	(c = ((p) == (str2) ? (end1) : (p))[-1]);			     \
173
	(c) = RE_CHAR_TO_MULTIBYTE (c);					     \
174
      }									     \
175 176
  } while (0)

177 178 179 180
/* Set C a (possibly converted to multibyte) character at P, and set
   LEN to the byte length of that character.  */
# define GET_CHAR_AFTER(c, p, len)		\
  do {						\
181
    if (target_multibyte)			\
182
      (c) = STRING_CHAR_AND_LENGTH (p, 0, len);	\
183 184
    else					\
      {						\
185
	(c) = *p;				\
186
	len = 1;				\
187
	(c) = RE_CHAR_TO_MULTIBYTE (c);		\
188
      }						\
Kenichi Handa's avatar
Kenichi Handa committed
189
   } while (0)
Stefan Monnier's avatar
Stefan Monnier committed
190

191 192 193 194 195
#else  /* not emacs */

/* If we are not linking with Emacs proper,
   we can't use the relocating allocator
   even if config.h says that we can.  */
196
# undef REL_ALLOC
197

198 199 200
# if defined STDC_HEADERS || defined _LIBC
#  include <stdlib.h>
# else
201 202
char *malloc ();
char *realloc ();
203
# endif
204

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
/* When used in Emacs's lib-src, we need xmalloc and xrealloc. */

void *
xmalloc (size)
     size_t size;
{
  register void *val;
  val = (void *) malloc (size);
  if (!val && size)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}

void *
xrealloc (block, size)
     void *block;
     size_t size;
{
  register void *val;
  /* We must call malloc explicitly when BLOCK is 0, since some
     reallocs don't do this.  */
  if (! block)
    val = (void *) malloc (size);
  else
    val = (void *) realloc (block, size);
  if (!val && size)
    {
      write (2, "virtual memory exhausted\n", 25);
      exit (1);
    }
  return val;
}

241 242 243 244 245 246 247 248 249
# ifdef malloc
#  undef malloc
# endif
# define malloc xmalloc
# ifdef realloc
#  undef realloc
# endif
# define realloc xrealloc

250
/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
251
   If nothing else has been done, use the method below.  */
252 253 254 255 256 257 258
# ifdef INHIBIT_STRING_HEADER
#  if !(defined HAVE_BZERO && defined HAVE_BCOPY)
#   if !defined bzero && !defined bcopy
#    undef INHIBIT_STRING_HEADER
#   endif
#  endif
# endif
259

260
/* This is the normal way of making sure we have memcpy, memcmp and bzero.
261 262
   This is used in most programs--a few other programs avoid this
   by defining INHIBIT_STRING_HEADER.  */
263 264 265 266
# ifndef INHIBIT_STRING_HEADER
#  if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
#   include <string.h>
#   ifndef bzero
267 268 269 270 271
#    ifndef _LIBC
#     define bzero(s, n)	(memset (s, '\0', n), (s))
#    else
#     define bzero(s, n)	__bzero (s, n)
#    endif
272 273 274
#   endif
#  else
#   include <strings.h>
275 276 277 278 279 280
#   ifndef memcmp
#    define memcmp(s1, s2, n)	bcmp (s1, s2, n)
#   endif
#   ifndef memcpy
#    define memcpy(d, s, n)	(bcopy (s, d, n), (d))
#   endif
281 282
#  endif
# endif
283 284 285

/* Define the syntax stuff for \<, \>, etc.  */

286
/* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
287
enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
288

289
#  define SWITCH_ENUM_CAST(x) (x)
290

291
/* Dummy macros for non-Emacs environments.  */
292 293 294 295 296
# define BASE_LEADING_CODE_P(c) (0)
# define CHAR_CHARSET(c) 0
# define CHARSET_LEADING_CODE_BASE(c) 0
# define MAX_MULTIBYTE_LENGTH 1
# define RE_MULTIBYTE_P(x) 0
297
# define RE_TARGET_MULTIBYTE_P(x) 0
298 299 300 301 302
# define WORD_BOUNDARY_P(c1, c2) (0)
# define CHAR_HEAD_P(p) (1)
# define SINGLE_BYTE_CHAR_P(c) (1)
# define SAME_CHARSET_P(c1, c2) (1)
# define MULTIBYTE_FORM_LENGTH(p, s) (1)
303
# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
304
# define STRING_CHAR(p, s) (*(p))
305
# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
306 307
# define CHAR_STRING(c, s) (*(s) = (c), 1)
# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
308
# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) STRING_CHAR_AND_LENGTH ((p), (s), (len))
309 310
# define RE_CHAR_TO_MULTIBYTE(c) (c)
# define RE_CHAR_TO_UNIBYTE(c) (c)
311
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
312
  (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
313 314
# define GET_CHAR_AFTER(c, p, len)	\
  (c = *p, len = 1)
315
# define MAKE_CHAR(charset, c1, c2) (c1)
316 317
# define BYTE8_TO_CHAR(c) (c)
# define CHAR_BYTE8_P(c) (0)
318
# define CHAR_LEADING_CODE(c) (c)
Kenichi Handa's avatar
Kenichi Handa committed
319

320
#endif /* not emacs */
Stefan Monnier's avatar
Stefan Monnier committed
321 322

#ifndef RE_TRANSLATE
323 324
# define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
# define RE_TRANSLATE_P(TBL) (TBL)
Stefan Monnier's avatar
Stefan Monnier committed
325
#endif
326 327 328 329

/* Get the interface, including the syntax bits.  */
#include "regex.h"

330 331
/* isalpha etc. are used for the character classes.  */
#include <ctype.h>
332

333
#ifdef emacs
334

335
/* 1 if C is an ASCII character.  */
336
# define IS_REAL_ASCII(c) ((c) < 0200)
337

338
/* 1 if C is a unibyte character.  */
339
# define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
340

341
/* The Emacs definitions should not be directly affected by locales.  */
342

343
/* In Emacs, these are only used for single-byte characters.  */
344 345 346
# define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
# define ISCNTRL(c) ((c) < ' ')
# define ISXDIGIT(c) (((c) >= '0' && (c) <= '9')		\
347 348
		     || ((c) >= 'a' && (c) <= 'f')	\
		     || ((c) >= 'A' && (c) <= 'F'))
349 350

/* This is only used for single-byte characters.  */
351
# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
352 353 354

/* The rest must handle multibyte characters.  */

355
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c)				\
356
		    ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237)	\
357 358
		    : 1)

359
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c)				\
360
		    ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237)	\
361 362
		    : 1)

363
# define ISALNUM(c) (IS_REAL_ASCII (c)			\
364 365 366
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z')	\
		       || ((c) >= '0' && (c) <= '9'))	\
367 368
		    : SYNTAX (c) == Sword)

369
# define ISALPHA(c) (IS_REAL_ASCII (c)			\
370 371
		    ? (((c) >= 'a' && (c) <= 'z')	\
		       || ((c) >= 'A' && (c) <= 'Z'))	\
372 373
		    : SYNTAX (c) == Sword)

374
# define ISLOWER(c) (LOWERCASEP (c))
375

376
# define ISPUNCT(c) (IS_REAL_ASCII (c)				\
377 378
		    ? ((c) > ' ' && (c) < 0177			\
		       && !(((c) >= 'a' && (c) <= 'z')		\
379 380
		            || ((c) >= 'A' && (c) <= 'Z')	\
		            || ((c) >= '0' && (c) <= '9')))	\
381 382
		    : SYNTAX (c) != Sword)

383
# define ISSPACE(c) (SYNTAX (c) == Swhitespace)
384

385
# define ISUPPER(c) (UPPERCASEP (c))
386

387
# define ISWORD(c) (SYNTAX (c) == Sword)
388 389 390

#else /* not emacs */

391 392 393 394 395
/* Jim Meyering writes:

   "... Some ctype macros are valid only for character codes that
   isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
   using /bin/cc or gcc but without giving an ansi option).  So, all
396
   ctype uses should be through macros like ISPRINT...  If
397 398 399
   STDC_HEADERS is defined, then autoconf has verified that the ctype
   macros don't need to be guarded with references to isascii. ...
   Defining isascii to 1 should let any compiler worth its salt
400 401
   eliminate the && through constant folding."
   Solaris defines some of these symbols so we must undefine them first.  */
402

403
# undef ISASCII
404 405 406 407 408
# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
#  define ISASCII(c) 1
# else
#  define ISASCII(c) isascii(c)
# endif
409 410

/* 1 if C is an ASCII character.  */
411
# define IS_REAL_ASCII(c) ((c) < 0200)
412 413

/* This distinction is not meaningful, except in Emacs.  */
414 415 416 417 418 419 420 421 422 423 424 425 426
# define ISUNIBYTE(c) 1

# ifdef isblank
#  define ISBLANK(c) (ISASCII (c) && isblank (c))
# else
#  define ISBLANK(c) ((c) == ' ' || (c) == '\t')
# endif
# ifdef isgraph
#  define ISGRAPH(c) (ISASCII (c) && isgraph (c))
# else
#  define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
# endif

427
# undef ISPRINT
428 429 430 431 432 433 434 435 436 437 438 439 440
# define ISPRINT(c) (ISASCII (c) && isprint (c))
# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
# define ISALNUM(c) (ISASCII (c) && isalnum (c))
# define ISALPHA(c) (ISASCII (c) && isalpha (c))
# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
# define ISLOWER(c) (ISASCII (c) && islower (c))
# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
# define ISSPACE(c) (ISASCII (c) && isspace (c))
# define ISUPPER(c) (ISASCII (c) && isupper (c))
# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))

# define ISWORD(c) ISALPHA(c)

441 442 443 444 445 446 447 448 449
# ifdef _tolower
#  define TOLOWER(c) _tolower(c)
# else
#  define TOLOWER(c) tolower(c)
# endif

/* How many characters in the character set.  */
# define CHAR_SET_SIZE 256

450
# ifdef SYNTAX_TABLE
451

452
extern char *re_syntax_table;
453

454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
# else /* not SYNTAX_TABLE */

static char re_syntax_table[CHAR_SET_SIZE];

static void
init_syntax_once ()
{
   register int c;
   static int done = 0;

   if (done)
     return;

   bzero (re_syntax_table, sizeof re_syntax_table);

469 470 471
   for (c = 0; c < CHAR_SET_SIZE; ++c)
     if (ISALNUM (c))
	re_syntax_table[c] = Sword;
472

473
   re_syntax_table['_'] = Ssymbol;
474

475 476 477 478
   done = 1;
}

# endif /* not SYNTAX_TABLE */
479

480 481
# define SYNTAX(c) re_syntax_table[(c)]

482 483
#endif /* not emacs */

484
#ifndef NULL
485
# define NULL (void *)0
486 487 488 489 490
#endif

/* We remove any previous definition of `SIGN_EXTEND_CHAR',
   since ours (we hope) works properly with all combinations of
   machines, compilers, `char' and `unsigned char' argument types.
491
   (Per Bothner suggested the basic approach.)  */
492 493
#undef SIGN_EXTEND_CHAR
#if __STDC__
494
# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
495 496
#else  /* not __STDC__ */
/* As in Harbison and Steele.  */
497
# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
498 499 500 501 502 503
#endif

/* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
   use `alloca' instead of `malloc'.  This is because using malloc in
   re_search* or re_match* could cause memory leaks when C-g is used in
   Emacs; also, malloc is slower and causes storage fragmentation.  On
504 505
   the other hand, malloc is more portable, and easier to debug.

506 507 508 509 510 511
   Because we sometimes use alloca, some routines have to be macros,
   not functions -- `alloca'-allocated space disappears at the end of the
   function it is called in.  */

#ifdef REGEX_MALLOC

512 513 514
# define REGEX_ALLOCATE malloc
# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
# define REGEX_FREE free
515 516 517 518

#else /* not REGEX_MALLOC  */

/* Emacs already defines alloca, sometimes.  */
519
# ifndef alloca
520 521

/* Make alloca work the best possible way.  */
522 523 524
#  ifdef __GNUC__
#   define alloca __builtin_alloca
#  else /* not __GNUC__ */
525
#   ifdef HAVE_ALLOCA_H
526 527 528
#    include <alloca.h>
#   endif /* HAVE_ALLOCA_H */
#  endif /* not __GNUC__ */
529

530
# endif /* not alloca */
531

532
# define REGEX_ALLOCATE alloca
533 534

/* Assumes a `char *destination' variable.  */
535
# define REGEX_REALLOCATE(source, osize, nsize)				\
536
  (destination = (char *) alloca (nsize),				\
537
   memcpy (destination, source, osize))
538 539

/* No need to do anything to free, after alloca.  */
540
# define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
541 542 543 544 545

#endif /* not REGEX_MALLOC */

/* Define how to allocate the failure stack.  */

546
#if defined REL_ALLOC && defined REGEX_MALLOC
547

548
# define REGEX_ALLOCATE_STACK(size)				\
549
  r_alloc (&failure_stack_ptr, (size))
550
# define REGEX_REALLOCATE_STACK(source, osize, nsize)		\
551
  r_re_alloc (&failure_stack_ptr, (nsize))
552
# define REGEX_FREE_STACK(ptr)					\
553 554
  r_alloc_free (&failure_stack_ptr)

555
#else /* not using relocating allocator */
556

557
# ifdef REGEX_MALLOC
558

559 560 561
#  define REGEX_ALLOCATE_STACK malloc
#  define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
#  define REGEX_FREE_STACK free
562

563
# else /* not REGEX_MALLOC */
564

565
#  define REGEX_ALLOCATE_STACK alloca
566

567
#  define REGEX_REALLOCATE_STACK(source, osize, nsize)			\
568
   REGEX_REALLOCATE (source, osize, nsize)
Juanma Barranquero's avatar
Juanma Barranquero committed
569
/* No need to explicitly free anything.  */
570
#  define REGEX_FREE_STACK(arg) ((void)0)
571

572
# endif /* not REGEX_MALLOC */
573
#endif /* not using relocating allocator */
574 575 576 577 578


/* True if `size1' is non-NULL and PTR is pointing anywhere inside
   `string1' or just past its end.  This works if PTR is NULL, which is
   a good thing.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
579
#define FIRST_STRING_P(ptr)					\
580 581 582 583 584 585 586 587 588
  (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)

/* (Re)Allocate N items of type T using malloc, or fail.  */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
#define RETALLOC_IF(addr, n, t) \
  if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))

589
#define BYTEWIDTH 8 /* In bits.  */
590 591 592 593 594 595 596 597

#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))

#undef MAX
#undef MIN
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

598 599 600
/* Type of source-pattern and string chars.  */
typedef const unsigned char re_char;

601 602 603 604
typedef char boolean;
#define false 0
#define true 1

605 606 607 608 609 610
static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
					re_char *string1, int size1,
					re_char *string2, int size2,
					int pos,
					struct re_registers *regs,
					int stop));
611 612

/* These are the command codes that appear in compiled regular
613
   expressions.  Some opcodes are followed by argument bytes.  A
614 615 616 617 618 619 620
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

621
  /* Succeed right away--no more backtracking.  */
622 623
  succeed,

Richard M. Stallman's avatar
Richard M. Stallman committed
624
	/* Followed by one byte giving n, then by n literal bytes.  */
625 626
  exactn,

Richard M. Stallman's avatar
Richard M. Stallman committed
627
	/* Matches any (more or less) character.  */
628 629
  anychar,

Richard M. Stallman's avatar
Richard M. Stallman committed
630 631 632 633 634
	/* Matches any one char belonging to specified set.  First
	   following byte is number of bitmap bytes.  Then come bytes
	   for a bitmap saying which chars are in.  Bits in each byte
	   are ordered low-bit-first.  A character is in the set if its
	   bit is 1.  A character too large to have a bit in the map is
635 636 637 638 639
	   automatically not in the set.

	   If the length byte has the 0x80 bit set, then that stuff
	   is followed by a range table:
	       2 bytes of flags for character sets (low 8 bits, high 8 bits)
640
		   See RANGE_TABLE_WORK_BITS below.
641
	       2 bytes, the number of pairs that follow (upto 32767)
642
	       pairs, each 2 multibyte characters,
643
		   each multibyte character represented as 3 bytes.  */
644 645
  charset,

Richard M. Stallman's avatar
Richard M. Stallman committed
646
	/* Same parameters as charset, but match any character that is
647
	   not one of those specified.  */
648 649
  charset_not,

Richard M. Stallman's avatar
Richard M. Stallman committed
650 651 652
	/* Start remembering the text that is matched, for storing in a
	   register.  Followed by one byte with the register number, in
	   the range 0 to one less than the pattern buffer's re_nsub
653
	   field.  */
654 655
  start_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
656 657 658
	/* Stop remembering the text that is matched and store it in a
	   memory register.  Followed by one byte with the register
	   number, in the range 0 to one less than `re_nsub' in the
659
	   pattern buffer.  */
660 661
  stop_memory,

Richard M. Stallman's avatar
Richard M. Stallman committed
662
	/* Match a duplicate of something remembered. Followed by one
663
	   byte containing the register number.  */
664 665
  duplicate,

Richard M. Stallman's avatar
Richard M. Stallman committed
666
	/* Fail unless at beginning of line.  */
667 668
  begline,

669
	/* Fail unless at end of line.  */
670 671
  endline,

Richard M. Stallman's avatar
Richard M. Stallman committed
672 673
	/* Succeeds if at beginning of buffer (if emacs) or at beginning
	   of string to be matched (if not).  */
674 675
  begbuf,

Richard M. Stallman's avatar
Richard M. Stallman committed
676
	/* Analogously, for end of buffer/string.  */
677
  endbuf,
678

Richard M. Stallman's avatar
Richard M. Stallman committed
679
	/* Followed by two byte relative address to which to jump.  */
680
  jump,
681

Richard M. Stallman's avatar
Richard M. Stallman committed
682
	/* Followed by two-byte relative address of place to resume at
Juanma Barranquero's avatar
Juanma Barranquero committed
683
	   in case of failure.  */
684
  on_failure_jump,
685

Richard M. Stallman's avatar
Richard M. Stallman committed
686 687
	/* Like on_failure_jump, but pushes a placeholder instead of the
	   current string position when executed.  */
688
  on_failure_keep_string_jump,
689

690 691 692 693 694
	/* Just like `on_failure_jump', except that it checks that we
	   don't get stuck in an infinite loop (matching an empty string
	   indefinitely).  */
  on_failure_jump_loop,

695 696 697 698 699 700
	/* Just like `on_failure_jump_loop', except that it checks for
	   a different kind of loop (the kind that shows up with non-greedy
	   operators).  This operation has to be immediately preceded
	   by a `no_op'.  */
  on_failure_jump_nastyloop,

701
	/* A smart `on_failure_jump' used for greedy * and + operators.
702 703
	   It analyses the loop before which it is put and if the
	   loop does not require backtracking, it changes itself to
Stefan Monnier's avatar
Stefan Monnier committed
704 705 706
	   `on_failure_keep_string_jump' and short-circuits the loop,
	   else it just defaults to changing itself into `on_failure_jump'.
	   It assumes that it is pointing to just past a `jump'.  */
707
  on_failure_jump_smart,
708

Richard M. Stallman's avatar
Richard M. Stallman committed
709
	/* Followed by two-byte relative address and two-byte number n.
710 711 712
	   After matching N times, jump to the address upon failure.
	   Does not work if N starts at 0: use on_failure_jump_loop
	   instead.  */
713 714
  succeed_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
715 716
	/* Followed by two-byte relative address, and two-byte number n.
	   Jump to the address N times, then fail.  */
717 718
  jump_n,

Richard M. Stallman's avatar
Richard M. Stallman committed
719
	/* Set the following two-byte relative address to the
Juanma Barranquero's avatar
Juanma Barranquero committed
720
	   subsequent two-byte number.  The address *includes* the two
Richard M. Stallman's avatar
Richard M. Stallman committed
721
	   bytes of number.  */
722 723 724 725 726 727
  set_number_at,

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
Juanma Barranquero's avatar
Juanma Barranquero committed
728
  notwordbound,	/* Succeeds if not at a word boundary.  */
729

730 731 732
  symbeg,       /* Succeeds if at symbol beginning.  */
  symend,       /* Succeeds if at symbol end.  */

733
	/* Matches any character whose syntax is specified.  Followed by
Richard M. Stallman's avatar
Richard M. Stallman committed
734
	   a byte which contains a syntax code, e.g., Sword.  */
735 736 737
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
738 739 740 741 742 743
  notsyntaxspec

#ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */
744 745

  /* Matches any character whose category-set contains the specified
Juanma Barranquero's avatar
Juanma Barranquero committed
746 747
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
748 749 750 751 752 753
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
#endif /* emacs */
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION.  */

#define STORE_NUMBER(destination, number)				\
  do {									\
    (destination)[0] = (number) & 0377;					\
    (destination)[1] = (number) >> 8;					\
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number)			\
  do {									\
    STORE_NUMBER (destination, number);					\
    (destination) += 2;							\
  } while (0)

/* Put into DESTINATION a number stored in two contiguous bytes starting
   at SOURCE.  */

#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8;		\
  } while (0)

#ifdef DEBUG
787
static void extract_number _RE_ARGS ((int *dest, re_char *source));
788 789 790
static void
extract_number (dest, source)
    int *dest;
791
    re_char *source;
792
{
793
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
794 795 796 797
  *dest = *source & 0377;
  *dest += temp << 8;
}

798
# ifndef EXTRACT_MACROS /* To debug the macros.  */
799 800 801
#  undef EXTRACT_NUMBER
#  define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
# endif /* not EXTRACT_MACROS */
802 803 804 805 806 807 808 809 810

#endif /* DEBUG */

/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
   SOURCE must be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source)			\
  do {									\
    EXTRACT_NUMBER (destination, source);				\
Richard M. Stallman's avatar
Richard M. Stallman committed
811
    (source) += 2;							\
812 813 814
  } while (0)

#ifdef DEBUG
815 816
static void extract_number_and_incr _RE_ARGS ((int *destination,
					       re_char **source));
817 818 819
static void
extract_number_and_incr (destination, source)
    int *destination;
820
    re_char **source;
821
{
822 823 824 825
  extract_number (destination, *source);
  *source += 2;
}

826 827 828
# ifndef EXTRACT_MACROS
#  undef EXTRACT_NUMBER_AND_INCR
#  define EXTRACT_NUMBER_AND_INCR(dest, src) \
829
  extract_number_and_incr (&dest, &src)
830
# endif /* not EXTRACT_MACROS */
831 832 833

#endif /* DEBUG */

834 835
/* Store a multibyte character in three contiguous bytes starting
   DESTINATION, and increment DESTINATION to the byte after where the
Juanma Barranquero's avatar
Juanma Barranquero committed
836
   character is stored.  Therefore, DESTINATION must be an lvalue.  */
837 838 839 840 841 842 843 844 845 846

#define STORE_CHARACTER_AND_INCR(destination, character)	\
  do {								\
    (destination)[0] = (character) & 0377;			\
    (destination)[1] = ((character) >> 8) & 0377;		\
    (destination)[2] = (character) >> 16;			\
    (destination) += 3;						\
  } while (0)

/* Put into DESTINATION a character stored in three contiguous bytes
Juanma Barranquero's avatar
Juanma Barranquero committed
847
   starting at SOURCE.  */
848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863

#define EXTRACT_CHARACTER(destination, source)	\
  do {						\
    (destination) = ((source)[0]		\
		     | ((source)[1] << 8)	\
		     | ((source)[2] << 16));	\
  } while (0)


/* Macros for charset. */

/* Size of bitmap of charset P in bytes.  P is a start of charset,
   i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not.  */
#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)

/* Nonzero if charset P has range table.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
864
#define CHARSET_RANGE_TABLE_EXISTS_P(p)	 ((p)[1] & 0x80)
865 866 867

/* Return the address of range table of charset P.  But not the start
   of table itself, but the before where the number of ranges is
868 869 870 871 872 873 874 875
   stored.  `2 +' means to skip re_opcode_t and size of bitmap,
   and the 2 bytes of flags at the start of the range table.  */
#define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])

/* Extract the bit flags that start a range table.  */
#define CHARSET_RANGE_TABLE_BITS(p)		\
  ((p)[2 + CHARSET_BITMAP_SIZE (p)]		\
   + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
876 877 878 879 880 881 882

/* Test if C is listed in the bitmap of charset P.  */
#define CHARSET_LOOKUP_BITMAP(p, c)				\
  ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH			\
   && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))

/* Return the address of end of RANGE_TABLE.  COUNT is number of
Juanma Barranquero's avatar
Juanma Barranquero committed
883 884
   ranges (which is a pair of (start, end)) in the RANGE_TABLE.  `* 2'
   is start of range and end of range.  `* 3' is size of each start
885 886 887 888
   and end.  */
#define CHARSET_RANGE_TABLE_END(range_table, count)	\
  ((range_table) + (count) * 2 * 3)

Juanma Barranquero's avatar
Juanma Barranquero committed
889
/* Test if C is in RANGE_TABLE.  A flag NOT is negated if C is in.
890 891 892 893
   COUNT is number of ranges in RANGE_TABLE.  */
#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count)	\
  do									\
    {									\
894 895 896
      re_wchar_t range_start, range_end;				\
      re_char *p;							\
      re_char *range_table_end						\
897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919
	= CHARSET_RANGE_TABLE_END ((range_table), (count));		\
									\
      for (p = (range_table); p < range_table_end; p += 2 * 3)		\
	{								\
	  EXTRACT_CHARACTER (range_start, p);				\
	  EXTRACT_CHARACTER (range_end, p + 3);				\
									\
	  if (range_start <= (c) && (c) <= range_end)			\
	    {								\
	      (not) = !(not);						\
	      break;							\
	    }								\
	}								\
    }									\
  while (0)

/* Test if C is in range table of CHARSET.  The flag NOT is negated if
   C is listed in it.  */
#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset)			\
  do									\
    {									\
      /* Number of ranges in range table. */				\
      int count;							\
920 921
      re_char *range_table = CHARSET_RANGE_TABLE (charset);		\
      									\
922 923 924 925 926
      EXTRACT_NUMBER_AND_INCR (count, range_table);			\
      CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count);	\
    }									\
  while (0)

927 928 929 930
/* If DEBUG is defined, Regex prints many voluminous messages about what
   it is doing (if the variable `debug' is nonzero).  If linked with the
   main program in `iregex.c', you can enter patterns and strings
   interactively.  And if linked with the main program in `main.c' and
931
   the other test files, you can run the already-written tests.  */
932 933 934 935

#ifdef DEBUG

/* We use standard I/O for debugging.  */
936
# include <stdio.h>
937 938

/* It is useful to test things that ``must'' be true when debugging.  */
939
# include <assert.h>
940

941
static int debug = -100000;
942

943 944 945 946 947 948
# define DEBUG_STATEMENT(e) e
# define DEBUG_PRINT1(x) if (debug > 0) printf (x)
# define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
# define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)				\
949
  if (debug > 0) print_partial_compiled_pattern (s, e)
950
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\
951
  if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
952 953 954 955 956 957 958 959 960


/* Print the fastmap in human-readable form.  */

void
print_fastmap (fastmap)
    char *fastmap;
{
  unsigned was_a_range = 0;
961 962
  unsigned i = 0;

963 964 965 966 967
  while (i < (1 << BYTEWIDTH))
    {
      if (fastmap[i++])
	{
	  was_a_range = 0;
Richard M. Stallman's avatar
Richard M. Stallman committed
968 969 970 971 972 973
	  putchar (i - 1);
	  while (i < (1 << BYTEWIDTH)  &&  fastmap[i])
	    {
	      was_a_range = 1;
	      i++;
	    }
974
	  if (was_a_range)
Richard M. Stallman's avatar
Richard M. Stallman committed
975 976 977 978 979
	    {
	      printf ("-");
	      putchar (i - 1);
	    }
	}
980
    }
981
  putchar ('\n');
982 983 984 985 986 987 988 989
}


/* Print a compiled pattern string in human-readable form, starting at
   the START pointer into it and ending just before the pointer END.  */

void
print_partial_compiled_pattern (start, end)
990 991
    re_char *start;
    re_char *end;
992 993
{
  int mcnt, mcnt2;
994 995
  re_char *p = start;
  re_char *pend = end;
996 997 998

  if (start == NULL)
    {
999
      fprintf (stderr, "(null)\n");
1000 1001
      return;
    }
1002

1003 1004 1005
  /* Loop over pattern commands.  */
  while (p < pend)
    {
1006
      fprintf (stderr, "%d:\t", p - start);
1007 1008 1009

      switch ((re_opcode_t) *p++)
	{
Richard M. Stallman's avatar
Richard M. Stallman committed
1010
	case no_op:
1011
	  fprintf (stderr, "/no_op");
Richard M. Stallman's avatar
Richard M. Stallman committed
1012
	  break;
1013

1014
	case succeed:
1015
	  fprintf (stderr, "/succeed");
1016 1017
	  break;

1018 1019
	case exactn:
	  mcnt = *p++;
1020
	  fprintf (stderr, "/exactn/%d", mcnt);
Richard M. Stallman's avatar
Richard M. Stallman committed
1021
	  do
1022
	    {
1023
	      fprintf (stderr, "/%c", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
1024 1025 1026
	    }
	  while (--mcnt);
	  break;
1027 1028

	case start_memory:
1029
	  fprintf (stderr, "/start_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
1030
	  break;
1031 1032

	case stop_memory:
1033
	  fprintf (stderr, "/stop_memory/%d", *p++);
Richard M. Stallman's avatar
Richard M. Stallman committed
1034
	  break;
1035 1036

	case duplicate:
1037
	  fprintf (stderr, "/duplicate/%d", *p++);