/* Definitions for data structures and routines for the regular
   expression library, version 0.12.

   Copyright (C) 1985, 1989-1993, 1995, 2000-2018 Free Software
   Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

#ifndef EMACS_REGEX_H
#define EMACS_REGEX_H 1

#include <stddef.h>

/* Register match data filled in by the search and match routines.
   See regex.texinfo for a full description of what registers match.
   This definition must stay ahead of the lisp.h include, because
   lisp.h (via thread.h) uses struct re_registers.  */
struct re_registers
{
  /* Number of registers; presumably the element count of the START
     and END arrays -- confirm against re_set_registers.  */
  unsigned int num_regs;

  /* Per-register start positions.  */
  ptrdiff_t *start;

  /* Per-register end positions.  */
  ptrdiff_t *end;
};

#include "lisp.h"

38
/* In Emacs, this is the string or buffer in which we are matching.
39
   It is used for looking up syntax properties.
40 41 42

   If the value is a Lisp string object, we are matching text in that
   string; if it's nil, we are matching text in the current buffer; if
Eli Zaretskii's avatar
Eli Zaretskii committed
43 44
   it's t, we are matching text in a C string.

45 46 47 48 49 50
   This value is effectively another parameter to re_search_2 and
   re_match_2.  No calls into Lisp or thread switches are allowed
   before setting re_match_object and calling into the regex search
   and match functions.  These functions capture the current value of
   re_match_object into gl_state on entry.

51
   TODO: turn into an actual function parameter.  */
52
extern Lisp_Object re_match_object;
53

/* Roughly the maximum number of failure points on the stack.  */
extern size_t emacs_re_max_failures;

/* Amount of memory that we can safely stack allocate.  */
extern ptrdiff_t emacs_re_safe_alloca;

/* This data structure represents a compiled pattern.  Before calling
   the pattern compiler, the fields `buffer', `allocated', `fastmap',
62
   and `translate' can be set.  After the pattern has been
Karl Berry's avatar
Karl Berry committed
63 64 65 66 67 68 69 70 71 72 73
   compiled, the `re_nsub' field is available.  All other fields are
   private to the regex routines.  */

struct re_pattern_buffer
{
	/* Space that holds the compiled pattern.  It is declared as
          `unsigned char *' because its elements are
           sometimes used as array indexes.  */
  unsigned char *buffer;

	/* Number of bytes to which `buffer' points.  */
74
  size_t allocated;
Karl Berry's avatar
Karl Berry committed
75 76

	/* Number of bytes actually used in `buffer'.  */
77
  size_t used;
Karl Berry's avatar
Karl Berry committed
78

79 80
        /* Charset of unibyte characters at compiling time. */
  int charset_unibyte;
81

Karl Berry's avatar
Karl Berry committed
82 83 84 85 86 87 88 89 90
        /* Pointer to a fastmap, if any, otherwise zero.  re_search uses
           the fastmap, if there is one, to skip over impossible
           starting points for matches.  */
  char *fastmap;

        /* Either a translate table to apply to all characters before
           comparing them, or zero for no translation.  The translation
           is applied to a pattern when it is compiled and to a string
           when it is matched.  */
91
  Lisp_Object translate;
Karl Berry's avatar
Karl Berry committed
92 93 94 95 96 97 98

	/* Number of subexpressions found by the compiler.  */
  size_t re_nsub;

        /* Zero if this pattern cannot match the empty string, one else.
           Well, in truth it's used only in `re_search_2', to see
           whether or not we should use the fastmap, so we don't set
Stefan Monnier's avatar
Stefan Monnier committed
99
           this absolutely perfectly; see `re_compile_fastmap'.  */
Karl Berry's avatar
Karl Berry committed
100 101 102 103 104 105 106 107
  unsigned can_be_null : 1;

        /* If REGS_UNALLOCATED, allocate space in the `regs' structure
             for `max (RE_NREGS, re_nsub + 1)' groups.
           If REGS_REALLOCATE, reallocate space if necessary.
           If REGS_FIXED, use what's there.  */
  unsigned regs_allocated : 2;

Karl Berry's avatar
Karl Berry committed
108 109
        /* Set to zero when `regex_compile' compiles a pattern; set to one
           by `re_compile_fastmap' if it updates the fastmap.  */
Karl Berry's avatar
Karl Berry committed
110 111
  unsigned fastmap_accurate : 1;

Stefan Monnier's avatar
Stefan Monnier committed
112 113 114 115
  /* If true, the compilation of the pattern had to look up the syntax table,
     so the compiled pattern is only valid for the current syntax table.  */
  unsigned used_syntax : 1;

116
  /* If true, multi-byte form in the regexp pattern should be
117
     recognized as a multibyte character.  */
118
  unsigned multibyte : 1;
119 120 121 122

  /* If true, multi-byte form in the target of match should be
     recognized as a multibyte character.  */
  unsigned target_multibyte : 1;
Karl Berry's avatar
Karl Berry committed
123 124 125 126 127 128 129
};

/* Declarations for routines.  */

/* Compile the regular expression PATTERN, with length LENGTH
   and syntax given by the global `re_syntax_options', into the buffer
   BUFFER.  Return NULL if successful, and an error string if not.

   NOTE(review): POSIX_BACKTRACKING and WHITESPACE_REGEXP are not
   described above; confirm their meaning (and whether the
   `re_syntax_options' reference is still accurate) against the
   definition.  */
extern const char *re_compile_pattern (const char *pattern, size_t length,
                                       bool posix_backtracking,
                                       const char *whitespace_regexp,
                                       struct re_pattern_buffer *buffer);


/* Search in the string STRING (with length LENGTH) for the pattern
   compiled into BUFFER.  Start searching at position START, for RANGE
   characters.  Return the starting position of the match, -1 for no
   match, or -2 for an internal error.  Also return register
   information in REGS (if REGS is nonzero).  */
extern ptrdiff_t re_search (struct re_pattern_buffer *buffer,
                            const char *string, size_t length,
                            ptrdiff_t start, ptrdiff_t range,
                            struct re_registers *regs);


/* Like `re_search', but search in the concatenation of STRING1 and
   STRING2.  Also, stop searching at index START + STOP.  */
extern ptrdiff_t re_search_2 (struct re_pattern_buffer *buffer,
                              const char *string1, size_t length1,
                              const char *string2, size_t length2,
                              ptrdiff_t start, ptrdiff_t range,
                              struct re_registers *regs,
                              ptrdiff_t stop);


/* Like 're_search_2', but return how many characters in STRING the regexp
   in BUFFER matched, starting at position START.  */
extern ptrdiff_t re_match_2 (struct re_pattern_buffer *buffer,
                             const char *string1, size_t length1,
                             const char *string2, size_t length2,
                             ptrdiff_t start, struct re_registers *regs,
                             ptrdiff_t stop);


/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
   ENDS.  Subsequent matches using BUFFER and REGS will use this memory
   for recording register information.  STARTS and ENDS must be
   allocated with malloc, and must each be at least `NUM_REGS * sizeof
   (ptrdiff_t)' bytes long.

   If NUM_REGS == 0, then subsequent matches should allocate their own
   register data.

   Unless this function is called, the first search or match using
   BUFFER will allocate its own register data, without
   freeing the old data.  */
extern void re_set_registers (struct re_pattern_buffer *buffer,
                              struct re_registers *regs,
                              unsigned num_regs,
                              ptrdiff_t *starts, ptrdiff_t *ends);

/* Character classes.  RECC_ERROR is zero; the remaining enumerators
   take consecutive values in the order listed.  */
typedef enum
{
  RECC_ERROR = 0,
  RECC_ALNUM,
  RECC_ALPHA,
  RECC_WORD,
  RECC_GRAPH,
  RECC_PRINT,
  RECC_LOWER,
  RECC_UPPER,
  RECC_PUNCT,
  RECC_CNTRL,
  RECC_DIGIT,
  RECC_XDIGIT,
  RECC_BLANK,
  RECC_SPACE,
  RECC_MULTIBYTE,
  RECC_NONASCII,
  RECC_ASCII,
  RECC_UNIBYTE
} re_wctype_t;

/* NOTE(review): presumably reports whether character CH belongs to
   the character class CC -- confirm against the definition.  */
extern bool re_iswctype (int ch, re_wctype_t cc);

/* NOTE(review): presumably parses a character-class name starting at
   *STRP, examining at most LIMIT bytes, and yields the matching class
   (or RECC_ERROR) -- confirm against the definition.  */
extern re_wctype_t re_wctype_parse (const unsigned char **strp,
                                    unsigned limit);

#endif /* regex-emacs.h */