;;; rx.el --- sexp notation for regular expressions

Paul Eggert's avatar
Paul Eggert committed
3
;; Copyright (C) 2001-2015 Free Software Foundation, Inc.
Gerd Moellmann's avatar
Gerd Moellmann committed
4 5

;; Author: Gerd Moellmann <gerd@gnu.org>
6
;; Maintainer: emacs-devel@gnu.org
Gerd Moellmann's avatar
Gerd Moellmann committed
7 8 9 10
;; Keywords: strings, regexps, extensions

;; This file is part of GNU Emacs.

11
;; GNU Emacs is free software: you can redistribute it and/or modify
Gerd Moellmann's avatar
Gerd Moellmann committed
12
;; it under the terms of the GNU General Public License as published by
13 14
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
Gerd Moellmann's avatar
Gerd Moellmann committed
15 16 17 18 19 20 21

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
22
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
Gerd Moellmann's avatar
Gerd Moellmann committed
23 24 25 26 27 28 29 30 31 32

;;; Commentary:

;; This is another implementation of sexp-form regular expressions.
;; It was unfortunately written without being aware of the Sregex
;; package coming with Emacs, but as things stand, Rx completely
;; covers all regexp features, which Sregex doesn't, doesn't suffer
;; from the bugs mentioned in the commentary section of Sregex, and
;; uses a nicer syntax (IMHO, of course :-).

;; This significantly extended version of the original is almost
;; compatible with Sregex.  The only incompatibility I (fx) know of is
;; that the `repeat' form can't have multiple regexp args.

;; Now alternative forms are provided for a degree of compatibility
38 39
;; with Olin Shivers' attempted definitive SRE notation.  SRE forms
;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
Stefan Monnier's avatar
Stefan Monnier committed
40 41 42 43 44 45 46 47
;; ,<exp>, (word ...), word+, posix-string, and character class forms.
;; Some forms are inconsistent with SRE, either for historical reasons
;; or because of the implementation -- simple translation into Emacs
;; regexp strings.  These include: any, word.  Also, case-sensitivity
;; and greediness are controlled by variables external to the regexp,
;; and you need to feed the forms to the `posix-' functions to get
;; SRE's POSIX semantics.  There are probably more difficulties.

Gerd Moellmann's avatar
Gerd Moellmann committed
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
;; Rx translates a sexp notation for regular expressions into the
;; usual string notation.  The translation can be done at compile-time
;; by using the `rx' macro.  It can be done at run-time by calling
;; function `rx-to-string'.  See the documentation of `rx' for a
;; complete description of the sexp notation.
;;
;; Some examples of string regexps and their sexp counterparts:
;;
;; "^[a-z]*"
;; (rx (and line-start (0+ (in "a-z"))))
;;
;; "\n[^ \t]"
;; (rx (and "\n" (not blank))), or
;; (rx (and "\n" (not (any " \t"))))
;;
;; "\\*\\*\\* EOOH \\*\\*\\*\n"
;; (rx "*** EOOH ***\n")
;;
;; "\\<\\(catch\\|finally\\)\\>[^_]"
;; (rx (and word-start (submatch (or "catch" "finally")) word-end
;;          (not (any ?_))))
;;
;; "[ \t\n]*:\\([^:]+\\|$\\)"
;; (rx (and (zero-or-more (in " \t\n")) ":"
;;          (submatch (or line-end (one-or-more (not (any ?:)))))))
;;
;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
;; (rx (and line-start
;;          "content-transfer-encoding:"
;;          (* (submatch (? ?\n) (any "\t ")))
;;          "quoted-printable"
;;          (* (submatch (? ?\n) (any "\t ")))))
Gerd Moellmann's avatar
Gerd Moellmann committed
80 81 82 83 84 85 86 87 88 89 90 91 92 93
;;
;; (concat "^\\(?:" something-else "\\)")
;; (rx (and line-start (eval something-else))), statically or
;; (rx-to-string `(and line-start ,something-else)), dynamically.
;;
;; (regexp-opt '(STRING1 STRING2 ...))
;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
;; calls `regexp-opt' as needed.
;;
;; "^;;\\s-*\n\\|^\n"
;; (rx (or (and line-start ";;" (0+ space) ?\n)
;;         (and line-start ?\n)))
;;
;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
94 95
;; (rx (and "$Id: "
;;          (1+ (not (in " ")))
Gerd Moellmann's avatar
Gerd Moellmann committed
96 97
;;          " "
;;          (submatch (1+ (not (in " "))))
98
;;          " "))
Gerd Moellmann's avatar
Gerd Moellmann committed
99 100 101 102 103 104 105
;;
;; "\\\\\\\\\\[\\w+"
;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
;;
;; etc.

;;; History:
106
;;
Gerd Moellmann's avatar
Gerd Moellmann committed
107 108 109

;;; Code:

110 111 112
;; FIXME: support macros.

(defvar rx-constituents              ;Not `const' because some modes extend it.
  '((and		. (rx-and 1 nil))
    (seq		. and)		; SRE
    (:			. and)		; SRE
    (sequence		. and)		; sregex
    (or			. (rx-or 1 nil))
    (|			. or)		; SRE
    (not-newline	. ".")
    (nonl		. not-newline)	; SRE
    (anything		. (rx-anything 0 nil))
    ;; `any' is listed twice on purpose: the first entry is used when
    ;; `any' heads a form, the second when it stands alone (sregex).
    (any		. (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
    (any		. ".")          ; sregex
    (in			. any)
    (char		. any)		; sregex
    (not-char		. (rx-not-char 1 nil rx-check-any)) ; sregex
    (not		. (rx-not 1 1 rx-check-not))
    (repeat		. (rx-repeat 2 nil))
    (=			. (rx-= 2 nil))	   ; SRE
    (>=			. (rx->= 2 nil))   ; SRE
    (**			. (rx-** 2 nil))   ; SRE
    (submatch		. (rx-submatch 1 nil)) ; SRE
    (group		. submatch)     ; sregex
    (submatch-n		. (rx-submatch-n 2 nil))
    (group-n		. submatch-n)
    (zero-or-more	. (rx-kleene 1 nil))
    (one-or-more	. (rx-kleene 1 nil))
    (zero-or-one	. (rx-kleene 1 nil))
    (\?			. zero-or-one)	; SRE
    (\??		. zero-or-one)
    (*			. zero-or-more)	; SRE
    (*?			. zero-or-more)
    (0+			. zero-or-more)
    (+			. one-or-more)	; SRE
    (+?			. one-or-more)
    (1+			. one-or-more)
    (optional		. zero-or-one)
    (opt		. zero-or-one)	; sregex
    (minimal-match	. (rx-greedy 1 1))
    (maximal-match	. (rx-greedy 1 1))
    (backref		. (rx-backref 1 1 rx-check-backref))
    (line-start		. "^")
    (bol		. line-start)	; SRE
    (line-end		. "$")
    (eol		. line-end)	; SRE
    (string-start	. "\\`")
    (bos		. string-start)	; SRE
    (bot		. string-start)	; sregex
    (string-end		. "\\'")
    (eos		. string-end)	; SRE
    (eot		. string-end)	; sregex
    (buffer-start	. "\\`")
    (buffer-end		. "\\'")
    (point		. "\\=")
    (word-start		. "\\<")
    (bow		. word-start)	; SRE
    (word-end		. "\\>")
    (eow		. word-end)	; SRE
    (word-boundary	. "\\b")
    (not-word-boundary	. "\\B")	; sregex
    (symbol-start       . "\\_<")
    (symbol-end         . "\\_>")
    (syntax		. (rx-syntax 1 1))
    (not-syntax		. (rx-not-syntax 1 1)) ; sregex
    (category		. (rx-category 1 1 rx-check-category))
    (eval		. (rx-eval 1 1))
    (regexp		. (rx-regexp 1 1 stringp))
    (regex		. regexp)       ; sregex
    (digit		. "[[:digit:]]")
    (numeric		. digit)	; SRE
    (num		. digit)	; SRE
    (control		. "[[:cntrl:]]") ; SRE
    (cntrl		. control)	 ; SRE
    (hex-digit		. "[[:xdigit:]]") ; SRE
    (hex		. hex-digit)	  ; SRE
    (xdigit		. hex-digit)	  ; SRE
    (blank		. "[[:blank:]]")  ; SRE
    (graphic		. "[[:graph:]]")  ; SRE
    (graph		. graphic)	  ; SRE
    (printing		. "[[:print:]]")  ; SRE
    (print		. printing)	  ; SRE
    (alphanumeric	. "[[:alnum:]]")  ; SRE
    (alnum		. alphanumeric)	  ; SRE
    (letter		. "[[:alpha:]]")
    (alphabetic		. letter)	; SRE
    (alpha		. letter)	; SRE
    (ascii		. "[[:ascii:]]") ; SRE
    (nonascii		. "[[:nonascii:]]")
    (lower		. "[[:lower:]]") ; SRE
    (lower-case		. lower)	 ; SRE
    (punctuation	. "[[:punct:]]") ; SRE
    (punct		. punctuation)	 ; SRE
    (space		. "[[:space:]]") ; SRE
    (whitespace		. space)	 ; SRE
    (white		. space)	 ; SRE
    (upper		. "[[:upper:]]") ; SRE
    (upper-case		. upper)	 ; SRE
    (word		. "[[:word:]]")	 ; inconsistent with SRE
    (wordchar		. word)		 ; sregex
    (not-wordchar	. "\\W"))
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN.
If DEFN is a symbol, use the definition of DEFN, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
are the minimum and maximum number of arguments the function-form
sexp constituent SYMBOL may have in sexp regular expressions.
MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
all arguments must satisfy PREDICATE.

A SYMBOL may appear twice, once with a list DEFN and once with a
string DEFN (see `any'); `rx-info' then picks the list when SYMBOL
heads a form and the string when SYMBOL stands alone.")


(defconst rx-syntax
  '((whitespace		. ?-)
    (punctuation	. ?.)
    (word		. ?w)
    (symbol		. ?_)
    (open-parenthesis	. ?\()
    (close-parenthesis	. ?\))
    (expression-prefix	. ?\')
    (string-quote	. ?\")
    (paired-delimiter	. ?$)
    (escape		. ?\\)
    (character-quote	. ?/)
    (comment-start	. ?<)
    (comment-end	. ?>)
    (string-delimiter	. ?|)
    (comment-delimiter	. ?!))
  "Alist mapping Rx syntax symbols to syntax characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(syntax SYMBOL)', and CHAR is the syntax character
corresponding to SYMBOL, as it would be used with \\s or \\S in
regular expressions.")


(defconst rx-categories
  '((consonant			. ?0)
    (base-vowel			. ?1)
    (upper-diacritical-mark	. ?2)
    (lower-diacritical-mark	. ?3)
    (tone-mark			. ?4)
    (symbol			. ?5)
    (digit			. ?6)
    (vowel-modifying-diacritical-mark . ?7)
    (vowel-sign			. ?8)
    (semivowel-lower		. ?9)
    (not-at-end-of-line		. ?<)
    (not-at-beginning-of-line	. ?>)
    (alpha-numeric-two-byte	. ?A)
    (chinese-two-byte		. ?C)
    (chinse-two-byte		. ?C) ;; A typo in Emacs 21.1-24.3.
    (greek-two-byte		. ?G)
    (japanese-hiragana-two-byte . ?H)
    (indian-two-byte		. ?I)
    (japanese-katakana-two-byte . ?K)
    (korean-hangul-two-byte	. ?N)
    (cyrillic-two-byte		. ?Y)
    (combining-diacritic	. ?^)
    (ascii			. ?a)
    (arabic			. ?b)
    (chinese			. ?c)
    (ethiopic			. ?e)
    (greek			. ?g)
    (korean			. ?h)
    (indian			. ?i)
    (japanese			. ?j)
    (japanese-katakana		. ?k)
    (latin			. ?l)
    (lao			. ?o)
    (tibetan			. ?q)
    (japanese-roman		. ?r)
    (thai			. ?t)
    (vietnamese			. ?v)
    (hebrew			. ?w)
    (cyrillic			. ?y)
    (can-break			. ?|))
  "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.")


(defvar rx-greedy-flag t
  "Non-nil means `zero-or-one', `zero-or-more' and `one-or-more' are greedy.
When nil, those operators produce non-greedy regular expressions.
This variable is meant to be dynamically bound.")


300
(defun rx-info (op head)
  "Return parsing/code generation info for OP.
If OP is the space character ASCII 32, return info for the symbol `?'.
If OP is the character `?', return info for the symbol `??'.
See also `rx-constituents'.
If HEAD is non-nil, then OP is the head of a sexp, otherwise it's
a standalone symbol."
  ;; `(? X)' reads as the space character followed by X, and `(?? X)'
  ;; reads as the character `?', so map both back to their symbols.
  (cond ((eq op ?\s) (setq op '\?))
	((eq op ??) (setq op '\??)))
  (let (old-op)
    ;; Follow symbol aliases until a string or list definition (or
    ;; nothing at all) is reached.
    (while (and (not (null op)) (symbolp op))
      (setq old-op op)
      (setq op (cdr (assq op rx-constituents)))
      (when (if head (stringp op) (consp op))
        ;; We found something but of the wrong kind.  Let's look for an
        ;; alternate definition for the other case.
        (let ((new-op
               (cdr (assq old-op (cdr (memq (assq old-op rx-constituents)
                                            rx-constituents))))))
          (if (and new-op (not (if head (stringp new-op) (consp new-op))))
              (setq op new-op))))))
  op)
322

Gerd Moellmann's avatar
Gerd Moellmann committed
323 324 325

(defun rx-check (form)
  "Check FORM according to its car's parsing info.
Signal an error if FORM is not a list, if it has fewer or more
arguments than the limits returned by `rx-info' for its head, or
if any argument fails the type predicate given there."
  (unless (listp form)
    (error "rx `%s' needs argument(s)" form))
  (let* ((rx (rx-info (car form) 'head))
	 (nargs (1- (length form)))
	 (min-args (nth 1 rx))
	 (max-args (nth 2 rx))
	 (type-pred (nth 3 rx)))
    (when (and (not (null min-args))
	       (< nargs min-args))
      (error "rx form `%s' requires at least %d args"
	     (car form) min-args))
    (when (and (not (null max-args))
	       (> nargs max-args))
      (error "rx form `%s' accepts at most %d args"
	     (car form) max-args))
    (when (not (null type-pred))
      (dolist (sub-form (cdr form))
	(unless (funcall type-pred sub-form)
	  (error "rx form `%s' requires args satisfying `%s'"
		 (car form) type-pred))))))


348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375
(defun rx-group-if (regexp group)
  "Return REGEXP, wrapped in a shy group when GROUP calls for it.
GROUP describes the context REGEXP is used in: `*' for the operand
of a repetition, `:' for an element of a concatenation, `|' for a
branch of an alternation, and t to group unconditionally.  For any
other non-nil GROUP, REGEXP is grouped unless the lax test of
`rx-atomic-p' accepts it.  A nil GROUP never adds a group."
  (let ((wrap
         (cond
          ;; Repetition operand: group unless already atomic.
          ((eq group '*) (not (rx-atomic-p regexp)))
          ;; Concatenation element: ignore one trailing postfix
          ;; operator, then group unless the remainder is atomic.
          ((eq group ':)
           (not (rx-atomic-p
                 (if (string-match
                      "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
                     (substring regexp 0 (match-beginning 0))
                   regexp))))
          ;; Alternation branch: never group.
          ((eq group '|) nil)
          ;; Unconditional request.
          ((eq group t) t)
          ;; Anything else, including nil.
          (t (and (not (rx-atomic-p regexp t)) group)))))
    (if wrap
        (concat "\\(?:" regexp "\\)")
      regexp)))


(defvar rx-parent)
;; Declared here without a value; dynamically bound in some functions.


Gerd Moellmann's avatar
Gerd Moellmann committed
376 377 378 379
(defun rx-and (form)
  "Parse and produce code from FORM.
FORM is of the form `(and FORM1 ...)'."
  (rx-check form)
  (rx-group-if
   ;; Translate each sub-form as a concatenation element and join
   ;; the results with no separator.
   (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
   ;; Only a repetition (`*') or an explicit request (t) from the
   ;; parent is passed on as a grouping context.
   (and (memq rx-parent '(* t)) rx-parent)))
Gerd Moellmann's avatar
Gerd Moellmann committed
383 384 385 386 387


(defun rx-or (form)
  "Parse and produce code from FORM, which is `(or FORM1 ...)'."
  (rx-check form)
  (rx-group-if
   (if (memq nil (mapcar 'stringp (cdr form)))
       ;; At least one branch is not a plain string: translate each
       ;; branch and join them with the alternation operator.
       (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
     ;; All branches are strings: let `regexp-opt' build the regexp.
     (regexp-opt (cdr form)))
   (and (memq rx-parent '(: * t)) rx-parent)))


(defun rx-anything (form)
  "Match any character.
FORM must be the bare symbol; a list form signals an error.
The result is built as the alternation of `not-newline' and a
newline character."
  (if (consp form)
      (error "rx `anything' syntax error: %s" form))
  (rx-or (list 'or 'not-newline ?\n)))


(defun rx-any-delete-from-range (char ranges)
  "Destructively remove character CHAR from RANGES and return the result.
CHAR is removed when it is a member of RANGES itself, or when it is
the first or last character of a (FROM . TO) pair in RANGES; the
interior of a pair is not examined.  A pair left with a single
character is replaced by that character."
  (let (cell)
    (if (memq char ranges)
        ;; CHAR appears as a plain element.
        (setq ranges (delq char ranges))
      (if (setq cell (assq char ranges))
          ;; CHAR starts a range: move the start up by one.
          (if (eq (1+ char) (cdr cell))
              (setcar (memq cell ranges) (1+ char))
            (setcar cell (1+ char)))
        (when (setq cell (rassq char ranges))
          ;; CHAR ends a range: move the end down by one.
          (if (eq (1- char) (car cell))
              (setcar (memq cell ranges) (1- char))
            (setcdr cell (1- char))))))
    ranges))

418

419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
(defun rx-any-condense-range (args)
  "Condense ARGS into a minimal list of items for Rx `any'.
ARGS is a list of characters, (FROM . TO) character pairs and
strings.  Overlapping or adjacent characters and pairs are merged,
pairs spanning one or two characters are turned back into single
characters, and duplicate strings are dropped.  The result lists
the characters and pairs in ascending order, followed by the strings.

The pairs in ARGS are copied before being merged, so pairs that
belong to the caller (for instance literal data in the form being
translated) are never modified."
  (let (str
	l)
    ;; set STR list of all strings
    ;; set L list of all ranges
    (mapc (lambda (e) (cond ((stringp e) (push e str))
			    ((numberp e) (push (cons e e) l))
			    ;; Copy the pair: the merging below, and later
			    ;; `rx-any-delete-from-range', alter pairs in place.
			    ((consp e) (push (cons (car e) (cdr e)) l))
			    (t (push e l))))
	  args)
    ;; condense overlapped ranges in L
    (let ((tail (setq l (sort l #'car-less-than-car)))
	  d)
      (while (setq d (cdr tail))
	(if (>= (cdar tail) (1- (caar d)))
	    (progn
	      (setcdr (car tail) (max (cdar tail) (cdar d)))
	      (setcdr tail (cdr d)))
	  (setq tail d))))
    ;; Separate small ranges to single number, and delete dups.
    (nconc
     (apply #'nconc
	    (mapcar (lambda (e)
		      (cond
		       ((= (car e) (cdr e)) (list (car e)))
		       ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
		       ((list e))))
		    l))
     (delete-dups str))))


(defun rx-check-any-string (str)
  "Check string argument STR for Rx `any'.
Return the contents of STR as a list, in order, in which a range
such as \"a-z\" becomes the pair (?a . ?z) and every other
character stands for itself.  A range whose two ends are the same
character becomes that single character.  A `-' that is the first
or last character of STR is taken literally.

Signal an error if STR is empty or contains a reversed range
such as \"z-a\"."
  (let ((len (length str))
	(i 0)
	l)
    (if (= 0 len)
	(error "String arg for Rx `any' must not be empty"))
    ;; Scan by index rather than with a regexp such as \".-.\", which
    ;; would not recognize a range with a newline at either end.
    (while (< i len)
      (if (and (< (+ i 2) len)
	       (eq (aref str (1+ i)) ?-))
	  ;; range
	  (let ((c1 (aref str i))
		(c2 (aref str (+ i 2))))
	    (cond
	     ((< c1 c2) (push (cons c1 c2) l))
	     ((= c1 c2) (push c1 l))
	     ;; Do not silently drop the characters of a reversed range.
	     (t (error "rx `any' range `%c-%c' is reversed" c1 c2)))
	    (setq i (+ i 3)))
	;; plain character
	(push (aref str i) l)
	(setq i (1+ i))))
    (nreverse l)))
Gerd Moellmann's avatar
Gerd Moellmann committed
473 474 475 476


(defun rx-check-any (arg)
  "Check arg ARG for Rx `any'.
Return a list of the items ARG stands for: a character yields
itself, a pair of integers yields itself, a string is expanded by
`rx-check-any-string', and a symbol must translate to a character
class, which is returned as a string without its outer brackets.
Signal an error for any other kind of ARG."
  (cond
   ((integerp arg) (list arg))
   ((symbolp arg)
    ;; A failed translation is treated as an invalid character class.
    (let ((translation (condition-case nil
			   (rx-form arg)
			 (error nil))))
      (if (or (null translation)
	      (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
	  (error "Invalid char class `%s' in Rx `any'" arg))
      (list (substring translation 1 -1)))) ; strip outer brackets
   ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
    (list arg))
   ((stringp arg) (rx-check-any-string arg))
   ((error
     "rx `any' requires string, character, char pair or char class args"))))

Gerd Moellmann's avatar
Gerd Moellmann committed
493 494

(defun rx-any (form)
  "Parse and produce code from FORM, which is `(any ARG ...)'.
ARG is optional.
The arguments are checked with `rx-check-any', condensed with
`rx-any-condense-range', and then reordered so that `]', `-' and
`^' end up where they are taken literally inside the brackets."
  (rx-check form)
  (let* ((args (rx-any-condense-range
		(apply
		 #'nconc
		 (mapcar #'rx-check-any (cdr form)))))
	 m
	 s)
    (cond
     ;; single close bracket
     ;;	 => "[]...-]" or "[]...--.]"
     ((memq ?\] args)
      ;; set ] at the beginning
      (setq args (cons ?\] (delq ?\] args)))
      ;; set - at the end
      (if (or (memq ?- args) (assq ?- args))
	  (setq args (nconc (rx-any-delete-from-range ?- args)
			    (list ?-)))))
     ;; close bracket starts a range
     ;;  => "[]-....-]" or "[]-.--....]"
     ((setq m (assq ?\] args))
      ;; bring it to the beginning
      (setq args (cons m (delq m args)))
      (cond ((memq ?- args)
	     ;; to the end
	     (setq args (nconc (delq ?- args) (list ?-))))
	    ((setq m (assq ?- args))
	     ;; next to the bracket's range, make the second range.
	     ;; Delete M from the tail only: deleting it from ARGS
	     ;; returns ARGS itself, and splicing that back in as its
	     ;; own cdr would make the list circular.
	     (setcdr args (cons m (delq m (cdr args)))))))
     ;; bracket in the end range
     ;;	 => "[]...-]"
     ((setq m (rassq ?\] args))
      ;; set ] at the beginning
      (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
      ;; set - at the end
      (if (or (memq ?- args) (assq ?- args))
	  (setq args (nconc (rx-any-delete-from-range ?- args)
			    (list ?-)))))
     ;; {no close bracket appears}
     ;;
     ;; bring single bar to the beginning
     ((memq ?- args)
      (setq args (cons ?- (delq ?- args))))
     ;; bar start a range, bring it to the beginning
     ((setq m (assq ?- args))
      (setq args (cons m (delq m args))))
     ;;
     ;; hat at the beginning?
     ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
      (setq args (if (cdr args)
		     `(,(cadr args) ,(car args) ,@(cddr args))
		   (nconc (rx-any-delete-from-range ?^ args)
			  (list ?^))))))
    ;; some 1-char?
    (if (and (null (cdr args)) (numberp (car args))
	     (or (= 1 (length
		       (setq s (regexp-quote (string (car args))))))
		 (and (equal (car args) ?^) ;; unnecessary predicate?
		      (null (eq rx-parent '!)))))
	s
      (concat "["
	      (mapconcat
	       (lambda (e) (cond
			    ((numberp e) (string e))
			    ((consp e)
			     (if (and (= (1+ (car e)) (cdr e))
				      ;; rx-any-condense-range should
				      ;; prevent this case from happening.
				      (null (memq (car e) '(?\] ?-)))
				      (null (memq (cdr e) '(?\] ?-))))
				 (string (car e) (cdr e))
			       (string (car e) ?- (cdr e))))
			    (e)))
	       args
	       nil)
	      "]"))))
Gerd Moellmann's avatar
Gerd Moellmann committed
572 573


574 575
(defun rx-check-not (arg)
  "Check arg ARG for Rx `not'.
ARG is accepted if it is a symbol that translates to a character
class, the symbol `word-boundary', or a list headed by one of
`not', `any', `in', `syntax' or `category'.  Return t if ARG is
accepted, otherwise signal an error."
  (unless (or (and (symbolp arg)
		   (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
				 ;; A failed translation yields the empty
				 ;; string, which cannot match.
				 (condition-case nil
				     (rx-form arg)
				   (error ""))))
	      (eq arg 'word-boundary)
	      (and (consp arg)
		   (memq (car arg) '(not any in syntax category))))
    (error "rx `not' syntax error: %s" arg))
  t)
Gerd Moellmann's avatar
Gerd Moellmann committed
586 587 588 589 590


(defun rx-not (form)
  "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated with `!' as its parent and the
resulting regexp string is then negated textually."
  (rx-check form)
  (let ((result (rx-form (cadr form) '!))
	;; Match the backslash letters below case-sensitively.
	case-fold-search)
    (cond ((string-match "\\`\\[^" result)
	   ;; Already a negated bracket expression: drop the negation.
	   (cond
	    ((equal result "[^]") "[^^]")
	    ((and (= (length result) 4) (null (eq rx-parent '!)))
	     (regexp-quote (substring result 2 3)))
	    ((concat "[" (substring result 2)))))
	  ((eq ?\[ (aref result 0))
	   ;; A plain bracket expression: add the negation.
	   (concat "[^" (substring result 1)))
	  ;; A backslash construct: flip the case of its letter.
	  ((string-match "\\`\\\\[scbw]" result)
	   (concat (upcase (substring result 0 2))
		   (substring result 2)))
	  ((string-match "\\`\\\\[SCBW]" result)
	   (concat (downcase (substring result 0 2))
		   (substring result 2)))
	  (t
	   (concat "[^" result "]")))))


Stefan Monnier's avatar
Stefan Monnier committed
611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
(defun rx-not-char (form)
  "Translate FORM, which is `(not-char ...)', into a regexp string.
The arguments of FORM are handed to `rx-not' as `(not (in ...))'."
  (rx-check form)
  (let ((char-set (cons 'in (cdr form))))
    (rx-not (list 'not char-set))))


(defun rx-not-syntax (form)
  "Translate FORM, which is `(not-syntax SYNTAX)', into a regexp string.
The argument of FORM is handed to `rx-not' as `(not (syntax SYNTAX))'."
  (rx-check form)
  (let ((syntax-form (cons 'syntax (cdr form))))
    (rx-not (list 'not syntax-form))))


(defun rx-trans-forms (form &optional skip)
  "Collapse the trailing items of FORM into a single `and' form.
A form (HEAD REST ...) becomes (HEAD (and REST ...)).  SKIP, if
non-nil, is the number of items after the head to leave in place,
so with SKIP 1 `(= N REST ...)' becomes `(= N (and REST ...))'.
FORM itself is returned when exactly one item follows the skipped
ones; otherwise a modified copy is returned."
  (let* ((offset (or skip 0))
         (rest (nthcdr (1+ offset) form)))
    (cond
     ((= 1 (length rest)) form)
     (t
      (let ((copy (copy-sequence form)))
        (setcdr (nthcdr offset copy) (list (cons 'and rest)))
        copy)))))


(defun rx-= (form)
  "Parse and produce code from FORM `(= N ...)'.
N must be a positive integer; the forms after N are treated as
an implicit sequence that is repeated exactly N times."
  (rx-check form)
  (setq form (rx-trans-forms form 1))
  (unless (and (integerp (nth 1 form))
	       (> (nth 1 form) 0))
    (error "rx `=' requires positive integer first arg"))
  (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
Stefan Monnier's avatar
Stefan Monnier committed
645 646 647 648 649 650 651 652 653


(defun rx->= (form)
  "Parse and produce code from FORM `(>= N ...)'.
N must be a positive integer; the forms after N are treated as
an implicit sequence that is repeated N or more times."
  (rx-check form)
  (setq form (rx-trans-forms form 1))
  (unless (and (integerp (nth 1 form))
	       (> (nth 1 form) 0))
    (error "rx `>=' requires positive integer first arg"))
  (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
Stefan Monnier's avatar
Stefan Monnier committed
655 656 657 658 659


(defun rx-** (form)
  "Parse and produce code from FORM `(** N M ...)'.
The forms after M are wrapped in a single `and' form and the
result is translated as the equivalent `repeat' form."
  (rx-check form)
  (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*))
Stefan Monnier's avatar
Stefan Monnier committed
661 662


Gerd Moellmann's avatar
Gerd Moellmann committed
663 664
(defun rx-repeat (form)
  "Parse and produce code from FORM.
FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
  (rx-check form)
  ;; More than one form after N and M: wrap them in a single `and'.
  (if (> (length form) 4)
      (setq form (rx-trans-forms form 2)))
  ;; A nil in M's position is dropped, which leaves `(repeat N FORM1)'.
  (if (null (nth 2 form))
      (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form)))))
  (cond ((= (length form) 3)
	 (unless (and (integerp (nth 1 form))
		      (> (nth 1 form) 0))
	   (error "rx `repeat' requires positive integer first arg"))
	 (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
	((or (not (integerp (nth 2 form)))
	     (< (nth 2 form) 0)
	     (not (integerp (nth 1 form)))
	     (< (nth 1 form) 0)
	     (< (nth 2 form) (nth 1 form)))
	 (error "rx `repeat' range error"))
	(t
	 (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
		 (nth 1 form) (nth 2 form)))))


(defun rx-submatch (form)
  "Parse and produce code from FORM, which is `(submatch ...)'.
The translated sub-forms are enclosed in a capturing group."
  (concat "\\("
          (if (= 2 (length form))
              ;; Only one sub-form.
              (rx-form (cadr form))
            ;; Several sub-forms implicitly concatenated.
            (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
          "\\)"))
696

697 698 699 700 701 702 703 704 705 706
(defun rx-submatch-n (form)
  "Parse and produce code from FORM, which is `(submatch-n N ...)'.
N is the explicit group number and must be a positive integer;
otherwise an error is signaled.  The translated sub-forms are
enclosed in a capturing group with that number."
  (let ((n (nth 1 form)))
    ;; Reject a bad N up front instead of failing with a type error
    ;; or emitting a regexp with an invalid group number.
    (unless (and (integerp n) (> n 0))
      (error "rx `submatch-n' argument must be positive"))
    (concat "\\(?" (number-to-string n) ":"
	    (if (= 3 (length form))
		;; Only one sub-form.
		(rx-form (nth 2 form))
	      ;; Several sub-forms implicitly concatenated.
	      (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
	    "\\)")))
Gerd Moellmann's avatar
Gerd Moellmann committed
707

708 709 710 711 712 713 714 715 716 717
(defun rx-backref (form)
  "Translate FORM, which is `(backref N)', into a back reference.
The result is a backslash followed by N."
  (rx-check form)
  (let ((group-number (cadr form)))
    (format "\\%d" group-number)))

(defun rx-check-backref (arg)
  "Validate ARG as the argument of Rx `backref'.
Return t if ARG is an integer from 1 to 9, otherwise signal an error."
  (if (and (integerp arg) (<= 1 arg 9))
      t
    (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))

Gerd Moellmann's avatar
Gerd Moellmann committed
718 719 720
(defun rx-kleene (form)
  "Parse and produce code from FORM.
FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
`zero-or-more' etc.  operators.
If OP is one of `*', `+', `?', produce a greedy regexp.
If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
is non-nil."
  (rx-check form)
  (setq form (rx-trans-forms form))
  ;; In the lists below `?\\s' and `??' are characters, not symbols:
  ;; that is how the heads of `(? X)' and `(?? X)' are read.
  (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
		      ((memq (car form) '(*? +? ??)) "?")
		      (rx-greedy-flag "")
		      (t "?")))
	(op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
		  ((memq (car form) '(+ +? 1+ one-or-more))  "+")
		  (t "?"))))
    (rx-group-if
     (concat (rx-form (cadr form) '*) op suffix)
     (and (memq rx-parent '(t *)) rx-parent))))


(defun rx-atomic-p (r &optional lax)
  "Return non-nil if regexp string R is atomic.
An atomic regexp R is one such that a suffix operator
appended to R will apply to all of R.  For example, \"a\"
\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
\"[ab]c\", and \"ab\\|ab*c\" are not atomic.

If optional argument LAX is non-nil, only strings of up to three
characters are examined; the checks for a whole-string character
class or group are skipped and longer strings yield nil.

This function may return false negatives, but it will not
return false positives.  It is nevertheless useful in
situations where an efficiency shortcut can be taken only if a
regexp is atomic.  The function can be improved to detect
more cases of atomic regexps.  Presently, this function
detects the following categories of atomic regexp:

  a group or shy group:  \\(...\\)
  a character class:     [...]
  a single character:    a

On the other hand, false negatives will be returned for
regexps that are atomic but end in operators, such as
\"a+\".  I think these are rare.  Probably such cases could
be detected without much effort.  A guarantee of no false
negatives would require a theoretic specification of the set
of all atomic regexps."
  (let ((l (length r)))
    (cond
     ;; The empty string and any single character.
     ((<= l 1))
     ;; Two characters: atomic only when the first is a backslash.
     ((= l 2) (= (aref r 0) ?\\))
     ;; Three characters: a backslash followed by one of `cCsS_', or a
     ;; bracket expression holding one character other than `^'.
     ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
     ((null lax)
      (cond
       ;; R is one bracket expression from start to end.
       ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
       ;; R is one group from start to end, with no group close before
       ;; the final one.
       ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))


(defun rx-syntax (form)
  "Parse and produce code from FORM, which is `(syntax SYMBOL)'.
SYMBOL is looked up in the alist `rx-syntax'.  If it is not found
there, a character, or a symbol whose name is a single character,
is accepted and used directly as the syntax code.  Signal an
error if no syntax code can be determined."
  (rx-check form)
  (let* ((sym (cadr form))
         (syntax (cdr (assq sym rx-syntax))))
    (unless syntax
      ;; Try sregex compatibility.
      (cond
       ((characterp sym) (setq syntax sym))
       ((symbolp sym)
        (let ((name (symbol-name sym)))
          (if (= 1 (length name))
              (setq syntax (aref name 0))))))
      (unless syntax
        (error "Unknown rx syntax `%s'" sym)))
    (format "\\s%c" syntax)))


(defun rx-check-category (form)
  "Validate FORM, the argument of a `(category FORM)' expression.
FORM is accepted when it is an integer or a key of `rx-categories'
with a non-nil value.  Return t on success, signal an error otherwise."
  (if (or (integerp form)
          (cdr (assq form rx-categories)))
      t
    (error "Unknown category `%s'" form)))


(defun rx-category (form)
  "Parse and produce code from FORM, which is `(category SYMBOL)'.
The second element of FORM is either an integer, used directly as
the category character, or a key looked up in `rx-categories'."
  (rx-check form)
  (let ((char (if (integerp (cadr form))
                  (cadr form)
                (cdr (assq (cadr form) rx-categories)))))
    (format "\\c%c" char)))


(defun rx-eval (form)
  "Parse and produce code from FORM, which is `(eval FORM)'.
The inner form is evaluated and its value is translated by
`rx-form' under the current `rx-parent'."
  (rx-check form)
  ;; `eval' runs arbitrary Lisp: the inner form must come from trusted
  ;; code, never from external input.
  (rx-form (eval (cadr form)) rx-parent))


(defun rx-greedy (form)
  "Parse and produce code from FORM.
If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
`+', and `?' operators will be used in FORM1.  If FORM is
`(maximal-match FORM1)', greedy operators will be used."
  (rx-check form)
  ;; Rebind `rx-greedy-flag' around the translation of FORM1: non-nil
  ;; only when the head of FORM is `maximal-match'.
  (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
    (rx-form (cadr form) rx-parent)))


(defun rx-regexp (form)
  "Parse and produce code from FORM, which is `(regexp STRING)'.
STRING is passed unchanged to `rx-group-if' together with the
current `rx-parent'."
  (rx-check form)
  (rx-group-if (cadr form) rx-parent))


(defun rx-form (form &optional rx-parent)
  "Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
RX-PARENT shows which type of expression calls and controls putting of
shy groups around the result and some more in other functions."
  (cond
   ;; A literal string: quote it.  `rx-parent' is passed on as the group
   ;; argument only when it is `*' and the string is longer than one
   ;; character.
   ((stringp form)
    (rx-group-if (regexp-quote form)
                 (if (and (eq rx-parent '*) (< 1 (length form)))
                     rx-parent)))
   ;; A single character given as an integer.
   ((integerp form)
    (regexp-quote (char-to-string form)))
   ;; A bare symbol: `rx-info' yields either a ready-made string or a
   ;; list whose first element is the handler function.
   ((symbolp form)
    (let ((info (rx-info form nil)))
      (cond ((stringp info)
             info)
            ((null info)
             (error "Unknown rx form `%s'" form))
            (t
             (funcall (nth 0 info) form)))))
   ;; A compound form: dispatch on its head.
   ((consp form)
    (let ((info (rx-info (car form) 'head)))
      (unless (consp info)
        (error "Unknown rx form `%s'" (car form)))
      (funcall (nth 0 info) form)))
   (t
    (error "rx syntax error at `%s'" form))))


;;;###autoload
(defun rx-to-string (form &optional no-group)
  "Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
NO-GROUP non-nil means don't put shy groups around the result."
  (rx-group-if (rx-form form) (null no-group)))


;;;###autoload
(defmacro rx (&rest regexps)
  "Translate regular expressions REGEXPS in sexp form to a regexp string.
REGEXPS is a non-empty sequence of forms of the sort listed below.

Note that `rx' is a Lisp macro; when used in a Lisp program being
compiled, the translation is performed by the compiler.
See `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline', `nonl'
     matches any character except a newline.

`anything'
     matches any character.

`(any SET ...)'
`(in SET ...)'
`(char SET ...)'
     matches any character in SET ....  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.
     Ranges may also be specified as conses like `(?A . ?Z)'.

     SET may also be the name of a character class: `digit',
     `control', `hex-digit', `blank', `graph', `print', `alnum',
     `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
     `word', or one of their synonyms.

`(not (any SET ...))'
     matches any character not in SET ...

`line-start', `bol'
     matches the empty string, but only at the beginning of a line
     in the text being matched

`line-end', `eol'
     is similar to `line-start' but matches only at the end of a line

`string-start', `bos', `bot'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end', `eos', `eot'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.  Actually equivalent to `string-start'.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.  Actually equivalent to `string-end'.

`point'
     matches the empty string, but only at point.

`word-start', `bow'
     matches the empty string, but only at the beginning of a word.

`word-end', `eow'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
`not-word-boundary'
     matches the empty string, but not at the beginning or end of a
     word.

`symbol-start'
     matches the empty string, but only at the beginning of a symbol.

`symbol-end'
     matches the empty string, but only at the end of a symbol.

`digit', `numeric', `num'
     matches 0 through 9.

`control', `cntrl'
     matches ASCII control characters.

`hex-digit', `hex', `xdigit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic', `graph'
     matches graphic characters--everything except space, ASCII
     and non-ASCII control characters, surrogates, and codepoints
     unassigned by Unicode.

`printing', `print'
     matches space and graphic characters.

`alphanumeric', `alnum'
     matches alphabetic characters and digits.  (For multibyte characters,
     it matches according to Unicode character properties.)

`letter', `alphabetic', `alpha'
     matches alphabetic characters.  (For multibyte characters,
     it matches according to Unicode character properties.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower', `lower-case'
     matches anything lower-case.

`upper', `upper-case'
     matches anything upper-case.

`punctuation', `punct'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space', `whitespace', `white'
     matches anything that has whitespace syntax.

`word', `wordchar'
     matches anything that has word syntax.

`not-wordchar'
     matches anything that has non-word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols, or a symbol corresponding to the syntax
     character, e.g. `\\.' for `\\s.'.

     `whitespace'               (\\s- in string notation)
     `punctuation'              (\\s.)
     `word'                     (\\sw)
     `symbol'                   (\\s_)
     `open-parenthesis'         (\\s()
     `close-parenthesis'        (\\s))
     `expression-prefix'        (\\s')
     `string-quote'             (\\s\")
     `paired-delimiter'         (\\s$)
     `escape'                   (\\s\\)
     `character-quote'          (\\s/)
     `comment-start'            (\\s<)
     `comment-end'              (\\s>)
     `string-delimiter'         (\\s|)
     `comment-delimiter'        (\\s!)

`(not (syntax SYNTAX))'
     matches a character that doesn't have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a character to use for C, or one of the following symbols.

     `consonant'                        (\\c0 in string notation)
     `base-vowel'                       (\\c1)
     `upper-diacritical-mark'           (\\c2)
     `lower-diacritical-mark'           (\\c3)
     `tone-mark'                        (\\c4)
     `symbol'                           (\\c5)
     `digit'                            (\\c6)
     `vowel-modifying-diacritical-mark' (\\c7)
     `vowel-sign'                       (\\c8)
     `semivowel-lower'                  (\\c9)
     `not-at-end-of-line'               (\\c<)
     `not-at-beginning-of-line'         (\\c>)
     `alpha-numeric-two-byte'           (\\cA)
     `chinese-two-byte'                 (\\cC)
     `greek-two-byte'                   (\\cG)
     `japanese-hiragana-two-byte'       (\\cH)
     `indian-tow-byte'                  (\\cI)
     `japanese-katakana-two-byte'       (\\cK)
     `korean-hangul-two-byte'           (\\cN)
     `cyrillic-two-byte'                (\\cY)
     `combining-diacritic'              (\\c^)
     `ascii'                            (\\ca)
     `arabic'                           (\\cb)
     `chinese'                          (\\cc)
     `ethiopic'                         (\\ce)
     `greek'                            (\\cg)
     `korean'                           (\\ch)
     `indian'                           (\\ci)
     `japanese'                         (\\cj)
     `japanese-katakana'                (\\ck)
     `latin'                            (\\cl)
     `lao'                              (\\co)
     `tibetan'                          (\\cq)
     `japanese-roman'                   (\\cr)
     `thai'                             (\\ct)
     `vietnamese'                       (\\cv)
     `hebrew'                           (\\cw)
     `cyrillic'                         (\\cy)
     `can-break'                        (\\c|)

`(not (category CATEGORY))'
     matches a character that doesn't have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
`(: SEXP1 SEXP2 ...)'
`(seq SEXP1 SEXP2 ...)'
`(sequence SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
`(group SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(submatch-n N SEXP1 SEXP2 ...)'
`(group-n N SEXP1 SEXP2 ...)'
     like `group', but make it an explicitly-numbered group with
     group number N.

`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

Below, `SEXP ...' represents a sequence of regexp forms, treated as if
enclosed in `(and ...)'.

`(zero-or-more SEXP ...)'
`(0+ SEXP ...)'
     matches zero or more occurrences of what SEXP ... matches.

`(* SEXP ...)'
     like `zero-or-more', but always produces a greedy regexp, independent
     of `rx-greedy-flag'.

`(*? SEXP ...)'
     like `zero-or-more', but always produces a non-greedy regexp,
     independent of `rx-greedy-flag'.

`(one-or-more SEXP ...)'
`(1+ SEXP ...)'
     matches one or more occurrences of SEXP ...

`(+ SEXP ...)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP ...)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP ...)'
`(optional SEXP ...)'
`(opt SEXP ...)'
     matches zero or one occurrences of what SEXP ... matches.

`(? SEXP ...)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP ...)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
`(= N SEXP ...)'
     matches N occurrences.

`(>= N SEXP ...)'
     matches N or more occurrences.

`(repeat N M SEXP)'
`(** N M SEXP ...)'
     matches N to M occurrences.

`(backref N)'
     matches what was matched previously by submatch N.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."
  (cond ((null regexps)
         (error "No regexp"))
        ;; Several forms: wrap them in an implicit `and'.
        ((cdr regexps)
         (rx-to-string `(and ,@regexps) t))
        ;; Exactly one form: translate it directly.
        (t
         (rx-to-string (car regexps) t))))

;; ;; sregex.el replacement

;; ;;;###autoload (provide 'sregex)
;; ;;;###autoload (autoload 'sregex "rx")
;; (defalias 'sregex 'rx-to-string)
;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
;; (defalias 'sregexq 'rx)

(provide 'rx)

;;; rx.el ends here