Commit 25544ce1 authored by Simon Marshall's avatar Simon Marshall
Browse files

emit charsets after strings so that the final regexp finds the longest match.

parent 14aa11f4
......@@ -4,7 +4,7 @@
;; Author: Simon Marshall <simon@gnu.ai.mit.edu>
;; Keywords: strings, regexps
;; Version: 1.04.01
;; Version: 1.05
;; This file is part of GNU Emacs.
......@@ -27,10 +27,14 @@
;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i\\(se\\|ze\\)\\)".
;;
;; This package generates a regexp from a given list of strings (that matches
;; one of those strings) that is equivalent to but more efficient than:
;; This package generates a regexp from a given list of strings (which matches
;; one of those strings) so that the regexp generated by:
;;
;; (mapconcat 'identity (mapcar 'regexp-quote strings) "\\|")
;; (regexp-opt strings)
;;
;; is equivalent to, but more efficient than, the regexp generated by:
;;
;; (mapconcat 'regexp-quote strings "\\|")
;;
;; For example:
;;
......@@ -42,10 +46,9 @@
;; (concat "(" (regexp-opt strings t) "\\>"))
;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>"
;;
;; Searching using the above example `regexp-opt' regexp is significantly
;; faster than searching using the equivalent `mapconcat' regexp, taking
;; approximately two-thirds of the time.
;;
;; Searching using the above example `regexp-opt' regexp takes approximately
;; two-thirds of the time taken using the equivalent `mapconcat' regexp.
;; Since this package was written to produce efficient regexps, not regexps
;; efficiently, it is probably not a good idea to in-line too many calls in
;; your code, unless you use the following trick with `eval-when-compile':
......@@ -62,7 +65,15 @@
;; (defvar definition-regexp
;; "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>")
;;
;; Originally written for font-lock.el, from an idea from Stig's hl319.el.
;; Note that if you use this trick for all instances of `regexp-opt' and
;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded
;; at compile time. But note also that using this trick means that should
;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to
;; improve the efficiency of `regexp-opt' regexps, you would have to recompile
;; your code for such changes to have effect in your code.
;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
;; thanks for ideas also to Michael Ernst, Bob Glickstein and Dan Nicolaescu.
;; Please don't tell me that it doesn't produce optimal regexps; I know that
;; already. For example, the above explanation for the meaning of "opt" would
;; be more efficient as "optim\\(al\\|i[sz]e\\)", but this requires complex
......@@ -73,15 +84,16 @@
;;;###autoload
(defun regexp-opt (strings &optional paren)
"Return a regexp to match a string in STRINGS.
Each string should be unique in STRINGS and should not contain any regexps.
If optional PAREN non-nil, ensure that the returned regexp is enclosed by at
least one regexp grouping construct.
Each string in STRINGS should be unique and should not contain any regexps.
The returned regexp is typically more efficient than the equivalent regexp:
(mapconcat 'identity (mapcar 'regexp-quote STRINGS) \"\\\\|\")
(let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\")))
(concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren))
but typically contains regexp grouping constructs. Use `regexp-opt-depth' to
count them."
but typically contains more regexp grouping constructs.
Use `regexp-opt-depth' to count them."
(save-match-data
;; Recurse on the sorted list.
(let ((max-lisp-eval-depth (* 1024 1024))
......@@ -165,12 +177,12 @@ in REGEXP."
close-group)))
;;
;; If there are several one-character strings, remove them and recurse
;; on the rest.
;; on the rest (first so the final regexp finds the longest match).
((> (length letters) 1)
(let ((rest (let ((completion-regexp-list '("^..+$")))
(all-completions "" (mapcar 'list strings)))))
(concat open-group
(regexp-opt-charset letters) "\\|" (regexp-opt-group rest)
(regexp-opt-group rest) "\\|" (regexp-opt-charset letters)
close-group)))
;;
;; Otherwise, divide the list into those that start with a particular
......@@ -196,30 +208,26 @@ in REGEXP."
(bracket "") (dash "") (caret ""))
;;
;; Make a character map but extract character set meta characters.
(let (char)
(while chars
(setq char (string-to-char (pop chars)))
(cond ((eq char ?\])
(setq bracket "]"))
((eq char ?^)
(setq caret "^"))
((eq char ?-)
(setq dash "-"))
(t
(aset charmap char t)))))
(dolist (char (mapcar 'string-to-char chars))
(case char
(?\]
(setq bracket "]"))
(?^
(setq caret "^"))
(?-
(setq dash "-"))
(otherwise
(aset charmap char t))))
;;
;; Make a character set from the map using ranges where applicable.
(let ((elt 0) start)
(while (< elt charwidth)
(when (aref charmap elt)
(setq start (1+ elt))
(while (and (< start charwidth) (aref charmap start))
(incf start))
(if (< (- start elt) 4)
(setq charset (format "%s%c" charset elt))
(setq charset (format "%s%c-%c" charset elt (1- start))
elt start)))
(incf elt)))
(dotimes (elt charwidth)
(when (aref charmap elt)
(let ((start elt))
(while (and (< elt charwidth) (aref charmap elt))
(incf elt))
(if (> (- elt start) 3)
(setq charset (format "%s%c-%c" charset start (1- elt)))
(setq charset (format "%s%c" charset (setq elt start)))))))
;;
;; Make sure a caret is not first and a dash is first or last.
(if (and (string-equal charset "") (string-equal bracket ""))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment