char-fold.el 11.1 KB
Newer Older
1
;;; char-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
2

Paul Eggert's avatar
Paul Eggert committed
3
;; Copyright (C) 2015-2017 Free Software Foundation, Inc.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24

;; Maintainer: emacs-devel@gnu.org
;; Keywords: matching

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.

;;; Code:

25
(eval-and-compile (put 'char-fold-table 'char-table-extra-slots 1))
26

27
(defconst char-fold-table
28
  (eval-when-compile
29 30
    (let ((equiv (make-char-table 'char-fold-table))
          (equiv-multi (make-char-table 'char-fold-table))
31
          (table (unicode-property-table-internal 'decomposition)))
32 33
      (set-char-table-extra-slot equiv 0 equiv-multi)

34
      ;; Ensure the table is populated.
35 36 37 38 39
      (let ((func (char-table-extra-slot table 1)))
        (map-char-table (lambda (char v)
                          (when (consp char)
                            (funcall func (car char) v table)))
                        table))
40

41 42
      ;; Compile a list of all complex characters that each simple
      ;; character should match.
43 44 45 46 47 48
      ;; In summary this loop does 3 things:
      ;; - A complex character might be allowed to match its decomp.
      ;; - The decomp is allowed to match the complex character.
      ;; - A single char of the decomp might be allowed to match the
      ;;   character.
      ;; Some examples in the comments below.
49
      (map-char-table
50 51
       (lambda (char decomp)
         (when (consp decomp)
Paul Eggert's avatar
Paul Eggert committed
52
           ;; Skip trivial cases like ?a decomposing to (?a).
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
           (unless (and (not (cdr decomp))
                        (eq char (car decomp)))
             (if (symbolp (car decomp))
                 ;; Discard a possible formatting tag.
                 (setq decomp (cdr decomp))
               ;; If there's no formatting tag, ensure that char matches
               ;; its decomp exactly.  This is because we want 'ä' to
               ;; match 'ä', but we don't want '¹' to match '1'.
               (aset equiv char
                     (cons (apply #'string decomp)
                           (aref equiv char))))

             ;; Allow the entire decomp to match char.  If decomp has
             ;; multiple characters, this is done by adding an entry
             ;; to the alist of the first character in decomp.  This
             ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
             ;; match '¹'.
             (let ((make-decomp-match-char
                    (lambda (decomp char)
                      (if (cdr decomp)
                          (aset equiv-multi (car decomp)
                                (cons (cons (apply #'string (cdr decomp))
                                            (regexp-quote (string char)))
                                      (aref equiv-multi (car decomp))))
                        (aset equiv (car decomp)
                              (cons (char-to-string char)
                                    (aref equiv (car decomp))))))))
               (funcall make-decomp-match-char decomp char)
               ;; Do it again, without the non-spacing characters.
               ;; This allows 'a' to match 'ä'.
               (let ((simpler-decomp nil)
                     (found-one nil))
                 (dolist (c decomp)
                   (if (> (get-char-code-property c 'canonical-combining-class) 0)
                       (setq found-one t)
                     (push c simpler-decomp)))
                 (when (and simpler-decomp found-one)
                   (funcall make-decomp-match-char simpler-decomp char)
                   ;; Finally, if the decomp only had one spacing
                   ;; character, we allow this character to match the
                   ;; decomp.  This is to let 'a' match 'ä'.
                   (unless (cdr simpler-decomp)
                     (aset equiv (car simpler-decomp)
                           (cons (apply #'string decomp)
                                 (aref equiv (car simpler-decomp)))))))))))
98 99 100
       table)

      ;; Add some manual entries.
101 102
      (dolist (it '((?\" """ "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
                    (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "󠀢" "❮" "❯" "‹" "›")
103
                    (?` "❛" "‘" "‛" "󠀢" "❮" "‹")))
104 105 106
        (let ((idx (car it))
              (chars (cdr it)))
          (aset equiv idx (append chars (aref equiv idx)))))
107 108

      ;; Convert the lists of characters we compiled into regexps.
109
      (map-char-table
110 111 112 113 114
       (lambda (char dec-list)
         (let ((re (regexp-opt (cons (char-to-string char) dec-list))))
           (if (consp char)
               (set-char-table-range equiv char re)
             (aset equiv char re))))
115 116
       equiv)
      equiv))
117
  "Used for folding characters of the same group during search.
118
This is a char-table with the `char-fold-table' subtype.
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137

Let us refer to the character in question by char-x.
Each entry is either nil (meaning char-x only matches literally)
or a regexp.  This regexp should match anything that char-x can
match by itself \(including char-x).  For instance, the default
regexp for the ?+ character is \"[+⁺₊﬩﹢+]\".

This table also has one extra slot which is also a char-table.
Each entry in the extra slot is an alist used for multi-character
matching (which may be nil).  The elements of the alist should
have the form (SUFFIX . OTHER-REGEXP).  If the characters after
char-x are equal to SUFFIX, then this combination of char-x +
SUFFIX is allowed to match OTHER-REGEXP.  This is in addition to
char-x being allowed to match REGEXP.
For instance, the default alist for ?f includes:
    \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\")
     (\"i\" . \"fi\") (\"f\" . \"ff\"))

Exceptionally for the space character (32), ALIST is ignored.")
138

139
(defun char-fold--make-space-string (n)
140 141 142 143
  "Return a string that matches N spaces."
  (format "\\(?:%s\\|%s\\)"
          (make-string n ?\s)
          (apply #'concat
144
                 (make-list n (or (aref char-fold-table ?\s) " ")))))
145

146
;;;###autoload
147 148
(defun char-fold-to-regexp (string &optional _lax from)
  "Return a regexp matching anything that char-folds into STRING.
149
Any character in STRING that has an entry in
150
`char-fold-table' is replaced with that entry (which is a
151 152
regexp) and other characters are `regexp-quote'd.

153 154
If the resulting regexp would be too long for Emacs to handle,
just return the result of calling `regexp-quote' on STRING.
155

156 157
FROM is for internal use.  It specifies an index in the STRING
from which to start."
158
  (let* ((spaces 0)
159
         (multi-char-table (char-table-extra-slot char-fold-table 0))
160 161 162
         (i (or from 0))
         (end (length string))
         (out nil))
163 164 165 166 167 168 169 170
    ;; When the user types a space, we want to match the table entry
    ;; for ?\s, which is generally a regexp like "[ ...]".  However,
    ;; the `search-spaces-regexp' variable doesn't "see" spaces inside
    ;; these regexp constructs, so we need to use "\\( \\|[ ...]\\)"
    ;; instead (to manually expose a space).  Furthermore, the lax
    ;; search engine acts on a bunch of spaces, not on individual
    ;; spaces, so if the string contains sequential spaces like "  ", we
    ;; need to keep them grouped together like this: "\\(  \\|[ ...][ ...]\\)".
171 172 173 174
    (while (< i end)
      (pcase (aref string i)
        (`?\s (setq spaces (1+ spaces)))
        (c (when (> spaces 0)
175
             (push (char-fold--make-space-string spaces) out)
176
             (setq spaces 0))
177
           (let ((regexp (or (aref char-fold-table c)
178
                             (regexp-quote (string c))))
179 180 181
                 ;; Long string.  The regexp would probably be too long.
                 (alist (unless (> end 50)
                          (aref multi-char-table c))))
182 183 184 185 186
             (push (let ((matched-entries nil)
                         (max-length 0))
                     (dolist (entry alist)
                       (let* ((suffix (car entry))
                              (len-suf (length suffix)))
187 188 189 190
                         (when (eq (compare-strings suffix 0 nil
                                                    string (1+ i) (+ i 1 len-suf)
                                                    nil)
                                   t)
191 192
                           (push (cons len-suf (cdr entry)) matched-entries)
                           (setq max-length (max max-length len-suf)))))
193
                     ;; If no suffixes matched, just go on.
194
                     (if (not matched-entries)
195
                         regexp
196 197 198 199 200 201 202 203 204 205 206 207 208
;;; If N suffixes match, we "branch" out into N+1 executions for the
;;; length of the longest match.  This means "fix" will match "fix" but
;;; not "fⅸ", but it's necessary to keep the regexp size from scaling
;;; exponentially.  See https://lists.gnu.org/archive/html/emacs-devel/2015-11/msg02562.html
                       (let ((subs (substring string (1+ i) (+ i 1 max-length))))
                         ;; `i' is still going to inc by 1 below.
                         (setq i (+ i max-length))
                         (concat
                          "\\(?:"
                          (mapconcat (lambda (entry)
                                       (let ((length (car entry))
                                             (suffix-regexp (cdr entry)))
                                         (concat suffix-regexp
209
                                                 (char-fold-to-regexp subs nil length))))
210 211
                                     `((0 . ,regexp) . ,matched-entries) "\\|")
                          "\\)"))))
212
                   out))))
213 214
      (setq i (1+ i)))
    (when (> spaces 0)
215
      (push (char-fold--make-space-string spaces) out))
216 217
    (let ((regexp (apply #'concat (nreverse out))))
      ;; Limited by `MAX_BUF_SIZE' in `regex.c'.
218
      (if (> (length regexp) 5000)
219 220
          (regexp-quote string)
        regexp))))
221 222 223


;;; Commands provided for completeness.
224 225 226
(defun char-fold-search-forward (string &optional bound noerror count)
  "Search forward for a char-folded version of STRING.
STRING is converted to a regexp with `char-fold-to-regexp',
227 228 229
which is searched for with `re-search-forward'.
BOUND NOERROR COUNT are passed to `re-search-forward'."
  (interactive "sSearch: ")
230
  (re-search-forward (char-fold-to-regexp string) bound noerror count))
231

232 233 234
(defun char-fold-search-backward (string &optional bound noerror count)
  "Search backward for a char-folded version of STRING.
STRING is converted to a regexp with `char-fold-to-regexp',
235 236 237
which is searched for with `re-search-backward'.
BOUND NOERROR COUNT are passed to `re-search-backward'."
  (interactive "sSearch: ")
238
  (re-search-backward (char-fold-to-regexp string) bound noerror count))
239

240
(provide 'char-fold)
241

242
;;; char-fold.el ends here