;;; nxml-rap.el --- low-level support for random access parsing for nXML mode  -*- lexical-binding:t -*-

;; Copyright (C) 2003-2004, 2007-2019 Free Software Foundation, Inc.

;; Author: James Clark
;; Keywords: wp, hypermedia, languages, XML

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.

;;; Commentary:

;; This uses xmltok.el to do XML parsing. The fundamental problem is
;; how to handle changes. We don't want to maintain a complete parse
;; tree.  We also don't want to reparse from the start of the document
;; on every keystroke.  However, it is not possible in general to
;; parse an XML document correctly starting at a random point in the
;; middle.  The main problems are comments, CDATA sections and
;; processing instructions: these can all contain things that are
;; indistinguishable from elements. Literals in the prolog are also a
;; problem.  Attribute value literals are not a problem because
;; attribute value literals cannot contain less-than signs.
;;
;; Our strategy is to keep track of just the problematic things.
;; Specifically, we keep track of all comments, CDATA sections and
;; processing instructions in the instance.  We do this by marking
;; the first character of these with the generic string syntax by setting
;; a 'syntax-table' text property in `sgml-syntax-propertize'.
;;
;; Thus to parse some random point in the file we first ensure that we
;; have scanned up to that point.  Then we search backwards for a <.
;; Then we check whether the < has the generic string syntax.  If it
;; does we go backwards to first character of the generic string (this
;; character must be a <).  Then we start parsing forward from the <
;; we have found.
;;
;; The prolog has to be parsed specially, so we also keep track of the
;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
;; every change to the prolog.  This won't work well if people try to
;; edit huge internal subsets. Hopefully that will be rare.
;;
;; We rely on the `syntax-propertize-function' machinery to keep track
;; of the changes in the buffer.  Fontification also relies on correct
;; `syntax-table' properties.  This means that scanning for these
;; constructs had better be quick.  Fortunately it is.  Firstly, the
;; typical proportion of comments, CDATA sections and processing
;; instructions is small relative to other things.  Secondly, to scan
;; we just search for the regexp <[!?].

;;; Code:

(require 'xmltok)
(require 'nxml-util)
(require 'sgml-mode)

(defvar-local nxml-prolog-end nil
  "Integer giving position following end of the prolog.
Starts out nil and is assigned by `nxml-scan-prolog'.")

(defsubst nxml-get-inside (pos)
  "Return non-nil if POS is inside a comment, CDATA section, or PI.
The answer is read from the parser state that `syntax-ppss'
computes for POS.  Point is not moved."
  (let ((ppss (save-excursion (syntax-ppss pos))))
    (or
     ;; Inside comment.
     (nth 4 ppss)
     ;; Inside "generic" string which is used for CDATA, and PI.
     ;; "Normal" double and single quoted strings are used for
     ;; attribute values, so only the value t (generic fence) counts.
     (eq t (nth 3 ppss)))))
(defun nxml-inside-end (pos)
  "Return the end of the inside region containing POS.
Return nil if the character at POS is not inside.
The region start is taken from element 8 of the `syntax-ppss'
state for POS, and the end is found by scanning forward from
there using `sgml-tag-syntax-table'.  Point is not moved."
  (save-excursion
    (let ((ppss (syntax-ppss pos)))
      ;; Element 8 is the start of the enclosing comment or string,
      ;; or nil when POS is in neither.
      (when (nth 8 ppss)
        (goto-char (nth 8 ppss))
        (with-syntax-table sgml-tag-syntax-table
          ;; NOTE(review): element 3 is the in-string slot of the
          ;; parser state, yet that branch calls `forward-comment'
          ;; while the comment case uses `scan-sexps'.  Confirm the
          ;; two branches are not swapped; logic left as found.
          (if (nth 3 ppss)
              (progn (forward-comment 1) (point))
            ;; `scan-sexps' may return nil; fall back to the end of
            ;; the accessible region.
            (or (scan-sexps (point) 1) (point-max))))))))
(defun nxml-inside-start (pos)
  "Return the start of the inside region containing POS.
Return nil if the character at POS is not inside.
The value is element 8 of the `syntax-ppss' state for POS, i.e.
the start of the enclosing comment or string.  Point is not moved."
  (save-excursion (nth 8 (syntax-ppss pos))))

;;; Change management

;; `nxml-scan-prolog' (n-s-p) assigns this variable.  Per the original
;; note, n-s-p is only called from nxml-mode.el, where this variable is
;; defined; the value-less `defvar' here merely declares it special.
(defvar nxml-prolog-regions)
(defun nxml-scan-prolog ()
  "Scan the prolog, starting at the beginning of the accessible buffer.
Store the value returned by `xmltok-forward-prolog' in
`nxml-prolog-regions' and the position where that scan left point
in `nxml-prolog-end'.  Point is left there too, so callers that
need point preserved must wrap the call in `save-excursion'."
  (goto-char (point-min))
  ;; Bind these locally so the caller's values are not disturbed;
  ;; presumably `xmltok-forward-prolog' writes to them -- TODO confirm.
  (let (xmltok-dtd
        xmltok-errors)
    (setq nxml-prolog-regions (xmltok-forward-prolog))
    (setq nxml-prolog-end (point))))
(defun nxml-maybe-rescan-prolog (start _end _length)
  "Rescan the prolog when a change begins at or before its end.
START is the beginning of the changed text; the remaining two
arguments are ignored.  Point is preserved across the rescan.
`nxml-mode' adds this function on `after-change-functions'."
  (and (<= start nxml-prolog-end)
       (save-excursion
         (nxml-scan-prolog))))

;;; Random access parsing

(defun nxml-token-after ()
  "Return the position after the token containing the char after point.
Sets up the variables `xmltok-type', `xmltok-start',
`xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
`xmltok-namespace-attributes' in the same way as does
`xmltok-forward'.  The prolog will be treated as a single token with
type `prolog'."
  (let ((pos (point)))
    (if (< pos nxml-prolog-end)
	;; Anywhere before the end of the prolog: report the whole
	;; prolog as one token without tokenizing.
	(progn
	  (setq xmltok-type 'prolog
		xmltok-start (point-min))
	  (min nxml-prolog-end (point-max)))
      (nxml-ensure-scan-up-to-date)
      (if (nxml-get-inside pos)
	  ;; POS is inside a comment, CDATA section or PI: go to the
	  ;; start of that construct and tokenize it as one token.
	  (save-excursion
	    (nxml-move-outside-backwards)
	    (xmltok-forward)
	    (point))
	(save-excursion
	  ;; Find a `<' at or before point (never searching back into
	  ;; the prolog) and step out of any special construct that
	  ;; contains it.
	  (if (or (eq (char-after) ?<)
		      (search-backward "<"
				       (max (point-min) nxml-prolog-end)
				       t))
	      (nxml-move-outside-backwards)
	    ;; No `<' found: start at the end of the prolog, or, when
	    ;; the accessible region begins after it, at the end of any
	    ;; special construct the region start falls inside.
	    (goto-char (if (<= (point-min) nxml-prolog-end)
			   nxml-prolog-end
			 (or (nxml-inside-end (point-min))
			     (point-min)))))
	  ;; Tokenize forward until a token ends beyond POS or the
	  ;; tokenizer reports no token.
	  (while (and (nxml-tokenize-forward)
		      (<= (point) pos)))
	  (point))))))

(defun nxml-token-before ()
  "Return the position after the token containing the char before point.
The `xmltok-' variables are set as by `nxml-token-after'.  At the
beginning of the accessible region there is no preceding char; in
that case `xmltok-start' is set to point, `xmltok-type' to nil,
and point itself is returned."
  (cond ((bobp)
         ;; Nothing precedes point: report an empty, typeless token.
         (setq xmltok-start (point)
               xmltok-type nil)
         (point))
        (t
         ;; Step onto the preceding char and reuse `nxml-token-after'.
         (save-excursion
           (goto-char (1- (point)))
           (nxml-token-after)))))

(defun nxml-tokenize-forward ()
  "Scan one token forward with `xmltok-forward' and return `xmltok-type'.
`xmltok-errors' is bound locally during the scan, so the caller's
value of that variable is left untouched.  Point is left wherever
`xmltok-forward' leaves it."
  (let (xmltok-errors)
    (xmltok-forward)
    xmltok-type))
(defun nxml-move-tag-backwards (bound)
  "Move point backwards outside any “inside” regions or tags.
Point will not move past `nxml-prolog-end'.
Point will either be at BOUND or a `<' character starting a tag
outside any “inside” regions.
As a precondition, point must be >= BOUND."
  ;; NOTE(review): only BOUND limits the searches below; the
  ;; `nxml-prolog-end' guarantee presumably relies on the caller
  ;; passing a suitable BOUND -- confirm against callers.
  (nxml-move-outside-backwards)
  (when (not (equal (char-after) ?<))
    (if (search-backward "<" bound t)
        (progn
          ;; The `<' just found may itself lie inside a special
          ;; construct; step out of it and, if that does not land on
          ;; a `<', search once more.
          (nxml-move-outside-backwards)
          (when (not (equal (char-after) ?<))
            (search-backward "<" bound t)))
      ;; No `<' between point and BOUND.
      (goto-char bound))))
(defun nxml-move-outside-backwards ()
  "Move point to first character of the containing special thing.
Leave point unmoved if it is not inside anything special.
Signal an error if the position moved to is itself still reported
as inside by `nxml-get-inside'."
  (let ((start (nxml-inside-start (point))))
    (when start
      (goto-char start)
      ;; Sanity check: the start of an inside region should not
      ;; itself be inside another one.
      (when (nxml-get-inside (point))
        (error "Char before inside-start at %s is still \"inside\"" (point))))))
(defun nxml-ensure-scan-up-to-date ()
  "Run `syntax-propertize' up to point.
This makes sure the `syntax-table' text properties that the rest
of this file relies on have been applied to the text before point."
  (syntax-propertize (point)))

;;; Element scanning

(defun nxml-scan-element-forward (from &optional up)
  "Scan forward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of
the tag that ends the element.  `xmltok-start' will contain the
position of the start of the tag.  If UP is non-nil, then scan
past end-tag of element containing point.  If no element is
found, return nil.  If a well-formedness error prevents scanning,
signal an `nxml-scan-error'.  Point is not moved."
  ;; OPEN-TAGS is nil when no start-tag is pending, the atom t when
  ;; UP was requested and only the enclosing element is pending, or a
  ;; list (QNAME START . REST) with one QNAME/START pair per start-tag
  ;; seen so far, innermost first, where REST is again nil or t.
  ;; Each `cond' clause yields non-nil to keep scanning and nil to stop.
  (let ((open-tags (and up t))
        found)
    (save-excursion
      (goto-char from)
      (while (cond ((not (nxml-tokenize-forward))
                    ;; Ran out of tokens with a start-tag still open.
                    (when (consp open-tags)
                      (nxml-scan-error (cadr open-tags)
                                       "Start-tag has no end-tag"))
                    nil)
                   ((eq xmltok-type 'start-tag)
                    (setq open-tags
                          (cons (xmltok-start-tag-qname)
                                (cons xmltok-start
                                      open-tags)))
                    t)
                   ((eq xmltok-type 'end-tag)
                    (cond ((not open-tags) nil)
                          ;; Only the UP sentinel is left: this end-tag
                          ;; closes the enclosing element.
                          ((not (consp open-tags)) (setq found (point)) nil)
                          ((not (string= (car open-tags)
                                         (xmltok-end-tag-qname)))
                           (nxml-scan-error (+ 2 xmltok-start)
                                            "Mismatched end-tag; \
expected `%s'"
                                            (car open-tags)))
                          ;; Pop the matched pair; keep going while
                          ;; something is still pending.
                          ((setq open-tags (cddr open-tags)) t)
                          (t (setq found (point)) nil)))
                   ((memq xmltok-type '(empty-element
                                        partial-empty-element))
                    (if open-tags
                        t
                      (setq found (point))
                      nil))
                   ((eq xmltok-type 'partial-end-tag)
                    ;; Treated like an end-tag, but without comparing
                    ;; names.
                    (cond ((not open-tags) nil)
                          ((not (consp open-tags)) (setq found (point)) nil)
                          ((setq open-tags (cddr open-tags)) t)
                          (t (setq found (point)) nil)))
                   ((eq xmltok-type 'partial-start-tag)
                    (nxml-scan-error xmltok-start
                                     "Missing `>'"))
                   ;; Any other token type is skipped.
                   (t t))))
    found))

(defun nxml-scan-element-backward (from &optional up bound)
  "Scan backward from FROM over a single balanced element.
Point must be between tokens.  Return the position of the end of
the tag that starts the element.  `xmltok-start' will contain the
position of the start of the tag.  If UP is non-nil, then scan
past start-tag of element containing point.  If BOUND is non-nil,
then don't scan back past BOUND.  If no element is found, return
nil.  If a well-formedness error prevents scanning, signal an
`nxml-scan-error'.  Point is not moved."
  ;; OPEN-TAGS is nil when no end-tag is pending, the atom t when UP
  ;; was requested and only the enclosing element is pending, or a
  ;; list (QNAME START . REST) with one pair per end-tag seen so far,
  ;; innermost first.  QNAME is nil for a partial end-tag, which then
  ;; matches any start-tag.  Each `cond' clause yields non-nil to keep
  ;; scanning and nil to stop.
  (let ((open-tags (and up t))
        token-end found)
    (save-excursion
      (goto-char from)
      (while (cond ((or (< (point) nxml-prolog-end)
                        (not (search-backward "<"
                                              (max (or bound 0)
                                                   nxml-prolog-end)
                                              t)))
                    ;; Nothing more to scan.  An unmatched end-tag is
                    ;; only reported when the scan was not cut short
                    ;; by BOUND.
                    (when (and (consp open-tags) (not bound))
                      (nxml-scan-error (cadr open-tags)
                                       "End-tag has no start-tag"))
                    nil)
                   ((progn
                      ;; Step out of any comment/CDATA/PI holding the
                      ;; `<', then tokenize forward once to classify
                      ;; the token and remember where it ends.
                      (nxml-move-outside-backwards)
                      (save-excursion
                        (nxml-tokenize-forward)
                        (setq token-end (point)))
                      (eq xmltok-type 'end-tag))
                    (setq open-tags
                          (cons (xmltok-end-tag-qname)
                                (cons xmltok-start open-tags)))
                    t)
                   ((eq xmltok-type 'start-tag)
                    (cond ((not open-tags) nil)
                          ;; Only the UP sentinel is left: this is the
                          ;; start-tag of the enclosing element.
                          ((not (consp open-tags))
                           (setq found token-end)
                           nil)
                          ((and (car open-tags)
                                (not (string= (car open-tags)
                                              (xmltok-start-tag-qname))))
                           (nxml-scan-error (1+ xmltok-start)
                                            "Mismatched start-tag; \
expected `%s'"
                                            (car open-tags)))
                          ;; Pop the matched pair; keep going while
                          ;; something is still pending.
                          ((setq open-tags (cddr open-tags)) t)
                          (t (setq found token-end) nil)))
                   ((memq xmltok-type '(empty-element
                                        partial-empty-element))
                    (if open-tags
                        t
                      (setq found token-end)
                      nil))
                   ((eq xmltok-type 'partial-end-tag)
                    (setq open-tags
                          (cons nil (cons xmltok-start open-tags)))
                    t)
                   ((eq xmltok-type 'partial-start-tag)
                    ;; if we have only a partial-start-tag
                    ;; then it's unlikely that there's a matching
                    ;; end-tag, so it's probably not helpful
                    ;; to treat it as a complete start-tag
                    (nxml-scan-error xmltok-start
                                     "Missing `>'"))
                   ;; Any other token type is skipped.
                   (t t))))
    found))

(defun nxml-scan-error (&rest args)
  "Signal an `nxml-scan-error' whose error data is ARGS.
The callers in this file pass a buffer position, a format string
and any arguments for that format string."
  (signal 'nxml-scan-error args))

;; `nxml-scan-error' is declared as a child of `nxml-error', so a
;; handler for either condition catches it.
(define-error 'nxml-scan-error
  "Scan over element that is not well-formed" 'nxml-error)

(provide 'nxml-rap)

;;; nxml-rap.el ends here