nxml-parse.el 11.6 KB
Newer Older
Mark A. Hershberger's avatar
Mark A. Hershberger committed
1 2
;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode

Glenn Morris's avatar
Glenn Morris committed
3
;; Copyright (C) 2003, 2007, 2008 Free Software Foundation, Inc.
Mark A. Hershberger's avatar
Mark A. Hershberger committed
4 5 6 7

;; Author: James Clark
;; Keywords: XML

Glenn Morris's avatar
Glenn Morris committed
8
;; This file is part of GNU Emacs.
Mark A. Hershberger's avatar
Mark A. Hershberger committed
9

Glenn Morris's avatar
Glenn Morris committed
10 11 12 13
;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
Mark A. Hershberger's avatar
Mark A. Hershberger committed
14

Glenn Morris's avatar
Glenn Morris committed
15 16 17 18 19 20 21 22 23
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
Mark A. Hershberger's avatar
Mark A. Hershberger committed
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321

;;; Commentary:

;; Entry point is `nxml-parse-file'.

;;; Code:

(require 'nxml-util)
(require 'xmltok)
(require 'nxml-enc)
(require 'nxml-ns)

(defvar nxml-parse-file-name nil)

(defvar nxml-validate-function nil
  "Nil or a function to be called by `nxml-parse-file' to perform validation.
The function will be called once for each start-tag or end-tag.  The
function is passed two arguments TEXT and START-TAG.  For a start-tag,
START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in
the same form as returned by `nxml-parse-file.  For an end-tag,
START-TAG is nil.  TEXT is a string containing the text immediately
preceding the tag, or nil if there was no such text.  An empty element
is treated as a start-tag followed by an end-tag.

For a start-tag, the namespace state will be the state after
processing the namespace declarations in the start-tag. For an
end-tag, the namespace state will be the state before popping the
namespace declarations for the corresponding start-tag.

The function must return nil if no error is detected or a
cons (MESSAGE . LOCATION) where MESSAGE is a string containing
an error message and LOCATION indicates what caused the error
as follows:

- nil indicates the tag as whole caused it; this is always allowed;

- text indicates the text caused it; this is allowed only if
TEXT is non-nil;

- tag-close indicates the close of the tag caused it; this is
allowed only if START-TAG is non-nil;

- (attribute-name . N) indicates that the name of the Nth attribute
caused it; N counts from 0; this is allowed only if START-TAG is non-nil
and N must be less than the number of attributes;

- (attribute-value . N) indicates that the value of the Nth attribute
caused it; N counts from 0; this is allowed only if START-TAG is non-nil
and N must be less than the number of attributes.")

(defun nxml-parse-file (file)
  "Parse the XML document in FILE and return it as a list.
An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN).
NAME is either a string, in the case where the name does not have a
namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a
symbol and LOCAL-NAME is a string, in the case where the name does
have a namespace. NAMESPACE is a keyword whose name is `:URI', where
URI is the namespace name.  ATTRIBUTES is an alist of attributes where
each attribute has the form (NAME . VALUE), where NAME has the same
form as an element name, and VALUE is a string.  A namespace
declaration is represented as an attribute whose name is
\(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME).  CHILDREN is a list
containing strings and child elements; CHILDREN never contains two
consecutive strings and never contains an empty string.  Processing
instructions and comments are not represented.  The return value is a
list representing the document element.

If the XML document is not well-formed, an error having the condition
`nxml-file-parse-error' will be signaled; the error data will be a
list of the \(FILE POSITION MESSAGE), where POSITION is an integer
specifying the position where the error was detected, and MESSAGE is a
string describing the error.

The current contents of FILE will be parsed even if there is a
modified buffer currently visiting FILE.

If the variable `nxml-validation-function' is non-nil, it will be
called twice for each element, and any reported error will be signaled
in the same way as well-formedness error."
  (save-excursion
    (set-buffer (nxml-parse-find-file file))
    (unwind-protect
	(let ((nxml-parse-file-name file))
	  (nxml-parse-instance))
      (kill-buffer nil))))

(defun nxml-parse-find-file (file)
  (save-excursion
    (set-buffer (get-buffer-create " *nXML Parse*"))
    (erase-buffer)
    (let ((set-auto-coding-function 'nxml-set-xml-coding))
      (insert-file-contents file))
    (current-buffer)))
      
(defun nxml-parse-instance ()
  (let (xmltok-dtd)
    (xmltok-save
      (xmltok-forward-prolog)
      (nxml-check-xmltok-errors)
      (nxml-ns-save
	(nxml-parse-instance-1)))))

(defun nxml-parse-instance-1 ()
  (let* ((top (cons nil nil))
	 ;; tail is a cons cell, whose cdr is nil
	 ;; additional elements will destructively appended to tail
	 (tail top)
	 ;; stack of tails one for each open element
	 tail-stack
	 ;; list of QNames of open elements
	 open-element-tags
	 ;; list of strings buffering a text node, in reverse order
	 text
	 ;; position of beginning of first (in buffer) string in text
	 text-pos)
    (while (xmltok-forward)
      (nxml-check-xmltok-errors)
      (cond ((memq xmltok-type '(start-tag end-tag empty-element))
	     (when text
	       (setq text (apply 'concat (nreverse text)))
	       (setcdr tail (cons text nil))
	       (setq tail (cdr tail)))
	     (when (not (eq xmltok-type 'end-tag))
	       (when (and (not open-element-tags)
			  (not (eq tail top)))
		 (nxml-parse-error nil "Multiple top-level elements"))
	       (setq open-element-tags
		     (cons (xmltok-start-tag-qname)
			   open-element-tags))
	       (nxml-ns-push-state)
	       (let ((tag (nxml-parse-start-tag)))
		 (nxml-validate-tag text text-pos tag)
		 (setq text nil)
		 (setcdr tail (cons tag nil))
		 (setq tail (cdr tail))
		 (setq tail-stack (cons tail tail-stack))
		 (setq tail (last tag))))
	     (when (not (eq xmltok-type 'start-tag))
	       (or (eq xmltok-type 'empty-element)
		   (equal (car open-element-tags)
			  (xmltok-end-tag-qname))
		   (if open-element-tags
		       (nxml-parse-error nil
					 "Unbalanced end-tag; expected </%s>"
					 (car open-element-tags))
		     (nxml-parse-error nil "Extra end-tag")))
	       (nxml-validate-tag text text-pos nil)
	       (setq text nil)
	       (nxml-ns-pop-state)
	       (setq open-element-tags (cdr open-element-tags))
	       (setq tail (car tail-stack))
	       (setq tail-stack (cdr tail-stack)))
	     (setq text-pos nil))
	    ((memq xmltok-type '(space data entity-ref char-ref cdata-section))
	     (cond (open-element-tags
		    (unless text-pos
		      (setq text-pos xmltok-start))
		    (setq text
			  (cons (nxml-current-text-string) text)))
		   ((not (eq xmltok-type 'space))
		    (nxml-parse-error
		     nil
		     "%s at top-level"
		     (cdr (assq xmltok-type
				'((data . "Text characters")
				  (entity-ref . "Entity reference")
				  (char-ref . "Character reference")
				  (cdata-section . "CDATA section"))))))))))
    (unless (cdr top)
      (nxml-parse-error (point-max) "Missing document element"))
    (cadr top)))

(defun nxml-parse-start-tag ()
  (let (parsed-attributes
	parsed-namespace-attributes
	atts att prefixes prefix ns value name)
    (setq atts xmltok-namespace-attributes)
    (while atts
      (setq att (car atts))
      (setq value (or (xmltok-attribute-value att)
		      (nxml-parse-error nil "Invalid attribute value")))
      (setq ns (nxml-make-namespace value))
      (setq prefix (and (xmltok-attribute-prefix att)
			(xmltok-attribute-local-name att)))
      (cond ((member prefix prefixes)
	     (nxml-parse-error nil "Duplicate namespace declaration"))
	    ((not prefix)
	     (nxml-ns-set-default ns))
	    (ns
	     (nxml-ns-set-prefix prefix ns))
	    (t (nxml-parse-error nil "Cannot undeclare namespace prefix")))
      (setq prefixes (cons prefix prefixes))
      (setq parsed-namespace-attributes
	    (cons (cons (nxml-make-name nxml-xmlns-namespace-uri
					(xmltok-attribute-local-name att))
			value)
		  parsed-namespace-attributes))
      (setq atts (cdr atts)))
    (setq name
	  (nxml-make-name
	   (let ((prefix (xmltok-start-tag-prefix)))
	     (if prefix
		 (or (nxml-ns-get-prefix prefix)
		     (nxml-parse-error (1+ xmltok-start)
				       "Prefix `%s' undeclared"
				       prefix))
	       (nxml-ns-get-default)))
	   (xmltok-start-tag-local-name)))
    (setq atts xmltok-attributes)
    (while atts
      (setq att (car atts))
      (setq ns
	    (let ((prefix (xmltok-attribute-prefix att)))
	      (and prefix
		   (or (nxml-ns-get-prefix prefix)
		       (nxml-parse-error (xmltok-attribute-name-start att)
					 "Prefix `%s' undeclared"
					 prefix)))))
      (setq parsed-attributes
	    (let ((nm (nxml-make-name ns
				      (xmltok-attribute-local-name att))))
	      (when (assoc nm parsed-attributes)
		(nxml-parse-error (xmltok-attribute-name-start att)
				  "Duplicate attribute"))
	      (cons (cons nm (or (xmltok-attribute-value att)
				 (nxml-parse-error nil "Invalid attribute value")))
		    parsed-attributes)))
      (setq atts (cdr atts)))
    ;; We want to end up with the attributes followed by the
    ;; the namespace attributes in the same order as
    ;; xmltok-attributes and xmltok-namespace-attributes respectively.
    (when parsed-namespace-attributes
      (setq parsed-attributes
	    (nconc parsed-namespace-attributes parsed-attributes)))
    (list name (nreverse parsed-attributes))))

(defun nxml-validate-tag (text text-pos tag)
  (when nxml-validate-function
    (let ((err (funcall nxml-validate-function text tag))
	  pos)
      (when err
	(setq pos (nxml-validate-error-position (cdr err)
						(and text text-pos)
						tag))
	(or pos (error "Incorrect return value from %s"
		       nxml-validate-function))
	(nxml-parse-error pos (car err))))))

(defun nxml-validate-error-position (location text-pos tag)
  (cond ((null location) xmltok-start)
	((eq location 'text)  text-pos)
	((eq location 'tag-close)
	 (and tag (- (point) (if (eq xmltok-type 'empty-element ) 2 1))))
	((consp location)
	 (let ((att (nth (cdr location) xmltok-attributes)))
	   (when (not att)
	     (setq att (nth (- (cdr location) (length xmltok-attributes))
			    xmltok-namespace-attributes)))
	   (cond ((not att))
		 ((eq (car location) 'attribute-name)
		  (xmltok-attribute-name-start att))
		 ((eq (car location) 'attribute-value)
		  (xmltok-attribute-value-start att)))))))

(defun nxml-make-name (ns local-name)
  (if ns
      (cons ns local-name)
    local-name))

(defun nxml-current-text-string ()
  (cond ((memq xmltok-type '(space data))
	 (buffer-substring-no-properties xmltok-start
					 (point)))
	((eq xmltok-type 'cdata-section)
	 (buffer-substring-no-properties (+ xmltok-start 9)
					 (- (point) 3)))
	((memq xmltok-type '(char-ref entity-ref))
	 (unless xmltok-replacement
	   (nxml-parse-error nil
			     (if (eq xmltok-type 'char-ref)
				 "Reference to unsupported Unicode character"
			       "Unresolvable entity reference")))
	 xmltok-replacement)))

(defun nxml-parse-error (position &rest args)
  (nxml-signal-file-parse-error nxml-parse-file-name
				(or position xmltok-start)
				(apply 'format args)))

(defun nxml-check-xmltok-errors ()
  (when xmltok-errors
    (let ((err (car (last xmltok-errors))))
      (nxml-signal-file-parse-error nxml-parse-file-name
				    (xmltok-error-start err)
				    (xmltok-error-message err)))))

(provide 'nxml-parse)

Miles Bader's avatar
Miles Bader committed
322
;; arch-tag: fc19639b-1bff-4673-9992-f539da89ba1e
Mark A. Hershberger's avatar
Mark A. Hershberger committed
323
;;; nxml-parse.el ends here