Commit f3c3dee6 authored by Vinicius Jose Latorre's avatar Vinicius Jose Latorre
Browse files

Parser for EBNF used to specify XML (EBNFX)

parent 9d59cbb0
;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
;; Copyright (C) 2004 Free Sofware Foundation, Inc.
;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
;; Time-stamp: <2004/03/22 08:53:21 vinicius>
;; Keywords: wp, ebnf, PostScript
;; Version: 1.0
;; This file is part of GNU Emacs.
;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING. If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.
;;; Commentary:
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;;
;; This is part of ebnf2ps package.
;;
;; This package defines a parser for EBNF used to specify XML (EBNFX).
;;
;; See ebnf2ps.el for documentation.
;;
;;
;; EBNFX Syntax
;; ------------
;;
;; See the URL:
;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
;; (Extensible Markup Language (XML) 1.0 (Third Edition))
;;
;;
;; rule ::= symbol '::=' expression
;; /* rules are separated by at least one blank line. */
;;
;; expression ::= concatenation ('|' concatenation)*
;;
;; concatenation ::= exception*
;;
;; exception ::= term ('-' term)?
;;
;; term ::= factor ('*' | '+' | '?')?
;;
;; factor ::= hex-char+
;; | '[' '^'? ( char ( '-' char )? )+ ']'
;; | '"' 'string' '"'
;; | "'" "string" "'"
;; | '(' expression ')'
;; | symbol
;;
;; symbol ::= 'upper or lower case letter'
;; ('upper or lower case letter' | '-' | '_')*
;; /* upper and lower 8-bit accentuated characters are included */
;;
;; hex-char ::= '#x' [0-9A-Fa-f]+
;;
;; char ::= hex-char | 'any character except control characters'
;; /* 8-bit accentuated characters are included */
;;
;; any-char ::= char | 'newline' | 'tab'
;;
;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
;;
;; comment ::= '/*' ( any-char - '*/' ) '*/'
;;
;;
;; Below is the Notation section extracted from the URL cited above.
;;
;; 6 Notation
;;
;; The formal grammar of XML is given in this specification using a simple
;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
;; one symbol, in the form
;;
;; symbol ::= expression
;;
;; Symbols are written with an initial capital letter if they are the start
;; symbol of a regular language, otherwise with an initial lowercase letter.
;; Literal strings are quoted.
;;
;; Within the expression on the right-hand side of a rule, the following
;; expressions are used to match strings of one or more characters:
;;
;; #xN
;;
;; where N is a hexadecimal integer, the expression matches the character
;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
;; zeros in the #xN form is insignificant.
;;
;; [a-zA-Z], [#xN-#xN]
;;
;; matches any Char with a value in the range(s) indicated (inclusive).
;;
;; [abc], [#xN#xN#xN]
;;
;; matches any Char with a value among the characters enumerated.
;; Enumerations and ranges can be mixed in one set of brackets.
;;
;; [^a-z], [^#xN-#xN]
;;
;; matches any Char with a value outside the range indicated.
;;
;; [^abc], [^#xN#xN#xN]
;;
;; matches any Char with a value not among the characters given.
;; Enumerations and ranges of forbidden values can be mixed in one set of
;; brackets.
;;
;; "string"
;;
;; matches a literal string matching that given inside the double quotes.
;;
;; 'string'
;;
;; matches a literal string matching that given inside the single quotes.
;;
;; These symbols may be combined to match more complex patterns as follows,
;; where A and B represent simple expressions:
;;
;; (expression)
;;
;; expression is treated as a unit and may be combined as described in this
;; list.
;;
;; A?
;;
;; matches A or nothing; optional A.
;;
;; A B
;;
;; matches A followed by B. This operator has higher precedence than
;; alternation; thus A B | C D is identical to (A B) | (C D).
;;
;; A | B
;;
;; matches A or B.
;;
;; A - B
;;
;; matches any string that matches A but does not match B.
;;
;; A+
;;
;; matches one or more occurrences of A. Concatenation has higher
;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
;;
;; A*
;;
;; matches zero or more occurrences of A. Concatenation has higher
;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
;;
;; Other notations used in the productions are:
;;
;; /* ... */
;;
;; comment.
;;
;; [ wfc: ... ]
;;
;; well-formedness constraint; this identifies by name a constraint on
;; well-formed documents associated with a production.
;;
;; [ vc: ... ]
;;
;; validity constraint; this identifies by name a constraint on valid
;; documents associated with a production.
;;
;;
;; Differences Between EBNFX And ebnf2ps EBNFX
;; -------------------------------------------
;;
;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the
;; underscore (_) and minus (-) for rule name and european 8-bit accentuated
;; characters (from \240 to \377) for rule name, string and comment. Also
;; rule name can start with upper case letter.
;;
;;
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Code:
(require 'ebnf-otz)
(defvar ebnf-ebx-lex nil
"Value returned by `ebnf-ebx-lex' function.")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Syntactic analyzer
;;; rulelist ::= rule+
(defun ebnf-ebx-parser (start)
"EBNFX parser."
(let ((total (+ (- ebnf-limit start) 1))
(bias (1- start))
(origin (point))
rule-list token rule)
(goto-char start)
(setq token (ebnf-ebx-lex))
(and (eq token 'end-of-input)
(error "Invalid EBNFX file format"))
(and (eq token 'end-of-rule)
(setq token (ebnf-ebx-lex)))
(while (not (eq token 'end-of-input))
(ebnf-message-float
"Parsing...%s%%"
(/ (* (- (point) bias) 100.0) total))
(setq token (ebnf-ebx-rule token)
rule (cdr token)
token (car token))
(or (ebnf-add-empty-rule-list rule)
(setq rule-list (cons rule rule-list))))
(goto-char origin)
rule-list))
;;; rule ::= symbol '::=' expression
(defun ebnf-ebx-rule (token)
(let ((name ebnf-ebx-lex)
(action ebnf-action)
elements)
(setq ebnf-action nil)
(or (eq token 'non-terminal)
(error "Invalid rule name"))
(setq token (ebnf-ebx-lex))
(or (eq token 'production)
(error "Invalid rule: missing `::='"))
(setq elements (ebnf-ebx-expression))
(or (memq (car elements) '(end-of-rule end-of-input))
(error "Invalid rule: there is no end of rule"))
(setq elements (cdr elements))
(ebnf-eps-add-production name)
(cons (ebnf-ebx-lex)
(ebnf-make-production name elements action))))
;; expression ::= concatenation ('|' concatenation)*
(defun ebnf-ebx-expression ()
(let (body concatenation)
(while (eq (car (setq concatenation
(ebnf-ebx-concatenation (ebnf-ebx-lex))))
'alternative)
(setq body (cons (cdr concatenation) body)))
(ebnf-token-alternative body concatenation)))
;; concatenation ::= exception*
(defun ebnf-ebx-concatenation (token)
(let ((term (ebnf-ebx-exception token))
seq)
(or (setq token (car term)
term (cdr term))
(error "Empty element"))
(setq seq (cons term seq))
(while (setq term (ebnf-ebx-exception token)
token (car term)
term (cdr term))
(setq seq (cons term seq)))
(cons token
(if (= (length seq) 1)
;; sequence with only one element
(car seq)
;; a real sequence
(ebnf-make-sequence (nreverse seq))))))
;;; exception ::= term ('-' term)?
(defun ebnf-ebx-exception (token)
(let ((term (ebnf-ebx-term token)))
(if (eq (car term) 'exception)
(let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
(cons (car except)
(ebnf-make-except (cdr term) (cdr except))))
term)))
;;; term ::= factor ('*' | '+' | '?')?
(defun ebnf-ebx-term (token)
(let ((factor (ebnf-ebx-factor token)))
(when factor
(setq token (ebnf-ebx-lex))
(cond ((eq token 'zero-or-more)
(setq factor (ebnf-make-zero-or-more factor)
token (ebnf-ebx-lex)))
((eq token 'one-or-more)
(setq factor (ebnf-make-one-or-more factor)
token (ebnf-ebx-lex)))
((eq token 'optional)
(setq factor (ebnf-token-optional factor)
token (ebnf-ebx-lex)))))
(cons token factor)))
;;; factor ::= hex-char+
;;; | '[' '^'? ( char ( '-' char )? )+ ']'
;;; | '"' 'string' '"'
;;; | "'" "string" "'"
;;; | '(' expression ')'
;;; | symbol
;;;
;;; symbol ::= 'upper or lower case letter'
;;; ('upper or lower case letter' | '-' | '_')*
;;; /* upper and lower 8-bit accentuated characters are included */
;;;
;;; hex-char ::= '#x' [0-9A-Fa-f]+
;;;
;;; char ::= hex-char | 'any character except control characters'
;;; /* 8-bit accentuated characters are included */
;;;
;;; any-char ::= char | 'newline' | 'tab'
(defun ebnf-ebx-factor (token)
(cond
;; terminal
((eq token 'terminal)
(ebnf-make-terminal ebnf-ebx-lex))
;; non-terminal
((eq token 'non-terminal)
(ebnf-make-non-terminal ebnf-ebx-lex))
;; group
((eq token 'begin-group)
(let ((body (ebnf-ebx-expression)))
(or (eq (car body) 'end-group)
(error "Missing `)'"))
(cdr body)))
;; no element
(t
nil)
))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Lexical analyzer
(defconst ebnf-ebx-token-table (make-vector 256 'error)
"Vector used to map characters to a lexical token.")
(defun ebnf-ebx-initialize ()
"Initialize EBNFX token table."
;; control character & control 8-bit character are set to `error'
(let ((char ?\101))
;; printable character: A-Z
(while (< char ?\133)
(aset ebnf-ebx-token-table char 'non-terminal)
(setq char (1+ char)))
;; printable character: a-z
(setq char ?\141)
(while (< char ?\173)
(aset ebnf-ebx-token-table char 'non-terminal)
(setq char (1+ char)))
;; European 8-bit accentuated characters:
(setq char ?\240)
(while (< char ?\400)
(aset ebnf-ebx-token-table char 'non-terminal)
(setq char (1+ char)))
;; Override end of line characters:
(aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed
(aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return
;; Override space characters:
(aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab
(aset ebnf-ebx-token-table ?\t 'space) ; [HT] horizontal tab
(aset ebnf-ebx-token-table ?\ 'space) ; [SP] space
;; Override form feed character:
(aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed
;; Override other lexical characters:
(aset ebnf-ebx-token-table ?# 'hash)
(aset ebnf-ebx-token-table ?\" 'double-quote)
(aset ebnf-ebx-token-table ?\' 'single-quote)
(aset ebnf-ebx-token-table ?\( 'begin-group)
(aset ebnf-ebx-token-table ?\) 'end-group)
(aset ebnf-ebx-token-table ?- 'exception)
(aset ebnf-ebx-token-table ?: 'colon)
(aset ebnf-ebx-token-table ?\[ 'begin-square)
(aset ebnf-ebx-token-table ?| 'alternative)
(aset ebnf-ebx-token-table ?* 'zero-or-more)
(aset ebnf-ebx-token-table ?+ 'one-or-more)
(aset ebnf-ebx-token-table ?\? 'optional)
;; Override comment character:
(aset ebnf-ebx-token-table ?/ 'comment)))
;; replace the range "\240-\377" (see `ebnf-range-regexp').
(defconst ebnf-ebx-non-terminal-chars
(ebnf-range-regexp "-_A-Za-z" ?\240 ?\377))
(defconst ebnf-ebx-non-terminal-letter-chars
(ebnf-range-regexp "A-Za-z" ?\240 ?\377))
(defun ebnf-ebx-lex ()
"Lexical analyser for EBNFX.
Return a lexical token.
See documentation for variable `ebnf-ebx-lex'."
(if (>= (point) ebnf-limit)
'end-of-input
(let (token)
;; skip spaces and comments
(while (if (> (following-char) 255)
(progn
(setq token 'error)
nil)
(setq token (aref ebnf-ebx-token-table (following-char)))
(cond
((eq token 'space)
(skip-chars-forward " \013\t" ebnf-limit)
(< (point) ebnf-limit))
((eq token 'comment)
(ebnf-ebx-skip-comment))
((eq token 'form-feed)
(forward-char)
(setq ebnf-action 'form-feed))
((eq token 'end-of-rule)
(ebnf-ebx-skip-end-of-rule))
((and (eq token 'begin-square)
(let ((case-fold-search t))
(looking-at "\\[\\(wfc\\|vc\\):")))
(ebnf-ebx-skip-constraint))
(t nil)
)))
(cond
;; end of input
((>= (point) ebnf-limit)
'end-of-input)
;; error
((eq token 'error)
(error "Illegal character"))
;; end of rule
((eq token 'end-of-rule)
'end-of-rule)
;; terminal: #x [0-9A-Fa-f]+
((eq token 'hash)
(setq ebnf-ebx-lex (ebnf-ebx-character))
'terminal)
;; terminal: "string"
((eq token 'double-quote)
(setq ebnf-ebx-lex (ebnf-ebx-string ?\"))
'terminal)
;; terminal: 'string'
((eq token 'single-quote)
(setq ebnf-ebx-lex (ebnf-ebx-string ?\'))
'terminal)
;; terminal: [ ^? ( char ( - char )? )+ ]
((eq token 'begin-square)
(setq ebnf-ebx-lex (ebnf-ebx-range))
'terminal)
;; non-terminal: NAME
((eq token 'non-terminal)
(setq ebnf-ebx-lex
(ebnf-buffer-substring ebnf-ebx-non-terminal-chars))
'non-terminal)
;; colon: ::=
((eq token 'colon)
(or (looking-at "::=")
(error "Missing `::=' token"))
(forward-char 3)
'production)
;; miscellaneous: (, ), *, +, ?, |, -
(t
(forward-char)
token)
))))
;; replace the range "\177-\237" (see `ebnf-range-regexp').
(defconst ebnf-ebx-constraint-chars
(ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237))
(defun ebnf-ebx-skip-constraint ()
(or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0)
(error "Invalid character"))
(or (= (following-char) ?\])
(error "Missing end of constraint `]'"))
(forward-char)
t)
(defun ebnf-ebx-skip-end-of-rule ()
(let (eor-p)
(while (progn
;; end of rule ==> 2 or more consecutive end of lines
(setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1)
eor-p))
;; skip spaces
(skip-chars-forward " \013\t" ebnf-limit)
;; skip comments
(and (= (following-char) ?/)
(ebnf-ebx-skip-comment))))
(not eor-p)))
;; replace the range "\177-\237" (see `ebnf-range-regexp').
(defconst ebnf-ebx-comment-chars
(ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237))
(defconst ebnf-ebx-filename-chars
(ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237))
(defun ebnf-ebx-skip-comment ()
(forward-char)
(or (= (following-char) ?*)
(error "Invalid beginning of comment"))
(forward-char)
(cond
;; open EPS file
((and ebnf-eps-executing (= (following-char) ?\[))
(ebnf-eps-add-context (ebnf-ebx-eps-filename)))
;; close EPS file
((and ebnf-eps-executing (= (following-char) ?\]))
(ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
;; any other action in comment
(t
(setq ebnf-action (aref ebnf-comment-table (following-char))))
)
(while (progn
(skip-chars-forward ebnf-ebx-comment-chars ebnf-limit)
(or (= (following-char) ?*)
(error "Missing end of comment"))
(forward-char)
(and (/= (following-char) ?/)
(< (point) ebnf-limit))))
;; check for a valid end of comment
(and (>= (point) ebnf-limit)
(error "Missing end of comment"))
(forward-char)
t)
(defun ebnf-ebx-eps-filename ()
(forward-char)
(let (fname nchar)
(while (progn
(setq fname
(concat fname
(ebnf-buffer-substring ebnf-ebx-filename-chars)))
(and (< (point) ebnf-limit)
(> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0)
(< (point) ebnf-limit)
(/= (following-char) ?/)))
(setq fname (concat fname (make-string nchar ?*))
nchar nil))
(if (or (not nchar) (= nchar 0))
fname
(and (< (point) ebnf-limit)
(= (following-char) ?/)
(setq nchar (1- nchar)))
(concat fname (make-string nchar ?*)))))
;; replace the range "\240-\377" (see `ebnf-range-regexp').
(defconst ebnf-ebx-double-string-chars
(ebnf-range-regexp "\t -!#-~" ?\240 ?\377))
(defconst ebnf-ebx-single-string-chars
(ebnf-range-regexp "\t -&(-~" ?\240 ?\377))
(defun ebnf-ebx-string (delim)
(buffer-substring-no-properties
(progn
(forward-char)
(point))
(progn
(skip-chars-forward (if (= delim ?\")
ebnf-ebx-double-string-chars
ebnf-ebx-single-string-chars)
ebnf-limit)
(or (= (following-char) delim)
(error "Missing string delimiter `%c'" delim))
(prog1
(point)
(forward-char)))))
(defun ebnf-ebx-character ()
;; #x [0-9A-Fa-f]+
(buffer-substring-no-properties
(point)
(progn
(ebnf-ebx-hex-character)
(point))))
(defun ebnf-ebx-range ()
;; [ ^? ( char ( - char )? )+ ]
(buffer-substring-no-properties
(point)
(progn
(forward-char)
(and (= (following-char) ?^)
(forward-char))
(and (= (following-char) ?-)
(forward-char))
(while (progn
(ebnf-ebx-any-character)
(when (= (following-char) ?-)
(forward-char)
(ebnf-ebx-any-character))
(and (/= (following-char) ?\])
(< (point) ebnf-limit))))
(and (>= (point) ebnf-limit)
(error "Missing end of character range `]'"))
(forward-char)
(point))))