Commit ce7b18ec authored by Chong Yidong's avatar Chong Yidong
Browse files

Improve RFC 3986 conformance of url package.

Fix 2012-04-10 change to url.el.

* url-http.el (url-http-create-request): Ignore obsolete
attributes slot of url-object.

* url-parse.el: Improve RFC 3986 conformance.
(url-generic-parse-url): Do not populate the ATTRIBUTES slot,
since this is not reliable for general RFC 3986 URIs.  Keep the
whole path and query inside the FILENAME slot.  Improve docstring.
(url-recreate-url-attributes): Mark as obsolete.
(url-recreate-url): Handle missing scheme and userinfo.

* url-util.el (url-encode-url): New function for URL quoting.
(url-encoding-table, url-host-allowed-chars)
(url-path-allowed-chars): New constants.
(url--allowed-chars): New helper function.
(url-hexify-string): Use them.

* url-vars.el (url-nonrelative-link): Make the regexp stricter.

* url.el (url-retrieve-internal): Use url-encode-url.

Fixes: debbugs:7017
parent 66b03a53
......@@ -202,6 +202,18 @@ server properties.
*** New command `tabulated-list-sort', bound to `S', sorts the column
at point, or the Nth column if a numeric prefix argument is given.
** URL
*** Structs made by `url-generic-parse-url' have nil `attributes' slot.
Previously, this slot stored semicolon-separated attribute-value pairs
appended to some imap URLs, but this is not compatible with RFC 3986.
So now the `filename' slot stores the entire path and query components
and the `attributes' slot is always nil.
*** New function `url-encode-url' for encoding a URI string.
The `url-retrieve' function now uses this to encode its URL argument,
in case that is not properly encoded.
** Obsolete packages:
*** assoc.el
......
2012-05-09 Chong Yidong <cyd@gnu.org>
* url-util.el (url-encode-url): New function for URL quoting.
(url-encoding-table, url-host-allowed-chars)
(url-path-allowed-chars): New constants.
(url--allowed-chars): New helper function.
(url-hexify-string): Use them.
* url-parse.el: Improve RFC 3986 conformance.
(url-generic-parse-url): Do not populate the ATTRIBUTES slot,
since this is not reliable for general RFC 3986 URIs. Keep the
whole path and query inside the FILENAME slot. Improve docstring.
(url-recreate-url-attributes): Mark as obsolete.
(url-recreate-url): Handle missing scheme and userinfo.
* url-http.el (url-http-create-request): Ignore obsolete
attributes slot of url-object.
* url-vars.el (url-nonrelative-link): Make the regexp stricter.
* url.el (url-retrieve-internal): Use url-encode-url (Bug#7017).
2012-04-26 Stefan Monnier <monnier@iro.umontreal.ca>
* url.el (url-retrieve-synchronously): Replace lexical-let by
......
......@@ -223,8 +223,7 @@ request.")
(let ((url-basic-auth-storage
'url-http-proxy-basic-auth-storage))
(url-get-authentication url-http-target-url nil 'any nil))))
(real-fname (concat (url-filename url-http-target-url)
(url-recreate-url-attributes url-http-target-url)))
(real-fname (url-filename url-http-target-url))
(host (url-host url-http-target-url))
(auth (if (cdr-safe (assoc "Authorization" url-http-extra-headers))
nil
......
......@@ -48,21 +48,31 @@
;;;###autoload
(defun url-recreate-url (urlobj)
"Recreate a URL string from the parsed URLOBJ."
(concat (url-type urlobj) ":" (if (url-host urlobj) "//" "")
(if (url-user urlobj)
(concat (url-user urlobj)
(if (url-password urlobj)
(concat ":" (url-password urlobj)))
"@"))
(url-host urlobj)
(if (and (url-port urlobj)
(not (equal (url-port urlobj)
(url-scheme-get-property (url-type urlobj) 'default-port))))
(format ":%d" (url-port urlobj)))
(or (url-filename urlobj) "/")
(url-recreate-url-attributes urlobj)
(if (url-target urlobj)
(concat "#" (url-target urlobj)))))
(let ((type (url-type urlobj))
(user (url-user urlobj))
(pass (url-password urlobj))
(host (url-host urlobj))
(port (url-portspec urlobj))
(file (url-filename urlobj))
(frag (url-target urlobj)))
(concat (if type (concat type ":"))
(if (url-fullness urlobj) "//")
(if (or user pass)
(concat user
(if pass (concat ":" pass))
"@"))
host
;; RFC 3986: "omit the port component and its : delimiter
;; if port is empty or if its value would be the same as
;; that of the scheme's default."
(and port
(or (null type)
(not (equal port
(url-scheme-get-property type
'default-port))))
(format ":%d" (url-port urlobj)))
(or file "/")
(if frag (concat "#" frag)))))
(defun url-recreate-url-attributes (urlobj)
"Recreate the attributes of an URL string from the parsed URLOBJ."
......@@ -73,107 +83,129 @@
(concat (car x) "=" (cdr x))
(car x)))
(url-attributes urlobj) ";"))))
(make-obsolete 'url-recreate-url-attributes nil "24.2")
;;;###autoload
(defun url-generic-parse-url (url)
"Return an URL-struct of the parts of URL.
The CL-style struct contains the following fields:
TYPE USER PASSWORD HOST PORTSPEC FILENAME TARGET ATTRIBUTES FULLNESS."
;; See RFC 3986.
(cond
((null url)
(url-parse-make-urlobj))
((or (not (string-match url-nonrelative-link url))
(= ?/ (string-to-char url)))
;; This isn't correct, as a relative URL can be a fragment link
;; (e.g. "#foo") and many other things (see section 4.2).
;; However, let's not fix something that isn't broken, especially
;; when close to a release.
(url-parse-make-urlobj nil nil nil nil nil url))
(t
TYPE is the URI scheme (string or nil).
USER is the user name (string or nil).
PASSWORD is the password (string [deprecated] or nil).
HOST is the host (a registered name, IP literal in square
brackets, or IPv4 address in dotted-decimal form).
PORTSPEC is the specified port (a number), or nil.
FILENAME is the path AND the query component of the URI.
TARGET is the fragment identifier component (used to refer to a
subordinate resource, e.g. a part of a webpage).
ATTRIBUTES is nil; this slot originally stored the attribute and
value alists for IMAP URIs, but this feature was removed
since it conflicts with RFC 3986.
FULLNESS is non-nil iff the authority component of the URI is
present.
The parser follows RFC 3986, except that it also tries to handle
URIs that are not fully specified (e.g. lacking TYPE), and it
does not check for or perform %-encoding.
Here is an example. The URL
foo://bob:pass@example.com:42/a/b/c.dtb?type=animal&name=narwhal#nose
parses to
TYPE = \"foo\"
USER = \"bob\"
PASSWORD = \"pass\"
HOST = \"example.com\"
PORTSPEC = 42
FILENAME = \"/a/b/c.dtb?type=animal&name=narwhal\"
TARGET = \"nose\"
ATTRIBUTES = nil
FULLNESS = t"
(if (null url)
(url-parse-make-urlobj)
(with-temp-buffer
;; Don't let those temp-buffer modifications accidentally
;; deactivate the mark of the current-buffer.
(let ((deactivate-mark nil))
(set-syntax-table url-parse-syntax-table)
(let ((save-pos nil)
(prot nil)
(user nil)
(pass nil)
(host nil)
(port nil)
(file nil)
(refs nil)
(attr nil)
(full nil)
(erase-buffer)
(insert url)
(goto-char (point-min))
(let ((save-pos (point))
scheme user pass host port file fragment full
(inhibit-read-only t))
(erase-buffer)
(insert url)
(goto-char (point-min))
(setq save-pos (point))
;; 3.1. Scheme
(unless (looking-at "//")
(skip-chars-forward "a-zA-Z+.\\-")
(downcase-region save-pos (point))
(setq prot (buffer-substring save-pos (point)))
(skip-chars-forward ":")
(setq save-pos (point)))
;; This is nil for a URI that is not fully specified.
(when (looking-at "\\([a-zA-Z][-a-zA-Z0-9+.]*\\):")
(goto-char (match-end 0))
(setq save-pos (point))
(setq scheme (downcase (match-string 1))))
;; 3.2. Authority
(when (looking-at "//")
(setq full t)
(forward-char 2)
(setq save-pos (point))
(skip-chars-forward "^/")
(skip-chars-forward "^/?#")
(setq host (buffer-substring save-pos (point)))
;; 3.2.1 User Information
(if (string-match "^\\([^@]+\\)@" host)
(setq user (match-string 1 host)
host (substring host (match-end 0) nil)))
(if (and user (string-match "\\([^:]+\\):\\(.*\\)" user))
host (substring host (match-end 0))))
(if (and user (string-match "\\`\\([^:]*\\):\\(.*\\)" user))
(setq pass (match-string 2 user)
user (match-string 1 user)))
;; This gives wrong results for IPv6 literal addresses.
(if (string-match ":\\([0-9+]+\\)" host)
(setq port (string-to-number (match-string 1 host))
host (substring host 0 (match-beginning 0))))
(if (string-match ":$" host)
(setq host (substring host 0 (match-beginning 0))))
(setq host (downcase host)
save-pos (point)))
(if (not port)
(setq port (url-scheme-get-property prot 'default-port)))
;; 3.3. Path
;; Gross hack to preserve ';' in data URLs
(cond
;; IPv6 literal address.
((string-match "^\\(\\[[^]]+\\]\\)\\(?::\\([0-9]*\\)\\)?$" host)
(setq port (match-string 2 host)
host (match-string 1 host)))
;; Registered name or IPv4 address.
((string-match ":\\([0-9]*\\)$" host)
(setq port (match-string 1 host)
host (substring host 0 (match-beginning 0)))))
(cond ((equal port "")
(setq port nil))
(port
(setq port (string-to-number port))))
(setq host (downcase host)))
(and (null port)
scheme
(setq port (url-scheme-get-property scheme 'default-port)))
;; Now point is on the / ? or # which terminates the
;; authority, or at the end of the URI, or (if there is no
;; authority) at the beginning of the absolute path.
(setq save-pos (point))
(if (string= "data" scheme)
;; For the "data" URI scheme, all the rest is the FILE.
(setq file (buffer-substring save-pos (point-max)))
;; For hysterical raisins, our data structure returns the
;; path and query components together in one slot.
;; 3.3. Path
(skip-chars-forward "^?#")
;; 3.4. Query
(when (looking-at "?")
(skip-chars-forward "^#"))
(setq file (buffer-substring save-pos (point)))
;; 3.5 Fragment
(when (looking-at "#")
(let ((opoint (point)))
(forward-char 1)
(unless (eobp)
(setq fragment (buffer-substring (point) (point-max))))
(delete-region opoint (point-max)))))
;; 3.4. Query
(if (string= "data" prot)
(goto-char (point-max))
;; Now check for references
(skip-chars-forward "^#")
(if (eobp)
nil
(delete-region
(point)
(progn
(skip-chars-forward "#")
(setq refs (buffer-substring (point) (point-max)))
(point-max))))
(goto-char save-pos)
(skip-chars-forward "^;")
(unless (eobp)
(setq attr (url-parse-args (buffer-substring (point) (point-max))
t)
attr (nreverse attr))))
(setq file (buffer-substring save-pos (point)))
(if (and host (string-match "%[0-9][0-9]" host))
(setq host (url-unhex-string host)))
(url-parse-make-urlobj
prot user pass host port file refs attr full)))))))
(url-parse-make-urlobj scheme user pass host port file
fragment nil full))))))
(defmacro url-bit-for-url (method lookfor url)
`(let* ((urlobj (url-generic-parse-url url))
......
......@@ -333,40 +333,117 @@ forbidden in URL encoding."
(concat tmp str)))
(defconst url-unreserved-chars
'(
?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
'(?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z
?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9
?- ?_ ?. ?! ?~ ?* ?' ?\( ?\))
"A list of characters that are _NOT_ reserved in the URL spec.
This is taken from RFC 2396.")
?- ?_ ?. ?~)
"List of characters that are unreserved in the URL spec.
This is taken from RFC 3986 (section 2.3).")
(defconst url-encoding-table
(let ((vec (make-vector 256 nil)))
(dotimes (byte 256)
(aset vec byte (format "%%%02x" byte)))
vec)
"Vector translating bytes to URI-encoded %-sequences.")
(defun url--allowed-chars (char-list)
"Return an \"allowed character\" mask (a 256-slot vector).
The Nth element is non-nil if character N is in CHAR-LIST. The
result can be passed as the second arg to `url-hexify-string'."
(let ((vec (make-vector 256 nil)))
(dolist (byte char-list)
(ignore-errors (aset vec byte t)))
vec))
;;;###autoload
(defun url-hexify-string (string)
"Return a new string that is STRING URI-encoded.
First, STRING is converted to utf-8, if necessary. Then, for each
character in the utf-8 string, those found in `url-unreserved-chars'
are left as-is, all others are represented as a three-character
string: \"%\" followed by two lowercase hex digits."
;; To go faster and avoid a lot of consing, we could do:
;;
;; (defconst url-hexify-table
;; (let ((map (make-vector 256 nil)))
;; (dotimes (byte 256) (aset map byte
;; (if (memq byte url-unreserved-chars)
;; (char-to-string byte)
;; (format "%%%02x" byte))))
;; map))
;;
;; (mapconcat (curry 'aref url-hexify-table) ...)
(defun url-hexify-string (string &optional allowed-chars)
"URI-encode STRING and return the result.
If STRING is multibyte, it is first converted to a utf-8 byte
string. Each byte corresponding to an allowed character is left
as-is, while all other bytes are converted to a three-character
string: \"%\" followed by two lowercase hex digits.
The allowed characters are specified by ALLOWED-CHARS. If this
argument is nil, the list `url-unreserved-chars' determines the
allowed characters. Otherwise, ALLOWED-CHARS should be a vector
whose Nth element is non-nil if character N is allowed."
(unless allowed-chars
(setq allowed-chars (url--allowed-chars url-unreserved-chars)))
(mapconcat (lambda (byte)
(if (memq byte url-unreserved-chars)
(char-to-string byte)
(format "%%%02x" byte)))
(if (multibyte-string-p string)
(encode-coding-string string 'utf-8)
string)
""))
(if (aref allowed-chars byte)
(char-to-string byte)
(aref url-encoding-table byte)))
(if (multibyte-string-p string)
(encode-coding-string string 'utf-8)
string)
""))
(defconst url-host-allowed-chars
;; Allow % to avoid re-encoding %-encoded sequences.
(url--allowed-chars (append '(?% ?! ?$ ?& ?' ?\( ?\) ?* ?+ ?, ?\; ?=)
url-unreserved-chars))
"Allowed-character byte mask for the host segment of a URI.
These characters are specified in RFC 3986, Appendix A.")
(defconst url-path-allowed-chars
(let ((vec (copy-sequence url-host-allowed-chars)))
(aset vec ?/ t)
(aset vec ?: t)
(aset vec ?@ t)
vec)
"Allowed-character byte mask for the path segment of a URI.
These characters are specified in RFC 3986, Appendix A.")
(defconst url-query-allowed-chars
(let ((vec (copy-sequence url-path-allowed-chars)))
(aset vec ?? t)
vec)
"Allowed-character byte mask for the query segment of a URI.
These characters are specified in RFC 3986, Appendix A.")
;;;###autoload
(defun url-encode-url (url)
"Return a properly URI-encoded version of URL.
This function also performs URI normalization, e.g. converting
the scheme to lowercase if it is uppercase. Apart from
normalization, if URL is already URI-encoded, this function
should return it unchanged."
(if (multibyte-string-p url)
(setq url (encode-coding-string url 'utf-8)))
(let* ((obj (url-generic-parse-url url))
(user (url-user obj))
(pass (url-password obj))
(host (url-host obj))
(file (url-filename obj))
(frag (url-target obj))
path query)
(if user
(setf (url-user obj) (url-hexify-string user)))
(if pass
(setf (url-password obj) (url-hexify-string pass)))
(when host
;; No special encoding for IPv6 literals.
(unless (string-match "\\`\\[.*\\]\\'" host)
(setf (url-host obj)
(url-hexify-string host url-host-allowed-chars))))
;; Split FILENAME slot into its PATH and QUERY components, and
;; encode them separately. The PATH component can contain
;; unreserved characters, %-encodings, and /:@!$&'()*+,;=
(when file
(if (string-match "\\?" file)
(setq path (substring file 0 (match-beginning 0))
query (substring file (match-end 0)))
(setq path file))
(setq path (url-hexify-string path url-path-allowed-chars))
(if query
(setq query (url-hexify-string query url-query-allowed-chars)))
(setf (url-filename obj)
(if query (concat path "?" query) path)))
(if frag
(setf (url-target obj)
(url-hexify-string frag url-query-allowed-chars)))
(url-recreate-url obj)))
;;;###autoload
(defun url-file-extension (fname &optional x)
......
......@@ -304,8 +304,12 @@ undefined."
:type '(choice (const :tag "None" :value nil) string)
:group 'url)
;; From RFC3986: Scheme names consist of a sequence of characters
;; beginning with a letter and followed by any combination of letters,
;; digits, plus ("+"), period ("."), or hyphen ("-").
(defvar url-nonrelative-link
"\\`\\([-a-zA-Z0-9+.]+:\\)"
"\\`\\([a-zA-Z][-a-zA-Z0-9+.]*:\\)"
"A regular expression that will match an absolute URL.")
(defcustom url-max-redirections 30
......
......@@ -125,7 +125,9 @@ variable in the original buffer as a forwarding pointer.")
;;;###autoload
(defun url-retrieve (url callback &optional cbargs silent inhibit-cookies)
"Retrieve URL asynchronously and call CALLBACK with CBARGS when finished.
URL is either a string or a parsed URL.
URL is either a string or a parsed URL. If it is a string
containing characters that are not valid in a URI, those
characters are percent-encoded; see `url-encode-url'.
CALLBACK is called when the object has been completely retrieved, with
the current buffer containing the object, and any MIME headers associated
......@@ -179,10 +181,8 @@ URL-encoded before it's used."
(url-do-setup)
(url-gc-dead-buffers)
(if (stringp url)
(set-text-properties 0 (length url) nil url))
(when (multibyte-string-p url)
(let ((url-unreserved-chars (append '(?: ?/) url-unreserved-chars)))
(setq url (url-hexify-string url))))
(set-text-properties 0 (length url) nil url))
(setq url (url-encode-url url))
(if (not (vectorp url))
(setq url (url-generic-parse-url url)))
(if (not (functionp callback))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment