Commit 568f1488 authored by Lars Ingebrigtsen

Make eww more liberal when interpreting some invalid HTML

* lisp/net/eww.el (eww--preprocess-html): New function (bug#37009)
that translates common invalid HTML like "a <= b" into "a &lt;= b"
before parsing, so that we are more liberal in what we accept.
(eww-display-html): Use it.
(eww-readable): Ditto.
parent 49a4b869
Pipeline #3128 passed with stage
in 67 minutes and 26 seconds
......@@ -326,6 +326,18 @@ the default EWW buffer."
#'url-hexify-string (split-string url) "+"))))))
url)
(defun eww--preprocess-html (start end)
  "Translate all < characters that do not look like start of tags into &lt;.
Operate on the text between START and END in the current buffer,
modifying it in place.  A \"<\" is left alone when the character
following it is an ASCII letter or digit (either case), \"!\", \"?\"
or \"/\"; any other \"<\" that is followed by a character is
replaced with \"&lt;\".  Point and the current restriction are
restored afterwards."
  (save-excursion
    (save-restriction
      (narrow-to-region start end)
      (goto-char start)
      ;; Match tag names case-insensitively, so that "<P>" is treated
      ;; the same as "<p>".
      (let ((case-fold-search t))
        ;; "!" covers comments and doctypes, "/" covers end tags, and
        ;; "?" covers XML declarations and processing instructions
        ;; such as <?xml version="1.0"?>, which must not be escaped or
        ;; they would end up as literal text in the parsed document.
        (while (re-search-forward "<[^0-9a-z!?/]" nil t)
          ;; Replace only the "<" itself.  Point ends up just after
          ;; the inserted entity, i.e. before the character that
          ;; followed the "<", so the next search re-examines it.
          (goto-char (match-beginning 0))
          (delete-char 1)
          (insert "&lt;"))))))
;;;###autoload (defalias 'browse-web 'eww)
;;;###autoload
......@@ -479,6 +491,7 @@ Currently this means either text/html or application/xhtml+xml."
;; Remove CRLF and replace NUL with &#0; before parsing.
(while (re-search-forward "\\(\r$\\)\\|\0" nil t)
(replace-match (if (match-beginning 1) "" "&#0;") t t)))
(eww--preprocess-html (point) (point-max))
(libxml-parse-html-region (point) (point-max))))))
(source (and (null document)
(buffer-substring (point) (point-max)))))
......@@ -716,6 +729,7 @@ the like."
(condition-case nil
(decode-coding-region (point-min) (point-max) 'utf-8)
(coding-system-error nil))
(eww--preprocess-html (point-min) (point-max))
(libxml-parse-html-region (point-min) (point-max))))
(base (plist-get eww-data :url)))
(eww-score-readability dom)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment