utf-7.el 4.21 KB
Newer Older
Dave Love's avatar
Dave Love committed
1 2
;;; utf-7.el --- utf-7 coding system

Paul Eggert's avatar
Paul Eggert committed
3
;; Copyright (C) 2003-2019 Free Software Foundation, Inc.
Dave Love's avatar
Dave Love committed
4 5 6 7

;; Author: Dave Love <fx@gnu.org>
;; Keywords: i18n, mail

8 9 10
;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
Dave Love's avatar
Dave Love committed
11
;; it under the terms of the GNU General Public License as published by
12 13
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
Dave Love's avatar
Dave Love committed
14

15
;; GNU Emacs is distributed in the hope that it will be useful,
Dave Love's avatar
Dave Love committed
16 17 18 19 20
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
21
;; along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.
Dave Love's avatar
Dave Love committed
22 23 24 25 26 27 28 29 30 31 32

;;; Commentary:

;; Defines a coding system for UTF-7, defined in RFC 2152.  Non-ASCII
;; segments are encoded as base64-encoded big endian UTF-16.  Also
;; defines a variation required for IMAP (RFC 2060).

;; The encoding and decoding code was originally taken from Jon K
;; Hellan's implementation in Gnus, but has been substantially re-done.

;; This probably needs more attention.  In particular, it's not
Glenn Morris's avatar
Glenn Morris committed
33
;; completely consistent with iconv's behavior.  It's arguable
Dave Love's avatar
Dave Love committed
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
;; whether the IMAP version should be a coding system since it's
;; apparently only used for IMAP mailbox names, so it's commented out.

;;; Code:

(defun utf-7-decode (len imap)
  "Decode LEN characters of UTF-7 text starting at point, in place.
IMAP non-nil means use the IMAP variant, in which `&' rather than `+'
introduces an encoded run and `,' stands for `/' in the base64
alphabet.
The work is done inside `save-excursion' and `save-restriction'.
Return the length of the region after decoding."
  (save-excursion
    (save-restriction
      (narrow-to-region (point) (+ (point) len))
      (let ((not-esc (if imap "^&" "^+"))
            (skip-chars (if imap "A-Za-z0-9+," "A-Za-z0-9+/")))
        (while (not (eobp))
          ;; Directly represented text is left alone; stop at the next
          ;; escape character (or at the end of the region).
          (skip-chars-forward not-esc)
          (unless (eobp)
            ;; Step over the escape character; P marks the start of the
            ;; base64 run that follows it.
            (forward-char)
            (let ((p (point))
                  (run-length (skip-chars-forward skip-chars)))
              ;; An explicit `-' terminator is consumed.
              (when (eq ?- (char-after))
                (delete-char 1))
              ;; A zero-length run is an encoded lone escape character
              ;; (e.g. `+-'): the escape character itself is the
              ;; result, so there is nothing more to do.
              (unless (= run-length 0)
                ;; The run is stored without padding; restore the `='
                ;; characters that `base64-decode-region' needs.
                (let ((pl (mod (- run-length) 4)))
                  (insert-char ?= pl)
                  (when imap
                    (subst-char-in-region p (point) ?, ?/))
                  (base64-decode-region p (point)))
                (decode-coding-region p (point) 'utf-16be)
                ;; Finally remove the escape character that introduced
                ;; the run.
                (save-excursion
                  (goto-char p)
                  (delete-char -1)))))))
      (- (point-max) (point-min)))))

67
;;;###autoload
Dave Love's avatar
Dave Love committed
68 69 70
(defun utf-7-post-read-conversion (len)
  "Decode LEN characters of standard UTF-7 at point.
This calls `utf-7-decode' with a nil IMAP argument and returns its
value, the length of the decoded text."
  (utf-7-decode len nil))

71 72 73
;;;###autoload
(defun utf-7-imap-post-read-conversion (len)
  "Decode LEN characters of IMAP-style UTF-7 at point.
This calls `utf-7-decode' with a non-nil IMAP argument and returns its
value, the length of the decoded text."
  (utf-7-decode len t))
Dave Love's avatar
Dave Love committed
74 75 76 77 78 79 80 81

(defun utf-7-encode (from to imap)
  "Encode the text between FROM and TO as UTF-7.
FROM may instead be a string, in which case that string is encoded and
TO is ignored.
IMAP non-nil means use the IMAP variant: `&' rather than `+' is the
escape character, `,' replaces `/' in the base64 alphabet, and a
different set of characters is passed through unencoded.
The encoded text is left in a newly generated buffer, which is made
current.  Return nil."
  (let* ((old-buf (current-buffer))
         (esc (if imap ?& ?+))
         ;; These are characters which can be encoded asis.
         (skip-chars (if imap
                         "\t\n\r\x20-\x25\x27-\x7e" ; rfc2060
                       ;; This includes the rfc2152 optional set.
                       ;; Perhaps it shouldn't (like iconv).
                       "\t\n\r -*,-[]-}"))
         ;; Everything that must go into a base64 run: neither directly
         ;; representable nor the escape character itself.
         (not-skip-chars (format "^%s%c" skip-chars esc)))
    (set-buffer (generate-new-buffer " *temp*"))
    (if (stringp from)
        (insert from)
      (insert-buffer-substring old-buf from to))
    (goto-char (point-min))
    (while (not (eobp))
      (skip-chars-forward skip-chars)
      (if (eq esc (char-after))
          ;; A literal escape character is written as ESC followed by `-'.
          (progn (forward-char)
                 (insert ?-))
        (unless (eobp)
          (insert esc)
          (let ((p (point)))
            (skip-chars-forward not-skip-chars)
            (save-restriction
              ;; encode-coding-region doesn't preserve point
              (narrow-to-region p (point))
              (encode-coding-region p (point-max) 'utf-16be)
              ;; Pass NO-LINE-BREAK: otherwise output longer than 76
              ;; characters is wrapped, and the cleanup below would
              ;; discard everything after the first newline.
              (base64-encode-region p (point-max) t)
              (when imap
                (subst-char-in-region p (point-max) ?/ ?,))
              (goto-char p)
              ;; As I read the RFC, this isn't correct, but it's
              ;; consistent with iconv, at least regarding `='.
              (skip-chars-forward "^= \t\n")
              (delete-region (point) (point-max))))
          ;; RFC2060 stipulates that all names MUST end in US-ASCII (i.e.
          ;; a name that ends with a Unicode octet MUST end with a "-").
          (when (or imap (not (eobp)))
            (insert ?-)))))
    nil))

119
;;;###autoload
Dave Love's avatar
Dave Love committed
120 121 122
(defun utf-7-pre-write-conversion (from to)
  "Encode the text between FROM and TO as standard UTF-7.
This calls `utf-7-encode' with a nil IMAP argument; see that function
for how FROM and TO are interpreted and where the result is left."
  (utf-7-encode from to nil))

123 124 125
;;;###autoload
(defun utf-7-imap-pre-write-conversion (from to)
  "Encode the text between FROM and TO as IMAP-style UTF-7.
This calls `utf-7-encode' with a non-nil IMAP argument; see that
function for how FROM and TO are interpreted and where the result is
left."
  (utf-7-encode from to t))
Dave Love's avatar
Dave Love committed
126 127

;; Announce the `utf-7' feature to `require'.
(provide 'utf-7)
Miles Bader's avatar
Miles Bader committed
128

Dave Love's avatar
Dave Love committed
129
;;; utf-7.el ends here