Commit f55d2710 authored by Dave Love's avatar Dave Love
Browse files

#

parent f848daf3
0x00 0x0000
0x01 0x0001
0x02 0x0002
0x03 0x0003
0x04 0x009C
0x05 0x0009
0x06 0x0086
0x07 0x007F
0x08 0x0097
0x09 0x008D
0x0a 0x008E
0x0b 0x000B
0x0c 0x000C
0x0d 0x000D
0x0e 0x000E
0x0f 0x000F
0x10 0x0010
0x11 0x0011
0x12 0x0012
0x13 0x0013
0x14 0x009D
0x15 0x0085
0x16 0x0008
0x17 0x0087
0x18 0x0018
0x19 0x0019
0x1a 0x0092
0x1b 0x008F
0x1c 0x001C
0x1d 0x001D
0x1e 0x001E
0x1f 0x001F
0x20 0x0080
0x21 0x0081
0x22 0x0082
0x23 0x0083
0x24 0x0084
0x25 0x000A
0x26 0x0017
0x27 0x001B
0x28 0x0088
0x29 0x0089
0x2a 0x008A
0x2b 0x008B
0x2c 0x008C
0x2d 0x0005
0x2e 0x0006
0x2f 0x0007
0x30 0x0090
0x31 0x0091
0x32 0x0016
0x33 0x0093
0x34 0x0094
0x35 0x0095
0x36 0x0096
0x37 0x0004
0x38 0x0098
0x39 0x0099
0x3a 0x009A
0x3b 0x009B
0x3c 0x0014
0x3d 0x0015
0x3e 0x009E
0x3f 0x001A
0x40 0x0020
0x4a 0x0024
0x4b 0x002E
0x4c 0x003C
0x4d 0x0028
0x4e 0x002B
0x4f 0x007C
0x50 0x0026
0x5a 0x0021
0x5b 0x00A3
0x5c 0x002A
0x5d 0x0029
0x5e 0x003B
0x5f 0x00AC
0x60 0x002D
0x61 0x002F
0x6a 0x00A6
0x6b 0x002C
0x6c 0x0025
0x6d 0x005F
0x6e 0x003E
0x6f 0x003F
0x79 0x0060
0x7a 0x003A
0x7b 0x0023
0x7c 0x0040
0x7d 0x0027
0x7e 0x003D
0x7f 0x0022
0x81 0x0061
0x82 0x0062
0x83 0x0063
0x84 0x0064
0x85 0x0065
0x86 0x0066
0x87 0x0067
0x88 0x0068
0x89 0x0069
0x91 0x006A
0x92 0x006B
0x93 0x006C
0x94 0x006D
0x95 0x006E
0x96 0x006F
0x97 0x0070
0x98 0x0071
0x99 0x0072
0xa1 0x203E
0xa2 0x0073
0xa3 0x0074
0xa4 0x0075
0xa5 0x0076
0xa6 0x0077
0xa7 0x0078
0xa8 0x0079
0xa9 0x007A
0xc0 0x007B
0xc1 0x0041
0xc2 0x0042
0xc3 0x0043
0xc4 0x0044
0xc5 0x0045
0xc6 0x0046
0xc7 0x0047
0xc8 0x0048
0xc9 0x0049
0xd0 0x007D
0xd1 0x004A
0xd2 0x004B
0xd3 0x004C
0xd4 0x004D
0xd5 0x004E
0xd6 0x004F
0xd7 0x0050
0xd8 0x0051
0xd9 0x0052
0xe0 0x005C
0xe2 0x0053
0xe3 0x0054
0xe4 0x0055
0xe5 0x0056
0xe6 0x0057
0xe7 0x0058
0xe8 0x0059
0xe9 0x005A
0xf0 0x0030
0xf1 0x0031
0xf2 0x0032
0xf3 0x0033
0xf4 0x0034
0xf5 0x0035
0xf6 0x0036
0xf7 0x0037
0xf8 0x0038
0xf9 0x0039
0xff 0x009F
This diff is collapsed.
;;; utf-8.el --- Limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
;; Licensed to the Free Software Foundation.
;; Copyright (C) 2001 Free Software Foundation, Inc.
;; Author: TAKAHASHI Naoto <ntakahas@m17n.org>
;; Keywords: multilingual, Unicode, UTF-8, i18n
;; This file is part of GNU Emacs.
;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING. If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.
;;; Commentary:
;; The coding-system `mule-utf-8' basically supports encoding/decoding
;; of the following character sets to and from UTF-8:
;;
;; ascii
;; eight-bit-control
;; latin-iso8859-1
;; mule-unicode-0100-24ff
;; mule-unicode-2500-33ff
;; mule-unicode-e000-ffff
;;
;; On decoding, Unicode characters that do not fit into the above
;; character sets are handled as `eight-bit-control' or
;; `eight-bit-graphic' characters to retain the information about the
;; original byte sequence.
;;
;; Characters from other character sets can be encoded with
;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
;; registering the translation with `register-char-codings'.
;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
;; scalar | utf-8
;; value | 1st byte | 2nd byte | 3rd byte
;; --------------------+-----------+-----------+----------
;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
;;; Code:
(defvar ucs-mule-to-mule-unicode (make-translation-table)
"Translation table for encoding to `mule-utf-8'.")
;; Could have been done by ucs-tables loaded before.
(unless (get 'ucs-mule-to-mule-unicode 'translation-table)
(define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode))
(define-ccl-program ccl-decode-mule-utf-8
;;
;; charset | bytes in utf-8 | bytes in emacs
;; -----------------------+----------------+---------------
;; ascii | 1 | 1
;; -----------------------+----------------+---------------
;; eight-bit-control | 2 | 2
;; eight-bit-graphic | 2 | 1
;; latin-iso8859-1 | 2 | 2
;; -----------------------+----------------+---------------
;; mule-unicode-0100-24ff | 2 | 4
;; (< 0800) | |
;; -----------------------+----------------+---------------
;; mule-unicode-0100-24ff | 3 | 4
;; (>= 8000) | |
;; mule-unicode-2500-33ff | 3 | 4
;; mule-unicode-e000-ffff | 3 | 4
;;
;; Thus magnification factor is two.
;;
`(2
((r5 = ,(charset-id 'eight-bit-control))
(r6 = ,(charset-id 'eight-bit-graphic))
(loop
(read r0)
;; 1byte encoding, i.e., ascii
(if (r0 < #x80)
(write r0)
;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
(if (r0 < #xe0)
((read r1)
(if ((r1 & #b11000000) != #b10000000)
;; Invalid 2-byte sequence
((if (r0 < #xa0)
(write-multibyte-character r5 r0)
(write-multibyte-character r6 r0))
(if (r1 < #x80)
(write r1)
(if (r1 < #xa0)
(write-multibyte-character r5 r1)
(write-multibyte-character r6 r1))))
((r0 &= #x1f)
(r0 <<= 6)
(r1 &= #x3f)
(r1 += r0)
;; Now r1 holds scalar value
;; eight-bit-control
(if (r1 < 160)
((write-multibyte-character r5 r1))
;; latin-iso8859-1
(if (r1 < 256)
((r0 = ,(charset-id 'latin-iso8859-1))
(r1 -= 128)
(write-multibyte-character r0 r1))
;; mule-unicode-0100-24ff (< 0800)
((r0 = ,(charset-id 'mule-unicode-0100-24ff))
(r1 -= #x0100)
(r2 = (((r1 / 96) + 32) << 7))
(r1 %= 96)
(r1 += (r2 + 32))
(write-multibyte-character r0 r1)))))))
;; 3byte encoding
;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
(if (r0 < #xf0)
((read r1 r2)
;; This is set to 1 if the encoding is invalid.
(r4 = 0)
(r3 = (r1 & #b11000000))
(r3 |= ((r2 >> 2) & #b00110000))
(if (r3 != #b10100000)
(r4 = 1)
((r3 = ((r0 & #x0f) << 12))
(r3 += ((r1 & #x3f) << 6))
(r3 += (r2 & #x3f))
(if (r3 < #x0800)
(r4 = 1))))
(if (r4 != 0)
;; Invalid 3-byte sequence
((if (r0 < #xa0)
(write-multibyte-character r5 r0)
(write-multibyte-character r6 r0))
(if (r1 < #x80)
(write r1)
(if (r1 < #xa0)
(write-multibyte-character r5 r1)
(write-multibyte-character r6 r1)))
(if (r2 < #x80)
(write r2)
(if (r2 < #xa0)
(write-multibyte-character r5 r2)
(write-multibyte-character r6 r2))))
;; mule-unicode-0100-24ff (>= 0800)
((if (r3 < #x2500)
((r0 = ,(charset-id 'mule-unicode-0100-24ff))
(r3 -= #x0100)
(r3 //= 96)
(r1 = (r7 + 32))
(r1 += ((r3 + 32) << 7))
(write-multibyte-character r0 r1))
;; mule-unicode-2500-33ff
(if (r3 < #x3400)
((r0 = ,(charset-id 'mule-unicode-2500-33ff))
(r3 -= #x2500)
(r3 //= 96)
(r1 = (r7 + 32))
(r1 += ((r3 + 32) << 7))
(write-multibyte-character r0 r1))
;; U+3400 .. U+DFFF
;; keep those bytes as eight-bit-{control|graphic}
(if (r3 < #xe000)
( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
(r3 = r6)
(write-multibyte-character r3 r0)
(if (r1 < #xa0)
(r3 = r5))
(write-multibyte-character r3 r1)
(if (r2 < #xa0)
(r3 = r5)
(r3 = r6))
(write-multibyte-character r3 r2))
;; mule-unicode-e000-ffff
((r0 = ,(charset-id 'mule-unicode-e000-ffff))
(r3 -= #xe000)
(r3 //= 96)
(r1 = (r7 + 32))
(r1 += ((r3 + 32) << 7))
(write-multibyte-character r0 r1))))))))
;; 4byte encoding
;; keep those bytes as eight-bit-{control|graphic}
((read r1 r2 r3)
;; r0 > #xf0, thus eight-bit-graphic
(write-multibyte-character r6 r0)
(if (r1 < #xa0)
(write-multibyte-character r5 r1)
(write-multibyte-character r6 r1))
(if (r2 < #xa0)
(write-multibyte-character r5 r2)
(write-multibyte-character r6 r2))
(if (r3 < #xa0)
(write-multibyte-character r5 r3)
(write-multibyte-character r6 r3))))))
(repeat))))
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
mule-unicode-*. Encodings of un-representable Unicode characters are
decoded asis into eight-bit-control and eight-bit-graphic
characters.")
(define-ccl-program ccl-encode-mule-utf-8
`(1
((r5 = -1)
(loop
(if (r5 < 0)
((r1 = -1)
(read-multibyte-character r0 r1)
(translate-character ucs-mule-to-mule-unicode r0 r1))
(;; We have already done read-multibyte-character.
(r0 = r5)
(r1 = r6)
(r5 = -1)))
(if (r0 == ,(charset-id 'ascii))
(write r1)
(if (r0 == ,(charset-id 'latin-iso8859-1))
;; r1 scalar utf-8
;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
;; 20 0000 0000 1010 0000 1100 0010 1010 0000
;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
((r0 = (((r1 & #x40) >> 6) | #xc2))
(r1 &= #x3f)
(r1 |= #x80)
(write r0 r1))
(if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
;; #x3f80 == (0011 1111 1000 0000)b
(r1 &= #x7f)
(r1 += (r0 + 224)) ; 240 == -32 + #x0100
;; now r1 holds scalar value
(if (r1 < #x0800)
;; 2byte encoding
((r0 = (((r1 & #x07c0) >> 6) | #xc0))
;; #x07c0 == (0000 0111 1100 0000)b
(r1 &= #x3f)
(r1 |= #x80)
(write r0 r1))
;; 3byte encoding
((r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
(r1 >>= 6)
(r1 |= #x80)
(write r0 r1 r2))))
(if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
(r1 &= #x7f)
(r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
(r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
(r1 >>= 6)
(r1 |= #x80)
(write r0 r1 r2))
(if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
(r1 &= #x7f)
(r1 += (r0 + 57312)) ; 57312 == -160 + #xe000
(r0 = (((r1 & #xf000) >> 12) | #xe0))
(r2 = ((r1 & #x3f) | #x80))
(r1 &= #x0fc0)
(r1 >>= 6)
(r1 |= #x80)
(write r0 r1 r2))
(if (r0 == ,(charset-id 'eight-bit-control))
;; r1 scalar utf-8
;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
;; 80 0000 0000 1000 0000 1100 0010 1000 0000
;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
((write #xc2)
(write r1))
(if (r0 == ,(charset-id 'eight-bit-graphic))
;; r1 scalar utf-8
;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
;; a0 0000 0000 1010 0000 1100 0010 1010 0000
;; ff 0000 0000 1111 1111 1101 1111 1011 1111
((write r1)
(r1 = -1)
(read-multibyte-character r0 r1)
(if (r0 != ,(charset-id 'eight-bit-graphic))
(if (r0 != ,(charset-id 'eight-bit-control))
((r5 = r0)
(r6 = r1))))
(if (r5 < 0)
((read-multibyte-character r0 r2)
(if (r0 != ,(charset-id 'eight-bit-graphic))
(if (r0 != ,(charset-id 'eight-bit-control))
((r5 = r0)
(r6 = r2))))
(if (r5 < 0)
(write r1 r2)
(if (r1 < #xa0)
(write r1)
((write #xc2)
(write r1)))))))
;; Unsupported character.
;; Output U+FFFD, which is `ef bf bd' in UTF-8.
((write #xef)
(write #xbf)
(write #xbd)))))))))
(repeat)))
(if (r1 >= #xa0)
(write r1)
(if (r1 >= #x80)
((write #xc2)
(write r1)))))
"CCL program to encode into UTF-8.
Only characters from the charsets ascii, eight-bit-control,
eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
Others are encoded as U+FFFD.")
;; Dummy definition so that the CCL can be checked correctly; the
;; actual data are loaded on demand.
(unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
(define-translation-table 'ucs-mule-8859-to-mule-unicode))
(defsubst utf-8-untranslated-to-ucs ()
(let ((b1 (char-after))
(b2 (char-after (1+ (point))))
(b3 (char-after (+ 2 (point))))
(b4 (char-after (+ 4 (point)))))
(if (and b1 b2 b3)
(cond ((< b1 ?\xf0)
(setq b2 (lsh (logand b2 ?\x3f) 6))
(setq b3 (logand b3 ?\x3f))
(logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
(b4
(setq b2 (lsh (logand b2 ?\x3f) 12))
(setq b3 (lsh (logand b3 ?\x3f) 6))
(setq b4 (logand b4 ?\x3f))
(logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
18)))))))))
(defun utf-8-help-echo (window object position)
(format "Untranslated Unicode U+%04X"
(get-char-property position 'untranslated-utf-8 object)))
(defvar utf-8-subst-table nil
"If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
;; We compose the untranslatable sequences into a single character.
;; This is infelicitous for editing, because there's currently no
;; mechanism for treating compositions as atomic, but is OK for
;; display. We try to compose an appropriate character from a hash
;; table of CJK characters to display correctly. Otherwise we use
;; U+FFFD. What we really should have is hash table lookup from CCL
;; so that we could do this properly. This function GCs too much.
(defsubst utf-8-compose ()
"Put a suitable composition on an untranslatable sequence.
Return the sequence's length."
(let* ((u (utf-8-untranslated-to-ucs))
(l (and u (if (>= u ?\x10000)
4
3)))
(subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
(when u
(put-text-property (point) (min (point-max) (+ l (point)))
'untranslated-utf-8 u)
(unless subst
(put-text-property (point) (min (point-max) (+ l (point)))
'help-echo 'utf-8-help-echo)
(setq subst ?$,3u=(B))
(compose-region (point) (+ l (point)) subst)
l)))
(defcustom utf-8-compose-scripts nil
"*Non-nil means compose various scipts on decoding utf-8 text."
:group 'mule
:type 'boolean) ; omitted in Emacs 21.1
(defun utf-8-post-read-conversion (length)
"Compose untranslated utf-8 sequences into single characters.
Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
(save-excursion
;; Can't do eval-when-compile to insert a multibyte constant
;; version of the string in the loop, since it's always loaded as
;; unibyte from a byte-compiled file.
(let ((range (string-as-multibyte "^\341-\377")))
(while (and (skip-chars-forward
range)
(not (eobp)))
(forward-char (utf-8-compose)))))
;; Fixme: Takahashi-san implies it may not work this easily -- needs
;; checking with him.
(when (and utf-8-compose-scripts (> length 1))
;; These currently have definitions which cover the relevant
;; Unicodes. We could avoid loading thai-util &c by checking
;; whether the region contains any characters with the appropriate
;; categories. There aren't yet Unicode-based rules for Tibetan.
(save-excursion (setq length (diacritic-post-read-conversion length)))
(save-excursion (setq length (thai-post-read-conversion length)))
(save-excursion (setq length (lao-post-read-conversion length)))
(save-excursion (setq length (devanagari-post-read-conversion length))))
length)
(defun utf-8-pre-write-conversion (beg end)
"Semi-dummy pre-write function effectively to autoload ucs-tables."
;; Ensure translation table is loaded.
(require 'ucs-tables)
;; Don't do this again.
(coding-system-put 'mule-utf-8 'pre-write-conversion nil)
nil)
(make-coding-system
'mule-utf-8 4 ?u
"UTF-8 encoding for Emacs-supported Unicode characters.
The supported Emacs character sets are the following, plus others
which may be included in the translation table
`ucs-mule-to-mule-unicode':
ascii
eight-bit-control
eight-bit-graphic
latin-iso8859-1
latin-iso8859-2
latin-iso8859-3
latin-iso8859-4
cyrillic-iso8859-5
greek-iso8859-7
hebrew-iso8859-8
latin-iso8859-9
latin-iso8859-14
latin-iso8859-15
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff
Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
are decoded into sequences of eight-bit-control and eight-bit-graphic
characters to preserve their byte sequences and composed to display as
a single character. Emacs characters that can't be encoded to these
ranges are encoded as U+FFFD."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)