Commit cc926903 authored by Kenichi Handa's avatar Kenichi Handa
Browse files

(ctext-non-standard-encodings-alist):

Renamed from non-standard-icccm-encodings-alist.
(ctext-non-standard-encodings-regexp): New variable
(ctext-post-read-conversion): Full rewrite.
(ctext-non-standard-designations-alist): Renamed from
non-standard-designations-alist.
(ctext-pre-write-conversion): Full rewrite.
parent 0651bdbb
...@@ -1316,108 +1316,73 @@ ARG is a list of coding categories ordered by priority." ...@@ -1316,108 +1316,73 @@ ARG is a list of coding categories ordered by priority."
;;; X selections ;;; X selections
(defvar non-standard-icccm-encodings-alist (defvar ctext-non-standard-encodings-alist
'(("ISO8859-15" . latin-iso8859-15) '(("ISO8859-15" . latin-iso8859-15)
("ISO8859-14" . latin-iso8859-14) ("ISO8859-14" . latin-iso8859-14)
("KOI8-R" . koi8-r) ("KOI8-R" . koi8-r)
("BIG5-0" . big5)) ("BIG5-0" . big5))
"Alist of font charset names defined by XLFD. "Alist of non-standard encoding names vs Emacs coding systems.
The cdr of each element is the corresponding Emacs charset or coding system.") This alist is used to decode an extened segment of a compound text.")
(defvar ctext-non-standard-encodings-regexp
(string-to-multibyte
(concat
;; For non-standard encodings.
"\\(\e%/[0-4][\200-\377][\200-\377]\\([^\002]+\\)\002\\)"
"\\|"
;; For UTF-8 encoding.
"\\(\e%G[^\e]*\e%@\\)")))
;; Functions to support "Non-Standard Character Set Encodings" defined ;; Functions to support "Non-Standard Character Set Encodings" defined
;; by the COMPOUND-TEXT spec. ;; by the COMPOUND-TEXT spec.
;; We support that by converting the leading sequence of the ;; We support that by decoding the whole data by `ctext' which just
;; ``extended segment'' to the corresponding ISO-2022 sequences (if ;; pertains byte sequences belonging to ``extended segment'', then
;; the leading sequence names an Emacs charset), or decode the segment ;; decoding those byte sequences one by one in Lisp.
;; (if it names a coding system). Encoding does the reverse.
;; This function also supports "The UTF-8 encoding" described in the ;; This function also supports "The UTF-8 encoding" described in the
;; section 7 of the documentation fo COMPOUND-TEXT distributed with ;; section 7 of the documentation fo COMPOUND-TEXT distributed with
;; XFree86. ;; XFree86.
(defun ctext-post-read-conversion (len) (defun ctext-post-read-conversion (len)
"Decode LEN characters encoded as Compound Text with Extended Segments." "Decode LEN characters encoded as Compound Text with Extended Segments."
(buffer-disable-undo) ; minimize consing due to insertions and deletions
(narrow-to-region (point) (+ (point) len))
(save-match-data (save-match-data
(let ((pt (point-marker)) (save-restriction
(oldpt (point-marker)) (let ((case-fold-search nil)
(newpt (make-marker)) (in-workbuf (string= (buffer-name) " *code-converting-work*"))
(modified-p (buffer-modified-p)) last-coding-system-used
(case-fold-search nil) pos bytes)
;; We need multibyte conversion of "TO" type because the (or in-workbuf
;; buffer may be multibyte, and, in that case, the pattern (narrow-to-region (point) (+ (point) len)))
;; must contain eight-bit-control/graphic characters. (decode-coding-region (point-min) (point-max) 'ctext)
(pattern (string-to-multibyte "\\(\e\\)%/[0-4]\\([\200-\377][\200-\377]\\)\\([^\002]+\\)\002\\|\e%G[^\e]+\e%@")) (if in-workbuf
last-coding-system-used (set-buffer-multibyte t))
encoding textlen chset) (while (re-search-forward ctext-non-standard-encodings-regexp
(while (re-search-forward pattern nil 'move) nil 'move)
(set-marker newpt (point)) (setq pos (match-beginning 0))
(set-marker pt (match-beginning 0)) (if (match-beginning 1)
(if (= (preceding-char) ?@) ;; ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
;; We found embedded utf-8 sequence. (let* ((M (char-after (+ pos 4)))
(progn (L (char-after (+ pos 5)))
(delete-char -3) ; delete ESC % @ at the tail (encoding (match-string 2))
(goto-char pt) (coding (or (cdr (assoc-ignore-case
(delete-char 3) ; delete ESC % G at the head encoding
(if (> pt oldpt) ctext-non-standard-encodings-alist))
(decode-coding-region oldpt pt 'ctext-no-compositions)) (coding-system-p
(decode-coding-region pt newpt 'mule-utf-8) (intern (downcase encoding))))))
(goto-char newpt) (setq bytes (- (+ (* (- M 128) 128) (- L 128))
(set-marker oldpt newpt)) (- (point) (+ pos 6))))
(setq encoding (match-string 3)) (when coding
(setq textlen (- (+ (* (- (aref (match-string 2) 0) 128) 128) (delete-region pos (point))
(- (aref (match-string 2) 1) 128)) (forward-char bytes)
(1+ (length encoding)))) (decode-coding-region (- (point) bytes) (point) coding)))
(setq ;; ESC % G --UTF-8-BYTES-- ESC % @
chset (cdr (assoc-ignore-case encoding (setq bytes (- (point) pos))
non-standard-icccm-encodings-alist))) (decode-coding-region (- (point) bytes) (point) 'utf-8))))
(cond ((null chset) (goto-char (point-min))
;; This charset is not supported--leave this extended (- (point-max) (point)))))
;; segment unaltered and skip over it.
(goto-char (+ (point) textlen)))
((charsetp chset)
;; If it's a charset, replace the leading escape sequence
;; with a standard ISO-2022 sequence. We will decode all
;; such segments later, in one go, when we exit the loop
;; or find an extended segment that names a coding
;; system, not a charset.
(replace-match
(concat "\\1"
(if (= 0 (charset-iso-graphic-plane chset))
;; GL charsets
(if (= 1 (charset-dimension chset)) "(" "$(")
;; GR charsets
(if (= 96 (charset-chars chset))
"-"
(if (= 1 (charset-dimension chset)) ")" "$)")))
(string (charset-iso-final-char chset)))
t)
(goto-char (+ (point) textlen)))
((coding-system-p chset)
;; If it's a coding system, we need to decode the segment
;; right away. But first, decode what we've skipped
;; across until now.
(when (> pt oldpt)
(decode-coding-region oldpt pt 'ctext-no-compositions))
(delete-region pt newpt)
(set-marker newpt (+ newpt textlen))
(decode-coding-region pt newpt chset)
(goto-char newpt)
(set-marker oldpt newpt)))))
;; Decode what's left.
(when (> (point) oldpt)
(decode-coding-region oldpt (point) 'ctext-no-compositions))
;; This buffer started as unibyte, because the string we get from
;; the X selection is a unibyte string. We must now make it
;; multibyte, so that the decoded text is inserted as multibyte
;; into its buffer.
(set-buffer-multibyte t)
(set-buffer-modified-p modified-p)
(- (point-max) (point-min)))))
;; If you add charsets here, be sure to modify the regexp used by ;; If you add charsets here, be sure to modify the regexp used by
;; ctext-pre-write-conversion to look up non-standard charsets. ;; ctext-pre-write-conversion to look up non-standard charsets.
(defvar non-standard-designations-alist (defvar ctext-non-standard-designations-alist
'(("$(0" . (big5 "big5-0" 2)) '(("$(0" . (big5 "big5-0" 2))
("$(1" . (big5 "big5-0" 2)) ("$(1" . (big5 "big5-0" 2))
;; The following are actually standard; generating extended ;; The following are actually standard; generating extended
...@@ -1449,44 +1414,47 @@ of octets per character is variable), 1, 2, 3, or 4.") ...@@ -1449,44 +1414,47 @@ of octets per character is variable), 1, 2, 3, or 4.")
"Encode characters between FROM and TO as Compound Text w/Extended Segments. "Encode characters between FROM and TO as Compound Text w/Extended Segments.
If FROM is a string, or if the current buffer is not the one set up for us If FROM is a string, or if the current buffer is not the one set up for us
by run_pre_post_conversion_on_str, generate a new temp buffer, insert the by encode-coding-string, generate a new temp buffer, insert the
text, and convert it in the temporary buffer. Otherwise, convert in-place." text, and convert it in the temporary buffer. Otherwise, convert in-place."
(cond ((and (string= (buffer-name) " *code-converting-work*")
(not (stringp from)))
; Minimize consing due to subsequent insertions and deletions.
(buffer-disable-undo)
(narrow-to-region from to))
(t
(let ((buf (current-buffer)))
(set-buffer (generate-new-buffer " *temp"))
(buffer-disable-undo)
(if (stringp from)
(insert from)
(insert-buffer-substring buf from to))
(setq from (point-min) to (point-max)))))
(encode-coding-region from to 'ctext-no-compositions)
;; Replace ISO-2022 charset designations with extended segments, for
;; those charsets that are not part of the official X registry.
(save-match-data (save-match-data
(goto-char (point-min)) ;; Setup a working buffer if necessary.
(let ((newpt (make-marker)) (cond ((stringp from)
(case-fold-search nil) (let ((buf (current-buffer)))
pt desig encode-info encoding chset noctets textlen) (set-buffer (generate-new-buffer " *temp"))
(set-buffer-multibyte nil) (set-buffer-multibyte (multibyte-string-p from))
;; The regexp below finds the leading sequences for big5. (insert from)))
((not (string= (buffer-name) " *code-converting-work*"))
(let ((buf (current-buffer))
(multibyte enable-multibyte-characters))
(set-buffer (generate-new-buffer " *temp"))
(set-buffer-multibyte multibyte)
(insert-buffer-substring buf from to))))
;; Now we can encode the whole buffer.
(let ((case-fold-search nil)
last-coding-system-used
pos posend desig encode-info encoding chset noctets textlen)
(goto-char (point-min))
;; At first encode the whole buffer.
(encode-coding-region (point-min) (point-max) 'ctext-no-compositions)
;; Then replace ISO-2022 charset designations with extended
;; segments, for those charsets that are not part of the
;; official X registry. The regexp below finds the leading
;; sequences for big5.
(while (re-search-forward "\e\\(\$([01]\\)" nil 'move) (while (re-search-forward "\e\\(\$([01]\\)" nil 'move)
(setq desig (match-string 1) (setq pos (match-beginning 0)
pt (point-marker) posend (point)
encode-info (cdr (assoc desig non-standard-designations-alist)) desig (match-string 1)
encode-info (cdr (assoc desig
ctext-non-standard-designations-alist))
encoding (car encode-info) encoding (car encode-info)
chset (cadr encode-info) chset (cadr encode-info)
noctets (car (cddr encode-info))) noctets (car (cddr encode-info)))
(skip-chars-forward "^\e") (skip-chars-forward "^\e")
(set-marker newpt (point))
(cond (cond
((eq encoding t) ; only the leading sequence needs to be changed ((eq encoding t) ; only the leading sequence needs to be changed
(setq textlen (+ (- newpt pt) (length chset) 1)) (setq textlen (+ (- (point) posend) (length chset) 1))
;; Generate the ICCCM control sequence for an extended segment. ;; Generate the control sequence for an extended segment.
(replace-match (format "\e%%/%d%c%c%s" (replace-match (format "\e%%/%d%c%c%s"
noctets noctets
(+ (/ textlen 128) 128) (+ (/ textlen 128) 128)
...@@ -1494,20 +1462,18 @@ text, and convert it in the temporary buffer. Otherwise, convert in-place." ...@@ -1494,20 +1462,18 @@ text, and convert it in the temporary buffer. Otherwise, convert in-place."
chset) chset)
t t)) t t))
((coding-system-p encoding) ; need to recode the entire segment... ((coding-system-p encoding) ; need to recode the entire segment...
(set-marker pt (match-beginning 0)) (decode-coding-region pos (point) 'ctext-no-compositions)
(decode-coding-region pt newpt 'ctext-no-compositions) (encode-coding-region pos (point) encoding)
(set-buffer-multibyte t)
(encode-coding-region pt newpt encoding)
(set-buffer-multibyte nil) (set-buffer-multibyte nil)
(setq textlen (+ (- newpt pt) (length chset) 1)) (setq textlen (+ (- (point) pos) (length chset) 1))
(goto-char pt) (save-excursion
(insert (format "\e%%/%d%c%c%s" (goto-char pos)
noctets (insert (format "\e%%/%d%c%c%s"
(+ (/ textlen 128) 128) noctets
(+ (% textlen 128) 128) (+ (/ textlen 128) 128)
chset)))) (+ (% textlen 128) 128)
(goto-char newpt)))) chset))))))
(set-buffer-multibyte t) (goto-char (point-min))))
;; Must return nil, as build_annotations_2 expects that. ;; Must return nil, as build_annotations_2 expects that.
nil) nil)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment