characters.el 35.6 KB
Newer Older
Karl Heuer's avatar
Karl Heuer committed
1 2
;;; characters.el --- set syntax and category for multibyte characters

Glenn Morris's avatar
Glenn Morris committed
3
;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
Kenichi Handa's avatar
Kenichi Handa committed
4
;;   Free Software Foundation, Inc.
Kenichi Handa's avatar
Kenichi Handa committed
5
;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
Glenn Morris's avatar
Glenn Morris committed
6
;;   2005, 2006, 2007, 2008
Kenichi Handa's avatar
Kenichi Handa committed
7 8
;;   National Institute of Advanced Industrial Science and Technology (AIST)
;;   Registration Number H14PRO021
Kenichi Handa's avatar
Kenichi Handa committed
9
;; Copyright (C) 2003
10 11
;;   National Institute of Advanced Industrial Science and Technology (AIST)
;;   Registration Number H13PRO009
Karl Heuer's avatar
Karl Heuer committed
12 13 14 15 16

;; Keywords: multibyte character, character set, syntax, category

;; This file is part of GNU Emacs.

17
;; GNU Emacs is free software: you can redistribute it and/or modify
Karl Heuer's avatar
Karl Heuer committed
18
;; it under the terms of the GNU General Public License as published by
19 20
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
Karl Heuer's avatar
Karl Heuer committed
21 22 23 24 25 26 27

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
28
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
Karl Heuer's avatar
Karl Heuer committed
29 30 31

;;; Commentary:

32 33
;;; Code:

Karl Heuer's avatar
Karl Heuer committed
34 35 36 37
;;; Predefined categories.

;; For each character set.

38
(define-category ?a "ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
Karl Heuer's avatar
Karl Heuer committed
39 40 41 42 43 44 45 46 47 48 49 50 51 52
;; Per-script categories.  Each entry pairs a category mnemonic with
;; its docstring; the categories are defined in the order listed.
(dolist (entry '((?l . "Latin")
                 (?t . "Thai")
                 (?g . "Greek")
                 (?b . "Arabic")
                 (?w . "Hebrew")
                 (?y . "Cyrillic")
                 (?k . "Japanese katakana")
                 (?r . "Japanese roman")
                 (?c . "Chinese")
                 (?j . "Japanese")
                 (?h . "Korean")
                 (?e . "Ethiopic (Ge'ez)")
                 (?v . "Vietnamese")
                 (?i . "Indian")))
  (define-category (car entry) (cdr entry)))
53
(define-category ?o "Lao")
54
(define-category ?q "Tibetan")
Karl Heuer's avatar
Karl Heuer committed
55 56 57

;; For each group (row) of 2-byte character sets.

Karl Heuer's avatar
Karl Heuer committed
58
(define-category ?A "Alpha-numeric characters of 2-byte character sets")
Karl Heuer's avatar
Karl Heuer committed
59
(define-category ?C "Chinese (Han) characters of 2-byte character sets")
Karl Heuer's avatar
Karl Heuer committed
60
(define-category ?G "Greek characters of 2-byte character sets")
Karl Heuer's avatar
Karl Heuer committed
61 62 63
(define-category ?H "Japanese Hiragana characters of 2-byte character sets")
(define-category ?K "Japanese Katakana characters of 2-byte character sets")
(define-category ?N "Korean Hangul characters of 2-byte character sets")
Karl Heuer's avatar
Karl Heuer committed
64
(define-category ?Y "Cyrillic characters of 2-byte character sets")
Karl Heuer's avatar
Karl Heuer committed
65 66 67 68 69
(define-category ?I "Indian Glyphs")

;; For phonetic classifications.

(define-category ?0 "consonant")
70
(define-category ?1 "base (independent) vowel")
Karl Heuer's avatar
Karl Heuer committed
71 72
(define-category ?2 "upper diacritical mark (including upper vowel)")
(define-category ?3 "lower diacritical mark (including lower vowel)")
73
(define-category ?4 "combining tone mark")
74
(define-category ?5 "symbol")
Karl Heuer's avatar
Karl Heuer committed
75 76
(define-category ?6 "digit")
(define-category ?7 "vowel-modifying diacritical mark")
77 78
(define-category ?8 "vowel-signs")
(define-category ?9 "semivowel lower")
Karl Heuer's avatar
Karl Heuer committed
79 80 81 82

;; For filling.
(define-category ?| "While filling, we can break a line at this character.")

Karl Heuer's avatar
Karl Heuer committed
83
;; For indentation calculation.
84
(define-category ?\s
Kenichi Handa's avatar
Kenichi Handa committed
85
  "This character counts as a space for indentation purposes.")
Karl Heuer's avatar
Karl Heuer committed
86

Karl Heuer's avatar
Karl Heuer committed
87
;; Keep the following for `kinsoku' processing.  See comments in
Karl Heuer's avatar
Karl Heuer committed
88 89 90 91
;; kinsoku.el.
(define-category ?> "A character which can't be placed at beginning of line.")
(define-category ?< "A character which can't be placed at end of line.")

92 93
;; Combining
(define-category ?^ "Combining diacritic or mark")
Karl Heuer's avatar
Karl Heuer committed
94 95 96 97 98

;;; Setting syntax and category.

;; ASCII

99 100 101
;; Every character in the range 32..127 belongs to both the `a'
;; (ASCII) and the `l' (Latin) categories.
(dolist (category '(?a ?l))
  (modify-category-entry '(32 . 127) category))
Karl Heuer's avatar
Karl Heuer committed
102

Dave Love's avatar
Dave Love committed
103 104 105 106
;; Deal with the CJK charsets first.  Since the syntax of blocks is
;; defined per charset, and the charsets may contain e.g. Latin
;; characters, we end up with the wrong syntax definitions if we're
;; not careful.
Karl Heuer's avatar
Karl Heuer committed
107

Kenichi Handa's avatar
Kenichi Handa committed
108
;; Chinese characters (Unicode)
109 110
(modify-category-entry '(#x2E80 . #x312F) ?|)
(modify-category-entry '(#x3190 . #x33FF) ?|)
Kenichi Handa's avatar
Kenichi Handa committed
111 112 113 114 115 116
;; Both code point ranges receive the same three categories, applied
;; in the order `C' (Han of 2-byte sets), `c' (Chinese), `|' (line
;; break allowed while filling).
(dolist (block '((#x3400 . #x9FAF)
                 (#xF900 . #xFAFF)))
  (dolist (category '(?C ?c ?|))
    (modify-category-entry block category)))
Kenichi Handa's avatar
Kenichi Handa committed
117 118 119
(modify-category-entry '(#x20000 . #x2AFFF) ?|)
(modify-category-entry '(#x2F800 . #x2FFFF) ?|)

Karl Heuer's avatar
Karl Heuer committed
120 121 122

;; Chinese character set (GB2312)

Kenichi Handa's avatar
Kenichi Handa committed
123 124 125
(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
(map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
Karl Heuer's avatar
Karl Heuer committed
126

Dave Love's avatar
Dave Love committed
127
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
Kenichi Handa's avatar
Kenichi Handa committed
128 129 130
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
(map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
Kenichi Handa's avatar
Kenichi Handa committed
131 132 133 134 135
;; Category assignments for GB2312 code ranges.
;; Each row is (CATEGORY FROM-CODE TO-CODE).
(dolist (row '((?H #x2421 #x247E)
               (?K #x2521 #x257E)
               (?G #x2621 #x267E)
               (?Y #x2721 #x277E)
               (?C #x3021 #x7E7E)))
  (apply #'map-charset-chars #'modify-category-entry 'chinese-gb2312 row))
Karl Heuer's avatar
Karl Heuer committed
136 137 138

;; Chinese character set (BIG5)

Kenichi Handa's avatar
Kenichi Handa committed
139
(map-charset-chars #'modify-category-entry 'big5 ?c)
Kenichi Handa's avatar
Kenichi Handa committed
140 141 142
(map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA25F)
(map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
(map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DF)
Karl Heuer's avatar
Karl Heuer committed
143 144 145

;; Chinese character set (CNS11643)

Dave Love's avatar
Dave Love committed
146 147 148 149
(dolist (c '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
	     chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
	     chinese-cns11643-7))
  (map-charset-chars #'modify-category-entry c ?c)
Kenichi Handa's avatar
Kenichi Handa committed
150 151
  (if (eq c 'chinese-cns11643-1)
      (map-charset-chars #'modify-category-entry c ?C #x4421 #x7E7E)
Kenichi Handa's avatar
Kenichi Handa committed
152
    (map-charset-chars #'modify-category-entry c ?C)))
Karl Heuer's avatar
Karl Heuer committed
153

Kenichi Handa's avatar
Kenichi Handa committed
154
;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
Karl Heuer's avatar
Karl Heuer committed
155

Kenichi Handa's avatar
Kenichi Handa committed
156
(map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
Karl Heuer's avatar
Karl Heuer committed
157

Kenichi Handa's avatar
Kenichi Handa committed
158
(map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
Karl Heuer's avatar
Karl Heuer committed
159

Kenichi Handa's avatar
Kenichi Handa committed
160 161
(dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
			       japanese-jisx0213-1 japanese-jisx0213-2))
Kenichi Handa's avatar
Kenichi Handa committed
162
  (map-charset-chars #'modify-category-entry l ?j))
Karl Heuer's avatar
Karl Heuer committed
163

164
;; Unicode equivalents of JISX0201-kana
Kenichi Handa's avatar
Kenichi Handa committed
165 166 167 168
;; The range #xff61..#xff9f is tagged `k' (katakana), `j' (Japanese)
;; and `|' (line break allowed while filling), in that order.
(dolist (category '(?k ?j ?\|))
  (modify-category-entry '(#xff61 . #xff9f) category))
169 170

;; Katakana block
Kenichi Handa's avatar
Kenichi Handa committed
171 172 173 174
(let ((range '(#x30a0 . #x30ff)))
  ;; ?K is double width, ?k isn't specified
  (modify-category-entry range ?K)
  (modify-category-entry range ?\|))
175 176

;; Hiragana block
Kenichi Handa's avatar
Kenichi Handa committed
177
(let ((range '(#x3040 . #x309d)))
Kenichi Handa's avatar
Kenichi Handa committed
178 179
  ;; ?H is actually defined to be double width
  ;;(modify-category-entry range ?H)
Kenichi Handa's avatar
Kenichi Handa committed
180
  (modify-category-entry range ?\|)
Kenichi Handa's avatar
Kenichi Handa committed
181
  )
182

Karl Heuer's avatar
Karl Heuer committed
183
;; JISX0208
Kenichi Handa's avatar
Kenichi Handa committed
184 185 186
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
(let ((chars '(? ? ? ? ? ? ? ? ? ? ? ?)))
187
  ;; Give every character in CHARS word syntax.  The loop variable ELT
  ;; must be passed here: using (car chars) would re-apply the syntax
  ;; to the first element on every iteration and leave the rest of the
  ;; list untouched.
  (dolist (elt chars)
    (modify-syntax-entry elt "w")))
Kenichi Handa's avatar
Kenichi Handa committed
189 190 191 192 193 194 195 196 197

;; Category assignments for JISX0208 code ranges.
;; Each row is (CATEGORY FROM-CODE TO-CODE).
(dolist (row '((?A #x2321 #x237E)
               (?H #x2421 #x247E)
               (?K #x2521 #x257E)
               (?G #x2621 #x267E)
               (?Y #x2721 #x277E)
               (?C #x3021 #x7E7E)))
  (apply #'map-charset-chars #'modify-category-entry 'japanese-jisx0208 row))
(modify-category-entry ? ?K)
(let ((chars '(? ?)))
Karl Heuer's avatar
Karl Heuer committed
198 199 200 201
  (while chars
    (modify-category-entry (car chars) ?K)
    (modify-category-entry (car chars) ?H)
    (setq chars (cdr chars))))
Kenichi Handa's avatar
Kenichi Handa committed
202
(let ((chars '(? ? ? ? ? ? ? ? ?)))
Karl Heuer's avatar
Karl Heuer committed
203 204 205 206 207 208
  (while chars
    (modify-category-entry (car chars) ?C)
    (setq chars (cdr chars))))

;; JISX0212

Kenichi Handa's avatar
Kenichi Handa committed
209
(map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
Karl Heuer's avatar
Karl Heuer committed
210 211

;; JISX0201-Kana
Dave Love's avatar
Dave Love committed
212

Dave Love's avatar
Dave Love committed
213
(let ((chars '(? ? ?)))
Karl Heuer's avatar
Karl Heuer committed
214 215 216 217
  (while chars
    (modify-syntax-entry (car chars) ".")
    (setq chars (cdr chars))))

218 219
;; The corner brackets form a matched pair: the left bracket opens
;; (class `(') and the right bracket closes (class `)'), each naming
;; the other as its matching character.
(modify-syntax-entry ?\「 "(」")
(modify-syntax-entry ?\」 ")「")
220

Karl Heuer's avatar
Karl Heuer committed
221 222
;; Korean character set (KSC5601)

Dave Love's avatar
Dave Love committed
223
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
Kenichi Handa's avatar
Kenichi Handa committed
224 225

(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
Dave Love's avatar
Dave Love committed
226 227 228
(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
(map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
Kenichi Handa's avatar
Kenichi Handa committed
229 230 231
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
Kenichi Handa's avatar
Kenichi Handa committed
232 233 234 235
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
(map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
Karl Heuer's avatar
Karl Heuer committed
236

Dave Love's avatar
Dave Love committed
237
;; These are in more than one charset.
Kenichi Handa's avatar
Kenichi Handa committed
238 239 240 241 242 243 244 245 246
;; BRACKETS is a flat sequence of opener/closer pairs: the character
;; at each even index opens, and the one right after it closes.
(let* ((brackets (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
                         "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
                         "()[]{}"))
       (pair-count (/ (length brackets) 2)))
  (dotimes (k pair-count)
    (let ((opener (aref brackets (* 2 k)))
          (closer (aref brackets (1+ (* 2 k)))))
      ;; Syntax descriptor = class character followed by the matching char.
      (modify-syntax-entry opener (string ?\( closer))
      (modify-syntax-entry closer (string ?\) opener)))))
247

Dave Love's avatar
Dave Love committed
248
;; Arabic character set
249

Dave Love's avatar
Dave Love committed
250 251 252 253 254 255 256 257 258 259
;; Tag every character of each Arabic charset with category `b'.
(dolist (charset '(arabic-iso8859-6
                   arabic-digit
                   arabic-1-column
                   arabic-2-column))
  (map-charset-chars #'modify-category-entry charset ?b))
(modify-category-entry '(#x600 . #x6ff) ?b)
(modify-category-entry '(#xfb50 . #xfdff) ?b)
(modify-category-entry '(#xfe70 . #xfefe) ?b)
260

Dave Love's avatar
Dave Love committed
261 262 263 264 265 266
;; Cyrillic character set (ISO-8859-5)

(modify-syntax-entry ? ".")

;; Ethiopic character set

Kenichi Handa's avatar
Kenichi Handa committed
267 268
(modify-category-entry '(#x1200 . #x1399) ?e)
(modify-category-entry '(#x2d80 . #x2dde) ?e)
269
(let ((chars '(? ? ? ? ? ? ? ?)))
Dave Love's avatar
Dave Love committed
270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
  (while chars
    (modify-syntax-entry (car chars) ".")
    (setq chars (cdr chars))))
(map-charset-chars #'modify-category-entry 'ethiopic ?e)

;; Hebrew character set (ISO-8859-8)

;; Hebrew marks that get punctuation syntax.
(dolist (mark '(#x5be                   ; MAQAF
                #x5c0                   ; PASEQ
                #x5c3                   ; SOF PASUQ
                #x5f3                   ; GERESH
                #x5f4))                 ; GERSHAYIM
  (modify-syntax-entry mark "."))

;; Indian character set (IS 13194 and other Emacs original Indian charsets)

(modify-category-entry '(#x901 . #x970) ?i)
(map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
(map-charset-chars #'modify-category-entry 'indian-2-column ?i)
288

289 290
;; Lao character set

Dave Love's avatar
Dave Love committed
291 292
(modify-category-entry '(#xe80 . #xeff) ?o)
(map-charset-chars #'modify-category-entry 'lao ?o)
293

Dave Love's avatar
Dave Love committed
294
(let ((deflist	'(("ກ-ຮ"	"w"	?0) ; consonant
295 296 297
		  ("ະາຳຽເ-ໄ"	"w"	?1) ; vowel base
		  ("ັິ-ືົໍ"	"w"	?2) ; vowel upper
		  ("ຸູ"	"w"	?3) ; vowel lower
Kenichi Handa's avatar
Kenichi Handa committed
298
		  ("່-໋"	"w"	?4) ; tone mark
299 300 301
		  ("ຼຽ"	"w"	?9) ; semivowel lower
		  ("໐-໙"	"w"	?6) ; digit
		  ("ຯໆ"	"_"	?5) ; symbol
302 303 304 305 306 307 308 309 310 311 312 313
		  ))
      elm chars len syntax category to ch i)
  (while deflist
    (setq elm (car deflist))
    (setq chars (car elm)
	  len (length chars)
	  syntax (nth 1 elm)
	  category (nth 2 elm)
	  i 0)
    (while (< i len)
      (if (= (aref chars i) ?-)
	  (setq i (1+ i)
Kenichi Handa's avatar
Kenichi Handa committed
314 315
		to (aref chars i))
	(setq ch (aref chars i)
316 317
	      to ch))
      (while (<= ch to)
318 319
	(unless (string-equal syntax "w")
	  (modify-syntax-entry ch syntax))
320 321
	(modify-category-entry ch category)
	(setq ch (1+ ch)))
Kenichi Handa's avatar
Kenichi Handa committed
322
      (setq i (1+ i)))
323 324
    (setq deflist (cdr deflist))))

Karl Heuer's avatar
Karl Heuer committed
325 326
;; Thai character set (TIS620)

Dave Love's avatar
Dave Love committed
327 328
(modify-category-entry '(#xe00 . #xe7f) ?t)
(map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
Karl Heuer's avatar
Karl Heuer committed
329 330

(let ((deflist	'(;; chars	syntax	category
331 332 333 334
		  ("ก-รลว-ฮ"	"w"	?0) ; consonant
		  ("ฤฦะาำเ-ๅ"	"w"	?1) ; vowel base
		  ("ัิ-ื็๎"	"w"	?2) ; vowel upper
		  ("ุ-ฺ"	"w"	?3) ; vowel lower
Kenichi Handa's avatar
Kenichi Handa committed
335
		  ("่-ํ"	"w"	?4) ; tone mark
336 337
		  ("๐-๙"	"w"	?6) ; digit
		  ("ฯๆ฿๏๚๛"	"_"	?5) ; symbol
Karl Heuer's avatar
Karl Heuer committed
338 339
		  ))
      elm chars len syntax category to ch i)
340 341 342 343 344 345 346 347 348 349
  (while deflist
    (setq elm (car deflist))
    (setq chars (car elm)
	  len (length chars)
	  syntax (nth 1 elm)
	  category (nth 2 elm)
	  i 0)
    (while (< i len)
      (if (= (aref chars i) ?-)
	  (setq i (1+ i)
Kenichi Handa's avatar
Kenichi Handa committed
350 351
		to (aref chars i))
	(setq ch (aref chars i)
352 353
	      to ch))
      (while (<= ch to)
354 355
	(unless (string-equal syntax "w")
	  (modify-syntax-entry ch syntax))
356 357
	(modify-category-entry ch category)
	(setq ch (1+ ch)))
Kenichi Handa's avatar
Kenichi Handa committed
358
      (setq i (1+ i)))
359 360 361 362
    (setq deflist (cdr deflist))))

;; Tibetan character set

Dave Love's avatar
Dave Love committed
363 364 365
(modify-category-entry '(#xf00 . #xfff) ?q)
(map-charset-chars #'modify-category-entry 'tibetan ?q)
(map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
366 367

(let ((deflist	'(;; chars             syntax category
Dave Love's avatar
Dave Love committed
368
		  ("ཀ-ཀྵཪ"        	"w"	?0) ; consonant
369
		  ("ྐ-ྐྵྺྻྼ"       "w"     ?0) ;
Dave Love's avatar
Dave Love committed
370 371
		  ("ིེཻོཽྀ"       "w"	?2) ; upper vowel
		  ("ཾྂྃ྆྇ྈྉྊྋ" "w"	?2) ; upper modifier
372
		  ("྄ཱུ༙༵༷"       "w"	?3) ; lowel vowel/modifier
Kenichi Handa's avatar
Kenichi Handa committed
373
		  ("཰"		"w" ?3)		    ; invisible vowel a
Dave Love's avatar
Dave Love committed
374 375 376 377 378 379 380
		  ("༠-༩༪-༳"	        "w"	?6) ; digit
		  ("་།-༒༔ཿ"        "."     ?|) ; line-break char
		  ("་།༏༐༑༔ཿ"            "."     ?|) ;
		  ("༈་།-༒༔ཿ༽༴"  "."     ?>) ; prohibition
		  ("་།༏༐༑༔ཿ"            "."     ?>) ;
		  ("ༀ-༊༼࿁࿂྅"      "."     ?<) ; prohibition
		  ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
381 382
		  ))
      elm chars len syntax category to ch i)
Karl Heuer's avatar
Karl Heuer committed
383 384 385 386 387 388 389 390 391 392
  (while deflist
    (setq elm (car deflist))
    (setq chars (car elm)
	  len (length chars)
	  syntax (nth 1 elm)
	  category (nth 2 elm)
	  i 0)
    (while (< i len)
      (if (= (aref chars i) ?-)
	  (setq i (1+ i)
Kenichi Handa's avatar
Kenichi Handa committed
393 394
		to (aref chars i))
	(setq ch (aref chars i)
Karl Heuer's avatar
Karl Heuer committed
395 396
	      to ch))
      (while (<= ch to)
397 398
	(unless (string-equal syntax "w")
	  (modify-syntax-entry ch syntax))
Karl Heuer's avatar
Karl Heuer committed
399 400
	(modify-category-entry ch category)
	(setq ch (1+ ch)))
Kenichi Handa's avatar
Kenichi Handa committed
401
      (setq i (1+ i)))
Karl Heuer's avatar
Karl Heuer committed
402 403 404 405
    (setq deflist (cdr deflist))))

;; Vietnamese character set

Dave Love's avatar
Dave Love committed
406 407 408 409 410 411
;; To make a word with Latin characters
(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
(map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)

(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
(map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
Karl Heuer's avatar
Karl Heuer committed
412

413 414 415
;; Walk code points 32..127 of the two VISCII charsets in parallel:
;; register each upper/lower character as a case pair in the standard
;; case table, and tag the `ucs' encodings (when non-nil) with
;; category `v'.
(let ((case-tbl (standard-case-table)))
  (dotimes (offset 96)
    (let* ((code (+ 32 offset))
           (upper (decode-char 'vietnamese-viscii-upper code))
           (lower (decode-char 'vietnamese-viscii-lower code))
           (upper-ucs (encode-char upper 'ucs))
           (lower-ucs (encode-char lower 'ucs)))
      (set-case-syntax-pair upper lower case-tbl)
      (when upper-ucs
        (modify-category-entry upper-ucs ?v))
      (when lower-ucs
        (modify-category-entry lower-ucs ?v)))))

426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448
;; Tai Viet
(let ((deflist '(;; chars	syntax	category
		 ((?.  ?)	"w"	?0) ; cosonant
		 ("ꪱꪵꪶ"		"w"	?1) ; vowel base
		 ((? . ?)	"w"	?1) ; vowel base
		 ("ꪰꪲꪳꪷꪸꪾ"	"w"	?2) ; vowel upper
		 ("ꪴ"		"w"	?3) ; vowel lower
		 ("ꫀꫂ"		"w"	?1) ; non-combining tone-mark
		 ("꪿꫁"		"w"	?4) ; combining tone-mark
		 ((? . ?)	"_"	?5) ; symbol
		 )))
  (dolist (elm deflist)
    (let ((chars (car elm))
	  (syntax (nth 1 elm))
	  (category (nth 2 elm)))
      (if (consp chars)
	  (progn
	    (modify-syntax-entry chars syntax)
	    (modify-category-entry chars category))
	(mapc #'(lambda (x)
		  (modify-syntax-entry x syntax)
		  (modify-category-entry x category))
	      chars)))))
Dave Love's avatar
Dave Love committed
449 450 451 452

;; Latin

(modify-category-entry '(#x80 . #x024F) ?l)
453

454 455
(let ((tbl (standard-case-table)) c)

Dave Love's avatar
Dave Love committed
456 457 458 459 460 461
  ;; Latin-1

  ;; Fixme: Some of the non-word syntaxes here perhaps should be
  ;; reviewed.  (Note that the following all implicitly have word
  ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.)  There should be a well-defined way of
  ;; relating Unicode categories to Emacs syntax codes.
462 463 464 465

  ;; NBSP isn't semantically interchangeable with other whitespace chars,
  ;; so it's more like punctuation.
  (set-case-syntax ?  "." tbl)
Dave Love's avatar
Dave Love committed
466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489
  (set-case-syntax ?¡ "." tbl)
  (set-case-syntax ?¦ "_" tbl)
  (set-case-syntax ?§ "." tbl)
  (set-case-syntax ?© "_" tbl)
  (set-case-syntax-delims 171 187 tbl)	; « »
  (set-case-syntax ?¬ "_" tbl)
  (set-case-syntax ?­ "_" tbl)
  (set-case-syntax ?® "_" tbl)
  (set-case-syntax ?° "_" tbl)
  (set-case-syntax ?± "_" tbl)
  (set-case-syntax ?µ "_" tbl)
  (set-case-syntax ?· "_" tbl)
  (set-case-syntax ?¼ "_" tbl)
  (set-case-syntax ?½ "_" tbl)
  (set-case-syntax ?¾ "_" tbl)
  (set-case-syntax ?¿ "." tbl)
  (let ((c 192))
    (while (<= c 222)
      (set-case-syntax-pair c (+ c 32) tbl)
      (setq c (1+ c))))
  (set-case-syntax ?× "_" tbl)
  (set-case-syntax ?ß "w" tbl)
  (set-case-syntax ?÷ "_" tbl)
  ;; See below for ÿ.
490 491 492

  ;; Latin Extended-A, Latin Extended-B
  (setq c #x0100)
493 494
  (while (<= c #x02B8)
    (modify-category-entry c ?l)
495
    (setq c (1+ c)))
Kenichi Handa's avatar
Kenichi Handa committed
496

497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
  ;; Within each (FIRST . LAST) range the characters alternate
  ;; upper, lower, upper, lower, ...: every code point FIRST,
  ;; FIRST+2, ... below LAST is paired with its successor.
  (dolist (range '((#x0100 . #x012F)
                   (#x0132 . #x0137)
                   (#x0139 . #x0148)
                   (#x014a . #x0177)
                   (#x0179 . #x017E)
                   (#x0182 . #x0185)
                   (#x0187 . #x018C)
                   (#x0191 . #x0192)
                   (#x0198 . #x0199)
                   (#x01A0 . #x01A5)
                   (#x01A7 . #x01A8)
                   (#x01AC . #x01AD)
                   (#x01AF . #x01B0)
                   (#x01B3 . #x01B6)
                   (#x01BC . #x01BD)
                   (#x01CD . #x01DC)
                   (#x01DE . #x01EF)
                   (#x01F4 . #x01F5)
                   (#x01F8 . #x021F)
                   (#x0222 . #x0233)
                   (#x023B . #x023C)
                   (#x0241 . #x0242)
                   (#x0246 . #x024F)))
    (dolist (upper (number-sequence (car range) (1- (cdr range)) 2))
      (set-case-syntax-pair upper (1+ upper) tbl)))
Kenichi Handa's avatar
Kenichi Handa committed
525 526 527 528 529 530 531 532 533 534 535 536 537

  ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
  ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
  ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
  ;; SMALL LETTER I.

  ;; We used to set up half of those correspondence unconditionally,
  ;; but that makes searches slow.  So now we don't set up either half
  ;; of these correspondences by default.

  ;; (set-downcase-syntax  ?İ ?i tbl)
  ;; (set-upcase-syntax    ?I ?ı tbl)

538 539 540 541 542 543
  (set-case-syntax-pair ?DŽ ?dž tbl)
  (set-case-syntax-pair ?Dž ?dž tbl)
  (set-case-syntax-pair ?LJ ?lj tbl)
  (set-case-syntax-pair ?Lj ?lj tbl)
  (set-case-syntax-pair ?NJ ?nj tbl)
  (set-case-syntax-pair ?Nj ?nj tbl)
544

545
  ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
546 547 548 549
  (set-case-syntax-pair ?DZ ?dz tbl)
  (set-case-syntax-pair ?Dz ?dz tbl)
  (set-case-syntax-pair ?Ƕ ?ƕ tbl)
  (set-case-syntax-pair ?Ƿ ?ƿ tbl)
550

551
  ;; Latin Extended Additional
Dave Love's avatar
Dave Love committed
552
  (modify-category-entry '(#x1e00 . #x1ef9) ?l)
553
  (setq c #x1e00)
554 555 556
  (while (<= c #x1ef9)
    (and (zerop (% c 2))
	 (or (<= c #x1e94) (>= c #x1ea0))
Dave Love's avatar
Dave Love committed
557
	 (set-case-syntax-pair c (1+ c) tbl))
558 559
    (setq c (1+ c)))

560
  ;; Greek
Dave Love's avatar
Dave Love committed
561
  (modify-category-entry '(#x0370 . #x03ff) ?g)
562
  (setq c #x0370)
563 564 565
  (while (<= c #x03ff)
    (if (or (and (>= c #x0391) (<= c #x03a1))
	    (and (>= c #x03a3) (<= c #x03ab)))
Dave Love's avatar
Dave Love committed
566
	(set-case-syntax-pair c (+ c 32) tbl))
567 568 569
    (and (>= c #x03da)
	 (<= c #x03ee)
	 (zerop (% c 2))
Dave Love's avatar
Dave Love committed
570
	 (set-case-syntax-pair c (1+ c) tbl))
571
    (setq c (1+ c)))
572 573 574 575 576 577 578
  (set-case-syntax-pair ?Ά ?ά tbl)
  (set-case-syntax-pair ?Έ ?έ tbl)
  (set-case-syntax-pair ?Ή ?ή tbl)
  (set-case-syntax-pair ?Ί ?ί tbl)
  (set-case-syntax-pair ?Ό ?ό tbl)
  (set-case-syntax-pair ?Ύ ?ύ tbl)
  (set-case-syntax-pair ?Ώ ?ώ tbl)
579

580 581 582
  ;; Armenian
  (setq c #x531)
  (while (<= c #x556)
Dave Love's avatar
Dave Love committed
583
    (set-case-syntax-pair c (+ c #x30) tbl)
584 585
    (setq c (1+ c)))

586
  ;; Greek Extended
Dave Love's avatar
Dave Love committed
587
  (modify-category-entry '(#x1f00 . #x1fff) ?g)
588
  (setq c #x1f00)
  (while (<= c #x1fff)
    ;; Register C as the lowercase of the character 8 code points
    ;; above it, for every C that passes all of the filters below.
    (and (<= (logand c #x000f) 7)       ; low nibble 0..7 only
         (<= c #x1fa7)                  ; nothing past #x1fa7
         (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))
         ;; Skip the whole #x1f70..#x1f7f row.  Masking with #x00f0
         ;; can only yield multiples of 16, so the row has to be
         ;; compared against #x70; comparing against 7 is always
         ;; unequal and never excluded anything.
         (/= (logand c #x00f0) #x70)
         (set-case-syntax-pair (+ c 8) c tbl))
    (setq c (1+ c)))
596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
  (set-case-syntax-pair ? ? tbl)
620

621
  ;; cyrillic
Dave Love's avatar
Dave Love committed
622
  (modify-category-entry '(#x0400 . #x04FF) ?y)
623
  (setq c #x0400)
624 625 626
  (while (<= c #x04ff)
    (and (>= c #x0400)
	 (<= c #x040f)
Dave Love's avatar
Dave Love committed
627
	 (set-case-syntax-pair c (+ c 80) tbl))
628 629
    (and (>= c #x0410)
	 (<= c #x042f)
Dave Love's avatar
Dave Love committed
630
	 (set-case-syntax-pair c (+ c 32) tbl))
631 632 633 634
    (and (zerop (% c 2))
	 (or (and (>= c #x0460) (<= c #x0480))
	     (and (>= c #x048c) (<= c #x04be))
	     (and (>= c #x04d0) (<= c #x04f4)))
Kenichi Handa's avatar
Kenichi Handa committed
635
	 (set-case-syntax-pair c (1+ c) tbl))
636
    (setq c (1+ c)))
637 638 639 640 641
  (set-case-syntax-pair ?Ӂ ?ӂ tbl)
  (set-case-syntax-pair ?Ӄ ?ӄ tbl)
  (set-case-syntax-pair ?Ӈ ?ӈ tbl)
  (set-case-syntax-pair ?Ӌ ?ӌ tbl)
  (set-case-syntax-pair ?Ӹ ?ӹ tbl)
642

643 644
  ;; general punctuation
  ;; C sweeps upward from #x2000 without gaps.  Each segment is
  ;; (LAST . SYNTAX): every code point from the current C through LAST
  ;; gets SYNTAX, and the next segment picks up at LAST+1.
  (setq c #x2000)
  (dolist (segment '((#x200b . " ")
                     (#x200F . ".")
                     ;; Fixme: These aren't all right:
                     (#x2016 . "_")
                     ;; Punctuation syntax for quotation marks (like `)
                     (#x201f . ".")
                     ;; Fixme: These aren't all right:
                     (#x2027 . "_")
                     (#x206F . ".")))
    (while (<= c (car segment))
      (set-case-syntax c (cdr segment) tbl)
      (setq c (1+ c))))
667

668 669 670
  ;; Roman numerals
  (setq c #x2160)
  (while (<= c #x216f)
Dave Love's avatar
Dave Love committed
671
    (set-case-syntax-pair c (+ c #x10) tbl)
672 673
    (setq c (1+ c)))

Dave Love's avatar
Dave Love committed
674 675
  ;; Fixme: The following blocks might be better as symbol rather than
  ;; punctuation.
  ;; C sweeps upward from #x2190 without gaps.  Each segment is
  ;; (LAST . SYNTAX): every code point from the current C through LAST
  ;; gets SYNTAX, and the next segment picks up at LAST+1.
  (setq c #x2190)
  (dolist (segment '((#x21FF . ".")     ; Arrows
                     (#x22FF . ".")     ; Mathematical Operators
                     (#x23FF . ".")     ; Miscellaneous Technical
                     (#x243F . "_")))   ; Control Pictures
    (while (<= c (car segment))
      (set-case-syntax c (cdr segment) tbl)
      (setq c (1+ c))))

  ;; Circled Latin
  (setq c #x24b6)
  (while (<= c #x24cf)
Dave Love's avatar
Dave Love committed
697 698 699
    (set-case-syntax-pair c (+ c 26) tbl)
    (modify-category-entry c ?l)
    (modify-category-entry (+ c 26) ?l)
700 701 702 703 704
    (setq c (1+ c)))

  ;; Fullwidth Latin
  (setq c #xff21)
  (while (<= c #xff3a)
Dave Love's avatar
Dave Love committed
705 706 707
    (set-case-syntax-pair c (+ c #x20) tbl)
    (modify-category-entry c ?l)
    (modify-category-entry (+ c #x20) ?l)
708 709 710
    (setq c (1+ c)))

  ;; Combining diacritics
Dave Love's avatar
Dave Love committed
711
  (modify-category-entry '(#x300 . #x362) ?^)
712
  ;; Combining marks
Dave Love's avatar
Dave Love committed
713
  (modify-category-entry '(#x20d0 . #x20e3) ?^)
714 715 716

  ;; Fixme: syntax for symbols &c
  )
Kenichi Handa's avatar
Kenichi Handa committed
717 718

;; Give each bracket-like pair open/close parenthesis syntax, with the
;; two members registered as each other's matching character.
;;
;; The pairs are written as numeric code points (OPEN . CLOSE) rather
;; than as literal characters.  Several of these characters have
;; compatibility or canonical equivalents (e.g. the fullwidth forms
;; U+FF08.. fold to ASCII, and U+2329/U+232A fold to U+3008/U+3009),
;; so a literal in the source can silently turn into a different
;; character if the file ever passes through Unicode normalization.
(let ((pairs
       '((#x2045 . #x2046)
         (#x207D . #x207E)
         (#x208D . #x208E)
         (#x2329 . #x232A)
         (#x23B4 . #x23B5)
         (#x2768 . #x2769)
         (#x276A . #x276B)
         (#x276C . #x276D)
         (#x2770 . #x2771)
         (#x2772 . #x2773)
         (#x2774 . #x2775)
         (#x27E6 . #x27E7)
         (#x27E8 . #x27E9)
         (#x27EA . #x27EB)
         (#x2983 . #x2984)
         (#x2985 . #x2986)
         (#x2987 . #x2988)
         (#x2989 . #x298A)
         (#x298B . #x298C)
         (#x298D . #x298E)
         (#x298F . #x2990)
         (#x2991 . #x2992)
         (#x2993 . #x2994)
         (#x2995 . #x2996)
         (#x2997 . #x2998)
         (#x29FC . #x29FD)
         (#x3008 . #x3009)
         (#x300A . #x300B)
         (#x300C . #x300D)
         (#x300E . #x300F)
         (#x3010 . #x3011)
         (#x3014 . #x3015)
         (#x3016 . #x3017)
         (#x3018 . #x3019)
         (#x301A . #x301B)
         (#xFD3E . #xFD3F)
         (#xFE35 . #xFE36)
         (#xFE37 . #xFE38)
         (#xFE39 . #xFE3A)
         (#xFE3B . #xFE3C)
         (#xFE3D . #xFE3E)
         (#xFE3F . #xFE40)
         (#xFE41 . #xFE42)
         (#xFE43 . #xFE44)
         (#xFE59 . #xFE5A)
         (#xFE5B . #xFE5C)
         (#xFE5D . #xFE5E)
         (#xFF08 . #xFF09)
         (#xFF3B . #xFF3D)
         (#xFF5B . #xFF5D)
         (#xFF5F . #xFF60)
         (#xFF62 . #xFF63))))
  (dolist (pair pairs)
    (let ((open (car pair))
          (close (cdr pair)))
      ;; Syntax descriptor is the class character followed by the
      ;; matching character: "(CLOSE" for the opener, ")OPEN" for the
      ;; closer.
      (modify-syntax-entry open (string ?\( close))
      (modify-syntax-entry close (string ?\) open)))))

Karl Heuer's avatar
Karl Heuer committed
776

Kenichi Handa's avatar
Kenichi Handa committed
777
;; For each character set, record the coding system best suited to
;; encoding it in the charset's `preferred-coding-system' property.
Kenichi Handa's avatar
Kenichi Handa committed
779

Dave Love's avatar
Dave Love committed
780
;; Fixme: should this be junked?
Kenichi Handa's avatar
Kenichi Handa committed
781 782 783 784 785 786 787 788 789 790 791 792 793
;; Each entry is (CHARSET . CODING-SYSTEM); CODING-SYSTEM is stored as
;; CHARSET's `preferred-coding-system' property.
(dolist (entry
         '((latin-iso8859-1 . iso-latin-1)
           (latin-iso8859-2 . iso-latin-2)
           (latin-iso8859-3 . iso-latin-3)
           (latin-iso8859-4 . iso-latin-4)
           (thai-tis620 . thai-tis620)
           (greek-iso8859-7 . greek-iso-8bit)
           (arabic-iso8859-6 . iso-2022-7bit)
           (hebrew-iso8859-8 . hebrew-iso-8bit)
           (katakana-jisx0201 . japanese-shift-jis)
           (latin-jisx0201 . japanese-shift-jis)
           (cyrillic-iso8859-5 . cyrillic-iso-8bit)
           (latin-iso8859-9 . iso-latin-5)
           (japanese-jisx0208-1978 . iso-2022-jp)
           (chinese-gb2312 . chinese-iso-8bit)
           (chinese-gbk . chinese-gbk)
           (gb18030-2-byte . chinese-gb18030)
           (gb18030-4-byte-bmp . chinese-gb18030)
           (gb18030-4-byte-smp . chinese-gb18030)
           (gb18030-4-byte-ext-1 . chinese-gb18030)
           (gb18030-4-byte-ext-2 . chinese-gb18030)
           (japanese-jisx0208 . iso-2022-jp)
           (korean-ksc5601 . iso-2022-kr)
           (japanese-jisx0212 . iso-2022-jp)
           (chinese-big5-1 . chinese-big5)
           (chinese-big5-2 . chinese-big5)
           (chinese-sisheng . iso-2022-7bit)
           (ipa . iso-2022-7bit)
           (vietnamese-viscii-lower . vietnamese-viscii)
           (vietnamese-viscii-upper . vietnamese-viscii)
           (arabic-digit . iso-2022-7bit)
           (arabic-1-column . iso-2022-7bit)
           (lao . lao)
           (arabic-2-column . iso-2022-7bit)
           (indian-is13194 . devanagari)
           (indian-glyph . devanagari)
           (tibetan-1-column . tibetan)
           (ethiopic . iso-2022-7bit)
           (chinese-cns11643-1 . iso-2022-cn)
           (chinese-cns11643-2 . iso-2022-cn)
           (chinese-cns11643-3 . iso-2022-cn)
           (chinese-cns11643-4 . iso-2022-cn)
           (chinese-cns11643-5 . iso-2022-cn)
           (chinese-cns11643-6 . iso-2022-cn)
           (chinese-cns11643-7 . iso-2022-cn)
           (indian-2-column . devanagari)
           (tibetan . tibetan)
           (latin-iso8859-14 . iso-latin-8)
           (latin-iso8859-15 . iso-latin-9)))
  (put-charset-property (car entry) 'preferred-coding-system (cdr entry)))
Kenichi Handa's avatar
Kenichi Handa committed
832 833


834
;; Set up auto-fill-chars for charsets that should invoke auto-filling.
;; SPACE and NEWLINE are already set.
Kenichi Handa's avatar
Kenichi Handa committed
836 837 838 839 840 841 842 843

;; Mark every character in the following inclusive code point ranges
;; as one that triggers auto-filling.
(dolist (range '((#x3041 . #x30FF)
                 (#x3400 . #x4DB5)
                 (#x4e00 . #x9fbb)
                 (#xF900 . #xFAFF)
                 (#xFF00 . #xFF9F)
                 (#x20000 . #x2FFFF)))
  (set-char-table-range auto-fill-chars range t))

844

845 846 847 848
;;; Setting char-width-table.  The default is 1.

;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
;;    and final characters.
849
;; Each element is an inclusive (FROM . TO) code point range whose
;; characters are given display width 0 in `char-width-table'.
(dolist (zero-width-range
         '((#x0300 . #x036F)
           (#x0483 . #x0489)
           (#x0591 . #x05BD)
           (#x05BF . #x05BF)
           (#x05C1 . #x05C2)
           (#x05C4 . #x05C5)
           (#x05C7 . #x05C7)
           (#x0600 . #x0603)
           (#x0610 . #x0615)
           (#x064B . #x065E)
           (#x0670 . #x0670)
           (#x06D6 . #x06E4)
           (#x06E7 . #x06E8)
           (#x06EA . #x06ED)
           (#x070F . #x070F)
           (#x0711 . #x0711)
           (#x0730 . #x074A)
           (#x07A6 . #x07B0)
           (#x07EB . #x07F3)
           (#x0901 . #x0902)
           (#x093C . #x093C)
           (#x0941 . #x0948)
           (#x094D . #x094D)
           (#x0951 . #x0954)
           (#x0962 . #x0963)
           (#x0981 . #x0981)
           (#x09BC . #x09BC)
           (#x09C1 . #x09C4)
           (#x09CD . #x09CD)
           (#x09E2 . #x09E3)
           (#x0A01 . #x0A02)
           (#x0A3C . #x0A3C)
           (#x0A41 . #x0A4D)
           (#x0A70 . #x0A71)
           (#x0A81 . #x0A82)
           (#x0ABC . #x0ABC)
           (#x0AC1 . #x0AC8)
           (#x0ACD . #x0ACD)
           (#x0AE2 . #x0AE3)
           (#x0B01 . #x0B01)
           (#x0B3C . #x0B3C)
           (#x0B3F . #x0B3F)
           (#x0B41 . #x0B43)
           (#x0B4D . #x0B56)
           (#x0B82 . #x0B82)
           (#x0BC0 . #x0BC0)
           (#x0BCD . #x0BCD)
           (#x0C3E . #x0C40)
           (#x0C46 . #x0C56)
           (#x0CBC . #x0CBC)
           (#x0CBF . #x0CBF)
           (#x0CC6 . #x0CC6)
           (#x0CCC . #x0CCD)
           (#x0CE2 . #x0CE3)
           (#x0D41 . #x0D43)
           (#x0D4D . #x0D4D)
           (#x0DCA . #x0DCA)
           (#x0DD2 . #x0DD6)
           (#x0E31 . #x0E31)
           (#x0E34 . #x0E3A)
           (#x0E47 . #x0E4E)
           (#x0EB1 . #x0EB1)
           (#x0EB4 . #x0EBC)
           (#x0EC8 . #x0ECD)
           (#x0F18 . #x0F19)
           (#x0F35 . #x0F35)
           (#x0F37 . #x0F37)
           (#x0F39 . #x0F39)
           (#x0F71 . #x0F7E)
           (#x0F80 . #x0F84)
           (#x0F86 . #x0F87)
           (#x0F90 . #x0FBC)
           (#x0FC6 . #x0FC6)
           (#x102D . #x1030)
           (#x1032 . #x1037)
           (#x1039 . #x1039)
           (#x1058 . #x1059)
           (#x1160 . #x11FF)
           (#x135F . #x135F)
           (#x1712 . #x1714)
           (#x1732 . #x1734)
           (#x1752 . #x1753)
           (#x1772 . #x1773)
           (#x17B4 . #x17B5)
           (#x17B7 . #x17BD)
           (#x17C6 . #x17C6)
           (#x17C9 . #x17D3)
           (#x17DD . #x17DD)
           (#x180B . #x180D)
           (#x18A9 . #x18A9)
           (#x1920 . #x1922)
           (#x1927 . #x1928)
           (#x1932 . #x1932)
           (#x1939 . #x193B)
           (#x1A17 . #x1A18)
           (#x1B00 . #x1B03)
           (#x1B34 . #x1B34)
           (#x1B36 . #x1B3A)
           (#x1B3C . #x1B3C)
           (#x1B42 . #x1B42)
           (#x1B6B . #x1B73)
           (#x1DC0 . #x1DFF)
           (#x200B . #x200F)
           (#x202A . #x202E)
           (#x2060 . #x206F)
           (#x20D0 . #x20EF)
           (#x302A . #x302F)
           (#x3099 . #x309A)
           (#xA806 . #xA806)
           (#xA80B . #xA80B)
           (#xA825 . #xA826)
           (#xFB1E . #xFB1E)
           (#xFE00 . #xFE0F)
           (#xFE20 . #xFE23)
           (#xFEFF . #xFEFF)
           (#xFFF9 . #xFFFB)
           (#x10A01 . #x10A0F)
           (#x10A38 . #x10A3F)
           (#x1D167 . #x1D169)
           (#x1D173 . #x1D182)
           (#x1D185 . #x1D18B)
           (#x1D1AA . #x1D1AD)
           (#x1D242 . #x1D244)
           (#xE0001 . #xE01EF)))
  (set-char-table-range char-width-table zero-width-range 0))

;; 2: East Asian Wide and Full-width characters.
;; Each element is an inclusive (FROM . TO) code point range whose
;; characters are given display width 2 in `char-width-table'.
(dolist (double-width-range
         '((#x1100 . #x115F)
           (#x2329 . #x232A)
           (#x2E80 . #x303E)
           (#x3040 . #xA4CF)
           (#xAC00 . #xD7A3)
           (#xF900 . #xFAFF)
           (#xFE30 . #xFE6F)
           (#xFF01 . #xFF60)
           (#xFFE0 . #xFFE6)
           (#x20000 . #x2FFFF)
           (#x30000 . #x3FFFF)))
  (set-char-table-range char-width-table double-width-range 2))
990 991

;; Other double width
992 993 994 995 996 997
;;(map-charset-chars
;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'ethiopic)
;; (map-charset-chars
;;  (lambda (range ignore) (set-char-table-range char-width-table range 2))
;; 'tibetan)
998 999 1000 1001 1002 1003
;; Give every character of these charsets display width 2.
(dolist (two-column-charset '(indian-2-column arabic-2-column))
  (map-charset-chars
   (lambda (range _arg)
     (set-char-table-range char-width-table range 2))
   two-column-charset))
Kenichi Handa's avatar
Kenichi Handa committed
1004

1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
(defvar cjk-char-width-table
  ;; Build a table that assigns width 2 to every character of the
  ;; listed charsets and inherits everything else from
  ;; `char-width-table'.
  (let ((width-table (make-char-table nil))
        (wide-charsets '(big5 chinese-gb2312 chinese-cns11643-1
                         japanese-jisx0208 korean-ksc5601)))
    (while wide-charsets
      (map-charset-chars (lambda (range _arg)
                           (set-char-table-range width-table range 2))
                         (car wide-charsets))
      (setq wide-charsets (cdr wide-charsets)))
    (optimize-char-table width-table)
    ;; The parent is attached only after the table has been optimized.
    (set-char-table-parent width-table char-width-table)
    width-table)
  "Character width table used in CJK language environment.")

(defun use-cjk-char-width-table ()
  "Internal use only.
Set up `char-width-table' appropriately for a CJK language environment.
This makes `cjk-char-width-table' the value of `char-width-table'."
  (setq char-width-table cjk-char-width-table))
1021 1022 1023

(defun use-default-char-width-table ()
  "Internal use only.
Set up `char-width-table' appropriately for a non-CJK language environment.
This makes the parent of `cjk-char-width-table' the value of
`char-width-table'."
  (setq char-width-table (char-table-parent cjk-char-width-table)))
1026

Dave Love's avatar
Dave Love committed
1027 1028 1029 1030
;; Optimize the standard case, category and syntax tables, in that
;; order, now that they have been filled in.
(dolist (table-getter '(standard-case-table
                        standard-category-table
                        standard-syntax-table))
  (optimize-char-table (funcall table-getter)))

1031 1032 1033

;; Setting char-script-table.

Dave Love's avatar
Dave Love committed
1034 1035
;; The Unicode blocks actually extend past some of these ranges with
;; undefined codepoints.
Kenichi Handa's avatar
Kenichi Handa committed
1036 1037 1038
(let ((script-list nil))
  (dolist
      (elt
1039
       '((#x0000 #x007F latin)
1040 1041 1042
	 (#x00A0 #x024F latin)
	 (#x0250 #x02AF phonetic)
	 (#x02B0 #x036F latin)
Kenichi Handa's avatar
Kenichi Handa committed
1043 1044 1045 1046 1047 1048 1049 1050
	 (#x0370 #x03E1 greek)
	 (#x03E2 #x03EF coptic)
	 (#x03F0 #x03F3 greek)
	 (#x0400 #x04FF cyrillic)
	 (#x0530 #x058F armenian)
	 (#x0590 #x05FF hebrew)
	 (#x0600 #x06FF arabic)
	 (#x0700 #x074F syriac)
Kenichi Handa's avatar
Kenichi Handa committed
1051
	 (#x07C0 #x07FA nko)
Kenichi Handa's avatar
Kenichi Handa committed
1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068
	 (#x0780 #x07BF thaana)
	 (#x0900 #x097F devanagari)
	 (#x0980 #x09FF bengali)
	 (#x0A00 #x0A7F gurmukhi)
	 (#x0A80 #x0AFF gujarati)
	 (#x0B00 #x0B7F oriya)
	 (#x0B80 #x0BFF tamil)
	 (#x0C00 #x0C7F telugu)
	 (#x0C80 #x0CFF kannada)
	 (#x0D00 #x0D7F malayalam)
	 (#x0D80 #x0DFF sinhala)
	 (#x0E00 #x0E5F thai)
	 (#x0E80 #x0EDF lao)
	 (#x0F00 #x0FFF tibetan)
	 (#x1000 #x105F myanmar)
	 (#x10A0 #x10FF georgian)
	 (#x1100 #x11FF hangul)
Kenichi Handa's avatar
Kenichi Handa committed
1069
	 (#x1200 #x139F ethiopic)
Kenichi Handa's avatar
Kenichi Handa committed
1070 1071 1072 1073 1074 1075
	 (#x13A0 #x13FF cherokee)
	 (#x1400 #x167F canadian-aboriginal)
	 (#x1680 #x169F ogham)
	 (#x16A0 #x16FF runic)
	 (#x1780 #x17FF khmer)
	 (#x1800 #x18AF mongolian)
1076
	 (#x1D00 #x1DFF phonetic)
Kenichi Handa's avatar
Kenichi Handa committed
1077 1078
	 (#x1E00 #x1EFF latin)
	 (#x1F00 #x1FFF greek)
1079
	 (#x2000 #x27FF symbol)
Kenichi Handa's avatar
Kenichi Handa committed
1080
	 (#x2800 #x28FF braille)
Kenichi Handa's avatar
Kenichi Handa committed
1081
	 (#x2D80 #x2DDF ethiopic)
Kenichi Handa's avatar
Kenichi Handa committed
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
	 (#x2E80 #x2FDF han)
	 (#x2FF0 #x2FFF ideographic-description)
	 (#x3000 #x303F cjk-misc)
	 (#x3040 #x30FF kana)
	 (#x3100 #x312F bopomofo)
	 (#x3130 #x318F hangul)
	 (#x3190 #x319F kanbun)
	 (#x31A0 #x31BF bopomofo)
	 (#x3400 #x9FAF han)
	 (#xA000 #xA4CF yi)
Kenichi Handa's avatar
Kenichi Handa committed
1092