unidata-gen.el 42.7 KB
Newer Older
Kenichi Handa's avatar
Kenichi Handa committed
1
;; unidata-gen.el -- Create files containing character property data.
2
;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
Kenichi Handa's avatar
Kenichi Handa committed
3 4 5 6 7
;;   National Institute of Advanced Industrial Science and Technology (AIST)
;;   Registration Number H13PRO009

;; This file is part of GNU Emacs.

8
;; GNU Emacs is free software: you can redistribute it and/or modify
Kenichi Handa's avatar
Kenichi Handa committed
9
;; it under the terms of the GNU General Public License as published by
10 11
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
Kenichi Handa's avatar
Kenichi Handa committed
12 13 14 15 16 17 18

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
19
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
Kenichi Handa's avatar
Kenichi Handa committed
20 21 22

;;; Commentary:

Kenichi Handa's avatar
Kenichi Handa committed
23 24 25 26 27 28
;; SPECIAL NOTICE
;;
;;   This file must be byte-compilable/loadable by `temacs' and also
;;   the entry function `unidata-gen-files' must be runnable by
;;   `temacs'.

Kenichi Handa's avatar
Kenichi Handa committed
29 30
;; FILES TO BE GENERATED
;;
Kenichi Handa's avatar
Kenichi Handa committed
31
;;   The entry function `unidata-gen-files' generates these files in
Kenichi Handa's avatar
Kenichi Handa committed
32 33 34 35
;;   the current directory.
;;
;;   charprop.el
;;	It contains a series of forms of this format:
36
;;	  (define-char-code-property PROP FILE)
Kenichi Handa's avatar
Kenichi Handa committed
37
;;	where PROP is a symbol representing a character property
38
;;	(name, general-category, etc), and FILE is a name of one of
Kenichi Handa's avatar
Kenichi Handa committed
39 40
;;	the following files.
;;
41 42 43 44
;;   uni-name.el, uni-category.el, uni-combining.el, uni-bidi.el,
;;   uni-decomposition.el, uni-decimal.el, uni-digit.el, uni-numeric.el,
;;   uni-mirrored.el, uni-old-name.el, uni-comment.el, uni-uppercase.el,
;;   uni-lowercase.el, uni-titlecase.el
45 46
;;	They contain one or more forms of this format:
;;	  (define-char-code-property PROP CHAR-TABLE)
Kenichi Handa's avatar
Kenichi Handa committed
47
;;	where PROP is the same as above, and CHAR-TABLE is a
Kenichi Handa's avatar
Kenichi Handa committed
48
;;	char-table containing property values in a compressed format.
Kenichi Handa's avatar
Kenichi Handa committed
49 50 51
;;
;;   When they are installed in .../lisp/international/, the file
;;   "charprop.el" is preloaded in loadup.el.  The other files are
52 53 54
;;   automatically loaded when the Lisp functions
;;   `get-char-code-property' and `put-char-code-property', and C
;;   function uniprop_table are called.
Kenichi Handa's avatar
Kenichi Handa committed
55 56 57 58 59 60 61 62 63 64 65
;;
;; FORMAT OF A CHAR TABLE
;;
;;   We want to make a file size containing a char-table small.  We
;;   also want to load the file and get a property value fast.  We
;;   also want to reduce the used memory after loading it.  So,
;;   instead of naively storing a property value for each character in
;;   a char-table (and write it out into a file), we store compressed
;;   data in a char-table as below.
;;
;;   If succeeding 128*N characters have the same property value, we
66 67 68 69 70 71 72 73
;;   store that value (or the encoded one) for them.  Otherwise,
;;   compress values (or the encoded ones) for succeeding 128
;;   characters into a single string and store it for those
;;   characters.  The way of compression depends on a property.  See
;;   the section "SIMPLE TABLE", "RUN-LENGTH TABLE", and "WORD-LIST
;;   TABLE".

;;   The char table has five extra slots:
Kenichi Handa's avatar
Kenichi Handa committed
74
;;      1st: property symbol
75 76 77 78 79 80 81
;;	2nd: function to call to get a property value,
;;	     or an index number of C function to decode the value,
;;	     or nil if the value can be directly got from the table.
;;	3nd: function to call to put a property value,
;;	     or an index number of C function to encode the value,
;;	     or nil if the value can be directly stored in the table.
;;	4th: function to call to get a description of a property value, or nil
Kenichi Handa's avatar
Kenichi Handa committed
82 83 84 85 86 87 88
;;	5th: data referred by the above functions

;; List of elements of this form:
;;   (CHAR-or-RANGE PROP1 PROP2 ... PROPn)
;; CHAR-or-RANGE: a character code or a cons of character codes
;; PROPn: string representing the nth property value

89 90
(defvar unidata-list nil)

91 92 93 94 95
;; Name of the directory containing files of Unicode Character
;; Database.

(defvar unidata-dir nil)

96
(defun unidata-setup-list (unidata-text-file)
Kenichi Handa's avatar
Kenichi Handa committed
97 98 99 100 101 102 103
  (let* ((table (list nil))
	 (tail table)
	 (block-names '(("^<CJK Ideograph" . CJK\ IDEOGRAPH)
			("^<Hangul Syllable" . HANGUL\ SYLLABLE)
			("^<.*Surrogate" . nil)
			("^<.*Private Use" . PRIVATE\ USE)))
	 val char name)
104
    (setq unidata-text-file (expand-file-name unidata-text-file unidata-dir))
Kenichi Handa's avatar
Kenichi Handa committed
105 106 107
    (or (file-readable-p unidata-text-file)
	(error "File not readable: %s" unidata-text-file))
    (with-temp-buffer
108 109
      ;; Insert a file of this format:
      ;;   (CHAR NAME CATEGORY ...)
110
      ;; where CHAR is a character code, the following elements are strings
111
      ;; representing character properties.
Kenichi Handa's avatar
Kenichi Handa committed
112
      (insert-file-contents unidata-text-file)
Kenichi Handa's avatar
Kenichi Handa committed
113 114 115 116 117 118 119 120 121
      (goto-char (point-min))
      (condition-case nil
	  (while t
	    (setq val (read (current-buffer))
		  char (car val)
		  name (cadr val))

	    ;; Check this kind of block.
	    ;;   4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
122
	    ;;   9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
Kenichi Handa's avatar
Kenichi Handa committed
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
	    (if (and (= (aref name 0) ?<)
		     (string-match ", First>$" name))
		(let ((first char)
		      (l block-names)
		      block-name)
		  (setq val (read (current-buffer))
			char (car val)
			block-name (cadr val)
			name nil)
		  (while l
		    (if (string-match (caar l) block-name)
			(setq name (cdar l) l nil)
		      (setq l (cdr l))))
		  (if (not name)
		      ;; As this is a surrogate pair range, ignore it.
		      (setq val nil)
		    (setcar val (cons first char))
		    (setcar (cdr val) name))))

	    (when val
	      (setcdr tail (list val))
	      (setq tail (cdr tail))))
	(error nil)))
146
    (setq unidata-list (cdr table))))
Kenichi Handa's avatar
Kenichi Handa committed
147 148

;; Alist of this form:
149
;;   (PROP INDEX GENERATOR FILENAME DOCSTRING DESCRIBER DEFAULT VAL-LIST)
Kenichi Handa's avatar
Kenichi Handa committed
150
;; PROP: character property
151 152 153
;; INDEX: index to each element of unidata-list for PROP.
;;   It may be a function that generates an alist of character codes
;;   vs. the corresponding property values.
Kenichi Handa's avatar
Kenichi Handa committed
154 155
;; GENERATOR: function to generate a char-table
;; FILENAME: filename to store the char-table
156
;; DOCSTRING: docstring for the property
Kenichi Handa's avatar
Kenichi Handa committed
157
;; DESCRIBER: function to call to get a description string of property value
158 159 160 161 162
;; DEFAULT: the default value of the property.  It may have the form
;;   (VAL0 (FROM1 TO1 VAL1) ...) which indicates that the default
;;   value is VAL0 except for characters in the ranges specified by
;;   FROMn and TOn (incusive).  The default value of characters
;;   between FROMn and TOn is VALn.
163
;; VAL-LIST: list of specially ordered property values
Kenichi Handa's avatar
Kenichi Handa committed
164 165 166 167 168

(defconst unidata-prop-alist
  '((name
     1 unidata-gen-table-name "uni-name.el"
     "Unicode character name.
169 170 171
Property value is a string."
     nil
     "")
Kenichi Handa's avatar
Kenichi Handa committed
172 173 174 175 176 177
    (general-category
     2 unidata-gen-table-symbol "uni-category.el"
     "Unicode general category.
Property value is one of the following symbols:
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn"
178
     unidata-describe-general-category
179
     Cn
180 181 182 183
     ;; The order of elements must be in sync with unicode_category_t
     ;; in src/character.h.
     (Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po
	 Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co Cn))
Kenichi Handa's avatar
Kenichi Handa committed
184 185 186 187
    (canonical-combining-class
     3 unidata-gen-table-integer "uni-combining.el"
     "Unicode canonical combining class.
Property value is an integer."
188 189
     unidata-describe-canonical-combining-class
     0)
Kenichi Handa's avatar
Kenichi Handa committed
190 191 192 193 194 195
    (bidi-class
     4 unidata-gen-table-symbol "uni-bidi.el"
     "Unicode bidi class.
Property value is one of the following symbols:
  L, LRE, LRO, R, AL, RLE, RLO, PDF, EN, ES, ET,
  AN, CS, NSM, BN, B, S, WS, ON"
196
     unidata-describe-bidi-class
197 198 199 200 201
     ;; The assignment of default values to blocks of code points
     ;; follows the file DerivedBidiClass.txt from the Unicode
     ;; Character Database (UCD).
     (L (#x0600 #x06FF AL) (#xFB50 #xFDFF AL) (#xFE70 #xFEFF AL)
	(#x0590 #x05FF R) (#x07C0 #x08FF R)
202
	(#xFB1D #xFB4F R) (#x10800 #x10FFF R) (#x1E800 #x1EFFF R))
203 204 205
     ;; The order of elements must be in sync with bidi_type_t in
     ;; src/dispextern.h.
     (L R EN AN BN B AL LRE LRO RLE RLO PDF ES ET CS NSM S WS ON))
Kenichi Handa's avatar
Kenichi Handa committed
206 207 208 209
    (decomposition
     5 unidata-gen-table-decomposition "uni-decomposition.el"
     "Unicode decomposition mapping.
Property value is a list of characters.  The first element may be
Kenichi Handa's avatar
Kenichi Handa committed
210
one of these symbols representing compatibility formatting tag:
211 212
  font, noBreak, initial, medial, final, isolated, circle, super,
  sub, vertical, wide, narrow, small, square, fraction, compat"
Kenichi Handa's avatar
Kenichi Handa committed
213 214 215 216
     unidata-describe-decomposition)
    (decimal-digit-value
     6 unidata-gen-table-integer "uni-decimal.el"
     "Unicode numeric value (decimal digit).
217 218
Property value is an integer 0..9, or nil.
The value nil stands for NaN \"Numeric_Value\".")
Kenichi Handa's avatar
Kenichi Handa committed
219 220 221
    (digit-value
     7 unidata-gen-table-integer "uni-digit.el"
     "Unicode numeric value (digit).
222 223
Property value is an integer 0..9, or nil.
The value nil stands for NaN \"Numeric_Value\".")
Kenichi Handa's avatar
Kenichi Handa committed
224
    (numeric-value
225
     8 unidata-gen-table-numeric "uni-numeric.el"
Kenichi Handa's avatar
Kenichi Handa committed
226
     "Unicode numeric value (numeric).
227 228
Property value is an integer, a floating point, or nil.
The value nil stands for NaN \"Numeric_Value\".")
Kenichi Handa's avatar
Kenichi Handa committed
229 230 231
    (mirrored
     9 unidata-gen-table-symbol "uni-mirrored.el"
     "Unicode bidi mirrored flag.
232 233 234
Property value is a symbol `Y' or `N'.  See also the property `mirroring'."
     nil
     N)
Kenichi Handa's avatar
Kenichi Handa committed
235 236 237 238 239 240 241 242 243 244 245
    (old-name
     10 unidata-gen-table-name "uni-old-name.el"
     "Unicode old names as published in Unicode 1.0.
Property value is a string.")
    (iso-10646-comment
     11 unidata-gen-table-name "uni-comment.el"
     "Unicode ISO 10646 comment.
Property value is a string.")
    (uppercase
     12 unidata-gen-table-character "uni-uppercase.el"
     "Unicode simple uppercase mapping.
246 247 248
Property value is a character or nil.
The value nil means that the actual property value of a character
is the character itself."
Kenichi Handa's avatar
Kenichi Handa committed
249 250 251 252
     string)
    (lowercase
     13 unidata-gen-table-character "uni-lowercase.el"
     "Unicode simple lowercase mapping.
253 254 255
Property value is a character or nil.
The value nil means that the actual property value of a character
is the character itself."
Kenichi Handa's avatar
Kenichi Handa committed
256 257 258 259
     string)
    (titlecase
     14 unidata-gen-table-character "uni-titlecase.el"
     "Unicode simple titlecase mapping.
260 261 262
Property value is a character or nil.
The value nil means that the actual property value of a character
is the character itself."
263 264 265 266
     string)
    (mirroring
     unidata-gen-mirroring-list unidata-gen-table-character "uni-mirrored.el"
     "Unicode bidi-mirroring characters.
267 268 269
Property value is a character that has the corresponding mirroring image or nil.
The value nil means that the actual property value of a character
is the character itself.")))
Kenichi Handa's avatar
Kenichi Handa committed
270 271 272 273 274 275 276

;; Functions to access the above data.
(defsubst unidata-prop-index (prop) (nth 1 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-generator (prop) (nth 2 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-file (prop) (nth 3 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-docstring (prop) (nth 4 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-describer (prop) (nth 5 (assq prop unidata-prop-alist)))
277 278
(defsubst unidata-prop-default (prop) (nth 6 (assq prop unidata-prop-alist)))
(defsubst unidata-prop-val-list (prop) (nth 7 (assq prop unidata-prop-alist)))
Kenichi Handa's avatar
Kenichi Handa committed
279 280 281 282 283 284 285 286


;; SIMPLE TABLE
;;
;; If the type of character property value is character, and the
;; values of succeeding character codes are usually different, we use
;; a char-table described here to store such values.
;;
287 288 289 290 291 292
;; A char-table divides character code space (#x0..#x3FFFFF) into
;; #x8000 blocks (each block contains 128 characters).

;; If all characters of a block have no property, a char-table has the
;; symbol nil for that block.  Otherwise a char-table has a string of
;; the following format for it.
Kenichi Handa's avatar
Kenichi Handa committed
293
;;
294 295 296 297 298
;; The first character of the string is ?\001.
;; The second character of the string is FIRST-INDEX.
;; The Nth (N > 1) character of the string is a property value of the
;; character (BLOCK-HEAD + FIRST-INDEX + N - 2), where BLOCK-HEAD is
;; the first character of the block.
Kenichi Handa's avatar
Kenichi Handa committed
299
;;
300 301 302 303 304
;; This kind of char-table has these extra slots:
;;   1st: the property symbol
;;   2nd: nil
;;   3rd: 0 (corresponding to uniprop_encode_character in chartab.c)
;;   4th to 5th: nil
Kenichi Handa's avatar
Kenichi Handa committed
305

306
(defun unidata-gen-table-character (prop &rest ignore)
Kenichi Handa's avatar
Kenichi Handa committed
307 308 309 310 311
  (let ((table (make-char-table 'char-code-property-table))
	(prop-idx (unidata-prop-index prop))
	(vec (make-vector 128 0))
	(tail unidata-list)
	elt range val idx slot)
312 313 314
    (if (functionp prop-idx)
	(setq tail (funcall prop-idx)
	      prop-idx 1))
Kenichi Handa's avatar
Kenichi Handa committed
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
    (while tail
      (setq elt (car tail) tail (cdr tail))
      (setq range (car elt)
	    val (nth prop-idx elt))
      (if (= (length val) 0)
	  (setq val nil)
	(setq val (string-to-number val 16)))
      (if (consp range)
	  (if val
	      (set-char-table-range table range val))
	(let* ((start (lsh (lsh range -7) 7))
	       (limit (+ start 127))
	       first-index last-index)
	  (fillarray vec 0)
	  (if val
	      (aset vec (setq last-index (setq first-index (- range start)))
		    val))
	  (while (and (setq elt (car tail) range (car elt))
		      (integerp range)
		      (<= range limit))
	    (setq val (nth prop-idx elt))
	    (when (> (length val) 0)
	      (aset vec (setq last-index (- range start))
		    (string-to-number val 16))
	      (or first-index
		  (setq first-index last-index)))
	    (setq tail (cdr tail)))
	  (when first-index
343
	    (let ((str (string 1 first-index))
Kenichi Handa's avatar
Kenichi Handa committed
344 345 346 347 348 349 350
		  c)
	      (while (<= first-index last-index)
		(setq str (format "%s%c"  str (or (aref vec first-index) 0))
		      first-index (1+ first-index)))
	      (set-char-table-range table (cons start limit) str))))))

    (set-char-table-extra-slot table 0 prop)
351
    (set-char-table-extra-slot table 2 0)
Kenichi Handa's avatar
Kenichi Handa committed
352 353 354 355 356 357
    table))



;; RUN-LENGTH TABLE
;;
358 359 360
;; If many characters of successive character codes have the same
;; property value, we use a char-table described here to store the
;; values.
Kenichi Handa's avatar
Kenichi Handa committed
361
;;
362 363 364 365 366 367 368 369 370 371 372 373 374
;; At first, instead of a value itself, we store an index number to
;; the VAL-TABLE (5th extra slot) in the table.  We call that index
;; number as VAL-CODE here after.
;;
;; A char-table divides character code space (#x0..#x3FFFFF) into
;; #x8000 blocks (each block contains 128 characters).
;;
;; If all characters of a block have the same value, a char-table has
;; VAL-CODE for that block.  Otherwise a char-table has a string of
;; the following format for that block.
;;
;; The first character of the string is ?\002.
;; The following characters has this form:
Kenichi Handa's avatar
Kenichi Handa committed
375 376
;;   ( VAL-CODE RUN-LENGTH ? ) +
;; where:
377
;;   VAL-CODE (0..127): index into VAL-TABLE.
Kenichi Handa's avatar
Kenichi Handa committed
378 379 380
;;   RUN-LENGTH (130..255):
;;     (RUN-LENGTH - 128) specifies how many characters have the same
;;     value.  If omitted, it means 1.
381 382 383 384 385 386 387
;;
;; This kind of char-table has these extra slots:
;;   1st: the property symbol
;;   2nd: 0 (corresponding to uniprop_decode_value in chartab.c)
;;   3rd: 1..3 (corresponding to uniprop_encode_xxx in chartab.c)
;;   4th: function or nil
;;   5th: VAL-TABLE
388

Kenichi Handa's avatar
Kenichi Handa committed
389 390 391
;; Encode the character property value VAL into an integer value by
;; VAL-LIST.  By side effect, VAL-LIST is modified.
;; VAL-LIST has this form:
392 393 394 395
;;   ((nil . 0) (VAL1 . 1) (VAL2 . 2) ...)
;; If VAL is one of VALn, just return n.
;; Otherwise, VAL-LIST is modified to this:
;;   ((nil . 0) (VAL1 . 1) (VAL2 . 2) ... (VAL . n+1))
Kenichi Handa's avatar
Kenichi Handa committed
396 397

(defun unidata-encode-val (val-list val)
398
  (let ((slot (assoc val val-list))
Kenichi Handa's avatar
Kenichi Handa committed
399 400 401
	val-code)
    (if slot
	(cdr slot)
402 403
      (setq val-code (length val-list))
      (nconc val-list (list (cons val val-code)))
Kenichi Handa's avatar
Kenichi Handa committed
404 405 406 407
      val-code)))

;; Generate a char-table for the character property PROP.

408
(defun unidata-gen-table (prop val-func default-value val-list)
Kenichi Handa's avatar
Kenichi Handa committed
409 410 411
  (let ((table (make-char-table 'char-code-property-table))
	(prop-idx (unidata-prop-index prop))
	(vec (make-vector 128 0))
412 413
	tail elt range val val-code idx slot
	prev-range-data)
414 415 416 417 418 419
    (setq val-list (cons nil (copy-sequence val-list)))
    (setq tail val-list val-code 0)
    ;; Convert (nil A B ...) to ((nil . 0) (A . 1) (B . 2) ...)
    (while tail
      (setcar tail (cons (car tail) val-code))
      (setq tail (cdr tail) val-code (1+ val-code)))
420 421 422 423 424 425 426 427 428 429 430 431
    (if (consp default-value)
	(setq default-value (copy-sequence default-value))
      (setq default-value (list default-value)))
    (setcar default-value
	    (unidata-encode-val val-list (car default-value)))
    (set-char-table-range table t (car default-value))
    (set-char-table-range table nil (car default-value))
    (dolist (elm (cdr default-value))
      (setcar (nthcdr 2 elm)
	      (unidata-encode-val val-list (nth 2 elm)))
      (set-char-table-range table (cons (car elm) (nth 1 elm)) (nth 2 elm)))

Kenichi Handa's avatar
Kenichi Handa committed
432 433 434 435 436 437 438
    (setq tail unidata-list)
    (while tail
      (setq elt (car tail) tail (cdr tail))
      (setq range (car elt)
	    val (funcall val-func (nth prop-idx elt)))
      (setq val-code (if val (unidata-encode-val val-list val)))
      (if (consp range)
439
	  (when val-code
440
	    (set-char-table-range table range val-code)
441 442 443 444 445 446 447 448 449 450 451 452
	    (let ((from (car range)) (to (cdr range)))
	      ;; If RANGE doesn't end at the char-table boundary (each
	      ;; 128 characters), we may have to carry over the data
	      ;; for the last several characters (at most 127 chars)
	      ;; to the next loop.  In that case, set PREV-RANGE-DATA
	      ;; to ((FROM . TO) . VAL-CODE) where (FROM . TO)
	      ;; specifies the range of characters handled in the next
	      ;; loop.
	      (when (< (logand to #x7F) #x7F)
		(if (< from (logand to #x1FFF80))
		    (setq from (logand to #x1FFF80)))
		(setq prev-range-data (cons (cons from to) val-code)))))
Kenichi Handa's avatar
Kenichi Handa committed
453 454
	(let* ((start (lsh (lsh range -7) 7))
	       (limit (+ start 127))
455 456 457 458 459 460 461 462 463
	       str count new-val from to vcode)
	  (fillarray vec (car default-value))
	  (dolist (elm (cdr default-value))
	    (setq from (car elm) to (nth 1 elm))
	    (when (and (<= from limit)
		       (or (>= from start) (>= to start)))
	      (setq from (max from start)
		    to (min to limit)
		    vcode (nth 2 elm))
464 465 466
	      (while (<= from to)
		(aset vec (- from start) vcode)
		(setq from (1+ from)))))
467 468 469 470 471 472 473 474 475
	  ;; See the comment above.
	  (when (and prev-range-data
		     (>= (cdr (car prev-range-data)) start))
	    (setq from (car (car prev-range-data))
		  to (cdr (car prev-range-data))
		  vcode (cdr prev-range-data))
	    (while (<= from to)
	      (aset vec (- from start) vcode)
	      (setq from (1+ from))))
476
	  (setq prev-range-data nil)
Kenichi Handa's avatar
Kenichi Handa committed
477 478 479 480 481 482 483 484 485 486 487 488
	  (if val-code
	      (aset vec (- range start) val-code))
	  (while (and (setq elt (car tail) range (car elt))
		      (integerp range)
		      (<= range limit))
	    (setq new-val (funcall val-func (nth prop-idx elt)))
	    (if (not (eq val new-val))
		(setq val new-val
		      val-code (if val (unidata-encode-val val-list val))))
	    (if val-code
		(aset vec (- range start) val-code))
	    (setq tail (cdr tail)))
489
	  (setq str "\002" val-code -1 count 0)
Kenichi Handa's avatar
Kenichi Handa committed
490 491 492 493 494 495 496 497 498 499 500 501 502 503
	  (mapc #'(lambda (x)
		    (if (= val-code x)
			(setq count (1+ count))
		      (if (> count 2)
			  (setq str (concat str (string val-code
							(+ count 128))))
			(if (= count 2)
			    (setq str (concat str (string val-code val-code)))
			  (if (= count 1)
			      (setq str (concat str (string val-code))))))
		      (setq val-code x count 1)))
		vec)
	  (if (= count 128)
	      (if val
504
		  (set-char-table-range table (cons start limit) val-code))
Kenichi Handa's avatar
Kenichi Handa committed
505 506 507 508 509 510 511 512 513
	    (if (= val-code 0)
		(set-char-table-range table (cons start limit) str)
	      (if (> count 2)
		  (setq str (concat str (string val-code (+ count 128))))
		(if (= count 2)
		    (setq str (concat str (string val-code val-code)))
		  (setq str (concat str (string val-code)))))
	      (set-char-table-range table (cons start limit) str))))))

514
    (set-char-table-extra-slot table 0 prop)
Kenichi Handa's avatar
Kenichi Handa committed
515 516 517
    (set-char-table-extra-slot table 4 (vconcat (mapcar 'car val-list)))
    table))

518
(defun unidata-gen-table-symbol (prop default-value val-list)
Kenichi Handa's avatar
Kenichi Handa committed
519 520 521
  (let ((table (unidata-gen-table prop
				  #'(lambda (x) (and (> (length x) 0)
						     (intern x)))
522 523 524
				  default-value val-list)))
    (set-char-table-extra-slot table 1 0)
    (set-char-table-extra-slot table 2 1)
Kenichi Handa's avatar
Kenichi Handa committed
525 526
    table))

527
(defun unidata-gen-table-integer (prop default-value val-list)
Kenichi Handa's avatar
Kenichi Handa committed
528 529 530
  (let ((table (unidata-gen-table prop
				  #'(lambda (x) (and (> (length x) 0)
						     (string-to-number x)))
531 532 533
				  default-value val-list)))
    (set-char-table-extra-slot table 1 0)
    (set-char-table-extra-slot table 2 1)
Kenichi Handa's avatar
Kenichi Handa committed
534 535
    table))

536
(defun unidata-gen-table-numeric (prop default-value val-list)
537 538 539 540 541 542 543 544
  (let ((table (unidata-gen-table prop
				  #'(lambda (x)
				      (if (string-match "/" x)
					  (/ (float (string-to-number x))
					     (string-to-number
					      (substring x (match-end 0))))
					(if (> (length x) 0)
					    (string-to-number x))))
545 546 547
				  default-value val-list)))
    (set-char-table-extra-slot table 1 0)
    (set-char-table-extra-slot table 2 2)
548 549
    table))

Kenichi Handa's avatar
Kenichi Handa committed
550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587

;; WORD-LIST TABLE

;;   If the table is for `name' property, each character in the string
;;   is one of these:
;;	DIFF-HEAD-CODE (0, 1, or 2):
;;	  specifies how to decode the following characters.
;;	WORD-CODE (3..#x7FF excluding '-', '0'..'9', 'A'..'Z'):
;;	  specifies an index number into WORD-TABLE (see below)
;;      Otherwise (' ', '-', '0'..'9', 'A'..'Z'):
;;	  specifies a literal word.
;;
;;   The 4th slots is a vector:
;;	[ WORD-TABLE BLOCK-NAME HANGUL-JAMO-TABLE ]
;;   WORD-TABLE is a vector of word symbols.
;;   BLOCK-NAME is a vector of name symbols for a block of characters.
;;   HANGUL-JAMO-TABLE is `unidata-name-jamo-name-table'.

;; Return the difference of symbol list L1 and L2 in this form:
;;   (DIFF-HEAD SYM1 SYM2 ...)
;; DIFF-HEAD is ((SAME-HEAD-LENGTH * 16) + SAME-TAIL-LENGTH).
;; Ex: If L1 is (a b c d e f) and L2 is (a g h e f), this function
;; returns ((+ (* 1 16) 2) g h).
;; It means that we can get L2 from L1 by prepending the first element
;; of L1 and appending the last 2 elements of L1 to the list (g h).
;; If L1 and L2 don't have common elements at the head and tail,
;; set DIFF-HEAD to -1 and SYM1 ... to the elements of L2.

(defun unidata-word-list-diff (l1 l2)
  (let ((beg 0)
	(end 0)
	(len1 (length l1))
	(len2 (length l2))
	result)
    (when (< len1 16)
      (while (and l1 (eq (car l1) (car l2)))
	(setq beg (1+ beg)
	      l1 (cdr l1) len1 (1- len1) l2 (cdr l2) len2 (1- len2)))
588
      (while (and (< end len1) (< end len2)
Kenichi Handa's avatar
Kenichi Handa committed
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659
		  (eq (nth (- len1 end 1) l1) (nth (- len2 end 1) l2)))
	(setq end (1+ end))))
    (if (= (+ beg end) 0)
	(setq result (list -1))
      (setq result (list (+ (* beg 16) (+ beg (- len1 end))))))
    (while (< end len2)
      (setcdr result (cons (nth (- len2 end 1) l2) (cdr result)))
      (setq end (1+ end)))
    result))

;; Return a compressed form of the vector VEC.  Each element of VEC is
;; a list of symbols of which names can be concatenated to form a
;; character name.  This function changes those elements into
;; compressed forms by utilizing the fact that diff of consecutive
;; elements is usually small.

(defun unidata-word-list-compress (vec)
  (let (last-elt last-idx diff-head tail elt val)
    (dotimes (i 128)
      (setq elt (aref vec i))
      (when elt
	(if (null last-elt)
	    (setq diff-head -1
		  val (cons 0 elt))
	  (setq val (unidata-word-list-diff last-elt elt))
	  (if (= (car val) -1)
	      (setq diff-head -1
		    val (cons 0 (cdr val)))
	    (if (eq diff-head (car val))
		(setq val (cons 2 (cdr val)))
	      (setq diff-head (car val))
	      (if (>= diff-head 0)
		  (setq val (cons 1 val))))))
	(aset vec i val)
	(setq last-idx i last-elt elt)))
    (if (not last-idx)
	(setq vec nil)
      (if (< last-idx 127)
	  (let ((shorter (make-vector (1+ last-idx) nil)))
	    (dotimes (i (1+ last-idx))
	      (aset shorter i (aref vec i)))
	    (setq vec shorter))))
    vec))

;; Encode the word index IDX into a characters code that can be
;; embedded in a string.

(defsubst unidata-encode-word (idx)
  ;; Exclude 0, 1, 2.
  (+ idx 3))

;; Decode the character code CODE (that is embedded in a string) into
;; the corresponding word name by looking up WORD-TABLE.

(defsubst unidata-decode-word (code word-table)
  (setq code (- code 3))
  (if (< code (length word-table))
      (aref word-table code)))

;; Table of short transliterated name symbols of Hangul Jamo divided
;; into Choseong, Jungseong, and Jongseong.

(defconst unidata-name-jamo-name-table
  [[G GG N D DD R M B BB S SS nil J JJ C K T P H]
   [A AE YA YAE EO E YEO YE O WA WAE OE YO U WEO WE WI YU EU YI I]
   [G GG GS N NJ NH D L LG LM LB LS LT LP LH M B BS S SS NG J C K T P H]])

;; Return a name of CHAR.  VAL is the current value of (aref TABLE
;; CHAR).

(defun unidata-get-name (char val table)
660
  (cond
Kenichi Handa's avatar
Kenichi Handa committed
661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690
   ((stringp val)
    (if (> (aref val 0) 0)
	val
      (let* ((first-char (lsh (lsh char -7) 7))
	     (word-table (aref (char-table-extra-slot table 4) 0))
	     (i 1)
	     (len (length val))
	     (vec (make-vector 128 nil))
	     (idx 0)
	     (case-fold-search nil)
	     c word-list tail-list last-list word diff-head)
	(while (< i len)
	  (setq c (aref val i))
	  (if (< c 3)
	      (progn
		(if (or word-list tail-list)
		    (aset vec idx
			  (setq last-list (nconc word-list tail-list))))
		(setq i (1+ i) idx (1+ idx)
		      word-list nil tail-list nil)
		(if (> c 0)
		    (let ((l last-list))
		      (if (= c 1)
			  (setq diff-head
				(prog1 (aref val i) (setq i (1+ i)))))
		      (setq tail-list (nthcdr (% diff-head 16) last-list))
		      (dotimes (i (/ diff-head 16))
			(setq word-list (nconc word-list (list (car l)))
			      l (cdr l))))))
	    (setq word-list
691
		  (nconc word-list
Kenichi Handa's avatar
Kenichi Handa committed
692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714
			 (list (symbol-name
				(unidata-decode-word c word-table))))
		  i (1+ i))))
	(if (or word-list tail-list)
	    (aset vec idx (nconc word-list tail-list)))
	(setq val nil)
	(dotimes (i 128)
	  (setq c (+ first-char i))
	  (let ((name (aref vec i)))
	    (if name
		(let ((tail (cdr (setq name (copy-sequence name))))
		      elt)
		  (while tail
		    (setq elt (car tail))
		    (or (string= elt "-")
			(progn
			  (setcdr tail (cons elt (cdr tail)))
			  (setcar tail " ")))
		    (setq tail (cddr tail)))
		  (setq name (apply 'concat name))))
	    (aset table c name)
	    (if (= c char)
		(setq val name))))
715
	(or val ""))))
Kenichi Handa's avatar
Kenichi Handa committed
716 717 718 719 720 721 722 723 724 725 726 727 728 729

   ((and (integerp val) (> val 0))
    (let* ((symbol-table (aref (char-table-extra-slot table 4) 1))
	   (sym (aref symbol-table (1- val))))
      (cond ((eq sym 'HANGUL\ SYLLABLE)
	     (let ((jamo-name-table (aref (char-table-extra-slot table 4) 2)))
	       ;; SIndex = S - SBase
	       (setq char (- char #xAC00))
	       (let ( ;; LIndex = SIndex / NCount
		     (L (/ char 588))
		     ;; VIndex = (SIndex % NCount) * TCount
		     (V (/ (% char 588) 28))
		     ;; TIndex = SIndex % TCount
		     (T (% char 28)))
730
		 (format "HANGUL SYLLABLE %s%s%s"
731 732
			 ;; U+110B is nil in this table.
			 (or (aref (aref jamo-name-table 0) L) "")
Kenichi Handa's avatar
Kenichi Handa committed
733 734 735
			 (aref (aref jamo-name-table 1) V)
			 (if (= T 0) ""
			   (aref (aref jamo-name-table 2) (1- T)))))))
736 737
	    ((eq sym 'CJK\ IDEOGRAPH)
	     (format "%s-%04X" sym char))
Kenichi Handa's avatar
Kenichi Handa committed
738 739 740
	    ((eq sym 'CJK\ COMPATIBILITY\ IDEOGRAPH)
	     (format "%s-%04X" sym char))
	    ((eq sym 'VARIATION\ SELECTOR)
741 742 743
	     (format "%s-%d" sym (+ (- char #xe0100) 17))))))

   (t "")))
Kenichi Handa's avatar
Kenichi Handa committed
744 745 746 747 748 749 750 751 752 753 754

;; Store VAL as the name of CHAR in TABLE.

(defun unidata-put-name (char val table)
  (let ((current-val (aref table char)))
    (if (and (stringp current-val) (= (aref current-val 0) 0))
	(funcall (char-table-extra-slot table 1) char current-val table))
    (aset table char val)))

(defun unidata-get-decomposition (char val table)
  (cond
755 756 757
   ((not val)
    (list char))

Kenichi Handa's avatar
Kenichi Handa committed
758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790
   ((consp val)
    val)

   ((stringp val)
    (if (> (aref val 0) 0)
	val
      (let* ((first-char (lsh (lsh char -7) 7))
	     (word-table (char-table-extra-slot table 4))
	     (i 1)
	     (len (length val))
	     (vec (make-vector 128 nil))
	     (idx 0)
	     (case-fold-search nil)
	     c word-list tail-list last-list word diff-head)
	(while (< i len)
	  (setq c (aref val i))
	  (if (< c 3)
	      (progn
		(if (or word-list tail-list)
		    (aset vec idx
			  (setq last-list (nconc word-list tail-list))))
		(setq i (1+ i) idx (1+ idx)
		      word-list nil tail-list nil)
		(if (> c 0)
		    (let ((l last-list))
		      (if (= c 1)
			  (setq diff-head
				(prog1 (aref val i) (setq i (1+ i)))))
		      (setq tail-list (nthcdr (% diff-head 16) last-list))
		      (dotimes (i (/ diff-head 16))
			(setq word-list (nconc word-list (list (car l)))
			      l (cdr l))))))
	    (setq word-list
791
		  (nconc word-list
Kenichi Handa's avatar
Kenichi Handa committed
792 793 794 795 796 797
			 (list (or (unidata-decode-word c word-table) c)))
		  i (1+ i))))
	(if (or word-list tail-list)
	    (aset vec idx (nconc word-list tail-list)))
	(dotimes (i 128)
	  (aset table (+ first-char i) (aref vec i)))
798 799
	(setq val (aref vec (- char first-char)))
	(or val (list char)))))
Kenichi Handa's avatar
Kenichi Handa committed
800

Kenichi Handa's avatar
Kenichi Handa committed
801 802 803 804 805 806 807 808
   ;; Hangul syllable
   ((and (eq val 0) (>= char #xAC00) (<= char #xD7A3))
    ;; SIndex = S (char) - SBase (#xAC00)
    (setq char (- char #xAC00))
    (let (;; L = LBase + SIndex / NCount
	  (L (+ #x1100 (/ char 588)))
	  ;; V = VBase + (SIndex % NCount) * TCount
	  (V (+ #x1161 (/ (% char 588) 28)))
809 810
	  ;; LV = SBase + (SIndex / TCount) * TCount
	  (LV (+ #xAC00 (* (/ char 28) 28)))
Kenichi Handa's avatar
Kenichi Handa committed
811 812
	  ;; T = TBase + SIndex % TCount
	  (T (+ #x11A7 (% char 28))))
813 814
      (if (= T #x11A7)
	  (list L V)
815
	(list LV T))))
Kenichi Handa's avatar
Kenichi Handa committed
816 817 818 819

   ))

;; Store VAL as the decomposition information of CHAR in TABLE.
Kenichi Handa's avatar
Kenichi Handa committed
820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923

(defun unidata-put-decomposition (char val table)
  (let ((current-val (aref table char)))
    (if (and (stringp current-val) (= (aref current-val 0) 0))
	(funcall (char-table-extra-slot table 1) char current-val table))
    (aset table char val)))

;; UnicodeData.txt contains these lines:
;;   0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
;;   ...
;;   0020;SPACE;Zs;0;WS;;;;;N;;;;;
;;   ...
;; The following command yields a file of about 96K bytes.
;;   % gawk -F ';' '{print $1,$2;}' < UnicodeData.txt | gzip > temp.gz
;; With the following function, we can get a file of almost the same
;; the size.

;; Generate a char-table for character names.

(defun unidata-gen-table-word-list (prop val-func)
  (let ((table (make-char-table 'char-code-property-table))
	(prop-idx (unidata-prop-index prop))
	(word-list (list nil))
	word-table
	block-list block-word-table block-end
	tail elt range val idx slot)
    (setq tail unidata-list)
    (setq block-end -1)
    (while tail
      (setq elt (car tail) tail (cdr tail))
      (setq range (car elt)
	    val (funcall val-func (nth prop-idx elt)))
      ;; Treat the sequence of "CJK COMPATIBILITY IDEOGRAPH-XXXX" and
      ;; "VARIATION SELECTOR-XXX" as a block.
      (if (and (consp val) (eq prop 'name)
	       (or (and (eq (car val) 'CJK)
			(eq (nth 1 val) 'COMPATIBILITY))
		   (and (>= range #xe0100)
			(eq (car val) 'VARIATION)
			(eq (nth 1 val) 'SELECTOR))))
	  (let ((first (car val))
		(second (nth 1 val))
		(start range))
	    (while (and (setq elt (car tail) range (car elt)
			      val (funcall val-func (nth prop-idx elt)))
			(consp val)
			(eq first (car val))
			(eq second (nth 1 val)))
	      (setq block-end range
		    tail (cdr tail)))
	    (setq range (cons start block-end)
		  val (if (eq first 'CJK) 'CJK\ COMPATIBILITY\ IDEOGRAPH
			'VARIATION\ SELECTOR))))

      (if (consp range)
	  (if val
	      (let ((slot (assq val block-list)))
		(setq range (cons (car range) (cdr range)))
		(setq block-end (cdr range))
		(if slot
		    (nconc slot (list range))
		  (push (list val range) block-list))))
	(let* ((start (lsh (lsh range -7) 7))
	       (limit (+ start 127))
	       (first tail)
	       (vec (make-vector 128 nil))
	       c name len)
	  (if (<= start block-end)
	      ;; START overlap with the previous block.
	      (aset table range (nth prop-idx elt))
	    (if val
		(aset vec (- range start) val))
	    (while (and (setq elt (car tail) range (car elt))
			(integerp range)
			(<= range limit))
	      (setq val (funcall val-func (nth prop-idx elt)))
	      (if val
		  (aset vec (- range start) val))
	      (setq tail (cdr tail)))
	    (setq vec (unidata-word-list-compress vec))
	    (when vec
	      (dotimes (i (length vec))
		(dolist (elt (aref vec i))
		  (if (symbolp elt)
		      (let ((slot (assq elt word-list)))
			(if slot
			    (setcdr slot (1+ (cdr slot)))
			  (setcdr word-list
				  (cons (cons elt 1) (cdr word-list))))))))
	      (set-char-table-range table (cons start limit) vec))))))
    (setq word-list (sort (cdr word-list)
			  #'(lambda (x y) (> (cdr x) (cdr y)))))
    (setq tail word-list idx 0)
    (while tail
      (setcdr (car tail) (unidata-encode-word idx))
      (setq idx (1+ idx) tail (cdr tail)))
    (setq word-table (make-vector (length word-list) nil))
    (setq idx 0)
    (dolist (elt word-list)
      (aset word-table idx (car elt))
      (setq idx (1+ idx)))

    (if (and (eq prop 'decomposition)
	     (> idx 32))
Kenichi Handa's avatar
Kenichi Handa committed
924
	(error "Too many symbols in decomposition data"))
Kenichi Handa's avatar
Kenichi Handa committed
925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971

    (dotimes (i (/ #x110000 128))
      (let* ((idx (* i 128))
	     (vec (aref table idx)))
	(when (vectorp vec)
	  (dotimes (i (length vec))
	    (let ((tail (aref vec i))
		  elt code)
	      (if (not tail)
		  (aset vec i "\0")
		(while tail
		  (setq elt (car tail)
			code (if (integerp elt) elt
			       (cdr (assq elt word-list))))
		  (setcar tail (string code))
		  (setq tail (cdr tail)))
		(aset vec i (mapconcat 'identity (aref vec i) "")))))
	  (set-char-table-range
	   table (cons idx (+ idx 127))
	   (mapconcat 'identity vec "")))))

    (setq block-word-table (make-vector (length block-list) nil))
    (setq idx 0)
    (dolist (elt block-list)
      (dolist (e (cdr elt))
	(set-char-table-range table e (1+ idx)))
      (aset block-word-table idx (car elt))
      (setq idx (1+ idx)))

    (set-char-table-extra-slot table 0 prop)
    (set-char-table-extra-slot table 4 (cons word-table block-word-table))
    table))

(defun unidata-split-name (str)
  (if (symbolp str)
      str
    (let ((len (length str))
	  (l nil)
	  (idx 0)
	  c)
      (if (= len 0)
	  nil
	(dotimes (i len)
	  (setq c (aref str i))
	  (if (= c 32)
	      (setq l (cons (intern (substring str idx i)) l)
		    idx (1+ i))
972
	    (if (and (= c ?-) (< idx i)
Kenichi Handa's avatar
Kenichi Handa committed
973 974 975 976 977
		     (< (1+ i) len) (/= (aref str (1+ i)) 32))
		(setq l (cons '- (cons (intern (substring str idx i)) l))
		      idx (1+ i)))))
	(nreverse (cons (intern (substring str idx)) l))))))

978
(defun unidata-gen-table-name (prop &rest ignore)
Kenichi Handa's avatar
Kenichi Handa committed
979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
  (let* ((table (unidata-gen-table-word-list prop 'unidata-split-name))
	 (word-tables (char-table-extra-slot table 4)))
    (byte-compile 'unidata-get-name)
    (byte-compile 'unidata-put-name)
    (set-char-table-extra-slot table 1 (symbol-function 'unidata-get-name))
    (set-char-table-extra-slot table 2 (symbol-function 'unidata-put-name))

    (if (eq prop 'name)
	(set-char-table-extra-slot table 4
				   (vector (car word-tables)
					   (cdr word-tables)
					   unidata-name-jamo-name-table))
      (set-char-table-extra-slot table 4
				 (vector (car word-tables))))
    table))

(defun unidata-split-decomposition (str)
  (if (symbolp str)
      str
    (let ((len (length str))
	  (l nil)
	  (idx 0)
	  c)
      (if (= len 0)
	  nil
	(dotimes (i len)
	  (setq c (aref str i))
	  (if (= c 32)
	      (setq l (if (= (aref str idx) ?<)
1008
			  (cons (intern (substring str (1+ idx) (1- i))) l)
Kenichi Handa's avatar
Kenichi Handa committed
1009
			(cons (string-to-number (substring str idx i) 16) l))
Kenichi Handa's avatar
Kenichi Handa committed
1010 1011
		    idx (1+ i))))
	(if (= (aref str idx) ?<)
1012
	    (setq l (cons (intern (substring str (1+ idx) (1- len))) l))
Kenichi Handa's avatar
Kenichi Handa committed
1013
	  (setq l (cons (string-to-number (substring str idx len) 16) l)))
Kenichi Handa's avatar
Kenichi Handa committed
1014 1015 1016
	(nreverse l)))))


1017
(defun unidata-gen-table-decomposition (prop &rest ignore)
Kenichi Handa's avatar
Kenichi Handa committed
1018 1019 1020 1021 1022 1023
  (let* ((table (unidata-gen-table-word-list prop 'unidata-split-decomposition))
	 (word-tables (char-table-extra-slot table 4)))
    (byte-compile 'unidata-get-decomposition)
    (byte-compile 'unidata-put-decomposition)
    (set-char-table-extra-slot table 1
			       (symbol-function 'unidata-get-decomposition))
1024
    (set-char-table-extra-slot table 2
Kenichi Handa's avatar
Kenichi Handa committed
1025 1026 1027 1028 1029 1030 1031 1032
			       (symbol-function 'unidata-put-decomposition))
    (set-char-table-extra-slot table 4 (car word-tables))
    table))



(defun unidata-describe-general-category (val)
  (cdr (assq val
1033 1034
	     '((nil . "Uknown")
	       (Lu . "Letter, Uppercase")
Kenichi Handa's avatar
Kenichi Handa committed
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116
	       (Ll . "Letter, Lowercase")
	       (Lt . "Letter, Titlecase")
	       (Lm . "Letter, Modifier")
	       (Lo . "Letter, Other")
	       (Mn . "Mark, Nonspacing")
	       (Mc . "Mark, Spacing Combining")
	       (Me . "Mark, Enclosing")
	       (Nd . "Number, Decimal Digit")
	       (Nl . "Number, Letter")
	       (No . "Number, Other")
	       (Pc . "Punctuation, Connector")
	       (Pd . "Punctuation, Dash")
	       (Ps . "Punctuation, Open")
	       (Pe . "Punctuation, Close")
	       (Pi . "Punctuation, Initial quote")
	       (Pf . "Punctuation, Final quote")
	       (Po . "Punctuation, Other")
	       (Sm . "Symbol, Math")
	       (Sc . "Symbol, Currency")
	       (Sk . "Symbol, Modifier")
	       (So . "Symbol, Other")
	       (Zs . "Separator, Space")
	       (Zl . "Separator, Line")
	       (Zp . "Separator, Paragraph")
	       (Cc . "Other, Control")
	       (Cf . "Other, Format")
	       (Cs . "Other, Surrogate")
	       (Co . "Other, Private Use")
	       (Cn . "Other, Not Assigned")))))

(defun unidata-describe-canonical-combining-class (val)
  (cdr (assq val
	     '((0 . "Spacing, split, enclosing, reordrant, and Tibetan subjoined")
	       (1 . "Overlays and interior")
	       (7 . "Nuktas")
	       (8 . "Hiragana/Katakana voicing marks")
	       (9 . "Viramas")
	       (10 . "Start of fixed position classes")
	       (199 . "End of fixed position classes")
	       (200 . "Below left attached")
	       (202 . "Below attached")
	       (204 . "Below right attached")
	       (208 . "Left attached (reordrant around single base character)")
	       (210 . "Right attached")
	       (212 . "Above left attached")
	       (214 . "Above attached")
	       (216 . "Above right attached")
	       (218 . "Below left")
	       (220 . "Below")
	       (222 . "Below right")
	       (224 . "Left (reordrant around single base character)")
	       (226 . "Right")
	       (228 . "Above left")
	       (230 . "Above")
	       (232 . "Above right")
	       (233 . "Double below")
	       (234 . "Double above")
	       (240 . "Below (iota subscript)")))))

(defun unidata-describe-bidi-class (val)
  (cdr (assq val
	     '((L . "Left-to-Right")
	       (LRE . "Left-to-Right Embedding")
	       (LRO . "Left-to-Right Override")
	       (R . "Right-to-Left")
	       (AL . "Right-to-Left Arabic")
	       (RLE . "Right-to-Left Embedding")
	       (RLO . "Right-to-Left Override")
	       (PDF . "Pop Directional Format")
	       (EN . "European Number")
	       (ES . "European Number Separator")
	       (ET . "European Number Terminator")
	       (AN . "Arabic Number")
	       (CS . "Common Number Separator")
	       (NSM . "Non-Spacing Mark")
	       (BN . "Boundary Neutral")
	       (B . "Paragraph Separator")
	       (S . "Segment Separator")
	       (WS . "Whitespace")
	       (ON . "Other Neutrals")))))

(defun unidata-describe-decomposition (val)
1117 1118 1119 1120 1121 1122 1123
  (mapconcat
   #'(lambda (x)
       (if (symbolp x) (symbol-name x)
	 (concat (string ?')
		 (compose-string (string x) 0 1 (string ?\t x ?\t))
		 (string ?'))))
   val " "))
Kenichi Handa's avatar
Kenichi Handa committed
1124

1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137
(defun unidata-gen-mirroring-list ()
  (let ((head (list nil))
	tail)
    (with-temp-buffer
      (insert-file-contents (expand-file-name "BidiMirroring.txt" unidata-dir))
      (goto-char (point-min))
      (setq tail head)
      (while (re-search-forward "^\\([0-9A-F]+\\);\\s +\\([0-9A-F]+\\)" nil t)
	(let ((char (string-to-number (match-string 1) 16))
	      (mirror (match-string 2)))
	  (setq tail (setcdr tail (list (list char mirror)))))))
    (cdr head)))

Kenichi Handa's avatar
Kenichi Handa committed
1138 1139 1140 1141 1142 1143 1144 1145
;; Verify if we can retrieve correct values from the generated
;; char-tables.

(defun unidata-check ()
  (dolist (elt unidata-prop-alist)
    (let* ((prop (car elt))
	   (index (unidata-prop-index prop))
	   (generator (unidata-prop-generator prop))
1146
	   (table (progn
Kenichi Handa's avatar
Kenichi Handa committed
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162
		    (message "Generating %S table..." prop)
		    (funcall generator prop)))
	   (decoder (char-table-extra-slot table 1))
	   (check #x400))
      (dolist (e unidata-list)
	(let ((char (car e))
	      (val1 (nth index e))
	      val2)
	  (if (and (stringp val1) (= (length val1) 0))
	      (setq val1 nil))
	  (unless (consp char)
	    (setq val2 (funcall decoder char (aref table char) table))
	    (if val1
		(cond ((eq generator 'unidata-gen-table-symbol)
		       (setq val1 (intern val1)))
		      ((eq generator 'unidata-gen-table-integer)
Kenichi Handa's avatar
Kenichi Handa committed
1163
		       (setq val1 (string-to-number val1)))
Kenichi Handa's avatar
Kenichi Handa committed
1164
		      ((eq generator 'unidata-gen-table-character)
Kenichi Handa's avatar
Kenichi Handa committed
1165
		       (setq val1 (string-to-number val1 16)))
Kenichi Handa's avatar
Kenichi Handa committed
1166 1167 1168 1169 1170 1171
		      ((eq generator 'unidata-gen-table-decomposition)
		       (setq val1 (unidata-split-decomposition val1)))))
	    (when (>= char check)
	      (message "%S %04X" prop check)
	      (setq check (+ check #x400)))
	    (or (equal val1 val2)
1172
		(insert (format "> %04X %S\n< %04X %S\n"
Kenichi Handa's avatar
Kenichi Handa committed
1173 1174 1175 1176 1177 1178
				char val1 char val2)))
	    (sit-for 0)))))))

;; The entry function.  It generates files described in the header
;; comment of this file.

1179 1180 1181 1182 1183
(defun unidata-gen-files (&optional data-dir unidata-text-file)
  (or data-dir
      (setq data-dir (car command-line-args-left)
	    command-line-args-left (cdr command-line-args-left)
	    unidata-text-file (car command-line-args-left)
1184
	    command-line-args-left (cdr command-line-args-left)))
1185
  (let ((coding-system-for-write 'utf-8-unix)
1186 1187 1188 1189 1190 1191 1192 1193
	(charprop-file "charprop.el")
	(unidata-dir data-dir))
    (dolist (elt unidata-prop-alist)
      (let* ((prop (car elt))
	     (file (unidata-prop-file prop)))
	(if (file-exists-p file)
	    (delete-file file))))
    (unidata-setup-list unidata-text-file)
Kenichi Handa's avatar
Kenichi Handa committed
1194 1195 1196 1197 1198 1199 1200 1201
    (with-temp-file charprop-file
      (insert ";; Automatically generated by unidata-gen.el.\n")
      (dolist (elt unidata-prop-alist)
	(let* ((prop (car elt))
	       (generator (unidata-prop-generator prop))
	       (file (unidata-prop-file prop))
	       (docstring (unidata-prop-docstring prop))
	       (describer (unidata-prop-describer prop))
1202 1203
	       (default-value (unidata-prop-default prop))
	       (val-list (unidata-prop-val-list prop))
Kenichi Handa's avatar
Kenichi Handa committed
1204 1205 1206 1207 1208 1209
	       table)
	  ;; Filename in this comment line is extracted by sed in
	  ;; Makefile.
	  (insert (format ";; FILE: %s\n" file))
	  (insert (format "(define-char-code-property '%S %S\n  %S)\n"
			  prop file docstring))
1210
	  (with-temp-buffer
Kenichi Handa's avatar
Kenichi Handa committed
1211
	    (message "Generating %s..." file)
1212 1213 1214 1215 1216
	    (when (file-exists-p file)
	      (insert-file-contents file)
	      (goto-char (point-max))
	      (search-backward ";; Local Variables:"))
	    (setq table (funcall generator prop default-value val-list))
Kenichi Handa's avatar
Kenichi Handa committed
1217 1218 1219 1220 1221
	    (when describer
	      (unless (subrp (symbol-function describer))
		(byte-compile describer)
		(setq describer (symbol-function describer)))
	      (set-char-table-extra-slot table 3 describer))
1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236
	    (if (bobp)
		(insert ";; Copyright (C) 1991-2009 Unicode, Inc.
;; This file was generated from the Unicode data files at
;; http://www.unicode.org/Public/UNIDATA/.
;; See lisp/international/README for the copyright and permission notice.\n"))
	    (insert (format "(define-char-code-property '%S %S %S)\n"
			    prop table docstring))
	    (if (eobp)
		(insert ";; Local Variables:\n"
			";; coding: utf-8\n"
			";; no-byte-compile: t\n"
			";; End:\n\n"
			(format ";; %s ends here\n" file)))
	    (write-file file)
	    (message "Generating %s...done" file))))
Kenichi Handa's avatar
Kenichi Handa committed
1237
      (message "Writing %s..." charprop-file)
Kenichi Handa's avatar
Kenichi Handa committed
1238
      (insert ";; Local Variables:\n"
Kenichi Handa's avatar
Kenichi Handa committed
1239 1240 1241 1242 1243
	      ";; coding: utf-8\n"
	      ";; no-byte-compile: t\n"
	      ";; End:\n\n"
	      (format ";; %s ends here\n" charprop-file)))))

Kenichi Handa's avatar
Kenichi Handa committed
1244 1245


Kenichi Handa's avatar
Kenichi Handa committed
1246
;;; unidata-gen.el ends here