/* Basic character set support.

Copyright (C) 2001-2019 Free Software Foundation, Inc.

Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
  2005, 2006, 2007, 2008, 2009, 2010, 2011
  National Institute of Advanced Industrial Science and Technology (AIST)
  Registration Number H14PRO021

Copyright (C) 2003, 2004
  National Institute of Advanced Industrial Science and Technology (AIST)
  Registration Number H13PRO009

This file is part of GNU Emacs.

GNU Emacs is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.

GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */

#include <config.h>

31
#include <errno.h>
Paul Eggert's avatar
Paul Eggert committed
32
#include <stdlib.h>
33
#include <unistd.h>
34
#include <limits.h>
Karl Heuer's avatar
Karl Heuer committed
35 36
#include <sys/types.h>
#include "lisp.h"
37
#include "character.h"
Karl Heuer's avatar
Karl Heuer committed
38 39
#include "charset.h"
#include "coding.h"
40
#include "buffer.h"
41
#include "sysstdio.h"
Karl Heuer's avatar
Karl Heuer committed
42

43
/*** GENERAL NOTES on CODED CHARACTER SETS (CHARSETS) ***

  A coded character set ("charset" hereafter) is a meaningful
  collection (i.e. language, culture, functionality, etc.) of
  characters.  Emacs handles multiple charsets at once.  In Emacs Lisp
  code, a charset is represented by a symbol.  In C code, a charset is
  represented by its ID number or by a pointer to a struct charset.

  The actual information about each charset is stored in two places.
  Lispy information is stored in the hash table Vcharset_hash_table as
  a vector (charset attributes).  The other information is stored in
  charset_table as a struct charset.

*/

/* Hash table that contains attributes of each charset.  Keys are
   charset symbols, and values are vectors of charset attributes.  */
Lisp_Object Vcharset_hash_table;
Karl Heuer's avatar
Karl Heuer committed
61

62 63
/* Table of struct charset.  */
struct charset *charset_table;
Karl Heuer's avatar
Karl Heuer committed
64

65
static ptrdiff_t charset_table_size;
66
static int charset_table_used;
Karl Heuer's avatar
Karl Heuer committed
67

68
/* Special charsets corresponding to symbols.  */
69
int charset_ascii;
70
int charset_eight_bit;
71
static int charset_iso_8859_1;
72
int charset_unicode;
73
static int charset_emacs;
74

75 76 77 78
/* The other special charsets.  */
int charset_jisx0201_roman;
int charset_jisx0208_1978;
int charset_jisx0208;
79
int charset_ksc5601;
80

81 82
/* Charset of unibyte characters.  */
int charset_unibyte;
Karl Heuer's avatar
Karl Heuer committed
83

84 85
/* List of charsets ordered by the priority.  */
Lisp_Object Vcharset_ordered_list;
Karl Heuer's avatar
Karl Heuer committed
86

87 88 89 90
/* Sub-list of Vcharset_ordered_list that contains all non-preferred
   charsets.  */
Lisp_Object Vcharset_non_preferred_head;

91
/* Incremented every time we change the priority of charsets.
92
   Wraps around.  */
93
EMACS_UINT charset_ordered_list_tick;
Karl Heuer's avatar
Karl Heuer committed
94

95 96
/* List of iso-2022 charsets.  */
Lisp_Object Viso_2022_charset_list;
97

98 99 100
/* List of emacs-mule charsets.  */
Lisp_Object Vemacs_mule_charset_list;

101
int emacs_mule_charset[256];
Karl Heuer's avatar
Karl Heuer committed
102 103 104

/* Mapping table from ISO2022's charset (specified by DIMENSION,
   CHARS, and FINAL-CHAR) to Emacs' charset.  */
105 106
int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];

107 108
/* Return the index of code-point CODE within CHARSET, or -1 if CODE
   is outside the code space of a non-linear CHARSET.

   For a linear charset the index is simply CODE minus the minimum
   code.  Otherwise each of the four bytes of CODE is first checked
   against CHARSET's code_space_mask (bit 0x8 for the most significant
   byte down to bit 0x1 for the least significant one), and the index
   is then assembled from the per-byte offsets.

   CODE is evaluated more than once.  */
#define CODE_POINT_TO_INDEX(charset, code)				\
  ((charset)->code_linear_p						\
   ? (int) ((code) - (charset)->min_code)				\
   : (! ((charset)->code_space_mask[(code) >> 24] & 0x8)		\
      || ! ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4)	\
      || ! ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2)	\
      || ! ((charset)->code_space_mask[(code) & 0xFF] & 0x1))		\
   ? -1									\
   : (int) (((((code) >> 24) - (charset)->code_space[12])		\
	     * (charset)->code_space[11])				\
	    + (((((code) >> 16) & 0xFF) - (charset)->code_space[8])	\
	       * (charset)->code_space[7])				\
	    + (((((code) >> 8) & 0xFF) - (charset)->code_space[4])	\
	       * (charset)->code_space[3])				\
	    + (((code) & 0xFF) - (charset)->code_space[0])		\
	    - ((charset)->char_index_offset)))

/* Return the code-point that corresponds to character index IDX in
   CHARSET.  IDX must be an unsigned int variable (an lvalue) holding
   a valid index, which is always in the nonnegative int range too.
   For a non-linear CHARSET the macro adds char_index_offset to IDX in
   place, so IDX holds garbage afterwards.  */

#define INDEX_TO_CODE_POINT(charset, idx)				     \
  ((charset)->code_linear_p						     \
   ? (idx) + (charset)->min_code					     \
   : (idx += (charset)->char_index_offset,				     \
      ((((charset)->code_space[12] + ((idx) / (charset)->code_space[11]))    \
	<< 24)								     \
       | (((charset)->code_space[8]					     \
	   + ((idx) / (charset)->code_space[7] % (charset)->code_space[10])) \
	  << 16)							     \
       | (((charset)->code_space[4]					     \
	   + ((idx) / (charset)->code_space[3] % (charset)->code_space[6]))  \
	  << 8)								     \
       | ((charset)->code_space[0] + (idx) % (charset)->code_space[2]))))

/* Scratch area holding the mapping tables of a single charset.  Used
   by temacs invoked for dumping.  */

static struct
{
  /* The charset whose tables are currently stored below.  */
  struct charset *current;

  /* 1 iff the table below is set up for the encoder.  */
  short for_encoder;

  /* When the table below is used for encoding, the minimum and the
     maximum character of the current charset.  */
  int min_char;
  int max_char;

  /* The Unicode character that corresponds to code index 0 (i.e. the
     minimum code-point) of the current charset, or -1 if code index 0
     is not a Unicode character.  This is consulted when
     table.encoder[CHAR] is zero.  */
  int zero_index_char;

  union {
    /* Maps code-indices (not code-points) of the current charset to
       Unicode characters.  If decoder[CHAR] is -1, CHAR doesn't
       belong to the current charset.  */
    int decoder[0x10000];
    /* Maps Unicode characters to code-indices of the current charset.
       The first 0x10000 elements are for BMP (0..0xFFFF), and the
       last 0x10000 are for SMP (0x10000..0x1FFFF) or SIP
       (0x20000..0x2FFFF).  Note that there is no charset map that
       uses both SMP and SIP.  */
    unsigned short encoder[0x20000];
  } table;
} *temp_charset_work;

/* The encoder slot of temp_charset_work that belongs to character C.
   Characters below 0x20000 use slot C directly; characters from
   0x20000 upward are stored 0x10000 slots lower, sharing the upper
   half of the table.  */
#define TEMP_CHARSET_WORK_ENCODER_SLOT(C)				\
  (temp_charset_work->table.encoder[(C) < 0x20000 ? (C) : (C) - 0x10000])

/* Record that character C encodes to code-index CODE.  A zero slot
   means "no mapping", so the character whose code-index is 0 is
   remembered in zero_index_char instead of in the table.  */
#define SET_TEMP_CHARSET_WORK_ENCODER(C, CODE)				\
  do {									\
    if ((CODE) != 0)							\
      TEMP_CHARSET_WORK_ENCODER_SLOT (C) = (CODE);			\
    else								\
      temp_charset_work->zero_index_char = (C);				\
  } while (0)

/* Return the code-index recorded for character C, or -1 if none.  */
#define GET_TEMP_CHARSET_WORK_ENCODER(C)				\
  ((C) == temp_charset_work->zero_index_char				\
   ? 0									\
   : (TEMP_CHARSET_WORK_ENCODER_SLOT (C) != 0				\
      ? (int) TEMP_CHARSET_WORK_ENCODER_SLOT (C)			\
      : -1))

/* Record that code-index CODE decodes to character C.  */
#define SET_TEMP_CHARSET_WORK_DECODER(C, CODE)				\
  (temp_charset_work->table.decoder[(CODE)] = (C))

/* Return the character recorded for code-index CODE.  */
#define GET_TEMP_CHARSET_WORK_DECODER(CODE)				\
  (temp_charset_work->table.decoder[(CODE)])
Richard M. Stallman's avatar
Richard M. Stallman committed
200

201

202 203
/* Set to 1 to warn that a charset map has been loaded, and thus that
   buffer text and string data may have been relocated.  */
bool charset_map_loaded;

/* One chunk of parsed charset-map entries.  Each entry maps the
   code-point range FROM..TO to consecutive characters starting at C.
   A chunk holds up to 0x10000 entries; further entries continue in
   the chunk that NEXT points to.  */
struct charset_map_entries
{
  struct {
    unsigned from;
    unsigned to;
    int c;
  } entry[0x10000];
  struct charset_map_entries *next;
};

/* Load the mapping information of CHARSET from ENTRIES for
   initializing (CONTROL_FLAG == 0), decoding (CONTROL_FLAG == 1), and
   encoding (CONTROL_FLAG == 2).

   If CONTROL_FLAG is 0, setup CHARSET->min_char, CHARSET->max_char,
   and CHARSET->fast_map.

   If CONTROL_FLAG is 1, setup the following tables according to
   CHARSET->method and inhibit_load_charset_map.
224

225 226 227 228 229
   CHARSET->method       | inhibit_lcm == 0   | inhibit_lcm == 1
   ----------------------+--------------------+---------------------------
   CHARSET_METHOD_MAP    | CHARSET->decoder   | temp_charset_work->decoder
   ----------------------+--------------------+---------------------------
   CHARSET_METHOD_OFFSET | Vchar_unify_table  | temp_charset_work->decoder
230

231
   If CONTROL_FLAG is 2, setup the following tables.
232

233 234 235 236 237 238
   CHARSET->method       | inhibit_lcm == 0   | inhibit_lcm == 1
   ----------------------+--------------------+---------------------------
   CHARSET_METHOD_MAP    | CHARSET->encoder   | temp_charset_work->encoder
   ----------------------+--------------------+--------------------------
   CHARSET_METHOD_OFFSET | CHARSET->deunifier | temp_charset_work->encoder
*/
Karl Heuer's avatar
Karl Heuer committed
239

240
static void
241
load_charset_map (struct charset *charset, struct charset_map_entries *entries, int n_entries, int control_flag)
Karl Heuer's avatar
Karl Heuer committed
242
{
Paul Eggert's avatar
Paul Eggert committed
243
  Lisp_Object vec UNINIT;
244
  Lisp_Object table UNINIT;
245
  unsigned max_code = CHARSET_MAX_CODE (charset);
246
  bool ascii_compatible_p = charset->ascii_compatible_p;
247 248 249
  int min_char, max_char, nonascii_min_char;
  int i;
  unsigned char *fast_map = charset->fast_map;
250

251 252 253
  if (n_entries <= 0)
    return;

254
  if (control_flag)
255
    {
256 257 258 259 260 261 262
      if (! inhibit_load_charset_map)
	{
	  if (control_flag == 1)
	    {
	      if (charset->method == CHARSET_METHOD_MAP)
		{
		  int n = CODE_POINT_TO_INDEX (charset, max_code) + 1;
263

264 265
		  vec = Fmake_vector (make_number (n), make_number (-1));
		  set_charset_attr (charset, charset_decoder, vec);
266 267 268 269 270 271 272 273 274 275 276
		}
	      else
		{
		  char_table_set_range (Vchar_unify_table,
					charset->min_char, charset->max_char,
					Qnil);
		}
	    }
	  else
	    {
	      table = Fmake_char_table (Qnil, Qnil);
277 278 279 280
	      set_charset_attr (charset,
				(charset->method == CHARSET_METHOD_MAP
				 ? charset_encoder : charset_deunifier),
				table);
281 282 283 284 285
	    }
	}
      else
	{
	  if (! temp_charset_work)
286
	    temp_charset_work = xmalloc (sizeof *temp_charset_work);
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301
	  if (control_flag == 1)
	    {
	      memset (temp_charset_work->table.decoder, -1,
		      sizeof (int) * 0x10000);
	    }
	  else
	    {
	      memset (temp_charset_work->table.encoder, 0,
		      sizeof (unsigned short) * 0x20000);
	      temp_charset_work->zero_index_char = -1;
	    }
	  temp_charset_work->current = charset;
	  temp_charset_work->for_encoder = (control_flag == 2);
	  control_flag += 2;
	}
302
      charset_map_loaded = 1;
303
    }
304

305
  min_char = max_char = entries->entry[0].c;
306
  nonascii_min_char = MAX_CHAR;
307
  for (i = 0; i < n_entries; i++)
308
    {
309
      unsigned from, to;
310
      int from_index, to_index, lim_index;
311
      int from_c, to_c;
312
      int idx = i % 0x10000;
313

314 315 316 317
      if (i > 0 && idx == 0)
	entries = entries->next;
      from = entries->entry[idx].from;
      to = entries->entry[idx].to;
318 319 320
      from_c = entries->entry[idx].c;
      from_index = CODE_POINT_TO_INDEX (charset, from);
      if (from == to)
321
	{
322 323
	  to_index = from_index;
	  to_c = from_c;
324
	}
325
      else
326
	{
327 328
	  to_index = CODE_POINT_TO_INDEX (charset, to);
	  to_c = from_c + (to_index - from_index);
329
	}
330 331
      if (from_index < 0 || to_index < 0)
	continue;
332
      lim_index = to_index + 1;
333

334 335 336 337
      if (to_c > max_char)
	max_char = to_c;
      else if (from_c < min_char)
	min_char = from_c;
338

339 340 341
      if (control_flag == 1)
	{
	  if (charset->method == CHARSET_METHOD_MAP)
342
	    for (; from_index < lim_index; from_index++, from_c++)
343 344
	      ASET (vec, from_index, make_number (from_c));
	  else
345
	    for (; from_index < lim_index; from_index++, from_c++)
346 347 348 349 350 351 352 353
	      CHAR_TABLE_SET (Vchar_unify_table,
			      CHARSET_CODE_OFFSET (charset) + from_index,
			      make_number (from_c));
	}
      else if (control_flag == 2)
	{
	  if (charset->method == CHARSET_METHOD_MAP
	      && CHARSET_COMPACT_CODES_P (charset))
354
	    for (; from_index < lim_index; from_index++, from_c++)
355
	      {
356 357
		unsigned code = from_index;
		code = INDEX_TO_CODE_POINT (charset, code);
358 359 360 361 362

		if (NILP (CHAR_TABLE_REF (table, from_c)))
		  CHAR_TABLE_SET (table, from_c, make_number (code));
	      }
	  else
363
	    for (; from_index < lim_index; from_index++, from_c++)
364 365 366 367 368 369
	      {
		if (NILP (CHAR_TABLE_REF (table, from_c)))
		  CHAR_TABLE_SET (table, from_c, make_number (from_index));
	      }
	}
      else if (control_flag == 3)
370
	for (; from_index < lim_index; from_index++, from_c++)
371 372
	  SET_TEMP_CHARSET_WORK_DECODER (from_c, from_index);
      else if (control_flag == 4)
373
	for (; from_index < lim_index; from_index++, from_c++)
374 375 376
	  SET_TEMP_CHARSET_WORK_ENCODER (from_c, from_index);
      else			/* control_flag == 0 */
	{
377 378
	  if (ascii_compatible_p)
	    {
379
	      if (! ASCII_CHAR_P (from_c))
380 381 382 383
		{
		  if (from_c < nonascii_min_char)
		    nonascii_min_char = from_c;
		}
384
	      else if (! ASCII_CHAR_P (to_c))
385 386 387 388
		{
		  nonascii_min_char = 0x80;
		}
	    }
389

390 391
	  for (; from_c <= to_c; from_c++)
	    CHARSET_FAST_MAP_SET (from_c, fast_map);
392
	}
393
    }
394

395
  if (control_flag == 0)
Karl Heuer's avatar
Karl Heuer committed
396
    {
397 398 399
      CHARSET_MIN_CHAR (charset) = (ascii_compatible_p
				    ? nonascii_min_char : min_char);
      CHARSET_MAX_CHAR (charset) = max_char;
Karl Heuer's avatar
Karl Heuer committed
400
    }
401 402 403 404 405
  else if (control_flag == 4)
    {
      temp_charset_work->min_char = min_char;
      temp_charset_work->max_char = max_char;
    }
Karl Heuer's avatar
Karl Heuer committed
406 407
}

408

409
/* Read a hexadecimal number (introduced by "0x") from the stream FP,
   skipping '#' comments that run to the end of the line.  If
   LOOKAHEAD is nonnegative it is used as the first input byte instead
   of reading one from FP.  Store into *TERMINATOR the input byte that
   follows the number, or EOF if an end-of-file or input error
   occurred.  Set *OVERFLOW to true if the number overflows; *OVERFLOW
   is never cleared here.  Return the number read.  */

static unsigned
read_hex (FILE *fp, int lookahead, int *terminator, bool *overflow)
{
  int ch = lookahead;
  if (ch < 0)
    ch = getc_unlocked (fp);

  /* Scan forward until the "0x" prefix has been consumed or the input
     is exhausted.  */
  for (;;)
    {
      if (ch == '0')
        {
          ch = getc_unlocked (fp);
          if (ch == 'x')
            break;
        }
      else if (ch == '#')
        {
          do
            ch = getc_unlocked (fp);
          while (ch != '\n' && 0 <= ch);
        }
      if (ch < 0)
        break;
      ch = getc_unlocked (fp);
    }

  unsigned value = 0;
  bool overflowed = false;

  if (ch >= 0)
    {
      /* Accumulate digits; the first non-digit becomes the terminator.  */
      int digit;
      while (ch = getc_unlocked (fp), (digit = char_hexdigit (ch)) >= 0)
        {
          overflowed |= INT_LEFT_SHIFT_OVERFLOW (value, 4);
          value = (value << 4) + digit;
        }
    }

  *terminator = ch;
  *overflow |= overflowed;
  return value;
}

/* Return a mapping vector for CHARSET loaded from MAPFILE.
457 458 459 460 461 462 463
   Each line of MAPFILE has this form
	0xAAAA 0xCCCC
   where 0xAAAA is a code-point and 0xCCCC is the corresponding
   character code, or this form
	0xAAAA-0xBBBB 0xCCCC
   where 0xAAAA and 0xBBBB are code-points specifying a range, and
   0xCCCC is the first character code of the range.
Karl Heuer's avatar
Karl Heuer committed
464

465 466
   The returned vector has this form:
	[ CODE1 CHAR1 CODE2 CHAR2 .... ]
467
   where CODE1 is a code-point or a cons of code-points specifying a
468 469
   range.

Juanma Barranquero's avatar
Juanma Barranquero committed
470 471
   Note that this function uses `openp' to open MAPFILE but ignores
   `file-name-handler-alist' to avoid running any Lisp code.  */
Karl Heuer's avatar
Karl Heuer committed
472

473
static void
474 475
load_charset_map_from_file (struct charset *charset, Lisp_Object mapfile,
			    int control_flag)
Karl Heuer's avatar
Karl Heuer committed
476
{
477 478
  unsigned min_code = CHARSET_MIN_CODE (charset);
  unsigned max_code = CHARSET_MAX_CODE (charset);
479 480
  int fd;
  FILE *fp;
481
  struct charset_map_entries *head, *entries;
482
  int n_entries;
483 484 485 486
  AUTO_STRING (map, ".map");
  AUTO_STRING (txt, ".txt");
  AUTO_LIST2 (suffixes, map, txt);
  ptrdiff_t count = SPECPDL_INDEX ();
487
  record_unwind_protect_nothing ();
488
  specbind (Qfile_name_handler_alist, Qnil);
489
  fd = openp (Vcharset_map_path, mapfile, suffixes, NULL, Qnil, false);
490 491 492 493 494 495 496 497 498
  fp = fd < 0 ? 0 : fdopen (fd, "r");
  if (!fp)
    {
      int open_errno = errno;
      emacs_close (fd);
      report_file_errno ("Loading charset map", mapfile, open_errno);
    }
  set_unwind_protect_ptr (count, fclose_unwind, fp);
  unbind_to (count + 1, Qnil);
Karl Heuer's avatar
Karl Heuer committed
499

500
  /* Use record_xmalloc, as `charset_map_entries' is
501
     large (larger than MAX_ALLOCA).  */
502
  head = record_xmalloc (sizeof *head);
503
  entries = head;
504
  memset (entries, 0, sizeof (struct charset_map_entries));
505

506
  n_entries = 0;
507 508
  int ch = -1;
  while (true)
509
    {
510 511 512
      bool overflow = false;
      unsigned from = read_hex (fp, ch, &ch, &overflow), to;
      if (ch < 0)
513
	break;
514 515 516 517 518 519
      if (ch == '-')
	{
	  to = read_hex (fp, -1, &ch, &overflow);
	  if (ch < 0)
	    break;
	}
520
      else
521 522 523 524 525 526
	{
	  to = from;
	  ch = -1;
	}
      unsigned c = read_hex (fp, ch, &ch, &overflow);
      if (ch < 0)
527
	break;
528

529 530
      if (overflow)
	continue;
531 532
      if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
	continue;
533

534
      if (n_entries == 0x10000)
535
	{
536
	  entries->next = record_xmalloc (sizeof *entries->next);
537
	  entries = entries->next;
538
	  memset (entries, 0, sizeof (struct charset_map_entries));
539
	  n_entries = 0;
540
	}
541
      int idx = n_entries;
542 543 544 545
      entries->entry[idx].from = from;
      entries->entry[idx].to = to;
      entries->entry[idx].c = c;
      n_entries++;
546 547
    }
  fclose (fp);
548
  clear_unwind_protect (count);
549

550
  load_charset_map (charset, head, n_entries, control_flag);
551
  unbind_to (count, Qnil);
Karl Heuer's avatar
Karl Heuer committed
552 553
}

554
static void
555
load_charset_map_from_vector (struct charset *charset, Lisp_Object vec, int control_flag)
Kenichi Handa's avatar
Kenichi Handa committed
556
{
557 558 559 560 561 562
  unsigned min_code = CHARSET_MIN_CODE (charset);
  unsigned max_code = CHARSET_MAX_CODE (charset);
  struct charset_map_entries *head, *entries;
  int n_entries;
  int len = ASIZE (vec);
  int i;
563
  USE_SAFE_ALLOCA;
Kenichi Handa's avatar
Kenichi Handa committed
564

565
  if (len % 2 == 1)
566
    {
Paul Eggert's avatar
Paul Eggert committed
567
      add_to_log ("Failure in loading charset map: %V", vec);
568
      return;
569
    }
570

571 572
  /* Use SAFE_ALLOCA instead of alloca, as `charset_map_entries' is
     large (larger than MAX_ALLOCA).  */
573
  head = SAFE_ALLOCA (sizeof *head);
574
  entries = head;
575
  memset (entries, 0, sizeof (struct charset_map_entries));
576

577 578
  n_entries = 0;
  for (i = 0; i < len; i += 2)
579
    {
580 581
      Lisp_Object val, val2;
      unsigned from, to;
582
      EMACS_INT c;
583
      int idx;
584

585 586
      val = AREF (vec, i);
      if (CONSP (val))
Kenichi Handa's avatar
Kenichi Handa committed
587
	{
588 589 590 591
	  val2 = XCDR (val);
	  val = XCAR (val);
	  from = XFASTINT (val);
	  to = XFASTINT (val2);
Kenichi Handa's avatar
Kenichi Handa committed
592
	}
593
      else
594
	from = to = XFASTINT (val);
595 596 597
      val = AREF (vec, i + 1);
      CHECK_NATNUM (val);
      c = XFASTINT (val);
598

599 600
      if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
	continue;
601

602
      if (n_entries > 0 && (n_entries % 0x10000) == 0)
603
	{
604
	  entries->next = SAFE_ALLOCA (sizeof *entries->next);
605
	  entries = entries->next;
606
	  memset (entries, 0, sizeof (struct charset_map_entries));
607 608 609 610 611 612 613
	}
      idx = n_entries % 0x10000;
      entries->entry[idx].from = from;
      entries->entry[idx].to = to;
      entries->entry[idx].c = c;
      n_entries++;
    }
614

615
  load_charset_map (charset, head, n_entries, control_flag);
616
  SAFE_FREE ();
617 618
}

619 620 621 622

/* Load a mapping table for CHARSET.  CONTROL-FLAG tells what kind of
   map it is (see the comment of load_charset_map for the detail).  */

623
static void
624
load_charset (struct charset *charset, int control_flag)
625
{
626
  Lisp_Object map;
627

628 629 630
  if (inhibit_load_charset_map
      && temp_charset_work
      && charset == temp_charset_work->current
631
      && ((control_flag == 2) == temp_charset_work->for_encoder))
632 633 634 635
    return;

  if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
    map = CHARSET_MAP (charset);
636 637 638
  else
    {
      if (! CHARSET_UNIFIED_P (charset))
639
	emacs_abort ();
640 641
      map = CHARSET_UNIFY_MAP (charset);
    }
642 643 644 645
  if (STRINGP (map))
    load_charset_map_from_file (charset, map, control_flag);
  else
    load_charset_map_from_vector (charset, map, control_flag);
Karl Heuer's avatar
Karl Heuer committed
646
}
647

648 649 650

DEFUN ("charsetp", Fcharsetp, Scharsetp, 1, 1, 0,
       doc: /* Return non-nil if and only if OBJECT is a charset.*/)
  (Lisp_Object object)
{
  if (CHARSETP (object))
    return Qt;
  return Qnil;
}

/* Map over the characters of temp_charset_work->current whose
   code-indices lie between those of the code-points FROM and TO,
   using the encoder table kept in temp_charset_work.  For every
   maximal run of consecutive such characters, call C_FUNCTION with
   (ARG, RANGE) if C_FUNCTION is non-null, else call the Lisp FUNCTION
   with (RANGE, ARG), where RANGE is a cons (FIRST . LAST).  The same
   cons is reused for every call.

   Characters are scanned from temp_charset_work->min_char up to
   temp_charset_work->max_char.  When the scan starts at or below
   0xFFFF and max_char is 0x20000 or more, the scan jumps from 0xFFFF
   straight to 0x20000.  */
static void
map_charset_for_dump (void (*c_function) (Lisp_Object, Lisp_Object),
		      Lisp_Object function, Lisp_Object arg,
		      unsigned int from, unsigned int to)
{
  int from_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, from);
  int to_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, to);
  Lisp_Object range = Fcons (Qnil, Qnil);
  int c, stop;

  c = temp_charset_work->min_char;
  /* Stop at 0xFFFF first only if the scan actually starts at or below
     it; otherwise C would never meet STOP and the loop would run off
     the end of the encoder table.  */
  stop = (temp_charset_work->max_char < 0x20000 || c > 0xFFFF
          ? temp_charset_work->max_char : 0xFFFF);

  while (1)
    {
      int idx = GET_TEMP_CHARSET_WORK_ENCODER (c);

      if (idx >= from_idx && idx <= to_idx)
        {
          /* Open a new run unless one is already open.  */
          if (NILP (XCAR (range)))
            XSETCAR (range, make_number (c));
        }
      else if (! NILP (XCAR (range)))
        {
          /* C ends the open run; report it and close it.  */
          XSETCDR (range, make_number (c - 1));
          if (c_function)
            (*c_function) (arg, range);
          else
            call2 (function, range, arg);
          XSETCAR (range, Qnil);
        }
      if (c == stop)
        {
          if (c == temp_charset_work->max_char)
            {
              /* Report a run that extends to the last character.  */
              if (! NILP (XCAR (range)))
                {
                  XSETCDR (range, make_number (c));
                  if (c_function)
                    (*c_function) (arg, range);
                  else
                    call2 (function, range, arg);
                }
              break;
            }
          /* Continue at 0x20000 (after the increment below).  */
          c = 0x1FFFF;
          stop = temp_charset_work->max_char;
        }
      c++;
    }
}

void
711 712
map_charset_chars (void (*c_function)(Lisp_Object, Lisp_Object), Lisp_Object function,
		   Lisp_Object arg, struct charset *charset, unsigned from, unsigned to)
Karl Heuer's avatar
Karl Heuer committed
713
{
714
  Lisp_Object range;
715 716
  bool partial = (from > CHARSET_MIN_CODE (charset)
		  || to < CHARSET_MAX_CODE (charset));
717

718
  if (CHARSET_METHOD (charset) == CHARSET_METHOD_OFFSET)
Karl Heuer's avatar
Karl Heuer committed
719
    {
720 721 722 723 724
      int from_idx = CODE_POINT_TO_INDEX (charset, from);
      int to_idx = CODE_POINT_TO_INDEX (charset, to);
      int from_c = from_idx + CHARSET_CODE_OFFSET (charset);
      int to_c = to_idx + CHARSET_CODE_OFFSET (charset);

725 726 727 728 729 730 731 732 733 734 735 736
      if (CHARSET_UNIFIED_P (charset))
	{
	  if (! CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
	    load_charset (charset, 2);
	  if (CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
	    map_char_table_for_charset (c_function, function,
					CHARSET_DEUNIFIER (charset), arg,
					partial ? charset : NULL, from, to);
	  else
	    map_charset_for_dump (c_function, function, arg, from, to);
	}

737
      range = Fcons (make_number (from_c), make_number (to_c));
738
      if (NILP (function))
739
	(*c_function) (arg, range);
740 741
      else
	call2 (function, range, arg);
742
    }
743 744 745
  else if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
    {
      if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
746 747 748 749 750 751 752
	load_charset (charset, 2);
      if (CHAR_TABLE_P (CHARSET_ENCODER (charset)))
	map_char_table_for_charset (c_function, function,
				    CHARSET_ENCODER (charset), arg,
				    partial ? charset : NULL, from, to);
      else
	map_charset_for_dump (c_function, function, arg, from, to);
753
    }
754
  else if (CHARSET_METHOD (charset) == CHARSET_METHOD_SUBSET)
Karl Heuer's avatar
Karl Heuer committed
755
    {
756 757 758 759 760 761 762 763 764 765 766 767 768
      Lisp_Object subset_info;
      int offset;

      subset_info = CHARSET_SUBSET (charset);
      charset = CHARSET_FROM_ID (XFASTINT (AREF (subset_info, 0)));
      offset = XINT (AREF (subset_info, 3));
      from -= offset;
      if (from < XFASTINT (AREF (subset_info, 1)))
	from = XFASTINT (AREF (subset_info, 1));
      to -= offset;
      if (to > XFASTINT (AREF (subset_info, 2)))
	to = XFASTINT (AREF (subset_info, 2));
      map_charset_chars (c_function, function, arg, charset, from, to);
Karl Heuer's avatar
Karl Heuer committed
769
    }
770 771 772
  else				/* i.e. CHARSET_METHOD_SUPERSET */
    {
      Lisp_Object parents;
Karl Heuer's avatar
Karl Heuer committed
773

774 775
      for (parents = CHARSET_SUPERSET (charset); CONSP (parents);
	   parents = XCDR (parents))
Kenichi Handa's avatar
Kenichi Handa committed
776
	{
777 778 779 780 781
	  int offset;
	  unsigned this_from, this_to;

	  charset = CHARSET_FROM_ID (XFASTINT (XCAR (XCAR (parents))));
	  offset = XINT (XCDR (XCAR (parents)));
782 783
	  this_from = from > offset ? from - offset : 0;
	  this_to = to > offset ? to - offset : 0;
784 785 786 787
	  if (this_from < CHARSET_MIN_CODE (charset))
	    this_from = CHARSET_MIN_CODE (charset);
	  if (this_to > CHARSET_MAX_CODE (charset))
	    this_to = CHARSET_MAX_CODE (charset);
788 789
	  map_charset_chars (c_function, function, arg, charset,
			     this_from, this_to);
Kenichi Handa's avatar
Kenichi Handa committed
790
	}
791
    }
Karl Heuer's avatar
Karl Heuer committed
792 793
}

794
DEFUN ("map-charset-chars", Fmap_charset_chars, Smap_charset_chars, 2, 5, 0,
795
       doc: /* Call FUNCTION for all characters in CHARSET.
796
FUNCTION is called with an argument RANGE and the optional 3rd
797
argument ARG.
Karl Heuer's avatar
Karl Heuer committed
798

799 800
RANGE is a cons (FROM .  TO), where FROM and TO indicate a range of
characters contained in CHARSET.
Karl Heuer's avatar
Karl Heuer committed
801

802
The optional 4th and 5th arguments FROM-CODE and TO-CODE specify the
803
range of code points (in CHARSET) of target characters.  */)
804
  (Lisp_Object function, Lisp_Object charset, Lisp_Object arg, Lisp_Object from_code, Lisp_Object to_code)
Karl Heuer's avatar
Karl Heuer committed
805
{
806
  struct charset *cs;
807
  unsigned from, to;
Karl Heuer's avatar
Karl Heuer committed
808

809 810
  CHECK_CHARSET_GET_CHARSET (charset, cs);
  if (NILP (from_code))
811
    from = CHARSET_MIN_CODE (cs);
812
  else
Karl Heuer's avatar
Karl Heuer committed
813
    {
814 815 816
      from = XINT (from_code);
      if (from < CHARSET_MIN_CODE (cs))
	from = CHARSET_MIN_CODE (cs);
Karl Heuer's avatar
Karl Heuer committed
817
    }
818
  if (NILP (to_code))
819
    to = CHARSET_MAX_CODE (cs);
Karl Heuer's avatar
Karl Heuer committed
820 821
  else
    {
822 823 824
      to = XINT (to_code);
      if (to > CHARSET_MAX_CODE (cs))
	to = CHARSET_MAX_CODE (cs);
Karl Heuer's avatar
Karl Heuer committed
825
    }
826
  map_charset_chars (NULL, function, arg, cs, from, to);
827
  return Qnil;
828
}
Karl Heuer's avatar
Karl Heuer committed
829 830


831 832 833 834
/* Define a charset according to the arguments.  The Nth argument is
   the Nth attribute of the charset (the last attribute `charset-id'
   is not included).  See the docstring of `define-charset' for the
   detail.  */
Karl Heuer's avatar
Karl Heuer committed
835

836 837
DEFUN ("define-charset-internal", Fdefine_charset_internal,
       Sdefine_charset_internal, charset_arg_max, MANY, 0,
838 839
       doc: /* For internal use only.
usage: (define-charset-internal ...)  */)
840
  (ptrdiff_t nargs, Lisp_Object *args)
Karl Heuer's avatar
Karl Heuer committed
841
{
842 843 844
  /* Charset attr vector.  */
  Lisp_Object attrs;
  Lisp_Object val;
845
  EMACS_UINT hash_code;
846
  struct Lisp_Hash_Table *hash_table = XHASH_TABLE (Vcharset_hash_table);
847
  int i, j;
848 849 850
  struct charset charset;
  int id;
  int dimension;
851
  bool new_definition_p;
852 853 854
  int nchars;

  if (nargs != charset_arg_max)
Paul Eggert's avatar
Paul Eggert committed
855 856 857
    Fsignal (Qwrong_number_of_arguments,
	     Fcons (intern ("define-charset-internal"),
		    make_number (nargs)));
858 859 860 861 862 863 864

  attrs = Fmake_vector (make_number (charset_attr_max), Qnil);

  CHECK_SYMBOL (args[charset_arg_name]);
  ASET (attrs, charset_name, args[charset_arg_name]);

  val = args[charset_arg_code_space];
865
  for (i = 0, dimension = 0, nchars = 1; ; i++)
866
    {
867
      Lisp_Object min_byte_obj, max_byte_obj;
868 869
      int min_byte, max_byte;

870 871
      min_byte_obj = Faref (val, make_number (i * 2));
      max_byte_obj = Faref (val, make_number (i * 2 + 1));
872
      CHECK_RANGED_INTEGER (min_byte_obj, 0, 255);
873
      min_byte = XINT (min_byte_obj);
874
      CHECK_RANGED_INTEGER (max_byte_obj, min_byte, 255);
875
      max_byte = XINT (max_byte_obj);
876 877 878 879 880
      charset.code_space[i * 4] = min_byte;
      charset.code_space[i * 4 + 1] = max_byte;
      charset.code_space[i * 4 + 2] = max_byte - min_byte + 1;
      if (max_byte > 0)
	dimension = i + 1;
881 882 883 884
      if (i == 3)
	break;
      nchars *= charset.code_space[i * 4 + 2];
      charset.code_space[i * 4 + 3] = nchars;
885
    }
Karl Heuer's avatar
Karl Heuer committed
886

887 888 889 890
  val = args[charset_arg_dimension];
  if (NILP (val))
    charset.dimension = dimension;
  else
Karl Heuer's avatar
Karl Heuer committed
891
    {
892
      CHECK_RANGED_INTEGER (val, 1, 4);
893
      charset.dimension = XINT (val);
Karl Heuer's avatar
Karl Heuer committed
894 895
    }

896 897 898 899 900 901 902 903
  charset.code_linear_p
    = (charset.dimension == 1
       || (charset.code_space[2] == 256
	   && (charset.dimension == 2
	       || (charset.code_space[6] == 256
		   && (charset.dimension == 3
		       || charset.code_space[10] == 256)))));

904
  if (! charset.code_linear_p)
Karl Heuer's avatar
Karl Heuer committed
905
    {
Dmitry Antipov's avatar
Dmitry Antipov committed
906
      charset.code_space_mask = xzalloc (256);
907 908 909 910
      for (i = 0; i < 4; i++)
	for (j = charset.code_space[i * 4]; j <= charset.code_space[i * 4 + 1];
	     j++)
	  charset.code_space_mask[j] |= (1 << i);
Karl Heuer's avatar
Karl Heuer committed
911 912
    }

913
  charset.iso_chars_96 = charset.code_space[2] == 96;
Karl Heuer's avatar
Karl Heuer committed
914

915 916 917
  charset.min_code = (charset.code_space[0]
		      | (charset.code_space[4] << 8)
		      | (charset.code_space[8] << 16)
918
		      | ((unsigned) charset.code_space[12] << 24));
919 920 921
  charset.max_code = (charset.code_space[1]
		      | (charset.code_space[5] << 8)
		      | (charset.code_space[9] << 16)
922
		      | ((unsigned) charset.code_space[13] << 24));
923
  charset.char_index_offset = 0;
924

925 926 927
  val = args[charset_arg_min_code];
  if (! NILP (val))
    {
928
      unsigned code = cons_to_unsigned (val, UINT_MAX);
929

930 931
      if (code < charset.min_code
	  || code > charset.max_code)
932 933
	args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
			     make_fixnum_or_float (charset.max_code), val);
934 935 936
      charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
      charset.min_code = code;
    }
937

938 939
  val = args[charset_arg_max_code];
  if (! NILP (val))
940
    {
941
      unsigned code = cons_to_unsigned (val, UINT_MAX);
942 943 944

      if (code < charset.min_code
	  || code > charset.max_code)
945 946
	args_out_of_range_3 (make_fixnum_or_float (charset.min_code),
			     make_fixnum_or_float (charset.max_code), val);
947
      charset.max_code = code;
948 949
    }

950
  charset.compact_codes_p = charset.max_code < 0x10000;
Karl Heuer's avatar
Karl Heuer committed
951

952 953 954 955 956
  val = args[charset_arg_invalid_code];
  if (NILP (val))
    {
      if (charset.min_code > 0)
	charset.invalid_code = 0;
Kenichi Handa's avatar
Kenichi Handa committed
957 958
      else
	{
959
	  if (charset.max_code < UINT_MAX)
960 961 962
	    charset.invalid_code = charset.max_code + 1;
	  else
	    error ("Attribute :invalid-code must be specified");
963 964
	}
    }
965
  else
966
    charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
Karl Heuer's avatar
Karl Heuer committed
967

968 969 970 971 972 973 974
  val = args[charset_arg_iso_final];
  if (NILP (val))
    charset.iso_final = -1;
  else
    {
      CHECK_NUMBER (val);
      if (XINT (val) < '0' || XINT (val) > 127)
975
	error ("Invalid iso-final-char: %"pI"d", XINT (val));
976 977
      charset.iso_final = XINT (val);
    }
Karl Heuer's avatar
Karl Heuer committed
978

979 980 981 982
  val = args[charset_arg_iso_revision];
  if (NILP (val))
    charset.iso_revision = -1;
  else
Karl Heuer's avatar
Karl Heuer committed
983
    {
984
      CHECK_RANGED_INTEGER (val, -1, 63);
985
      charset.iso_revision = XINT (val);
Karl Heuer's avatar
Karl Heuer committed
986 987
    }

988 989 990
  val = args[charset_arg_emacs_mule_id];
  if (NILP (val))
    charset.emacs_mule_id = -1;
Karl Heuer's avatar
Karl Heuer committed
991 992
  else
    {
993 994
      CHECK_NATNUM (val);
      if ((XINT (val) > 0 && XINT (val) <= 128) || XINT (val) >= 256)
995
	error ("Invalid emacs-mule-id: %"pI"d", XINT (val));
996
      charset.emacs_mule_id = XINT (val);
997
    }
998

999
  charset.ascii_compatible_p = ! NILP (args[charset_arg_ascii_compatible_p]);
1000

1001
  charset.supplementary_p = ! NILP (args[charset_arg_supplementary_p]);
Karl Heuer's avatar
Karl Heuer committed
1002

1003 1004
  charset.unified_p = 0;

1005
  memset (charset.fast_map, 0, sizeof (charset.fast_map));
1006 1007 1008 1009

  if (! NILP (args[charset_arg_code_offset]))
    {
      val = args[charset_arg_code_offset];
1010
      CHECK_CHARACTER (val);