charset.c 65.4 KB
Newer Older
1
/* Basic multilingual character support.
2
   Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3
   Licensed to the Free Software Foundation.
Karl Heuer's avatar
Karl Heuer committed
4

Karl Heuer's avatar
Karl Heuer committed
5 6 7 8 9 10
This file is part of GNU Emacs.

GNU Emacs is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
Karl Heuer's avatar
Karl Heuer committed
11

Karl Heuer's avatar
Karl Heuer committed
12 13 14 15
GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
Karl Heuer's avatar
Karl Heuer committed
16

Karl Heuer's avatar
Karl Heuer committed
17 18 19 20
You should have received a copy of the GNU General Public License
along with GNU Emacs; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */
Karl Heuer's avatar
Karl Heuer committed
21 22 23 24

/* At first, see the document in `charset.h' to understand the code in
   this file.  */

25 26 27 28
#ifdef emacs
#include <config.h>
#endif

Karl Heuer's avatar
Karl Heuer committed
29 30 31 32 33 34 35 36 37
#include <stdio.h>

#ifdef emacs

#include <sys/types.h>
#include "lisp.h"
#include "buffer.h"
#include "charset.h"
#include "coding.h"
Kenichi Handa's avatar
Kenichi Handa committed
38
#include "disptab.h"
Karl Heuer's avatar
Karl Heuer committed
39 40 41 42 43 44 45 46

#else  /* not emacs */

#include "mulelib.h"

#endif /* emacs */

Lisp_Object Qcharset, Qascii, Qcomposition;
Kenichi Handa's avatar
Kenichi Handa committed
47
Lisp_Object Qunknown;
Karl Heuer's avatar
Karl Heuer committed
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66

/* Declaration of special leading-codes.  */
int leading_code_composition;	/* for composite characters */
int leading_code_private_11;	/* for private DIMENSION1 of 1-column */
int leading_code_private_12;	/* for private DIMENSION1 of 2-column */
int leading_code_private_21;	/* for private DIMENSION2 of 1-column */
int leading_code_private_22;	/* for private DIMENSION2 of 2-column */

/* Declaration of special charsets.  */
int charset_ascii;		/* ASCII */
int charset_composition;	/* for a composite character */
int charset_latin_iso8859_1;	/* ISO8859-1 (Latin-1) */
int charset_jisx0208_1978;	/* JISX0208.1978 (Japanese Kanji old set) */
int charset_jisx0208;		/* JISX0208.1983 (Japanese Kanji) */
int charset_katakana_jisx0201;	/* JISX0201.Kana (Japanese Katakana) */
int charset_latin_jisx0201;	/* JISX0201.Roman (Japanese Roman) */
int charset_big5_1;		/* Big5 Level 1 (Chinese Traditional) */
int charset_big5_2;		/* Big5 Level 2 (Chinese Traditional) */

67 68
int min_composite_char;

Karl Heuer's avatar
Karl Heuer committed
69 70 71 72 73 74 75 76 77 78 79 80
Lisp_Object Qcharset_table;

/* A char-table containing information of each character set.  */
Lisp_Object Vcharset_table;

/* A vector of charset symbol indexed by charset-id.  This is used
   only for returning charset symbol from C functions.  */
Lisp_Object Vcharset_symbol_table;

/* A list of charset symbols ever defined.  */
Lisp_Object Vcharset_list;

81 82 83
/* Vector of translation table ever defined.
   ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;
84

85 86 87 88 89
/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

Lisp_Object Qauto_fill_chars;

Karl Heuer's avatar
Karl Heuer committed
90 91 92 93 94 95 96 97
/* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD.  */
int bytes_by_char_head[256];
int width_by_char_head[256];

/* Mapping table from ISO2022's charset (specified by DIMENSION,
   CHARS, and FINAL-CHAR) to Emacs' charset.  */
int iso_charset_table[2][2][128];

98 99 100 101 102 103 104 105
/* Table of pointers to the structure `cmpchar_info' indexed by
   CMPCHAR-ID.  */
struct cmpchar_info **cmpchar_table;
/* The current size of `cmpchar_table'.  */
static int cmpchar_table_size;
/* Number of the current composite characters.  */
int n_cmpchars;

Karl Heuer's avatar
Karl Heuer committed
106 107 108 109
/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;
int _fetch_multibyte_char_len;

110 111 112
/* Offset to add to a non-ASCII value when inserting it.  */
int nonascii_insert_offset;

113 114
/* Translation table for converting non-ASCII unibyte characters
   to multibyte codes, or nil.  */
115
Lisp_Object Vnonascii_translation_table;
116

117 118 119
/* List of all possible generic characters.  */
Lisp_Object Vgeneric_character_list;

Richard M. Stallman's avatar
Richard M. Stallman committed
120 121 122
#define min(X, Y) ((X) < (Y) ? (X) : (Y))
#define max(X, Y) ((X) > (Y) ? (X) : (Y))

123 124 125 126
void
invalid_character (c)
     int c;
{
127
  error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
128 129
}

Kenichi Handa's avatar
Kenichi Handa committed
130 131 132 133 134 135 136 137 138 139 140 141 142 143
/* Parse composite character string STR of length LENGTH (>= 2) and
   set BYTES, CHARSET, C1, and C2 as below.

   It is assumed that *STR is LEADING_CODE_COMPOSITION and the
   following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.

   If there is a valid composite character, set CHARSET, C1, and C2 to
   such values that MAKE_CHAR can make the composite character from
   them.  Otherwise, set CHARSET to CHARSET_COMPOSITION, set C1 to the
   second byte of the sequence, C2 to -1 so that MAKE_CHAR can make
   the invalid multibyte character whose string representation is two
   bytes of STR[0] and STR[1].  In any case, set BYTES to LENGTH.

   This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162

#define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2)	\
  do {									\
    int cmpchar_id = str_cmpchar_id ((str), (length));			\
    									\
    (charset) = CHARSET_COMPOSITION;					\
    (bytes) = (length);							\
    if (cmpchar_id >= 0)						\
      {									\
	(c1) = CHAR_FIELD2 (cmpchar_id);				\
	(c2) = CHAR_FIELD3 (cmpchar_id);				\
      }									\
    else								\
      {									\
	(c1) = (str)[1] & 0x7F;						\
	(c2) = -1;							\
      }									\
  } while (0)

Kenichi Handa's avatar
Kenichi Handa committed
163 164 165 166 167 168 169 170 171 172
/* Parse non-composite multibyte character string STR of length LENGTH
   (>= 2) and set BYTES to the length of actual multibyte sequence,
   CHARSET, C1, and C2 to such values that MAKE_CHAR can make the
   multibyte character from them.

   It is assumed that *STR is one of base leading codes (excluding
   LEADING_CODE_COMPOSITION) and the following (LENGTH - 1) bytes
   satisfy !CHAR_HEAD_P.

   This macro should be called only from SPLIT_MULTIBYTE_SEQ.  */
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199

#define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2)	\
  do {									\
    (bytes) = 1;							\
    (charset) = (str)[0];						\
    if ((charset) >= LEADING_CODE_PRIVATE_11				\
	&& (charset) <= LEADING_CODE_PRIVATE_22)			\
      (charset) = (str)[(bytes)++];					\
    if ((bytes) < (length))						\
      {									\
	(c1) = (str)[(bytes)++] & 0x7F;					\
	if ((bytes) < (length))						\
	  (c2) = (str)[(bytes)++] & 0x7F;				\
	else								\
	  (c2) = -1;							\
      }									\
    else								\
      (c1) = (c2) = -1;							\
  } while (0)

/* Parse string STR of length LENGTH and check if a multibyte
   characters is at STR.  set BYTES to the actual length, CHARSET, C1,
   C2 to proper values for that character.  */

#define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2)	\
  do {									\
    int i;								\
Kenichi Handa's avatar
Kenichi Handa committed
200 201 202 203
    if (ASCII_BYTE_P ((str)[0]))					\
      i = 1;								\
    else								\
      for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++);	\
204 205 206 207 208 209 210 211 212 213 214 215 216
    if (i == 1)								\
      (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ;		\
    else if ((str)[0] == LEADING_CODE_COMPOSITION)			\
      SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2);		\
    else								\
      {									\
	if (i > BYTES_BY_CHAR_HEAD ((str)[0]))				\
	  i = BYTES_BY_CHAR_HEAD ((str)[0]);				\
	SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2);		\
      }									\
  } while (0)

/* 1 if CHARSET, C1, and C2 compose a valid character, else 0.  */
Kenichi Handa's avatar
Kenichi Handa committed
217
#define CHAR_COMPONENTS_VALID_P(charset, c1, c2)	\
218 219 220 221 222
  (charset == CHARSET_ASCII				\
   ? ((c1) >= 0 && (c1) <= 0x7F)			\
   : (CHARSET_DIMENSION (charset) == 1			\
      ? ((c1) >= 0x20 && (c1) <= 0x7F)			\
      : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
223

Karl Heuer's avatar
Karl Heuer committed
224 225 226 227
/* Set STR a pointer to the multi-byte form of the character C.  If C
   is not a composite character, the multi-byte form is set in WORKBUF
   and STR points WORKBUF.  The caller should allocate at least 4-byte
   area at WORKBUF in advance.  Returns the length of the multi-byte
228 229
   form.  If C is an invalid character, store (C & 0xFF) in WORKBUF[0]
   and return 1.
Karl Heuer's avatar
Karl Heuer committed
230 231 232 233 234 235 236 237 238

   Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
   function directly if C can be an ASCII character.  */

int
non_ascii_char_to_string (c, workbuf, str)
     int c;
     unsigned char *workbuf, **str;
{
239
  if (c & CHAR_MODIFIER_MASK)	/* This includes the case C is negative.  */
240
    {
241 242 243 244 245
      /* Multibyte character can't have a modifier bit.  */
      if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
	invalid_character (c);

      /* For Meta, Shift, and Control modifiers, we need special care.  */
246
      if (c & CHAR_META)
247 248 249 250 251 252 253 254 255 256 257 258
	{
	  /* Move the meta bit to the right place for a string.  */
	  c = (c & ~CHAR_META) | 0x80;
	}
      if (c & CHAR_SHIFT)
	{
	  /* Shift modifier is valid only with [A-Za-z].  */
	  if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
	    c &= ~CHAR_SHIFT;
	  else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
	    c = (c & ~CHAR_SHIFT) - ('a' - 'A');
	}
259
      if (c & CHAR_CTL)
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
	{
	  /* Simulate the code in lread.c.  */
	  /* Allow `\C- ' and `\C-?'.  */
	  if (c == (CHAR_CTL | ' '))
	    c = 0;
	  else if (c == (CHAR_CTL | '?'))
	    c = 127;
	  /* ASCII control chars are made from letters (both cases),
	     as well as the non-letters within 0100...0137.  */
	  else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
	    c &= (037 | (~0177 & ~CHAR_CTL));
	  else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
	    c &= (037 | (~0177 & ~CHAR_CTL));
	}

      /* If C still has any modifier bits, it is an invalid character.  */
      if (c & CHAR_MODIFIER_MASK)
	invalid_character (c);

279
      *str = workbuf;
280
      *workbuf++ = c;
281
    }
282
  else
Karl Heuer's avatar
Karl Heuer committed
283
    {
284
      int charset, c1, c2;
Karl Heuer's avatar
Karl Heuer committed
285

286 287
      SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
      if (charset == CHARSET_COMPOSITION)
Karl Heuer's avatar
Karl Heuer committed
288
	{
289 290 291 292 293 294 295 296 297 298 299 300 301 302 303
	  if (c >= MAX_CHAR)
	    invalid_character (c);
	  if (c >= MIN_CHAR_COMPOSITION)
	    {
	      /* Valid composite character.  */
	      *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data;
	      workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len;
	    }
	  else
	    {
	      /* Invalid but can have multibyte form.  */
	      *str = workbuf;
	      *workbuf++ = LEADING_CODE_COMPOSITION;
	      *workbuf++ = c1 | 0x80;
	    }
Karl Heuer's avatar
Karl Heuer committed
304
	}
305
      else if (charset > CHARSET_COMPOSITION)
Karl Heuer's avatar
Karl Heuer committed
306
	{
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
	  *str = workbuf;
	  if (charset >= LEADING_CODE_EXT_11)
	    *workbuf++ = (charset < LEADING_CODE_EXT_12
			  ? LEADING_CODE_PRIVATE_11
			  : (charset < LEADING_CODE_EXT_21
			     ? LEADING_CODE_PRIVATE_12
			     : (charset < LEADING_CODE_EXT_22
				? LEADING_CODE_PRIVATE_21
				: LEADING_CODE_PRIVATE_22)));
	  *workbuf++ = charset;
	  if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
	    invalid_character (c);
	  if (c1)
	    {
	      *workbuf++ = c1 | 0x80;
	      if (c2 > 0)
		*workbuf++ = c2 | 0x80;
	    }
Karl Heuer's avatar
Karl Heuer committed
325
	}
326 327 328 329
      else if (charset == CHARSET_ASCII)
	*workbuf++= c & 0x7F;
      else
	invalid_character (c);
Karl Heuer's avatar
Karl Heuer committed
330 331 332 333 334
    }

  return (workbuf - *str);
}

Kenichi Handa's avatar
Kenichi Handa committed
335 336 337
/* Return the non-ASCII character corresponding to multi-byte form at
   STR of length LEN.  If ACTUAL_LEN is not NULL, store the byte
   length of the multibyte form in *ACTUAL_LEN.
338

Karl Heuer's avatar
Karl Heuer committed
339
   Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
Kenichi Handa's avatar
Kenichi Handa committed
340
   directly if you want ot handle ASCII characters as well.  */
Karl Heuer's avatar
Karl Heuer committed
341

Andreas Schwab's avatar
Andreas Schwab committed
342
int
343
string_to_non_ascii_char (str, len, actual_len)
344
     const unsigned char *str;
345
     int len, *actual_len;
Karl Heuer's avatar
Karl Heuer committed
346
{
347
  int c, bytes, charset, c1, c2;
Karl Heuer's avatar
Karl Heuer committed
348

349 350
  SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
  c = MAKE_CHAR (charset, c1, c2);
Karl Heuer's avatar
Karl Heuer committed
351
  if (actual_len)
352
    *actual_len = bytes;
Karl Heuer's avatar
Karl Heuer committed
353 354 355
  return c;
}

Kenichi Handa's avatar
Kenichi Handa committed
356 357
/* Return the length of the multi-byte form at string STR of length LEN.
   Use the macro MULTIBYTE_FORM_LENGTH instead.  */
Karl Heuer's avatar
Karl Heuer committed
358 359
int
multibyte_form_length (str, len)
360
     const unsigned char *str;
Karl Heuer's avatar
Karl Heuer committed
361 362
     int len;
{
363
  int bytes;
Karl Heuer's avatar
Karl Heuer committed
364

365
  PARSE_MULTIBYTE_SEQ (str, len, bytes);
366
  return bytes;
Karl Heuer's avatar
Karl Heuer committed
367 368
}

369 370 371
/* Check multibyte form at string STR of length LEN and set variables
   pointed by CHARSET, C1, and C2 to charset and position codes of the
   character at STR, and return 0.  If there's no multibyte character,
Karl Heuer's avatar
Karl Heuer committed
372 373 374
   return -1.  This should be used only in the macro SPLIT_STRING
   which checks range of STR in advance.  */

Andreas Schwab's avatar
Andreas Schwab committed
375
int
Karl Heuer's avatar
Karl Heuer committed
376
split_non_ascii_string (str, len, charset, c1, c2)
377 378 379
     const unsigned char *str;
     unsigned char *c1, *c2;
     int len, *charset;
Karl Heuer's avatar
Karl Heuer committed
380
{
381
  register int bytes, cs, code1, code2 = -1;
Karl Heuer's avatar
Karl Heuer committed
382

383 384
  SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
  if (cs == CHARSET_ASCII)
Karl Heuer's avatar
Karl Heuer committed
385
    return -1;
386 387 388
  *charset = cs;
  *c1 = code1;
  *c2 = code2;
389
  return 0;
390 391
}

Kenichi Handa's avatar
Kenichi Handa committed
392 393
/* Return 1 iff character C has valid printable glyph.
   Use the macro CHAR_PRINTABLE_P instead.  */
394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417
int
char_printable_p (c)
     int c;
{
  int charset, c1, c2, chars;

  if (SINGLE_BYTE_CHAR_P (c))
    return 1;
  if (c >= MIN_CHAR_COMPOSITION)
    return (c < MAX_CHAR);
  
  SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
  if (! CHARSET_DEFINED_P (charset))
    return 0;
  if (CHARSET_CHARS (charset) == 94
      ? c1 <= 32 || c1 >= 127
      : c1 < 32)
    return 0;
  if (CHARSET_DIMENSION (charset) == 2
      && (CHARSET_CHARS (charset) == 94
	  ? c2 <= 32 || c2 >= 127
	  : c2 < 32))
    return 0;
  return 1;
Karl Heuer's avatar
Karl Heuer committed
418 419
}

420
/* Translate character C by translation table TABLE.  If C
421 422 423
   is negative, translate a character specified by CHARSET, C1, and C2
   (C1 and C2 are code points of the character).  If no translation is
   found in TABLE, return C.  */
Andreas Schwab's avatar
Andreas Schwab committed
424
int
425
translate_char (table, c, charset, c1, c2)
Kenichi Handa's avatar
Kenichi Handa committed
426 427 428 429 430 431
     Lisp_Object table;
     int c, charset, c1, c2;
{
  Lisp_Object ch;
  int alt_charset, alt_c1, alt_c2, dimension;

432
  if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
Kenichi Handa's avatar
Kenichi Handa committed
433
  if (!CHAR_TABLE_P (table)
434
      || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
Kenichi Handa's avatar
Kenichi Handa committed
435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
    return c;

  SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
  dimension = CHARSET_DIMENSION (alt_charset);
  if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
    /* CH is not a generic character, just return it.  */
    return XFASTINT (ch);

  /* Since CH is a generic character, we must return a specific
     charater which has the same position codes as C from CH.  */
  if (charset < 0)
    SPLIT_CHAR (c, charset, c1, c2);
  if (dimension != CHARSET_DIMENSION (charset))
    /* We can't make such a character because of dimension mismatch.  */
    return c;
  return MAKE_CHAR (alt_charset, c1, c2);
}

453
/* Convert the unibyte character C to multibyte based on
454
   Vnonascii_translation_table or nonascii_insert_offset.  If they can't
455 456
   convert C to a valid multibyte character, convert it based on
   DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character.  */
457

Andreas Schwab's avatar
Andreas Schwab committed
458
int
459 460 461
unibyte_char_to_multibyte (c)
     int c;
{
462
  if (c < 0400 && c >= 0200)
463
    {
464 465
      int c_save = c;

466
      if (! NILP (Vnonascii_translation_table))
Kenichi Handa's avatar
Kenichi Handa committed
467 468
	{
	  c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
469
	  if (c >= 0400 && ! char_valid_p (c, 0))
Kenichi Handa's avatar
Kenichi Handa committed
470 471 472 473 474
	    c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
	}
      else if (c >= 0240 && nonascii_insert_offset > 0)
	{
	  c += nonascii_insert_offset;
475
	  if (c < 0400 || ! char_valid_p (c, 0))
Kenichi Handa's avatar
Kenichi Handa committed
476 477 478
	    c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
	}
      else if (c >= 0240)
479
	c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
480 481 482
    }
  return c;
}
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509


/* Convert the multibyte character C to unibyte 8-bit character based
   on Vnonascii_translation_table or nonascii_insert_offset.  If
   REV_TBL is non-nil, it should be a reverse table of
   Vnonascii_translation_table, i.e. what given by:
     Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0))  */

int
multibyte_char_to_unibyte (c, rev_tbl)
     int c;
     Lisp_Object rev_tbl;
{
  if (!SINGLE_BYTE_CHAR_P (c))
    {
      int c_save = c;

      if (! CHAR_TABLE_P (rev_tbl)
	  && CHAR_TABLE_P (Vnonascii_translation_table))
	rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
					  make_number (0));
      if (CHAR_TABLE_P (rev_tbl))
	{
	  Lisp_Object temp;
	  temp = Faref (rev_tbl, make_number (c));
	  if (INTEGERP (temp))
	    c = XINT (temp);
Kenichi Handa's avatar
Kenichi Handa committed
510 511 512 513 514 515 516 517 518
	  if (c >= 256)
	    c = (c_save & 0177) + 0200;
	}
      else
	{
	  if (nonascii_insert_offset > 0)
	    c -= nonascii_insert_offset;
	  if (c < 128 || c >= 256)
	    c = (c_save & 0177) + 0200;
519 520 521 522 523 524
	}
    }

  return c;
}

525

Karl Heuer's avatar
Karl Heuer committed
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543
/* Update the table Vcharset_table with the given arguments (see the
   document of `define-charset' for the meaning of each argument).
   Several other table contents are also updated.  The caller should
   check the validity of CHARSET-ID and the remaining arguments in
   advance.  */

void
update_charset_table (charset_id, dimension, chars, width, direction,
		      iso_final_char, iso_graphic_plane,
		      short_name, long_name, description)
     Lisp_Object charset_id, dimension, chars, width, direction;
     Lisp_Object iso_final_char, iso_graphic_plane;
     Lisp_Object short_name, long_name, description;
{
  int charset = XINT (charset_id);
  int bytes;
  unsigned char leading_code_base, leading_code_ext;

544 545 546
  if (NILP (CHARSET_TABLE_ENTRY (charset)))
    CHARSET_TABLE_ENTRY (charset)
      = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
Karl Heuer's avatar
Karl Heuer committed
547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574

  /* Get byte length of multibyte form, base leading-code, and
     extended leading-code of the charset.  See the comment under the
     title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h.  */
  bytes = XINT (dimension);
  if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
    {
      /* Official charset, it doesn't have an extended leading-code.  */
      if (charset != CHARSET_ASCII)
	bytes += 1; /* For a base leading-code.  */
      leading_code_base = charset;
      leading_code_ext = 0;
    }
  else
    {
      /* Private charset.  */
      bytes += 2; /* For base and extended leading-codes.  */
      leading_code_base
	= (charset < LEADING_CODE_EXT_12
	   ? LEADING_CODE_PRIVATE_11
	   : (charset < LEADING_CODE_EXT_21
	      ? LEADING_CODE_PRIVATE_12
	      : (charset < LEADING_CODE_EXT_22
		 ? LEADING_CODE_PRIVATE_21
		 : LEADING_CODE_PRIVATE_22)));
      leading_code_ext = charset;
    } 

575 576 577
  if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
    error ("Invalid dimension for the charset-ID %d", charset);

Karl Heuer's avatar
Karl Heuer committed
578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603
  CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
  CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
  CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
  CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
  CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
  CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
  CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
    = make_number (leading_code_base);
  CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
    = make_number (leading_code_ext);
  CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
  CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
    = iso_graphic_plane;
  CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
  CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
  CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
  CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;

  {
    /* If we have already defined a charset which has the same
       DIMENSION, CHARS and ISO-FINAL-CHAR but the different
       DIRECTION, we must update the entry REVERSE-CHARSET of both
       charsets.  If there's no such charset, the value of the entry
       is set to nil.  */
    int i;

604
    for (i = 0; i <= MAX_CHARSET; i++)
Karl Heuer's avatar
Karl Heuer committed
605 606 607 608 609 610 611 612 613 614 615 616 617
      if (!NILP (CHARSET_TABLE_ENTRY (i)))
	{
	  if (CHARSET_DIMENSION (i) == XINT (dimension)
	      && CHARSET_CHARS (i) == XINT (chars)
	      && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
	      && CHARSET_DIRECTION (i) != XINT (direction))
	    {
	      CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
		= make_number (i);
	      CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
	      break;
	    }
	}
618
    if (i > MAX_CHARSET)
Karl Heuer's avatar
Karl Heuer committed
619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680
      /* No such a charset.  */
      CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
	= make_number (-1);
  }

  if (charset != CHARSET_ASCII
      && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
    {
      width_by_char_head[leading_code_base] = XINT (width);

      /* Update table emacs_code_class.  */
      emacs_code_class[charset] = (bytes == 2
				   ? EMACS_leading_code_2
				   : (bytes == 3
				      ? EMACS_leading_code_3
				      : EMACS_leading_code_4));
    }

  /* Update table iso_charset_table.  */
  if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
    ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
}

#ifdef emacs

/* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
   is invalid.  */
int
get_charset_id (charset_symbol)
     Lisp_Object charset_symbol;
{
  Lisp_Object val;
  int charset;

  return ((SYMBOLP (charset_symbol)
	   && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
	   && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
	       CHARSET_VALID_P (charset)))
	  ? charset : -1);
}

/* Return an identification number for a new private charset of
   DIMENSION and WIDTH.  If there's no more room for the new charset,
   return 0.  */
Lisp_Object
get_new_private_charset_id (dimension, width)
     int dimension, width;
{
  int charset, from, to;

  if (dimension == 1)
    {
      if (width == 1)
	from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
      else
	from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
    }
  else
    {
      if (width == 1)
	from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
      else
681
	from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
Karl Heuer's avatar
Karl Heuer committed
682 683 684 685 686 687 688 689 690 691
    }

  for (charset = from; charset < to; charset++)
    if (!CHARSET_DEFINED_P (charset)) break;

  return make_number (charset < to ? charset : 0);
}

DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
  "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
Kenichi Handa's avatar
Kenichi Handa committed
692
If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
Karl Heuer's avatar
Karl Heuer committed
693 694 695 696 697 698 699 700 701 702 703
 treated as a private charset.\n\
INFO-VECTOR is a vector of the format:\n\
   [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
    SHORT-NAME LONG-NAME DESCRIPTION]\n\
The meanings of each elements is as follows:\n\
DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
WIDTH (integer) is the number of columns a character in the charset\n\
occupies on the screen: one of 0, 1, and 2.\n\
\n\
DIRECTION (integer) is the rendering direction of characters in the\n\
704 705
charset when rendering.  If 0, render from left to right, else\n\
render from right to left.\n\
Karl Heuer's avatar
Karl Heuer committed
706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760
\n\
ISO-FINAL-CHAR (character) is the final character of the\n\
corresponding ISO 2022 charset.\n\
\n\
ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
while encoding to variants of ISO 2022 coding system, one of the\n\
following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
\n\
SHORT-NAME (string) is the short name to refer to the charset.\n\
\n\
LONG-NAME (string) is the long name to refer to the charset.\n\
\n\
DESCRIPTION (string) is the description string of the charset.")
  (charset_id, charset_symbol, info_vector)
     Lisp_Object charset_id, charset_symbol, info_vector;
{
  Lisp_Object *vec;

  if (!NILP (charset_id))
    CHECK_NUMBER (charset_id, 0);
  CHECK_SYMBOL (charset_symbol, 1);
  CHECK_VECTOR (info_vector, 2);

  if (! NILP (charset_id))
    {
      if (! CHARSET_VALID_P (XINT (charset_id)))
	error ("Invalid CHARSET: %d", XINT (charset_id));
      else if (CHARSET_DEFINED_P (XINT (charset_id)))
	error ("Already defined charset: %d", XINT (charset_id));
    }

  vec = XVECTOR (info_vector)->contents;
  if (XVECTOR (info_vector)->size != 9
      || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
      || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
      || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
      || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
      || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
      || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
      || !STRINGP (vec[6])
      || !STRINGP (vec[7])
      || !STRINGP (vec[8]))
    error ("Invalid info-vector argument for defining charset %s",
	   XSYMBOL (charset_symbol)->name->data);

  if (NILP (charset_id))
    {
      charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
      if (XINT (charset_id) == 0)
	error ("There's no room for a new private charset %s",
	       XSYMBOL (charset_symbol)->name->data);
    }

  update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
			vec[4], vec[5], vec[6], vec[7], vec[8]);
761
  Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
Karl Heuer's avatar
Karl Heuer committed
762 763 764 765 766
  CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
  Vcharset_list = Fcons (charset_symbol, Vcharset_list);
  return Qnil;
}

767 768 769 770 771 772 773 774 775
DEFUN ("generic-character-list", Fgeneric_character_list,
       Sgeneric_character_list, 0, 0, 0,
  "Return a list of all possible generic characters.\n\
It includes a generic character for a charset not yet defined.")
  ()
{
  return Vgeneric_character_list;
}

776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805
DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
       Sget_unused_iso_final_char, 2, 2, 0,
  "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
CHARS is the number of characters in a dimension: 94 or 96.\n\
\n\
This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
If there's no unused final char for the specified kind of charset,\n\
return nil.")
  (dimension, chars)
     Lisp_Object dimension, chars;
{
  int final_char;

  CHECK_NUMBER (dimension, 0);
  CHECK_NUMBER (chars, 1);
  if (XINT (dimension) != 1 && XINT (dimension) != 2)
    error ("Invalid charset dimension %d, it should be 1 or 2",
	   XINT (dimension));
  if (XINT (chars) != 94 && XINT (chars) != 96)
    error ("Invalid charset chars %d, it should be 94 or 96",
	   XINT (chars));
  for (final_char = '0'; final_char <= '?'; final_char++)
    {
      if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
	break;
    }
  return (final_char <= '?' ? make_number (final_char) : Qnil);
}

Karl Heuer's avatar
Karl Heuer committed
806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834
DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
       4, 4, 0,
  "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
CHARSET should be defined by `defined-charset' in advance.")
  (dimension, chars, final_char, charset_symbol)
     Lisp_Object dimension, chars, final_char, charset_symbol;
{
  int charset;

  CHECK_NUMBER (dimension, 0);
  CHECK_NUMBER (chars, 1);
  CHECK_NUMBER (final_char, 2);
  CHECK_SYMBOL (charset_symbol, 3);

  if (XINT (dimension) != 1 && XINT (dimension) != 2)
    error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
  if (XINT (chars) != 94 && XINT (chars) != 96)
    error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
  if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
    error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
  if ((charset = get_charset_id (charset_symbol)) < 0)
    error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);

  ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
  return Qnil;
}

/* Return number of different charsets in STR of length LEN.  In
   addition, for each found charset N, CHARSETS[N] is set 1.  The
835
   caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
836 837 838 839
   It may lookup a translation table TABLE if supplied.

   If CMPCHARP is nonzero and some composite character is found,
   CHARSETS[128] is also set 1 and the returned number is incremented
Kenichi Handa's avatar
Kenichi Handa committed
840 841 842 843 844
   by 1.

   If MULTIBYTE is zero, do not check multibyte characters, i.e. if
   any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
   8-bit codes are found CHARSET[1] is set to 1.  */
Karl Heuer's avatar
Karl Heuer committed
845 846

int
Kenichi Handa's avatar
Kenichi Handa committed
847
find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
848 849
     unsigned char *str;
     int len, *charsets;
Kenichi Handa's avatar
Kenichi Handa committed
850
     Lisp_Object table;
851
     int cmpcharp;
Kenichi Handa's avatar
Kenichi Handa committed
852
     int multibyte;
Karl Heuer's avatar
Karl Heuer committed
853
{
854
  register int num = 0, c;
Karl Heuer's avatar
Karl Heuer committed
855

Kenichi Handa's avatar
Kenichi Handa committed
856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875
  if (! multibyte)
    {
      unsigned char *endp = str + len;
      int maskbits = 0;
	
      while (str < endp && maskbits != 3)
	maskbits |=  (*str++ < 0x80 ? 1 : 2);
      if (maskbits & 1)
	{
	  charsets[0] = 1;
	  num++;
	}
      if (maskbits & 2)
	{
	  charsets[1] = 1;
	  num++;
	}
      return num;
    }

Kenichi Handa's avatar
Kenichi Handa committed
876 877 878
  if (! CHAR_TABLE_P (table))
    table = Qnil;

Karl Heuer's avatar
Karl Heuer committed
879 880
  while (len > 0)
    {
881
      int bytes, charset;
882
      c = *str;
Kenichi Handa's avatar
Kenichi Handa committed
883
      
884
      if (c == LEADING_CODE_COMPOSITION)
885
	{
886 887
	  int cmpchar_id = str_cmpchar_id (str, len);
	  GLYPH *glyph;
888

889
	  if (cmpchar_id >= 0)
890
	    {
891
	      struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
892 893
	      int i;

894
	      for (i = 0; i < cmp_p->glyph_len; i++)
895
		{
896
		  c = cmp_p->glyph[i];
897 898
		  if (!NILP (table))
		    {
899
		      if ((c = translate_char (table, c, 0, 0, 0)) < 0)
900
			c = cmp_p->glyph[i];
901 902 903 904 905 906 907 908 909
		    }
		  if ((charset = CHAR_CHARSET (c)) < 0)
		    charset = CHARSET_ASCII;
		  if (!charsets[charset])
		    {
		      charsets[charset] = 1;
		      num += 1;
		    }
		}
910 911 912 913 914 915 916
	      str += cmp_p->len;
	      len -= cmp_p->len;
	      if (cmpcharp && !charsets[CHARSET_COMPOSITION])
		{
		  charsets[CHARSET_COMPOSITION] = 1;
		  num += 1;
		}
917
	      continue;
918 919
	    }

Kenichi Handa's avatar
Kenichi Handa committed
920
	  charset = 1;		/* This leads to `unknown' charset.  */
921 922
	  bytes = 1;
	}
Kenichi Handa's avatar
Kenichi Handa committed
923 924
      else
	{
925 926 927
	  c = STRING_CHAR_AND_LENGTH (str, len, bytes);
	  if (! NILP (table))
	    {
928
	      int c1 = translate_char (table, c, 0, 0, 0);
929 930 931 932
	      if (c1 >= 0)
		c = c1;
	    }
	  charset = CHAR_CHARSET (c);
Kenichi Handa's avatar
Kenichi Handa committed
933
	}
Karl Heuer's avatar
Karl Heuer committed
934 935 936 937 938 939 940 941 942 943 944 945 946

      if (!charsets[charset])
	{
	  charsets[charset] = 1;
	  num += 1;
	}
      str += bytes;
      len -= bytes;
    }
  return num;
}

DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
Kenichi Handa's avatar
Kenichi Handa committed
947
       2, 3, 0,
Karl Heuer's avatar
Karl Heuer committed
948
  "Return a list of charsets in the region between BEG and END.\n\
Kenichi Handa's avatar
Kenichi Handa committed
949
BEG and END are buffer positions.\n\
950 951
If the region contains any composite character,\n\
`composition' is included in the returned list.\n\
Kenichi Handa's avatar
Kenichi Handa committed
952 953 954
Optional arg TABLE if non-nil is a translation table to look up.\n\
\n\
If the region contains invalid multiybte characters,\n\
955
`unknown' is included in the returned list.\n\
Kenichi Handa's avatar
Kenichi Handa committed
956 957 958 959
\n\
If the current buffer is unibyte, the returned list contains\n\
`ascii' if any 7-bit characters are found,\n\
and `unknown' if any 8-bit characters are found.")
Kenichi Handa's avatar
Kenichi Handa committed
960 961
  (beg, end, table)
     Lisp_Object beg, end, table;
Karl Heuer's avatar
Karl Heuer committed
962
{
963
  int charsets[MAX_CHARSET + 1];
964
  int from, from_byte, to, stop, stop_byte, i;
Karl Heuer's avatar
Karl Heuer committed
965
  Lisp_Object val;
Kenichi Handa's avatar
Kenichi Handa committed
966 967
  int undefined;
  int multibyte = !NILP (current_buffer->enable_multibyte_characters);
Karl Heuer's avatar
Karl Heuer committed
968 969 970 971

  validate_region (&beg, &end);
  from = XFASTINT (beg);
  stop = to = XFASTINT (end);
972

Karl Heuer's avatar
Karl Heuer committed
973
  if (from < GPT && GPT < to)
974 975 976 977 978 979 980 981 982
    {
      stop = GPT;
      stop_byte = GPT_BYTE;
    }
  else
    stop_byte = CHAR_TO_BYTE (stop);

  from_byte = CHAR_TO_BYTE (from);

983
  bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
Karl Heuer's avatar
Karl Heuer committed
984 985
  while (1)
    {
986
      find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
Kenichi Handa's avatar
Kenichi Handa committed
987
			   charsets, table, 1, multibyte);
Karl Heuer's avatar
Karl Heuer committed
988
      if (stop < to)
989 990 991 992
	{
	  from = stop, from_byte = stop_byte;
	  stop = to, stop_byte = CHAR_TO_BYTE (stop);
	}
Karl Heuer's avatar
Karl Heuer committed
993 994 995
      else
	break;
    }
996

Karl Heuer's avatar
Karl Heuer committed
997
  val = Qnil;
Kenichi Handa's avatar
Kenichi Handa committed
998 999
  undefined = 0;
  for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
Karl Heuer's avatar
Karl Heuer committed
1000
    if (charsets[i])
Kenichi Handa's avatar
Kenichi Handa committed
1001 1002 1003 1004 1005 1006 1007 1008
      {
	if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
	  val = Fcons (CHARSET_SYMBOL (i), val);
	else
	  undefined = 1;
      }
  if (undefined)
    val = Fcons (Qunknown, val);
Karl Heuer's avatar
Karl Heuer committed
1009 1010 1011 1012
  return val;
}

DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
Kenichi Handa's avatar
Kenichi Handa committed
1013 1014
       1, 2, 0,
  "Return a list of charsets in STR.\n\
1015 1016
If the string contains any composite characters,\n\
`composition' is included in the returned list.\n\
Kenichi Handa's avatar
Kenichi Handa committed
1017 1018 1019 1020 1021 1022 1023 1024
Optional arg TABLE if non-nil is a translation table to look up.\n\
\n\
If the region contains invalid multiybte characters,\n\
`unknown' is included in the returned list.\n\
\n\
If STR is unibyte, the returned list contains\n\
`ascii' if any 7-bit characters are found,\n\
and `unknown' if any 8-bit characters are found.")
Kenichi Handa's avatar
Kenichi Handa committed
1025 1026
  (str, table)
     Lisp_Object str, table;
Karl Heuer's avatar
Karl Heuer committed
1027
{
1028
  int charsets[MAX_CHARSET + 1];
Karl Heuer's avatar
Karl Heuer committed
1029 1030
  int i;
  Lisp_Object val;
Kenichi Handa's avatar
Kenichi Handa committed
1031 1032
  int undefined;
  int multibyte;
Karl Heuer's avatar
Karl Heuer committed
1033 1034

  CHECK_STRING (str, 0);
Kenichi Handa's avatar
Kenichi Handa committed
1035
  multibyte = STRING_MULTIBYTE (str);
1036

1037
  bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
1038
  find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
Kenichi Handa's avatar
Kenichi Handa committed
1039
		       charsets, table, 1, multibyte);
Karl Heuer's avatar
Karl Heuer committed
1040
  val = Qnil;
Kenichi Handa's avatar
Kenichi Handa committed
1041 1042
  undefined = 0;
  for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
Karl Heuer's avatar
Karl Heuer committed
1043
    if (charsets[i])
Kenichi Handa's avatar
Kenichi Handa committed
1044 1045 1046 1047 1048 1049 1050 1051
      {
	if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
	  val = Fcons (CHARSET_SYMBOL (i), val);
	else
	  undefined = 1;
      }
  if (undefined)
    val = Fcons (Qunknown, val);
Karl Heuer's avatar
Karl Heuer committed
1052 1053 1054 1055
  return val;
}

DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
1056
  "")
Karl Heuer's avatar
Karl Heuer committed
1057 1058 1059
  (charset, code1, code2)
     Lisp_Object charset, code1, code2;
{
1060 1061
  int charset_id, c1, c2;

Karl Heuer's avatar
Karl Heuer committed
1062
  CHECK_NUMBER (charset, 0);
1063 1064 1065
  charset_id = XINT (charset);
  if (!CHARSET_DEFINED_P (charset_id))
    error ("Invalid charset ID: %d", XINT (charset));
Karl Heuer's avatar
Karl Heuer committed
1066 1067

  if (NILP (code1))
1068
    c1 = 0;
Karl Heuer's avatar
Karl Heuer committed
1069
  else
1070 1071 1072 1073
    {
      CHECK_NUMBER (code1, 1);
      c1 = XINT (code1);
    }
Karl Heuer's avatar
Karl Heuer committed
1074
  if (NILP (code2))
1075
    c2 = 0;
Karl Heuer's avatar
Karl Heuer committed
1076
  else
1077 1078 1079 1080
    {
      CHECK_NUMBER (code2, 2);
      c2 = XINT (code2);
    }
Karl Heuer's avatar
Karl Heuer committed
1081

1082
  if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1083
    error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1084 1085 1086 1087 1088
  c1 &= 0x7F;
  c2 &= 0x7F;
  if (c1 == 0
      ? c2 != 0
      : (c2 == 0
Kenichi Handa's avatar
Kenichi Handa committed
1089 1090
	 ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
	 : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
1091
    error ("Invalid code points for charset ID %d: %d %d", charset_id, c1, c2);
1092 1093

  return make_number (MAKE_CHAR (charset_id, c1, c2));
Karl Heuer's avatar
Karl Heuer committed
1094 1095 1096
}

DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
Kenichi Handa's avatar
Kenichi Handa committed
1097 1098 1099
  "Return list of charset and one or two position-codes of CHAR.\n\
If CHAR is invalid as a character code,\n\
return a list of symbol `unknown' and CHAR.")
Karl Heuer's avatar
Karl Heuer committed
1100 1101 1102 1103
  (ch)
     Lisp_Object ch;
{
  Lisp_Object val;
Kenichi Handa's avatar
Kenichi Handa committed
1104
  int c, charset, c1, c2;
Karl Heuer's avatar
Karl Heuer committed
1105 1106

  CHECK_NUMBER (ch, 0);
Kenichi Handa's avatar
Kenichi Handa committed
1107 1108 1109
  c = XFASTINT (ch);
  if (!CHAR_VALID_P (c, 1))
    return Fcons (Qunknown, Fcons (ch, Qnil));
Karl Heuer's avatar
Karl Heuer committed
1110
  SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1111
  return (c2 >= 0
Karl Heuer's avatar
Karl Heuer committed
1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126
	  ? Fcons (CHARSET_SYMBOL (charset),
		   Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
	  : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
}

DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
  "Return charset of CHAR.")
  (ch)
     Lisp_Object ch;
{
  CHECK_NUMBER (ch, 0);

  return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
}

1127
DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1128
  "Return charset of a character in the current buffer at position POS.\n\
1129 1130
If POS is nil, it defauls to the current point.\n\
If POS is out of range, the value is nil.")
1131 1132 1133
  (pos)
     Lisp_Object pos;
{
1134
  register int pos_byte, bytes, charset, c1, c2;
1135 1136 1137 1138 1139
  register unsigned char *p;

  if (NILP (pos))
    pos_byte = PT_BYTE;
  else if (MARKERP (pos))
1140 1141 1142 1143 1144
    {
      pos_byte = marker_byte_position (pos);
      if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
	return Qnil;
    }
1145 1146 1147
  else
    {
      CHECK_NUMBER (pos, 0);
1148 1149
      if (XINT (pos) < BEGV || XINT (pos) >= ZV)
	return Qnil;
1150 1151 1152
      pos_byte = CHAR_TO_BYTE (XINT (pos));
    }
  p = BYTE_POS_ADDR (pos_byte);
1153 1154 1155 1156 1157 1158 1159 1160 1161
  if (BASE_LEADING_CODE_P (*p))
    {
      SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
      if (charset < 0)
	charset = 1;
    }
  else
    charset = CHARSET_ASCII;

1162 1163 1164
  return CHARSET_SYMBOL (charset);
}

Karl Heuer's avatar
Karl Heuer committed
1165
DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1166 1167 1168 1169 1170 1171 1172
  "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
\n\
ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
by their DIMENSION, CHARS, and FINAL-CHAR,\n\
where as Emacs distinguishes them by charset symbol.\n\
See the documentation of the function `charset-info' for the meanings of\n\
DIMENSION, CHARS, and FINAL-CHAR.")
Karl Heuer's avatar
Karl Heuer committed
1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186
  (dimension, chars, final_char)
     Lisp_Object dimension, chars, final_char;
{
  int charset;

  CHECK_NUMBER (dimension, 0);
  CHECK_NUMBER (chars, 1);
  CHECK_NUMBER (final_char, 2);

  if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
    return Qnil;
  return CHARSET_SYMBOL (charset);
}

Kenichi Handa's avatar
Kenichi Handa committed
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201
/* If GENERICP is nonzero, return nonzero iff C is a valid normal or
   generic character.  If GENERICP is zero, return nonzero iff C is a
   valid normal character.  Do not call this function directly,
   instead use macro CHAR_VALID_P.  */
int
char_valid_p (c, genericp)
     int c, genericp;
{
  int charset, c1, c2;

  if (c < 0)
    return 0;
  if (SINGLE_BYTE_CHAR_P (c))
    return 1;
  SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217
  if (charset == CHARSET_COMPOSITION)
    return ((c >= MIN_CHAR_COMPOSITION
	     && c < MIN_CHAR_COMPOSITION + n_cmpchars)
	    || (genericp && c == GENERIC_COMPOSITION_CHAR));
  if (genericp)
    {
      if (c1)
	{
	  if (c2 <= 0) c