/* Header for multibyte character handler.
   Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   Licensed to the Free Software Foundation.

This file is part of GNU Emacs.

GNU Emacs is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Emacs; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.  */

#ifndef _CHARSET_H
#define _CHARSET_H

/*** GENERAL NOTE on CHARACTER SET (CHARSET) ***

  A character set ("charset" hereafter) is a meaningful collection
  (i.e. language, culture, functionality, etc) of characters.  Emacs
  handles multiple charsets at once.  Each charset corresponds to one
  of the ISO charsets.  Emacs identifies a charset by a unique
  identification number, whereas ISO identifies a charset by a triplet
  of DIMENSION, CHARS and FINAL-CHAR.  So, hereafter, just saying
  "charset" means an identification number (integer value).

  The value range of charset is 0x00, 0x81..0xFE.  There are four
  kinds of charset depending on DIMENSION (1 or 2) and CHARS (94 or
  96).  For instance, a charset of DIMENSION2_CHARS94 contains 94x94
  characters.

  Within Emacs Lisp, a charset is treated as a symbol which has a
  property `charset'.  The property value is a vector containing
  various information about the charset.  For readability of C code,
  we use the following convention for C variable names:
	charset_symbol: Emacs Lisp symbol of a charset
	charset_id: Emacs Lisp integer of an identification number of a charset
	charset: C integer of an identification number of a charset

  Each charset (except for ASCII) is assigned a base leading-code
  (range 0x80..0x9D).  In addition, a charset of 0xA0 or greater
  (whose base leading-code is 0x9A..0x9D) is assigned an extended
  leading-code (range 0xA0..0xFE).  In this case, each base
  leading-code specifies the allowable range of extended leading-codes
  as shown in the table below.  A leading-code is used to represent a
  character in Emacs' buffer and string.

  We call a charset which has an extended leading-code a "private
  charset" because those are mainly for charsets which are not yet
  registered by ISO.  In contrast, we call a charset which does
  not have an extended leading-code an "official charset".

  ---------------------------------------------------------------------------
  charset	dimension	 base leading-code	extended leading-code
  ---------------------------------------------------------------------------
  0x00		official dim1    -- none --		-- none --
		(ASCII)
  0x01..0x7F	--never used--
  0x80		--never used--
  0x81..0x8F	official dim1    same as charset	-- none --
  0x90..0x99	official dim2	 same as charset	-- none --
  0x9A..0x9F	--never used--
  0xA0..0xDF	private dim1	    0x9A		same as charset
		of 1-column width
  0xE0..0xEF	private dim1	    0x9B		same as charset
		of 2-column width
  0xF0..0xF4	private dim2	    0x9C		same as charset
		of 1-column width
  0xF5..0xFE	private dim2	    0x9D		same as charset
		of 2-column width
  0xFF		--never used--
  ---------------------------------------------------------------------------

*/

/* Definition of special leading-codes.  */
/* Base leading-codes which are always followed by an extended
   leading-code (they introduce characters of private charsets).  */
#define LEADING_CODE_PRIVATE_11	0x9A /* for private DIMENSION1 of 1-column */
#define LEADING_CODE_PRIVATE_12	0x9B /* for private DIMENSION1 of 2-column */
#define LEADING_CODE_PRIVATE_21	0x9C /* for private DIMENSION2 of 1-column */
#define LEADING_CODE_PRIVATE_22	0x9D /* for private DIMENSION2 of 2-column */

/* Extended leading-codes.

   LEADING_CODE_EXT_xx is the smallest extended leading-code that may
   follow the base leading-code LEADING_CODE_PRIVATE_xx.  */
#define LEADING_CODE_EXT_11	0xA0	/* after LEADING_CODE_PRIVATE_11 */
#define LEADING_CODE_EXT_12	0xE0	/* after LEADING_CODE_PRIVATE_12 */
#define LEADING_CODE_EXT_21	0xF0	/* after LEADING_CODE_PRIVATE_21 */
#define LEADING_CODE_EXT_22	0xF5	/* after LEADING_CODE_PRIVATE_22 */

/* Largest value any extended leading-code may take.  */
#define LEADING_CODE_EXT_MAX	0xFE

/* Bounds of the charset identification numbers, per kind of charset.  */

/* Official charsets of DIMENSION1.  */
#define MIN_CHARSET_OFFICIAL_DIMENSION1	0x81
#define MAX_CHARSET_OFFICIAL_DIMENSION1	0x8F

/* Official charsets of DIMENSION2.  */
#define MIN_CHARSET_OFFICIAL_DIMENSION2	0x90
#define MAX_CHARSET_OFFICIAL_DIMENSION2	0x99

/* Private charsets start where their extended leading-codes start.  */
#define MIN_CHARSET_PRIVATE_DIMENSION1	LEADING_CODE_EXT_11
#define MIN_CHARSET_PRIVATE_DIMENSION2	LEADING_CODE_EXT_21

/* Maximum value of overall charset identification number.  */
#define MAX_CHARSET 0xFE

/* Special charsets.  */

/* The charset identification number of ASCII is fixed.  */
#define CHARSET_ASCII		0

/* Charset identification numbers of charsets referred to from C code.  */

/* ASCII */
extern int charset_ascii;
/* ISO8859-1 (Latin-1) */
extern int charset_latin_iso8859_1;
/* JISX0208.1978 (Japanese Kanji old set) */
extern int charset_jisx0208_1978;
/* JISX0208.1983 (Japanese Kanji) */
extern int charset_jisx0208;
/* JISX0201.Kana (Japanese Katakana) */
extern int charset_katakana_jisx0201;
/* JISX0201.Roman (Japanese Roman) */
extern int charset_latin_jisx0201;
/* Big5 Level 1 (Chinese Traditional) */
extern int charset_big5_1;
/* Big5 Level 2 (Chinese Traditional) */
extern int charset_big5_2;

/* Check if CH is the head of multi-byte form, i.e.,
   an ASCII character or a base leading-code.  CH is converted to
   unsigned char first, so a plain (possibly signed) char is accepted.  */
#define CHAR_HEAD_P(ch) ((unsigned char) (ch) < 0xA0)

/*** GENERAL NOTE on CHARACTER REPRESENTATION ***

  At first, the term "character" or "char" is used for a multilingual
  character (of course, including ASCII character), not for a byte in
  computer memory.  We use the term "code" or "byte" for the latter
  case.

  A character is identified by charset and one or two POSITION-CODEs.
  POSITION-CODE is the position of the character in the charset.  A
  character of DIMENSION1 charset has one POSITION-CODE: POSITION-CODE-1.
  A character of DIMENSION2 charset has two POSITION-CODEs:
  POSITION-CODE-1 and POSITION-CODE-2.  The code range of
  POSITION-CODE is 0x20..0x7F.

  Emacs has two kinds of representation of a character: multi-byte
  form (for buffer and string) and single-word form (for character
  object in Emacs Lisp).  The latter is called "character code"
  hereafter.  Both representations encode the information of charset
  and POSITION-CODE but in a different way (for instance, MSB of
  POSITION-CODE is set in multi-byte form).

  For details of multi-byte form, see the section "2. Emacs internal
  format handlers" of `coding.c'.

  Emacs uses 19 bits for a character code.  The bits are divided into
  3 fields: FIELD1(5bits):FIELD2(7bits):FIELD3(7bits).

  A character code of DIMENSION1 character uses FIELD2 to hold charset
  and FIELD3 to hold POSITION-CODE-1.  A character code of DIMENSION2
  character uses FIELD1 to hold charset, FIELD2 and FIELD3 to hold
  POSITION-CODE-1 and POSITION-CODE-2 respectively.

  More precisely...

  FIELD2 of DIMENSION1 character (except for ASCII) is "charset - 0x70".
  This is to make all character codes except for ASCII greater than
  256 (ASCII's FIELD2 is 0).  So, the range of FIELD2 of DIMENSION1
  character is 0 or 0x11..0x7F.

  FIELD1 of DIMENSION2 character is "charset - 0x8F" for official
  charset and "charset - 0xE0" for private charset.  So, the range of
  FIELD1 of DIMENSION2 character is 0x01..0x1E.

  -----------------------------------------------------------------------
  charset	FIELD1 (5-bit)	    FIELD2 (7-bit)	FIELD3 (7-bit)
  -----------------------------------------------------------------------
  ASCII		0		    0			POSITION-CODE-1
  DIMENSION1	0		    charset - 0x70	POSITION-CODE-1
  DIMENSION2(o)	charset - 0x8F	    POSITION-CODE-1	POSITION-CODE-2
  DIMENSION2(p)	charset - 0xE0	    POSITION-CODE-1	POSITION-CODE-2
  -----------------------------------------------------------------------
  "(o)": official, "(p)": private
  -----------------------------------------------------------------------

*/

/* Bit layout of a character code: FIELD3 is bits 0..6, FIELD2 is
   bits 7..13 and FIELD1 is bits 14..18.  Each mask selects one field
   in place.  */
#define CHAR_FIELD3_MASK 0x7F
#define CHAR_FIELD2_MASK (0x7F << 7)
#define CHAR_FIELD1_MASK (0x1F << 14)

/* Extract one field of character code C, shifted down so that the
   result is the plain field value.  */
#define CHAR_FIELD3(c) (CHAR_FIELD3_MASK & (c))
#define CHAR_FIELD2(c) ((CHAR_FIELD2_MASK & (c)) >> 7)
#define CHAR_FIELD1(c) ((CHAR_FIELD1_MASK & (c)) >> 14)

/* Minimum character code of character of each DIMENSION.  The
   offsets 0x70, 0x8F and 0xE0 are the ones used to store a charset
   in FIELD2 (DIMENSION1) or FIELD1 (DIMENSION2) of a character code.  */
#define MIN_CHAR_OFFICIAL_DIMENSION1 \
  ((MIN_CHARSET_OFFICIAL_DIMENSION1 - 0x70) << 7)
#define MIN_CHAR_PRIVATE_DIMENSION1 \
  ((MIN_CHARSET_PRIVATE_DIMENSION1 - 0x70) << 7)
#define MIN_CHAR_OFFICIAL_DIMENSION2 \
  ((MIN_CHARSET_OFFICIAL_DIMENSION2 - 0x8F) << 14)
#define MIN_CHAR_PRIVATE_DIMENSION2 \
  ((MIN_CHARSET_PRIVATE_DIMENSION2 - 0xE0) << 14)

/* Maximum character code currently used plus 1.  */
#define MAX_CHAR (0x1F << 14)

/* 1 if character code C fits in a single byte (0 <= C < 0x100), else 0.
   Note that this covers not only ASCII but every code below 256.  */
#define SINGLE_BYTE_CHAR_P(c) ((c) >= 0 && (c) < 0x100)

/* 1 if BYTE is a character in itself, in multibyte mode (i.e. its
   value is below 0x80).  BYTE is compared as given, without any
   conversion to unsigned char.  */
#define ASCII_BYTE_P(byte) ((byte) < 0x80)

/* A char-table containing information of each character set.

   Unlike ordinary char-tables, this doesn't contain any nested table.
   Only the top level elements are used.  Each element is a vector of
   the following information:
	CHARSET-ID, BYTES, DIMENSION, CHARS, WIDTH, DIRECTION,
	LEADING-CODE-BASE, LEADING-CODE-EXT,
	ISO-FINAL-CHAR, ISO-GRAPHIC-PLANE,
	REVERSE-CHARSET, SHORT-NAME, LONG-NAME,	DESCRIPTION,
	PLIST.

   CHARSET-ID (integer) is the identification number of the charset.

   BYTES (integer) is the length of multi-byte form of a character in
   the charset: one of 1, 2, 3, and 4.

   DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.

   CHARS (integer) is the number of characters in a dimension: 94 or 96.

   WIDTH (integer) is the number of columns a character in the charset
   occupies on the screen: one of 0, 1, and 2.

   DIRECTION (integer) is the rendering direction of characters in the
   charset.  If 0, render from left to right, else render from right
   to left.

   LEADING-CODE-BASE (integer) is the base leading-code for the
   charset.

   LEADING-CODE-EXT (integer) is the extended leading-code for the
   charset.  All charsets of less than 0xA0 have the value 0.

   ISO-FINAL-CHAR (character) is the final character of the
   corresponding ISO 2022 charset.

   ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
   while encoding to variants of ISO 2022 coding system, one of the
   following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).

   REVERSE-CHARSET (integer) is the charset which differs only in
   LEFT-TO-RIGHT value from the charset.  If there's no such
   charset, the value is -1.

   SHORT-NAME (string) is the short name to refer to the charset.

   LONG-NAME (string) is the long name to refer to the charset.

   DESCRIPTION (string) is the description string of the charset.

   PLIST (property list) may contain any type of information a user
   wants to put and get by functions `put-charset-property' and
   `get-charset-property' respectively.  */
extern Lisp_Object Vcharset_table;

/* Macros to access various information of CHARSET in Vcharset_table.
   We provide these macros for efficiency.  No range check of CHARSET.  */

/* Return the entry of CHARSET (a C integer charset identification
   number) in Vcharset_table.  The entry of ASCII is element 0; the
   entry of any other charset is element CHARSET + 128.  The result
   is an lvalue.  */
#define CHARSET_TABLE_ENTRY(charset)					\
  (XCHAR_TABLE (Vcharset_table)->contents[((charset) == CHARSET_ASCII	\
					   ? 0 : (charset) + 128)])

/* Return information INFO-IDX of CHARSET, i.e. element INFO_IDX of
   the vector that CHARSET_TABLE_ENTRY yields for CHARSET.  INFO_IDX
   should be one of the CHARSET_*_IDX values below.  */
#define CHARSET_TABLE_INFO(charset, info_idx) \
  XVECTOR (CHARSET_TABLE_ENTRY (charset))->contents[info_idx]

/* Indices into the vector of each entry of Vcharset_table.  The
   order follows the element list documented at Vcharset_table.  */
#define CHARSET_ID_IDX (0)			/* CHARSET-ID */
#define CHARSET_BYTES_IDX (1)			/* BYTES */
#define CHARSET_DIMENSION_IDX (2)		/* DIMENSION */
#define CHARSET_CHARS_IDX (3)			/* CHARS */
#define CHARSET_WIDTH_IDX (4)			/* WIDTH */
#define CHARSET_DIRECTION_IDX (5)		/* DIRECTION */
#define CHARSET_LEADING_CODE_BASE_IDX (6)	/* LEADING-CODE-BASE */
#define CHARSET_LEADING_CODE_EXT_IDX (7)	/* LEADING-CODE-EXT */
#define CHARSET_ISO_FINAL_CHAR_IDX (8)		/* ISO-FINAL-CHAR */
#define CHARSET_ISO_GRAPHIC_PLANE_IDX (9)	/* ISO-GRAPHIC-PLANE */
#define CHARSET_REVERSE_CHARSET_IDX (10)	/* REVERSE-CHARSET */
#define CHARSET_SHORT_NAME_IDX (11)		/* SHORT-NAME */
#define CHARSET_LONG_NAME_IDX (12)		/* LONG-NAME */
#define CHARSET_DESCRIPTION_IDX (13)		/* DESCRIPTION */
#define CHARSET_PLIST_IDX (14)			/* PLIST */
/* Size of a vector of each entry of Vcharset_table.  */
#define CHARSET_MAX_IDX (15)

/* Frequently used shorthands.  Each one reads a single element of
   CHARSET's entry with CHARSET_TABLE_INFO and converts it to a C
   integer.  */

/* Length of multi-byte form.  */
#define CHARSET_BYTES(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_BYTES_IDX))

/* Dimension.  */
#define CHARSET_DIMENSION(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_DIMENSION_IDX))

/* Number of characters in a dimension.  */
#define CHARSET_CHARS(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_CHARS_IDX))

/* Number of columns occupied on the screen.  */
#define CHARSET_WIDTH(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_WIDTH_IDX))

/* Rendering direction.  */
#define CHARSET_DIRECTION(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_DIRECTION_IDX))

/* Base leading-code.  */
#define CHARSET_LEADING_CODE_BASE(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_LEADING_CODE_BASE_IDX))

/* Extended leading-code.  */
#define CHARSET_LEADING_CODE_EXT(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_LEADING_CODE_EXT_IDX))

/* Final character of the corresponding ISO 2022 charset.  */
#define CHARSET_ISO_FINAL_CHAR(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_ISO_FINAL_CHAR_IDX))

/* Graphic plane to be invoked.  */
#define CHARSET_ISO_GRAPHIC_PLANE(charset) \
  XFASTINT (CHARSET_TABLE_INFO ((charset), CHARSET_ISO_GRAPHIC_PLANE_IDX))

/* Reverse charset; read with XINT, not XFASTINT.  */
#define CHARSET_REVERSE_CHARSET(charset) \
  XINT (CHARSET_TABLE_INFO ((charset), CHARSET_REVERSE_CHARSET_IDX))

/* Values used to specify the direction of a charset.  */
#define CHARSET_DIRECTION_LEFT_TO_RIGHT	0
#define CHARSET_DIRECTION_RIGHT_TO_LEFT	1

/* Vector of charset symbols, indexed by charset-id.  Its only use is
   to let C functions return the symbol of a charset.  */
extern Lisp_Object Vcharset_symbol_table;

/* The symbol of CHARSET: element CHARSET of Vcharset_symbol_table.  */
#define CHARSET_SYMBOL(charset) \
  (XVECTOR (Vcharset_symbol_table)->contents[(charset)])

/* 1 if CHARSET is in valid value range, else 0.  The valid values
   are 0 (ASCII), 0x81..MAX_CHARSET_OFFICIAL_DIMENSION2 (official
   charsets) and MIN_CHARSET_PRIVATE_DIMENSION1..MAX_CHARSET (private
   charsets).  */
#define CHARSET_VALID_P(charset)					 \
  ((charset) == 0							 \
   || ((charset) > 0x80 && (charset) <= MAX_CHARSET_OFFICIAL_DIMENSION2) \
   || ((charset) >= MIN_CHARSET_PRIVATE_DIMENSION1 && (charset) <= MAX_CHARSET))

/* 1 if CHARSET is already defined, else 0.  CHARSET counts as defined
   when it lies in 0..MAX_CHARSET and its entry in Vcharset_table
   (see CHARSET_TABLE_ENTRY) is not nil.  The range is checked first,
   so the table is never indexed with an out-of-range value.  */
#define CHARSET_DEFINED_P(charset)			\
  (((charset) >= 0) && ((charset) <= MAX_CHARSET)	\
   && !NILP (CHARSET_TABLE_ENTRY (charset)))

/* The CHARSET-BYTES and CHARSET-WIDTH information of Vcharset_table
   can be determined from just the first byte of a multi-byte form
   (an ASCII code or a base leading-code).  The two tables below are
   indexed by that first byte; the macros BYTES_BY_CHAR_HEAD and
   WIDTH_BY_CHAR_HEAD look them up for faster information retrieval.  */
extern int bytes_by_char_head[256];
extern int width_by_char_head[256];

#define BYTES_BY_CHAR_HEAD(char_head) (bytes_by_char_head[(char_head)])
#define WIDTH_BY_CHAR_HEAD(char_head) (width_by_char_head[(char_head)])

/* Charset of the character C.  A single-byte C belongs to
   CHARSET_ASCII.  Otherwise the charset is recovered from FIELD2
   (DIMENSION1, offset 0x70) or from FIELD1 (DIMENSION2, offset 0x8F
   for official and 0xE0 for private charsets).  */
#define CHAR_CHARSET(c)				\
  (SINGLE_BYTE_CHAR_P (c)			\
   ? CHARSET_ASCII				\
   : ((c) < MIN_CHAR_OFFICIAL_DIMENSION2	\
      ? CHAR_FIELD2 (c) + 0x70			\
      : ((c) < MIN_CHAR_PRIVATE_DIMENSION2	\
	 ? CHAR_FIELD1 (c) + 0x8F		\
	 : CHAR_FIELD1 (c) + 0xE0)))

/* Return charset at the place pointed by P.  *P is compared as is,
   so P should point to unsigned bytes.  The result is CHARSET_ASCII
   if *P is below 0x80, *P itself if it is a base leading-code of an
   official charset, the following byte (the extended leading-code)
   if *P is one of LEADING_CODE_PRIVATE_11..LEADING_CODE_PRIVATE_22,
   and -1 otherwise.  */
#define CHARSET_AT(p)				\
  (*(p) < 0x80					\
   ? CHARSET_ASCII				\
   : (*(p) < LEADING_CODE_PRIVATE_11		\
      ? (int) *(p)				\
      : (*(p) <= LEADING_CODE_PRIVATE_22	\
	 ? (int) *((p) + 1)			\
	 : -1)))

/* Same as `CHARSET_AT ()' but perhaps runs faster because of an
   additional argument C which is the code (byte) at P.  P itself is
   dereferenced only when C is one of the LEADING_CODE_PRIVATE_xx
   codes, to fetch the extended leading-code at P + 1.  */
#define FIRST_CHARSET_AT(p, c)		\
  ((c) < 0x80				\
   ? CHARSET_ASCII			\
   : ((c) < LEADING_CODE_PRIVATE_11	\
      ? (int) (c)			\
      : ((c) <= LEADING_CODE_PRIVATE_22	\
	 ? (int) *((p) + 1)		\
	 : -1)))

/* Check if two characters C1 and C2 belong to the same charset.

   Single-byte characters only match single-byte characters.  For a
   DIMENSION1 character C1 (whose FIELD1 is zero and whose FIELD2
   holds the charset), both FIELD1 and FIELD2 of C2 are compared:
   comparing FIELD2 alone would wrongly match a DIMENSION2 character
   whose POSITION-CODE-1 happens to equal FIELD2 of C1.  For a
   DIMENSION2 character C1, the charset is in FIELD1.

   C1 and C2 are evaluated more than once.  */
#define SAME_CHARSET_P(c1, c2)						\
  (SINGLE_BYTE_CHAR_P (c1)						\
   ? SINGLE_BYTE_CHAR_P (c2)						\
   : ((c1) < MIN_CHAR_OFFICIAL_DIMENSION2				\
      ? (((c1) & (CHAR_FIELD1_MASK | CHAR_FIELD2_MASK))			\
	 == ((c2) & (CHAR_FIELD1_MASK | CHAR_FIELD2_MASK)))		\
      : ((c1) & CHAR_FIELD1_MASK) == ((c2) & CHAR_FIELD1_MASK)))

/* Return a non-ASCII character of which charset is CHARSET and
   position-codes are C1 and C2.  DIMENSION1 character ignores C2.

   An undefined CHARSET is treated like a DIMENSION1 charset.  A
   position-code which is zero or negative contributes 0 to the
   result.  For DIMENSION2, the charset is stored in FIELD1 with
   offset 0x8F (official) or 0xE0 (private).  */
#define MAKE_NON_ASCII_CHAR(charset, c1, c2)				\
  (! CHARSET_DEFINED_P (charset) || CHARSET_DIMENSION (charset) == 1	\
   ? (((charset) - 0x70) << 7) | ((c1) <= 0 ? 0 : (c1))		\
   : ((((charset)							\
	- ((charset) < MIN_CHARSET_PRIVATE_DIMENSION2 ? 0x8F : 0xE0))	\
       << 14)								\
      | ((c1) <= 0 ? 0 : ((c1) << 7))					\
      | ((c2) <= 0 ? 0 : (c2))))

/* Return a character of which charset is CHARSET and position-codes
   are C1 and C2.  DIMENSION1 character ignores C2.  For
   CHARSET_ASCII the result is C1 itself; any other charset is
   handled by MAKE_NON_ASCII_CHAR.  */
#define MAKE_CHAR(charset, c1, c2)	\
  ((charset) == CHARSET_ASCII		\
   ? (c1)				\
   : MAKE_NON_ASCII_CHAR ((charset), (c1), (c2)))

/* Validity check of character code C.  With a nonzero GENERICP both
   normal and generic characters are accepted; with a zero GENERICP
   only normal characters are.  A negative C is never valid, a
   single-byte C always is, and anything else is decided by
   char_valid_p.  The value is 1 or 0.  */
#define CHAR_VALID_P(c, genericp)				\
  (!((c) < 0)							\
   && (SINGLE_BYTE_CHAR_P (c)					\
       ? 1							\
       : char_valid_p (c, genericp) != 0))
Kenichi Handa's avatar
Kenichi Handa committed
415

416
/* This default value is used when nonascii-translation-table or
   nonascii-insert-offset fails to convert a unibyte character to a
   valid multibyte character.  This makes a Latin-1 character.  */

#define DEFAULT_NONASCII_INSERT_OFFSET 0x800

/* Parse string STR of length LENGTH and check if a multibyte
   character is at STR.  If so, set BYTES for that character, else
   set BYTES to 1.

   More exactly: if STR[1] is a character head (or LENGTH is less
   than 2), BYTES is set to 1.  Otherwise BYTES is set to
   BYTES_BY_CHAR_HEAD (STR[0]), limited to LENGTH.

   The arguments are evaluated more than once.  The local variable
   has a long name on purpose so that it cannot capture an identifier
   (such as `i') appearing in the arguments.  */

#define PARSE_MULTIBYTE_SEQ(str, length, bytes)			\
  do {								\
    int parse_multibyte_seq_idx = 1;				\
    while (parse_multibyte_seq_idx < (length)			\
	   && ! CHAR_HEAD_P ((str)[parse_multibyte_seq_idx]))	\
      parse_multibyte_seq_idx++;				\
    if (parse_multibyte_seq_idx == 1)				\
      (bytes) = 1;						\
    else							\
      {								\
	(bytes) = BYTES_BY_CHAR_HEAD ((str)[0]);		\
	if ((bytes) > (length))					\
	  (bytes) = (length);					\
      }								\
  } while (0)

/* The charset of non-ASCII character C is stored in CHARSET, and the
   position-codes of C are stored in C1 and C2.
   We store -1 in C2 if FIELD1 of C is zero, i.e. C is a DIMENSION1
   character.

   CHARSET, C1 and C2 must be lvalues.  C is evaluated more than once.

   Do not use this macro for an ASCII character.  */

#define SPLIT_NON_ASCII_CHAR(c, charset, c1, c2)			\
  ((c) & CHAR_FIELD1_MASK						\
   ? ((charset) = (CHAR_FIELD1 (c)					\
		   + ((c) < MIN_CHAR_PRIVATE_DIMENSION2 ? 0x8F : 0xE0)), \
      (c1) = CHAR_FIELD2 (c),						\
      (c2) = CHAR_FIELD3 (c))						\
   : ((charset) = CHAR_FIELD2 (c) + 0x70,				\
      (c1) = CHAR_FIELD3 (c),						\
      (c2) = -1))

/* The charset of character C is stored in CHARSET, and the
   position-codes of C are stored in C1 and C2.
   We store -1 in C2 if the dimension of the charset is 1.

   A single-byte C yields CHARSET_ASCII with C itself in C1; any
   other C is handled by SPLIT_NON_ASCII_CHAR.  CHARSET, C1 and C2
   must be lvalues.  */

#define SPLIT_CHAR(c, charset, c1, c2)				\
  (SINGLE_BYTE_CHAR_P (c)					\
   ? ((charset) = CHARSET_ASCII, (c1) = (c), (c2) = -1)		\
   : SPLIT_NON_ASCII_CHAR (c, charset, c1, c2))

/* Return 1 iff character C has valid printable glyph.  A single-byte
   C is always considered printable; any other C is passed to
   char_printable_p.  */
#define CHAR_PRINTABLE_P(c)	\
  (SINGLE_BYTE_CHAR_P (c)	\
   || char_printable_p (c))

/* The charset of the character at STR is stored in CHARSET, and the
   position-codes are stored in C1 and C2.
   We store -1 in C2 if the character is just 2 bytes.

   The value of the macro is the charset.  If BYTES_BY_CHAR_HEAD of
   the first byte is less than 2 or greater than LEN, or if
   split_string returns a negative value, the first byte is stored
   in C1, CHARSET is set to CHARSET_ASCII, and the macro itself does
   not assign C2.  CHARSET, C1 and C2 must be lvalues.  */

#define SPLIT_STRING(str, len, charset, c1, c2)				\
  ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) < 2			\
    || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > (len)		\
    || split_string (str, len, &(charset), &(c1), &(c2)) < 0)		\
   ? ((c1) = *(str), (charset) = CHARSET_ASCII)				\
   : (charset))

/* Table mapping an ISO2022 charset, given by the triplet DIMENSION,
   CHARS and FINAL_CHAR, to an Emacs charset.  Use the macro
   ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR) to access it; the
   macro takes its three arguments in the form expected by XINT.  */
extern int iso_charset_table[2][2][128];

#define ISO_CHARSET_TABLE(dimension, chars, final_char)	\
  iso_charset_table[XINT (dimension) - 1]		\
		   [XINT (chars) > 94]			\
		   [XINT (final_char)]

/* Nonzero if byte C starts a multi-byte form that is longer than one
   byte according to BYTES_BY_CHAR_HEAD.  C is converted to unsigned
   char before the table lookup.  */
#define BASE_LEADING_CODE_P(c) (BYTES_BY_CHAR_HEAD ((unsigned char) (c)) > 1)

/* Return how many bytes C will occupy in a multibyte buffer.  The
   value is 1 if C is a single-byte character or if C has any bit set
   at or above bit position CHARACTERBITS; otherwise char_bytes
   decides.  */
#define CHAR_BYTES(c)							\
  ((SINGLE_BYTE_CHAR_P ((c)) || ((c) & ~((1 << CHARACTERBITS) - 1)))	\
   ? 1 : char_bytes (c))

/* The following two macros CHAR_STRING and STRING_CHAR are the main
   entry points to convert between Emacs two types of character
   representations: multi-byte form and single-word form (character
   code).  */

/* Store multi-byte form of the character C in STR.  The caller should
   allocate at least 4-byte area at STR in advance.  Returns the
   length of the multi-byte form.  If C is an invalid character code,
   signal an error.

   A single-byte C is stored directly and the length is 1; any other
   C is handed to char_to_string.  STR may be any pointer expression;
   it is parenthesized before being cast.  */

#define CHAR_STRING(c, str)				\
  (SINGLE_BYTE_CHAR_P (c)				\
   ? (*(str) = (unsigned char) (c), 1)			\
   : char_to_string (c, (unsigned char *) (str)))

/* Return a character code of the character of which multi-byte form
   is at STR and the length is LEN.  If STR doesn't contain valid
   multi-byte form, only the first byte in STR is returned.

   When BYTES_BY_CHAR_HEAD of the first byte is 1, that byte is the
   result and LEN is not looked at; otherwise string_to_char does
   the conversion.  */

#define STRING_CHAR(str, len)				\
  (BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1	\
   ? (unsigned char) *(str)				\
   : string_to_char (str, len, 0))

/* This is like STRING_CHAR but the third arg ACTUAL_LEN is set to the
   length of the multi-byte form.  Just to know the length, use
   MULTIBYTE_FORM_LENGTH.

   ACTUAL_LEN must be an lvalue: it is assigned 1 directly when the
   first byte is a character by itself, and its address is passed to
   string_to_char otherwise.  */

#define STRING_CHAR_AND_LENGTH(str, len, actual_len)	\
  (BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) == 1	\
   ? ((actual_len) = 1, (unsigned char) *(str))		\
   : string_to_char (str, len, &(actual_len)))

/* Fetch the "next" multibyte character from Lisp string STRING
   at byte position BYTEIDX, character position CHARIDX.
   Store it into OUTPUT.

   All the args must be side-effect-free.
   BYTEIDX and CHARIDX must be lvalues;
   we increment them past the character fetched: BYTEIDX by the
   length reported by STRING_CHAR_AND_LENGTH, CHARIDX by one.

   The expansion has the shape `if (1) { ... } else', so the macro
   call must be followed by a statement (normally just `;'); that
   statement becomes the `else' branch and is never executed.  The
   braces declare a local variable named `actual_len'.  */

#define FETCH_STRING_CHAR_ADVANCE(OUTPUT, STRING, CHARIDX, BYTEIDX)	      \
if (1)									      \
  {									      \
    unsigned char *fetch_string_char_ptr = &XSTRING (STRING)->data[BYTEIDX];  \
    int fetch_string_char_space_left = XSTRING (STRING)->size_byte - BYTEIDX; \
    int actual_len;							      \
    									      \
    OUTPUT								      \
      = STRING_CHAR_AND_LENGTH (fetch_string_char_ptr,			      \
			        fetch_string_char_space_left, actual_len);    \
									      \
    BYTEIDX += actual_len;						      \
    CHARIDX++;								      \
  }									      \
else

553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572
/* Like FETCH_STRING_CHAR_ADVANCE but fetch character from the
   current buffer.

   The character is read at BYTE_POS_ADDR (BYTEIDX).  The number of
   bytes offered to STRING_CHAR_AND_LENGTH is measured up to GPT_BYTE
   when CHARIDX is less than GPT, and up to Z_BYTE otherwise.
   BYTEIDX and CHARIDX must be lvalues; they are advanced past the
   character fetched.

   The expansion has the shape `if (1) { ... } else', so the macro
   call must be followed by a statement (normally just `;'); that
   statement becomes the `else' branch and is never executed.  The
   braces declare a local variable named `actual_len'.  */

#define FETCH_CHAR_ADVANCE(OUTPUT, CHARIDX, BYTEIDX)			  \
if (1)									  \
  {									  \
    unsigned char *fetch_buf_char_ptr = BYTE_POS_ADDR (BYTEIDX);	  \
    int fetch_buf_char_space_left = ((CHARIDX < GPT ? GPT_BYTE : Z_BYTE)  \
  				       - BYTEIDX);			  \
    int actual_len;							  \
    									  \
    OUTPUT								  \
  	= STRING_CHAR_AND_LENGTH (fetch_buf_char_ptr,			  \
  				  fetch_buf_char_space_left, actual_len); \
    									  \
    BYTEIDX += actual_len;						  \
    CHARIDX++;								  \
  }									  \
else

/* Return the length of the multi-byte form at string STR of length LEN.
   The result is 1 when BYTES_BY_CHAR_HEAD of the first byte is 1;
   otherwise multibyte_form_length computes it.  */

#define MULTIBYTE_FORM_LENGTH(str, len)			\
  (BYTES_BY_CHAR_HEAD (*(unsigned char *)(str)) == 1	\
   ? 1							\
   : multibyte_form_length (str, len))

/* Set C a (possibly multibyte) character at P.  P points into a
   string which is the virtual concatenation of STR1 (which ends at
   END1) and STR2 (which ends at END2).

   If P equals END1, the character is read from the start of STR2.
   The number of bytes available is measured up to END1 when P lies
   within STR1 (STR1 <= P < END1), and up to END2 otherwise.  The
   character is decoded with STRING_CHAR.  */

#define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2)			    \
  do {									    \
    const char *dtemp = (p) == (end1) ? (str2) : (p);			    \
    const char *dlimit = ((p) >= (str1) && (p) < (end1)) ? (end1) : (end2); \
    c = STRING_CHAR (dtemp, dlimit - dtemp);				    \
  } while (0)

/* Set C a (possibly multibyte) character before P.  P points into a
   string which is the virtual concatenation of STR1 (which ends at
   END1) and STR2 (which ends at END2).

   The scan goes backward from P over bytes that are 0xA0 or greater
   (i.e. bytes that are not character heads) and stops at the first
   byte below 0xA0.  The lower limit of the scan is STR2 when P lies
   within STR2 (STR2 < P <= END2), and STR1 otherwise.  The character
   is then decoded with STRING_CHAR from the byte where the scan
   stopped, using the distance up to P as the length.

   NOTE(review): `dtemp-- > dlimit' decrements DTEMP even when the
   comparison fails, so if every byte down to the limit is 0xA0 or
   greater (or P equals the limit) DTEMP ends one byte before the
   limit and STRING_CHAR reads from there -- confirm that callers
   never hit this case.  */

#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2)			    \
  do {									    \
    const char *dtemp = (p);						    \
    const char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
    while (dtemp-- > dlimit && (unsigned char) *dtemp >= 0xA0);		    \
    c = STRING_CHAR (dtemp, p - dtemp);					    \
  } while (0)

#ifdef emacs

/* Increase the buffer byte position POS_BYTE of the current buffer to
   the next character boundary.  This macro relies on the fact that
   *GPT_ADDR and *Z_ADDR are always accessible and the values are
   '\0'.  No range checking of POS_BYTE.

   If the byte at POS_BYTE is a base leading-code, POS_BYTE advances
   by the length found by PARSE_MULTIBYTE_SEQ (given the bytes up to
   Z_BYTE); otherwise it advances by one.  POS_BYTE must be an lvalue
   and is evaluated more than once.  */
#define INC_POS(pos_byte)				\
  do {							\
    unsigned char *p = BYTE_POS_ADDR (pos_byte);	\
    if (BASE_LEADING_CODE_P (*p))			\
      {							\
	int len, bytes;					\
	len = Z_BYTE - (pos_byte);			\
	PARSE_MULTIBYTE_SEQ (p, len, bytes);		\
	(pos_byte) += bytes;				\
      }							\
    else						\
      (pos_byte)++;					\
  } while (0)

/* Decrease the buffer byte position POS_BYTE of the current buffer to
   the previous character boundary.  No range checking of POS_BYTE.

   POS_BYTE is first decreased by one.  If the byte found there is
   not a character head, we scan backward (not beyond the start of
   the text before or after the gap, whichever POS_BYTE is in) for a
   character head and parse from it with PARSE_MULTIBYTE_SEQ.  Only
   when the parsed form ends exactly at the byte we started from is
   POS_BYTE moved back to the head of that form.  POS_BYTE must be an
   lvalue and is evaluated more than once.  */
#define DEC_POS(pos_byte)						\
  do {									\
    unsigned char *p, *p_min;						\
									\
    (pos_byte)--;							\
    if ((pos_byte) < GPT_BYTE)						\
      p = BEG_ADDR + (pos_byte) - 1, p_min = BEG_ADDR;			\
    else								\
      p = BEG_ADDR + GAP_SIZE + (pos_byte) - 1, p_min = GAP_END_ADDR;	\
    if (p > p_min && !CHAR_HEAD_P (*p))					\
      {									\
	unsigned char *pend = p--;					\
	int len, bytes;							\
	while (p > p_min && !CHAR_HEAD_P (*p)) p--;			\
	len = pend + 1 - p;						\
	PARSE_MULTIBYTE_SEQ (p, len, bytes);				\
	if (bytes == len)						\
	  (pos_byte) -= len - 1;					\
      }									\
  } while (0)

/* Increment both CHARPOS and BYTEPOS, each in the appropriate way.
   CHARPOS always advances by one.  BYTEPOS advances by one when
   enable_multibyte_characters of the current buffer is nil, and to
   the next character boundary (INC_POS) otherwise.  */

#define INC_BOTH(charpos, bytepos)				\
do								\
  {								\
    (charpos)++;						\
    if (NILP (current_buffer->enable_multibyte_characters))	\
      (bytepos)++;						\
    else							\
      INC_POS ((bytepos));					\
  }								\
while (0)

/* Decrement both CHARPOS and BYTEPOS, each in the appropriate way.
   CHARPOS always goes back by one.  BYTEPOS goes back by one when
   enable_multibyte_characters of the current buffer is nil, and to
   the previous character boundary (DEC_POS) otherwise.  */

#define DEC_BOTH(charpos, bytepos)				\
do								\
  {								\
    (charpos)--;						\
    if (NILP (current_buffer->enable_multibyte_characters))	\
      (bytepos)--;						\
    else							\
      DEC_POS ((bytepos));					\
  }								\
while (0)

/* Increase the buffer byte position POS_BYTE of the buffer BUF to
   the next character boundary.  This macro relies on the fact that
   *GPT_ADDR and *Z_ADDR are always accessible and the values are
   '\0'.  No range checking of POS_BYTE.

   This is the counterpart of INC_POS that takes the buffer as an
   argument.  POS_BYTE must be an lvalue and is evaluated more than
   once.  */
#define BUF_INC_POS(buf, pos_byte)				\
  do {								\
    unsigned char *p = BUF_BYTE_ADDRESS (buf, pos_byte);	\
    if (BASE_LEADING_CODE_P (*p))				\
      {								\
	int len, bytes;						\
	len = BUF_Z_BYTE (buf) - (pos_byte);			\
	PARSE_MULTIBYTE_SEQ (p, len, bytes);			\
	(pos_byte) += bytes;					\
      }								\
    else							\
      (pos_byte)++;						\
  } while (0)

/* Decrease the buffer byte position POS_BYTE of the buffer BUF to
   the previous character boundary.  No range checking of POS_BYTE.

   This is the counterpart of DEC_POS that takes the buffer as an
   argument: POS_BYTE is decreased by one, and if the byte there is
   not a character head we scan backward for the head, parse from it
   with PARSE_MULTIBYTE_SEQ, and move POS_BYTE to the head only when
   the parsed form ends exactly at the byte we started from.
   POS_BYTE must be an lvalue and is evaluated more than once.  */
#define BUF_DEC_POS(buf, pos_byte)					\
  do {									\
    unsigned char *p, *p_min;						\
    (pos_byte)--;							\
    if ((pos_byte) < BUF_GPT_BYTE (buf))				\
      {									\
	p = BUF_BEG_ADDR (buf) + (pos_byte) - 1;			\
	p_min = BUF_BEG_ADDR (buf);					\
      }									\
    else								\
      {									\
	p = BUF_BEG_ADDR (buf) + BUF_GAP_SIZE (buf) + (pos_byte) - 1;	\
	p_min = BUF_GAP_END_ADDR (buf);					\
      }									\
    if (p > p_min && !CHAR_HEAD_P (*p))					\
      {									\
	unsigned char *pend = p--;					\
	int len, bytes;							\
	while (p > p_min && !CHAR_HEAD_P (*p)) p--;			\
	len = pend + 1 - p;						\
	PARSE_MULTIBYTE_SEQ (p, len, bytes);				\
	if (bytes == len)						\
	  (pos_byte) -= len - 1;					\
      }									\
  } while (0)

#endif /* emacs */

/* This is the maximum byte length of multi-byte sequence.  */
#define MAX_MULTIBYTE_LENGTH 4

/* Functions declared for users of this header.  Several of them
   (split_string, char_to_string, string_to_char, char_printable_p,
   multibyte_form_length, char_bytes and char_valid_p) are called by
   the macros defined above.  */
extern void invalid_character P_ ((int));

extern int translate_char P_ ((Lisp_Object, int, int, int, int));
extern int split_string P_ ((const unsigned char *, int, int *,
			     unsigned char *, unsigned char *));
extern int char_to_string P_ ((int, unsigned char *));
extern int string_to_char P_ ((const unsigned char *, int, int *));
extern int char_printable_p P_ ((int c));
extern int multibyte_form_length P_ ((const unsigned char *, int));
extern int get_charset_id P_ ((Lisp_Object));
extern int find_charset_in_str P_ ((unsigned char *, int, int *,
				    Lisp_Object, int));
extern int strwidth P_ ((unsigned char *, int));
extern int char_bytes P_ ((int));
extern int char_valid_p P_ ((int, int));

/* Vector from which GET_TRANSLATION_TABLE fetches translation tables
   by id number.  */
extern Lisp_Object Vtranslation_table_vector;

/* Return a translation table of id number ID: the cdr of element ID
   of Vtranslation_table_vector.  */
#define GET_TRANSLATION_TABLE(id) \
  (XCDR(XVECTOR(Vtranslation_table_vector)->contents[(id)]))

/* A char-table for characters which may invoke auto-filling.  */
extern Lisp_Object Vauto_fill_chars;

/* Copy LEN bytes from FROM to TO.  This macro should be used only
   when a caller knows that LEN is short and the obvious copy loop is
   faster than calling bcopy which has some overhead.

   FROM is only read and TO is only written.  Nothing is copied when
   LEN is zero or negative.  Each argument is evaluated exactly once,
   and the local variables have long names on purpose so that they
   cannot capture identifiers (such as `i') used in the arguments.  */

#define BCOPY_SHORT(from, to, len)					\
  do {									\
    int bcopy_short_count = (len);					\
    const unsigned char *bcopy_short_src				\
      = (const unsigned char *) (from);					\
    unsigned char *bcopy_short_dst = (unsigned char *) (to);		\
    while (bcopy_short_count-- > 0)					\
      *bcopy_short_dst++ = *bcopy_short_src++;				\
  } while (0)

/* Length of C in bytes: CHARSET_BYTES of the charset that
   CHAR_CHARSET finds for character C.  */

#define CHAR_LEN(C) CHARSET_BYTES (CHAR_CHARSET ((C)))

#endif /* _CHARSET_H */