/* Basic character support.

Copyright (C) 2001-2020 Free Software Foundation, Inc.
Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
  Licensed to the Free Software Foundation.
Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
  National Institute of Advanced Industrial Science and Technology (AIST)
  Registration Number H13PRO009

This file is part of GNU Emacs.

GNU Emacs is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.

GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */

/* At first, see the document in `character.h' to understand the code
   in this file.  */

#include <config.h>

#include <stdio.h>

#include <sys/types.h>
33
#include <intprops.h>
Kenichi Handa's avatar
Kenichi Handa committed
34 35 36
#include "lisp.h"
#include "character.h"
#include "buffer.h"
37
#include "dispextern.h"
Kenichi Handa's avatar
Kenichi Handa committed
38 39 40
#include "composite.h"
#include "disptab.h"

Kenichi Handa's avatar
Kenichi Handa committed
41
/* Char-table of information about which character to unify to which
42
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Kenichi Handa's avatar
Kenichi Handa committed
43 44 45 46
Lisp_Object Vchar_unify_table;



47 48 49
/* If character code C has modifier masks, reflect them to the
   character code if possible.  Return the resulting code.  */

50 51
EMACS_INT
char_resolve_modifier_mask (EMACS_INT c)
52
{
53
  /* A non-ASCII character can't reflect modifier bits to the code.  */
54 55 56 57 58 59 60 61 62 63 64
  if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
    return c;

  /* For Meta, Shift, and Control modifiers, we need special care.  */
  if (c & CHAR_SHIFT)
    {
      /* Shift modifier is valid only with [A-Za-z].  */
      if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
	c &= ~CHAR_SHIFT;
      else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
	c = (c & ~CHAR_SHIFT) - ('a' - 'A');
65 66
      /* Shift modifier for control characters and SPC is ignored.  */
      else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
67 68
	c &= ~CHAR_SHIFT;
    }
69 70 71 72
  if (c & CHAR_CTL)
    {
      /* Simulate the code in lread.c.  */
      /* Allow `\C- ' and `\C-?'.  */
73 74 75 76
      if ((c & 0377) == ' ')
	c &= ~0177 & ~ CHAR_CTL;
      else if ((c & 0377) == '?')
	c = 0177 | (c & ~0177 & ~CHAR_CTL);
77 78 79 80 81 82 83
      /* ASCII control chars are made from letters (both cases),
	 as well as the non-letters within 0100...0137.  */
      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
	c &= (037 | (~0177 & ~CHAR_CTL));
      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
	c &= (037 | (~0177 & ~CHAR_CTL));
    }
84
#if 0	/* This is outside the scope of this function.  (bug#4751)  */
85 86 87 88 89
  if (c & CHAR_META)
    {
      /* Move the meta bit to the right place for a string.  */
      c = (c & ~CHAR_META) | 0x80;
    }
90
#endif
91 92 93 94 95

  return c;
}


Kenichi Handa's avatar
Kenichi Handa committed
96 97 98
/* Store multibyte form of character C at P.  If C has modifier bits,
   handle them appropriately.  */

Kenichi Handa's avatar
Kenichi Handa committed
99
int
100
char_string (unsigned int c, unsigned char *p)
Kenichi Handa's avatar
Kenichi Handa committed
101 102 103
{
  int bytes;

Kenichi Handa's avatar
Kenichi Handa committed
104 105
  if (c & CHAR_MODIFIER_MASK)
    {
106
      c = char_resolve_modifier_mask (c);
Kenichi Handa's avatar
Kenichi Handa committed
107 108 109 110 111
      /* If C still has any modifier bits, just ignore it.  */
      c &= ~CHAR_MODIFIER_MASK;
    }

  if (c <= MAX_3_BYTE_CHAR)
Kenichi Handa's avatar
Kenichi Handa committed
112 113 114 115 116 117 118 119 120 121 122
    {
      bytes = CHAR_STRING (c, p);
    }
  else if (c <= MAX_4_BYTE_CHAR)
    {
      p[0] = (0xF0 | (c >> 18));
      p[1] = (0x80 | ((c >> 12) & 0x3F));
      p[2] = (0x80 | ((c >> 6) & 0x3F));
      p[3] = (0x80 | (c & 0x3F));
      bytes = 4;
    }
Kenichi Handa's avatar
Kenichi Handa committed
123
  else if (c <= MAX_5_BYTE_CHAR)
Kenichi Handa's avatar
Kenichi Handa committed
124 125 126 127 128 129 130 131
    {
      p[0] = 0xF8;
      p[1] = (0x80 | ((c >> 18) & 0x0F));
      p[2] = (0x80 | ((c >> 12) & 0x3F));
      p[3] = (0x80 | ((c >> 6) & 0x3F));
      p[4] = (0x80 | (c & 0x3F));
      bytes = 5;
    }
132
  else if (c <= MAX_CHAR)
Kenichi Handa's avatar
Kenichi Handa committed
133 134 135 136
    {
      c = CHAR_TO_BYTE8 (c);
      bytes = BYTE8_STRING (c, p);
    }
137
  else
138
    error ("Invalid character: %x", c);
139

Kenichi Handa's avatar
Kenichi Handa committed
140 141 142 143
  return bytes;
}


144
/* Return the character whose multibyte form starts at P.  If LEN is
   non-NULL, store the byte length of that form in *LEN.  If ADVANCED
   is non-NULL, store in *ADVANCED the address just past the form,
   i.e. where the next character starts.  */

int
string_char (const unsigned char *p, const unsigned char **advanced, int *len)
{
  const unsigned char *const start = p;
  const unsigned char head = *p;
  int c;

  if (head < 0x80 || (head & 0x30) != 0x30)
    {
      /* 1-, 2-, and 3-byte sequences are decoded by the macro, which
         also advances P.  */
      c = STRING_CHAR_ADVANCE (p);
    }
  else if ((head & 0x08) == 0)
    {
      /* A 4-byte sequence of this form:
         11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  */
      c = (((p[0] & 0x7) << 18)
           | ((p[1] & 0x3F) << 12)
           | ((p[2] & 0x3F) << 6)
           | (p[3] & 0x3F));
      p += 4;
    }
  else
    {
      /* A 5-byte sequence of this form:

         111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

         The top 4 `x's are always 0, so the lead byte contributes
         nothing and shifting p[1] can never exceed the maximum valid
         character codepoint.  */
      c = (((p[1] & 0x3F) << 18)
           | ((p[2] & 0x3F) << 12)
           | ((p[3] & 0x3F) << 6)
           | (p[4] & 0x3F));
      p += 5;
    }

  if (len)
    *len = p - start;
  if (advanced)
    *advanced = p;
  return c;
}


196 197 198 199
/* Translate character C by translation table TABLE.  If no translation is
   found in TABLE, return the untranslated character.  If TABLE is a list,
   elements are char tables.  In that case, recursively translate C by all the
   tables in the list.  */
Kenichi Handa's avatar
Kenichi Handa committed
200 201

int
202
translate_char (Lisp_Object table, int c)
Kenichi Handa's avatar
Kenichi Handa committed
203
{
204 205 206 207 208 209
  if (CHAR_TABLE_P (table))
    {
      Lisp_Object ch;

      ch = CHAR_TABLE_REF (table, c);
      if (CHARACTERP (ch))
Tom Tromey's avatar
Tom Tromey committed
210
	c = XFIXNUM (ch);
211 212 213 214 215 216 217
    }
  else
    {
      for (; CONSP (table); table = XCDR (table))
	c = translate_char (XCAR (table), c);
    }
  return c;
Kenichi Handa's avatar
Kenichi Handa committed
218 219 220
}

DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
221
       doc: /* Return non-nil if OBJECT is a character.
Chong Yidong's avatar
Chong Yidong committed
222 223 224
In Emacs Lisp, characters are represented by character codes, which
are non-negative integers.  The function `max-char' returns the
maximum character code.
225 226
usage: (characterp OBJECT)  */
       attributes: const)
227
  (Lisp_Object object, Lisp_Object ignore)
Kenichi Handa's avatar
Kenichi Handa committed
228 229 230 231 232
{
  return (CHARACTERP (object) ? Qt : Qnil);
}

DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
233 234
       doc: /* Return the character of the maximum code.  */
       attributes: const)
235
  (void)
Kenichi Handa's avatar
Kenichi Handa committed
236
{
237
  return make_fixnum (MAX_CHAR);
Kenichi Handa's avatar
Kenichi Handa committed
238 239 240 241
}

DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
       Sunibyte_char_to_multibyte, 1, 1, 0,
242
       doc: /* Convert the byte CH to multibyte character.  */)
243
  (Lisp_Object ch)
Kenichi Handa's avatar
Kenichi Handa committed
244 245 246 247
{
  int c;

  CHECK_CHARACTER (ch);
Tom Tromey's avatar
Tom Tromey committed
248
  c = XFIXNAT (ch);
249 250
  if (c >= 0x100)
    error ("Not a unibyte character: %d", c);
251
  MAKE_CHAR_MULTIBYTE (c);
252
  return make_fixnum (c);
Kenichi Handa's avatar
Kenichi Handa committed
253 254 255 256
}

DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
       Smultibyte_char_to_unibyte, 1, 1, 0,
257 258
       doc: /* Convert the multibyte character CH to a byte.
If the multibyte character does not represent a byte, return -1.  */)
259
  (Lisp_Object ch)
Kenichi Handa's avatar
Kenichi Handa committed
260
{
261
  int cm;
Kenichi Handa's avatar
Kenichi Handa committed
262 263

  CHECK_CHARACTER (ch);
Tom Tromey's avatar
Tom Tromey committed
264
  cm = XFIXNAT (ch);
265 266 267 268 269 270
  if (cm < 256)
    /* Can't distinguish a byte read from a unibyte buffer from
       a latin1 char, so let's let it slide.  */
    return ch;
  else
    {
271
      int cu = CHAR_TO_BYTE_SAFE (cm);
272
      return make_fixnum (cu);
273
    }
Kenichi Handa's avatar
Kenichi Handa committed
274 275
}

276 277 278

/* Return width (columns) of C considering the buffer display table DP. */

Paul Eggert's avatar
Paul Eggert committed
279
static ptrdiff_t
280 281
char_width (int c, struct Lisp_Char_Table *dp)
{
282
  ptrdiff_t width = CHARACTER_WIDTH (c);
283 284 285 286 287 288 289 290 291

  if (dp)
    {
      Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
      int i;

      if (VECTORP (disp))
	for (i = 0, width = 0; i < ASIZE (disp); i++)
	  {
292
	    int c = -1;
293
	    ch = AREF (disp, i);
294 295 296
	    if (GLYPH_CODE_P (ch))
	      c = GLYPH_CODE_CHAR (ch);
	    else if (CHARACTERP (ch))
297
	      c = XFIXNUM (ch);
298
	    if (c >= 0)
299
	      {
300
		int w = CHARACTER_WIDTH (c);
301
		if (INT_ADD_WRAPV (width, w, &width))
302 303
		  string_overflow ();
	      }
304 305 306 307 308 309
	  }
    }
  return width;
}


Paul Eggert's avatar
Paul Eggert committed
310
DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
Kenichi Handa's avatar
Kenichi Handa committed
311 312
       doc: /* Return width of CHAR when displayed in the current buffer.
The width is measured by how many columns it occupies on the screen.
313 314
Tab is taken to occupy `tab-width' columns.
usage: (char-width CHAR)  */)
315
  (Lisp_Object ch)
Kenichi Handa's avatar
Kenichi Handa committed
316
{
317
  int c;
Paul Eggert's avatar
Paul Eggert committed
318
  ptrdiff_t width;
Kenichi Handa's avatar
Kenichi Handa committed
319 320

  CHECK_CHARACTER (ch);
Tom Tromey's avatar
Tom Tromey committed
321
  c = XFIXNUM (ch);
322
  width = char_width (c, buffer_display_table ());
323
  return make_fixnum (width);
Kenichi Handa's avatar
Kenichi Handa committed
324 325 326 327 328 329 330 331 332
}

/* Return the display width, in columns, of the LEN bytes at STR in
   the current buffer.  If PRECISION > 0, stop before the first
   character that would push the width past PRECISION, return the
   width accumulated so far, and store the number of characters and
   bytes consumed in *NCHARS and *NBYTES.  When PRECISION <= 0,
   NCHARS and NBYTES are not written.  */

ptrdiff_t
c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
                ptrdiff_t *nchars, ptrdiff_t *nbytes)
{
  struct Lisp_Char_Table *dp = buffer_display_table ();
  ptrdiff_t char_pos = 0, byte_pos = 0;
  ptrdiff_t total = 0;
  bool limited = 0 < precision;

  for (; byte_pos < len; char_pos++)
    {
      int char_bytes;
      int c = STRING_CHAR_AND_LENGTH (str + byte_pos, char_bytes);
      ptrdiff_t w = char_width (c, dp);

      /* This character does not fit in the remaining columns.  */
      if (limited && precision - total < w)
        break;

      if (INT_ADD_WRAPV (w, total, &total))
        string_overflow ();
      byte_pos += char_bytes;
    }

  if (limited)
    {
      *nchars = char_pos;
      *nbytes = byte_pos;
    }
  return total;
}

368 369 370 371
/* Return the display width, in columns, of the LEN bytes at STR in
   the current buffer.  No width limit is applied.  */

ptrdiff_t
strwidth (const char *str, ptrdiff_t len)
{
  const unsigned char *bytes = (const unsigned char *) str;

  return c_string_width (bytes, len, -1, NULL, NULL);
}

Kenichi Handa's avatar
Kenichi Handa committed
378 379 380 381 382 383 384
/* Return width of Lisp string STRING when displayed in the current
   buffer.  The width is measured by how many columns it occupies on
   the screen while paying attention to compositions.  If PRECISION >
   0, return the width of longest substring that doesn't exceed
   PRECISION, and set number of characters and bytes of the substring
   in *NCHARS and *NBYTES respectively.  */

385 386 387
ptrdiff_t
lisp_string_width (Lisp_Object string, ptrdiff_t precision,
		   ptrdiff_t *nchars, ptrdiff_t *nbytes)
Kenichi Handa's avatar
Kenichi Handa committed
388
{
389
  ptrdiff_t len = SCHARS (string);
390 391 392
  /* This set multibyte to 0 even if STRING is multibyte when it
     contains only ascii and eight-bit-graphic, but that's
     intentional.  */
393
  bool multibyte = len < SBYTES (string);
Kenichi Handa's avatar
Kenichi Handa committed
394
  unsigned char *str = SDATA (string);
395 396
  ptrdiff_t i = 0, i_byte = 0;
  ptrdiff_t width = 0;
Kenichi Handa's avatar
Kenichi Handa committed
397 398 399 400
  struct Lisp_Char_Table *dp = buffer_display_table ();

  while (i < len)
    {
401
      ptrdiff_t chars, bytes, thiswidth;
Kenichi Handa's avatar
Kenichi Handa committed
402
      Lisp_Object val;
403
      ptrdiff_t cmp_id;
404
      ptrdiff_t ignore, end;
Kenichi Handa's avatar
Kenichi Handa committed
405 406 407 408 409 410 411 412 413 414 415

      if (find_composition (i, -1, &ignore, &end, &val, string)
	  && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
	      >= 0))
	{
	  thiswidth = composition_table[cmp_id]->width;
	  chars = end - i;
	  bytes = string_char_to_byte (string, end) - i_byte;
	}
      else
	{
416
	  int c;
Kenichi Handa's avatar
Kenichi Handa committed
417

418
	  if (multibyte)
419 420 421 422 423
	    {
	      int cbytes;
	      c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
	      bytes = cbytes;
	    }
424 425
	  else
	    c = str[i_byte], bytes = 1;
Kenichi Handa's avatar
Kenichi Handa committed
426
	  chars = 1;
427
	  thiswidth = char_width (c, dp);
Kenichi Handa's avatar
Kenichi Handa committed
428 429
	}

430
      if (0 < precision && precision - width < thiswidth)
Kenichi Handa's avatar
Kenichi Handa committed
431 432 433 434 435
	{
	  *nchars = i;
	  *nbytes = i_byte;
	  return width;
	}
436 437
      if (INT_ADD_WRAPV (thiswidth, width, &width))
	string_overflow ();
Kenichi Handa's avatar
Kenichi Handa committed
438 439
      i += chars;
      i_byte += bytes;
440
    }
Kenichi Handa's avatar
Kenichi Handa committed
441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456

  if (precision > 0)
    {
      *nchars = i;
      *nbytes = i_byte;
    }

  return width;
}

DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
       doc: /* Return width of STRING when displayed in the current buffer.
Width is measured by how many columns it occupies on the screen.
When calculating width of a multibyte character in STRING,
only the base leading-code is considered; the validity of
the following bytes is not checked.  Tabs in STRING are always
taken to occupy `tab-width' columns.
usage: (string-width STRING)  */)
  (Lisp_Object str)
{
  CHECK_STRING (str);

  ptrdiff_t width = lisp_string_width (str, -1, NULL, NULL);
  Lisp_Object val;
  XSETFASTINT (val, width);
  return val;
}

/* Return the number of characters in the NBYTES bytes at PTR.
   This works by looking at the contents and checking for multibyte
   sequences while assuming that there's no invalid sequence.
   However, if the current buffer has enable-multibyte-characters =
   nil, we treat each byte as a character.  */

474 475
ptrdiff_t
chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
Kenichi Handa's avatar
Kenichi Handa committed
476 477 478
{
  /* current_buffer is null at early stages of Emacs initialization.  */
  if (current_buffer == 0
Tom Tromey's avatar
Tom Tromey committed
479
      || NILP (BVAR (current_buffer, enable_multibyte_characters)))
Kenichi Handa's avatar
Kenichi Handa committed
480 481 482 483 484 485 486 487 488 489
    return nbytes;

  return multibyte_chars_in_text (ptr, nbytes);
}

/* Return the number of characters in the NBYTES bytes at PTR,
   counting multibyte sequences on the assumption that none of them
   is invalid.  enable-multibyte-characters is not consulted.  Abort
   if MULTIBYTE_LENGTH reports a zero length.  */

ptrdiff_t
multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
{
  const unsigned char *endp = ptr + nbytes;
  ptrdiff_t chars;

  for (chars = 0; ptr < endp; chars++)
    {
      int len = MULTIBYTE_LENGTH (ptr, endp);

      if (len == 0)
        emacs_abort ();
      ptr += len;
    }

  return chars;
}

/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
   characters and bytes in it, and store them in *NCHARS and *NBYTES
   respectively.  On counting bytes, pay attention to that 8-bit
   characters not constructing a valid multibyte sequence are
   represented by 2-byte in a multibyte text.  */

void
516 517
parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
			ptrdiff_t *nchars, ptrdiff_t *nbytes)
Kenichi Handa's avatar
Kenichi Handa committed
518
{
Kenichi Handa's avatar
Kenichi Handa committed
519
  const unsigned char *endp = str + len;
520 521
  int n;
  ptrdiff_t chars = 0, bytes = 0;
Kenichi Handa's avatar
Kenichi Handa committed
522 523 524

  if (len >= MAX_MULTIBYTE_LENGTH)
    {
Kenichi Handa's avatar
Kenichi Handa committed
525
      const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
Kenichi Handa's avatar
Kenichi Handa committed
526 527
      while (str < adjusted_endp)
	{
528 529
	  if (! CHAR_BYTE8_HEAD_P (*str)
	      && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
Kenichi Handa's avatar
Kenichi Handa committed
530 531 532 533 534 535 536 537
	    str += n, bytes += n;
	  else
	    str++, bytes += 2;
	  chars++;
	}
    }
  while (str < endp)
    {
538 539
      if (! CHAR_BYTE8_HEAD_P (*str)
	  && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
Kenichi Handa's avatar
Kenichi Handa committed
540 541 542 543 544 545 546 547 548 549 550 551
	str += n, bytes += n;
      else
	str++, bytes += 2;
      chars++;
    }

  *nchars = chars;
  *nbytes = bytes;
  return;
}

/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
Paul Eggert's avatar
Paul Eggert committed
552
   It actually converts only such 8-bit characters that don't construct
Kenichi Handa's avatar
Kenichi Handa committed
553 554 555 556 557 558
   a multibyte sequence to multibyte forms of Latin-1 characters.  If
   NCHARS is nonzero, set *NCHARS to the number of characters in the
   text.  It is assured that we can use LEN bytes at STR as a work
   area and that is enough.  Return the number of bytes of the
   resulting text.  */

559 560 561
ptrdiff_t
str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
		  ptrdiff_t *nchars)
Kenichi Handa's avatar
Kenichi Handa committed
562 563 564
{
  unsigned char *p = str, *endp = str + nbytes;
  unsigned char *to;
565
  ptrdiff_t chars = 0;
Kenichi Handa's avatar
Kenichi Handa committed
566 567 568 569 570 571
  int n;

  if (nbytes >= MAX_MULTIBYTE_LENGTH)
    {
      unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
      while (p < adjusted_endp
572
	     && ! CHAR_BYTE8_HEAD_P (*p)
Kenichi Handa's avatar
Kenichi Handa committed
573 574 575
	     && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
	p += n, chars++;
    }
576 577 578
  while (p < endp
	 && ! CHAR_BYTE8_HEAD_P (*p)
	 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
Kenichi Handa's avatar
Kenichi Handa committed
579 580 581 582 583 584 585 586 587
    p += n, chars++;
  if (nchars)
    *nchars = chars;
  if (p == endp)
    return nbytes;

  to = p;
  nbytes = endp - p;
  endp = str + len;
588
  memmove (endp - nbytes, p, nbytes);
Kenichi Handa's avatar
Kenichi Handa committed
589 590 591 592 593 594 595
  p = endp - nbytes;

  if (nbytes >= MAX_MULTIBYTE_LENGTH)
    {
      unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
      while (p < adjusted_endp)
	{
596 597
	  if (! CHAR_BYTE8_HEAD_P (*p)
	      && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
Kenichi Handa's avatar
Kenichi Handa committed
598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
	    {
	      while (n--)
		*to++ = *p++;
	    }
	  else
	    {
	      int c = *p++;
	      c = BYTE8_TO_CHAR (c);
	      to += CHAR_STRING (c, to);
	    }
	}
      chars++;
    }
  while (p < endp)
    {
613 614
      if (! CHAR_BYTE8_HEAD_P (*p)
	  && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
Kenichi Handa's avatar
Kenichi Handa committed
615 616 617
	{
	  while (n--)
	    *to++ = *p++;
Kenichi Handa's avatar
Kenichi Handa committed
618
	}
Kenichi Handa's avatar
Kenichi Handa committed
619 620 621 622 623 624 625 626 627 628 629 630 631 632
      else
	{
	  int c = *p++;
	  c = BYTE8_TO_CHAR (c);
	  to += CHAR_STRING (c, to);
	}
      chars++;
    }
  if (nchars)
    *nchars = chars;
  return (to - str);
}

/* Scan the LEN bytes of unibyte string at STR and return the number
   of bytes it may occupy when converted to a multibyte string by
   `str_to_multibyte': one for each byte below 0x80 and two for every
   other byte.  Signal string overflow if the total does not fit.  */

ptrdiff_t
count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
{
  const unsigned char *endp = str + len;
  ptrdiff_t bytes = 0;

  while (str < endp)
    {
      int n = (*str++ & 0x80) ? 2 : 1;

      if (INT_ADD_WRAPV (bytes, n, &bytes))
        string_overflow ();
    }
  return bytes;
}


652
/* Convert the BYTES bytes of unibyte text at STR, in place, to
   multibyte text holding the same single-byte characters: every byte
   of 0x80 or above is replaced by the multibyte form of
   BYTE8_TO_CHAR of that byte.  LEN is the size of the work area
   available at STR, which the caller guarantees is enough.  Return
   the number of bytes of the resulting text.  */

ptrdiff_t
str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
{
  unsigned char *p = str, *endp = str + bytes;

  /* The leading run of bytes below 0x80 stays where it is.  */
  for (; p < endp; p++)
    if (*p >= 0x80)
      break;
  if (p == endp)
    return bytes;

  /* Move the remainder to the end of the work area, then read from
     there while writing the converted text at TO.  */
  unsigned char *to = p;
  bytes = endp - p;
  endp = str + len;
  memmove (endp - bytes, p, bytes);

  for (p = endp - bytes; p < endp; p++)
    {
      int c = *p;

      if (c >= 0x80)
        c = BYTE8_TO_CHAR (c);
      to += CHAR_STRING (c, to);
    }
  return (to - str);
}

/* Arrange the BYTES bytes of multibyte text at STR as unibyte text,
   in place.  Each sequence whose lead byte satisfies
   CHAR_BYTE8_HEAD_P is replaced by the single byte CHAR_TO_BYTE8
   yields for it; all other sequences are copied unchanged.  Return
   the number of bytes of the resulting text.  */

ptrdiff_t
str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
{
  const unsigned char *p = str, *endp = str + bytes;
  unsigned char *to;
  int c, len;

  /* Skip the prefix that holds nothing to convert.  */
  for (; p < endp; p += len)
    {
      c = *p;
      len = BYTES_BY_CHAR_HEAD (c);
      if (CHAR_BYTE8_HEAD_P (c))
        break;
    }

  /* From here on, read at P and write the result at TO.  */
  to = str + (p - str);
  while (p < endp)
    {
      c = *p;
      if (CHAR_BYTE8_HEAD_P (c))
        {
          c = STRING_CHAR_ADVANCE (p);
          *to++ = CHAR_TO_BYTE8 (c);
        }
      else
        {
          for (len = BYTES_BY_CHAR_HEAD (c); len > 0; len--)
            *to++ = *p++;
        }
    }
  return (to - str);
}

720 721 722 723
/* Convert the CHARS characters of multibyte text at SRC to bytes
   stored at DST: eight-bit characters become the corresponding byte
   and ASCII characters are copied.  Return the number of bytes
   stored.  That is normally CHARS, but conversion stops early, with
   a smaller value, at the first character that is neither ASCII nor
   eight-bit.  */

ptrdiff_t
str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
{
  ptrdiff_t done;

  for (done = 0; done < chars; done++)
    {
      int c = STRING_CHAR_ADVANCE (src);

      if (CHAR_BYTE8_P (c))
        *dst++ = CHAR_TO_BYTE8 (c);
      else if (ASCII_CHAR_P (c))
        *dst++ = c;
      else
        break;
    }
  return done;
}


745
static ptrdiff_t
746
string_count_byte8 (Lisp_Object string)
Kenichi Handa's avatar
Kenichi Handa committed
747
{
748
  bool multibyte = STRING_MULTIBYTE (string);
749
  ptrdiff_t nbytes = SBYTES (string);
Kenichi Handa's avatar
Kenichi Handa committed
750
  unsigned char *p = SDATA (string);
Kenichi Handa's avatar
Kenichi Handa committed
751
  unsigned char *pend = p + nbytes;
752
  ptrdiff_t count = 0;
Kenichi Handa's avatar
Kenichi Handa committed
753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775
  int c, len;

  if (multibyte)
    while (p < pend)
      {
	c = *p;
	len = BYTES_BY_CHAR_HEAD (c);

	if (CHAR_BYTE8_HEAD_P (c))
	  count++;
	p += len;
      }
  else
    while (p < pend)
      {
	if (*p++ >= 0x80)
	  count++;
      }
  return count;
}


Lisp_Object
776
string_escape_byte8 (Lisp_Object string)
Kenichi Handa's avatar
Kenichi Handa committed
777
{
778 779
  ptrdiff_t nchars = SCHARS (string);
  ptrdiff_t nbytes = SBYTES (string);
780
  bool multibyte = STRING_MULTIBYTE (string);
781
  ptrdiff_t byte8_count;
782
  ptrdiff_t thrice_byte8_count, uninit_nchars, uninit_nbytes;
783 784
  const unsigned char *src, *src_end;
  unsigned char *dst;
Kenichi Handa's avatar
Kenichi Handa committed
785 786 787 788 789 790 791 792 793 794 795
  Lisp_Object val;
  int c, len;

  if (multibyte && nchars == nbytes)
    return string;

  byte8_count = string_count_byte8 (string);

  if (byte8_count == 0)
    return string;

796 797 798
  if (INT_MULTIPLY_WRAPV (byte8_count, 3, &thrice_byte8_count))
    string_overflow ();

Kenichi Handa's avatar
Kenichi Handa committed
799
  if (multibyte)
800 801
    {
      /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
802 803 804 805
      if (INT_ADD_WRAPV (nchars, thrice_byte8_count, &uninit_nchars)
	  || INT_ADD_WRAPV (nbytes, 2 * byte8_count, &uninit_nbytes))
	string_overflow ();
      val = make_uninit_multibyte_string (uninit_nchars, uninit_nbytes);
806
    }
Kenichi Handa's avatar
Kenichi Handa committed
807
  else
808 809
    {
      /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
810 811 812
      if (INT_ADD_WRAPV (thrice_byte8_count, nbytes, &uninit_nbytes))
	string_overflow ();
      val = make_uninit_string (uninit_nbytes);
813
    }
Kenichi Handa's avatar
Kenichi Handa committed
814

Kenichi Handa's avatar
Kenichi Handa committed
815
  src = SDATA (string);
Kenichi Handa's avatar
Kenichi Handa committed
816
  src_end = src + nbytes;
Kenichi Handa's avatar
Kenichi Handa committed
817
  dst = SDATA (val);
Kenichi Handa's avatar
Kenichi Handa committed
818 819 820 821 822 823 824 825 826 827
  if (multibyte)
    while (src < src_end)
      {
	c = *src;
	len = BYTES_BY_CHAR_HEAD (c);

	if (CHAR_BYTE8_HEAD_P (c))
	  {
	    c = STRING_CHAR_ADVANCE (src);
	    c = CHAR_TO_BYTE8 (c);
828
	    dst += sprintf ((char *) dst, "\\%03o", c + 0u);
Kenichi Handa's avatar
Kenichi Handa committed
829 830 831 832 833 834 835 836 837
	  }
	else
	  while (len--) *dst++ = *src++;
      }
  else
    while (src < src_end)
      {
	c = *src++;
	if (c >= 0x80)
838
	  dst += sprintf ((char *) dst, "\\%03o", c + 0u);
Kenichi Handa's avatar
Kenichi Handa committed
839 840 841 842 843 844 845
	else
	  *dst++ = c;
      }
  return val;
}


Paul Eggert's avatar
Paul Eggert committed
846
DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
Kenichi Handa's avatar
Kenichi Handa committed
847
       doc: /*
Dave Love's avatar
Dave Love committed
848 849
Concatenate all the argument characters and make the result a string.
usage: (string &rest CHARACTERS)  */)
850
  (ptrdiff_t n, Lisp_Object *args)
Kenichi Handa's avatar
Kenichi Handa committed
851
{
852
  ptrdiff_t i;
853
  int c;
854 855 856 857
  unsigned char *buf, *p;
  Lisp_Object str;
  USE_SAFE_ALLOCA;

858
  SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
859
  p = buf;
Kenichi Handa's avatar
Kenichi Handa committed
860 861 862 863

  for (i = 0; i < n; i++)
    {
      CHECK_CHARACTER (args[i]);
Tom Tromey's avatar
Tom Tromey committed
864
      c = XFIXNUM (args[i]);
Kenichi Handa's avatar
Kenichi Handa committed
865 866 867
      p += CHAR_STRING (c, p);
    }

868 869 870
  str = make_string_from_bytes ((char *) buf, n, p - buf);
  SAFE_FREE ();
  return str;
Kenichi Handa's avatar
Kenichi Handa committed
871 872
}

873
DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
874 875
       doc: /* Concatenate all the argument bytes and make the result a unibyte string.
usage: (unibyte-string &rest BYTES)  */)
876
  (ptrdiff_t n, Lisp_Object *args)
877
{
878
  ptrdiff_t i;
879 880
  Lisp_Object str;
  USE_SAFE_ALLOCA;
881 882
  unsigned char *buf = SAFE_ALLOCA (n);
  unsigned char *p = buf;
883 884 885

  for (i = 0; i < n; i++)
    {
886
      CHECK_RANGED_INTEGER (args[i], 0, 255);
Tom Tromey's avatar
Tom Tromey committed
887
      *p++ = XFIXNUM (args[i]);
888 889
    }

890 891 892
  str = make_string_from_bytes ((char *) buf, n, p - buf);
  SAFE_FREE ();
  return str;
893 894
}

895
DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
896 897 898 899
       Schar_resolve_modifiers, 1, 1, 0,
       doc: /* Resolve modifiers in the character CHAR.
The value is a character with modifiers resolved into the character
code.  Unresolved modifiers are kept in the value.
900
usage: (char-resolve-modifiers CHAR)  */)
901
  (Lisp_Object character)
902
{
903
  EMACS_INT c;
904

905
  CHECK_FIXNUM (character);
Tom Tromey's avatar
Tom Tromey committed
906
  c = XFIXNUM (character);
907
  return make_fixnum (char_resolve_modifier_mask (c));
908 909
}

Kenichi Handa's avatar
Kenichi Handa committed
910 911 912 913 914 915 916 917 918
DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
       doc: /* Return a byte value of a character at point.
Optional 1st arg POSITION, if non-nil, is a position of a character to get
a byte value.
Optional 2nd arg STRING, if non-nil, is a string of which first
character is a target to get a byte value.  In this case, POSITION, if
non-nil, is an index of a target character in the string.

If the current buffer (or STRING) is multibyte, and the target
Paul Eggert's avatar
Paul Eggert committed
919
character is not ASCII nor 8-bit character, an error is signaled.  */)
920
  (Lisp_Object position, Lisp_Object string)
Kenichi Handa's avatar
Kenichi Handa committed
921 922
{
  int c;
923
  ptrdiff_t pos;
Kenichi Handa's avatar
Kenichi Handa committed
924 925 926 927 928 929 930
  unsigned char *p;

  if (NILP (string))
    {
      if (NILP (position))
	{
	  p = PT_ADDR;
931
	}
Kenichi Handa's avatar
Kenichi Handa committed
932 933
      else
	{
934
	  CHECK_FIXNUM_COERCE_MARKER (position);
Tom Tromey's avatar
Tom Tromey committed
935
	  if (XFIXNUM (position) < BEGV || XFIXNUM (position) >= ZV)
936
	    args_out_of_range_3 (position, make_fixnum (BEGV), make_fixnum (ZV));
Tom Tromey's avatar
Tom Tromey committed
937
	  pos = XFIXNAT (position);
Kenichi Handa's avatar
Kenichi Handa committed
938 939
	  p = CHAR_POS_ADDR (pos);
	}
Tom Tromey's avatar
Tom Tromey committed
940
      if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
941
	return make_fixnum (*p);
Kenichi Handa's avatar
Kenichi Handa committed
942 943 944 945 946 947 948 949 950 951
    }
  else
    {
      CHECK_STRING (string);
      if (NILP (position))
	{
	  p = SDATA (string);
	}
      else
	{
952
	  CHECK_FIXNAT (position);
Tom Tromey's avatar
Tom Tromey committed
953
	  if (XFIXNUM (position) >= SCHARS (string))
Kenichi Handa's avatar
Kenichi Handa committed
954
	    args_out_of_range (string, position);
Tom Tromey's avatar
Tom Tromey committed
955
	  pos = XFIXNAT (position);
Kenichi Handa's avatar
Kenichi Handa committed
956 957
	  p = SDATA (string) + string_char_to_byte (string, pos);
	}
958
      if (! STRING_MULTIBYTE (string))
959
	return make_fixnum (*p);
Kenichi Handa's avatar
Kenichi Handa committed
960
    }
961
  c = STRING_CHAR (p);
Kenichi Handa's avatar
Kenichi Handa committed
962 963 964 965
  if (CHAR_BYTE8_P (c))
    c = CHAR_TO_BYTE8 (c);
  else if (! ASCII_CHAR_P (c))
    error ("Not an ASCII nor an 8-bit character: %d", c);
966
  return make_fixnum (c);
Kenichi Handa's avatar
Kenichi Handa committed
967 968
}

969
/* Return true if C is an alphabetic character.  */
970 971 972 973
bool
alphabeticp (int c)
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
974
  if (! FIXNUMP (category))
975
    return false;
Tom Tromey's avatar
Tom Tromey committed
976
  EMACS_INT gen_cat = XFIXNUM (category);
977 978 979 980 981 982 983 984 985 986 987 988 989

  /* See UTS #18.  There are additional characters that should be
     here, those designated as Other_uppercase, Other_lowercase,
     and Other_alphabetic; FIXME.  */
  return (gen_cat == UNICODE_CATEGORY_Lu
	  || gen_cat == UNICODE_CATEGORY_Ll
	  || gen_cat == UNICODE_CATEGORY_Lt
	  || gen_cat == UNICODE_CATEGORY_Lm
	  || gen_cat == UNICODE_CATEGORY_Lo
	  || gen_cat == UNICODE_CATEGORY_Mn
	  || gen_cat == UNICODE_CATEGORY_Mc
	  || gen_cat == UNICODE_CATEGORY_Me
	  || gen_cat == UNICODE_CATEGORY_Nl);
990 991
}

992
/* Return true if C is an alphabetic or decimal-number character.  */
993
bool
994
alphanumericp (int c)
995 996
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
997
  if (! FIXNUMP (category))
998
    return false;
Tom Tromey's avatar
Tom Tromey committed
999
  EMACS_INT gen_cat = XFIXNUM (category);
1000

1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011
  /* See UTS #18.  Same comment as for alphabeticp applies.  FIXME. */
  return (gen_cat == UNICODE_CATEGORY_Lu
	  || gen_cat == UNICODE_CATEGORY_Ll
	  || gen_cat == UNICODE_CATEGORY_Lt
	  || gen_cat == UNICODE_CATEGORY_Lm
	  || gen_cat == UNICODE_CATEGORY_Lo
	  || gen_cat == UNICODE_CATEGORY_Mn
	  || gen_cat == UNICODE_CATEGORY_Mc
	  || gen_cat == UNICODE_CATEGORY_Me
	  || gen_cat == UNICODE_CATEGORY_Nl
	  || gen_cat == UNICODE_CATEGORY_Nd);
1012 1013
}

1014
/* Return true if C is a graphic character.  */
1015 1016 1017
bool
graphicp (int c)
{
1018
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1019
  if (! FIXNUMP (category))
1020
    return false;
Tom Tromey's avatar
Tom Tromey committed
1021
  EMACS_INT gen_cat = XFIXNUM (category);
1022 1023 1024 1025 1026 1027 1028 1029

  /* See UTS #18.  */
  return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */
	    || gen_cat == UNICODE_CATEGORY_Zl /* line separator */
	    || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */
	    || gen_cat == UNICODE_CATEGORY_Cc /* control */
	    || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
	    || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
1030 1031
}

1032
/* Return true if C is a printable character.  */
1033 1034 1035 1036
bool
printablep (int c)
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1037
  if (! FIXNUMP (category))
1038
    return false;
Tom Tromey's avatar
Tom Tromey committed
1039
  EMACS_INT gen_cat = XFIXNUM (category);
1040 1041 1042 1043 1044 1045 1046

  /* See UTS #18.  */
  return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
	    || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
	    || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
}

1047
/* Return true if C is a horizontal whitespace character, as defined
Paul Eggert's avatar
Paul Eggert committed
1048
   by https://www.unicode.org/reports/tr18/tr18-19.html#blank.  */
1049 1050 1051 1052
bool
blankp (int c)
{
  Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
1053
  if (! FIXNUMP (category))
1054 1055
    return false;

Tom Tromey's avatar
Tom Tromey committed
1056
  return XFIXNUM (category) == UNICODE_CATEGORY_Zs; /* separator, space */
1057 1058
}

/* hexdigit[C] is one greater than C's numeric value if C is a
   hexadecimal digit, zero otherwise.  Both upper-case and lower-case
   letters are accepted; elements without an explicit initializer are
   zero.  */
signed char const hexdigit[UCHAR_MAX + 1] =
  {
    ['0'] = 1 + 0, ['1'] = 1 + 1, ['2'] = 1 + 2, ['3'] = 1 + 3, ['4'] = 1 + 4,
    ['5'] = 1 + 5, ['6'] = 1 + 6, ['7'] = 1 + 7, ['8'] = 1 + 8, ['9'] = 1 + 9,
    ['A'] = 1 + 10, ['B'] = 1 + 11, ['C'] = 1 + 12,
    ['D'] = 1 + 13, ['E'] = 1 + 14, ['F'] = 1 + 15,
    ['a'] = 1 + 10, ['b'] = 1 + 11, ['c'] = 1 + 12,
    ['d'] = 1 + 13, ['e'] = 1 + 14, ['f'] = 1 + 15
  };

Kenichi Handa's avatar
Kenichi Handa committed
1071
void
1072
syms_of_character (void)
Kenichi Handa's avatar
Kenichi Handa committed
1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
1087
  defsubr (&Sunibyte_string);
1088
  defsubr (&Schar_resolve_modifiers);
Kenichi Handa's avatar
Kenichi Handa committed
1089
  defsubr (&Sget_byte);
Kenichi Handa's avatar
Kenichi Handa committed
1090

1091
  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
Kenichi Handa's avatar
Kenichi Handa committed
1092
	       doc: /*
1093 1094 1095
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
1096
  Vtranslation_table_vector = make_nil_vector (16);
Kenichi Handa's avatar
Kenichi Handa committed
1097

1098
  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
Kenichi Handa's avatar
Kenichi Handa committed
1099 1100 1101 1102
	       doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1103 1104
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
Kenichi Handa's avatar
Kenichi Handa committed
1105

1106
  DEFVAR_LISP ("char-width-table", Vchar_width_table,
Kenichi Handa's avatar
Kenichi Handa committed
1107 1108
	       doc: /*
A char-table for width (columns) of each character.  */);
1109 1110
  Vchar_width_table = Fmake_char_table (Qnil, make_fixnum (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_fixnum (4));
1111
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1112
			make_fixnum (4));
Kenichi Handa's avatar
Kenichi Handa committed
1113

1114
  DEFVAR_LISP ("printable-chars", Vprintable_chars,
Kenichi Handa's avatar
Kenichi Handa committed
1115
	       doc: /* A char-table for each printable character.  */);
1116
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1117
  Fset_char_table_range (Vprintable_chars,
1118
			 Fcons (make_fixnum (32), make_fixnum (126)), Qt);
1119
  Fset_char_table_range (Vprintable_chars,
1120 1121
			 Fcons (make_fixnum (160),
				make_fixnum (MAX_5_BYTE_CHAR)), Qt);
1122

1123
  DEFVAR_LISP ("char-script-table", Vchar_script_table,
1124 1125 1126 1127
	       doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  DEFSYM (Qchar_script_table, "char-script-table");
1128
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_fixnum (1));
1129
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1130

1131
  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1132
	       doc: /* Alist of scripts vs the representative characters.
1133
Each element is a cons (SCRIPT . CHARS).
1134
SCRIPT is a symbol representing a script or a subgroup of a script.
1135
CHARS is a list or a vector of characters.
1136
If it is a list, all characters in the list are necessary for supporting SCRIPT.
1137 1138
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
1139
  Vscript_representative_chars = Qnil;
1140

1141
  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1142
	       doc: /* Char table of Unicode's "General Category".
1143 1144
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1145 1146 1147 1148
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
Kenichi Handa's avatar
Kenichi Handa committed
1149
}