syntax.c 104 KB
Newer Older
Richard M. Stallman's avatar
Richard M. Stallman committed
1
/* GNU Emacs routines to deal with syntax tables; also word and list parsing.
2 3
   Copyright (C) 1985, 1987, 1993-1995, 1997-1999, 2001-2013 Free
   Software Foundation, Inc.
Richard M. Stallman's avatar
Richard M. Stallman committed
4 5 6

This file is part of GNU Emacs.

7
GNU Emacs is free software: you can redistribute it and/or modify
Richard M. Stallman's avatar
Richard M. Stallman committed
8
it under the terms of the GNU General Public License as published by
9 10
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Richard M. Stallman's avatar
Richard M. Stallman committed
11 12 13 14 15 16 17

GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
18
along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
19 20


21
#include <config.h>
22 23

#include <sys/types.h>
24

Richard M. Stallman's avatar
Richard M. Stallman committed
25 26
#include "lisp.h"
#include "commands.h"
27
#include "character.h"
28
#include "buffer.h"
Stefan Monnier's avatar
Stefan Monnier committed
29
#include "keymap.h"
Richard M. Stallman's avatar
Richard M. Stallman committed
30
#include "regex.h"
31 32 33 34

/* Make syntax table lookup grant data in gl_state.  */
#define SYNTAX_ENTRY_VIA_PROPERTY

Richard M. Stallman's avatar
Richard M. Stallman committed
35
#include "syntax.h"
36
#include "intervals.h"
37 38
#include "category.h"

Paul Eggert's avatar
Paul Eggert committed
39
/* Eight single-bit flags have the following meanings:
40 41 42 43 44 45 46 47 48 49 50
  1. This character is the first of a two-character comment-start sequence.
  2. This character is the second of a two-character comment-start sequence.
  3. This character is the first of a two-character comment-end sequence.
  4. This character is the second of a two-character comment-end sequence.
  5. This character is a prefix, for backward-prefix-chars.
  6. The char is part of a delimiter for comments of style "b".
  7. This character is part of a nestable comment sequence.
  8. The char is part of a delimiter for comments of style "c".
  Note that any two-character sequence whose first character has flag 1
  and whose second character has flag 2 will be interpreted as a comment start.

Paul Eggert's avatar
Paul Eggert committed
51
  Bits 6 and 8 discriminate among different comment styles.
52
  Languages such as C++ allow two orthogonal syntax start/end pairs
Paul Eggert's avatar
Paul Eggert committed
53
  and bit 6 determines whether a comment-end or Scommentend
54 55
  ends style a or b.  Comment markers can start style a, b, c, or bc.
  Style a is always the default.
Paul Eggert's avatar
Paul Eggert committed
56
  For 2-char comment markers, the style b flag is looked up only on the second
57
  char of the comment marker and on the first char of the comment ender.
Paul Eggert's avatar
Paul Eggert committed
58 59
  For style c (like the nested flag), the flag can be placed on any of
  the chars.  */
60

61
/* These functions extract specific flags from an integer
62 63
   that holds the syntax code and the flags.  */

64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
static bool
SYNTAX_FLAGS_COMSTART_FIRST (int flags)
{
  return (flags >> 16) & 1;
}
static bool
SYNTAX_FLAGS_COMSTART_SECOND (int flags)
{
  return (flags >> 17) & 1;
}
static bool
SYNTAX_FLAGS_COMEND_FIRST (int flags)
{
  return (flags >> 18) & 1;
}
static bool
SYNTAX_FLAGS_COMEND_SECOND (int flags)
{
  return (flags >> 19) & 1;
}
static bool
SYNTAX_FLAGS_PREFIX (int flags)
{
  return (flags >> 20) & 1;
}
static bool
SYNTAX_FLAGS_COMMENT_STYLEB (int flags)
{
  return (flags >> 21) & 1;
}
static bool
SYNTAX_FLAGS_COMMENT_STYLEC (int flags)
{
  return (flags >> 23) & 1;
}
static int
SYNTAX_FLAGS_COMMENT_STYLEC2 (int flags)
{
  return (flags >> 22) & 2; /* SYNTAX_FLAGS_COMMENT_STYLEC (flags) * 2 */
}
static bool
SYNTAX_FLAGS_COMMENT_NESTED (int flags)
{
  return (flags >> 22) & 1;
}
109 110 111

/* FLAGS should be the flags of the main char of the comment marker, e.g.
   the second for comstart and the first for comend.  */
112 113 114 115 116 117 118
static int
SYNTAX_FLAGS_COMMENT_STYLE (int flags, int other_flags)
{
  return (SYNTAX_FLAGS_COMMENT_STYLEB (flags)
	  | SYNTAX_FLAGS_COMMENT_STYLEC2 (flags)
	  | SYNTAX_FLAGS_COMMENT_STYLEC2 (other_flags));
}
119

120
/* Extract a particular flag for a given character.  */
121

122 123 124 125 126
static bool
SYNTAX_COMEND_FIRST (int c)
{
  return SYNTAX_FLAGS_COMEND_FIRST (SYNTAX_WITH_FLAGS (c));
}
127 128

/* We use these constants in place for comment-style and
129
   string-ender-char to distinguish comments/strings started by
130 131
   comment_fence and string_fence codes.  */

132 133 134 135 136
enum
  {
    ST_COMMENT_STYLE = 256 + 1,
    ST_STRING_STYLE = 256 + 2
  };
Richard M. Stallman's avatar
Richard M. Stallman committed
137

138 139
static Lisp_Object Qsyntax_table_p;
static Lisp_Object Qsyntax_table, Qscan_error;
Richard M. Stallman's avatar
Richard M. Stallman committed
140

141 142 143 144
/* This is the internal form of the parse state used in parse-partial-sexp.  */

struct lisp_parse_state
  {
145
    EMACS_INT depth;	/* Depth at end of parsing.  */
146
    int instring;  /* -1 if not within string, else desired terminator.  */
147
    EMACS_INT incomment; /* -1 if in unnestable comment else comment nesting */
148
    int comstyle;  /* comment style a=0, or b=1, or ST_COMMENT_STYLE.  */
Paul Eggert's avatar
Paul Eggert committed
149
    bool quoted;   /* True if just after an escape char at end of parsing.  */
150
    EMACS_INT mindepth;	/* Minimum depth seen while scanning.  */
151
    /* Char number of most recent start-of-expression at current level */
152
    ptrdiff_t thislevelstart;
153
    /* Char number of start of containing expression */
154 155
    ptrdiff_t prevlevelstart;
    ptrdiff_t location;	     /* Char number at which parsing stopped.  */
156
    ptrdiff_t location_byte; /* Corresponding byte position.  */
157
    ptrdiff_t comstr_start;  /* Position of last comment/string starter.  */
158 159
    Lisp_Object levelstarts; /* Char numbers of starts-of-expression
				of levels (starting from outermost).  */
160 161
  };

162 163 164
/* These variables are a cache for finding the start of a defun.
   find_start_pos is the place for which the defun start was found.
   find_start_value is the defun start position found for it.
165
   find_start_value_byte is the corresponding byte position.
166 167 168 169
   find_start_buffer is the buffer it was found in.
   find_start_begv is the BEGV value when it was found.
   find_start_modiff is the value of MODIFF when it was found.  */

170 171 172
static ptrdiff_t find_start_pos;
static ptrdiff_t find_start_value;
static ptrdiff_t find_start_value_byte;
173
static struct buffer *find_start_buffer;
174 175
static ptrdiff_t find_start_begv;
static EMACS_INT find_start_modiff;
176 177


Paul Eggert's avatar
Paul Eggert committed
178 179 180
static Lisp_Object skip_chars (bool, Lisp_Object, Lisp_Object, bool);
static Lisp_Object skip_syntaxes (bool, Lisp_Object, Lisp_Object);
static Lisp_Object scan_lists (EMACS_INT, EMACS_INT, EMACS_INT, bool);
181
static void scan_sexps_forward (struct lisp_parse_state *,
182
                                ptrdiff_t, ptrdiff_t, ptrdiff_t, EMACS_INT,
Paul Eggert's avatar
Paul Eggert committed
183 184
                                bool, Lisp_Object, int);
static bool in_classes (int, Lisp_Object);
Paul Eggert's avatar
Paul Eggert committed
185 186

/* This setter is used only in this file, so it can be private.  */
187
static void
Paul Eggert's avatar
Paul Eggert committed
188 189 190 191
bset_syntax_table (struct buffer *b, Lisp_Object val)
{
  b->INTERNAL_FIELD (syntax_table) = val;
}
192

193
/* Whether the syntax of the character C has the prefix flag set.  */
Paul Eggert's avatar
Paul Eggert committed
194 195
bool
syntax_prefix_flag_p (int c)
196
{
197
  return SYNTAX_FLAGS_PREFIX (SYNTAX_WITH_FLAGS (c));
198
}
199 200 201

struct gl_state_s gl_state;		/* Global state of syntax parser.  */

202
enum { INTERVALS_AT_ONCE = 10 };	/* 1 + max-number of intervals
203 204
					   to scan to property-change.  */

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
/* Set the syntax entry VAL for char C in table TABLE.  */

static void
SET_RAW_SYNTAX_ENTRY (Lisp_Object table, int c, Lisp_Object val)
{
  CHAR_TABLE_SET (table, c, val);
}

/* Set the syntax entry VAL for char-range RANGE in table TABLE.
   RANGE is a cons (FROM . TO) specifying the range of characters.  */

static void
SET_RAW_SYNTAX_ENTRY_RANGE (Lisp_Object table, Lisp_Object range,
			    Lisp_Object val)
{
  Fset_char_table_range (table, range, val);
}

/* Extract the information from the entry for character C
   in the current syntax table.  */

static Lisp_Object
SYNTAX_MATCH (int c)
{
  Lisp_Object ent = SYNTAX_ENTRY (c);
  return CONSP (ent) ? XCDR (ent) : Qnil;
}

/* This should be called with FROM at the start of forward
   search, or after the last position of the backward search.  It
   makes sure that the first char is picked up with correct table, so
   one does not need to call UPDATE_SYNTAX_TABLE immediately after the
   call.
   Sign of COUNT gives the direction of the search.
 */

static void
SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count)
{
  SETUP_BUFFER_SYNTAX_TABLE ();
  gl_state.b_property = BEGV;
  gl_state.e_property = ZV + 1;
  gl_state.object = Qnil;
  gl_state.offset = 0;
  if (parse_sexp_lookup_properties)
    if (count > 0 || from > BEGV)
      update_syntax_table (count > 0 ? from : from - 1, count, 1, Qnil);
}

/* Same as above, but in OBJECT.  If OBJECT is nil, use current buffer.
   If it is t (which is only used in fast_c_string_match_ignore_case),
   ignore properties altogether.

   This is meant for regex.c to use.  For buffers, regex.c passes arguments
   to the UPDATE_SYNTAX_TABLE functions which are relative to BEGV.
   So if it is a buffer, we set the offset field to BEGV.  */

void
SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
			       ptrdiff_t from, ptrdiff_t count)
{
  SETUP_BUFFER_SYNTAX_TABLE ();
  gl_state.object = object;
  if (BUFFERP (gl_state.object))
    {
      struct buffer *buf = XBUFFER (gl_state.object);
      gl_state.b_property = 1;
      gl_state.e_property = BUF_ZV (buf) - BUF_BEGV (buf) + 1;
      gl_state.offset = BUF_BEGV (buf) - 1;
    }
  else if (NILP (gl_state.object))
    {
      gl_state.b_property = 1;
      gl_state.e_property = ZV - BEGV + 1;
      gl_state.offset = BEGV - 1;
    }
  else if (EQ (gl_state.object, Qt))
    {
      gl_state.b_property = 0;
      gl_state.e_property = PTRDIFF_MAX;
      gl_state.offset = 0;
    }
  else
    {
      gl_state.b_property = 0;
      gl_state.e_property = 1 + SCHARS (gl_state.object);
      gl_state.offset = 0;
    }
  if (parse_sexp_lookup_properties)
    update_syntax_table (from + gl_state.offset - (count <= 0),
			 count, 1, gl_state.object);
}

298 299
/* Update gl_state to an appropriate interval which contains CHARPOS.  The
   sign of COUNT give the relative position of CHARPOS wrt the previously
300
   valid interval.  If INIT, only [be]_property fields of gl_state are
301
   valid at start, the rest is filled basing on OBJECT.
302

303
   `gl_state.*_i' are the intervals, and CHARPOS is further in the search
304 305
   direction than the intervals - or in an interval.  We update the
   current syntax-table basing on the property of this interval, and
306
   update the interval to start further than CHARPOS - or be
Dmitry Antipov's avatar
Dmitry Antipov committed
307
   NULL.  We also update lim_property to be the next value of
308
   charpos to call this subroutine again - or be before/after the
309 310 311
   start/end of OBJECT.  */

void
Paul Eggert's avatar
Paul Eggert committed
312
update_syntax_table (ptrdiff_t charpos, EMACS_INT count, bool init,
313
		     Lisp_Object object)
314 315
{
  Lisp_Object tmp_table;
Paul Eggert's avatar
Paul Eggert committed
316 317
  int cnt = 0;
  bool invalidate = 1;
318
  INTERVAL i;
319 320 321

  if (init)
    {
322
      gl_state.old_prop = Qnil;
323 324
      gl_state.start = gl_state.b_property;
      gl_state.stop = gl_state.e_property;
325 326
      i = interval_of (charpos, object);
      gl_state.backward_i = gl_state.forward_i = i;
327
      invalidate = 0;
Dmitry Antipov's avatar
Dmitry Antipov committed
328
      if (!i)
329
	return;
330
      /* interval_of updates only ->position of the return value, so
331
	 update the parents manually to speed up update_interval.  */
332
      while (!NULL_PARENT (i))
333 334
	{
	  if (AM_RIGHT_CHILD (i))
335
	    INTERVAL_PARENT (i)->position = i->position
336
	      - LEFT_TOTAL_LENGTH (i) + TOTAL_LENGTH (i) /* right end */
337 338
	      - TOTAL_LENGTH (INTERVAL_PARENT (i))
	      + LEFT_TOTAL_LENGTH (INTERVAL_PARENT (i));
339
	  else
340
	    INTERVAL_PARENT (i)->position = i->position - LEFT_TOTAL_LENGTH (i)
341
	      + TOTAL_LENGTH (i);
342
	  i = INTERVAL_PARENT (i);
343 344
	}
      i = gl_state.forward_i;
345
      gl_state.b_property = i->position - gl_state.offset;
346
      gl_state.e_property = INTERVAL_LAST_POS (i) - gl_state.offset;
347 348
      goto update;
    }
349
  i = count > 0 ? gl_state.forward_i : gl_state.backward_i;
350

351
  /* We are guaranteed to be called with CHARPOS either in i,
352
     or further off.  */
Dmitry Antipov's avatar
Dmitry Antipov committed
353
  if (!i)
354
    error ("Error in syntax_table logic for to-the-end intervals");
355
  else if (charpos < i->position)		/* Move left.  */
356 357
    {
      if (count > 0)
358
	error ("Error in syntax_table logic for intervals <-");
359
      /* Update the interval.  */
360
      i = update_interval (i, charpos);
361
      if (INTERVAL_LAST_POS (i) != gl_state.b_property)
362 363 364
	{
	  invalidate = 0;
	  gl_state.forward_i = i;
365
	  gl_state.e_property = INTERVAL_LAST_POS (i) - gl_state.offset;
366
	}
367
    }
368
  else if (charpos >= INTERVAL_LAST_POS (i)) /* Move right.  */
369 370
    {
      if (count < 0)
371
	error ("Error in syntax_table logic for intervals ->");
372
      /* Update the interval.  */
373
      i = update_interval (i, charpos);
374
      if (i->position != gl_state.e_property)
375 376 377
	{
	  invalidate = 0;
	  gl_state.backward_i = i;
378
	  gl_state.b_property = i->position - gl_state.offset;
379 380 381 382 383 384 385 386
	}
    }

  update:
  tmp_table = textget (i->plist, Qsyntax_table);

  if (invalidate)
    invalidate = !EQ (tmp_table, gl_state.old_prop); /* Need to invalidate? */
Juanma Barranquero's avatar
Juanma Barranquero committed
387

388 389 390
  if (invalidate)		/* Did not get to adjacent interval.  */
    {				/* with the same table => */
				/* invalidate the old range.  */
391 392 393
      if (count > 0)
	{
	  gl_state.backward_i = i;
394 395 396
	  gl_state.b_property = i->position - gl_state.offset;
	}
      else
397
	{
398
	  gl_state.forward_i = i;
399
	  gl_state.e_property = INTERVAL_LAST_POS (i) - gl_state.offset;
400 401
	}
    }
402

403
  if (!EQ (tmp_table, gl_state.old_prop))
404
    {
405 406 407 408 409
      gl_state.current_syntax_table = tmp_table;
      gl_state.old_prop = tmp_table;
      if (EQ (Fsyntax_table_p (tmp_table), Qt))
	{
	  gl_state.use_global = 0;
Juanma Barranquero's avatar
Juanma Barranquero committed
410
	}
411 412 413 414 415
      else if (CONSP (tmp_table))
	{
	  gl_state.use_global = 1;
	  gl_state.global_code = tmp_table;
	}
Juanma Barranquero's avatar
Juanma Barranquero committed
416
      else
417 418
	{
	  gl_state.use_global = 0;
Tom Tromey's avatar
Tom Tromey committed
419
	  gl_state.current_syntax_table = BVAR (current_buffer, syntax_table);
420
	}
421 422
    }

Dmitry Antipov's avatar
Dmitry Antipov committed
423
  while (i)
424 425 426 427
    {
      if (cnt && !EQ (tmp_table, textget (i->plist, Qsyntax_table)))
	{
	  if (count > 0)
428 429 430 431 432 433
	    {
	      gl_state.e_property = i->position - gl_state.offset;
	      gl_state.forward_i = i;
	    }
	  else
	    {
434 435
	      gl_state.b_property
		= i->position + LENGTH (i) - gl_state.offset;
436 437 438
	      gl_state.backward_i = i;
	    }
	  return;
439
	}
Juanma Barranquero's avatar
Juanma Barranquero committed
440
      else if (cnt == INTERVALS_AT_ONCE)
441 442
	{
	  if (count > 0)
443
	    {
444 445 446 447 448
	      gl_state.e_property
		= i->position + LENGTH (i) - gl_state.offset
		/* e_property at EOB is not set to ZV but to ZV+1, so that
		   we can do INC(from);UPDATE_SYNTAX_TABLE_FORWARD without
		   having to check eob between the two.  */
Dmitry Antipov's avatar
Dmitry Antipov committed
449
		+ (next_interval (i) ? 0 : 1);
450 451 452 453 454 455 456 457
	      gl_state.forward_i = i;
	    }
	  else
	    {
	      gl_state.b_property = i->position - gl_state.offset;
	      gl_state.backward_i = i;
	    }
	  return;
458 459 460 461
	}
      cnt++;
      i = count > 0 ? next_interval (i) : previous_interval (i);
    }
Dmitry Antipov's avatar
Dmitry Antipov committed
462
  eassert (i == NULL); /* This property goes to the end.  */
463 464 465 466
  if (count > 0)
    gl_state.e_property = gl_state.stop;
  else
    gl_state.b_property = gl_state.start;
467 468
}

Paul Eggert's avatar
Paul Eggert committed
469
/* Returns true if char at CHARPOS is quoted.
470 471
   Global syntax-table data should be set up already to be good at CHARPOS
   or after.  On return global syntax data is good for lookup at CHARPOS. */
472

Paul Eggert's avatar
Paul Eggert committed
473
static bool
474
char_quoted (ptrdiff_t charpos, ptrdiff_t bytepos)
475
{
Paul Eggert's avatar
Paul Eggert committed
476 477 478
  enum syntaxcode code;
  ptrdiff_t beg = BEGV;
  bool quoted = 0;
479
  ptrdiff_t orig = charpos;
480

481
  while (charpos > beg)
482
    {
483
      int c;
484
      DEC_BOTH (charpos, bytepos);
485

486
      UPDATE_SYNTAX_TABLE_BACKWARD (charpos);
487
      c = FETCH_CHAR_AS_MULTIBYTE (bytepos);
488
      code = SYNTAX (c);
489 490 491 492
      if (! (code == Scharquote || code == Sescape))
	break;

      quoted = !quoted;
493
    }
494 495

  UPDATE_SYNTAX_TABLE (orig);
496 497
  return quoted;
}
498 499 500 501

/* Return the bytepos one character before BYTEPOS.
   We assume that BYTEPOS is not at the start of the buffer.  */

502
static ptrdiff_t
503
dec_bytepos (ptrdiff_t bytepos)
504
{
Tom Tromey's avatar
Tom Tromey committed
505
  if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
506 507
    return bytepos - 1;

508 509 510
  DEC_POS (bytepos);
  return bytepos;
}
511

Juanma Barranquero's avatar
Juanma Barranquero committed
512
/* Return a defun-start position before POS and not too far before.
Richard M. Stallman's avatar
Richard M. Stallman committed
513 514 515
   It should be the last one before POS, or nearly the last.

   When open_paren_in_column_0_is_defun_start is nonzero,
516
   only the beginning of the buffer is treated as a defun-start.
Richard M. Stallman's avatar
Richard M. Stallman committed
517 518 519 520

   We record the information about where the scan started
   and what its result was, so that another call in the same area
   can return the same value very quickly.
521 522 523 524

   There is no promise at which position the global syntax data is
   valid on return from the subroutine, so the caller should explicitly
   update the global data.  */
525

526 527
static ptrdiff_t
find_defun_start (ptrdiff_t pos, ptrdiff_t pos_byte)
528
{
529
  ptrdiff_t opoint = PT, opoint_byte = PT_BYTE;
530

531 532
  if (!open_paren_in_column_0_is_defun_start)
    {
533
      find_start_value = BEGV;
534
      find_start_value_byte = BEGV_BYTE;
535 536 537 538
      find_start_buffer = current_buffer;
      find_start_modiff = MODIFF;
      find_start_begv = BEGV;
      find_start_pos = pos;
539 540 541
      return BEGV;
    }

542 543 544 545 546 547 548 549 550 551 552 553
  /* Use previous finding, if it's valid and applies to this inquiry.  */
  if (current_buffer == find_start_buffer
      /* Reuse the defun-start even if POS is a little farther on.
	 POS might be in the next defun, but that's ok.
	 Our value may not be the best possible, but will still be usable.  */
      && pos <= find_start_pos + 1000
      && pos >= find_start_value
      && BEGV == find_start_begv
      && MODIFF == find_start_modiff)
    return find_start_value;

  /* Back up to start of line.  */
554
  scan_newline (pos, pos_byte, BEGV, BEGV_BYTE, -1, 1);
555

556 557 558
  /* We optimize syntax-table lookup for rare updates.  Thus we accept
     only those `^\s(' which are good in global _and_ text-property
     syntax-tables.  */
559
  SETUP_BUFFER_SYNTAX_TABLE ();
560
  while (PT > BEGV)
561
    {
562 563
      int c;

564 565
      /* Open-paren at start of line means we may have found our
	 defun-start.  */
566
      c = FETCH_CHAR_AS_MULTIBYTE (PT_BYTE);
567
      if (SYNTAX (c) == Sopen)
568
	{
569
	  SETUP_SYNTAX_TABLE (PT + 1, -1);	/* Try again... */
570
	  c = FETCH_CHAR_AS_MULTIBYTE (PT_BYTE);
571
	  if (SYNTAX (c) == Sopen)
572 573
	    break;
	  /* Now fallback to the default value.  */
574
	  SETUP_BUFFER_SYNTAX_TABLE ();
575
	}
576 577
      /* Move to beg of previous line.  */
      scan_newline (PT, PT_BYTE, BEGV, BEGV_BYTE, -2, 1);
578 579 580
    }

  /* Record what we found, for the next try.  */
581 582
  find_start_value = PT;
  find_start_value_byte = PT_BYTE;
583 584 585 586 587
  find_start_buffer = current_buffer;
  find_start_modiff = MODIFF;
  find_start_begv = BEGV;
  find_start_pos = pos;

588 589
  TEMP_SET_PT_BOTH (opoint, opoint_byte);

590 591 592
  return find_start_value;
}

593 594
/* Return the SYNTAX_COMEND_FIRST of the character before POS, POS_BYTE.  */

Paul Eggert's avatar
Paul Eggert committed
595
static bool
596
prev_char_comend_first (ptrdiff_t pos, ptrdiff_t pos_byte)
597
{
Paul Eggert's avatar
Paul Eggert committed
598 599
  int c;
  bool val;
600 601 602 603 604 605 606 607 608

  DEC_BOTH (pos, pos_byte);
  UPDATE_SYNTAX_TABLE_BACKWARD (pos);
  c = FETCH_CHAR (pos_byte);
  val = SYNTAX_COMEND_FIRST (c);
  UPDATE_SYNTAX_TABLE_FORWARD (pos + 1);
  return val;
}

Paul Eggert's avatar
Paul Eggert committed
609
/* Check whether charpos FROM is at the end of a comment.
610 611 612
   FROM_BYTE is the bytepos corresponding to FROM.
   Do not move back before STOP.

Paul Eggert's avatar
Paul Eggert committed
613
   Return true if we find a comment ending at FROM/FROM_BYTE.
614 615 616

   If successful, store the charpos of the comment's beginning
   into *CHARPOS_PTR, and the bytepos into *BYTEPOS_PTR.
617 618 619

   Global syntax data remains valid for backward search starting at
   the returned value (or at FROM, if the search was not successful).  */
620

Paul Eggert's avatar
Paul Eggert committed
621 622 623 624
static bool
back_comment (ptrdiff_t from, ptrdiff_t from_byte, ptrdiff_t stop,
	      bool comnested, int comstyle, ptrdiff_t *charpos_ptr,
	      ptrdiff_t *bytepos_ptr)
625 626 627 628 629 630 631 632 633 634
{
  /* Look back, counting the parity of string-quotes,
     and recording the comment-starters seen.
     When we reach a safe place, assume that's not in a string;
     then step the main scan to the earliest comment-starter seen
     an even number of string quotes away from the safe place.

     OFROM[I] is position of the earliest comment-starter seen
     which is I+2X quotes from the comment-end.
     PARITY is current parity of quotes from the comment end.  */
635
  int string_style = -1;	/* Presumed outside of any string. */
Paul Eggert's avatar
Paul Eggert committed
636
  bool string_lossage = 0;
637
  /* Not a real lossage: indicates that we have passed a matching comment
Juanma Barranquero's avatar
Juanma Barranquero committed
638
     starter plus a non-matching comment-ender, meaning that any matching
639 640 641
     comment-starter we might see later could be a false positive (hidden
     inside another comment).
     Test case:  { a (* b } c (* d *) */
Paul Eggert's avatar
Paul Eggert committed
642
  bool comment_lossage = 0;
643 644 645 646
  ptrdiff_t comment_end = from;
  ptrdiff_t comment_end_byte = from_byte;
  ptrdiff_t comstart_pos = 0;
  ptrdiff_t comstart_byte IF_LINT (= 0);
647 648
  /* Place where the containing defun starts,
     or 0 if we didn't come across it yet.  */
649 650
  ptrdiff_t defun_start = 0;
  ptrdiff_t defun_start_byte = 0;
Paul Eggert's avatar
Paul Eggert committed
651 652
  enum syntaxcode code;
  ptrdiff_t nesting = 1;		/* current comment nesting */
653
  int c;
654 655 656 657 658
  int syntax = 0;

  /* FIXME: A }} comment-ender style leads to incorrect behavior
     in the case of {{ c }}} because we ignore the last two chars which are
     assumed to be comment-enders although they aren't.  */
659 660 661 662 663

  /* At beginning of range to scan, we're outside of strings;
     that determines quote parity to the comment-end.  */
  while (from != stop)
    {
664
      ptrdiff_t temp_byte;
Paul Eggert's avatar
Paul Eggert committed
665 666
      int prev_syntax;
      bool com2start, com2end, comstart;
667

668
      /* Move back and examine a character.  */
669
      DEC_BOTH (from, from_byte);
670 671
      UPDATE_SYNTAX_TABLE_BACKWARD (from);

672
      prev_syntax = syntax;
673
      c = FETCH_CHAR_AS_MULTIBYTE (from_byte);
674
      syntax = SYNTAX_WITH_FLAGS (c);
675 676
      code = SYNTAX (c);

677 678 679
      /* Check for 2-char comment markers.  */
      com2start = (SYNTAX_FLAGS_COMSTART_FIRST (syntax)
		   && SYNTAX_FLAGS_COMSTART_SECOND (prev_syntax)
680 681
		   && (comstyle
		       == SYNTAX_FLAGS_COMMENT_STYLE (prev_syntax, syntax))
682 683 684 685
		   && (SYNTAX_FLAGS_COMMENT_NESTED (prev_syntax)
		       || SYNTAX_FLAGS_COMMENT_NESTED (syntax)) == comnested);
      com2end = (SYNTAX_FLAGS_COMEND_FIRST (syntax)
		 && SYNTAX_FLAGS_COMEND_SECOND (prev_syntax));
686
      comstart = (com2start || code == Scomment);
687

688 689 690 691 692 693 694 695 696 697
      /* Nasty cases with overlapping 2-char comment markers:
	 - snmp-mode: -- c -- foo -- c --
	              --- c --
		      ------ c --
	 - c-mode:    *||*
		      |* *|* *|
		      |*| |* |*|
		      ///   */

      /* If a 2-char comment sequence partly overlaps with another,
698 699 700
	 we don't try to be clever.  E.g. |*| in C, or }% in modes that
	 have %..\n and %{..}%.  */
      if (from > stop && (com2end || comstart))
701
	{
702
	  ptrdiff_t next = from, next_byte = from_byte;
703
	  int next_c, next_syntax;
704 705
	  DEC_BOTH (next, next_byte);
	  UPDATE_SYNTAX_TABLE_BACKWARD (next);
706
	  next_c = FETCH_CHAR_AS_MULTIBYTE (next_byte);
707
	  next_syntax = SYNTAX_WITH_FLAGS (next_c);
708
	  if (((comstart || comnested)
709 710 711 712
	       && SYNTAX_FLAGS_COMEND_SECOND (syntax)
	       && SYNTAX_FLAGS_COMEND_FIRST (next_syntax))
	      || ((com2end || comnested)
		  && SYNTAX_FLAGS_COMSTART_SECOND (syntax)
713 714
		  && (comstyle
		      == SYNTAX_FLAGS_COMMENT_STYLE (syntax, prev_syntax))
715 716 717
		  && SYNTAX_FLAGS_COMSTART_FIRST (next_syntax)))
	    goto lossage;
	  /* UPDATE_SYNTAX_TABLE_FORWARD (next + 1); */
718
	}
719 720 721 722 723 724 725 726 727 728 729 730 731 732 733

      if (com2start && comstart_pos == 0)
	/* We're looking at a comment starter.  But it might be a comment
	   ender as well (see snmp-mode).  The first time we see one, we
	   need to consider it as a comment starter,
	   and the subsequent times as a comment ender.  */
	com2end = 0;

      /* Turn a 2-char comment sequences into the appropriate syntax.  */
      if (com2end)
	code = Sendcomment;
      else if (com2start)
	code = Scomment;
      /* Ignore comment starters of a different style.  */
      else if (code == Scomment
734
	       && (comstyle != SYNTAX_FLAGS_COMMENT_STYLE (syntax, 0)
735
		   || SYNTAX_FLAGS_COMMENT_NESTED (syntax) != comnested))
736
	continue;
737

738 739
      /* Ignore escaped characters, except comment-enders.  */
      if (code != Sendcomment && char_quoted (from, from_byte))
740 741
	continue;

742
      switch (code)
743
	{
744 745 746 747 748 749 750 751 752 753 754 755 756 757
	case Sstring_fence:
	case Scomment_fence:
	  c = (code == Sstring_fence ? ST_STRING_STYLE : ST_COMMENT_STYLE);
	case Sstring:
	  /* Track parity of quotes.  */
	  if (string_style == -1)
	    /* Entering a string.  */
	    string_style = c;
	  else if (string_style == c)
	    /* Leaving the string.  */
	    string_style = -1;
	  else
	    /* If we have two kinds of string delimiters.
	       There's no way to grok this scanning backwards.  */
758
	    string_lossage = 1;
759
	  break;
Juanma Barranquero's avatar
Juanma Barranquero committed
760

761 762
	case Scomment:
	  /* We've already checked that it is the relevant comstyle.  */
763
	  if (string_style != -1 || comment_lossage || string_lossage)
764 765 766 767
	    /* There are odd string quotes involved, so let's be careful.
	       Test case in Pascal: " { " a { " } */
	    goto lossage;

768 769 770 771 772 773 774
	  if (!comnested)
	    {
	      /* Record best comment-starter so far.  */
	      comstart_pos = from;
	      comstart_byte = from_byte;
	    }
	  else if (--nesting <= 0)
775 776 777 778 779 780
	    /* nested comments have to be balanced, so we don't need to
	       keep looking for earlier ones.  We use here the same (slightly
	       incorrect) reasoning as below:  since it is followed by uniform
	       paired string quotes, this comment-start has to be outside of
	       strings, else the comment-end itself would be inside a string. */
	    goto done;
781 782
	  break;

783
	case Sendcomment:
784
	  if (SYNTAX_FLAGS_COMMENT_STYLE (syntax, 0) == comstyle
785
	      && ((com2end && SYNTAX_FLAGS_COMMENT_NESTED (prev_syntax))
786
		  || SYNTAX_FLAGS_COMMENT_NESTED (syntax)) == comnested)
787 788 789 790 791 792 793 794 795
	    /* This is the same style of comment ender as ours. */
	    {
	      if (comnested)
		nesting++;
	      else
		/* Anything before that can't count because it would match
		   this comment-ender rather than ours.  */
		from = stop;	/* Break out of the loop.  */
	    }
796 797 798 799 800 801 802 803 804
	  else if (comstart_pos != 0 || c != '\n')
	    /* We're mixing comment styles here, so we'd better be careful.
	       The (comstart_pos != 0 || c != '\n') check is not quite correct
	       (we should just always set comment_lossage), but removing it
	       would imply that any multiline comment in C would go through
	       lossage, which seems overkill.
	       The failure should only happen in the rare cases such as
	         { (* } *)   */
	    comment_lossage = 1;
805
	  break;
806

807 808 809 810 811 812 813 814 815 816 817
	case Sopen:
	  /* Assume a defun-start point is outside of strings.  */
	  if (open_paren_in_column_0_is_defun_start
	      && (from == stop
		  || (temp_byte = dec_bytepos (from_byte),
		      FETCH_CHAR (temp_byte) == '\n')))
	    {
	      defun_start = from;
	      defun_start_byte = from_byte;
	      from = stop;	/* Break out of the loop.  */
	    }
818
	  break;
819 820

	default:
821
	  break;
822
	}
823 824 825 826 827
    }

  if (comstart_pos == 0)
    {
      from = comment_end;
828
      from_byte = comment_end_byte;
829 830
      UPDATE_SYNTAX_TABLE_FORWARD (comment_end - 1);
    }
831 832 833
  /* If comstart_pos is set and we get here (ie. didn't jump to `lossage'
     or `done'), then we've found the beginning of the non-nested comment.  */
  else if (1)	/* !comnested */
834 835
    {
      from = comstart_pos;
836
      from_byte = comstart_byte;
837
      UPDATE_SYNTAX_TABLE_FORWARD (from - 1);
838 839 840
    }
  else
    {
841 842
      struct lisp_parse_state state;
    lossage:
843 844
      /* We had two kinds of string delimiters mixed up
	 together.  Decode this going forwards.
845
	 Scan fwd from a known safe place (beginning-of-defun)
846 847
	 to the one in question; this records where we
	 last passed a comment starter.  */
848 849 850 851 852 853
      /* If we did not already find the defun start, find it now.  */
      if (defun_start == 0)
	{
	  defun_start = find_defun_start (comment_end, comment_end_byte);
	  defun_start_byte = find_start_value_byte;
	}
854
      do
855
	{
856 857
	  scan_sexps_forward (&state,
			      defun_start, defun_start_byte,
858 859
			      comment_end, TYPE_MINIMUM (EMACS_INT),
			      0, Qnil, 0);
860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878
	  defun_start = comment_end;
	  if (state.incomment == (comnested ? 1 : -1)
	      && state.comstyle == comstyle)
	    from = state.comstr_start;
	  else
	    {
	      from = comment_end;
	      if (state.incomment)
		/* If comment_end is inside some other comment, maybe ours
		   is nested, so we need to try again from within the
		   surrounding comment.  Example: { a (* " *)  */
		{
		  /* FIXME: We should advance by one or two chars. */
		  defun_start = state.comstr_start + 2;
		  defun_start_byte = CHAR_TO_BYTE (defun_start);
		}
	    }
	} while (defun_start < comment_end);

879
      from_byte = CHAR_TO_BYTE (from);
880 881
      UPDATE_SYNTAX_TABLE_FORWARD (from - 1);
    }
Juanma Barranquero's avatar
Juanma Barranquero committed
882

883
 done:
884 885 886
  *charpos_ptr = from;
  *bytepos_ptr = from_byte;

Paul Eggert's avatar
Paul Eggert committed
887
  return from != comment_end;
888 889
}

Richard M. Stallman's avatar
Richard M. Stallman committed
890
DEFUN ("syntax-table-p", Fsyntax_table_p, Ssyntax_table_p, 1, 1, 0,
891 892
       doc: /* Return t if OBJECT is a syntax table.
Currently, any char-table counts as a syntax table.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
893
  (Lisp_Object object)
Richard M. Stallman's avatar
Richard M. Stallman committed
894
{
895
  if (CHAR_TABLE_P (object)
896
      && EQ (XCHAR_TABLE (object)->purpose, Qsyntax_table))
Richard M. Stallman's avatar
Richard M. Stallman committed
897 898 899 900
    return Qt;
  return Qnil;
}

901
static void
902
check_syntax_table (Lisp_Object obj)
Richard M. Stallman's avatar
Richard M. Stallman committed
903
{
904 905
  CHECK_TYPE (CHAR_TABLE_P (obj) && EQ (XCHAR_TABLE (obj)->purpose, Qsyntax_table),
	      Qsyntax_table_p, obj);
Juanma Barranquero's avatar
Juanma Barranquero committed
906
}
Richard M. Stallman's avatar
Richard M. Stallman committed
907 908

DEFUN ("syntax-table", Fsyntax_table, Ssyntax_table, 0, 0, 0,
909 910
       doc: /* Return the current syntax table.
This is the one specified by the current buffer.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
911
  (void)
Richard M. Stallman's avatar
Richard M. Stallman committed
912
{
Tom Tromey's avatar
Tom Tromey committed
913
  return BVAR (current_buffer, syntax_table);
Richard M. Stallman's avatar
Richard M. Stallman committed
914 915 916 917
}

DEFUN ("standard-syntax-table", Fstandard_syntax_table,
   Sstandard_syntax_table, 0, 0, 0,
918 919
       doc: /* Return the standard syntax table.
This is the one used for new buffers.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
920
  (void)
Richard M. Stallman's avatar
Richard M. Stallman committed
921 922 923 924 925
{
  return Vstandard_syntax_table;
}

DEFUN ("copy-syntax-table", Fcopy_syntax_table, Scopy_syntax_table, 0, 1, 0,
926 927
       doc: /* Construct a new syntax table and return it.
It is a copy of the TABLE, which defaults to the standard syntax table.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
928
  (Lisp_Object table)
Richard M. Stallman's avatar
Richard M. Stallman committed
929
{
930 931
  Lisp_Object copy;

Jim Blandy's avatar
Jim Blandy committed
932
  if (!NILP (table))
933 934 935 936 937
    check_syntax_table (table);
  else
    table = Vstandard_syntax_table;

  copy = Fcopy_sequence (table);
938 939 940

  /* Only the standard syntax table should have a default element.
     Other syntax tables should inherit from parents instead.  */
Paul Eggert's avatar
Paul Eggert committed
941
  set_char_table_defalt (copy, Qnil);
942 943 944 945 946 947

  /* Copied syntax tables should all have parents.
     If we copied one with no parent, such as the standard syntax table,
     use the standard syntax table as the copy's parent.  */
  if (NILP (XCHAR_TABLE (copy)->parent))
    Fset_char_table_parent (copy, Vstandard_syntax_table);
948
  return copy;
Richard M. Stallman's avatar
Richard M. Stallman committed
949 950 951
}

DEFUN ("set-syntax-table", Fset_syntax_table, Sset_syntax_table, 1, 1, 0,
952 953
       doc: /* Select a new syntax table for the current buffer.
One argument, a syntax table.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
954
  (Lisp_Object table)
Richard M. Stallman's avatar
Richard M. Stallman committed
955
{
956
  int idx;
957
  check_syntax_table (table);
Paul Eggert's avatar
Paul Eggert committed
958
  bset_syntax_table (current_buffer, table);
Richard M. Stallman's avatar
Richard M. Stallman committed
959
  /* Indicate that this buffer now has a specified syntax table.  */
Gerd Moellmann's avatar
Gerd Moellmann committed
960 961
  idx = PER_BUFFER_VAR_IDX (syntax_table);
  SET_PER_BUFFER_VALUE_P (current_buffer, idx, 1);
Richard M. Stallman's avatar
Richard M. Stallman committed
962 963 964 965 966
  return table;
}

/* Convert a letter which signifies a syntax code
 into the code it signifies.
967
 This is used by modify-syntax-entry, and other things.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
968

Paul Eggert's avatar
Paul Eggert committed
969
unsigned char const syntax_spec_code[0400] =
Richard M. Stallman's avatar
Richard M. Stallman committed
970 971 972 973
  { 0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
Paul Eggert's avatar
Paul Eggert committed
974 975
    Swhitespace, Scomment_fence, Sstring, 0377, Smath, 0377, 0377, Squote,
    Sopen, Sclose, 0377, 0377, 0377, Swhitespace, Spunct, Scharquote,
Richard M. Stallman's avatar
Richard M. Stallman committed
976
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
Paul Eggert's avatar
Paul Eggert committed
977 978
    0377, 0377, 0377, 0377, Scomment, 0377, Sendcomment, 0377,
    Sinherit, 0377, 0377, 0377, 0377, 0377, 0377, 0377,   /* @, A ... */
Richard M. Stallman's avatar
Richard M. Stallman committed
979
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
Paul Eggert's avatar
Paul Eggert committed
980 981
    0377, 0377, 0377, 0377, 0377, 0377, 0377, Sword,
    0377, 0377, 0377, 0377, Sescape, 0377, 0377, Ssymbol,
Richard M. Stallman's avatar
Richard M. Stallman committed
982 983
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,   /* `, a, ... */
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
Paul Eggert's avatar
Paul Eggert committed
984 985
    0377, 0377, 0377, 0377, 0377, 0377, 0377, Sword,
    0377, 0377, 0377, 0377, Sstring_fence, 0377, 0377, 0377
Richard M. Stallman's avatar
Richard M. Stallman committed
986 987
  };

988
/* Indexed by syntax code, give the letter that describes it.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
989

Paul Eggert's avatar
Paul Eggert committed
990
char const syntax_code_spec[16] =
Richard M. Stallman's avatar
Richard M. Stallman committed
991
  {
992 993
    ' ', '.', 'w', '_', '(', ')', '\'', '\"', '$', '\\', '/', '<', '>', '@',
    '!', '|'
Richard M. Stallman's avatar
Richard M. Stallman committed
994
  };
Karl Heuer's avatar
Karl Heuer committed
995 996 997 998 999 1000 1001 1002

/* Indexed by syntax code, give the object (cons of syntax code and
   nil) to be stored in syntax table.  Since these objects can be
   shared among syntax tables, we generate them in advance.  By
   sharing objects, the function `describe-syntax' can give a more
   compact listing.  */
static Lisp_Object Vsyntax_code_object;

Richard M. Stallman's avatar
Richard M. Stallman committed
1003 1004

DEFUN ("char-syntax", Fchar_syntax, Schar_syntax, 1, 1, 0,
1005
       doc: /* Return the syntax code of CHARACTER, described by a character.
1006 1007
For example, if CHARACTER is a word constituent, the
character `w' (119) is returned.
1008 1009
The characters that correspond to various syntax codes
are listed in the documentation of `modify-syntax-entry'.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
1010
  (Lisp_Object character)
Richard M. Stallman's avatar
Richard M. Stallman committed
1011
{
1012
  int char_int;
1013
  CHECK_CHARACTER (character);
1014
  char_int = XINT (character);
1015
  SETUP_BUFFER_SYNTAX_TABLE ();
Paul Eggert's avatar
Paul Eggert committed
1016
  return make_number (syntax_code_spec[SYNTAX (char_int)]);
1017 1018 1019
}

DEFUN ("matching-paren", Fmatching_paren, Smatching_paren, 1, 1, 0,
1020
       doc: /* Return the matching parenthesis of CHARACTER, or nil if none.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
1021
  (Lisp_Object character)
1022
{
Paul Eggert's avatar
Paul Eggert committed
1023 1024 1025
  int char_int;
  enum syntaxcode code;
  CHECK_CHARACTER (character);
1026
  char_int = XINT (character);
1027
  SETUP_BUFFER_SYNTAX_TABLE ();
1028
  code = SYNTAX (char_int);
1029
  if (code == Sopen || code == Sclose)
1030
    return SYNTAX_MATCH (char_int);
1031
  return Qnil;
Richard M. Stallman's avatar
Richard M. Stallman committed
1032 1033
}

1034
DEFUN ("string-to-syntax", Fstring_to_syntax, Sstring_to_syntax, 1, 1, 0,
1035 1036 1037 1038 1039
       doc: /* Convert a syntax descriptor STRING into a raw syntax descriptor.
STRING should be a string of the form allowed as argument of
`modify-syntax-entry'.  The return value is a raw syntax descriptor: a
cons cell \(CODE . MATCHING-CHAR) which can be used, for example, as
the value of a `syntax-table' text property.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
1040
  (Lisp_Object string)
Richard M. Stallman's avatar
Richard M. Stallman committed
1041
{
Paul Eggert's avatar
Paul Eggert committed
1042
  const unsigned char *p;
1043
  int val;
1044
  Lisp_Object match;
Richard M. Stallman's avatar
Richard M. Stallman committed
1045

1046
  CHECK_STRING (string);
Richard M. Stallman's avatar
Richard M. Stallman committed
1047

1048
  p = SDATA (string);
Paul Eggert's avatar
Paul Eggert committed
1049 1050
  val = syntax_spec_code[*p++];
  if (val == 0377)
Juanma Barranquero's avatar