syntax.c 102 KB
Newer Older
Richard M. Stallman's avatar
Richard M. Stallman committed
1
/* GNU Emacs routines to deal with syntax tables; also word and list parsing.
2
   Copyright (C) 1985, 1987, 1993-1995, 1997-1999, 2001-2011
3
                 Free Software Foundation, Inc.
Richard M. Stallman's avatar
Richard M. Stallman committed
4 5 6

This file is part of GNU Emacs.

7
GNU Emacs is free software: you can redistribute it and/or modify
Richard M. Stallman's avatar
Richard M. Stallman committed
8
it under the terms of the GNU General Public License as published by
9 10
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Richard M. Stallman's avatar
Richard M. Stallman committed
11 12 13 14 15 16 17

GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
18
along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
19 20


21
#include <config.h>
Richard M. Stallman's avatar
Richard M. Stallman committed
22
#include <ctype.h>
23
#include <setjmp.h>
Richard M. Stallman's avatar
Richard M. Stallman committed
24 25 26
#include "lisp.h"
#include "commands.h"
#include "buffer.h"
27
#include "character.h"
Stefan Monnier's avatar
Stefan Monnier committed
28
#include "keymap.h"
Richard M. Stallman's avatar
Richard M. Stallman committed
29
#include "regex.h"
30 31 32 33

/* Make syntax table lookup grant data in gl_state.  */
#define SYNTAX_ENTRY_VIA_PROPERTY

Richard M. Stallman's avatar
Richard M. Stallman committed
34
#include "syntax.h"
35
#include "intervals.h"
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
#include "category.h"

/* Then there are seven single-bit flags that have the following meanings:
  1. This character is the first of a two-character comment-start sequence.
  2. This character is the second of a two-character comment-start sequence.
  3. This character is the first of a two-character comment-end sequence.
  4. This character is the second of a two-character comment-end sequence.
  5. This character is a prefix, for backward-prefix-chars.
  6. The char is part of a delimiter for comments of style "b".
  7. This character is part of a nestable comment sequence.
  8. The char is part of a delimiter for comments of style "c".
  Note that any two-character sequence whose first character has flag 1
  and whose second character has flag 2 will be interpreted as a comment start.

  bit 6 and 8 are used to discriminate between different comment styles.
  Languages such as C++ allow two orthogonal syntax start/end pairs
  and bit 6 is used to determine whether a comment-end or Scommentend
  ends style a or b.  Comment markers can start style a, b, c, or bc.
  Style a is always the default.
  For 2-char comment markers, the style b flag is only looked up on the second
  char of the comment marker and on the first char of the comment ender.
  For style c (like to for the nested flag), the flag can be placed on any
  one of the chars.
  */

/* These macros extract specific flags from an integer
   that holds the syntax code and the flags.  */

#define SYNTAX_FLAGS_COMSTART_FIRST(flags) (((flags) >> 16) & 1)

#define SYNTAX_FLAGS_COMSTART_SECOND(flags) (((flags) >> 17) & 1)

#define SYNTAX_FLAGS_COMEND_FIRST(flags) (((flags) >> 18) & 1)

#define SYNTAX_FLAGS_COMEND_SECOND(flags) (((flags) >> 19) & 1)

#define SYNTAX_FLAGS_PREFIX(flags) (((flags) >> 20) & 1)

#define SYNTAX_FLAGS_COMMENT_STYLEB(flags) (((flags) >> 21) & 1)
#define SYNTAX_FLAGS_COMMENT_STYLEC(flags) (((flags) >> 22) & 2)
/* FLAGS should be the flags of the main char of the comment marker, e.g.
   the second for comstart and the first for comend.  */
#define SYNTAX_FLAGS_COMMENT_STYLE(flags, other_flags) \
  (SYNTAX_FLAGS_COMMENT_STYLEB (flags) \
   | SYNTAX_FLAGS_COMMENT_STYLEC (flags) \
   | SYNTAX_FLAGS_COMMENT_STYLEC (other_flags))

#define SYNTAX_FLAGS_COMMENT_NESTED(flags) (((flags) >> 22) & 1)

/* These macros extract a particular flag for a given character.  */

#define SYNTAX_COMEND_FIRST(c) \
  (SYNTAX_FLAGS_COMEND_FIRST (SYNTAX_WITH_FLAGS (c)))
#define SYNTAX_PREFIX(c) (SYNTAX_FLAGS_PREFIX (SYNTAX_WITH_FLAGS (c)))
90 91 92 93 94 95 96

/* We use these constants in place for comment-style and
   string-ender-char to distinguish  comments/strings started by
   comment_fence and string_fence codes.  */

#define ST_COMMENT_STYLE (256 + 1)
#define ST_STRING_STYLE (256 + 2)
Richard M. Stallman's avatar
Richard M. Stallman committed
97

98 99
static Lisp_Object Qsyntax_table_p;
static Lisp_Object Qsyntax_table, Qscan_error;
Richard M. Stallman's avatar
Richard M. Stallman committed
100

101
#ifndef __GNUC__
102 103 104 105
/* Used as a temporary in SYNTAX_ENTRY and other macros in syntax.h,
   if not compiled with GCC.  No need to mark it, since it is used
   only very temporarily.  */
Lisp_Object syntax_temp;
106
#endif
107

108 109 110 111
/* This is the internal form of the parse state used in parse-partial-sexp.  */

struct lisp_parse_state
  {
112 113 114 115 116 117 118 119 120 121 122 123 124 125
    int depth;	   /* Depth at end of parsing.  */
    int instring;  /* -1 if not within string, else desired terminator.  */
    int incomment; /* -1 if in unnestable comment else comment nesting */
    int comstyle;  /* comment style a=0, or b=1, or ST_COMMENT_STYLE.  */
    int quoted;	   /* Nonzero if just after an escape char at end of parsing */
    int mindepth;  /* Minimum depth seen while scanning.  */
    /* Char number of most recent start-of-expression at current level */
    EMACS_INT thislevelstart;
    /* Char number of start of containing expression */
    EMACS_INT prevlevelstart;
    EMACS_INT location;	     /* Char number at which parsing stopped.  */
    EMACS_INT comstr_start;  /* Position of last comment/string starter.  */
    Lisp_Object levelstarts; /* Char numbers of starts-of-expression
				of levels (starting from outermost).  */
126 127
  };

128 129 130
/* These variables are a cache for finding the start of a defun.
   find_start_pos is the place for which the defun start was found.
   find_start_value is the defun start position found for it.
131
   find_start_value_byte is the corresponding byte position.
132 133 134 135
   find_start_buffer is the buffer it was found in.
   find_start_begv is the BEGV value when it was found.
   find_start_modiff is the value of MODIFF when it was found.  */

136 137 138
static EMACS_INT find_start_pos;
static EMACS_INT find_start_value;
static EMACS_INT find_start_value_byte;
139
static struct buffer *find_start_buffer;
140
static EMACS_INT find_start_begv;
141
static int find_start_modiff;
142 143


144
static Lisp_Object Fsyntax_table_p (Lisp_Object);
145 146 147 148 149 150 151
static Lisp_Object skip_chars (int, Lisp_Object, Lisp_Object, int);
static Lisp_Object skip_syntaxes (int, Lisp_Object, Lisp_Object);
static Lisp_Object scan_lists (EMACS_INT, EMACS_INT, EMACS_INT, int);
static void scan_sexps_forward (struct lisp_parse_state *,
                                EMACS_INT, EMACS_INT, EMACS_INT, int,
                                int, Lisp_Object, int);
static int in_classes (int, Lisp_Object);
152

153 154 155 156 157
/* Whether the syntax of the character C has the prefix flag set.  */
int syntax_prefix_flag_p (int c)
{
  return SYNTAX_PREFIX (c);
}
158 159 160 161 162 163

struct gl_state_s gl_state;		/* Global state of syntax parser.  */

#define INTERVALS_AT_ONCE 10		/* 1 + max-number of intervals
					   to scan to property-change.  */

164 165
/* Update gl_state to an appropriate interval which contains CHARPOS.  The
   sign of COUNT give the relative position of CHARPOS wrt the previously
166
   valid interval.  If INIT, only [be]_property fields of gl_state are
167
   valid at start, the rest is filled basing on OBJECT.
168

169
   `gl_state.*_i' are the intervals, and CHARPOS is further in the search
170 171
   direction than the intervals - or in an interval.  We update the
   current syntax-table basing on the property of this interval, and
172
   update the interval to start further than CHARPOS - or be
173
   NULL_INTERVAL.  We also update lim_property to be the next value of
174
   charpos to call this subroutine again - or be before/after the
175 176 177
   start/end of OBJECT.  */

void
178
update_syntax_table (EMACS_INT charpos, EMACS_INT count, int init,
179
		     Lisp_Object object)
180 181
{
  Lisp_Object tmp_table;
182 183
  unsigned cnt = 0;
  int invalidate = 1;
184
  INTERVAL i;
185 186 187

  if (init)
    {
188
      gl_state.old_prop = Qnil;
189 190
      gl_state.start = gl_state.b_property;
      gl_state.stop = gl_state.e_property;
191 192
      i = interval_of (charpos, object);
      gl_state.backward_i = gl_state.forward_i = i;
193 194 195
      invalidate = 0;
      if (NULL_INTERVAL_P (i))
	return;
196
      /* interval_of updates only ->position of the return value, so
197
	 update the parents manually to speed up update_interval.  */
198
      while (!NULL_PARENT (i))
199 200
	{
	  if (AM_RIGHT_CHILD (i))
201
	    INTERVAL_PARENT (i)->position = i->position
202
	      - LEFT_TOTAL_LENGTH (i) + TOTAL_LENGTH (i) /* right end */
203 204
	      - TOTAL_LENGTH (INTERVAL_PARENT (i))
	      + LEFT_TOTAL_LENGTH (INTERVAL_PARENT (i));
205
	  else
206
	    INTERVAL_PARENT (i)->position = i->position - LEFT_TOTAL_LENGTH (i)
207
	      + TOTAL_LENGTH (i);
208
	  i = INTERVAL_PARENT (i);
209 210
	}
      i = gl_state.forward_i;
211
      gl_state.b_property = i->position - gl_state.offset;
212
      gl_state.e_property = INTERVAL_LAST_POS (i) - gl_state.offset;
213 214
      goto update;
    }
215
  i = count > 0 ? gl_state.forward_i : gl_state.backward_i;
216

217
  /* We are guaranteed to be called with CHARPOS either in i,
218
     or further off.  */
219 220
  if (NULL_INTERVAL_P (i))
    error ("Error in syntax_table logic for to-the-end intervals");
221
  else if (charpos < i->position)		/* Move left.  */
222 223
    {
      if (count > 0)
224
	error ("Error in syntax_table logic for intervals <-");
225
      /* Update the interval.  */
226
      i = update_interval (i, charpos);
227
      if (INTERVAL_LAST_POS (i) != gl_state.b_property)
228 229 230
	{
	  invalidate = 0;
	  gl_state.forward_i = i;
231
	  gl_state.e_property = INTERVAL_LAST_POS (i) - gl_state.offset;
232
	}
233
    }
234
  else if (charpos >= INTERVAL_LAST_POS (i)) /* Move right.  */
235 236
    {
      if (count < 0)
237
	error ("Error in syntax_table logic for intervals ->");
238
      /* Update the interval.  */
239
      i = update_interval (i, charpos);
240
      if (i->position != gl_state.e_property)
241 242 243
	{
	  invalidate = 0;
	  gl_state.backward_i = i;
244
	  gl_state.b_property = i->position - gl_state.offset;
245 246 247 248 249 250 251 252
	}
    }

  update:
  tmp_table = textget (i->plist, Qsyntax_table);

  if (invalidate)
    invalidate = !EQ (tmp_table, gl_state.old_prop); /* Need to invalidate? */
Juanma Barranquero's avatar
Juanma Barranquero committed
253

254 255 256
  if (invalidate)		/* Did not get to adjacent interval.  */
    {				/* with the same table => */
				/* invalidate the old range.  */
257 258 259
      if (count > 0)
	{
	  gl_state.backward_i = i;
260 261 262
	  gl_state.b_property = i->position - gl_state.offset;
	}
      else
263
	{
264
	  gl_state.forward_i = i;
265
	  gl_state.e_property = INTERVAL_LAST_POS (i) - gl_state.offset;
266 267
	}
    }
268

269
  if (!EQ (tmp_table, gl_state.old_prop))
270
    {
271 272 273 274 275
      gl_state.current_syntax_table = tmp_table;
      gl_state.old_prop = tmp_table;
      if (EQ (Fsyntax_table_p (tmp_table), Qt))
	{
	  gl_state.use_global = 0;
Juanma Barranquero's avatar
Juanma Barranquero committed
276
	}
277 278 279 280 281
      else if (CONSP (tmp_table))
	{
	  gl_state.use_global = 1;
	  gl_state.global_code = tmp_table;
	}
Juanma Barranquero's avatar
Juanma Barranquero committed
282
      else
283 284
	{
	  gl_state.use_global = 0;
Tom Tromey's avatar
Tom Tromey committed
285
	  gl_state.current_syntax_table = BVAR (current_buffer, syntax_table);
286
	}
287 288 289 290 291 292 293
    }

  while (!NULL_INTERVAL_P (i))
    {
      if (cnt && !EQ (tmp_table, textget (i->plist, Qsyntax_table)))
	{
	  if (count > 0)
294 295 296 297 298 299
	    {
	      gl_state.e_property = i->position - gl_state.offset;
	      gl_state.forward_i = i;
	    }
	  else
	    {
300 301
	      gl_state.b_property
		= i->position + LENGTH (i) - gl_state.offset;
302 303 304
	      gl_state.backward_i = i;
	    }
	  return;
305
	}
Juanma Barranquero's avatar
Juanma Barranquero committed
306
      else if (cnt == INTERVALS_AT_ONCE)
307 308
	{
	  if (count > 0)
309
	    {
310 311 312 313 314 315
	      gl_state.e_property
		= i->position + LENGTH (i) - gl_state.offset
		/* e_property at EOB is not set to ZV but to ZV+1, so that
		   we can do INC(from);UPDATE_SYNTAX_TABLE_FORWARD without
		   having to check eob between the two.  */
		+ (NULL_INTERVAL_P (next_interval (i)) ? 1 : 0);
316 317 318 319 320 321 322 323
	      gl_state.forward_i = i;
	    }
	  else
	    {
	      gl_state.b_property = i->position - gl_state.offset;
	      gl_state.backward_i = i;
	    }
	  return;
324 325 326 327
	}
      cnt++;
      i = count > 0 ? next_interval (i) : previous_interval (i);
    }
328 329 330 331 332
  eassert (NULL_INTERVAL_P (i)); /* This property goes to the end.  */
  if (count > 0)
    gl_state.e_property = gl_state.stop;
  else
    gl_state.b_property = gl_state.start;
333 334
}

335 336 337
/* Returns TRUE if char at CHARPOS is quoted.
   Global syntax-table data should be set up already to be good at CHARPOS
   or after.  On return global syntax data is good for lookup at CHARPOS. */
338 339

static int
340
char_quoted (EMACS_INT charpos, EMACS_INT bytepos)
341 342
{
  register enum syntaxcode code;
343
  register EMACS_INT beg = BEGV;
344
  register int quoted = 0;
345
  EMACS_INT orig = charpos;
346

347
  while (charpos > beg)
348
    {
349
      int c;
350
      DEC_BOTH (charpos, bytepos);
351

352
      UPDATE_SYNTAX_TABLE_BACKWARD (charpos);
353
      c = FETCH_CHAR_AS_MULTIBYTE (bytepos);
354
      code = SYNTAX (c);
355 356 357 358
      if (! (code == Scharquote || code == Sescape))
	break;

      quoted = !quoted;
359
    }
360 361

  UPDATE_SYNTAX_TABLE (orig);
362 363
  return quoted;
}
364 365 366 367

/* Return the bytepos one character before BYTEPOS.
   We assume that BYTEPOS is not at the start of the buffer.  */

368
static INLINE EMACS_INT
369
dec_bytepos (EMACS_INT bytepos)
370
{
Tom Tromey's avatar
Tom Tromey committed
371
  if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
372 373
    return bytepos - 1;

374 375 376
  DEC_POS (bytepos);
  return bytepos;
}
377

Juanma Barranquero's avatar
Juanma Barranquero committed
378
/* Return a defun-start position before POS and not too far before.
Richard M. Stallman's avatar
Richard M. Stallman committed
379 380 381
   It should be the last one before POS, or nearly the last.

   When open_paren_in_column_0_is_defun_start is nonzero,
382
   only the beginning of the buffer is treated as a defun-start.
Richard M. Stallman's avatar
Richard M. Stallman committed
383 384 385 386

   We record the information about where the scan started
   and what its result was, so that another call in the same area
   can return the same value very quickly.
387 388 389 390

   There is no promise at which position the global syntax data is
   valid on return from the subroutine, so the caller should explicitly
   update the global data.  */
391

392
static EMACS_INT
393
find_defun_start (EMACS_INT pos, EMACS_INT pos_byte)
394
{
395
  EMACS_INT opoint = PT, opoint_byte = PT_BYTE;
396

397 398 399 400 401 402
  if (!open_paren_in_column_0_is_defun_start)
    {
      find_start_value_byte = BEGV_BYTE;
      return BEGV;
    }

403 404 405 406 407 408 409 410 411 412 413 414
  /* Use previous finding, if it's valid and applies to this inquiry.  */
  if (current_buffer == find_start_buffer
      /* Reuse the defun-start even if POS is a little farther on.
	 POS might be in the next defun, but that's ok.
	 Our value may not be the best possible, but will still be usable.  */
      && pos <= find_start_pos + 1000
      && pos >= find_start_value
      && BEGV == find_start_begv
      && MODIFF == find_start_modiff)
    return find_start_value;

  /* Back up to start of line.  */
415
  scan_newline (pos, pos_byte, BEGV, BEGV_BYTE, -1, 1);
416

417 418 419
  /* We optimize syntax-table lookup for rare updates.  Thus we accept
     only those `^\s(' which are good in global _and_ text-property
     syntax-tables.  */
420
  SETUP_BUFFER_SYNTAX_TABLE ();
421
  while (PT > BEGV)
422
    {
423 424
      int c;

425 426
      /* Open-paren at start of line means we may have found our
	 defun-start.  */
427
      c = FETCH_CHAR_AS_MULTIBYTE (PT_BYTE);
428
      if (SYNTAX (c) == Sopen)
429
	{
430
	  SETUP_SYNTAX_TABLE (PT + 1, -1);	/* Try again... */
431
	  c = FETCH_CHAR_AS_MULTIBYTE (PT_BYTE);
432
	  if (SYNTAX (c) == Sopen)
433 434
	    break;
	  /* Now fallback to the default value.  */
435
	  SETUP_BUFFER_SYNTAX_TABLE ();
436
	}
437 438
      /* Move to beg of previous line.  */
      scan_newline (PT, PT_BYTE, BEGV, BEGV_BYTE, -2, 1);
439 440 441
    }

  /* Record what we found, for the next try.  */
442 443
  find_start_value = PT;
  find_start_value_byte = PT_BYTE;
444 445 446 447 448
  find_start_buffer = current_buffer;
  find_start_modiff = MODIFF;
  find_start_begv = BEGV;
  find_start_pos = pos;

449 450
  TEMP_SET_PT_BOTH (opoint, opoint_byte);

451 452 453
  return find_start_value;
}

454 455 456
/* Return the SYNTAX_COMEND_FIRST of the character before POS, POS_BYTE.  */

static int
457
prev_char_comend_first (EMACS_INT pos, EMACS_INT pos_byte)
458 459 460 461 462 463 464 465 466 467 468 469 470
{
  int c, val;

  DEC_BOTH (pos, pos_byte);
  UPDATE_SYNTAX_TABLE_BACKWARD (pos);
  c = FETCH_CHAR (pos_byte);
  val = SYNTAX_COMEND_FIRST (c);
  UPDATE_SYNTAX_TABLE_FORWARD (pos + 1);
  return val;
}

/* Return the SYNTAX_COMSTART_FIRST of the character before POS, POS_BYTE.  */

471 472 473 474 475
/* static int
 * prev_char_comstart_first (pos, pos_byte)
 *      int pos, pos_byte;
 * {
 *   int c, val;
Juanma Barranquero's avatar
Juanma Barranquero committed
476
 *
477 478 479 480 481 482 483
 *   DEC_BOTH (pos, pos_byte);
 *   UPDATE_SYNTAX_TABLE_BACKWARD (pos);
 *   c = FETCH_CHAR (pos_byte);
 *   val = SYNTAX_COMSTART_FIRST (c);
 *   UPDATE_SYNTAX_TABLE_FORWARD (pos + 1);
 *   return val;
 * } */
484

485 486 487 488 489 490 491 492 493
/* Checks whether charpos FROM is at the end of a comment.
   FROM_BYTE is the bytepos corresponding to FROM.
   Do not move back before STOP.

   Return a positive value if we find a comment ending at FROM/FROM_BYTE;
   return -1 otherwise.

   If successful, store the charpos of the comment's beginning
   into *CHARPOS_PTR, and the bytepos into *BYTEPOS_PTR.
494 495 496

   Global syntax data remains valid for backward search starting at
   the returned value (or at FROM, if the search was not successful).  */
497 498

static int
499
back_comment (EMACS_INT from, EMACS_INT from_byte, EMACS_INT stop, int comnested, int comstyle, EMACS_INT *charpos_ptr, EMACS_INT *bytepos_ptr)
500 501 502 503 504 505 506 507 508 509
{
  /* Look back, counting the parity of string-quotes,
     and recording the comment-starters seen.
     When we reach a safe place, assume that's not in a string;
     then step the main scan to the earliest comment-starter seen
     an even number of string quotes away from the safe place.

     OFROM[I] is position of the earliest comment-starter seen
     which is I+2X quotes from the comment-end.
     PARITY is current parity of quotes from the comment end.  */
510
  int string_style = -1;	/* Presumed outside of any string. */
511
  int string_lossage = 0;
512
  /* Not a real lossage: indicates that we have passed a matching comment
Juanma Barranquero's avatar
Juanma Barranquero committed
513
     starter plus a non-matching comment-ender, meaning that any matching
514 515 516 517
     comment-starter we might see later could be a false positive (hidden
     inside another comment).
     Test case:  { a (* b } c (* d *) */
  int comment_lossage = 0;
518 519 520
  EMACS_INT comment_end = from;
  EMACS_INT comment_end_byte = from_byte;
  EMACS_INT comstart_pos = 0;
521
  EMACS_INT comstart_byte IF_LINT (= 0);
522 523
  /* Place where the containing defun starts,
     or 0 if we didn't come across it yet.  */
524 525
  EMACS_INT defun_start = 0;
  EMACS_INT defun_start_byte = 0;
526
  register enum syntaxcode code;
527
  int nesting = 1;		/* current comment nesting */
528
  int c;
529 530 531 532 533
  int syntax = 0;

  /* FIXME: A }} comment-ender style leads to incorrect behavior
     in the case of {{ c }}} because we ignore the last two chars which are
     assumed to be comment-enders although they aren't.  */
534 535 536 537 538

  /* At beginning of range to scan, we're outside of strings;
     that determines quote parity to the comment-end.  */
  while (from != stop)
    {
539 540
      EMACS_INT temp_byte;
      int prev_syntax, com2start, com2end;
541
      int comstart;
542

543
      /* Move back and examine a character.  */
544
      DEC_BOTH (from, from_byte);
545 546
      UPDATE_SYNTAX_TABLE_BACKWARD (from);

547
      prev_syntax = syntax;
548
      c = FETCH_CHAR_AS_MULTIBYTE (from_byte);
549
      syntax = SYNTAX_WITH_FLAGS (c);
550 551
      code = SYNTAX (c);

552 553 554
      /* Check for 2-char comment markers.  */
      com2start = (SYNTAX_FLAGS_COMSTART_FIRST (syntax)
		   && SYNTAX_FLAGS_COMSTART_SECOND (prev_syntax)
555 556
		   && (comstyle
		       == SYNTAX_FLAGS_COMMENT_STYLE (prev_syntax, syntax))
557 558 559 560
		   && (SYNTAX_FLAGS_COMMENT_NESTED (prev_syntax)
		       || SYNTAX_FLAGS_COMMENT_NESTED (syntax)) == comnested);
      com2end = (SYNTAX_FLAGS_COMEND_FIRST (syntax)
		 && SYNTAX_FLAGS_COMEND_SECOND (prev_syntax));
561
      comstart = (com2start || code == Scomment);
562

563 564 565 566 567 568 569 570 571 572
      /* Nasty cases with overlapping 2-char comment markers:
	 - snmp-mode: -- c -- foo -- c --
	              --- c --
		      ------ c --
	 - c-mode:    *||*
		      |* *|* *|
		      |*| |* |*|
		      ///   */

      /* If a 2-char comment sequence partly overlaps with another,
573 574 575
	 we don't try to be clever.  E.g. |*| in C, or }% in modes that
	 have %..\n and %{..}%.  */
      if (from > stop && (com2end || comstart))
576
	{
577 578
	  EMACS_INT next = from, next_byte = from_byte;
	  int next_c, next_syntax;
579 580
	  DEC_BOTH (next, next_byte);
	  UPDATE_SYNTAX_TABLE_BACKWARD (next);
581
	  next_c = FETCH_CHAR_AS_MULTIBYTE (next_byte);
582
	  next_syntax = SYNTAX_WITH_FLAGS (next_c);
583
	  if (((comstart || comnested)
584 585 586 587
	       && SYNTAX_FLAGS_COMEND_SECOND (syntax)
	       && SYNTAX_FLAGS_COMEND_FIRST (next_syntax))
	      || ((com2end || comnested)
		  && SYNTAX_FLAGS_COMSTART_SECOND (syntax)
588 589
		  && (comstyle
		      == SYNTAX_FLAGS_COMMENT_STYLE (syntax, prev_syntax))
590 591 592
		  && SYNTAX_FLAGS_COMSTART_FIRST (next_syntax)))
	    goto lossage;
	  /* UPDATE_SYNTAX_TABLE_FORWARD (next + 1); */
593
	}
594 595 596 597 598 599 600 601 602 603 604 605 606 607 608

      if (com2start && comstart_pos == 0)
	/* We're looking at a comment starter.  But it might be a comment
	   ender as well (see snmp-mode).  The first time we see one, we
	   need to consider it as a comment starter,
	   and the subsequent times as a comment ender.  */
	com2end = 0;

      /* Turn a 2-char comment sequences into the appropriate syntax.  */
      if (com2end)
	code = Sendcomment;
      else if (com2start)
	code = Scomment;
      /* Ignore comment starters of a different style.  */
      else if (code == Scomment
609
	       && (comstyle != SYNTAX_FLAGS_COMMENT_STYLE (syntax, 0)
610
		   || SYNTAX_FLAGS_COMMENT_NESTED (syntax) != comnested))
611
	continue;
612

613 614
      /* Ignore escaped characters, except comment-enders.  */
      if (code != Sendcomment && char_quoted (from, from_byte))
615 616
	continue;

617
      switch (code)
618
	{
619 620 621 622 623 624 625 626 627 628 629 630 631 632
	case Sstring_fence:
	case Scomment_fence:
	  c = (code == Sstring_fence ? ST_STRING_STYLE : ST_COMMENT_STYLE);
	case Sstring:
	  /* Track parity of quotes.  */
	  if (string_style == -1)
	    /* Entering a string.  */
	    string_style = c;
	  else if (string_style == c)
	    /* Leaving the string.  */
	    string_style = -1;
	  else
	    /* If we have two kinds of string delimiters.
	       There's no way to grok this scanning backwards.  */
633
	    string_lossage = 1;
634
	  break;
Juanma Barranquero's avatar
Juanma Barranquero committed
635

636 637
	case Scomment:
	  /* We've already checked that it is the relevant comstyle.  */
638
	  if (string_style != -1 || comment_lossage || string_lossage)
639 640 641 642
	    /* There are odd string quotes involved, so let's be careful.
	       Test case in Pascal: " { " a { " } */
	    goto lossage;

643 644 645 646 647 648 649
	  if (!comnested)
	    {
	      /* Record best comment-starter so far.  */
	      comstart_pos = from;
	      comstart_byte = from_byte;
	    }
	  else if (--nesting <= 0)
650 651 652 653 654 655
	    /* nested comments have to be balanced, so we don't need to
	       keep looking for earlier ones.  We use here the same (slightly
	       incorrect) reasoning as below:  since it is followed by uniform
	       paired string quotes, this comment-start has to be outside of
	       strings, else the comment-end itself would be inside a string. */
	    goto done;
656 657
	  break;

658
	case Sendcomment:
659
	  if (SYNTAX_FLAGS_COMMENT_STYLE (syntax, 0) == comstyle
660
	      && ((com2end && SYNTAX_FLAGS_COMMENT_NESTED (prev_syntax))
661
		  || SYNTAX_FLAGS_COMMENT_NESTED (syntax)) == comnested)
662 663 664 665 666 667 668 669 670
	    /* This is the same style of comment ender as ours. */
	    {
	      if (comnested)
		nesting++;
	      else
		/* Anything before that can't count because it would match
		   this comment-ender rather than ours.  */
		from = stop;	/* Break out of the loop.  */
	    }
671 672 673 674 675 676 677 678 679
	  else if (comstart_pos != 0 || c != '\n')
	    /* We're mixing comment styles here, so we'd better be careful.
	       The (comstart_pos != 0 || c != '\n') check is not quite correct
	       (we should just always set comment_lossage), but removing it
	       would imply that any multiline comment in C would go through
	       lossage, which seems overkill.
	       The failure should only happen in the rare cases such as
	         { (* } *)   */
	    comment_lossage = 1;
680
	  break;
681

682 683 684 685 686 687 688 689 690 691 692
	case Sopen:
	  /* Assume a defun-start point is outside of strings.  */
	  if (open_paren_in_column_0_is_defun_start
	      && (from == stop
		  || (temp_byte = dec_bytepos (from_byte),
		      FETCH_CHAR (temp_byte) == '\n')))
	    {
	      defun_start = from;
	      defun_start_byte = from_byte;
	      from = stop;	/* Break out of the loop.  */
	    }
693
	  break;
694 695

	default:
696
	  break;
697
	}
698 699 700 701 702
    }

  if (comstart_pos == 0)
    {
      from = comment_end;
703
      from_byte = comment_end_byte;
704 705
      UPDATE_SYNTAX_TABLE_FORWARD (comment_end - 1);
    }
706 707 708
  /* If comstart_pos is set and we get here (ie. didn't jump to `lossage'
     or `done'), then we've found the beginning of the non-nested comment.  */
  else if (1)	/* !comnested */
709 710
    {
      from = comstart_pos;
711
      from_byte = comstart_byte;
712
      UPDATE_SYNTAX_TABLE_FORWARD (from - 1);
713 714 715
    }
  else
    {
716 717
      struct lisp_parse_state state;
    lossage:
718 719
      /* We had two kinds of string delimiters mixed up
	 together.  Decode this going forwards.
720
	 Scan fwd from a known safe place (beginning-of-defun)
721 722
	 to the one in question; this records where we
	 last passed a comment starter.  */
723 724 725 726 727 728
      /* If we did not already find the defun start, find it now.  */
      if (defun_start == 0)
	{
	  defun_start = find_defun_start (comment_end, comment_end_byte);
	  defun_start_byte = find_start_value_byte;
	}
729
      do
730
	{
731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752
	  scan_sexps_forward (&state,
			      defun_start, defun_start_byte,
			      comment_end, -10000, 0, Qnil, 0);
	  defun_start = comment_end;
	  if (state.incomment == (comnested ? 1 : -1)
	      && state.comstyle == comstyle)
	    from = state.comstr_start;
	  else
	    {
	      from = comment_end;
	      if (state.incomment)
		/* If comment_end is inside some other comment, maybe ours
		   is nested, so we need to try again from within the
		   surrounding comment.  Example: { a (* " *)  */
		{
		  /* FIXME: We should advance by one or two chars. */
		  defun_start = state.comstr_start + 2;
		  defun_start_byte = CHAR_TO_BYTE (defun_start);
		}
	    }
	} while (defun_start < comment_end);

753
      from_byte = CHAR_TO_BYTE (from);
754 755
      UPDATE_SYNTAX_TABLE_FORWARD (from - 1);
    }
Juanma Barranquero's avatar
Juanma Barranquero committed
756

757
 done:
758 759 760
  *charpos_ptr = from;
  *bytepos_ptr = from_byte;

761
  return (from == comment_end) ? -1 : from;
762 763
}

Richard M. Stallman's avatar
Richard M. Stallman committed
764
DEFUN ("syntax-table-p", Fsyntax_table_p, Ssyntax_table_p, 1, 1, 0,
765 766
       doc: /* Return t if OBJECT is a syntax table.
Currently, any char-table counts as a syntax table.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
767
  (Lisp_Object object)
Richard M. Stallman's avatar
Richard M. Stallman committed
768
{
769
  if (CHAR_TABLE_P (object)
770
      && EQ (XCHAR_TABLE (object)->purpose, Qsyntax_table))
Richard M. Stallman's avatar
Richard M. Stallman committed
771 772 773 774
    return Qt;
  return Qnil;
}

775
static void
776
check_syntax_table (Lisp_Object obj)
Richard M. Stallman's avatar
Richard M. Stallman committed
777
{
778 779
  CHECK_TYPE (CHAR_TABLE_P (obj) && EQ (XCHAR_TABLE (obj)->purpose, Qsyntax_table),
	      Qsyntax_table_p, obj);
Juanma Barranquero's avatar
Juanma Barranquero committed
780
}
Richard M. Stallman's avatar
Richard M. Stallman committed
781 782

DEFUN ("syntax-table", Fsyntax_table, Ssyntax_table, 0, 0, 0,
783 784
       doc: /* Return the current syntax table.
This is the one specified by the current buffer.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
785
  (void)
Richard M. Stallman's avatar
Richard M. Stallman committed
786
{
Tom Tromey's avatar
Tom Tromey committed
787
  return BVAR (current_buffer, syntax_table);
Richard M. Stallman's avatar
Richard M. Stallman committed
788 789 790 791
}

DEFUN ("standard-syntax-table", Fstandard_syntax_table,
   Sstandard_syntax_table, 0, 0, 0,
792 793
       doc: /* Return the standard syntax table.
This is the one used for new buffers.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
794
  (void)
Richard M. Stallman's avatar
Richard M. Stallman committed
795 796 797 798 799
{
  return Vstandard_syntax_table;
}

DEFUN ("copy-syntax-table", Fcopy_syntax_table, Scopy_syntax_table, 0, 1, 0,
800 801
       doc: /* Construct a new syntax table and return it.
It is a copy of the TABLE, which defaults to the standard syntax table.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
802
  (Lisp_Object table)
Richard M. Stallman's avatar
Richard M. Stallman committed
803
{
804 805
  Lisp_Object copy;

Jim Blandy's avatar
Jim Blandy committed
806
  if (!NILP (table))
807 808 809 810 811
    check_syntax_table (table);
  else
    table = Vstandard_syntax_table;

  copy = Fcopy_sequence (table);
812 813 814 815 816 817 818 819 820 821

  /* Only the standard syntax table should have a default element.
     Other syntax tables should inherit from parents instead.  */
  XCHAR_TABLE (copy)->defalt = Qnil;

  /* Copied syntax tables should all have parents.
     If we copied one with no parent, such as the standard syntax table,
     use the standard syntax table as the copy's parent.  */
  if (NILP (XCHAR_TABLE (copy)->parent))
    Fset_char_table_parent (copy, Vstandard_syntax_table);
822
  return copy;
Richard M. Stallman's avatar
Richard M. Stallman committed
823 824 825
}

DEFUN ("set-syntax-table", Fset_syntax_table, Sset_syntax_table, 1, 1, 0,
826 827
       doc: /* Select a new syntax table for the current buffer.
One argument, a syntax table.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
828
  (Lisp_Object table)
Richard M. Stallman's avatar
Richard M. Stallman committed
829
{
830
  int idx;
831
  check_syntax_table (table);
Tom Tromey's avatar
Tom Tromey committed
832
  BVAR (current_buffer, syntax_table) = table;
Richard M. Stallman's avatar
Richard M. Stallman committed
833
  /* Indicate that this buffer now has a specified syntax table.  */
Gerd Moellmann's avatar
Gerd Moellmann committed
834 835
  idx = PER_BUFFER_VAR_IDX (syntax_table);
  SET_PER_BUFFER_VALUE_P (current_buffer, idx, 1);
Richard M. Stallman's avatar
Richard M. Stallman committed
836 837 838 839 840
  return table;
}

/* Convert a letter which signifies a syntax code
 into the code it signifies.
841
 This is used by modify-syntax-entry, and other things.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
842 843 844 845 846 847

unsigned char syntax_spec_code[0400] =
  { 0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
848
    (char) Swhitespace, (char) Scomment_fence, (char) Sstring, 0377,
Richard M. Stallman's avatar
Richard M. Stallman committed
849 850 851 852 853 854
        (char) Smath, 0377, 0377, (char) Squote,
    (char) Sopen, (char) Sclose, 0377, 0377,
	0377, (char) Swhitespace, (char) Spunct, (char) Scharquote,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377,
	(char) Scomment, 0377, (char) Sendcomment, 0377,
855
    (char) Sinherit, 0377, 0377, 0377, 0377, 0377, 0377, 0377,   /* @, A ... */
Richard M. Stallman's avatar
Richard M. Stallman committed
856 857 858 859 860 861
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, (char) Sword,
    0377, 0377, 0377, 0377, (char) Sescape, 0377, 0377, (char) Ssymbol,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,   /* `, a, ... */
    0377, 0377, 0377, 0377, 0377, 0377, 0377, 0377,
    0377, 0377, 0377, 0377, 0377, 0377, 0377, (char) Sword,
862
    0377, 0377, 0377, 0377, (char) Sstring_fence, 0377, 0377, 0377
Richard M. Stallman's avatar
Richard M. Stallman committed
863 864
  };

865
/* Indexed by syntax code, give the letter that describes it.  */
Richard M. Stallman's avatar
Richard M. Stallman committed
866

867
char syntax_code_spec[16] =
Richard M. Stallman's avatar
Richard M. Stallman committed
868
  {
869 870
    ' ', '.', 'w', '_', '(', ')', '\'', '\"', '$', '\\', '/', '<', '>', '@',
    '!', '|'
Richard M. Stallman's avatar
Richard M. Stallman committed
871
  };
Karl Heuer's avatar
Karl Heuer committed
872 873 874 875 876 877 878 879

/* Indexed by syntax code, give the object (cons of syntax code and
   nil) to be stored in syntax table.  Since these objects can be
   shared among syntax tables, we generate them in advance.  By
   sharing objects, the function `describe-syntax' can give a more
   compact listing.  */
static Lisp_Object Vsyntax_code_object;

Richard M. Stallman's avatar
Richard M. Stallman committed
880 881

DEFUN ("char-syntax", Fchar_syntax, Schar_syntax, 1, 1, 0,
882
       doc: /* Return the syntax code of CHARACTER, described by a character.
883 884
For example, if CHARACTER is a word constituent, the
character `w' (119) is returned.
885 886
The characters that correspond to various syntax codes
are listed in the documentation of `modify-syntax-entry'.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
887
  (Lisp_Object character)
Richard M. Stallman's avatar
Richard M. Stallman committed
888
{
889
  int char_int;
890
  CHECK_CHARACTER (character);
891
  char_int = XINT (character);
892
  SETUP_BUFFER_SYNTAX_TABLE ();
893
  return make_number (syntax_code_spec[(int) SYNTAX (char_int)]);
894 895 896
}

DEFUN ("matching-paren", Fmatching_paren, Smatching_paren, 1, 1, 0,
897
       doc: /* Return the matching parenthesis of CHARACTER, or nil if none.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
898
  (Lisp_Object character)
899
{
900
  int char_int, code;
901
  CHECK_NUMBER (character);
902
  char_int = XINT (character);
903
  SETUP_BUFFER_SYNTAX_TABLE ();
904
  code = SYNTAX (char_int);
905
  if (code == Sopen || code == Sclose)
906
    return SYNTAX_MATCH (char_int);
907
  return Qnil;
Richard M. Stallman's avatar
Richard M. Stallman committed
908 909
}

910
DEFUN ("string-to-syntax", Fstring_to_syntax, Sstring_to_syntax, 1, 1, 0,
911 912 913
       doc: /* Convert a syntax specification STRING into syntax cell form.
STRING should be a string as it is allowed as argument of
`modify-syntax-entry'.  Value is the equivalent cons cell
914
\(CODE . MATCHING-CHAR) that can be used as value of a `syntax-table'
915
text property.  */)
Dan Nicolaescu's avatar
Dan Nicolaescu committed
916
  (Lisp_Object string)
Richard M. Stallman's avatar
Richard M. Stallman committed
917
{
918
  register const unsigned char *p;
Richard M. Stallman's avatar
Richard M. Stallman committed
919
  register enum syntaxcode code;
920
  int val;
921
  Lisp_Object match;
Richard M. Stallman's avatar
Richard M. Stallman committed
922

923
  CHECK_STRING (string);
Richard M. Stallman's avatar
Richard M. Stallman committed
924

925
  p = SDATA (string);
Richard M. Stallman's avatar
Richard M. Stallman committed
926 927
  code = (enum syntaxcode) syntax_spec_code[*p++];
  if (((int) code & 0377) == 0377)
928
    error ("Invalid syntax description letter: %c", p[-1]);
Richard M. Stallman's avatar
Richard M. Stallman committed
929

930
  if (code == Sinherit)
931
    return Qnil;
932 933

  if (*p)
934
    {
Karl Heuer's avatar
Karl Heuer committed
935
      int len;
936
      int character = STRING_CHAR_AND_LENGTH (p, len);
Karl Heuer's avatar
Karl Heuer committed
937
      XSETINT (match, character);
938 939
      if (XFASTINT (match) == ' ')
	match = Qnil;
Karl Heuer's avatar
Karl Heuer committed
940
      p += len;
941 942
    }
  else
943
    match = Qnil;
Richard M. Stallman's avatar
Richard M. Stallman committed
944

945
  val = (int) code;
Richard M. Stallman's avatar
Richard M. Stallman committed
946 947 948 949
  while (*p)
    switch (*p++)
      {
      case '1':
950
	val |= 1 << 16;
Richard M. Stallman's avatar
Richard M. Stallman committed
951 952 953
	break;

      case '2':
954
	val |= 1 << 17;
Richard M. Stallman's avatar
Richard M. Stallman committed
955 956 957
	break;

      case '3':
958
	val |= 1 << 18;
Richard M. Stallman's avatar
Richard M. Stallman committed
959 960 961
	break;

      case '4':
962
	val |= 1 << 19;
Richard M. Stallman's avatar
Richard M. Stallman committed