Commit f4646fff authored by Andreas Schwab
Browse files

(boyer_moore): Use zero as marker value for a possible
match instead of depending on overflow behavior.
parent 05fcb8da
2009-04-16 Andreas Schwab <schwab@linux-m68k.org>
* search.c (boyer_moore): Use zero as marker value for a possible
match instead of depending on overflow behavior.
2009-04-16 Chong Yidong <cyd@stupidchicken.com>
   
* keyboard.c (adjust_point_for_property): Disable 2009-02-12
......
...@@ -1729,9 +1729,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -1729,9 +1729,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
{ {
int direction = ((n > 0) ? 1 : -1); int direction = ((n > 0) ? 1 : -1);
register int dirlen; register int dirlen;
int infinity, limit, stride_for_teases = 0; int limit, stride_for_teases = 0;
register int *BM_tab; int BM_tab[0400];
int *BM_tab_base;
register unsigned char *cursor, *p_limit; register unsigned char *cursor, *p_limit;
register int i, j; register int i, j;
unsigned char *pat, *pat_end; unsigned char *pat, *pat_end;
...@@ -1747,37 +1746,28 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -1747,37 +1746,28 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
int translate_prev_byte3 = 0; int translate_prev_byte3 = 0;
int translate_prev_byte4 = 0; int translate_prev_byte4 = 0;
BM_tab = (int *) alloca (0400 * sizeof (int)); /* The general approach is that we are going to maintain that we know
the first (closest to the present position, in whatever direction
/* The general approach is that we are going to maintain that we know */ we're searching) character that could possibly be the last
/* the first (closest to the present position, in whatever direction */ (furthest from present position) character of a valid match. We
/* we're searching) character that could possibly be the last */ advance the state of our knowledge by looking at that character
/* (furthest from present position) character of a valid match. We */ and seeing whether it indeed matches the last character of the
/* advance the state of our knowledge by looking at that character */ pattern. If it does, we take a closer look. If it does not, we
/* and seeing whether it indeed matches the last character of the */ move our pointer (to putative last characters) as far as is
/* pattern. If it does, we take a closer look. If it does not, we */ logically possible. This amount of movement, which I call a
/* move our pointer (to putative last characters) as far as is */ stride, will be the length of the pattern if the actual character
/* logically possible. This amount of movement, which I call a */ appears nowhere in the pattern, otherwise it will be the distance
/* stride, will be the length of the pattern if the actual character */ from the last occurrence of that character to the end of the
/* appears nowhere in the pattern, otherwise it will be the distance */ pattern. If the amount is zero we have a possible match. */
/* from the last occurrence of that character to the end of the */
/* pattern. */ /* Here we make a "mickey mouse" BM table. The stride of the search
/* As a coding trick, an enormous stride is coded into the table for */ is determined only by the last character of the putative match.
/* characters that match the last character. This allows use of only */ If that character does not match, we will stride the proper
/* a single test, a test for having gone past the end of the */ distance to propose a match that superimposes it on the last
/* permissible match region, to test for both possible matches (when */ instance of a character that matches it (per trt), or misses
/* the stride goes past the end immediately) and failure to */ it entirely if there is none. */
/* match (where you get nudged past the end one stride at a time). */
/* Here we make a "mickey mouse" BM table. The stride of the search */
/* is determined only by the last character of the putative match. */
/* If that character does not match, we will stride the proper */
/* distance to propose a match that superimposes it on the last */
/* instance of a character that matches it (per trt), or misses */
/* it entirely if there is none. */
dirlen = len_byte * direction; dirlen = len_byte * direction;
infinity = dirlen - (lim_byte + pos_byte + len_byte + len_byte) * direction;
/* Record position after the end of the pattern. */ /* Record position after the end of the pattern. */
pat_end = base_pat + len_byte; pat_end = base_pat + len_byte;
...@@ -1787,23 +1777,14 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -1787,23 +1777,14 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
if (direction < 0) if (direction < 0)
base_pat = pat_end - 1; base_pat = pat_end - 1;
BM_tab_base = BM_tab; /* A character that does not appear in the pattern induces a
BM_tab += 0400; stride equal to the pattern length. */
j = dirlen; /* to get it in a register */ for (i = 0; i < 0400; i++)
/* A character that does not appear in the pattern induces a */ BM_tab[i] = dirlen;
/* stride equal to the pattern length. */
while (BM_tab_base != BM_tab)
{
*--BM_tab = j;
*--BM_tab = j;
*--BM_tab = j;
*--BM_tab = j;
}
/* We use this for translation, instead of TRT itself. /* We use this for translation, instead of TRT itself.
We fill this in to handle the characters that actually We fill this in to handle the characters that actually
occur in the pattern. Others don't matter anyway! */ occur in the pattern. Others don't matter anyway! */
bzero (simple_translate, sizeof simple_translate);
for (i = 0; i < 0400; i++) for (i = 0; i < 0400; i++)
simple_translate[i] = i; simple_translate[i] = i;
...@@ -1828,12 +1809,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -1828,12 +1809,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
} }
i = 0; i = 0;
while (i != infinity) while (i != dirlen)
{ {
unsigned char *ptr = base_pat + i; unsigned char *ptr = base_pat + i;
i += direction; i += direction;
if (i == dirlen)
i = infinity;
if (! NILP (trt)) if (! NILP (trt))
{ {
/* If the byte currently looking at is the last of a /* If the byte currently looking at is the last of a
...@@ -1861,7 +1840,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -1861,7 +1840,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
else else
j = *ptr; j = *ptr;
if (i == infinity) if (i == dirlen)
stride_for_teases = BM_tab[j]; stride_for_teases = BM_tab[j];
BM_tab[j] = dirlen - i; BM_tab[j] = dirlen - i;
...@@ -1894,17 +1873,16 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -1894,17 +1873,16 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
{ {
j = *ptr; j = *ptr;
if (i == infinity) if (i == dirlen)
stride_for_teases = BM_tab[j]; stride_for_teases = BM_tab[j];
BM_tab[j] = dirlen - i; BM_tab[j] = dirlen - i;
} }
/* stride_for_teases tells how much to stride if we get a */ /* stride_for_teases tells how much to stride if we get a
/* match on the far character but are subsequently */ match on the far character but are subsequently
/* disappointed, by recording what the stride would have been */ disappointed, by recording what the stride would have been
/* for that character if the last character had been */ for that character if the last character had been
/* different. */ different. */
} }
infinity = dirlen - infinity;
pos_byte += dirlen - ((direction > 0) ? direction : 0); pos_byte += dirlen - ((direction > 0) ? direction : 0);
/* loop invariant - POS_BYTE points at where last char (first /* loop invariant - POS_BYTE points at where last char (first
char if reverse) of pattern would align in a possible match. */ char if reverse) of pattern would align in a possible match. */
...@@ -1948,43 +1926,34 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -1948,43 +1926,34 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
p_limit = BYTE_POS_ADDR (limit); p_limit = BYTE_POS_ADDR (limit);
p2 = (cursor = BYTE_POS_ADDR (pos_byte)); p2 = (cursor = BYTE_POS_ADDR (pos_byte));
/* In this loop, pos + cursor - p2 is the surrogate for pos */ /* In this loop, pos + cursor - p2 is the surrogate for pos. */
while (1) /* use one cursor setting as long as i can */ while (1) /* use one cursor setting as long as i can */
{ {
if (direction > 0) /* worth duplicating */ if (direction > 0) /* worth duplicating */
{ {
/* Use signed comparison if appropriate while (cursor <= p_limit)
to make cursor+infinity sure to be > p_limit. {
Assuming that the buffer lies in a range of addresses if (BM_tab[*cursor] == 0)
that are all "positive" (as ints) or all "negative", goto hit;
either kind of comparison will work as long
as we don't step by infinity. So pick the kind
that works when we do step by infinity. */
if ((EMACS_INT) (p_limit + infinity) > (EMACS_INT) p_limit)
while ((EMACS_INT) cursor <= (EMACS_INT) p_limit)
cursor += BM_tab[*cursor];
else
while ((EMACS_UINT) cursor <= (EMACS_UINT) p_limit)
cursor += BM_tab[*cursor]; cursor += BM_tab[*cursor];
}
} }
else else
{ {
if ((EMACS_INT) (p_limit + infinity) < (EMACS_INT) p_limit) while (cursor >= p_limit)
while ((EMACS_INT) cursor >= (EMACS_INT) p_limit) {
cursor += BM_tab[*cursor]; if (BM_tab[*cursor] == 0)
else goto hit;
while ((EMACS_UINT) cursor >= (EMACS_UINT) p_limit)
cursor += BM_tab[*cursor]; cursor += BM_tab[*cursor];
}
} }
/* If you are here, cursor is beyond the end of the searched region. */ /* If you are here, cursor is beyond the end of the
/* This can happen if you match on the far character of the pattern, */ searched region. You fail to match within the
/* because the "stride" of that character is infinity, a number able */ permitted region and would otherwise try a character
/* to throw you well beyond the end of the search. It can also */ beyond that region. */
/* happen if you fail to match within the permitted region and would */ break;
/* otherwise try a character beyond that region */
if ((cursor - p_limit) * direction <= len_byte) hit:
break; /* a small overrun is genuine */
cursor -= infinity; /* large overrun = hit */
i = dirlen - direction; i = dirlen - direction;
if (! NILP (trt)) if (! NILP (trt))
{ {
...@@ -2056,8 +2025,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -2056,8 +2025,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
pos_byte += cursor - p2; pos_byte += cursor - p2;
} }
else else
/* Now we'll pick up a clump that has to be done the hard */ /* Now we'll pick up a clump that has to be done the hard
/* way because it covers a discontinuity */ way because it covers a discontinuity. */
{ {
limit = ((direction > 0) limit = ((direction > 0)
? BUFFER_CEILING_OF (pos_byte - dirlen + 1) ? BUFFER_CEILING_OF (pos_byte - dirlen + 1)
...@@ -2069,19 +2038,21 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -2069,19 +2038,21 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
and still be valid for a possible match. */ and still be valid for a possible match. */
while (1) while (1)
{ {
/* This loop can be coded for space rather than */ /* This loop can be coded for space rather than
/* speed because it will usually run only once. */ speed because it will usually run only once.
/* (the reach is at most len + 21, and typically */ (the reach is at most len + 21, and typically
/* does not exceed len) */ does not exceed len). */
while ((limit - pos_byte) * direction >= 0) while ((limit - pos_byte) * direction >= 0)
pos_byte += BM_tab[FETCH_BYTE (pos_byte)]; {
/* now run the same tests to distinguish going off the */ int ch = FETCH_BYTE (pos_byte);
/* end, a match or a phony match. */ if (BM_tab[ch] == 0)
if ((pos_byte - limit) * direction <= len_byte) goto hit2;
break; /* ran off the end */ pos_byte += BM_tab[ch];
/* Found what might be a match. }
Set POS_BYTE back to last (first if reverse) pos. */ break; /* ran off the end */
pos_byte -= infinity;
hit2:
/* Found what might be a match. */
i = dirlen - direction; i = dirlen - direction;
while ((i -= direction) + direction != 0) while ((i -= direction) + direction != 0)
{ {
...@@ -2110,7 +2081,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, ...@@ -2110,7 +2081,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
/* Above loop has moved POS_BYTE part or all the way /* Above loop has moved POS_BYTE part or all the way
back to the first pos (last pos if reverse). back to the first pos (last pos if reverse).
Set it once again at the last (first if reverse) char. */ Set it once again at the last (first if reverse) char. */
pos_byte += dirlen - i- direction; pos_byte += dirlen - i - direction;
if (i + direction == 0) if (i + direction == 0)
{ {
int position, start, end; int position, start, end;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment