/************************************************************************/
/* */
/* STBM -- Self-Tuning Boyer-Moore string search */
/* */
/* Implements behoffski's variant of the Boyer-Moore search. */
/* This version was [re-?]invented after contemplating the */
/* 1977 BM paper and the 1980 Sunday paper... but before */
/* behoffski knew of the 1991 Hume and Sunday paper. */
/* */
/* There are two essential differences between this algorithm */
/* and the one used in GNU Grep: */
/* 1. GNU Grep uses the "Tuned Boyer-Moore" algorithm */
/* as suggested by Hume and Sunday's 1991 paper. */
/* This version uses a *Self*-Tuning BM algorithm. */
/* 2. GNU Grep handles case insensitivity by adding a */
/* general-purpose translation table that maps each */
/* byte before being examined by the search. As */
/* already demonstrated by Grouse Grep's state tables, */
/* this is inefficient if the translation is static, */
/* as the extra lookup per character can be eliminated */
/* by modifying the state tables to reflect the effects */
/* of the translation. */
/* */
/* So what is the Self-Tuning Boyer-Moore algorithm? Like */
/* most BM algorithms, it has a skip loop, a guard test, */
/* and a match loop. The critical difference is that we */
/* feed information from the match loop back into the guard */
/* test so that we can find the most effective guard. This */
/* dynamic feedback costs us little or nothing to implement, */
/* yet can substantially improve the search efficiency. */
/* This dynamic feedback is more effective than a static */
/* guard as we tune ourselves to the actual data, instead of */
/* relying on some guess about the likely nature of the input. */
/* */
/* The feedback is simple and cheap -- we remember the */
/* offset where the match loop detected a mismatch, and use */
/* that position as the guard! In addition, we impose a */
/* round-robin ordering on the match loop, so that each */
/* position of the search string has an equal opportunity */
/* to become the guard. We must pay the cost of setting up */
/* two loops when matching instead of just one, but this */
/* effort is more than repaid if (usually) we find a good */
/* guard. To some extent, the performance of this code will */
/* depend on the characteristics of the code generated by */
/* the compiler. This initial version was written and */
/* evaluated using GCC. */
/* */
/* Note also that the correct probability function for the */
/* guard is conditional on the success of the BM skip */
/* search -- so, for example, when searching for "tree" */
/* in /usr/dict/words, the best guard character (the */
/* character least likely to match) -- given that the skip */
/* loop has matched an "e" -- is in fact the preceding "e", */
/* despite it being the most common letter in the file. */
/* Most static guess algorithms don't take this conditional */
/* probability into account: The self-tuning BM search gets */
/* it right as all the operations on the guard that affect */
/* its mismatch efficiency occur in the context where the */
/* last character has already matched. */
/* */
/* References: */
/* "A Fast String Searching Algorithm" */
/* Robert S. Boyer & J Strother Moore */
/* Communications of the ACM */
/* Vol. 20 No. 10, October 1977 pp. 762-772 */
/* */
/* "A Very Fast Substring Search Algorithm" */
/* Daniel M. Sunday */
/* Communications of the ACM */
/* Vol. 33 No. 8, August 1990, pp. 132-142 */
/* */
/* "Fast String Searching" */
/* Andrew Hume & Daniel Sunday */
/* Software - Practice & Experience */
/* Vol. 21(11), 1221-1248 (November 1991) */
/* */
/* Copyright (C) 1998-2000 Grouse Software. All rights reserved. */
/* Written for Grouse by behoffski (Brenton Hoff). */
/* */
/* Free software: no warranty; use anywhere is ok; spread the */
/* sources; note any mods; share variations and derivatives */
/* (including sending to behoffski@grouse.com.au). */
/* */
/************************************************************************/
#include "ascii.h"
#include <compdef.h>
#include "ctype.h"
#include "platform.h"
#include "stbm.h"
#include <stdio.h>
#include <stdlib.h>
/*Macro to provide very-low-cost case folding to lower case*/
#define STBM_TOLOWER(c) (gSTBM.CaseFold[c])
struct STBM_SearchSpec_Struct {
/*Lookup table for "fast" loop*/
UINT SkipTable[STBM_ALPHABET_SIZE];
UINT md2;
/*Original pattern, needed to complete match*/
UINT PatLen;
BYTE *pPattern;
LWORD Flags;
/*Skip table for where guard fails to match*/
/* ?? not sure if it's worth doing this*/
};
typedef struct {
/*Streamlined lowercase-folding table*/
BYTE CaseFold[256];
} STBM_MODULE_CONTEXT;
module_scope STBM_MODULE_CONTEXT gSTBM;
/************************************************************************/
/* */
/* Init -- Prepare module for operation */
/* */
/************************************************************************/
public_scope void
STBM_Init(void)
{
UINT i;
/*Initialise low-cost case folding table*/
for (i = 0; i < 256; i++) {
gSTBM.CaseFold[(BYTE) i] = tolower((BYTE) i);
}
} /*Init*/
/************************************************************************/
/* */
/* Compile -- Analyse pattern and prepare for search */
/* */
/* Converts the pattern string into a format optimal for */
/* the Search function, and returns the compiled format */
/* as a reference to use in subsequent searches. */
/* */
/* The caller must maintain the pattern string provided here */
/* for the duration of the search, as this function does not */
/* bother to take its own copy. Even worse, if the search */
/* is case insensitive, the pattern string will be edited to */
/* make the case of all letters consistent. If necessary, */
/* copy the string to a separate buffer if either of these */
/* behaviours conflicts with other parts of the program. */
/* */
/* Returns FALSE if unable to handle pattern (probably */
/* due to insufficient memory). */
/* */
/************************************************************************/
public_scope BOOL
STBM_Compile(UINT PatLen, CHAR *pPattern, LWORD Flags,
STBM_SearchSpec **ppSpec)
{
STBM_SearchSpec *pSpec = (STBM_SearchSpec *) NIL;
BYTE *pPat;
BYTE *pPatEnd;
UINT i;
/*Destroy return value to reduce chances of being misinterpreted*/
*ppSpec = (STBM_SearchSpec *) NIL;
pSpec = (STBM_SearchSpec *) Platform_SmallMalloc(sizeof(*pSpec));
if (pSpec == NULL) {
/*Sorry, unable to allocate memory for compiled format*/
return FALSE;
}
/*Record the original pattern in the specification*/
pSpec->pPattern = (BYTE *) pPattern;
pSpec->PatLen = PatLen;
pSpec->Flags = Flags;
/*Set up skip table in the usual Tuned-BM fashion*/
pPat = (BYTE *) pPattern;
pPatEnd = pPat + PatLen - 1;
for (i = 0; i < DIM(pSpec->SkipTable); i++) {
pSpec->SkipTable[i] = PatLen;
}
if ((Flags & STBM_SEARCH_CASE_INSENSITIVE) == 0) {
/*Exact match required (case sensitive)*/
for (; pPat < pPatEnd; pPat++) {
pSpec->SkipTable[*pPat] = pPatEnd - pPat;
}
pSpec->md2 = pSpec->SkipTable[*pPat];
pSpec->SkipTable[*pPat] = 0;
} else {
/*Case-insensitive search -- set up table accordingly*/
for (; pPat < pPatEnd; pPat++) {
/*Force supplied pattern to have consistent case*/
*pPat = tolower(*pPat);
pSpec->SkipTable[*pPat] = pPatEnd - pPat;
pSpec->SkipTable[toupper(*pPat)] = pPatEnd - pPat;
}
*pPat = tolower(*pPat);
pSpec->md2 = pSpec->SkipTable[*pPat];
pSpec->SkipTable[*pPat] = 0;
pSpec->SkipTable[toupper(*pPat)] = 0;
}
/*Prepared for search successfully*/
*ppSpec = pSpec;
return TRUE;
} /*Compile*/
/************************************************************************/
/* */
/* Search -- Report first occurrence of pattern in buffer */
/* */
/************************************************************************/
public_scope CHAR *
STBM_Search(STBM_SearchSpec *pSpec,
CHAR *pBuffer, UINT BufferLength)
{
UINT *pSkip;
BYTE *pText;
UINT Skip;
BYTE *pEnd;
INT GuardOffset;
BYTE *pPattern = pSpec->pPattern;
BYTE *pPatEnd;
BYTE GuardChar;
INT i;
INT NegLen;
/*Is the buffer too short to hold the target string?*/
if (BufferLength < pSpec->PatLen) {
/*Yes, can't match*/
return NULL;
}
/*Prepare for search*/
pEnd = (BYTE *) (pBuffer + BufferLength);
pSkip = &pSpec->SkipTable[0];
pText = (BYTE *) pBuffer;
pText += (pSpec->PatLen - 1);
pPatEnd = pPattern + pSpec->PatLen - 1;
NegLen = -pSpec->PatLen;
GuardOffset = -1;
if (pSpec->PatLen == 1) {
GuardOffset = 0;
}
GuardChar = pPatEnd[GuardOffset];
goto SkipLoop;
CheckGuard:
if (pText >= pEnd) return NULL;
if (pText[GuardOffset] == GuardChar) goto TryMatch;
pText += pSpec->md2;
/* FALLTHROUGH */
/*Main search is in "fast" skip loop*/
SkipLoop:
for (;;) {
/*Note: When (re-)entering skip loop, checks more often*/
Skip = pSkip[*pText];
if (Skip == 0) goto CheckGuard;
pText += Skip;
pText += pSkip[*pText];
pText += pSkip[*pText];
Skip = pSkip[*pText];
if (Skip == 0) goto CheckGuard;
pText += Skip;
pText += pSkip[*pText];
pText += pSkip[*pText];
/*May need end-of-buffer check here in some applications*/
/*if (pText >= pEnd) return NULL;*/
}
TryMatch:
/*Check pattern from (Guard-1)..0*/
for (i = GuardOffset - 1; i > NegLen; i--) {
if (pPatEnd[i] != pText[i]) goto Mismatch;
}
/*Now check pattern from (PatLen - 1) to (Guard + 1)*/
for (i = -1; i > GuardOffset; i--) {
if (pPatEnd[i] != pText[i]) goto Mismatch;
}
/*?? Stash current guard position to use for next search*/
/*Match found: return pointer to start of text in buffer*/
return pText + NegLen + 1;
Mismatch:
/*Use last mismatch as new guard character*/
GuardChar = pPatEnd[i];
GuardOffset = i;
pText += pSpec->md2;
goto SkipLoop;
} /*Search*/
/************************************************************************/
/* */
/* SearchCI -- Search for pattern in buffer (case insensitive) */
/* */
/************************************************************************/
public_scope CHAR *
STBM_SearchCI(STBM_SearchSpec *pSpec,
CHAR *pBuffer, UINT BufferLength)
{
UINT *pSkip;
BYTE *pText;
UINT Skip;
BYTE *pEnd;
INT GuardOffset;
BYTE GuardChar;
BYTE *pPattern = pSpec->pPattern;
BYTE *pPatEnd;
INT i;
INT NegLen;
/*Prepare for search*/
pEnd = (BYTE *) (pBuffer + BufferLength);
pSkip = &pSpec->SkipTable[0];
pText = (BYTE *) pBuffer;
pPatEnd = pPattern + pSpec->PatLen - 1;
NegLen = -pSpec->PatLen;
GuardOffset = -1;
if (pSpec->PatLen == 1) {
GuardOffset = 0;
}
GuardChar = STBM_TOLOWER(pPatEnd[GuardOffset]);
goto SkipLoop;
CheckGuard:
if (pText >= pEnd) return NULL;
if (STBM_TOLOWER(pText[GuardOffset]) == GuardChar) goto TryMatch;
pText += pSpec->md2;
/* FALLTHROUGH */
/*Main search is in "fast" skip loop*/
SkipLoop:
for (;;) {
/*Note: When (re-)entering skip loop, checks more often*/
Skip = pSkip[*pText];
if (Skip == 0) goto CheckGuard;
pText += Skip;
pText += pSkip[*pText];
pText += pSkip[*pText];
Skip = pSkip[*pText];
if (Skip == 0) goto CheckGuard;
pText += Skip;
pText += pSkip[*pText];
pText += pSkip[*pText];
/*May need end-of-buffer check here in some applications*/
/*if (pText >= pEnd) return NULL;*/
}
TryMatch:
/*Check pattern from (Guard-1)..0*/
for (i = GuardOffset - 1; i > NegLen; i--) {
if (pPatEnd[i] != STBM_TOLOWER(pText[i])) goto Mismatch;
}
/*Now check pattern from (PatLen - 1) to (Guard + 1)*/
for (i = -1; i > GuardOffset; i--) {
if (pPatEnd[i] != STBM_TOLOWER(pText[i])) goto Mismatch;
}
/*?? Stash current guard position to use for next search*/
/*Match found: return pointer to start of text in buffer*/
return pText + NegLen + 1;
Mismatch:
/*Use last mismatch as new guard character*/
GuardChar = STBM_TOLOWER(pPatEnd[i]);
GuardOffset = i;
pText += pSpec->md2;
goto SkipLoop;
} /*SearchCI*/
/************************************************************************/
/* */
/* SearchTBM -- Search using Tuned BM (not self-tuned BM) */
/* */
/* This routine is provided merely for reference/comparison. */
/* */
/* It allows the Tuned Boyer-Moore algorithm to be used to */
/* perform fixed-string searches so that direct comparisons */
/* with behoffski's self-tuned version can be made. */
/* Otherwise, it's too hard to separate algorithm performance */
/* from the performance of the program control. */
/* */
/* The variant of the Tuned BM search implemented here is */
/* based on the version implemented by GNU Grep. In */
/* particular, the static guard is always the second-to-last */
/* character of the string. */
/* */
/************************************************************************/
public_scope CHAR *
STBM_SearchTBM(STBM_SearchSpec *pSpec,
CHAR *pBuffer, UINT BufferLength)
{
UINT *pSkip;
BYTE *pText;
UINT Skip;
BYTE *pEnd;
BYTE *pPattern = pSpec->pPattern;
BYTE *pPatEnd;
BYTE GuardChar;
INT i;
INT NegLen;
/*Is the buffer too short to hold the target string?*/
if (BufferLength < pSpec->PatLen) {
/*Yes, can't match*/
return NULL;
}
/*Prepare for search*/
pEnd = (BYTE *) (pBuffer + BufferLength);
pSkip = &pSpec->SkipTable[0];
pText = (BYTE *) pBuffer;
pText += (pSpec->PatLen - 1);
pPatEnd = pPattern + pSpec->PatLen - 1;
NegLen = -pSpec->PatLen;
GuardChar = pPatEnd[-1];
goto SkipLoop;
CheckGuard:
if (pText >= pEnd) return NULL;
if (pText[-1] == GuardChar) goto TryMatch;
pText += pSpec->md2;
/* FALLTHROUGH */
/*Main search is in "fast" skip loop*/
SkipLoop:
for (;;) {
/*Note: When (re-)entering skip loop, checks more often*/
Skip = pSkip[*pText];
if (Skip == 0) goto CheckGuard;
pText += Skip;
pText += pSkip[*pText];
pText += pSkip[*pText];
Skip = pSkip[*pText];
if (Skip == 0) goto CheckGuard;
pText += Skip;
pText += pSkip[*pText];
pText += pSkip[*pText];
/*May need end-of-buffer check here in some applications*/
/*if (pText >= pEnd) return NULL;*/
}
TryMatch:
/*Check pattern from (Guard-1)..0*/
for (i = -2; i > NegLen; i--) {
if (pPatEnd[i] != pText[i]) goto Mismatch;
}
/*Match found: return pointer to start of text in buffer*/
return pText + NegLen + 1;
Mismatch:
/*Use last mismatch as new guard character*/
pText += pSpec->md2;
goto SkipLoop;
} /*SearchTBM*/
/************************************************************************/
/* */
/* GetPattern -- Retrieve originally-supplied pattern pointer */
/* */
/* In many cases, the client may malloc the text, and then hand */
/* the text to STBM to be compiled. It's possible to have a */
/* memory leak if the pattern is destroyed without freeing */
/* the text. However, it's mighty inconvenient to carry around */
/* the text pointer in parallel to the STBM version, especially */
/* as STBM stores the pointer anyway. */
/* */
/* So this function allows the client to retrieve the text */
/* pointer stored by STBM so that the memory may be freed. */
/* */
/************************************************************************/
public_scope void
STBM_GetPattern(STBM_SearchSpec *pSpec, CHAR **ppPatternText)
{
/*Write pattern pointer to caller*/
*ppPatternText = pSpec->pPattern;
} /*GetPattern*/
/************************************************************************/
/* */
/* Destroy -- Get rid of optimised search spec */
/* */
/* Discards the resources acquired when the search pattern */
/* was compiled. Note the text containing the original */
/* pattern might need freeing also; this is not done here. */
/* See GetPattern if you need to recover the pointer for */
/* freeing. */
/* */
/************************************************************************/
public_scope void
STBM_Destroy(STBM_SearchSpec *pSpec)
{
/*Free memory acquired to hold search spec*/
Platform_SmallFree(pSpec);
} /*Destroy*/