/************************************************************************/
/* */
/* STBMShim -- Interface module to join STBM to ScanFile */
/* */
/* This module is a shim (interface adaptor) between the */
/* self-tuned Boyer-Moore search algorithm and the file */
/* scanning control interface wielded by ScanFile. Using */
/* a shim like this is part of creating truly reusable */
/* software: this shim especially allows STBM to be written */
/* in a simple and portable fashion, which is useful as it */
/* will have application outside of ggrep. */
/* */
/* Copyright (C) 1995-2000 Grouse Software. All rights reserved. */
/* Written for Grouse by behoffski (Brenton Hoff). */
/* */
/* Free software: no warranty; use anywhere is ok; spread the */
/* sources; note any mods; share variations and derivatives */
/* (including sending to behoffski@grouse.com.au). */
/* */
/************************************************************************/
#include "ascii.h"
#include <compdef.h>
#include "matcheng.h"
#include "platform.h"
#include "regexp00.h"
#include "stbm.h"
#include "stbmshim.h"
#include <stdlib.h>
/************************************************************************/
/* */
/* Init -- Prepare module for operation */
/* */
/************************************************************************/
void
STBMShim_Init(void)
{
} /*Init*/
/************************************************************************/
/* */
/* Pattern -- Convert RegExp pattern to STBM search spec */
/* */
/* The conversion checks that the supplied RE is a simple */
/* string, where "simple string" means: */
/* - All elements of the RE are literals, (no classes, */
/* no anchors, no match-any components), and */
/* - All elements merely match themselves (no repeats */
/* or optional matching), and */
/* - Case insensitivity is consistent: either no */
/* element is case-insensitive, or else all */
/* components are case-insensitive. */
/* */
/* The check tries to be as light and fast as possible, as it */
/* can be called multiple times. */
/* */
/* Returns NULL if unable to search for the specified RE */
/* using the STBM algorithm. */
/* */
/************************************************************************/
public_scope STBM_SearchSpec *
STBMShim_Pattern(RegExp_Specification *pRESpec, UINT *pPatternLength,
BOOL *pIgnoreCase, BYTE *pTrailingLiteral)
{
RE_ELEMENT *pInitialRE = &pRESpec->CodeList[0];
RE_ELEMENT *pRE = pInitialRE;
CHAR *pPattern;
char *pPat;
BOOL IgnoreCase = FALSE;
STBM_SearchSpec *pSpec;
LWORD Flags;
RE_ELEMENT *pEndRE;
/*Is the first element a literal?*/
if (REGEXP_TYPE_R(*pRE) != REGEXP_TYPE_LITERAL) {
/*No, cannot optimise*/
return NULL;
}
/*Is the pattern too short to be searched efficiently?*/
if (pRESpec->NrStates < 3) {
/*Yes, cannot optimise*/
return NULL;
}
/*Obtain case-sensitivity sense from first literal*/
if ((*pRE & REGEXP_CASE_MASK) == REGEXP_CASE_INSENSITIVE) {
IgnoreCase = TRUE;
}
*pIgnoreCase = IgnoreCase;
/*Loop through all components, checking that STBM is applicable*/
for (;;) {
switch (REGEXP_TYPE_R(*pRE)) {
case REGEXP_TYPE_END_OF_EXPR:
/*Okay, entire pattern can be searched using STBM*/
pEndRE = pRE;
goto PatternOkay;
/*break;*/
case REGEXP_TYPE_LITERAL:
/*Found literal: consider it further below*/
break;
default:
/*Anything else is too hard for STBM search*/
return NULL;
}
/*Literal found: does it have nonlinear modifications?*/
if (REGEXP_REPEAT_R(*pRE) != REGEXP_REPEAT_MATCH_SELF) {
/*Yes, cannot match as a simple string*/
return NULL;
}
/*Is literal's case sensitivity selection consistent?*/
if ((*pRE & REGEXP_CASE_MASK) == REGEXP_CASE_INSENSITIVE) {
if (! IgnoreCase) {
/*Sorry, case sensitivity isn't consistent*/
return FALSE;
}
} else {
if (IgnoreCase) {
/*Sorry, case sensitivity isn't consistent*/
return FALSE;
}
}
/*Does literal require more than an octet to be stored?*/
if (REGEXP_LITERAL_R(*pRE) > 255) {
/*Sorry, cannot treat Unicode characters simply*/
return FALSE;
}
/*This element is okay: Look at next element, if any*/
pRE++;
}
/*Control should never reach here*/
PatternOkay:
/*Extract text from literals into simple string for STBM*/
pPattern = Platform_SmallMalloc(pRESpec->NrStates + 1);
pPat = pPattern;
for (pRE = pInitialRE; pRE != pEndRE; ) {
/*Add literal to string*/
*pPat++ = REGEXP_LITERAL_R(*pRE++);
}
/*Trim pattern to 64 characters as that's all ScanFile handles (sigh)*/
if ((pPat - pPattern) > 64) {
pPat = pPattern + 64;
}
/*Add NUL to pattern string in case a C user tries to print it*/
*pPat = NUL;
Flags = 0;
if (IgnoreCase) {
Flags |= STBM_SEARCH_CASE_INSENSITIVE;
}
/*Prepare data structures for optimised search*/
if (! STBM_Compile(pPat - pPattern, pPattern, Flags, &pSpec)) {
/*Sorry, unable to prepare optimised version*/
Platform_SmallFree(pPattern);
return FALSE;
}
/*Prepared search successfully, report search handle to caller*/
*pPatternLength = pPat - pPattern;
*pTrailingLiteral = pPat[-1];
return pSpec;
} /*Pattern*/
/************************************************************************/
/* */
/* Destroy -- Ret rid of STBM search spec supplied by Pattern */
/* */
/************************************************************************/
public_scope void
STBMShim_Destroy(STBM_SearchSpec *pSearchSpec)
{
CHAR *pPattern;
/*Obtain pattern we malloc'd during creation*/
STBM_GetPattern(pSearchSpec, &pPattern);
/*Get rid of search spec and pattern text*/
STBM_Destroy(pSearchSpec);
Platform_SmallFree(pPattern);
} /*Destroy*/
/************************************************************************/
/* */
/* Search -- Search for pattern in buffer (case sensitive) */
/* */
/************************************************************************/
public_scope BOOL
STBMShim_Search(MatchEng_Spec *pSearchSpec,
BYTE *pTextStart,
MatchEng_Details *pDetails)
{
BOOL Found;
UINT BufferLength;
STBM_SearchSpec *pSTBM = (STBM_SearchSpec *) pSearchSpec->pSpare1;
CHAR *pText;
/*Obtain buffer length using buffer current and end pointers*/
BufferLength = pSearchSpec->pAfterEndOfBuffer - (BYTE *) pTextStart;
pText = STBM_Search(pSTBM, pTextStart, BufferLength);
Found = (pText != NULL);
pDetails->pLineStart = NULL;
pDetails->pMatchStart = pText;
pDetails->pMatchEnd = pText + pSearchSpec->PatternLength;
return Found;
} /*Search*/
/************************************************************************/
/* */
/* SearchInCase -- Search for pattern in buffer (case INsensitive) */
/* */
/************************************************************************/
public_scope BOOL
STBMShim_SearchInCase(MatchEng_Spec *pSearchSpec,
BYTE *pTextStart,
MatchEng_Details *pDetails)
{
BOOL Found;
UINT BufferLength;
STBM_SearchSpec *pSTBM = (STBM_SearchSpec *) pSearchSpec->pSpare1;
CHAR *pText;
/*Obtain buffer length using buffer current and end pointers*/
BufferLength = pSearchSpec->pAfterEndOfBuffer - (BYTE *) pTextStart;
pText = STBM_SearchCI(pSTBM, pTextStart, BufferLength);
Found = (pText != NULL);
pDetails->pLineStart = NULL;
pDetails->pMatchStart = pText;
pDetails->pMatchEnd = pText + pSearchSpec->PatternLength;
return Found;
} /*SearchInCase*/
/************************************************************************/
/* */
/* SearchTBM -- Search for pattern in buffer (use tuned BM) */
/* */
/* This interface is provided merely for comparison/reference. */
/* It allows the Tuned Boyer-Moore algorithm to be used to */
/* perform fixed-string searches so that direct comparisons */
/* with behoffski's self-tuned version can be made. */
/* Otherwise, it's too hard to separate algorithm performance */
/* from the performance of the program control. */
/* */
/* The variant of the Tuned BM search implemented here is */
/* based on the version implemented by GNU Grep. In */
/* particular, the static guard is always the second-to-last */
/* character of the string. */
/* */
/************************************************************************/
public_scope BOOL
STBMShim_SearchTBM(MatchEng_Spec *pSearchSpec,
BYTE *pTextStart,
MatchEng_Details *pDetails)
{
BOOL Found;
UINT BufferLength;
STBM_SearchSpec *pSTBM = (STBM_SearchSpec *) pSearchSpec->pSpare1;
CHAR *pText;
/*Obtain buffer length using buffer current and end pointers*/
BufferLength = pSearchSpec->pAfterEndOfBuffer - (BYTE *) pTextStart;
pText = STBM_SearchTBM(pSTBM, pTextStart, BufferLength);
Found = (pText != NULL);
pDetails->pLineStart = NULL;
pDetails->pMatchStart = pText;
pDetails->pMatchEnd = pText + pSearchSpec->PatternLength;
return Found;
} /*SearchTBM*/