A Developer's Diary

May 29, 2010

Boyer Moore Algorithm

Boyer Moore Algorithm is one of the fastest pattern searching algorithm based on two techniques:

Techniques Used
1. The looking glass technique where you find a pattern P in text T by moving backwards through P, starting at it's end.
2. The character jump heuristic when a mismatch occurs between the characters at position Text[t] = c
Three cases are checked in the order for calculating the character jump. Before moving on to understand the individual cases, let us understand the computation of last occurrence position in the pattern string

Computing last occurrence
The compute last occurrence method assumes that all the characters under the consideration are ASCII characters. The function computes last occurrences of all the characters present in the Pattern P starting from the left. The rest of the ASCII chars return -1


Consider the ASCII character set {a, b, c, d, ..., y, z} then the last occurrence function would be computed as shown in the figure below

The function f(x) points to the last occurrence of character in the pattern P

computeBmpLast(const std::string &pattern){
    const size_t NUM_ASCII_CHARS = 128;
    vector<int> bmpLast(NUM_ASCII_CHARS);

    for(size_t i = 0; i < NUM_ASCII_CHARS; i++){
        bmpLast[i] = -1;
    }

    for(size_t i = 0; i < pattern.size(); i++){
        bmpLast[pattern[i]] = i;
    }
    return bmpLast;
}



Character Jump Heuristics

Case 1:
The character mismatch occurs at the location T[t] = 'x' and the character 'x' is found to the left of the character P[p] = 'c' OR 1 + f(T[t]) <= p. In this case move pattern P to the right to align last occurrence of x in P with x in T

tnew = t + length of pattern - ( 1 + last occurrence of 'x' in pattern)
pnew = length of pattern - 1

Case 2:
The character mismatch occurs at the location T[t] = 'x' and the character 'x' is found to the right of the character P[p] = 'c' OR 1 + f(T[t]) > p. In this scenario alignment is not possible by moving the pattern to the right on the basis of last occurrence. Here we shift the pattern by 1 so that P[p] = c aligns with T[t+1] = a
tnew = t + length of pattern - p
pnew = length of pattern - 1

Case 3:
If the case 1 and case 2 cannot be applied, that means the character T[t] is not found in the pattern P. In this case, we move the pattern P so that P[0] = x aligns itself with T[t+1] = a

tnew = t + length of pattern
pnew = length of pattern - 1

//Character Jump Heuristics
int lastOccur = bmpLast[text[tIdx]];
if(lastOccur != -1){
    if(pIdx > lastOccur){// Case 1: last occurrence of char is to left or equal to the mismatch point 
        tIdx = tIdx + patternSize - (1 + lastOccur);
        pIdx = patternSize - 1;
    }
    else{// Case 2: last occurrence of char is to right of the mismatch point
        tIdx = tIdx + patternSize - (pIdx);
        pIdx = patternSize - 1;
    }
}
else{// Case 3: character is not found in the pattern string
    tIdx = tIdx + patternSize;
    pIdx = patternSize - 1;
}

Merging Case 1 and Case 2 in the above code
//Character Jump Heuristics
int lastOccur = bmpLast[text[tIdx]];
if(lastOccur != -1){
    tIdx = tIdx + patternSize - min<int>(pIdx, 1 + lastOccur);
}
else{// Case 3: character is not found in the pattern string
    tIdx = tIdx + patternSize;
}
pIdx = patternSize - 1;

In our case as computeBmpLast() function stores -1 for characters not found in the search pattern. We can safely merge the Case 3 in the above code to look as
//Character Jump Heuristics
int lastOccur = bmpLast[text[tIdx]];
tIdx = tIdx + patternSize - min<int>(pIdx, 1 + lastOccur);
pIdx = patternSize - 1;

Example:




Complete Code Example
#ifndef _PatternMatcher_H_
#define _PatternMatcher_H_
#include <iostream>
#include <string>
#include <vector>
using namespace std;
class PatternMatcher{
public:
    static int bmpSearch(const string& text, const string& pattern);

private:
    static vector<int> computeBmpLast(const string& pattern);

    PatternMatcher();
    PatternMatcher(const PatternMatcher&);
    const PatternMatcher& operator=(const PatternMatcher&);
};
#endif //_PatternMatcher_H_

#include "PatternMatcher.h"
#include <algorithm>

using namespace std;

int PatternMatcher::bmpSearch(const std::string &text, const std::string &pattern){
    size_t textSize = text.size();
    size_t patternSize = pattern.size();
    if(textSize == 0 || patternSize == 0){
        return -1;
    }
    if(patternSize > textSize){
        return -1;
    }

    vector<int> bmpLast = computeBmpLast(pattern);
    size_t tIdx = patternSize - 1;
    size_t pIdx = patternSize - 1;
    while(tIdx < textSize){
        if(pattern[pIdx] == text[tIdx]){
            if(pIdx == 0){   //found a match
                return tIdx;
            }
            tIdx--;
            pIdx--;
        }
        else {
            //Character Jump Heuristics
            int lastOccur = bmpLast[text[tIdx]];
            tIdx = tIdx + patternSize - min<int>(pIdx, 1 + lastOccur);
            pIdx = patternSize - 1;
        }
    }
    return - 1;
}

vector<int> PatternMatcher::computeBmpLast(const std::string &pattern){
    const size_t NUM_ASCII_CHARS = 128;
    vector<int> bmpLast(NUM_ASCII_CHARS);
    for(size_t i = 0; i < NUM_ASCII_CHARS; i++){
        bmpLast[i] = -1;
    }
    for(size_t i = 0; i < pattern.size(); i++){
        bmpLast[pattern[i]] = i;
    }
    return bmpLast;
}

#include "PatternMatcher.h"

int main(){
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "baabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacad")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacaab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "aabaccaba")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacaabaccabacabaabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("", "abacaabaccabacabaabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "bacaabaccabacabaab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacaabac")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "ccabacabaabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "bacaabaccabacabaabb")
        << endl;

    return 0;
}

2 comments :

Anonymous said...

Thank you! It was very helpful!

Anonymous said...

Hello! I was wondering, how can you transform the code so that it continues to search for the same pattern even after finding one instance of it. I am trying to modify it so that it searches a text file with a long text and find the number of "words" there are in the text. Thanks in advance for the answer!

Post a Comment