A Developer's Diary

May 30, 2010

Knuth Morris Pratt Algorithm

The KMP algorithm compares the pattern string to the text in left to right direction as compared to Boyer Moore Algorithm. The algorithm shifts the pattern more intelligently than the brute-force algorithm.

Key Idea
Whenever a mismatch occurs, what is the most we can shift the pattern so as to avoid redundant comparisons?
Answer: The largest prefix of P[0..j] that is a suffix of [1..j]

The KMP algorithm pre-processes the pattern string to find matches of the prefixes of the pattern with the pattern itself. The information thus calculated is used to shift the pattern appropriately whenever a mismatch occurs or a comparison fails. The compuatation is performed by the function called KMP prefix function
KMP Prefix Function
The prefix function F(j) is defined as the size of the largest prefix in P[0..j] that is also a suffix of P[1..j]


Code for computing the KMP Prefix Function
computeKmpPrefix(const std::string &pattern){
    int patternSize = pattern.size();
    vector<int> kmpPrefix(patternSize);
    size_t prefixPos = 0;
    size_t suffixPos = 1;

    while(suffixPos < patternSize){
        if(pattern[prefixPos] == pattern[suffixPos]){
            kmpPrefix[suffixPos] = prefixPos + 1;
            prefixPos++;
            suffixPos++;
        }
        else if(prefixPos > 0){//found some match
            prefixPos = kmpPrefix[prefixPos -1];//backtrack for matching prefix   e.g. aaaaabaaaaaa
        }
        else{
            kmpPrefix[suffixPos] = 0;
            suffixPos++;
        }
    }
    return kmpPrefix;
}

The algorithm in picture

Example

The Complete Code
#ifndef _PatternMatcher_H_
#define _PatternMatcher_H_

#include <iostream>

#include <string>
#include <vector>

using namespace std;

class PatternMatcher{
public:
    static int kmpSearch(const string& text, const string& pattern);

private:
    static vector<int> computeKmpPrefix(const string& pattern);

    PatternMatcher();
    PatternMatcher(const PatternMatcher&);
    const PatternMatcher& operator=(const PatternMatcher&);
};

#endif //_PatternMatcher_H_

#include "PatternMatcher.h"

vector<int> PatternMatcher::computeKmpPrefix(const std::string &pattern){
    int patternSize = pattern.size();
    vector<int> kmpPrefix(patternSize);
    size_t prefixPos = 0;
    size_t suffixPos = 1;

    while(suffixPos < patternSize){
        if(pattern[prefixPos] == pattern[suffixPos]){
            kmpPrefix[suffixPos] = prefixPos + 1;
            prefixPos++;
            suffixPos++;
        }
        else if(prefixPos > 0){//found some match
            prefixPos = kmpPrefix[prefixPos -1];//backtrack for matching prefix e.g. aaaaabaaaaaa
        }
        else{
            kmpPrefix[suffixPos] = 0;
            suffixPos++;
        }
    }
    return kmpPrefix;
}

int PatternMatcher::kmpSearch(const std::string &text, const std::string &pattern){

    size_t textSize = text.size();
    size_t patternSize = pattern.size();

    if(patternSize > textSize)
        return -1;

    vector<int> kmpNext = computeKmpPrefix(pattern);
    int tIdx = 0;
    int pIdx = 0;

    while(tIdx < textSize){
        if(pattern[pIdx] == text[tIdx]){
            if(pIdx == patternSize - 1) 
                return tIdx - (patternSize - 1);
            tIdx++;
            pIdx++;
        }
        else if(pIdx > 0){
            pIdx = kmpNext[pIdx - 1];
        }
        else{
            tIdx++;
        }
    }
    return -1;
}

#include "PatternMatcher.h"

int main(){
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "abacab")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "baabb")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "abacad")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "abacaab")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "abacab")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "aabaccaba")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "abacaabaccabacabaabb")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("", "abacaabaccabacabaabb")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "bacaabaccabacabaab")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "abacaabac")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "ccabacabaabb")
        << endl;
    cout << PatternMatcher::kmpSearch
        ("abacaabaccabacabaabb", "bacaabaccabacabaabb")
        << endl;
    return 0;
}

Read more ...

May 29, 2010

Boyer Moore Algorithm

Boyer Moore Algorithm is one of the fastest pattern searching algorithm based on two techniques:

Techniques Used
1. The looking glass technique where you find a pattern P in text T by moving backwards through P, starting at it's end.
2. The character jump heuristic when a mismatch occurs between the characters at position Text[t] = c
Three cases are checked in the order for calculating the character jump. Before moving on to understand the individual cases, let us understand the computation of last occurrence position in the pattern string

Computing last occurrence
The compute last occurrence method assumes that all the characters under the consideration are ASCII characters. The function computes last occurrences of all the characters present in the Pattern P starting from the left. The rest of the ASCII chars return -1


Consider the ASCII character set {a, b, c, d, ..., y, z} then the last occurrence function would be computed as shown in the figure below

The function f(x) points to the last occurrence of character in the pattern P

computeBmpLast(const std::string &pattern){
    const size_t NUM_ASCII_CHARS = 128;
    vector<int> bmpLast(NUM_ASCII_CHARS);

    for(size_t i = 0; i < NUM_ASCII_CHARS; i++){
        bmpLast[i] = -1;
    }

    for(size_t i = 0; i < pattern.size(); i++){
        bmpLast[pattern[i]] = i;
    }
    return bmpLast;
}



Character Jump Heuristics

Case 1:
The character mismatch occurs at the location T[t] = 'x' and the character 'x' is found to the left of the character P[p] = 'c' OR 1 + f(T[t]) <= p. In this case move pattern P to the right to align last occurrence of x in P with x in T

tnew = t + length of pattern - ( 1 + last occurrence of 'x' in pattern)
pnew = length of pattern - 1

Case 2:
The character mismatch occurs at the location T[t] = 'x' and the character 'x' is found to the right of the character P[p] = 'c' OR 1 + f(T[t]) > p. In this scenario alignment is not possible by moving the pattern to the right on the basis of last occurrence. Here we shift the pattern by 1 so that P[p] = c aligns with T[t+1] = a
tnew = t + length of pattern - p
pnew = length of pattern - 1

Case 3:
If the case 1 and case 2 cannot be applied, that means the character T[t] is not found in the pattern P. In this case, we move the pattern P so that P[0] = x aligns itself with T[t+1] = a

tnew = t + length of pattern
pnew = length of pattern - 1

//Character Jump Heuristics
int lastOccur = bmpLast[text[tIdx]];
if(lastOccur != -1){
    if(pIdx > lastOccur){// Case 1: last occurrence of char is to left or equal to the mismatch point 
        tIdx = tIdx + patternSize - (1 + lastOccur);
        pIdx = patternSize - 1;
    }
    else{// Case 2: last occurrence of char is to right of the mismatch point
        tIdx = tIdx + patternSize - (pIdx);
        pIdx = patternSize - 1;
    }
}
else{// Case 3: character is not found in the pattern string
    tIdx = tIdx + patternSize;
    pIdx = patternSize - 1;
}

Merging Case 1 and Case 2 in the above code
//Character Jump Heuristics
int lastOccur = bmpLast[text[tIdx]];
if(lastOccur != -1){
    tIdx = tIdx + patternSize - min<int>(pIdx, 1 + lastOccur);
}
else{// Case 3: character is not found in the pattern string
    tIdx = tIdx + patternSize;
}
pIdx = patternSize - 1;

In our case as computeBmpLast() function stores -1 for characters not found in the search pattern. We can safely merge the Case 3 in the above code to look as
//Character Jump Heuristics
int lastOccur = bmpLast[text[tIdx]];
tIdx = tIdx + patternSize - min<int>(pIdx, 1 + lastOccur);
pIdx = patternSize - 1;

Example:




Complete Code Example
#ifndef _PatternMatcher_H_
#define _PatternMatcher_H_
#include <iostream>
#include <string>
#include <vector>
using namespace std;
class PatternMatcher{
public:
    static int bmpSearch(const string& text, const string& pattern);

private:
    static vector<int> computeBmpLast(const string& pattern);

    PatternMatcher();
    PatternMatcher(const PatternMatcher&);
    const PatternMatcher& operator=(const PatternMatcher&);
};
#endif //_PatternMatcher_H_

#include "PatternMatcher.h"
#include <algorithm>

using namespace std;

int PatternMatcher::bmpSearch(const std::string &text, const std::string &pattern){
    size_t textSize = text.size();
    size_t patternSize = pattern.size();
    if(textSize == 0 || patternSize == 0){
        return -1;
    }
    if(patternSize > textSize){
        return -1;
    }

    vector<int> bmpLast = computeBmpLast(pattern);
    size_t tIdx = patternSize - 1;
    size_t pIdx = patternSize - 1;
    while(tIdx < textSize){
        if(pattern[pIdx] == text[tIdx]){
            if(pIdx == 0){   //found a match
                return tIdx;
            }
            tIdx--;
            pIdx--;
        }
        else {
            //Character Jump Heuristics
            int lastOccur = bmpLast[text[tIdx]];
            tIdx = tIdx + patternSize - min<int>(pIdx, 1 + lastOccur);
            pIdx = patternSize - 1;
        }
    }
    return - 1;
}

vector<int> PatternMatcher::computeBmpLast(const std::string &pattern){
    const size_t NUM_ASCII_CHARS = 128;
    vector<int> bmpLast(NUM_ASCII_CHARS);
    for(size_t i = 0; i < NUM_ASCII_CHARS; i++){
        bmpLast[i] = -1;
    }
    for(size_t i = 0; i < pattern.size(); i++){
        bmpLast[pattern[i]] = i;
    }
    return bmpLast;
}

#include "PatternMatcher.h"

int main(){
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "baabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacad")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacaab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "aabaccaba")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacaabaccabacabaabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("", "abacaabaccabacabaabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "bacaabaccabacabaab")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "abacaabac")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "ccabacabaabb")
        << endl;
    cout << PatternMatcher::bmpSearch
        ("abacaabaccabacabaabb", "bacaabaccabacabaabb")
        << endl;

    return 0;
}


Read more ...