The KMP algorithm compares the pattern string to the text in left to right direction as compared to Boyer Moore Algorithm. The algorithm shifts the pattern more intelligently than the brute-force algorithm.
Key Idea
Whenever a mismatch occurs, what is the most we can shift the pattern so as to avoid redundant comparisons?
Answer: The largest prefix of P[0..j] that is a suffix of [1..j]
The KMP algorithm pre-processes the pattern string to find matches of the prefixes of the pattern with the pattern itself. The information thus calculated is used to shift the pattern appropriately whenever a mismatch occurs or a comparison fails. The compuatation is performed by the function called KMP prefix function
KMP Prefix Function
The prefix function F(j) is defined as the size of the largest prefix in P[0..j] that is also a suffix of P[1..j]
Code for computing the KMP Prefix Function
computeKmpPrefix(const std::string &pattern){ int patternSize = pattern.size(); vector<int> kmpPrefix(patternSize); size_t prefixPos = 0; size_t suffixPos = 1; while(suffixPos < patternSize){ if(pattern[prefixPos] == pattern[suffixPos]){ kmpPrefix[suffixPos] = prefixPos + 1; prefixPos++; suffixPos++; } else if(prefixPos > 0){//found some match prefixPos = kmpPrefix[prefixPos -1];//backtrack for matching prefix e.g. aaaaabaaaaaa } else{ kmpPrefix[suffixPos] = 0; suffixPos++; } } return kmpPrefix; }
The algorithm in picture
Example
The Complete Code
#ifndef _PatternMatcher_H_ #define _PatternMatcher_H_ #include <iostream> #include <string> #include <vector> using namespace std; class PatternMatcher{ public: static int kmpSearch(const string& text, const string& pattern); private: static vector<int> computeKmpPrefix(const string& pattern); PatternMatcher(); PatternMatcher(const PatternMatcher&); const PatternMatcher& operator=(const PatternMatcher&); }; #endif //_PatternMatcher_H_
#include "PatternMatcher.h" vector<int> PatternMatcher::computeKmpPrefix(const std::string &pattern){ int patternSize = pattern.size(); vector<int> kmpPrefix(patternSize); size_t prefixPos = 0; size_t suffixPos = 1; while(suffixPos < patternSize){ if(pattern[prefixPos] == pattern[suffixPos]){ kmpPrefix[suffixPos] = prefixPos + 1; prefixPos++; suffixPos++; } else if(prefixPos > 0){//found some match prefixPos = kmpPrefix[prefixPos -1];//backtrack for matching prefix e.g. aaaaabaaaaaa } else{ kmpPrefix[suffixPos] = 0; suffixPos++; } } return kmpPrefix; } int PatternMatcher::kmpSearch(const std::string &text, const std::string &pattern){ size_t textSize = text.size(); size_t patternSize = pattern.size(); if(patternSize > textSize) return -1; vector<int> kmpNext = computeKmpPrefix(pattern); int tIdx = 0; int pIdx = 0; while(tIdx < textSize){ if(pattern[pIdx] == text[tIdx]){ if(pIdx == patternSize - 1) return tIdx - (patternSize - 1); tIdx++; pIdx++; } else if(pIdx > 0){ pIdx = kmpNext[pIdx - 1]; } else{ tIdx++; } } return -1; }
#include "PatternMatcher.h" int main(){ cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "abacab") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "baabb") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "abacad") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "abacaab") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "abacab") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "aabaccaba") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "abacaabaccabacabaabb") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "") << endl; cout << PatternMatcher::kmpSearch ("", "abacaabaccabacabaabb") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "bacaabaccabacabaab") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "abacaabac") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "ccabacabaabb") << endl; cout << PatternMatcher::kmpSearch ("abacaabaccabacabaabb", "bacaabaccabacabaabb") << endl; return 0; }
Read more ...