字符串匹配算法之总结

来源:互联网 发布:税友软件深圳分公司 编辑:程序博客网 时间:2024/05/19 05:02

问题提出:

给定字符串text和pattern,判断pattern是否为text的子串;若是,请返回最先匹配的位置(下标从0开始),否则返回-1。


问题解决:

       1.暴力搜索:从text的每个位置开始逐字符与pattern比较,最坏时间复杂度为O(n*m);

       2.KMP 算法:涉及到求pattern的部分匹配表(即前缀函数,代码中命名为suffix数组,注意它并不是通常意义上的"后缀数组"),以保证每次的有效偏移,加快搜索;

       3.Boyer-Moore算法:详述请参见维基百科;

       4.Karp-Rabin 算法:详述请参见维基百科;


实现代码:

#ifndef  _PATTERN_SEARCH_H_
#define  _PATTERN_SEARCH_H_

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/*
 * Substring search: four implementations with the same contract.
 *
 * Every PatternMatch* function returns the zero-based index of the first
 * occurrence of the pattern inside the text, or -1 when there is none.
 * An empty pattern matches at index 0. The result is an int, so texts
 * longer than INT_MAX characters are not supported.
 *
 * All functions are `inline` because they are defined in a header; without
 * it, including this file from two translation units fails at link time.
 */

/*
 * Brute-force search.
 *
 * Tries every start position in `text` and compares character by
 * character. Worst case O(textLen * patternLen).
 *
 * text, pattern: NUL-terminated strings, must not be NULL (asserted).
 * Returns the index of the first match, or -1.
 */
inline int PatternMatchBrute( const char* text, const char* pattern )
{
    assert( text );
    assert( pattern );

    const size_t textLen = strlen( text );
    const size_t patternLen = strlen( pattern );

    /* `i + patternLen <= textLen` instead of `i <= textLen - patternLen`:
       the subtraction wraps around in size_t when the pattern is longer
       than the text and would send the scan out of bounds. */
    for( size_t i = 0; i + patternLen <= textLen; i++ )
    {
        size_t j = 0;
        while( j < patternLen && text[i + j] == pattern[j] )
        {
            j++;
        }

        if( j == patternLen )
        {
            return (int)i;
        }
    }

    return -1;
}

/*
 * Helper for KMP: builds the prefix-function (failure) table of `pattern`.
 *
 * Entry i holds the length of the longest proper prefix of
 * pattern[0..i] that is also a suffix of pattern[0..i].
 *
 * pattern: must not be NULL (asserted); only the first `len` chars are read.
 * len:     number of characters of `pattern` to process.
 * Returns an array of len + 1 ints (the last entry is always 0) allocated
 * with new[]; the caller owns it and must release it with delete [].
 */
inline int* CalcSuffix( const char* pattern, size_t len )
{
    assert( pattern );

    int* suffix = new int[ len + 1 ];
    memset( suffix, 0x00, sizeof(int) * ( len + 1 ) );

    int j = 0; /* length of the currently matched border */
    for( size_t i = 1; i < len; i++ )
    {
        /* Fall back to shorter borders until one can be extended. */
        while( j > 0 && pattern[i] != pattern[j] )
        {
            j = suffix[j - 1];
        }

        if( pattern[i] == pattern[j] )
        {
            suffix[i] = ++j;
        }
    }

    return suffix;
}

/*
 * Knuth-Morris-Pratt search.
 *
 * Scans `text` once; on a mismatch the number of already matched pattern
 * characters falls back through the table from CalcSuffix, so the text
 * index never moves backwards. O(textLen + patternLen).
 *
 * text, pattern: NUL-terminated strings, must not be NULL (asserted).
 * Returns the index of the first match, or -1.
 */
inline int PatternMatchKMP( const char* text, const char* pattern )
{
    assert( text );
    assert( pattern );

    const size_t textLen = strlen( text );
    const size_t patternLen = strlen( pattern );

    if( patternLen == 0 )
    {
        return 0;
    }
    if( patternLen > textLen )
    {
        return -1;
    }

    int* suffix = CalcSuffix( pattern, patternLen );
    int result = -1;

    size_t matched = 0; /* pattern chars matched ending at the current text char */
    for( size_t i = 0; i < textLen; i++ )
    {
        while( matched > 0 && text[i] != pattern[matched] )
        {
            matched = (size_t)suffix[matched - 1];
        }

        if( text[i] == pattern[matched] )
        {
            matched++;
        }

        if( matched == patternLen )
        {
            result = (int)( i + 1 - patternLen );
            break;
        }
    }

    delete [] suffix;
    return result;
}

/*
 * Boyer-Moore search using only the bad-character rule.
 *
 * Compares the pattern right-to-left and, on a mismatch, shifts so that
 * the rightmost occurrence of the offending text character in the pattern
 * lines up with it (always shifting by at least 1).
 *
 * text, pattern: NUL-terminated strings, must not be NULL (asserted).
 * Returns the index of the first match, or -1.
 */
inline int PatternMatchBME( const char* text, const char* pattern )
{
    assert( text );
    assert( pattern );

    enum { kAlphabetSize = 256 }; /* one slot per possible byte value */

    /* right[c] = rightmost index of byte c in the pattern, -1 if absent. */
    int right[kAlphabetSize];
    for( int c = 0; c < kAlphabetSize; c++ )
    {
        right[c] = -1;
    }

    const int len = (int)strlen( pattern );
    for( int i = 0; i < len; i++ )
    {
        /* Cast through unsigned char: plain char may be signed, and a
           negative value would index before the start of the table. */
        right[(unsigned char)pattern[i]] = i;
    }

    const int strLen = (int)strlen( text );
    int skip = 0;
    for( int i = 0; i <= strLen - len; i += skip )
    {
        skip = 0;
        for( int j = len - 1; j >= 0; j-- )
        {
            if( text[i + j] != pattern[j] )
            {
                skip = j - right[(unsigned char)text[i + j]]; /* key point */
                if( skip < 1 )
                {
                    skip = 1;
                }
                break;
            }
        }

        /* skip stays 0 only when every pattern character matched. */
        if( 0 == skip )
        {
            return i;
        }
    }

    return -1;
}

/*
 * Helper for Rabin-Karp: returns base^(len - 1) mod prime, the weight of
 * the leading character in a window of `len` characters.
 * Returns 1 when len <= 1. `base * prime` must fit in an int.
 */
inline int hashValue( int len, int base, int prime )
{
    int res = 1;
    for( int i = 0; i < len - 1; i++ )
    {
        res = ( res * base ) % prime;
    }

    return res;
}

/*
 * Rabin-Karp search with a rolling hash (base 256, modulus 101).
 *
 * A window is compared byte-by-byte only when its hash equals the
 * pattern hash, so hash collisions cannot produce false positives.
 *
 * str, pat: NUL-terminated strings, must not be NULL (asserted).
 * Returns the index of the first match, or -1.
 */
inline int PatternMatchKRB( const char* str, const char* pat )
{
    assert( str );
    assert( pat );

    const int base = 256;
    const int prime = 101;
    const int len = (int)strlen( pat );
    const int strLen = (int)strlen( str );

    /* Must be checked before hashing: the initial window reads `len`
       characters of str, which would run past its terminator otherwise. */
    if( len > strLen )
    {
        return -1;
    }

    const int h = hashValue( len, base, prime );

    /* Bytes are hashed as unsigned char so both hashes stay in
       [0, prime); with signed char the pattern hash could be negative
       and never equal the normalised rolling hash. */
    int hashStr = 0;
    int hashPat = 0;
    for( int i = 0; i < len; i++ )
    {
        hashStr = ( hashStr * base + (unsigned char)str[i] ) % prime;
        hashPat = ( hashPat * base + (unsigned char)pat[i] ) % prime;
    }

    for( int i = 0; i <= strLen - len; i++ )
    {
        if( hashStr == hashPat && memcmp( str + i, pat, (size_t)len ) == 0 )
        {
            return i;
        }

        if( i < strLen - len )
        {
            /* Drop str[i], shift the window, append str[i + len]. */
            hashStr = ( base * ( hashStr - (unsigned char)str[i] * h )
                        + (unsigned char)str[i + len] ) % prime;
            if( hashStr < 0 )
            {
                hashStr += prime;
            }
        }
    }

    return -1;
}

/*
 * Test interface.
 *
 * Runs all four searches on a fixed sample and asserts that each agrees
 * with strstr(). Then waits for one character on stdin, as the original
 * did (presumably to keep a console window open).
 */
inline void TestPatternSearch()
{
    const char* text = "acccwdocccwwhowccsiowiowwwccwweioewchcccandccswwveaoiewddddiweoicccchacccwwsfchadchanddsoisndochandischurcccchandchinawitnessbrchandeakoutmiters";
    const char* pattern = "cccwws";

    const char* substr = strstr( text, pattern );
    /* strstr returns NULL on no match; subtracting from NULL is undefined. */
    const int pos = substr ? (int)( substr - text ) : -1;

    const int newPos = PatternMatchBrute( text, pattern );
    const int kmpPos = PatternMatchKMP( text, pattern );
    const int boyerPos = PatternMatchBME( text, pattern );
    const int krbPos = PatternMatchKRB( text, pattern );

    assert( newPos == pos );
    assert( kmpPos == pos );
    assert( boyerPos == pos );
    assert( krbPos == pos );

    /* Keep the values "used" when asserts are compiled out (NDEBUG). */
    (void)pos;
    (void)newPos;
    (void)kmpPos;
    (void)boyerPos;
    (void)krbPos;

    getchar();
}

#endif

0 0
原创粉丝点击