sunday 字符串匹配算法的实现(支持二进制匹配)

来源:互联网 发布:微博登录网络异常 编辑:程序博客网 时间:2024/06/06 03:49
之前在解析 multipart/form-data 格式的 HTTP 请求包时, 需要用字符串匹配的方式寻找包体中的 boundary 标记. 这就涉及到了字符串匹配算法, 最后选择了 Sunday 算法.

Sunday 是我所知道的目前最快的单模式字符串匹配算法. 由于请求包体中可能含有二进制数据, 所以把 Sunday 算法改造成了支持二进制串匹配的版本.

sunday算法的原理不多说,网上一搜一大把, 下面贴下我的实现:

/*
 * @desc    : Sunday string pattern matching algorithm
 *            (also supports matching inside binary buffers)
 * @author  : nemozhang
 */
#ifndef __SUNDAY_H_20111203__
#define __SUNDAY_H_20111203__

#include <stdio.h>
#include <vector>

#ifndef u_char
#define u_char unsigned char
#endif

// Single-pattern matcher based on the Sunday algorithm.
//
// Text and pattern are treated as raw bytes, so embedded '\0' bytes and
// bytes >= 0x80 are handled like any other value.
//
// The pattern is NOT copied: SetPatten() only stores the two pointers, so the
// caller must keep the pattern buffer alive and unchanged while searching.
class SundayAlgo
{
public:
        enum
        {
                JUMP_TABLE_LEN                  = 256   // one shift entry per possible byte value
        };

        enum
        {
                MATCH_RULE_STEP_ONE_CHAR        = 0,    // after a hit, resume one byte after the hit start
                MATCH_RULE_STEP_ONE_PATTEN      = 1     // after a hit, resume one pattern length after the hit start
        };

public:
        SundayAlgo():
                _jump_table_inited(false),
                _pat_start(0),
                _pat_end(0),
                _match_rule(MATCH_RULE_STEP_ONE_CHAR)
        {}

public:
        // Finds the first occurrence of the current pattern in
        // [text_start, text_end).
        //
        // @return offset of the match relative to text_start, or -1 when the
        //         text is empty, no (non-empty) pattern has been set, or the
        //         pattern does not occur.
        int Search(const char *text_start, const char *text_end) const
        {
                if (text_start >= text_end || !_jump_table_inited)
                {
                        return -1;
                }

                const int text_len = static_cast<int>(text_end - text_start);
                const int pat_len = static_cast<int>(_pat_end - _pat_start);
                // Largest window start that still fits inside the text
                // (negative when the text is shorter than the pattern).
                const int last_start = text_len - pat_len;

                int i = 0;
                while (i <= last_start)
                {
                        // Compare the window against the pattern from back to front.
                        int j = pat_len - 1;
                        while (j >= 0 && text_start[i + j] == _pat_start[j])
                        {
                                --j;
                        }
                        if (j < 0)
                        {
                                return i;       // every byte matched
                        }

                        // The shift is chosen from the byte just past the window.
                        // For the last window that byte would be *text_end, which
                        // is outside the range, so stop instead of reading it.
                        if (i == last_start)
                        {
                                break;
                        }
                        const unsigned char next_c = static_cast<unsigned char>(text_start[i + pat_len]);
                        i += _jump_table[next_c];
                }
                return -1;
        }

        // Appends the offset (relative to text_start) of every match found in
        // [text_start, text_end) to pos_vec. Where the scan resumes after a hit
        // is controlled by SetMatchRule(). pos_vec is not cleared first.
        void Search(const char *text_start, const char *text_end, std::vector<int> &pos_vec) const
        {
                const int pat_len = static_cast<int>(_pat_end - _pat_start);
                // Any rule other than MATCH_RULE_STEP_ONE_CHAR skips a whole pattern.
                const int step = (MATCH_RULE_STEP_ONE_CHAR == _match_rule) ? 1 : pat_len;

                const char *cursor = text_start;
                for (;;)
                {
                        const int pos = Search(cursor, text_end);
                        if (pos == -1)
                        {
                                break;
                        }
                        pos_vec.push_back(static_cast<int>(cursor - text_start) + pos);
                        cursor += pos + step;
                }
        }

        // Sets the pattern to [pat_start, pat_end) (pat_end is excluded) and
        // rebuilds the shift table. An empty range leaves the matcher without a
        // usable pattern, so subsequent searches return -1.
        void SetPatten(const char* pat_start, const char* pat_end)
        {
                _pat_start = pat_start;
                _pat_end = pat_end;
                PreCompute(pat_start, pat_end);
        }

        // Selects how the multi-match Search() advances after a hit.
        // Example: text "aaaaaa", pattern "aaa"
        //   MATCH_RULE_STEP_ONE_CHAR   -> 4 matches (overlapping)
        //   MATCH_RULE_STEP_ONE_PATTEN -> 2 matches (non-overlapping)
        void SetMatchRule(int rule)
        {
                _match_rule = rule;
        }

private:
        // Builds the shift table for [pat_start, pat_end).
        void PreCompute(const char* pat_start, const char* pat_end)
        {
                // Invalidate first so a previous pattern's table is never
                // reused when the new pattern turns out to be empty.
                _jump_table_inited = false;
                if (pat_start >= pat_end)
                {
                        return;
                }

                const int pat_len = static_cast<int>(pat_end - pat_start);

                // A byte that does not occur in the pattern lets the window
                // move completely past it: pattern length + 1.
                for (int i = 0; i < JUMP_TABLE_LEN; ++i)
                {
                        _jump_table[i] = pat_len + 1;
                }

                // Later occurrences overwrite earlier ones, so each byte ends
                // up with the distance from its LAST occurrence to pat_end.
                for (const char* p = pat_start; p != pat_end; ++p)
                {
                        _jump_table[static_cast<unsigned char>(*p)] = static_cast<int>(pat_end - p);
                }

                _jump_table_inited = true;
        }

private:
        // Shifts are stored as int: an unsigned char entry would truncate any
        // shift above 255, and a shift of exactly 256 would become 0 and stall
        // the search loop.
        int             _jump_table[JUMP_TABLE_LEN];
        bool            _jump_table_inited;
        const char     *_pat_start;
        const char     *_pat_end;
        int             _match_rule;
};

#endif


测试用例:
// by nemozhang#include <gtest/gtest.h>#include "sunday.h"#include <unistd.h>using namespace std;TEST(autorun_SundayAlgo, test_ascii_str) {    const char *text = "sunhello world !\n taday is sunday, i feel good now.\nthis is a text for sunday algo test program.day, sunhow,dslasun.sdslsunday" ;    const char *pat = "sunday";    int text_len = strlen(text);    int pat_len = strlen(pat);    SundayAlgo sunday;    const char * pat_start = (const char*)pat;    const char * pat_end = pat_start + pat_len;    sunday.SetPatten(pat_start, pat_end);    vector<int> pos_vec;    sunday.Search(text, text + text_len, pos_vec);    printf("hit times : %d\n", pos_vec.size());    for (size_t i=0; i<pos_vec.size(); ++i)    {        printf("the %u time : %d\n", i, pos_vec[i]);        for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j)        {            if (j >= text_len)            {                break;            }            printf("%c",text[j]);        }        printf("\n");    }}TEST(autorun_SundayAlgo, test_binary_str) {        u_char text[] = {1,2,255,253,0,255,0,253,0,3,4,5,6,7,8,9,0,0,1,2,3,4,0,0,1,2,0,4,5,0,9,6,4,2,0,0,0,0,0,0,3,2,1,1,2,3,4,5,6,7,0,3,4,6,55,4,2,3,4,234,12,111,255,253,0,255,253,0,255,253,0};        //u_char pat[] = {255,253,0};        u_char pat[] = {0,0,0};        int text_len = sizeof(text);        int pat_len = sizeof(pat);        SundayAlgo sunday;        const char * pat_start = (const char*)pat;        const char * pat_end = pat_start + pat_len;        sunday.SetPatten(pat_start, pat_end);        vector<int> pos_vec;        sunday.SetMatchRule(SundayAlgo::MATCH_RULE_STEP_ONE_PATTEN);        sunday.Search((const char*)text, (const char*)text + text_len, pos_vec);        printf("\n");        printf("\n");        printf("hit times : %d\n", pos_vec.size());        for (size_t i=0; i<pos_vec.size(); ++i)        {                printf("the %u time : %d\n", i, pos_vec[i]);                for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j)                {     
                   if (j >= text_len)                        {                                break;                        }                        printf("%d,",text[j]);                }                printf("\n");        }}

输出如下:
nemo@vm04_sles10:[unittest]$ ./sunday_unittest 
Running main() from gtest_main.cc
[==========] Running 2 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 2 tests from autorun_SundayAlgo
[ RUN      ] autorun_SundayAlgo.test_ascii_str
hit times : 3
the 0 time : 27
sunday, i f
the 1 time : 71
sunday algo
the 2 time : 120
sunday
[       OK ] autorun_SundayAlgo.test_ascii_str (0 ms)
[ RUN      ] autorun_SundayAlgo.test_binary_str




hit times : 2
the 0 time : 34
0,0,0,0,0,0,3,2,
the 1 time : 37
0,0,0,3,2,1,1,2,
[       OK ] autorun_SundayAlgo.test_binary_str (0 ms)
[----------] 2 tests from autorun_SundayAlgo (0 ms total)


[----------] Global test environment tear-down
[==========] 2 tests from 1 test case ran. (0 ms total)
[  PASSED  ] 2 tests.

原创粉丝点击