sunday 字符串匹配算法的实现(支持二进制匹配)
来源:互联网 发布:微博登录网络异常 编辑:程序博客网 时间:2024/06/06 03:49
之前在解析 multipart/form-data 格式的 HTTP 请求包时, 需要用字符串匹配的方式在包体中查找 boundary 标记. 这就涉及到了字符串匹配算法, 最后选择了 Sunday 算法.
输出如下:
nemo@vm04_sles10:[unittest]$ ./sunday_unittest
Running main() from gtest_main.cc
[==========] Running 2 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 2 tests from autorun_SundayAlgo
[ RUN ] autorun_SundayAlgo.test_ascii_str
hit times : 3
the 0 time : 27
sunday, i f
the 1 time : 71
sunday algo
the 2 time : 120
sunday
[ OK ] autorun_SundayAlgo.test_ascii_str (0 ms)
[ RUN ] autorun_SundayAlgo.test_binary_str
hit times : 2
the 0 time : 34
0,0,0,0,0,0,3,2,
the 1 time : 37
0,0,0,3,2,1,1,2,
[ OK ] autorun_SundayAlgo.test_binary_str (0 ms)
[----------] 2 tests from autorun_SundayAlgo (0 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 1 test case ran. (0 ms total)
[ PASSED ] 2 tests.
Sunday 是我目前所知最快的单模式字符串匹配算法. 由于请求包体中可能含有二进制数据, 所以把 Sunday 算法改造成了支持二进制串匹配的版本.
sunday算法的原理不多说,网上一搜一大把, 下面贴下我的实现:
/*
 * @desc   : Sunday string pattern matching algorithm (also supports matching
 *           a pattern inside an arbitrary binary buffer)
 * @author : nemozhang
 */
#ifndef __SUNDAY_H_20111203__
#define __SUNDAY_H_20111203__

#include <stdio.h>
#include <vector>

#ifndef u_char
#define u_char unsigned char
#endif

/**
 * Single-pattern searcher based on the Sunday algorithm.
 *
 * Usage: call SetPatten() once, then call Search() any number of times.
 * The object stores only the two pattern pointers, not a copy of the bytes,
 * so the pattern buffer must stay valid for as long as Search() is called.
 *
 * Both text and pattern are treated as raw bytes; embedded zero bytes are
 * ordinary data.
 */
class SundayAlgo
{
public:
    enum
    {
        JUMP_TABLE_LEN = 256  // one shift entry per possible byte value
    };

    enum
    {
        // After a hit, resume one byte after the start of the hit
        // (overlapping hits are reported).
        MATCH_RULE_STEP_ONE_CHAR = 0,
        // After a hit, resume right after the end of the hit
        // (hits never overlap).
        MATCH_RULE_STEP_ONE_PATTEN = 1
    };

public:
    SundayAlgo()
        : _jump_table(),
          _jump_table_inited(false),
          _pat_start(0),
          _pat_end(0),
          _match_rule(MATCH_RULE_STEP_ONE_CHAR)
    {
    }

public:
    /**
     * Finds the first occurrence of the pattern in [text_start, text_end).
     *
     * @param text_start first byte of the text
     * @param text_end   one past the last byte of the text (never read)
     * @return offset of the first hit relative to text_start, or -1 when the
     *         text is empty, no non-empty pattern has been set, or the
     *         pattern does not occur.
     */
    int Search(const char *text_start, const char *text_end) const
    {
        if (text_start >= text_end || !_jump_table_inited)
        {
            return -1;
        }

        const int text_len = static_cast<int>(text_end - text_start);
        const int pat_len = static_cast<int>(_pat_end - _pat_start);
        if (pat_len <= 0 || pat_len > text_len)
        {
            return -1;
        }

        // Largest window start that still leaves room for the whole pattern.
        const int last_start = text_len - pat_len;

        int i = 0;
        while (i <= last_start)
        {
            // Compare the current window against the pattern, back to front.
            int k = pat_len - 1;
            while (k >= 0 && text_start[i + k] == _pat_start[k])
            {
                --k;
            }
            if (k < 0)
            {
                return i;  // every byte matched
            }

            // The shift is chosen from the byte just past the window. For
            // the last window that byte would be *text_end, which lies
            // outside the buffer, so stop instead of reading it.
            if (i == last_start)
            {
                break;
            }
            const unsigned char next_c =
                static_cast<unsigned char>(text_start[i + pat_len]);
            i += _jump_table[next_c];
        }
        return -1;
    }

    /**
     * Finds every occurrence of the pattern in [text_start, text_end) and
     * appends each offset (relative to text_start) to pos_vec. pos_vec is
     * not cleared first. Where the scan resumes after a hit depends on the
     * rule set through SetMatchRule().
     */
    void Search(const char *text_start, const char *text_end,
                std::vector<int> &pos_vec) const
    {
        const int pat_len = static_cast<int>(_pat_end - _pat_start);
        const int step =
            (MATCH_RULE_STEP_ONE_CHAR == _match_rule) ? 1 : pat_len;

        const char *cursor = text_start;
        for (;;)
        {
            const int pos = Search(cursor, text_end);
            if (pos == -1)
            {
                break;
            }
            pos_vec.push_back(static_cast<int>(cursor - text_start) + pos);
            cursor += pos + step;
        }
    }

    /**
     * Sets the pattern to [pat_start, pat_end) (pat_end itself excluded) and
     * rebuilds the jump table. An empty range leaves the searcher without a
     * usable pattern, so later Search() calls report no hit.
     */
    void SetPatten(const char *pat_start, const char *pat_end)
    {
        _pat_start = pat_start;
        _pat_end = pat_end;
        // Invalidate first: PreCompute() returns early on an empty range and
        // must not leave a table built for an earlier pattern marked valid.
        _jump_table_inited = false;
        PreCompute(pat_start, pat_end);
    }

    /**
     * Sets the rule used by the multi-hit Search().
     * Example with text "aaaaaa" and pattern "aaa":
     *   MATCH_RULE_STEP_ONE_CHAR   -> 4 hits (offsets 0, 1, 2, 3)
     *   MATCH_RULE_STEP_ONE_PATTEN -> 2 hits (offsets 0, 3)
     * Any value other than MATCH_RULE_STEP_ONE_CHAR behaves like
     * MATCH_RULE_STEP_ONE_PATTEN.
     */
    void SetMatchRule(int rule)
    {
        _match_rule = rule;
    }

private:
    /**
     * Builds the jump table for [pat_start, pat_end).
     * A byte that does not occur in the pattern shifts the window by
     * pat_len + 1; a byte that does occur shifts it so that its last
     * occurrence in the pattern lines up with the byte after the window.
     */
    void PreCompute(const char *pat_start, const char *pat_end)
    {
        if (pat_start >= pat_end)
        {
            return;
        }

        const int pat_len = static_cast<int>(pat_end - pat_start);

        for (int i = 0; i < JUMP_TABLE_LEN; ++i)
        {
            _jump_table[i] = pat_len + 1;
        }

        // Later occurrences overwrite earlier ones, so each byte ends up
        // with the distance from its last occurrence to the pattern end.
        for (const char *p = pat_start; p != pat_end; ++p)
        {
            _jump_table[static_cast<unsigned char>(*p)] =
                static_cast<int>(pat_end - p);
        }

        _jump_table_inited = true;
    }

private:
    // Shifts are stored as int: a byte-sized entry would truncate shifts for
    // patterns of 255 bytes or more (a shift of 256 would become 0).
    int _jump_table[JUMP_TABLE_LEN];
    bool _jump_table_inited;
    const char *_pat_start;
    const char *_pat_end;
    int _match_rule;
};

#endif
// by nemozhang#include <gtest/gtest.h>#include "sunday.h"#include <unistd.h>using namespace std;TEST(autorun_SundayAlgo, test_ascii_str) { const char *text = "sunhello world !\n taday is sunday, i feel good now.\nthis is a text for sunday algo test program.day, sunhow,dslasun.sdslsunday" ; const char *pat = "sunday"; int text_len = strlen(text); int pat_len = strlen(pat); SundayAlgo sunday; const char * pat_start = (const char*)pat; const char * pat_end = pat_start + pat_len; sunday.SetPatten(pat_start, pat_end); vector<int> pos_vec; sunday.Search(text, text + text_len, pos_vec); printf("hit times : %d\n", pos_vec.size()); for (size_t i=0; i<pos_vec.size(); ++i) { printf("the %u time : %d\n", i, pos_vec[i]); for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j) { if (j >= text_len) { break; } printf("%c",text[j]); } printf("\n"); }}TEST(autorun_SundayAlgo, test_binary_str) { u_char text[] = {1,2,255,253,0,255,0,253,0,3,4,5,6,7,8,9,0,0,1,2,3,4,0,0,1,2,0,4,5,0,9,6,4,2,0,0,0,0,0,0,3,2,1,1,2,3,4,5,6,7,0,3,4,6,55,4,2,3,4,234,12,111,255,253,0,255,253,0,255,253,0}; //u_char pat[] = {255,253,0}; u_char pat[] = {0,0,0}; int text_len = sizeof(text); int pat_len = sizeof(pat); SundayAlgo sunday; const char * pat_start = (const char*)pat; const char * pat_end = pat_start + pat_len; sunday.SetPatten(pat_start, pat_end); vector<int> pos_vec; sunday.SetMatchRule(SundayAlgo::MATCH_RULE_STEP_ONE_PATTEN); sunday.Search((const char*)text, (const char*)text + text_len, pos_vec); printf("\n"); printf("\n"); printf("hit times : %d\n", pos_vec.size()); for (size_t i=0; i<pos_vec.size(); ++i) { printf("the %u time : %d\n", i, pos_vec[i]); for (int j=pos_vec[i]; j<pos_vec[i]+pat_len+5; ++j) { if (j >= text_len) { break; } printf("%d,",text[j]); } printf("\n"); }}
输出如下:
nemo@vm04_sles10:[unittest]$ ./sunday_unittest
Running main() from gtest_main.cc
[==========] Running 2 tests from 1 test case.
[----------] Global test environment set-up.
[----------] 2 tests from autorun_SundayAlgo
[ RUN ] autorun_SundayAlgo.test_ascii_str
hit times : 3
the 0 time : 27
sunday, i f
the 1 time : 71
sunday algo
the 2 time : 120
sunday
[ OK ] autorun_SundayAlgo.test_ascii_str (0 ms)
[ RUN ] autorun_SundayAlgo.test_binary_str
hit times : 2
the 0 time : 34
0,0,0,0,0,0,3,2,
the 1 time : 37
0,0,0,3,2,1,1,2,
[ OK ] autorun_SundayAlgo.test_binary_str (0 ms)
[----------] 2 tests from autorun_SundayAlgo (0 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 1 test case ran. (0 ms total)
[ PASSED ] 2 tests.
- sunday 字符串匹配算法的实现(支持二进制匹配)
- 字符串匹配Sunday算法实现
- 字符串匹配Sunday算法实现
- 字符串匹配---Sunday算法
- 字符串匹配Sunday算法
- 字符串匹配-sunday算法
- 字符串匹配--Sunday算法
- 字符串匹配 sunday算法
- 字符串匹配sunday算法
- Sunday 字符串匹配算法
- Sunday字符串匹配算法
- 字符串匹配-Sunday算法
- Sunday字符串匹配算法,java实现
- 字符串匹配--Sunday算法-C语言实现
- 字符串匹配Sunday算法C++实现
- 字符串匹配Sunday算法C++实现
- 字符串匹配sunday算法java实现
- 字符串匹配算法Sunday实现(二)
- HDU4475(找规律+预处理加速)
- 自定义Android带图片和文字的ImageButton
- java中的日期类
- Android HttpURLConnection应用技巧分享
- 移植linux-2.6.29+busybox最小系统到单板OK6410
- sunday 字符串匹配算法的实现(支持二进制匹配)
- (转载)虚继承之单继承的内存布局
- Android网络连接之HttpURLConnection和HttpClient
- 黑马程序员 WPF页面布局的基础随笔摘录
- VBS生成Excel报告的常用操作
- Hadoop 稳定性与性能提升浅析
- JSP九大内置对象及常用方法
- Android开发 按钮点击事件的几种写法
- HDU 3376 费用流