如何设计高效的聊天过滤词算法?

来源:互联网 发布:md5加密 java验证 编辑:程序博客网 时间:2024/04/30 17:17

关于聊天过滤词算法,一直困扰着我,了解到很多算法,比如:KMP, 正则循环匹配等,然后在http://www.dewen.org/q/41/%E5%A6%82%E4%BD%95%E8%AE%BE%E8%AE%A1%E9%AB%98%E6%95%88%E7%9A%84%E8%81%8A%E5%A4%A9%E8%BF%87%E6%BB%A4%E8%AF%8D%E7%AE%97%E6%B3%95%EF%BC%9F看到了一篇文章,现摘要几种相对好的答案,以备不时之需。

1trie树算法

我们的解决方法是用构造一个tire树。 每个节点都存储0- 256个字符。

用脏词字典来构造这个树。

树的结构大概如下

具体实现代码如下:

namespace KGame{class WordFilter{public:    WordFilter() {}    ~WordFilter()     {        Clean(&m_Filter);    }    void AddWord(const char* word)    {        UInt32 len = (UInt32)strlen(word);        Filter* filter = &m_Filter;        for (UInt32 i = 0; i < len; i++)        {            unsigned char c = word[i];            if (i == len - 1)            {                filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_IS_END;                break;            }            else            {                filter->m_NodeArray[c].m_Flag |= FilterNode::NODE_HAS_NEXT;            }            if (filter->m_NodeArray[c].m_NextFilter == NULL)            {                Filter* tmpFilter = XNEW (Filter)();                filter->m_NodeArray[c].m_NextFilter = tmpFilter;            }            filter = (Filter *)filter->m_NodeArray[c].m_NextFilter;        }    }    void AddWords(const std::set<std::string>& wordList)    {        for (std::set<std::string>::const_iterator it = wordList.begin();            it != wordList.end(); it++)        {            AddWord(it->c_str());        }    }    void AddWords(const std::vector<std::string>& wordList)    {        for (std::vector<std::string>::const_iterator it = wordList.begin();            it != wordList.end(); it++)        {            AddWord(it->c_str());        }    }    void AddWords(const KGame::Set<std::string>& worldList)    {        for (KGame::Set<std::string>::Iter* iter = worldList.Begin();            iter != worldList.End(); iter = worldList.Next(iter))        {            AddWord(iter->m_Value.c_str());        }    }    Int32 Check(const char* str)    {        Filter* filter = NULL;        for (Int32 i = 0; i < (int)strlen(str) - 1; i++)        {            filter = &m_Filter;            for (UInt32 j = i; j < strlen(str); j++)            {                unsigned char c = str[j];                 if ((c >= 'A' && c <= 'Z'))                {                    c += 32;                }                if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL)                {                    break;                }                else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END)                {                    return i;                }                else // NODE_HAS_NEXT                {                    filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;                }            }        }        return -1;    }    void CheckAndModify(char* str, const char replace = '*')    {        Filter* filter = NULL;        for (Int32 i = 0; i < (int)strlen(str) - 1; i++)        {            filter = &m_Filter;            for (UInt32 j = i; j < strlen(str); j++)            {                unsigned char c = str[j];                 if ((c >= 'A' && c <= 'Z'))                {                    c += 32;                }                if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL)                {                    break;                }                else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END)                {                    for (UInt32 k = i; k <= j; k++)                    {                        str[k] = replace;                    }                    if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT)                    {                        filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;                    }                    else                    {                        continue;                    }                }                else // NODE_HAS_NEXT                {                    filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;                }            }        }    }    void CheckAndModify(std::string& str, const char replace = '*')    {        Filter* filter = NULL;        for (Int32 i = 0; i < (int)str.size() - 1; i++)        {            filter = &m_Filter;            for (UInt32 j = i; j < str.size(); j++)            {                unsigned char c = str[j];                 if ((c >= 'A' && c <= 'Z'))                {                    c += 32;                }                if (filter->m_NodeArray[c].m_Flag == FilterNode::NODE_IS_NULL)                {                    break;                }                else if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_IS_END)                {                    for (UInt32 k = i; k <= j; k++)                    {                        str[k] = replace;                    }                    if (filter->m_NodeArray[c].m_Flag & FilterNode::NODE_HAS_NEXT)                    {                        filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;                    }                    else                    {                        continue;                    }                }                else // NODE_HAS_NEXT                {                    filter = (Filter*)filter->m_NodeArray[c].m_NextFilter;                }            }        }    }private:    struct FilterNode    {        char m_Flag;        void* m_NextFilter;        enum Flag        {            NODE_IS_NULL          = 0x00,            NODE_HAS_NEXT         = 0x01,            NODE_IS_END           = 0x10,        };        FilterNode() : m_Flag(NODE_IS_NULL), m_NextFilter(NULL) {}    };    struct Filter    {        FilterNode m_NodeArray[256];    } m_Filter;    void Clean(Filter* filter)    {        for (UInt32 i = 0; i < 256; i++)        {            if (filter->m_NodeArray[i].m_NextFilter)            {                Clean((Filter *)filter->m_NodeArray[i].m_NextFilter);                XDELETE((Filter*)filter->m_NodeArray[i].m_NextFilter);            }        }    }};} // namespace KGame

2.基于KMP算法

聊天过滤词算法的解决思路
提高过滤的算法个人认为主要从两个方面考虑:(1)尽量减少内存、IO的次数。(2)增加串内查找的速度。
基于这两点我想采用连续的内存片,可以减少内存地址跳跃的次数,采用静态的内存这就解决了(1)的问题,第二点是增加串内查找的速度,这个比较公认的事KMP算法

class WordFilter{public:WordFilter();~WordFilter();void Init();void FilterWord(string& word);int  Index_KMP(const char* S, const char* T, int pos);private:std::set<string> m_storage;const char** m_words;uint32 m_count;};WordFilter::WordFilter(){m_words = NULL;m_count = 0;}WordFilter::~WordFilter(){if(m_words) {free(m_words);}}void WordFilter::Init(){// 把所有屏蔽词都放到m_storage里m_count = m_storage.size();if(m_count) {m_words = (const char**)malloc(sizeof(char*)*m_count);std::set<string>::iterator ptr;int i = 0;for(ptr = m_storage.begin(); ptr != m_storage.end(); ++ptr,i++) {m_words[i] = ptr->c_str();}}}static inline void _filterWord(char* word, const char* lowerWord, const char* oldstr){int len = strlen(oldstr);const char* tmp;memset(word, '*', len);word += len;lowerWord += len;while((tmp = Index_KMP(lowerWord, oldstr)) != NULL) {word += (tmp-lowerWord);memset(word, '*', len);word += len;lowerWord = tmp + len;}}void WordFilter::FilterWord(string& word){string tmp(word);str_tolower(tmp);const char** p = (const char**)m_words;const char* dest;for(uint32 i=0; i<m_count; i++, p++) {dest = Index_KMP(tmp.c_str(), *p, 0);if(dest) {_filterWord((char*)(word.c_str() + (dest-tmp.c_str())), dest, *p);}}}int WordFilter::Index_KMP(const char* S, const char* T, int pos){    i=pos;   j=1;  while(i <= S[0] && j<= T[0]){  if(j == 0 || S[i] == T[j]) { ++i; ++j; }else   j = next[j];}  if(j>T[0])   return   i-T[0]; else   return   0;  }

以上两种方法相对比较好一点。以做参考。


原创粉丝点击