SegWord::UString的待完全测试代码

来源:互联网 发布:淘宝账号被冻结 编辑:程序博客网 时间:2024/05/16 10:29

 

•          UString.h

#ifndef __USTRING_H__

#define __USTRING_H__

 

#include <list>

 

/*

* 文件名: UString.h

* 创建日期: 2005-12-12

* 创建者: Percy Lee

* 修改列表:

*

* 说明:

*   Unicode string class for c++(in namespace UStr) with basic operation such as:

*        length() : get the length of string;

*        u_str(): get the unicode char array by pointer;

*        c_str(): nil(return NULL);

*        resize(Size): set the new capacity(=Size) of string;

*        append(str): append the str back;

*        sub_ustr(): get the sub-string under the given section;

*        find(str): find sub-string str's section list;

*        find_overlap(str): find sub-string str which can be overlap-occured in base-string;

*        find_first(str): find the first pos of sub-string str;

*        be_first(str): judge if str is the very beginning sub-string or not.

*   And also it supplies two global functions US_TO_S & S_TO_US

*   for converting between unicode strings and multibyte strings.

*

* Copyright (c) the Semean Studio.All rights reserved.

* E-mail: semean@163.com

********************************************************************************/

namespace UStr

{

     /*
     * Union: Section
     * Description: an object of this union records either a single position
     * or an interval (start position plus length).
     *
     * NOTE(review): `_begin` (size_t) and `_sect` share storage, so writing
     * one view and reading the other depends on the platform's layout of
     * size_t versus two unsigned ints — confirm callers only read back the
     * view they last wrote.
     ******************************************************************/
     union Section
     {
         typedef Section value_type;
         typedef Section* pointer_type;
         typedef Section& reference_type;

         // Position view: a single index.
         size_t _begin;
         // Interval view: start index and element count.
         struct sect
         {
              unsigned int _begin;
              unsigned int _length;
         } _sect;
         // Zeroes the members (defined in UString.cpp).
         Section(void);
     };

 

     // Signed 64-bit result type; the find_first overloads return -1 through it
     // when nothing is found.
     typedef long long Int64;
     // List of match locations returned by the find / find_overlap overloads.
     typedef std::list<Section> SectionList;
     // Iterator over a SectionList.
     typedef std::list<Section>::iterator SectListIter;

 

     /*
     * Class: UString
     * Description:
     * A UString object is one of two kinds. A normal string sizes its
     * internal storage to what the string needs; a buffer string sizes its
     * internal storage in whole multiples of 1K elements (see d_capacity in
     * the implementation for the policy).
     ******************************************************************/

     enum USType // storage-management kind of a UString
     {
         eUSNormal = 0,     // normal string
         eUSBuffer     // string used as a buffer
     };

    

     // Wide-character string that owns a heap buffer of wchar_t.
     class UString
     {
     public:
         typedef UString value_type;
         typedef UString* pointer_type;
         typedef UString& reference_type;

         // Creates an empty string of the given storage kind.
         UString(USType Ustype = eUSNormal);
         // Copies Str's characters; the copy is always an eUSNormal string.
         UString(const UString& Str);
         // Converts a multibyte string; a null pointer yields an empty string.
         UString(const char* pStr);
         // Copies a wide string; a null pointer yields an empty string.
         UString(const wchar_t* pStr);
         ~UString(void);
     public: // public interface
         // Number of wchar_t elements currently stored.
         size_t length(void) const;
         // Pointer to the internal buffer (still owned by this object).
         wchar_t* u_str(void) const;
         // Not implemented: always returns NULL.
         char* c_str(void) const;
         // Discards the current content and reallocates for Size elements.
         void resize(size_t Size);
         void append(const UString& Str);
         void append(const char* pStr);
         void append(const wchar_t* pStr);
         // Appends Length elements of pStr starting at index Begin.
         void append(const wchar_t* pStr, size_t Begin, size_t Length);
         // Returns the requested sub-string, or an empty string when the
         // range does not fit inside this string.
         UString sub_ustr(size_t Begin, size_t Length) const;
         UString sub_ustr(const Section& Sect) const;
         /* The find* family of methods
         *  Notes:
         *        (1) find and find_overlap return the list of all positions at
         *             which the sub-string occurs (the length is fixed, so the
         *             union stores only the first position). The occurrences
         *             searched by find_overlap are allowed to overlap.
         *        (2) Returning a list of every occurrence is only a
         *             convenience, and returning the list affects performance
         *             (passing the STL list out through a reference parameter
         *             instead caused storage-management errors when exported
         *             from a DLL). In some cases find_first can be used to
         *             walk all occurrences instead.
         *        (3) The algorithms behind these functions still need
         *             rigorous, large-scale testing.
         *                                                  percylee 2006/3
         **************************************************************************/
         SectionList find(const UString& Str) const;
         SectionList find(wchar_t WCh) const;
         SectionList find(const wchar_t* pStr,size_t Begin, size_t Length) const;
         SectionList find_overlap(const UString& Str) const;
         SectionList find_overlap(const wchar_t* pStr,size_t Begin, size_t Length) const;
         // The find_first overloads search from index Start and return the
         // index of the first occurrence, or -1 when there is none.
         Int64 find_first(const UString& Str,size_t Start) const;
         Int64 find_first(wchar_t WCh, size_t Start) const;
         Int64 find_first(const wchar_t* pStr,size_t Begin, size_t Length, size_t Start) const;
         /**************************************************************************/
         // True when the given pattern occurs exactly at index Start.
         bool be_first(const UString& Str,size_t Start) const;
         bool be_first(const wchar_t* pStr,size_t Begin,size_t Length,size_t Start) const;
     public: // operator overloads
         UString& operator =(const UString& Str);
         UString& operator =(const char* pStr);
         UString& operator =(const wchar_t* pStr);
         // Out-of-range positions return a reference to a shared static
         // element instead of an element of this string.
         wchar_t& operator [](size_t pos);

         // The ordering operators compare lengths first and use wcscmp only
         // when the lengths are equal.
         friend bool operator == ( const UString& Str1, const UString& Str2 );
         friend bool operator < ( const UString& Str1, const UString& Str2 );
         friend bool operator <= ( const UString& Str1, const UString& Str2 );
         friend bool operator > ( const UString& Str1, const UString& Str2 );
         friend bool operator >= ( const UString& Str1, const UString& Str2 );
     private:
         wchar_t* _pUStr;     // owned buffer, allocated with _capacity+1 elements
         size_t   _length;    // number of elements in use
         size_t   _capacity;  // value computed by d_capacity for the buffer
         USType   _usType;    // selects the growth policy in d_capacity
     private:
         // Replace the content, reallocating only when capacity is too small.
         void set_ustring(const UString& Str);
         void set_ustring(const char* pStr);
         void set_ustring(const wchar_t* pStr);
         // Allocates a fresh buffer and resets the length to 0.
         void default_construct(size_t DefaultSize = 0);
         // Capacity to allocate for Size elements under the current _usType.
         size_t d_capacity(size_t Size);
     };

    

     /*
     * Global conversion functions between multibyte byte streams and
     * unicode strings
     * Notes:
     * S_TO_US requires UStrLen >= StrLen;
     * US_TO_S requires StrLen >= UStrLen*2.
     * Both return 0 when a pointer is null or the requirement is not met.
     **************************************************************************/
     size_t S_TO_US(char* pStr,size_t StrLen,wchar_t* pUStr,size_t UStrLen);
     size_t US_TO_S(wchar_t* pUStr,size_t UStrLen,char* pStr,size_t StrLen);

};

 

#endif //__USTRING_H__

 

•          UString.cpp

 

#include "StdAfx.h"

#include "./ustring.h"

#include <stdlib.h>

#include <locale.h>

 

/*
* Global variables and global functions used internally by this file
*************************************************************************/
// Allocation granularity, in wchar_t elements, that UString::d_capacity
// applies to strings whose type is not eUSNormal.
const unsigned int DEFAULT_CAPACITY = 1024;

 

// Computes the KMP failure table for the pattern pStr[0..Length).
// Entry k holds the length of the longest proper prefix of the pattern that
// is also a suffix of pStr[0..k].
// Returns NULL for a null or empty pattern; otherwise the caller owns the
// returned array and must release it with delete[].
inline size_t* KMPNext(const wchar_t* pStr, size_t Length)
{
     if( pStr == NULL || Length == 0 )
          return NULL;

     size_t* table = new size_t[Length];
     if( table == NULL )
          return NULL;

     table[0] = 0;
     size_t idx = 1;
     while( idx < Length )
     {
          // Start from the border of the previous prefix and shrink it until
          // it can be extended by pStr[idx] (or becomes empty).
          size_t border = table[idx-1];
          while( border > 0 && pStr[idx] != pStr[border] )
               border = table[border-1];

          table[idx] = ( pStr[idx] == pStr[border] ) ? border + 1 : 0;
          ++idx;
     }

     return table;
}

 

// Convenience overload: builds the failure table for a whole UString.
// Ownership of the returned array is the same as for the pointer overload.
inline size_t* KMPNext(const UStr::UString& Str)
{
     const wchar_t* pattern = Str.u_str();
     const size_t patternLen = Str.length();
     return KMPNext(pattern, patternLen);
}

 

/*

* UStr名空间内的类实现

*************************************************************************/

// Default constructor: writes zero through both views of the union.
// The interval view (_sect) is written last.
UStr::Section::Section(void)
{
     _begin = 0;          // position view
     _sect._begin = 0;    // interval view: start
     _sect._length = 0;   // interval view: length
}

 

// Creates an empty string of the requested storage kind.
UStr::UString::UString(USType Ustype/* = eUSNormal*/)
     : _usType(Ustype)
{
     // d_capacity() reads _usType, so the type is set before allocating.
     default_construct();
}

 

// Copy constructor. The new object is always an eUSNormal string,
// whatever the storage kind of Str.
UStr::UString::UString(const UString& Str)
     : _usType(eUSNormal)
{
     const size_t srcLen = Str.length();
     default_construct(srcLen);   // allocate room for the source content
     set_ustring(Str);            // then copy the characters
}

 

// Builds a normal string from a multibyte C string.
// A null pointer produces an empty string.
UStr::UString::UString(const char* pStr)
     : _usType(eUSNormal)
{
     if( !pStr )
     {
          default_construct();
          return;
     }

     default_construct(strlen(pStr));
     set_ustring(pStr);
}

 

// Builds a normal string from a wide C string.
// A null pointer produces an empty string.
UStr::UString::UString(const wchar_t* pStr)
     : _usType(eUSNormal)
{
     if( !pStr )
     {
          default_construct();
          return;
     }

     default_construct(wcslen(pStr));
     set_ustring(pStr);
}

 

// Releases the character buffer owned by this string.
UStr::UString::~UString(void)
{
     _length = 0;
     delete[] _pUStr;
}

 

size_t UStr::UString::length(void) const

{

     return _length;

}

 

wchar_t* UStr::UString::u_str(void) const

{

     return _pUStr;

}

 

// Multibyte access is not implemented: this always yields NULL.
char* UStr::UString::c_str(void) const
{
     char* const notImplemented = NULL;
     return notImplemented;
}

 

void UStr::UString::resize(size_t Size)

{

     delete[] _pUStr;

     _length = 0;

     default_construct(Size);

}

 

void UStr::UString::append(const UString& Str)

{

     size_t len = Str.length();

     if( len <= 0 )

     {

         return;

     }

    

     append(Str.u_str(),0,len);

}

 

void UStr::UString::append(const char* pStr)

{

     if( !pStr )

     {

         return;

     }

     UString ustr(pStr);

     append(ustr);

}

 

void UStr::UString::append(const wchar_t* pStr)

{

     if( !pStr )

     {

         return;

     }

     size_t len = wcslen(pStr);

     if( len <= 0 )

     {

         return;

     }

    

     append(pStr,0,len);

}

 

void UStr::UString::append(const wchar_t* pStr, size_t Begin, size_t Length)

{

     if( !pStr || Length <= 0 )

     {

         return;

     }

    

     if( _capacity < _length + Length )

     {

         _capacity += d_capacity(Length);

         wchar_t* pBuf = new wchar_t[_capacity+1];

         wcsncpy(pBuf,_pUStr,_length);

         for(size_t i = 0; i < Length; i ++)

         {

              pBuf[_length+i] = pStr[Begin+i];

         }

         _length += Length;

         pBuf[_length] = L'/0';

 

         delete[] _pUStr;

         _pUStr = pBuf;

         pBuf = NULL;

     }

     else

     {

         for( size_t i = 0; i < Length; i ++ )

         {

              _pUStr[_length+i] = pStr[Begin+i];

         }

         _length += Length;

         _pUStr[_length] = L'/0';

     }

}

 

// Returns a copy of the Length elements starting at index Begin.
// When the requested range does not lie inside this string, an empty string
// is returned.
UStr::UString UStr::UString::sub_ustr(size_t Begin, size_t Length) const
{
     UString ustr;

     // Same test as "Begin + Length <= _length", written so that a huge
     // Begin or Length cannot wrap around and slip past the check.
     if( Begin > _length || Length > _length - Begin )
     {
          return ustr;
     }

     ustr.append(_pUStr,Begin,Length);
     return ustr;
}

 

// Returns a copy of the interval described by Sect._sect (start + length).
// When the interval does not lie inside this string, an empty string is
// returned.
UStr::UString UStr::UString::sub_ustr(const UStr::Section& Sect) const
{
     UString ustr;

     const size_t begin = Sect._sect._begin;
     const size_t count = Sect._sect._length;

     // Overflow-safe form of "begin + count <= _length"; the original sum
     // was computed in unsigned int and could wrap.
     if( begin > _length || count > _length - begin )
     {
          return ustr;
     }

     ustr.append(_pUStr,begin,count);
     return ustr;
}

 

// Finds every non-overlapping occurrence of Str; forwards to the pointer
// overload with the whole of Str as the pattern.
UStr::SectionList UStr::UString::find(const UString& Str) const
{
     const wchar_t* pattern = Str.u_str();
     const size_t patternLen = Str.length();
     return find(pattern, 0, patternLen);
}

 

// Collects the index of every element equal to WCh, in ascending order.
// Each hit is stored in the `_begin` member of a Section.
UStr::SectionList UStr::UString::find(wchar_t WCh) const
{
     SectionList hits;
     Section pos;

     size_t idx = 0;
     while( idx < _length )
     {
          if( WCh == _pUStr[idx] )
          {
               pos._begin = idx;
               hits.push_back(pos);
          }
          ++idx;
     }

     return hits;
}

 

UStr::SectionList UStr::UString::find(const wchar_t* pStr,size_t Begin, size_t Length) const

{

     SectionList ustrList;

     if( Length > _length )

     {

         return ustrList;

     }

     const wchar_t* pStrBegin = pStr+Begin;

     size_t* pKMPNext = KMPNext(pStrBegin,Length);

     if( !pKMPNext )

     {

         return ustrList;

     }

     Section aSection;

     size_t strPos = 0;

     for( size_t i = 0; i < _length; i ++ )

     {

         while( pStrBegin[strPos] != _pUStr[i] && strPos > 0 )

         {

              strPos = pKMPNext[strPos-1];

         }

         if( pStrBegin[strPos] == _pUStr[i] )

         {

              strPos ++;

         }

         if( strPos == Length )

         {

              aSection._begin = i - Length + 1;

              ustrList.push_back(aSection);

              strPos = 0; //start form the very beginning of pKMPNext

         }

     }

 

     delete[] pKMPNext;

     return ustrList;

}

 

// Finds every occurrence of Str, overlapping ones included; forwards to the
// pointer overload with the whole of Str as the pattern.
UStr::SectionList UStr::UString::find_overlap(const UString& Str) const
{
     const wchar_t* pattern = Str.u_str();
     const size_t patternLen = Str.length();
     return find_overlap(pattern, 0, patternLen);
}

 

UStr::SectionList UStr::UString::find_overlap(const wchar_t* pStr,size_t Begin, size_t Length) const

{

     SectionList ustrList;

     if( Length > _length )

     {

         return ustrList;

     }

     const wchar_t* pStrBegin = pStr+Begin;

     size_t* pKMPNext = KMPNext(pStrBegin,Length);

     if( !pKMPNext )

     {

         return ustrList;

     }

     Section aSection;

     size_t strPos = 0;

     for( size_t i = 0; i < _length; i ++ )

     {

         while( pStrBegin[strPos] != _pUStr[i] && strPos > 0 )

         {

              strPos = pKMPNext[strPos-1];

         }

         if( pStrBegin[strPos] == _pUStr[i] )

         {

              strPos ++;

         }

         if( strPos == Length )

         {

              aSection._begin = i - Length + 1;

              ustrList.push_back(aSection);

              strPos = 0; //start form the very beginning of pKMPNext

              i = i - Length + 1;//overlap

         }

     }

 

     delete[] pKMPNext;

     return ustrList;

}

 

// Returns the index of the first occurrence of Str at or after Start,
// or -1 when Str cannot fit or is not found.
UStr::Int64 UStr::UString::find_first(const UStr::UString& Str,size_t Start) const
{
     const size_t patternLen = Str.length();
     if( Start + patternLen > _length )
          return -1;

     return find_first(Str.u_str(), 0, patternLen, Start);
}

 

// Returns the index of the first element equal to WCh at or after Start,
// or -1 when Start is out of range or no such element exists.
UStr::Int64 UStr::UString::find_first(wchar_t WCh, size_t Start) const
{
     if( Start >= _length )
          return -1;

     size_t idx = Start;
     while( idx < _length && _pUStr[idx] != WCh )
          ++idx;

     return ( idx < _length ) ? (Int64)idx : -1;
}

 

UStr::Int64 UStr::UString::find_first(const wchar_t* pStr,

                                            size_t Begin,

                                            size_t Length,

                                            size_t Start

                                            ) const

{

     if( _length < Start+Length )

     {

         return -1;

     }

     const wchar_t* pStrBegin = pStr+Begin;

     size_t* pKMPNext = KMPNext(pStrBegin,Length);

     if( !pKMPNext )

     {

         return -1;

     }

     size_t strPos = 0;

     for( size_t i = Start; i < _length; i ++ )

     {

         while( pStrBegin[strPos] != _pUStr[i] && strPos > 0 )

         {

              strPos = pKMPNext[strPos-1];

         }

         if( pStrBegin[strPos] == _pUStr[i] )

         {

              strPos ++;

         }

         if( strPos == Length )

         {

              delete[] pKMPNext;

              return (Int64)((Int64)i-(Int64)Length+1);

         }

     }

 

     delete[] pKMPNext;

     return -1;

}

 

bool UStr::UString::be_first(const UString& Str,size_t Start) const

{

     return be_first(Str.u_str(),0,Str.length(),Start);

}

 

bool UStr::UString::be_first(const wchar_t* pStr,

                                  size_t Begin,

                                  size_t Length,

                                  size_t Start

                                  ) const

{

     if( _length < Start+Length )

     {

         return false;

     }

     for( size_t i = 0; i < Length; i ++ )

     {

         if( _pUStr[Start+i] != pStr[Begin+i] )

         {

              return false;

         }

     }

 

     return true;

}

 

// Copy assignment: replaces the content with that of Str.
// Self-assignment leaves the object untouched.
UStr::UString& UStr::UString::operator =(const UString& Str)
{
     if( &Str == this )
          return *this;

     set_ustring(Str);
     return *this;
}

 

// Assigns from a multibyte C string. A null pointer resets this object to a
// freshly allocated empty string.
UStr::UString& UStr::UString::operator =(const char* pStr)
{
     if( !pStr )
     {
          delete[] _pUStr;
          default_construct();
          return *this;
     }

     set_ustring(pStr);
     return *this;
}

 

// Assigns from a wide C string. A null pointer resets this object to a
// freshly allocated empty string; assigning the object's own buffer pointer
// is a no-op.
UStr::UString& UStr::UString::operator =(const wchar_t* pStr)
{
     if( pStr == NULL )
     {
          delete[] _pUStr;
          default_construct();
     }
     else if( pStr != this->_pUStr )
     {
          set_ustring(pStr);
     }

     return *this;
}

 

// Element access. For pos >= length() a reference to a function-local static
// element is returned instead of an element of this string; that element is
// shared by every call.
wchar_t& UStr::UString::operator [](size_t pos)
{
     static wchar_t outOfRange;
     return ( pos < _length ) ? _pUStr[pos] : outOfRange;
}

 

void UStr::UString::set_ustring(const UString& Str)

{

     _length = Str.length();

     if( _capacity < _length )

     {

         delete[] _pUStr;

         _capacity = d_capacity(_length);

         _pUStr = new wchar_t[ _capacity+1 ];

     }

    

     wcsncpy(_pUStr,Str.u_str(),_length);

     _pUStr[_length] = L'/0';

}

 

void UStr::UString::set_ustring(const char* pStr)

{

     size_t len = strlen(pStr);

     if( _capacity < len )

     {

         delete[] _pUStr;

         _capacity = d_capacity(len);

         _pUStr = new wchar_t[_capacity+1];

     }

 

     (void)setlocale(LC_ALL,"");

    

     _length = mbstowcs(_pUStr,pStr,len*sizeof(char));

     _pUStr[ _length ] = L'/0';

}

 

void UStr::UString::set_ustring(const wchar_t* pStr)

{

     _length = wcslen(pStr);

     if( _capacity < _length )

     {

         delete[] _pUStr;

         _capacity = d_capacity(_length);

         _pUStr = new wchar_t[_capacity+1];

     }

 

     wcsncpy( _pUStr,pStr,_length );

     _pUStr[_length] = L'/0';

}

 

void UStr::UString::default_construct(size_t DefaultSize/* = 0*/)

{

     _capacity = d_capacity(DefaultSize);

     if( _pUStr = new wchar_t[_capacity+1] )

     {

         _pUStr[0] = L'/0';

     }

     else

     {

         _capacity = 0;

     }

 

     _length = 0;

}

 

size_t UStr::UString::d_capacity(size_t Size)

{

     if( _usType == eUSNormal )

     {

         return Size+1;

     }

     else //if( _usType == eUSBuffer )

     {

         return (Size/DEFAULT_CAPACITY + 1)*DEFAULT_CAPACITY;

     }

}

 

/*

* UString友元比较函数

**************************************************************************************/

 

// Equality: the lengths must match and wcscmp must report no difference.
bool UStr::operator == ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
     if( Str1.length() != Str2.length() )
          return false;

     return wcscmp(Str1.u_str(), Str2.u_str()) == 0;
}

 

bool UStr::operator < ( const UStr::UString& Str1, const UStr::UString& Str2 )

{

     size_t length1 = Str1.length();

     size_t length2 = Str2.length();

     if( length1 < length2 )

     {

         return true;

     }

     else if( length2 < length1 )

     {

         return false;

     }

     else

     {

         wchar_t* pStr1 = Str1.u_str();

         wchar_t* pStr2 = Str2.u_str();

 

         if( wcscmp(pStr1,pStr2) < 0 )

         {

              return true;

         }

 

         return false;

     }

}

 

bool UStr::operator <= ( const UStr::UString& Str1, const UStr::UString& Str2 )

{

     size_t length1 = Str1.length();

     size_t length2 = Str2.length();

     if( length1 < length2 )

     {

         return true;

     }

     else if( length2 < length1 )

     {

         return false;

     }

     else

     {

         wchar_t* pStr1 = Str1.u_str();

         wchar_t* pStr2 = Str2.u_str();

 

         if( wcscmp(pStr1,pStr2) <= 0 )

         {

              return true;

         }

 

         return false;

     }

}

 

bool UStr::operator > ( const UStr::UString& Str1, const UStr::UString& Str2 )

{

     size_t length1 = Str1.length();

     size_t length2 = Str2.length();

     if( length1 > length2 )

     {

         return true;

     }

     else if( length2 > length1 )

     {

         return false;

     }

     else

     {

         wchar_t* pStr1 = Str1.u_str();

         wchar_t* pStr2 = Str2.u_str();

 

         if( wcscmp(pStr1,pStr2) > 0 )

         {

              return true;

         }

 

         return false;

     }

}

 

bool UStr::operator >= ( const UStr::UString& Str1, const UStr::UString& Str2 )

{

     size_t length1 = Str1.length();

     size_t length2 = Str2.length();

     if( length1 > length2 )

     {

         return true;

     }

     else if( length2 > length1 )

     {

         return false;

     }

     else

     {

         wchar_t* pStr1 = Str1.u_str();

         wchar_t* pStr2 = Str2.u_str();

 

         if( wcscmp(pStr1,pStr2) >= 0 )

         {

              return true;

         }

 

         return false;

     }

}

 

 

/*

* UStr名空间内的全局函数的实现

************************************************************************************/

// Converts the first StrLen bytes of the multibyte string pStr into wide
// characters stored in pUStr.
//
// pStr    - source bytes (need not be NUL-terminated within StrLen).
// StrLen  - number of source bytes to convert.
// pUStr   - destination buffer.
// UStrLen - destination size in elements; must be >= StrLen.
//
// Returns the number of wide characters written, excluding the terminator.
// Returns 0 when a pointer is null, when UStrLen < StrLen, or when the
// source contains an invalid multibyte sequence (previously the mbstowcs
// error value (size_t)-1 was used as an index).
//
// The conversion uses the user's default locale, which this function
// installs process-wide through setlocale(LC_ALL, "").
//
// NOTE: as before, a terminator is stored at pUStr[returned length]; when
// every source byte maps to one wide character that index equals StrLen, so
// the destination must have room for StrLen+1 elements.
size_t UStr::S_TO_US(char* pStr,size_t StrLen,wchar_t* pUStr,size_t UStrLen)
{
     if( !pStr || !pUStr || UStrLen < StrLen )
     {
          return 0;
     }

     // Work on a NUL-terminated copy so mbstowcs cannot read past StrLen.
     // (The previous code stored '/0', which truncates to the digit '0'.)
     char* pStr2 = new char[StrLen+1];
     strncpy(pStr2,pStr,StrLen);
     pStr2[StrLen] = '\0';

     (void)setlocale(LC_ALL,"");

     size_t len = mbstowcs(pUStr,pStr2,StrLen);
     delete[] pStr2;

     if( len == (size_t)-1 )
     {
          // Invalid multibyte sequence: report an empty result.
          len = 0;
     }
     pUStr[ len ] = L'\0';

     return len;
}

 

// Converts the first UStrLen wide characters of pUStr into a multibyte
// string stored in pStr.
//
// pUStr   - source wide characters (need not be NUL-terminated within
//           UStrLen).
// UStrLen - number of source elements to convert.
// pStr    - destination buffer.
// StrLen  - destination size in bytes; must be >= 2*UStrLen.
//
// Returns the number of bytes written, excluding the terminator.
// Returns 0 when a pointer is null, when StrLen < 2*UStrLen, or when a wide
// character has no multibyte representation (previously the wcstombs error
// value (size_t)-1 was used as an index).
//
// The conversion uses the user's default locale, which this function
// installs process-wide through setlocale(LC_ALL, "").
//
// The conversion is limited to StrLen bytes. The previous limit,
// UStrLen*sizeof(wchar_t), exceeds StrLen on platforms with a 4-byte wchar_t
// and could overrun pStr.
//
// NOTE: as before, a terminator is stored at pStr[returned length]; that
// index can equal StrLen, so the destination must have room for StrLen+1
// bytes.
size_t UStr::US_TO_S(wchar_t* pUStr,size_t UStrLen,char* pStr,size_t StrLen)
{
     if( !pStr || !pUStr || StrLen < 2*UStrLen )
     {
          return 0;
     }

     // Work on a NUL-terminated copy so wcstombs cannot read past UStrLen.
     // (The previous code stored L'/0', which is not a NUL terminator.)
     wchar_t* pUStr2 = new wchar_t[UStrLen+1];
     wcsncpy(pUStr2,pUStr,UStrLen);
     pUStr2[UStrLen] = L'\0';

     (void)setlocale(LC_ALL,"");

     size_t len = wcstombs(pStr,pUStr2,StrLen);
     delete[] pUStr2;

     if( len == (size_t)-1 )
     {
          // Unconvertible wide character: report an empty result.
          len = 0;
     }
     pStr[ len ] = '\0';

     return len;
}

 

 

原创粉丝点击