SegWord::UString的待完全测试代码

来源：互联网 | 发布：淘宝账号被冻结 | 编辑：程序博客网 | 时间：2024/05/16 10:29

● UString.h

#ifndef __USTRING_H__

#define __USTRING_H__

#include <list>

/********************************************************************************
* 文件名: UString.h
* 创建日期: 2005-12-12
* 创建者: Percy Lee
* 修改列表:
* 说明:
* Unicode string class for C++ (in namespace UStr) with basic operations such as:
* length() : get the length of the string;
* u_str(): get the Unicode character array by pointer;
* c_str(): not implemented (returns NULL);
* resize(Size): set the new capacity (= Size) of the string;
* append(str): append str to the end;
* sub_ustr(): get the sub-string covered by the given section;
* find(str): find the section list of all occurrences of sub-string str;
* find_overlap(str): find sub-string str, allowing occurrences that overlap in the base string;
* find_first(str): find the first position of sub-string str;
* be_first(str): judge whether str is the sub-string at the given start position or not.
* It also supplies two global functions, US_TO_S and S_TO_US,
* for converting between Unicode strings and multibyte strings.
* E-mail: semean@163.com
********************************************************************************/

namespace UStr

{

/******************************************************************
* 联合体: Section
* 说明: 本联合体的对象可记录一个位置,或者一个区间(起始位置与长度)
******************************************************************/

// Section records either a single position (_begin) or an interval
// (_sect: start position plus length). Both views share the same storage.
union Section
{
    typedef Section  value_type;
    typedef Section* pointer_type;
    typedef Section& reference_type;

    // View 1: a bare position.
    size_t _begin;

    // View 2: an interval described by its start and its length.
    struct sect
    {
        unsigned int _begin;
        unsigned int _length;
    } _sect;

    // Zero-initialises the object (defined in UString.cpp).
    Section();
};

// Signed 64-bit result type; the find_first overloads return -1 through it
// when nothing is found.
typedef long long Int64;

// List of positions/intervals returned by the find* methods, and its iterator.
typedef std::list<Section> SectionList;
typedef SectionList::iterator SectListIter;

/******************************************************************
* 类: UString
* 说明:
* UString具有两种不同类型的对象,一是普通字符串,内部存储空间大小
* 为字符串所需要的空间大小;一是缓存字符串,内部存储空间大小是1K
* 的整数倍(管理策略见实现).
******************************************************************/

// Storage-management policy of a UString.
enum USType
{
    eUSNormal = 0,  // ordinary string: capacity follows the requested size
    eUSBuffer = 1   // buffer string: capacity grows in DEFAULT_CAPACITY steps
};

// Wide-character string class. An object owns a heap buffer (_pUStr) of
// _capacity+1 wchar_t elements holding _length characters. The storage policy
// (_usType) decides how d_capacity() sizes that buffer.
class UString
{
public:
typedef UString value_type;
typedef UString* pointer_type;
typedef UString& reference_type;
// Creates an empty string using the given storage policy.
UString(USType Ustype = eUSNormal);
// Copies the characters of Str; the new object is always eUSNormal.
UString(const UString& Str);
// Builds an eUSNormal string from a multibyte string (converted with mbstowcs);
// a NULL pointer yields an empty string.
UString(const char* pStr);
// Builds an eUSNormal string from a wide string; NULL yields an empty string.
UString(const wchar_t* pStr);
~UString(void);
public: // public interface
// Number of characters currently stored.
size_t length(void) const;
// Pointer to the internal wide-character buffer (not a copy).
wchar_t* u_str(void) const;
// Not implemented: always returns NULL.
char* c_str(void) const;
// Discards the current content and reallocates for Size characters; length becomes 0.
void resize(size_t Size);
// Append overloads. NULL pointers and empty inputs are ignored.
void append(const UString& Str);
void append(const char* pStr);
void append(const wchar_t* pStr);
// Appends Length characters taken from pStr starting at index Begin.
void append(const wchar_t* pStr, size_t Begin, size_t Length);
// Returns the sub-string [Begin, Begin+Length); an empty string if it does not fit.
UString sub_ustr(size_t Begin, size_t Length) const;
// Same, with the interval taken from Sect._sect.
UString sub_ustr(const Section& Sect) const;
/* The find* family of search methods
* Notes:
* (1) find and find_overlap return the list of positions of all occurrences of
* the sub-string (because the length is fixed, the union only stores the
* start position). Occurrences located by find_overlap may overlap.
* (2) Returning the list of all sub-string sections is only offered as a
* convenience; returning a list affects performance (outputting an STL list
* through a reference parameter, however, caused memory-management
* exceptions when exported from a DLL). So in some cases find_first can be
* used to walk through all occurrences instead.
* (3) The algorithms implemented by this family still need rigorous,
* large-scale testing.
* percylee 2006/3
**************************************************************************/
SectionList find(const UString& Str) const;
SectionList find(wchar_t WCh) const;
SectionList find(const wchar_t* pStr,size_t Begin, size_t Length) const;
SectionList find_overlap(const UString& Str) const;
SectionList find_overlap(const wchar_t* pStr,size_t Begin, size_t Length) const;
// find_first returns the index of the first occurrence at or after Start, or -1.
Int64 find_first(const UString& Str,size_t Start) const;
Int64 find_first(wchar_t WCh, size_t Start) const;
Int64 find_first(const wchar_t* pStr,size_t Begin, size_t Length, size_t Start) const;
/**************************************************************************/
// be_first tells whether the given pattern occurs exactly at index Start.
bool be_first(const UString& Str,size_t Start) const;
bool be_first(const wchar_t* pStr,size_t Begin,size_t Length,size_t Start) const;
public: // operator overloads
UString& operator =(const UString& Str);
// Assigning a NULL pointer resets the string to empty.
UString& operator =(const char* pStr);
UString& operator =(const wchar_t* pStr);
// Returns a reference to the character at pos; for pos >= length() a reference
// to a function-local static dummy character is returned instead.
wchar_t& operator [](size_t pos);
// Comparisons: == requires equal length and wcscmp equality; the ordering
// operators compare lengths first and use wcscmp only for equal lengths.
friend bool operator == ( const UString& Str1, const UString& Str2 );
friend bool operator < ( const UString& Str1, const UString& Str2 );
friend bool operator <= ( const UString& Str1, const UString& Str2 );
friend bool operator > ( const UString& Str1, const UString& Str2 );
friend bool operator >= ( const UString& Str1, const UString& Str2 );
private:
wchar_t* _pUStr;    // owned character buffer (allocated with new[])
size_t _length;     // number of characters stored
size_t _capacity;   // characters the buffer can hold (buffer has _capacity+1 elements)
USType _usType;     // storage policy used by d_capacity()
private:
// Replace the content with a copy of the argument, growing the buffer if needed.
void set_ustring(const UString& Str);
void set_ustring(const char* pStr);
void set_ustring(const wchar_t* pStr);
// Allocates a fresh buffer for DefaultSize characters and sets the length to 0.
// It does not free a previous buffer; callers do that themselves.
void default_construct(size_t DefaultSize = 0);
// Capacity to use for Size characters under the current storage policy.
size_t d_capacity(size_t Size);
};

/**************************************************************************
* 多字节流与unicode字符串的全局转换函数
* 说明:
* 对于S_TO_US,需满足UStrLen>=StrLen;
* 对于US_TO_S,需满足StrLen>=UStrLen*2
**************************************************************************/

// Converts the first StrLen bytes of the multibyte string pStr into wide
// characters stored in pUStr. Returns the number of wide characters produced,
// or 0 when a pointer is NULL or UStrLen < StrLen.
size_t S_TO_US(char* pStr,size_t StrLen,wchar_t* pUStr,size_t UStrLen);
// Converts the first UStrLen wide characters of pUStr into a multibyte string
// stored in pStr. Returns the number of bytes produced, or 0 when a pointer is
// NULL or StrLen < 2*UStrLen.
size_t US_TO_S(wchar_t* pUStr,size_t UStrLen,char* pStr,size_t StrLen);

};

#endif //__USTRING_H__

● UString.cpp

#include "StdAfx.h"

#include "./ustring.h"

#include <stdlib.h>

#include <locale.h>

/*************************************************************************
* 内部使用的全局变量与全局函数
*************************************************************************/

// Allocation granularity, in characters, for eUSBuffer strings: d_capacity()
// rounds their capacity to a multiple of this value.
const unsigned int DEFAULT_CAPACITY = 1024;

// Computes the KMP failure ("next") table for the pattern pStr[0..Length).
//
// Entry i holds the length of the longest proper prefix of pStr[0..i] that is
// also a suffix of it.
//
// Parameters:
//   pStr   - pattern characters; may be NULL.
//   Length - number of pattern characters.
// Returns:
//   A new[]-allocated array of Length entries that the caller must release
//   with delete[], or NULL when pStr is NULL or Length is 0.
inline size_t* KMPNext(const wchar_t* pStr, size_t Length)
{
    if( !pStr || Length == 0 )
    {
        return NULL;
    }

    size_t* pN = new size_t[Length];
    if( !pN )   // only reachable on legacy runtimes whose new returns NULL
    {
        return NULL;
    }

    pN[0] = 0;
    for( size_t i = 1; i < Length; i ++ )
    {
        // Start from the border of the previous prefix and shrink it until it
        // can be extended by pStr[i] (or becomes empty).
        size_t var = pN[i-1];
        while( var > 0 && pStr[i] != pStr[var] )
        {
            var = pN[var-1];
        }

        pN[i] = ( pStr[i] == pStr[var] ) ? var + 1 : 0;
    }

    return pN;
}

// Convenience overload: builds the KMP failure table for the whole of Str.
// The result is owned by the caller (delete[]); it is NULL for an empty Str.
inline size_t* KMPNext(const UStr::UString& Str)
{
    const wchar_t* pPattern = Str.u_str();
    const size_t patternLen = Str.length();
    return KMPNext(pPattern, patternLen);
}

/*************************************************************************
* UStr名空间内的类实现
*************************************************************************/

// Zeroes both views of the union: the position and the (start, length) pair.
UStr::Section::Section(void)
    : _begin(0)
{
    _sect._begin = 0;
    _sect._length = 0;
}

// Creates an empty string with the requested storage policy. The policy must
// be set before default_construct() runs because d_capacity() reads it.
UStr::UString::UString(USType Ustype/* = eUSNormal*/)
    : _usType(Ustype)
{
    this->default_construct();
}

// Copy constructor: copies the characters of Str. The storage policy is not
// inherited; the copy is always an eUSNormal string.
UStr::UString::UString(const UString& Str)
{
    this->_usType = eUSNormal;

    const size_t sourceLen = Str.length();
    this->default_construct(sourceLen);
    this->set_ustring(Str);
}

// Builds an eUSNormal string from the multibyte string pStr.
// A NULL pStr produces an empty string.
UStr::UString::UString(const char* pStr)
{
    _usType = eUSNormal;

    if( pStr )
    {
        // A multibyte string never converts to more wide characters than it
        // has bytes, so strlen(pStr) is a sufficient capacity.
        default_construct(strlen(pStr));
        set_ustring(pStr);
    }
    else
    {
        default_construct();
    }
}

// Builds an eUSNormal string from the NUL-terminated wide string pStr.
// A NULL pStr produces an empty string.
UStr::UString::UString(const wchar_t* pStr)
{
    _usType = eUSNormal;

    if( pStr )
    {
        default_construct(wcslen(pStr));
        set_ustring(pStr);
    }
    else
    {
        default_construct();
    }
}

// Releases the character buffer owned by this string.
UStr::UString::~UString(void)
{
    this->_length = 0;
    delete[] this->_pUStr;
}

size_t UStr::UString::length(void) const

{

return _length;

}

wchar_t* UStr::UString::u_str(void) const

{

return _pUStr;

}

// Multibyte access is not implemented: this always returns NULL.
char* UStr::UString::c_str(void) const
{
    char* const pNone = NULL;
    return pNone;
}

void UStr::UString::resize(size_t Size)

{

delete[] _pUStr;

_length = 0;

default_construct(Size);

}

// Appends all characters of Str; an empty Str is a no-op.
void UStr::UString::append(const UString& Str)
{
    const size_t count = Str.length();
    if( count > 0 )
    {
        append(Str.u_str(),0,count);
    }
}

void UStr::UString::append(const char* pStr)

{

if( !pStr )

{

return;

}

UString ustr(pStr);

append(ustr);

}

// Appends the NUL-terminated wide string pStr. NULL or empty input is ignored.
void UStr::UString::append(const wchar_t* pStr)
{
    const size_t count = pStr ? wcslen(pStr) : 0;
    if( count > 0 )
    {
        append(pStr,0,count);
    }
}

// Appends Length characters taken from pStr starting at index Begin.
//
// Parameters:
//   pStr   - source characters; NULL is ignored.
//   Begin  - index of the first character to copy.
//   Length - number of characters to copy; 0 is a no-op.
//
// The buffer grows by d_capacity(Length) when the characters do not fit.
// The result is always terminated with L'\0'.
void UStr::UString::append(const wchar_t* pStr, size_t Begin, size_t Length)
{
    if( !pStr || Length == 0 )
    {
        return;
    }

    const wchar_t* pSrc = pStr + Begin;

    if( _capacity < _length + Length )
    {
        // Allocate first and commit afterwards, so a failed allocation leaves
        // the object untouched.
        const size_t newCapacity = _capacity + d_capacity(Length);
        wchar_t* pBuf = new wchar_t[newCapacity+1];

        wcsncpy(pBuf,_pUStr,_length);
        for( size_t i = 0; i < Length; i ++ )
        {
            pBuf[_length+i] = pSrc[i];
        }

        // The old buffer is released only after copying, because pStr may
        // point into it (self-append).
        delete[] _pUStr;
        _pUStr = pBuf;
        _capacity = newCapacity;
    }
    else
    {
        for( size_t i = 0; i < Length; i ++ )
        {
            _pUStr[_length+i] = pSrc[i];
        }
    }

    _length += Length;
    _pUStr[_length] = L'\0';
}

// Returns a copy of the characters [Begin, Begin+Length). When that range
// reaches past the end of the string an empty string is returned.
UStr::UString UStr::UString::sub_ustr(size_t Begin, size_t Length) const
{
    UString result;

    const bool fits = !( _length < Begin + Length );
    if( fits )
    {
        result.append(_pUStr,Begin,Length);
    }

    return result;
}

// Returns a copy of the interval described by Sect._sect (start and length).
// When the interval reaches past the end of the string an empty string is
// returned.
UStr::UString UStr::UString::sub_ustr(const UStr::Section& Sect) const
{
    const unsigned int first = Sect._sect._begin;
    const unsigned int count = Sect._sect._length;

    UString result;
    if( !( _length < first + count ) )
    {
        result.append(_pUStr,first,count);
    }

    return result;
}

// Finds every non-overlapping occurrence of Str; forwards to the pointer overload.
UStr::SectionList UStr::UString::find(const UString& Str) const
{
    const wchar_t* pPattern = Str.u_str();
    const size_t patternLen = Str.length();
    return find(pPattern,0,patternLen);
}

// Returns the positions of every occurrence of the character WCh, in
// ascending order. Each list element carries the index in its _begin member.
UStr::SectionList UStr::UString::find(wchar_t WCh) const
{
    SectionList ustrList;
    Section aSection;

    for( size_t i = 0; i < _length; i ++ )
    {
        if( _pUStr[i] == WCh )
        {
            aSection._begin = i;
            ustrList.push_back(aSection);
        }
    }

    return ustrList;
}

// Finds every non-overlapping occurrence of the pattern
// pStr[Begin .. Begin+Length) using KMP matching.
//
// Returns the start positions (in the _begin member of each Section) in
// ascending order. The list is empty when pStr is NULL, the pattern is empty,
// or the pattern is longer than this string.
UStr::SectionList UStr::UString::find(const wchar_t* pStr,size_t Begin, size_t Length) const
{
    SectionList ustrList;

    if( !pStr || Length > _length )
    {
        return ustrList;
    }

    const wchar_t* pStrBegin = pStr+Begin;
    size_t* pKMPNext = KMPNext(pStrBegin,Length);
    if( !pKMPNext )
    {
        return ustrList;   // empty pattern
    }

    Section aSection;
    size_t strPos = 0;   // number of pattern characters matched so far

    for( size_t i = 0; i < _length; i ++ )
    {
        while( strPos > 0 && pStrBegin[strPos] != _pUStr[i] )
        {
            strPos = pKMPNext[strPos-1];
        }

        if( pStrBegin[strPos] == _pUStr[i] )
        {
            strPos ++;
        }

        if( strPos == Length )
        {
            aSection._begin = i - Length + 1;
            ustrList.push_back(aSection);
            strPos = 0;   // restart from scratch: matches must not overlap
        }
    }

    delete[] pKMPNext;
    return ustrList;
}

// Finds every occurrence of Str, overlapping ones included; forwards to the
// pointer overload.
UStr::SectionList UStr::UString::find_overlap(const UString& Str) const
{
    const wchar_t* pPattern = Str.u_str();
    const size_t patternLen = Str.length();
    return find_overlap(pPattern,0,patternLen);
}

// Finds every occurrence of the pattern pStr[Begin .. Begin+Length),
// including occurrences that overlap each other, using KMP matching.
//
// Returns the start positions (in the _begin member of each Section) in
// ascending order. The list is empty when pStr is NULL, the pattern is empty,
// or the pattern is longer than this string.
UStr::SectionList UStr::UString::find_overlap(const wchar_t* pStr,size_t Begin, size_t Length) const
{
    SectionList ustrList;

    if( !pStr || Length > _length )
    {
        return ustrList;
    }

    const wchar_t* pStrBegin = pStr+Begin;
    size_t* pKMPNext = KMPNext(pStrBegin,Length);
    if( !pKMPNext )
    {
        return ustrList;   // empty pattern
    }

    Section aSection;
    size_t strPos = 0;   // number of pattern characters matched so far

    for( size_t i = 0; i < _length; i ++ )
    {
        while( strPos > 0 && pStrBegin[strPos] != _pUStr[i] )
        {
            strPos = pKMPNext[strPos-1];
        }

        if( pStrBegin[strPos] == _pUStr[i] )
        {
            strPos ++;
        }

        if( strPos == Length )
        {
            aSection._begin = i - Length + 1;
            ustrList.push_back(aSection);

            // Fall back to the longest border of the pattern instead of
            // rewinding the text index. The next overlapping match is still
            // found, and the scan stays linear.
            strPos = pKMPNext[Length-1];
        }
    }

    delete[] pKMPNext;
    return ustrList;
}

// Returns the index of the first occurrence of Str at or after Start, or -1
// when Str cannot fit in the remaining characters or is not found.
UStr::Int64 UStr::UString::find_first(const UStr::UString& Str,size_t Start) const
{
    const size_t patternLen = Str.length();

    const bool fits = !( _length < Start+patternLen );
    if( !fits )
    {
        return -1;
    }

    return find_first(Str.u_str(),0,patternLen,Start);
}

// Returns the index of the first occurrence of the character WCh at or after
// Start, or -1 when Start is past the end or the character does not occur.
UStr::Int64 UStr::UString::find_first(wchar_t WCh, size_t Start) const
{
    if( _length <= Start )
    {
        return -1;
    }

    for( size_t i = Start; i < _length; i ++ )
    {
        if( _pUStr[i] == WCh )
        {
            return (Int64)i;
        }
    }

    return -1;
}

// Returns the index of the first occurrence of the pattern
// pStr[Begin .. Begin+Length) at or after Start, using KMP matching.
//
// Returns -1 when pStr is NULL, the pattern is empty, the pattern cannot fit
// in the characters from Start onwards, or no occurrence exists.
UStr::Int64 UStr::UString::find_first(const wchar_t* pStr,
                                      size_t Begin,
                                      size_t Length,
                                      size_t Start
                                      ) const
{
    if( !pStr || _length < Start+Length )
    {
        return -1;
    }

    const wchar_t* pStrBegin = pStr+Begin;
    size_t* pKMPNext = KMPNext(pStrBegin,Length);
    if( !pKMPNext )
    {
        return -1;   // empty pattern
    }

    Int64 found = -1;
    size_t strPos = 0;   // number of pattern characters matched so far

    for( size_t i = Start; i < _length; i ++ )
    {
        while( strPos > 0 && pStrBegin[strPos] != _pUStr[i] )
        {
            strPos = pKMPNext[strPos-1];
        }

        if( pStrBegin[strPos] == _pUStr[i] )
        {
            strPos ++;
        }

        if( strPos == Length )
        {
            found = (Int64)i-(Int64)Length+1;
            break;
        }
    }

    // Single release point for the failure table on every path.
    delete[] pKMPNext;
    return found;
}

bool UStr::UString::be_first(const UString& Str,size_t Start) const

{

return be_first(Str.u_str(),0,Str.length(),Start);

}

// Tells whether the pattern pStr[Begin .. Begin+Length) occurs in this string
// exactly at index Start.
//
// Returns false when the pattern does not fit in the characters from Start
// onwards, or when pStr is NULL while Length is non-zero. An empty pattern
// that fits is reported as present.
bool UStr::UString::be_first(const wchar_t* pStr,
                             size_t Begin,
                             size_t Length,
                             size_t Start
                             ) const
{
    if( _length < Start+Length )
    {
        return false;
    }

    if( !pStr && Length > 0 )
    {
        return false;   // nothing to compare against
    }

    for( size_t i = 0; i < Length; i ++ )
    {
        if( _pUStr[Start+i] != pStr[Begin+i] )
        {
            return false;
        }
    }

    return true;
}

// Copy assignment: replaces the content with the characters of Str.
// Self-assignment is a no-op. The storage policy of this object is kept.
UStr::UString& UStr::UString::operator =(const UString& Str)
{
    const bool selfAssign = ( this == &Str );
    if( !selfAssign )
    {
        this->set_ustring(Str);
    }

    return *this;
}

// Assigns a multibyte string. A NULL pointer resets this string to empty by
// dropping the buffer and allocating a fresh default one.
UStr::UString& UStr::UString::operator =(const char* pStr)
{
    if( !pStr )
    {
        delete[] _pUStr;
        default_construct();
        return *this;
    }

    set_ustring(pStr);
    return *this;
}

// Assigns a wide string. A NULL pointer resets this string to empty; assigning
// the string's own buffer pointer is a no-op.
UStr::UString& UStr::UString::operator =(const wchar_t* pStr)
{
    if( !pStr )
    {
        delete[] _pUStr;
        default_construct();
        return *this;
    }

    if( pStr != this->_pUStr )
    {
        set_ustring(pStr);
    }

    return *this;
}

// Returns a reference to the character at pos. For an out-of-range pos a
// reference to a function-local static dummy is returned instead, so the call
// never indexes outside the stored characters.
wchar_t& UStr::UString::operator [](size_t pos)
{
    static wchar_t wch;

    const bool inRange = ( pos < _length );
    return inRange ? _pUStr[pos] : wch;
}

void UStr::UString::set_ustring(const UString& Str)

{

_length = Str.length();

if( _capacity < _length )

{

delete[] _pUStr;

_capacity = d_capacity(_length);

_pUStr = new wchar_t[ _capacity+1 ];

}

wcsncpy(_pUStr,Str.u_str(),_length);

_pUStr[_length] = L'/0';

}

// Replaces the content with the multibyte string pStr converted to wide
// characters by mbstowcs under the user's default locale.
//
// pStr must not be NULL. If the input contains an invalid multibyte sequence
// the string becomes empty.
// Note: this calls setlocale(LC_ALL, ""), which changes the process-wide locale.
void UStr::UString::set_ustring(const char* pStr)
{
    const size_t len = strlen(pStr);

    if( _capacity < len )
    {
        // Allocate before releasing the old buffer, so a failed allocation
        // leaves the object intact.
        const size_t newCapacity = d_capacity(len);
        wchar_t* pBuf = new wchar_t[newCapacity+1];

        delete[] _pUStr;
        _pUStr = pBuf;
        _capacity = newCapacity;
    }

    (void)setlocale(LC_ALL,"");

    // At most len wide characters can result from len bytes.
    const size_t converted = mbstowcs(_pUStr,pStr,len);

    // mbstowcs reports an invalid sequence as (size_t)-1, which must never be
    // used as an index.
    _length = ( converted == (size_t)-1 ) ? 0 : converted;
    _pUStr[ _length ] = L'\0';
}

void UStr::UString::set_ustring(const wchar_t* pStr)

{

_length = wcslen(pStr);

if( _capacity < _length )

{

delete[] _pUStr;

_capacity = d_capacity(_length);

_pUStr = new wchar_t[_capacity+1];

}

wcsncpy( _pUStr,pStr,_length );

_pUStr[_length] = L'/0';

}

// Allocates a fresh, empty buffer sized for DefaultSize characters under the
// current storage policy and sets the length to 0.
//
// The previous buffer is NOT released here; callers that re-initialise an
// existing object delete it first.
void UStr::UString::default_construct(size_t DefaultSize/* = 0*/)
{
    _capacity = d_capacity(DefaultSize);
    _pUStr = new wchar_t[_capacity+1];

    if( _pUStr != NULL )
    {
        _pUStr[0] = L'\0';
    }
    else
    {
        // Only reachable on legacy runtimes whose new returns NULL.
        _capacity = 0;
    }

    _length = 0;
}

// Returns the capacity to allocate for Size characters under the current
// storage policy:
//   eUSNormal - Size + 1;
//   eUSBuffer - the smallest multiple of DEFAULT_CAPACITY greater than Size.
size_t UStr::UString::d_capacity(size_t Size)
{
    if( _usType == eUSNormal )
    {
        return Size+1;
    }
    else //if( _usType == eUSBuffer )
    {
        return (Size/DEFAULT_CAPACITY + 1)*DEFAULT_CAPACITY;
    }
}

/**************************************************************************************
* UString友元比较函数
**************************************************************************************/

// Two strings are equal when they have the same length and wcscmp reports
// their buffers as equal.
bool UStr::operator == ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const bool sameLength = ( Str1.length() == Str2.length() );
    return sameLength && wcscmp(Str1.u_str(),Str2.u_str()) == 0;
}

// Ordering used by UString: the shorter string is the smaller one; strings of
// equal length are ordered by wcscmp.
bool UStr::operator < ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t length1 = Str1.length();
    const size_t length2 = Str2.length();

    if( length1 != length2 )
    {
        return length1 < length2;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) < 0;
}

// Less-or-equal under UString's ordering: the shorter string is the smaller
// one; strings of equal length are ordered by wcscmp.
bool UStr::operator <= ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t length1 = Str1.length();
    const size_t length2 = Str2.length();

    if( length1 != length2 )
    {
        return length1 < length2;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) <= 0;
}

// Greater-than under UString's ordering: the longer string is the greater
// one; strings of equal length are ordered by wcscmp.
bool UStr::operator > ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t length1 = Str1.length();
    const size_t length2 = Str2.length();

    if( length1 != length2 )
    {
        return length1 > length2;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) > 0;
}

// Greater-or-equal under UString's ordering: the longer string is the greater
// one; strings of equal length are ordered by wcscmp.
bool UStr::operator >= ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t length1 = Str1.length();
    const size_t length2 = Str2.length();

    if( length1 != length2 )
    {
        return length1 > length2;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) >= 0;
}

/************************************************************************************
* UStr名空间内的全局函数的实现
************************************************************************************/

// Converts the first StrLen bytes of the multibyte string pStr into wide
// characters stored in pUStr (capacity: UStrLen elements), using mbstowcs
// under the user's default locale.
//
// Returns the number of wide characters written, excluding any terminator.
// Returns 0 when a pointer is NULL, when UStrLen < StrLen, or when the input
// contains an invalid multibyte sequence. The output is L'\0'-terminated only
// when there is room for the terminator (result < UStrLen).
// Note: this calls setlocale(LC_ALL, ""), which changes the process-wide locale.
size_t UStr::S_TO_US(char* pStr,size_t StrLen,wchar_t* pUStr,size_t UStrLen)
{
    if( !pStr || !pUStr || UStrLen < StrLen )
    {
        return 0;
    }

    // mbstowcs needs a NUL-terminated source, so work on a terminated copy.
    char* pStr2 = new char[StrLen+1];
    strncpy(pStr2,pStr,StrLen);
    pStr2[StrLen] = '\0';

    (void)setlocale(LC_ALL,"");
    size_t len = mbstowcs(pUStr,pStr2,StrLen);
    delete[] pStr2;

    if( len == (size_t)-1 )
    {
        len = 0;   // invalid multibyte sequence: report nothing converted
    }

    if( len < UStrLen )
    {
        pUStr[ len ] = L'\0';
    }

    return len;
}

// Converts the first UStrLen wide characters of pUStr into a multibyte string
// stored in pStr (capacity: StrLen bytes), using wcstombs under the user's
// default locale.
//
// Returns the number of bytes written, excluding any terminator. Returns 0
// when a pointer is NULL, when StrLen < 2*UStrLen, or when a character cannot
// be represented in the locale. The output is '\0'-terminated only when there
// is room for the terminator (result < StrLen).
// Note: this calls setlocale(LC_ALL, ""), which changes the process-wide locale.
size_t UStr::US_TO_S(wchar_t* pUStr,size_t UStrLen,char* pStr,size_t StrLen)
{
    if( !pStr || !pUStr || StrLen < 2*UStrLen )
    {
        return 0;
    }

    // wcstombs needs a NUL-terminated source, so work on a terminated copy.
    wchar_t* pUStr2 = new wchar_t[UStrLen+1];
    wcsncpy(pUStr2,pUStr,UStrLen);
    pUStr2[UStrLen] = L'\0';

    (void)setlocale(LC_ALL,"");

    // The limit is the size of the destination in bytes. Deriving it from
    // sizeof(wchar_t) would overrun pStr where wchar_t is 4 bytes wide.
    size_t len = wcstombs(pStr,pUStr2,StrLen);
    delete[] pUStr2;

    if( len == (size_t)-1 )
    {
        len = 0;   // unrepresentable character: report nothing converted
    }

    if( len < StrLen )
    {
        pStr[ len ] = '\0';
    }

    return len;
}