调用海量智能分词研究版的dll获取分词的结果(C#)

来源:互联网 发布:小米彻底删除双开数据 编辑:程序博客网 时间:2024/05/25 12:21

中文分词是中文搜索引擎的基础，主要应用在信息检索、信息挖掘、中外文对译、中文校对、自动聚类、自动分类等很多方面。

下面是我参照 VC 的例子改写的 C# 版本。^_^

using System;
using System.Drawing;
using System.Text;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Runtime.InteropServices;
namespace ChineseParse
{

/// <summary>
/// Managed mirror of the native word record. Instances are produced by
/// Marshal.PtrToStructure from the pointers returned by HLGetWordAt and
/// HLGetFileKeyAt.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct SHLSegWord
{
    /// <summary>The word text.</summary>
    public string s_szWord;

    /// <summary>
    /// Part-of-speech flag bits, tested against the NATURE_* constants.
    /// (The original author's note suggests the native field is a System.UInt32.)
    /// </summary>
    public int s_dwPOS;

    /// <summary>Keyword weight; 0 when the word is not a keyword.</summary>
    public float s_fWeight;
}

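/// <summary>
/// Demo form that calls the HLSSplit.dll segmenter through P/Invoke and shows the
/// segmented words, the keywords and the semantic fingerprint of the input text.
/// </summary>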
public class frmHLParse : System.Windows.Forms.Form
{
// Controls created and laid out in InitializeComponent.
private System.Windows.Forms.RichTextBox txtOutput;
private System.Windows.Forms.RichTextBox txtInput;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.Button btnExit;
private System.Windows.Forms.Button btnParse;
// Result buffers: reset in btnParse_Click, filled by ParseWord, then copied to the controls.
private string m_strKey;
private string m_strWords;
private string m_strFinger;
/************************************************************/
// Constant definitions //
/************************************************************/
const int HL_CAL_OPT_KEYWORD = 0x1;// extra-calculation flag: compute keywords
const int HL_CAL_OPT_FINGER = 0x2;// extra-calculation flag: compute the document's semantic fingerprint
const int HL_CAL_OPT_POS = 0x4;// extra-calculation flag: compute part of speech
const int HL_CAL_OPT_SEARCH = 0x8;// output retrieval-oriented segmentation results
/************************************************************/
// Part-of-speech definitions //
/************************************************************/
public const int NATURE_D_A=0x40000000;// adjective / adjectival morpheme
public const int NATURE_D_B=0x20000000;// distinguishing word / distinguishing morpheme
public const int NATURE_D_C=0x10000000;// conjunction / conjunctive morpheme
public const int NATURE_D_D=0x08000000;// adverb / adverbial morpheme
public const int NATURE_D_E=0x04000000;// interjection / interjection morpheme
public const int NATURE_D_F=0x02000000;// locality word / locality morpheme
public const int NATURE_D_I=0x01000000;// idiom (chengyu)
public const int NATURE_D_L=0x00800000;// idiomatic expression
public const int NATURE_A_M=0x00400000;// numeral / numeral morpheme
public const int NATURE_D_MQ=0x00200000;// numeral-classifier compound
public const int NATURE_D_N=0x00100000;// noun / nominal morpheme
public const int NATURE_D_O=0x00080000;// onomatopoeia
public const int NATURE_D_P=0x00040000;// preposition
public const int NATURE_A_Q=0x00020000;// classifier (measure word) / classifier morpheme
public const int NATURE_D_R=0x00010000;// pronoun / pronominal morpheme
public const int NATURE_D_S=0x00008000;// place word
public const int NATURE_D_T=0x00004000;// time word
public const int NATURE_D_U=0x00002000;// particle / particle morpheme
public const int NATURE_D_V=0x00001000;// verb / verbal morpheme
public const int NATURE_D_W=0x00000800;// punctuation
public const int NATURE_D_X=0x00000400;// non-morpheme character
public const int NATURE_D_Y=0x00000200;// modal particle / modal morpheme
public const int NATURE_D_Z=0x00000100;// stative word
public const int NATURE_A_NR=0x00000080;// personal name
public const int NATURE_A_NS=0x00000040;// place name
public const int NATURE_A_NT=0x00000020;// organization
public const int NATURE_A_NX=0x00000010;// foreign-script characters
public const int NATURE_A_NZ=0x00000008;// other proper noun
public const int NATURE_D_H=0x00000004;// prefix component
// The designer inserted the following control fields in the middle of the constant list.
private System.Windows.Forms.RichTextBox txtKey;
private System.Windows.Forms.CheckBox chkPos;
private System.Windows.Forms.CheckBox chkSeach;
private System.Windows.Forms.Label txtMsg;
private System.Windows.Forms.CheckBox chkKeyword;
private System.Windows.Forms.CheckBox chkFinger;
private System.Windows.Forms.Label lblFinger;
private System.Windows.Forms.Button btn;
public const int NATURE_D_K=0x00000002;// suffix component

// P/Invoke declarations for the native HLSSplit.dll segmenter.
// NOTE(review): the bool returns use the default marshalling (4-byte Win32 BOOL). If the
// native functions return a 1-byte C++ bool, [return: MarshalAs(UnmanagedType.I1)] is
// required - confirm against the DLL's header.
// NOTE(review): string parameters use the default (ANSI) marshalling - presumably the DLL
// expects the system code page; confirm.

// Initialize the segmentation dictionary from the given directory.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLSplitInit")]
private static extern bool HLSplitInit(string path);
// Create a segmentation handle (ParseWord treats IntPtr.Zero as failure).
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLOpenSplit")]
private static extern IntPtr HLOpenSplit();
// Segment a string; flag is a combination of the HL_CAL_OPT_* constants.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLSplitWord")]
private static extern bool HLSplitWord(IntPtr pHandle,string text,int flag);

// Get the number of words in the segmentation result.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetWordCnt")]
private static extern int HLGetWordCnt(IntPtr pHandle);

// Get the segmentation result at the given index (pointer to a native SHLSegWord).
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetWordAt")]
private static extern IntPtr HLGetWordAt(IntPtr pHandle,int pos);

// Get the number of keywords.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetFileKeyCnt")]
private static extern int HLGetFileKeyCnt(IntPtr pHandle);

// Get the keyword at the given index (pointer to a native SHLSegWord).
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetFileKeyAt")]
private static extern IntPtr HLGetFileKeyAt(IntPtr pHandle,int pos);

// Load a user-defined dictionary.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLOpenUsrDict")]
private static extern bool HLOpenUsrDict(string lpUserDictName);

// Unload the user-defined dictionary.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLFreeUsrDict")]
private static extern bool HLFreeUsrDict();

// Get the semantic fingerprint: rpData receives the data pointer, rdwLen its length in bytes.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLGetFingerM")]
private static extern bool HLGetFingerM(IntPtr hHandle,ref IntPtr rpData, ref Int32 rdwLen);

// Close a segmentation handle.
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLCloseSplit")]
private static extern void HLCloseSplit(IntPtr pHandle);
// Unload the segmentation system (dictionary).
[DllImport("HLSSplit.dll",SetLastError=true,EntryPoint="HLFreeSplit")]
private static extern void HLFreeSplit();
 
/// <summary>
/// Required designer variable; disposed in Dispose when it is not null.
/// </summary>
private System.ComponentModel.Container components = null;

/// <summary>
/// Creates the form and builds its controls.
/// </summary>
public frmHLParse()
{
    // Required for Windows Forms designer support.
    this.InitializeComponent();

    // TODO: add any further constructor code after the InitializeComponent call.
}
/// <summary>
/// Cleans up the resources in use by the form.
/// </summary>
/// <param name="disposing">
/// When true, the designer component container (if one exists) is disposed
/// before the call is forwarded to the base class.
/// </param>
protected override void Dispose(bool disposing)
{
    bool releaseComponents = disposing && components != null;
    if (releaseComponents)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}

#region Windows 窗体设计器生成的代码
/// <summary>
/// Required method for designer support - do not modify the contents of
/// this method with the code editor.
/// Creates every control of the form, sets its position, size, tab order and caption,
/// wires the three button Click handlers and adds the controls to the form.
/// </summary>
private void InitializeComponent()
{
this.txtOutput = new System.Windows.Forms.RichTextBox();
this.txtInput = new System.Windows.Forms.RichTextBox();
this.label1 = new System.Windows.Forms.Label();
this.label2 = new System.Windows.Forms.Label();
this.btnExit = new System.Windows.Forms.Button();
this.btnParse = new System.Windows.Forms.Button();
this.txtKey = new System.Windows.Forms.RichTextBox();
this.chkPos = new System.Windows.Forms.CheckBox();
this.chkKeyword = new System.Windows.Forms.CheckBox();
this.chkFinger = new System.Windows.Forms.CheckBox();
this.chkSeach = new System.Windows.Forms.CheckBox();
this.txtMsg = new System.Windows.Forms.Label();
this.lblFinger = new System.Windows.Forms.Label();
this.btn = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// txtOutput (read-only box that receives the segmented words)
//
this.txtOutput.Location = new System.Drawing.Point(34, 208);
this.txtOutput.Name = "txtOutput";
this.txtOutput.ReadOnly = true;
this.txtOutput.Size = new System.Drawing.Size(488, 136);
this.txtOutput.TabIndex = 1;
this.txtOutput.Text = "";
//
// txtInput (text to segment, pre-filled with a sample sentence)
//
this.txtInput.Location = new System.Drawing.Point(34, 4);
this.txtInput.Name = "txtInput";
this.txtInput.Size = new System.Drawing.Size(488, 154);
this.txtInput.TabIndex = 0;
this.txtInput.Text = "海量中文智能分词基础件具有灵活定制的特点,支持多平台、 支持多码制、 针对不同应用可量身定做多种版本";
//
// label1 (caption "input")
//
this.label1.Location = new System.Drawing.Point(6, 6);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(24, 36);
this.label1.TabIndex = 2;
this.label1.Text = "输入";
//
// label2 (caption "output")
//
this.label2.Location = new System.Drawing.Point(4, 204);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(24, 36);
this.label2.TabIndex = 3;
this.label2.Text = "输出";
//
// btnExit (closes the form)
//
this.btnExit.Location = new System.Drawing.Point(604, 352);
this.btnExit.Name = "btnExit";
this.btnExit.TabIndex = 5;
this.btnExit.Text = "退出";
this.btnExit.Click += new System.EventHandler(this.btnExit_Click);
//
// btnParse (runs the segmentation)
//
this.btnParse.Location = new System.Drawing.Point(272, 352);
this.btnParse.Name = "btnParse";
this.btnParse.TabIndex = 4;
this.btnParse.Text = "分词";
this.btnParse.Click += new System.EventHandler(this.btnParse_Click);
//
// txtKey (read-only box that receives the keyword list)
//
this.txtKey.Location = new System.Drawing.Point(528, 4);
this.txtKey.Name = "txtKey";
this.txtKey.ReadOnly = true;
this.txtKey.Size = new System.Drawing.Size(160, 340);
this.txtKey.TabIndex = 6;
this.txtKey.Text = "";
//
// chkPos (option: part of speech)
//
this.chkPos.Location = new System.Drawing.Point(42, 164);
this.chkPos.Name = "chkPos";
this.chkPos.Size = new System.Drawing.Size(74, 20);
this.chkPos.TabIndex = 7;
this.chkPos.Text = "词性";
//
// chkKeyword (option: keywords, checked by default)
//
this.chkKeyword.Checked = true;
this.chkKeyword.CheckState = System.Windows.Forms.CheckState.Checked;
this.chkKeyword.Location = new System.Drawing.Point(120, 164);
this.chkKeyword.Name = "chkKeyword";
this.chkKeyword.Size = new System.Drawing.Size(74, 20);
this.chkKeyword.TabIndex = 8;
this.chkKeyword.Text = "关键词";
//
// chkFinger (option: semantic fingerprint, checked by default)
//
this.chkFinger.Checked = true;
this.chkFinger.CheckState = System.Windows.Forms.CheckState.Checked;
this.chkFinger.Location = new System.Drawing.Point(198, 164);
this.chkFinger.Name = "chkFinger";
this.chkFinger.Size = new System.Drawing.Size(74, 20);
this.chkFinger.TabIndex = 9;
this.chkFinger.Text = "语义指纹";
//
// chkSeach (option: retrieval-oriented output)
//
this.chkSeach.Location = new System.Drawing.Point(276, 164);
this.chkSeach.Name = "chkSeach";
this.chkSeach.Size = new System.Drawing.Size(74, 20);
this.chkSeach.TabIndex = 10;
this.chkSeach.Text = "检索优化";
//
// txtMsg (label that shows the elapsed segmentation time)
//
this.txtMsg.Location = new System.Drawing.Point(356, 164);
this.txtMsg.Name = "txtMsg";
this.txtMsg.Size = new System.Drawing.Size(166, 20);
this.txtMsg.TabIndex = 11;
this.txtMsg.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// lblFinger (label that shows the semantic fingerprint)
//
this.lblFinger.Location = new System.Drawing.Point(36, 188);
this.lblFinger.Name = "lblFinger";
this.lblFinger.Size = new System.Drawing.Size(486, 18);
this.lblFinger.TabIndex = 12;
this.lblFinger.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
//
// btn (opens the secondary dialog)
//
this.btn.Location = new System.Drawing.Point(32, 352);
this.btn.Name = "btn";
this.btn.TabIndex = 13;
this.btn.Text = "其它";
this.btn.Click += new System.EventHandler(this.btn_Click);
//
// frmHLParse
//
this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
this.ClientSize = new System.Drawing.Size(696, 381);
this.Controls.Add(this.btn);
this.Controls.Add(this.lblFinger);
this.Controls.Add(this.txtMsg);
this.Controls.Add(this.chkSeach);
this.Controls.Add(this.chkFinger);
this.Controls.Add(this.chkKeyword);
this.Controls.Add(this.chkPos);
this.Controls.Add(this.txtKey);
this.Controls.Add(this.btnExit);
this.Controls.Add(this.btnParse);
this.Controls.Add(this.label2);
this.Controls.Add(this.label1);
this.Controls.Add(this.txtOutput);
this.Controls.Add(this.txtInput);
this.Name = "frmHLParse";
this.Text = "HLParse";
this.ResumeLayout(false);

}
#endregion

/// <summary>
/// Main entry point of the application: creates the segmentation form and
/// runs the message loop on it.
/// </summary>
[STAThread]
static void Main()
{
    frmHLParse mainForm = new frmHLParse();
    Application.Run(mainForm);
}
/// <summary>
/// Click handler of the segment button: clears the result buffers, segments the
/// input text and copies the three results into their controls.
/// </summary>
private void btnParse_Click(object sender, System.EventArgs e)
{
    // Start from empty buffers; ParseWord only appends to / overwrites them.
    m_strWords = "";
    m_strFinger = "";
    m_strKey = "";

    string input = txtInput.Text;
    ParseWord(input);

    txtOutput.Text = m_strWords;
    txtKey.Text = m_strKey;
    lblFinger.Text = m_strFinger;
}

/// <summary>
/// Click handler of the exit button: closes the form.
/// </summary>
private void btnExit_Click(object sender, System.EventArgs e)
{
    Close();
}
/// <summary>
/// Segments <paramref name="text"/> with the native HLSSplit engine and stores the results:
/// m_strWords receives the words separated by '|' (each followed by its part-of-speech
/// tag when chkPos is checked), m_strKey receives the numbered keyword list with weights
/// (when chkKeyword is checked) and m_strFinger receives the semantic fingerprint as
/// hexadecimal bytes (when chkFinger is checked). The elapsed time of the segmentation
/// call is written to txtMsg. Failures are reported with a message box.
/// </summary>
/// <param name="text">The text to segment.</param>
private void ParseWord(string text)
{
    // NOTE(review): the dictionary directory is hard-coded to the original author's build
    // output folder - presumably this should be the application's own directory; confirm.
    bool bInitDict = HLSplitInit(@"D:/MyProjects/ChineseParse/bin/Debug/");
    if (!bInitDict)
    {
        MessageBox.Show("初始化分词字典失败!", "错误");
        return;
    }

    IntPtr hHandle = IntPtr.Zero;
    try
    {
        hHandle = HLOpenSplit(); // create the segmentation handle
        if (hHandle == IntPtr.Zero)
        {
            MessageBox.Show("创建分词句柄失败!", "错误");
            return;
        }

        // Collect the extra-calculation flags requested through the check boxes.
        int iExtraCalcFlag = 0;
        if (this.chkPos.Checked)
        {
            iExtraCalcFlag |= HL_CAL_OPT_POS;
        }
        if (this.chkKeyword.Checked)
        {
            iExtraCalcFlag |= HL_CAL_OPT_KEYWORD;
        }
        if (this.chkSeach.Checked)
        {
            iExtraCalcFlag |= HL_CAL_OPT_SEARCH;
        }
        if (this.chkFinger.Checked)
        {
            iExtraCalcFlag |= HL_CAL_OPT_FINGER;
        }

        DateTime bgdt = DateTime.Now;
        bool bSuccess = HLSplitWord(hHandle, text, iExtraCalcFlag);
        TimeSpan ts = DateTime.Now - bgdt;
        this.txtMsg.Text = string.Format("用时{0}分{1}秒{2}毫秒", ts.Minutes, ts.Seconds, ts.Milliseconds);

        if (!bSuccess)
        {
            MessageBox.Show("分词失败!", "错误");
            return;
        }

        AppendSegmentedWords(hHandle);
        if (this.chkKeyword.Checked)
        {
            AppendKeywords(hHandle);
        }
        if (this.chkFinger.Checked)
        {
            ReadFingerprint(hHandle);
        }
    }
    finally
    {
        // Always release the native resources, even when an exception is thrown above:
        // first the handle (if one was created), then the dictionary.
        if (hHandle != IntPtr.Zero)
        {
            HLCloseSplit(hHandle);
        }
        HLFreeSplit();
    }
}

// Appends every segmented word (plus its part-of-speech tag when requested) to m_strWords.
private void AppendSegmentedWords(IntPtr hHandle)
{
    StringBuilder words = new StringBuilder();
    int nResultCnt = HLGetWordCnt(hHandle);
    for (int i = 0; i < nResultCnt; i++)
    {
        IntPtr pWordPtr = HLGetWordAt(hHandle, i);
        if (pWordPtr == IntPtr.Zero)
        {
            continue; // PtrToStructure would throw on a null pointer
        }

        SHLSegWord word = (SHLSegWord)Marshal.PtrToStructure(pWordPtr, typeof(SHLSegWord));
        words.Append(word.s_szWord);
        if (this.chkPos.Checked)
        {
            words.Append(GetNatureString(word.s_dwPOS));
        }
        words.Append("|");
    }
    m_strWords += words.ToString();
}

// Appends one "index.word weight" line per non-empty keyword to m_strKey.
private void AppendKeywords(IntPtr hHandle)
{
    StringBuilder keys = new StringBuilder();
    int nKeyCnt = HLGetFileKeyCnt(hHandle);
    for (int j = 0; j < nKeyCnt; j++)
    {
        IntPtr pKeyPtr = HLGetFileKeyAt(hHandle, j);
        if (pKeyPtr == IntPtr.Zero)
        {
            continue;
        }

        SHLSegWord key = (SHLSegWord)Marshal.PtrToStructure(pKeyPtr, typeof(SHLSegWord));
        if (key.s_szWord == null || key.s_szWord.Length == 0)
        {
            continue;
        }

        // "\r\n" is a real line break; the original literal "/r/n" was printed verbatim.
        keys.AppendFormat("{0}.{1} {2}\r\n", j + 1, key.s_szWord, key.s_fWeight);
    }
    m_strKey += keys.ToString();
}

// Stores the semantic fingerprint in m_strFinger as space-separated hexadecimal bytes.
private void ReadFingerprint(IntPtr hHandle)
{
    m_strFinger = "语义指纹:";

    // The buffer is passed by reference, so the callee may either fill it or replace the
    // pointer with its own. Keep our own copy of the allocation so it is always freed.
    IntPtr scratch = Marshal.AllocHGlobal(64);
    try
    {
        IntPtr pData = scratch;
        Int32 dataLen = 0;
        if (HLGetFingerM(hHandle, ref pData, ref dataLen))
        {
            StringBuilder finger = new StringBuilder();
            for (int j = 0; j < dataLen; j++)
            {
                // ReadByte takes an offset, avoiding IntPtr.ToInt32 (overflows on 64-bit).
                byte b = Marshal.ReadByte(pData, j);
                finger.Append(string.Format("{0:x}", b));
                finger.Append(" ");
            }
            m_strFinger += finger.ToString();
        }
    }
    finally
    {
        Marshal.FreeHGlobal(scratch);
    }
}
// Wrapper around a byte array.
// NOTE(review): not referenced anywhere in the visible code - looks like a leftover
// experiment; confirm there are no external users before removing or renaming it.
public struct aaaa
{
public byte[] data;
}
// Part-of-speech bits in the order they are tested; parallel to s_natureTags.
private static readonly int[] s_natureFlags = new int[]
{
    NATURE_D_A, NATURE_D_B, NATURE_D_C, NATURE_D_D, NATURE_D_E, NATURE_D_F,
    NATURE_D_I, NATURE_D_L, NATURE_A_M, NATURE_D_MQ, NATURE_D_N, NATURE_D_O,
    NATURE_D_P, NATURE_A_Q, NATURE_D_R, NATURE_D_S, NATURE_D_T, NATURE_D_U,
    NATURE_D_V, NATURE_D_W, NATURE_D_X, NATURE_D_Y, NATURE_D_Z, NATURE_A_NR,
    NATURE_A_NS, NATURE_A_NT, NATURE_A_NX, NATURE_A_NZ, NATURE_D_H, NATURE_D_K
};

// Tag for each entry of s_natureFlags: adjective, distinguishing word, conjunction,
// adverb, interjection, locality word, idiom, idiomatic expression, numeral,
// numeral-classifier, noun, onomatopoeia, preposition, classifier, pronoun, place word,
// time word, particle, verb, punctuation, non-morpheme character, modal particle,
// stative word, personal name, place name, organization, foreign characters,
// other proper noun, prefix component, suffix component.
private static readonly string[] s_natureTags = new string[]
{
    "a", "b", "c", "d", "e", "f",
    "i", "l", "m", "mq", "n", "o",
    "p", "q", "r", "s", "t", "u",
    "v", "w", "x", "y", "z", "nr",
    "ns", "nt", "nx", "nz", "h", "k"
};

/// <summary>
/// Converts part-of-speech flag bits into a display tag of the form ".tag".
/// The flags are tested in a fixed priority order and the first one that is set wins.
/// </summary>
/// <param name="dwPos">Part-of-speech flag bits (combination of the NATURE_* constants).</param>
/// <returns>
/// "." followed by the tag of the first matching flag, or ".?" when no known flag is set.
/// </returns>
private string GetNatureString(int dwPos)
{
    for (int i = 0; i < s_natureFlags.Length; i++)
    {
        int flag = s_natureFlags[i];
        if ((dwPos & flag) == flag)
        {
            // Every tag gets exactly one leading dot (pronoun and time word used to get two).
            return "." + s_natureTags[i];
        }
    }

    return ".?"; // unknown part of speech
}

/// <summary>
/// Click handler of the "other" button: shows Form1 as a modal dialog.
/// </summary>
private void btn_Click(object sender, System.EventArgs e)
{
    // A form shown with ShowDialog is not disposed when it is closed, so release it
    // explicitly. Form1 is declared elsewhere in the project and is assumed to be a
    // Windows Forms Form (hence IDisposable) - confirm.
    using (Form1 frm = new Form1())
    {
        frm.ShowDialog();
    }
}
}
}

 海量智能分词研究版下载:http://www.hylanda.com/cgi-bin/download/download.asp?id=8

原创粉丝点击