内容分析程序

来源：互联网 | 发布：淘宝详情图关注怎么做 | 编辑：程序博客网 | 时间：2024/05/16 09:05

内容分析程序

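/*
 * DataAnalysis.java
 *
 * Content analysis program (内容分析程序)
 *
 * Created on October 9, 2007, 7:43 PM
 *
 * Ma Rulin (马如林), Guilin University of Electronic Technology (桂林电子科技大学)
 */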

package com.mrl;

import java.io.*;

import java.util.*;

import java.sql.*;

import java.util.Hashtable;

import com.xjt.nlp.word.ICTCLAS;

import com.mrl.DbConn;

import com.mrl.FileProcess;

/**
 * @author rulinma
 */

public class DataAnalysis

{

/**
 * Keyword table: maps a word to the text of its probability value.
 * Filled by getKeywordProbaliy() and read by divStr().
 */

private static Hashtable KeywordProbality = new Hashtable();

/**
 * Upper threshold for the running product in filterString().
 * Starts at 1.0, is raised by getKeywordProbaliy() to the largest value read
 * from the keyword file, and is then replaced in main() by
 * Double.MAX_VALUE / maxDouble.
 */

private static double maxDouble = 1.0;

/**
 * Lower threshold for the running product in filterString().
 * Starts at -1.0, is lowered by getKeywordProbaliy() to the smallest value read
 * from the keyword file, and is then replaced in main() by
 * Double.MIN_VALUE / minDouble.
 * NOTE(review): because it starts at -1.0, only file values below -1.0 ever
 * change it -- confirm that this is the intended starting point.
 */

private static double minDouble = -1.0;

/** Status value written back by getContent() when contentAnalysis() returns true. */

private static String normalStatus = "1";

/** Status value written back by getContent() when contentAnalysis() returns false. */

private static String abnormalStatus = "3";

/** Creates a new instance of DataAnalysis; the constructor body is empty. */

public DataAnalysis()

{

}

* 主函数入口

public static void main(String[] args)

{

// 从参数列表中获取

String tableName = "dlog_diary";

String primaryKey = "diary_id";

String content = "content";

String status = "status";

String statusFlag = "2";

// 获取关键词及对应的概率

getKeywordProbaliy();

maxDouble = Double.MAX_VALUE / maxDouble;

minDouble = Double.MIN_VALUE / minDouble;

// 不断循环,也可以使用定时器

while(true)

{

// 获取内容

getContent(tableName,primaryKey,content,status,statusFlag);

}

* JavaBean对象调用入口,需要修改其中的static定义

public static void DataAnalysis()

{

// 从参数列表中获取

String tableName = "dlog_diary";

String primaryKey = "diary_id";

String content = "content";

String status = "status";

String statusFlag = "2";

getKeywordProbaliy();

while(true)

{

// 获取内容

getContent(tableName,primaryKey,content,status,statusFlag);

}

private static void getContent(String tableName, String primaryKey, String content, String status, String statusFlag)

{

Connection con = DbConn.getConn ();

Statement stmt = null;

ResultSet rs = null;

while(true)

{

try

{

stmt = con.createStatement();

String querySql = "SELECT " + primaryKey + "," + content+ "," + status + " FROM "+ tableName + " WHERE " + status +" = " + statusFlag ;

rs = stmt.executeQuery(querySql);

while(rs.next ())

{

String id = rs.getString(primaryKey);

String blogContent = rs.getString(content);

// 内容分析

boolean result = contentAnalysis(blogContent);

// 反馈程序

if(result)

{

feedBack(tableName, primaryKey, status, id, normalStatus);

}

else

{

feedBack(tableName, primaryKey, status, id, abnormalStatus);

}

catch (SQLException e)

{

e.getStackTrace ();

}

finally

{

if(rs!=null)

{

try

{

rs.close ();

}

catch(Exception e)

{

e.getStackTrace ();

}

if(stmt!=null)

{

try

{

stmt.close ();

}

catch(Exception e)

{

e.getStackTrace ();

}

* 字符串分词

private static boolean contentAnalysis(String strContent)

{

ICTCLAS ictclas = new ICTCLAS();

if(!ictclas.init (0,2))

{

ictclas.init (0,2);

}

String strTrans=ictclas.paragraphProcess(strContent);

// 根据字符串的先验概率计算概率

return(filterString(strTrans));

}

* 根据字符串的先验概率计算概率

private static boolean filterString(String srcStr)

{

/** 使用trim去掉前后多余空格防止发生意外 */

String strTemp=srcStr.trim();

String tempText = "";

double probality = 1.0;

StringTokenizer st = new StringTokenizer(strTemp," ");

int len = st.countTokens();

int i = 0;

while(i<len)

{

tempText = st.nextToken();

i++;

// 获取该词对应的概率

double keywordProbality = divStr(tempText);

{

probality = probality * keywordProbality;

// 用2个参数分别表示上溢出和下溢出值

// 溢出处理

// 一旦遇到上溢出表明其中含有不良信息即可停止计算

if(probality > maxDouble)

{

return false;

}

else if(probality < minDouble)

{

// 继续往下计算

probality = 1.0;

}

System.out.println (probality);

if(probality >1.0)

{

return false;

}

else

{

return true;

}

* 分词

private static double divStr(String srcStr)

{

StringTokenizer st = new StringTokenizer(srcStr,"/");

double probality = 1.0;

int len = st.countTokens();

if(len == 2)

{

String strPre = st.nextToken ();

// 获取该词的先验概率

probality = Double.parseDouble(KeywordProbality.get(strPre).toString());

}

return probality;

}

* 根据系统默认设置获取对应的词和概率存储在向量中

private static void getKeywordProbaliy()

{

FileProcess fileProcess = new FileProcess();

/** 读取数据配置文件所在目录 */

String dir = fileProcess.GetCurrDir ();

/** 系统默认设置的目录文件 */

String fileName = "/blogAudit/Incoming/300ArticlesKeywordsPossible.txt";

/** 完整的文件访问路径 */

String fullFileName = dir + fileName;

File myFile = new File(fullFileName);

if(!myFile.exists())

{

System.err.println("Can't Find " + fullFileName);

}

try

{

BufferedReader in = new BufferedReader(new FileReader(myFile));

String str;

while ((str = in.readLine()) != null)

{

// |为分隔符

int divPos = str.lastIndexOf("|");

try

{

if(maxDouble < Double.parseDouble(str.substring(divPos+1,str.length())))

{

// 最大值

maxDouble = Double.parseDouble(str.substring(divPos+1,str.length()));

}

if(minDouble > Double.parseDouble(str.substring(divPos+1,str.length())))

{

// 最小值

minDouble = Double.parseDouble(str.substring(divPos+1,str.length()));

}

// 添加词及对应的概率

KeywordProbality.put(str.substring(0,divPos),str.substring(divPos+1,str.length()));

}

catch(Exception e)

{

e.getStackTrace();

}

in.close();

}

catch (IOException e)

{

e.getStackTrace();

}

* 反馈系统

private static void feedBack(String tableName, String primaryKey, String status, String id, String statsUpdateFlag)

{

Connection con = DbConn.getConn ();

Statement stmt = null;

try

{

stmt = con.createStatement();

String upSql = null;

upSql = "UPDATE "+ tableName +" SET "+ status + "=" + statsUpdateFlag +" WHERE "+ primaryKey + "=" +id ;

/** 执行插入操作*/

stmt.executeUpdate(upSql);

}

catch (SQLException e)

{

e.getStackTrace ();

}

finally

{

if(stmt!=null)

{

try

{

stmt.close ();

}

catch(Exception e)

{

e.getStackTrace ();

}