利用Stanford Parser进行中文观点抽取(附代码)

来源:互联网 发布:苹果手机查看mac地址 编辑:程序博客网 时间:2024/05/21 18:32

问题:

所谓的观点抽取就是从文本中获取关于某个特征词的观点词语。特征词在句子结构中通常为主语或者宾语,从词性上看一般为名词或者形容词,而观点词通常为带有情感色彩的形容词或者副词。观点词的抽取在用户对产品评价分析中非常有用。

例如:在“卖家 的 服务 态度 不错 , 快递 也 很 迅速”这个句子中,“服务”和“快递”是两个描述卖家的特征词,而“不错”和“迅速”则分别是这两个特征词的观点词。

 

方法:

1.选择文本数据(数据源,如产品评论文本等)

2.对文本进行断句和分词

3.筛选相关句子(找出含有特征描述对象的句子,直接匹配)

4.语法分析(Stanford Parser)

5.抽取观点词(遍历stanford-parser生成的语法结构树,找到离特征词节点最近的观点词节点,具体参见下面代码)

 

代码:

这里给的代码直接略过了前面几步,输入为:分词后的句子和特征词,输出为:该特征词的观点词。

 

 

package textAnalysis;

 

import java.io.StringReader;

import java.util.Iterator;

import java.util.List;

 

import edu.stanford.nlp.ling.HasWord;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

import edu.stanford.nlp.process.Tokenizer;

import edu.stanford.nlp.trees.Tree;

import edu.stanford.nlp.trees.TreebankLanguagePack;

import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;

 

public class DepedWordExtra {

 

    static String[] options = { "-MAX_ITEMS", "200000000" };

    static LexicalizedParser lp = new LexicalizedParser(

           "grammar/chinesePCFG.ser.gz", options);

 

    public static void main(String[] args) {

 

       String sentence = "老师 穿 着 一件 很 美丽 的 衣服";

       String keyword = "衣服";

       int kwIndex = 0;

       String sentArry[] = sentence.split(" ");

       for (int i = 0; i < sentArry.length; i++) {

           if (keyword.equals(sentArry[i])) {

              kwIndex = i;

              break;

           }

       }

       // System.out.println(kwIndex);

 

       extraDepWord(sentence, keyword);

 

    }

 

    private static void extraDepWord(String sentence, String keyword) {

       // TODO Auto-generated method stub

       TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();

       Tokenizer<? extends HasWord> toke = tlp.getTokenizerFactory()

              .getTokenizer(new StringReader(sentence));

       List<? extends HasWord> sentList = toke.tokenize();

       Tree parse = lp.apply(sentList);

       List<Tree> leaves = parse.getLeaves();

 

       Iterator<Tree> it = leaves.iterator();

       while (it.hasNext()) {

           Tree leaf = it.next();

           if (leaf.nodeString().trim().equals(keyword)) {

              Tree start = leaf;

              start = start.parent(parse);

              String tag = start.value().toString().trim();

              boolean extraedflg = false;

              // 如果当前节点的父节点是NN,则遍历该父节点的父节点的兄弟节点

              if (tag.equals("NN") || tag.equals("VA")) {

                  for (int i = 0; i < parse.depth(); i++) {

                     start = start.parent(parse);

                     if (start.value().toString().trim().equals("ROOT")

                            || extraedflg == true) {

                         break;

                     } else {

 

                         List<Tree> bros = start.siblings(parse);

                         if (bros != null) {

 

                            Iterator<Tree> it1 = bros.iterator();

                            while (it1.hasNext()) {

 

                                Tree bro = it1.next();

                                extraedflg = IteratorTree(bro, tag);

                                if (extraedflg) {

                                   break;

                                }

 

                            }

                         }

                     }

                  }

              }

 

           }

       }

    }

 

    private static boolean IteratorTree(Tree bro, String tagKey) {

       List<Tree> ends = bro.getChildrenAsList();

       Iterator<Tree> it = ends.iterator();

      

       while (it.hasNext()) {

           Tree end = it.next();

           String tagDep = end.value().toString().trim();

           if ((tagKey.equals("NN") && tagDep.equals("VA")) || (tagKey.equals("VA") && tagDep.equals("AD"))) {

              Tree depTree = end.getChild(0);

              System.out.println(depTree.value().toString());

              return true;

           } else if (IteratorTree(end, tagKey)) {

              return true;

           }

       }

       return false;

    }

}

原创粉丝点击