NLP: Stanford NLP


http://www.zmonster.me/2016/06/08/use-stanford-nlp-package-in-nltk.html

http://stanfordnlp.github.io/CoreNLP/

http://blog.csdn.net/ltbylc/article/details/8557965


1. Stanford NLP

Stanford NLP covers the following tasks (the class names below are the NLTK wrappers described in the first link above); a minimal CoreNLP pipeline for the same tasks is sketched after this list:

  • Tokenization / word segmentation: StanfordTokenizer
  • POS tagging: StanfordPOSTagger
  • Named entity recognition: StanfordNERTagger
  • Constituency parsing: StanfordParser
  • Dependency parsing: StanfordDependencyParser, StanfordNeuralDependencyParser
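
The same tasks can also be run through a single StanfordCoreNLP pipeline. Below is a minimal sketch, assuming the CoreNLP jar and its default (English) models are on the classpath; for Chinese, the properties from the Chinese models jar would be loaded instead. The class name and sample sentence are illustrative.

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class PipelineDemo {
    public static void main(String[] args) {
        // Standard CoreNLP annotator names for the tasks listed above.
        // ner requires pos and lemma; parse/depparse require tokenize and ssplit.
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, depparse");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Placeholder text; with this setup the English models are used.
        Annotation document = new Annotation("Stanford University is located in California.");
        pipeline.annotate(document);

        // Dump tokens, tags, entities, trees and dependencies per sentence.
        pipeline.prettyPrint(document, System.out);
    }
}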

2. Word Segmentation

2.1 Segmentation demo

  • JVM argument: -mx1g
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class stanfordSeg {

    // Join the segmenter's output tokens with spaces.
    // (The original cast of toArray() to String[] would throw a
    // ClassCastException; segmentString already returns List<String>.)
    public static String doSegment(String data, CRFClassifier<CoreLabel> c) {
        List<String> segments = c.segmentString(data);
        StringBuilder buf = new StringBuilder();
        for (String s : segments) {
            buf.append(s).append(' ');
        }
        return buf.toString().trim();
    }

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("sighanCorporaDict", "data");
        props.setProperty("serDictionary", "data/dict-chris6.ser.gz");
        props.setProperty("inputEncoding", "UTF-8");
        props.setProperty("sighanPostProcessing", "true");

        CRFClassifier<CoreLabel> classifier = new CRFClassifier<>(props);
        classifier.loadClassifierNoExceptions("data/ctb.gz", props);
        classifier.flags.setProperties(props);

        String sentence = "某处女同志去吃饭。";
        System.out.println(doSegment(sentence, classifier));
    }
}
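
The data directory here refers to the Stanford Chinese Segmenter distribution, which ships the CTB model (data/ctb.gz, trained on the Penn Chinese Treebank) together with the dictionary data/dict-chris6.ser.gz. Compile against the segmenter jar and run with the -mx1g heap setting noted above, since the CRF model is large.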

3. POS Tagging

java -mx300m -classpath stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTagger \
    -model models/chinese-distsim.tagger -textFile inputFile > outputFile

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

class stanfordSeg {

    private stanfordSeg() {}

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("usage: java stanfordSeg modelFile fileToTag");
            return;
        }
        // Load the tagger model (e.g. models/chinese-distsim.tagger).
        MaxentTagger tagger = new MaxentTagger(args[0]);
        // Split the input file into sentences of tokens.
        List<List<HasWord>> sentences =
                MaxentTagger.tokenizeText(new BufferedReader(new FileReader(args[1])));
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            System.out.println(Sentence.listToString(tSentence, false));
        }
    }
}
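
Note that the chinese-distsim model tags words rather than characters, so Chinese input should already be segmented (for example with the segmenter from section 2) before tagging; the output pairs each word with its tag, typically in word#tag form.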

4. Named Entity Recognition

http://blog.csdn.net/sparkexpert/article/details/49497231

http://blog.csdn.net/shijiebei2009/article/details/42525091

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

/**
 * Loads the Chinese NER model and tags entities in pre-segmented text.
 *
 * @author wangxu wangx89@126.com
 * @version V1.0.0
 */
public class stanfordtest {

    private static AbstractSequenceClassifier<CoreLabel> ner;

    public stanfordtest() {
        InitNer();
    }

    public void InitNer() {
        String serializedClassifier = "classifiers/chinese.misc.distsim.crf.ser.gz";
        if (ner == null) {
            ner = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
        }
    }

    // Wrap recognized entities in inline XML tags, e.g. <PERSON>李强</PERSON>.
    public String doNer(String sent) {
        return ner.classifyWithInlineXML(sent);
    }

    public static void main(String[] args) {
        String str = "我 去 吃饭 , 告诉 李强 一声 。";
        stanfordtest extractDemo = new stanfordtest();
        System.out.println(extractDemo.doNer(str));
        System.out.println("Complete!");
    }
}
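
Inline XML is convenient for display, but per-token labels are easier to post-process. A minimal sketch, using the same model path as above (the class name here is illustrative): classify() returns one List<CoreLabel> per sentence, and each token carries its entity label under AnswerAnnotation ("O" for non-entities).

import java.util.List;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;

public class NerLabels {
    public static void main(String[] args) {
        AbstractSequenceClassifier<CoreLabel> ner =
                CRFClassifier.getClassifierNoExceptions(
                        "classifiers/chinese.misc.distsim.crf.ser.gz");
        // One inner list per sentence; one CoreLabel per token.
        for (List<CoreLabel> sentence : ner.classify("我 去 吃饭 , 告诉 李强 一声 。")) {
            for (CoreLabel token : sentence) {
                System.out.println(token.word() + "\t"
                        + token.get(CoreAnnotations.AnswerAnnotation.class));
            }
        }
    }
}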

5. Dependency Parsing

http://blog.sina.com.cn/s/blog_8af106960101abvu.html

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.*;

public class stanfordtest {

    private stanfordtest() {} // static methods only

    public static void main(String[] args) {
        String parserModel = "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz";
        LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);
        String sent2 = "他 和 我 在 学校 里 常 打 台球 .";
        demoAPI(lp, sent2);
    }

    public static void demoAPI(LexicalizedParser lp, String str) {
        // The input is already segmented, so PTBTokenizer effectively just
        // splits it into tokens here.
        TokenizerFactory<CoreLabel> tokenizerFactory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(str));
        List<CoreLabel> rawWords2 = tok.tokenize();
        Tree parse = lp.apply(rawWords2);

        TreebankLanguagePack tlp = lp.treebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        for (TypedDependency td : tdl) {
            System.out.println(td);
        }
        // A TreePrint object can also print trees and dependencies:
        // TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
        // tp.printTree(parse);
    }
}
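
Each TypedDependency can also be taken apart programmatically instead of relying on its toString(). A minimal sketch, assuming a CoreNLP version in which gov() and dep() return IndexedWord (the helper class below is illustrative):

import java.util.List;
import edu.stanford.nlp.trees.TypedDependency;

class DepTriples {
    // Print each dependency as relation(governor-index, dependent-index),
    // mirroring Stanford's plain-text dependency notation.
    static void print(List<TypedDependency> tdl) {
        for (TypedDependency td : tdl) {
            System.out.printf("%s(%s-%d, %s-%d)%n",
                    td.reln(),                        // grammatical relation, e.g. nsubj
                    td.gov().word(), td.gov().index(),
                    td.dep().word(), td.dep().index());
        }
    }
}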

6. Constituency Parsing

import java.io.StringReader;
import java.util.Collection;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.*;

class stanfordtest {

    /**
     * Demonstrates the easiest way to load a parser: call loadModel with the
     * path of a serialized grammar, which can be a file, a classpath resource,
     * or a URL. Here the Chinese PCFG grammar is loaded from the models jar,
     * which therefore needs to be on the classpath.
     *
     * Usage: {@code java stanfordtest [[model] textFile]}
     */
    public static void main(String[] args) {
        String parserModel = "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz";
        if (args.length > 0) {
            parserModel = args[0];
        }
        LexicalizedParser lp = LexicalizedParser.loadModel(parserModel);
        if (args.length == 0) {
            demoAPI(lp);
        } else {
            String textFile = (args.length > 1) ? args[1] : args[0];
            demoDP(lp, textFile);
        }
    }

    /**
     * demoDP turns a file into tokens and then parse trees. The trees are
     * printed by calling pennPrint on the Tree object; a PrintWriter can also
     * be passed to pennPrint to capture the output. This code works with any
     * supported language.
     */
    public static void demoDP(LexicalizedParser lp, String filename) {
        // Load, sentence-segment and tokenize a file using DocumentPreprocessor.
        TreebankLanguagePack tlp = lp.treebankLanguagePack();
        GrammaticalStructureFactory gsf = null;
        if (tlp.supportsGrammaticalStructures()) {
            gsf = tlp.grammaticalStructureFactory();
        }
        for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
            Tree parse = lp.apply(sentence);
            parse.pennPrint();
            System.out.println();
            if (gsf != null) {
                GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
                Collection tdl = gs.typedDependenciesCCprocessed();
                System.out.println(tdl);
                System.out.println();
            }
        }
    }

    /**
     * demoAPI shows other ways of calling the parser: with already tokenized
     * text, or with raw text tokenized as a single sentence. Output is handled
     * with a TreePrint object; its options determine what gets printed.
     */
    public static void demoAPI(LexicalizedParser lp) {
        // Parse a list of correctly tokenized (pre-segmented) words.
        String[] sent = { "他", "和", "我", "经常", "打", "台球", "." };
        List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
        Tree parse = lp.apply(rawWords);
        parse.pennPrint();
        System.out.println();

        // Tokenize raw text with an explicit tokenizer. Note: PTBTokenizer is
        // an English tokenizer; with the Chinese grammar loaded above, raw
        // Chinese text would first need word segmentation instead.
        String sent2 = "This is another sentence.";
        TokenizerFactory<CoreLabel> tokenizerFactory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        Tokenizer<CoreLabel> tok =
                tokenizerFactory.getTokenizer(new StringReader(sent2));
        List<CoreLabel> rawWords2 = tok.tokenize();
        parse = lp.apply(rawWords2);

        TreebankLanguagePack tlp = lp.treebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        System.out.println();

        // A TreePrint object can also print trees and dependencies.
        TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
        tp.printTree(parse);
    }

    private stanfordtest() {} // static methods only
}
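
To run either demo, both stanford-parser.jar and a models jar containing edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz need to be on the classpath; in recent Stanford Parser releases the grammars ship in a companion stanford-parser-x.x.x-models.jar, though exact jar names vary by release.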