利用AC自动机进行关键字的提取和过滤

来源:互联网 发布:python编程(第四版) 编辑:程序博客网 时间:2024/05/21 11:12

昨天看了meituan.com的《AC算法在美团上单系统的应用》一文,深受启发,原来ACM算法在工程中也能有这样赤裸裸的运用~~~ 于是便复习了AC自动机,并把代码用Java重新搞了一遍~~



AC自动机整体的结构大概是长这样的,其实就是在trie树上做KMP:




AC自动机里面比较难理解的应该是它的失配指针(fail指针)的计算过程。
这个计算过程从本质上讲就是进行一遍广搜(BFS),与此同时维护
fail指针,每一步的维护过程可用下图表示。





Keyword.java

package com.AC.domain;import java.io.*;import java.util.*;import java.math.*;public class Keyword implements Serializable{/** *  */private Integer id;private Map<Integer, Integer> categoryTypeMap;private String word;private List<Integer> categories;private static final long serialVersionUID = 1L;public Keyword(){id = null;categories=null;categoryTypeMap=null;word=null;}public Keyword(String key){id = null;categories=null;categoryTypeMap=null;word=key;}public Keyword(Keyword p){this.categories=p.categories;this.categoryTypeMap=p.categoryTypeMap;this.id=p.id;this.word=p.word;}@Overridepublic boolean equals(Object o) {// TODO Auto-generated method stubif (this == o) return true;if(o==null||getClass()!=o.getClass()) return false;Keyword keyword = (Keyword) o;if(id!=null?!id.equals(keyword.id):keyword.id!=null)return false;return true;}@Overridepublic int hashCode() {// TODO Auto-generated method stubreturn id != null ?id.hashCode():0;}public Integer getId() {return id;}public void setId(Integer id) {this.id = id;}public Map<Integer, Integer> getCategoryTypeMap() {return categoryTypeMap;}public void setCategoryTypeMap(Map<Integer, Integer> categoryTypeMap) {this.categoryTypeMap = categoryTypeMap;}public String getWord() {return word;}public void setWord(String word) {this.word = word;}public List<Integer> getCategories() {return categories;}public void setCategories(List<Integer> categories) {this.categories = categories;}}

Node.java

package com.AC.domain;import java.util.ArrayList;import java.util.List;public class Node {public Integer state;public char  character = 0;  //鎸囧悜褰撳墠鑺傜偣鐨勫瓧绗�public Node failureNode;public List <Keyword> keywords;public List <Node> childrenList;public Node(){keywords=new ArrayList<Keyword>();childrenList = new ArrayList<Node>();state = 0;failureNode = null;character = 0;}public Node (char c,Node node) {keywords=new ArrayList<Keyword>();childrenList = new ArrayList<Node>();state =1;character =c ;failureNode = node;}public Boolean containsChild (char c){for(Node childNode : childrenList) {if(childNode.character==c) return true;}return false;}public Node getChild (char c){for (Node childNode : childrenList){if(childNode.character==c) return childNode;} return null;}public void addKeyword(Keyword keyword){keywords.add(keyword);}public void addKeywords(List<Keyword> k){keywords.addAll(k);}public void addChild(Node child){childrenList.add(child);}}

Patterns.java
package com.AC.domain;import java.util.*;import java.io.*;import java.math.*;public class Patterns {private final Node root = new Node();private List<Node> tree;public Patterns(List<Keyword> keywords){tree = new ArrayList<Node> ();root.failureNode=root;tree.add(root);for(Keyword keyword : keywords){addKeyword(keyword);}setFailNode();}private  void setFailNode() {// TODO Auto-generated method stubQueue<Node> queue = new LinkedList<Node>();Node node =root;for (Node d1 : node.childrenList){queue.offer(d1);}while (!queue.isEmpty()){node = queue.poll();if (node.childrenList!=null){for (Node curNode : node.childrenList) {queue.offer(curNode);Node failNode = node.failureNode;while(!failNode.containsChild(curNode.character)){failNode = failNode.failureNode;if(failNode==null||failNode.state==0) break;}if(failNode!=null&&failNode.containsChild(curNode.character)) {curNode.failureNode = failNode.getChild(curNode.character);curNode.addKeywords(curNode.failureNode.keywords);}}}}}private  void addKeyword(Keyword keyword) {// TODO Auto-generated method stubchar [] wordCharArr = keyword.getWord().toCharArray();Node current = root;for(char currentChar : wordCharArr){if(current.containsChild(currentChar)){current = current.getChild(currentChar);}else{Node node = new Node (currentChar,root);current.addChild(node);current=node;tree.add(node);}}current.addKeyword(keyword);}public List<Keyword> searchKeyword(String data,Integer category) {List<Keyword> matchResult = new ArrayList<Keyword>();Node node = root;char[] chs = data.toCharArray();for (int i=0;i<chs.length;i++){while(node!=null&&!node.containsChild(chs[i])){//if(node.state==0) break;node = node.failureNode;if(node==null||node.state==0) break;}if(node!=null&&node.containsChild(chs[i])) {node = node.getChild(chs[i]);if(node.keywords!=null){for(Keyword pattern : node.keywords){if(category == null){//System.out.println(pattern.getWord());matchResult.add(new 
Keyword(pattern.getWord()));}else{if(pattern.getCategories().contains(category)){matchResult.add(pattern);}}}}}}return matchResult;}}


Test.java
package com.AC.domain;import java.util.ArrayList;import java.util.HashSet;import java.util.List;import java.util.Set;public class Test {public static void main(String []args){//abcd abc abe ae bc be bce cm kcabcmghList<Keyword> keywords = new ArrayList<Keyword>();List<Keyword> result = new ArrayList<Keyword> ();/*List<Keyword> re= new ArrayList<Keyword> ();re.clear();Keyword a= new Keyword("abcd");re.add(a);Keyword b= new Keyword("abc");re.add(b);System.out.println(re.size());*/Keyword a1= new Keyword();a1.setWord("abcd");keywords.add(a1);Keyword a2= new Keyword();a2.setWord("abc");keywords.add(a2);Keyword a3= new Keyword();a3.setWord("abe");keywords.add(a3);Keyword a5= new Keyword();a5.setWord("ae");keywords.add(a5);Keyword a6= new Keyword();a6.setWord("bc");keywords.add(a6);Keyword a7= new Keyword();a7.setWord("be");keywords.add(a7);Keyword a8= new Keyword();a8.setWord("bce");keywords.add(a8);Keyword a9= new Keyword();a9.setWord("cm");keywords.add(a9);Patterns patterns=new Patterns(keywords);result=patterns.searchKeyword("kcabcmgha", null);//System.out.println(result.size());System.out.println("keys: ");for(Keyword key:result){System.out.println(key.getWord());}//System.out.println(result);}}

附美团文章链接:http://tech.meituan.com/ac.html


0 0