后缀树的自底向上(bottom-up)遍历算法
来源:互联网 发布:绝密淘宝小类目 编辑:程序博客网 时间:2024/06/14 09:32
与后缀数组的自底向上遍历算法相比(参考博文“后缀数组的自底向上(bottom-up)遍历算法”),对后缀树的Bottom-Up遍历要直接一些。后缀树的Bottom-Up遍历过程是:对当前结点的孩子结点按照从左到右的词典序(lexicographic order)依次处理;如果孩子结点是内部结点,先对它递归调用遍历,然后访问该孩子结点本身。
children list计算:内部孩子结点对应的lcp-interval直接加入当前结点(currNode)的lcp-interval的children列表,而不像后缀数组那样,在给某个lcp-interval添加孩子lcp-interval时需要考虑两种情况。
lb和rb计算:计算后缀树currNode的lcp-interval的最小下标lb和最大下标rb,需要考虑它的第一个和最后一个孩子结点是叶子结点还是内部结点:第一个孩子结点如果是叶子结点,直接用当前的后缀序号计数器(代码中的maxk,每遇到一个叶子结点加1)更新currNode的lcp-interval最小下标lb;第一个孩子结点如果是内部结点,使用该孩子结点的lcp-interval的lb值更新currNode的lcp-interval最小下标lb。最后一个孩子结点做法类似,用来更新currNode的lcp-interval最大下标rb。
实现:
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

/**
 * Bottom-Up traversal of a suffix tree
 * (The suffix tree is built with ukk algorithm)
 *
 * <p>The traversal reports, for every internal node, an lcp-interval
 * {@code lcp-[lb..rb]}: {@code lcp} is the path length of the node and
 * {@code lb}/{@code rb} are the smallest and largest leaf ranks (leaves are
 * numbered from 0 in left-to-right order) found below the node.
 *
 * <p>Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)
 * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)
 *
 * @author ljs
 * 2011-07-24
 */
public class BottomUpTraverseSuffixTree {

    /**
     * A node of the suffix tree. The incoming edge label is the slice
     * {@code sb[start..end]} of the shared text buffer; the root uses
     * {@code start == end == -1}.
     */
    private static class SuffixNode {
        private final StringBuilder sb;
        // Children are kept sorted by the first character of their edge label.
        private final List<SuffixNode> children = new ArrayList<SuffixNode>();
        private SuffixNode link;
        private int start;
        private int end;
        // Number of characters on the path from the root down to this node.
        private int pathlen;

        public SuffixNode(StringBuilder sb, int start, int end, int pathlen) {
            this.sb = sb;
            this.start = start;
            this.end = end;
            this.pathlen = pathlen;
        }

        /** Creates a root node (empty label, path length 0). */
        public SuffixNode(StringBuilder sb) {
            this.sb = sb;
            this.start = -1;
            this.end = -1;
            this.pathlen = 0;
        }

        /** Length of the incoming edge label; 0 for the root. */
        public int getLength() {
            if (start == -1) {
                return 0;
            }
            return end - start + 1;
        }

        /** The incoming edge label; the empty string for the root. */
        public String getString() {
            if (start != -1) {
                return this.sb.substring(start, end + 1);
            }
            return "";
        }

        public boolean isRoot() {
            return start == -1;
        }

        public String getCoordinate() {
            return "[" + start + ".." + end + "/" + this.pathlen + "]";
        }

        @Override
        public String toString() {
            return getString()
                    + "(" + getCoordinate()
                    + ",link:" + ((this.link == null) ? "N/A" : this.link.getCoordinate())
                    + ",children:" + children.size() + ")";
        }
    }

    /** In/out scan state shared between buildSuffixTree, slowscan and fastscan. */
    private static class State {
        private SuffixNode u; // parent(v)
        private SuffixNode v;
    }

    private SuffixNode root;
    private final StringBuilder sb = new StringBuilder();

    /**
     * Builds the suffix tree for {@code text}, replacing any tree previously
     * built by this instance. An empty text is ignored and leaves the current
     * state untouched.
     *
     * <p>The text is expected to end with a character that occurs nowhere else
     * in it (see the samples in {@code main}).
     *
     * @param text the text to index
     * @throws Exception declared for source compatibility with existing
     *         callers; this method contains no explicit {@code throw}
     */
    public void buildSuffixTree(String text) throws Exception {
        int m = text.length();
        if (m == 0) {
            return;
        }

        // Start from a clean state: every node stores absolute indexes into sb,
        // so appending a second text to a non-empty buffer would corrupt the tree.
        sb.setLength(0);
        root = new SuffixNode(sb);
        root.link = root; // link to itself

        List<SuffixNode> leaves = new ArrayList<SuffixNode>();

        // add first node
        sb.append(text.charAt(0));
        SuffixNode node = new SuffixNode(sb, 0, 0, 1);
        leaves.add(node);
        root.children.add(node);

        int j_star = 0; // j_{i-1}
        SuffixNode u = root;
        SuffixNode v = root;
        for (int i = 1; i <= m - 1; i++) {
            // do phase i
            sb.append(text.charAt(i));

            // step 1: do implicit extensions
            for (SuffixNode leafnode : leaves) {
                leafnode.end++;
                leafnode.pathlen++;
            }

            // step 2: do explicit extensions until rule #3 is applied
            State state = new State();

            // for the first explicit extension, we reuse the last phase's u and do slowscan
            // also note: suffix link doesn't span two phases.
            int j = j_star + 1;
            SuffixNode s = u;
            int k = s.pathlen + j;
            state.u = s;
            state.v = s;
            SuffixNode newleaf = slowscan(state, s, k);
            if (newleaf == null) {
                // if rule #3 is applied, then we can terminate this phase
                j_star = j - 1;
                // Note: no need to update state.v because it is not going to be used
                // at the next phase
                u = state.u;
                continue;
            } else {
                j_star = j;
                leaves.add(newleaf);
                u = state.u;
                v = state.v;
            }
            j++;

            // for other explicit extensions, we start with fast scan.
            for (; j <= i; j++) {
                s = u.link;
                int uvLen = v.pathlen - u.pathlen;
                if (u.isRoot() && !v.isRoot()) {
                    uvLen--;
                }
                // starting with index k of the text
                k = s.pathlen + j;
                // init state
                state.u = s;
                state.v = s; // if uvLen = 0
                // execute fast scan
                newleaf = fastscan(state, s, uvLen, k);
                // establish the suffix link with v
                v.link = state.v;
                if (newleaf == null) {
                    // if rule #3 is applied, then we can terminate this phase
                    j_star = j - 1;
                    u = state.u;
                    break;
                } else {
                    j_star = j;
                    leaves.add(newleaf);
                    u = state.u;
                    v = state.v;
                }
            }
        }
    }

    /**
     * Slow scan from {@code currNode} until state.v is found, comparing the key
     * {@code sb[k..]} character by character against the child edge labels.
     *
     * @return the new leaf if a new one is created right after v;
     *         {@code null} otherwise (i.e. when rule #3 is applied)
     */
    private SuffixNode slowscan(State state, SuffixNode currNode, int k) {
        SuffixNode newleaf = null;
        boolean done = false;
        int keyLen = sb.length() - k;
        for (int i = 0; i < currNode.children.size(); i++) {
            SuffixNode child = currNode.children.get(i);

            // use min(child.key.length, key.length)
            int childKeyLen = child.getLength();
            int len = childKeyLen < keyLen ? childKeyLen : keyLen;
            int delta = 0;
            for (; delta < len; delta++) {
                if (sb.charAt(k + delta) != sb.charAt(child.start + delta)) {
                    break;
                }
            }

            if (delta == 0) {
                // this child doesn't match any character with the new key
                // order keys by lexi-order
                if (sb.charAt(k) < sb.charAt(child.start)) {
                    // e.g. children "e","f" and new key "c": insert "c" in front of "e"
                    int pathlen = sb.length() - k + currNode.pathlen;
                    SuffixNode node = new SuffixNode(sb, k, sb.length() - 1, pathlen);
                    currNode.children.add(i, node);
                    // currNode is already registered as state.u by the caller
                    state.v = currNode;
                    newleaf = node;
                    done = true;
                    break;
                } else {
                    // key.charAt(0) > child.key.charAt(0)
                    // the largest new key is appended after iterating all children
                    continue;
                }
            } else {
                // current child's key partially matches with the new key
                if (delta == len) {
                    if (keyLen == childKeyLen) {
                        // e.g. child="ab", new key "ab":
                        // terminate this phase (implicit tree with rule #3)
                        state.u = child;
                        state.v = currNode;
                    } else if (keyLen > childKeyLen) {
                        // TODO: still need an example to test this condition
                        // e.g. child="ab", new key "abc": recurse below the child
                        state.u = child;
                        state.v = child;
                        k += childKeyLen;
                        newleaf = slowscan(state, child, k);
                    } else {
                        // keyLen < childKeyLen
                        // e.g. child="abc", new key "ab":
                        // terminate this phase (implicit tree with rule #3)
                        state.v = currNode;
                    }
                } else {
                    // 0 < delta < len
                    // e.g. child="abc", new key "abd": split the edge into "ab" with
                    // children "c" (the old child, shortened) and "d" (the new leaf)

                    // insert the new node: ab
                    int nodepathlen = child.pathlen - (child.getLength() - delta);
                    SuffixNode node = new SuffixNode(
                            sb, child.start, child.start + delta - 1, nodepathlen);

                    int leafpathlen = (sb.length() - (k + delta)) + nodepathlen;
                    SuffixNode leaf = new SuffixNode(sb, k + delta, sb.length() - 1, leafpathlen);

                    // update child node: c
                    child.start += delta;
                    if (sb.charAt(k + delta) < sb.charAt(child.start)) {
                        node.children.add(leaf);
                        node.children.add(child);
                    } else {
                        node.children.add(child);
                        node.children.add(leaf);
                    }
                    // update parent
                    currNode.children.set(i, node);

                    // currNode is already registered as state.u by the caller
                    state.v = node;
                    newleaf = leaf;
                }
                done = true;
                break;
            }
        }
        if (!done) {
            // the new key is larger than every child: append it as the last child
            int pathlen = sb.length() - k + currNode.pathlen;
            SuffixNode node = new SuffixNode(sb, k, sb.length() - 1, pathlen);
            currNode.children.add(node);
            // currNode is already registered as state.u by the caller
            state.v = currNode;
            newleaf = node;
        }
        return newleaf;
    }

    /**
     * Fast scan until state.v is found: skips {@code uvLen} characters of the
     * key {@code sb[k..]} by comparing only the first character of each edge,
     * then hands over to {@link #slowscan}.
     *
     * @return the new leaf if a new one is created right after v;
     *         {@code null} otherwise (i.e. when rule #3 is applied)
     */
    private SuffixNode fastscan(State state, SuffixNode currNode, int uvLen, int k) {
        if (uvLen == 0) {
            // currNode is already registered as state.u by the caller
            // continue with slow scan
            return slowscan(state, currNode, k);
        }

        SuffixNode newleaf = null;
        boolean done = false;
        for (int i = 0; i < currNode.children.size(); i++) {
            SuffixNode child = currNode.children.get(i);

            if (sb.charAt(child.start) == sb.charAt(k)) {
                int len = child.getLength();
                if (uvLen == len) {
                    // then we find v
                    state.u = child;
                    k += len;
                    // continue with slow scan
                    newleaf = slowscan(state, child, k);
                } else if (uvLen < len) {
                    // we know v must be an internal node; branching and cut child short
                    // e.g. child="abc", uvLen = 2, suffix part "abd": split the edge
                    // into "ab" with children "c" (old child) and "d" (new leaf)

                    // insert the new node: ab; child is now c
                    int nodepathlen = child.pathlen - (child.getLength() - uvLen);
                    SuffixNode node = new SuffixNode(
                            sb, child.start, child.start + uvLen - 1, nodepathlen);

                    int leafpathlen = (sb.length() - (k + uvLen)) + nodepathlen;
                    SuffixNode leaf = new SuffixNode(sb, k + uvLen, sb.length() - 1, leafpathlen);

                    // update child node: c
                    child.start += uvLen;
                    if (sb.charAt(k + uvLen) < sb.charAt(child.start)) {
                        node.children.add(leaf);
                        node.children.add(child);
                    } else {
                        node.children.add(child);
                        node.children.add(leaf);
                    }
                    // update parent
                    currNode.children.set(i, node);

                    // currNode is already registered as state.u by the caller
                    state.v = node;
                    newleaf = leaf;
                } else {
                    // uvLen > len
                    // e.g. child="abc", uvLen = 4, suffix part "abcde":
                    // jump to next node
                    uvLen -= len;
                    state.u = child;
                    k += len;
                    newleaf = fastscan(state, child, uvLen, k);
                }
                done = true;
                break;
            }
        }
        if (!done) {
            // TODO: still need an example to test this condition
            // add a leaf under the currNode
            int pathlen = sb.length() - k + currNode.pathlen;
            SuffixNode node = new SuffixNode(sb, k, sb.length() - 1, pathlen);
            currNode.children.add(node);
            // currNode is already registered as state.u by the caller
            state.v = currNode;
            newleaf = node;
        }
        return newleaf;
    }

    /** The lcp-interval computed for one internal node of the suffix tree. */
    class LCPInterval {
        int lcp; // the lcp value of the lcp-interval
        int lb;  // the left boundary suffix index
        int rb;  // the right boundary suffix index
        List<LCPInterval> children = new ArrayList<LCPInterval>();

        public LCPInterval(int lcp, int lb, int rb) {
            this.lcp = lcp;
            this.lb = lb;
            this.rb = rb;
        }

        @Override
        public String toString() {
            return String.format("%d-[%d..%d]", this.lcp, this.lb, this.rb);
        }
    }

    /**
     * Traverses the whole tree bottom-up and prints one line per internal
     * node (children before their parent, the root last). Prints nothing when
     * no tree has been built yet. The leaf counter is restarted on every call,
     * so repeated traversals of the same tree print the same intervals.
     */
    public void bottomUpTraverse() {
        if (root == null) {
            return;
        }
        maxk = 0; // leaf ranks are numbered from 0 for each full traversal
        LCPInterval interval = bottomUpTraverse(root);
        // visit root
        visit(root, interval);
    }

    private int maxk = 0; // the max k of suffix array index until now

    /**
     * Recursively traverses the subtree below {@code currNode}, visiting every
     * internal descendant after its own descendants, and returns the
     * lcp-interval of {@code currNode} itself (which is not visited here).
     * Leaf ranks continue from the current value of the instance counter
     * {@code maxk}; the no-argument overload resets it before starting.
     *
     * @param currNode the node whose subtree is traversed
     * @return the lcp-interval of {@code currNode}
     */
    public LCPInterval bottomUpTraverse(SuffixNode currNode) {
        LCPInterval interval = new LCPInterval(currNode.pathlen, -1, -1);
        int last = currNode.children.size() - 1;
        for (int i = 0; i <= last; i++) {
            SuffixNode child = currNode.children.get(i);
            if (!child.children.isEmpty()) {
                // internal node: its own interval supplies the boundaries
                LCPInterval childInterval = bottomUpTraverse(child);
                visit(child, childInterval);

                if (i == 0) {
                    interval.lb = childInterval.lb;
                }
                if (i == last) {
                    interval.rb = childInterval.rb;
                }
                interval.children.add(childInterval);
            } else {
                // leaf: its rank is the number of leaves seen so far
                if (i == 0) {
                    interval.lb = maxk;
                }
                if (i == last) {
                    interval.rb = maxk;
                }
                maxk++;
            }
        }
        return interval;
    }

    /** Visits an internal node: prints its interval and its child intervals, if any. */
    private void visit(SuffixNode node, LCPInterval interval) {
        if (interval.children.size() > 0) {
            StringBuilder joined = new StringBuilder();
            for (LCPInterval child : interval.children) {
                joined.append(child.toString());
                joined.append(",");
            }
            joined.deleteCharAt(joined.length() - 1);
            System.out.format("%s, children={%s}%n", interval, joined.toString());
        } else {
            System.out.format("%s%n", interval);
        }
    }

    /** For test purpose only: dumps the tree, one node per line. */
    public void printTree() {
        System.out.format("The suffix tree for S = %s is: %n", this.sb);
        if (this.root == null) {
            return; // nothing has been built yet
        }
        this.print(0, this.root);
    }

    private void print(int level, SuffixNode node) {
        for (int i = 0; i < level; i++) {
            System.out.format(" ");
        }
        System.out.format("|");
        for (int i = 0; i < level; i++) {
            System.out.format("-");
        }
        System.out.format("%s(%d..%d/%d)%n", node.getString(), node.start, node.end, node.pathlen);
        for (SuffixNode child : node.children) {
            print(level + 1, child);
        }
    }

    public static void main(String[] args) throws Exception {
        // test suffix-tree
        System.out.println("****************************");
        String text = "mississippi#"; // the last char must be unique!
        BottomUpTraverseSuffixTree stree = new BottomUpTraverseSuffixTree();
        stree.buildSuffixTree(text);
        //stree.printTree();
        System.out.format("%nText: %s %n", text);
        stree.bottomUpTraverse();
        System.out.println();

        System.out.println("****************************");
        text = "GACCCACCACC#"; // the last char must be unique!
        stree = new BottomUpTraverseSuffixTree();
        stree.buildSuffixTree(text);
        //stree.printTree();
        System.out.format("Text: %s %n", text);
        stree.bottomUpTraverse();
        System.out.println();
    }
}
测试:
****************************
Text: mississippi#
4-[3..4]
1-[1..4], children={4-[3..4]}
1-[6..7]
2-[8..9]
3-[10..11]
1-[8..11], children={2-[8..9],3-[10..11]}
0-[0..11], children={1-[1..4],1-[6..7],1-[8..11]}
****************************
Text: GACCCACCACC#
3-[1..3]
4-[5..6]
5-[8..9]
2-[7..10], children={5-[8..9]}
1-[4..10], children={4-[5..6],2-[7..10]}
0-[0..11], children={3-[1..3],1-[4..10]}
- 后缀树的自底向上(bottom-up)遍历算法
- 后缀数组的自底向上(bottom-up)遍历算法
- 自底向上的归并排序算法
- 后缀数组的自顶向下(top-down)遍历算法
- 图示经典算法--自底向上的归并排序
- 算法之自底向上的归并排序
- 3-4 自底向上的归并排序算法
- 算法入门之归并排序(自底向上方法)
- 后缀树的自顶向下(top-down)遍历
- 最优二叉搜索树自底向上非递归的动态规划算法
- 《数据结构与算法分析》伸展树(自底向上)详解
- 自底向上的归并排序-转自算法C语言实现
- 动态规划之钢条切割问题自底向上发的实现(算法导论第15章)
- 自底向上的归并排序
- 自底向上的归并排序
- 自底向上的合并排序
- 自底向上弹出的window
- 自底向上的归并排序
- dropdownlist与gridview的关联
- ZOJ1886/POJ2540 Hotter Colder(半平面交)
- 倚天屠龙记笛子音乐我找到了!就是似水柔情!
- 关于android系列博客的更新
- dropdownlist默认为——请选择——的方法
- 后缀树的自底向上(bottom-up)遍历算法
- POJ--3394[Containers] 哈希判重
- android开发人员必须知道的事---苹果HTC专利战升级 败诉将影响整个Android市场,android将比iphone贵
- Debian的包管理命令
- ubuntu中安装jdk,eclipse
- stdafx.h
- C++ return exit(0) abort()关于回收
- 命令重定向操作符
- 窗体透明,窗体上控件不透明的设置方法