后缀树的自顶向下(top-down)遍历

来源：互联网发布：重新加载js不刷新页面编辑：程序博客网时间：2024/05/19 04:53

与后缀数组的top-down遍历相比，后缀树的自顶向下遍历相对直接一些。下面的实现中首先确定每一个内部结点的左右后缀边界下标（prepare方法），然后先序遍历所有内部结点。

实现：

import java.util.ArrayList;import java.util.LinkedList;import java.util.List;  /** *  * Top-Down traversal of a suffix tree * (The suffix tree is built with ukk algorithm) *  *   * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/) * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)  *  * @author ljs * 2011-07-25 * */public class TopDownTraverseSuffixTree {private class SuffixNode {private StringBuilder sb;    private List<SuffixNode> children = new LinkedList<SuffixNode>();        private SuffixNode link;    private int start;    private int end;    private int pathlen;        private int lb;    private int rb;            public SuffixNode(StringBuilder sb,int start,int end,int pathlen){    this.sb = sb;    this.start = start;    this.end = end;    this.pathlen = pathlen;    }    public SuffixNode(StringBuilder sb){        this.sb = sb;    this.start = -1;    this.end = -1;        this.pathlen = 0;    }    public int getLength(){    if(start == -1) return 0;    else return end - start + 1;    }    public String getString(){    if(start != -1){    return this.sb.substring(start,end+1);    }else{    return "";    }    }    public boolean isRoot(){    return start == -1;    }    public String getCoordinate(){    return "[" + start+".." + end + "/" + this.pathlen + "]";    }    public String toString(){        return getString() + "(" + getCoordinate()     + ",link:" + ((this.link==null)?"N/A":this.link.getCoordinate())     + ",children:" + children.size() +")";    }   }private class State{private SuffixNode u; //parent(v)//private SuffixNode w;  private SuffixNode v;  //private int k; //the global index of text starting from 0 to text.length()//private boolean finished;  }private SuffixNode root;private StringBuilder sb = new StringBuilder(); //build a suffix-tree for a string of textpublic void  buildSuffixTree(String text) throws Exception{int m = text.length();if(m==0)return;if(root==null){root = new SuffixNode(sb);root.link = root; //link to itself}List<SuffixNode> leaves =  new ArrayList<SuffixNode>();//add first nodesb.append(text.charAt(0));SuffixNode node = new SuffixNode(sb,0,0,1);leaves.add(node);root.children.add(node);int j_star = 0; //j_{i-1}SuffixNode u = root;SuffixNode v = root;for(int i=1;i<=m-1;i++){//do phase isb.append(text.charAt(i));//step 1: do implicit extensions for(SuffixNode leafnode:leaves){leafnode.end++;leafnode.pathlen++;}//step 2: do explicit extensions until rule #3 is appliedState state = new State();//for the first explicit extension, we reuse the last phase's u and do slowscan//also note: suffix link doesn't span two phases.int j=j_star+1;SuffixNode s = u; int k = s.pathlen + j;state.u = s;state.v = s;  SuffixNode newleaf = slowscan(state,s,k);if(newleaf == null){//if rule #3 is applied, then we can terminate this phasej_star = j - 1;//Note: no need to update state.v because it is not going to be used//at the next phaseu = state.u;continue;}else{j_star = j;leaves.add(newleaf);u = state.u;v = state.v;}j++;//for other explicit extensions, we start with fast scan.for(;j<=i;j++){s = u.link;int uvLen=v.pathlen - u.pathlen;  if(u.isRoot() && !v.isRoot()){uvLen--;}//starting with index k of the textk = s.pathlen + j;//init statestate.u = s;state.v = s; //if uvLen = 0 //execute fast scannewleaf = fastscan(state,s,uvLen,k);//establish the suffix link with vv.link = state.v;if(newleaf == null){//if rule #3 is applied, then we can terminate this phasej_star = j - 1;u = state.u;break;}else{j_star = j;leaves.add(newleaf);u = state.u;v = state.v;}}}}//slow scan from currNode until state.v is found//return the new leaf if a new one is created right after v;//return null otherwise (i.e. when rule #3 is applied)private SuffixNode slowscan(State state,SuffixNode currNode,int k){SuffixNode newleaf = null;boolean done = false;int keyLen = sb.length() - k;for(int i=0;i<currNode.children.size();i++){SuffixNode child = currNode.children.get(i);//use min(child.key.length, key.length)int childKeyLen = child.getLength();int len = childKeyLen<keyLen?childKeyLen:keyLen;int delta = 0;for(;delta<len;delta++){if(sb.charAt(k+delta) != sb.charAt(child.start+delta)){break;}}if(delta==0){//this child doesn't matchany character with the new key//order keys by lexi-orderif(sb.charAt(k) < sb.charAt(child.start)){//e.g. child="e" (currNode="abc")//   abc                     abc//    /  \    =========>      / | \//   e    f   insert "c"     c  e  fint pathlen = sb.length() - k + currNode.pathlen;SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);currNode.children.add(i,node);//state.u = currNode; //currNode is already registered as state.u, so commented outstate.v = currNode;newleaf = node;done = true;break;}else{ //key.charAt(0)>child.key.charAt(0)//don't forget to add the largest new key after iterating all childrencontinue;}}else{//current child's key partially matches with the new keyif(delta==len){if(keyLen==childKeyLen){//e.g. child="ab"//   ab                    ab//    /  \    =========>    /  \//   e    f   insert "ab"  e    f//terminate this phase  (implicit tree with rule #3)state.u = child;state.v = currNode;}else if(keyLen>childKeyLen){ //TODO: still need an example to test this condition//e.g. child="ab"//   ab                      ab//    /  \    ==========>     / | \ //   e    f   insert "abc"   c e  f//recursionstate.u = child;state.v = child;k += childKeyLen;//state.k = k;newleaf = slowscan(state,child,k);}else{ //keyLen<childKeyLen//e.g. child="abc"//   abc                      abc//    /   \      =========>     /  \ //   e     f     insert "ab"   e   f   //          //terminate this phase  (implicit tree with rule #3)//state.u = currNode;state.v = currNode;}}else{//0<delta<len //e.g. child="abc"//   abc                     ab//    /  \     ==========>     / \//   e    f   insert "abd"    c   d //                           /  \//                          e    f//insert the new node: ab int nodepathlen = child.pathlen - (child.getLength()-delta);SuffixNode node = new SuffixNode(sb,child.start,child.start + delta - 1,nodepathlen); node.children = new LinkedList<SuffixNode>();int leafpathlen = (sb.length() - (k + delta)) + nodepathlen;SuffixNode leaf = new SuffixNode(sb,k+delta,sb.length()-1,leafpathlen);//update child node: cchild.start += delta;if(sb.charAt(k+delta)<sb.charAt(child.start)){node.children.add(leaf);node.children.add(child);}else{node.children.add(child);node.children.add(leaf);}//update parentcurrNode.children.set(i, node);//state.u = currNode; //currNode is already registered as state.u, so commented outstate.v = node;newleaf = leaf;}done = true;break;}}if(!done){int pathlen = sb.length() - k + currNode.pathlen;SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);currNode.children.add(node);//state.u = currNode; //currNode is already registered as state.u, so commented outstate.v = currNode;newleaf = node;}return newleaf;}//fast scan until state.v is found;//return the new leaf if a new one is created right after v;//return null otherwise (i.e. when rule #3 is applied)private SuffixNode fastscan(State state,SuffixNode currNode,int uvLen,int k){if(uvLen==0){//state.u = currNode; //currNode is already registered as state.u, so commented out//continue with slow scanreturn slowscan(state,currNode,k);}SuffixNode newleaf = null;boolean done  = false;for(int i=0;i<currNode.children.size();i++){SuffixNode child = currNode.children.get(i);if(sb.charAt(child.start) == sb.charAt(k)){int len = child.getLength();if(uvLen==len){//then we find v//uvLen = 0;state.u = child;//state.v = child;k += len;//state.k = k;//continue with slow scannewleaf = slowscan(state,child,k);}else if(uvLen<len){//we know v must be an internal node; branchingand cut child short//e.g. child="abc",uvLen = 2//   abc                          ab//    /  \    ================>     / \//   e    f   suffix part: "abd"   c   d //                                /  \//                               e    f//insert the new node: ab; child is now c int nodepathlen = child.pathlen - (child.getLength()-uvLen);SuffixNode node = new SuffixNode(sb,child.start,child.start + uvLen - 1,nodepathlen); node.children = new LinkedList<SuffixNode>();int leafpathlen = (sb.length() - (k + uvLen)) + nodepathlen;SuffixNode leaf = new SuffixNode(sb,k+uvLen,sb.length()-1,leafpathlen);//update child node: cchild.start += uvLen;if(sb.charAt(k+uvLen)<sb.charAt(child.start)){node.children.add(leaf);node.children.add(child);}else{node.children.add(child);node.children.add(leaf);}//update parentcurrNode.children.set(i, node);//uvLen = 0;//state.u = currNode; //currNode is already registered as state.u, so commented outstate.v = node;newleaf = leaf;}else{//uvLen>len//e.g. child="abc", uvLen = 4//   abc                          //    /  \    ================>      //   e    f   suffix part: "abcde"   //                                //                  //jump to next nodeuvLen -= len;state.u = child;//state.v = child;k += len;//state.k = k;newleaf = fastscan(state,child,uvLen,k);}done = true;break;}}if(!done){//TODO: still need an example to test this condition//add a leaf under the currNodeint pathlen = sb.length() - k + currNode.pathlen;SuffixNode node = new SuffixNode(sb,k,sb.length()-1,pathlen);currNode.children.add(node);//state.u = currNode; //currNode is already registered as state.u, so commented outstate.v = currNode;newleaf = node;}return newleaf;}  private int maxk=0; //the suffix array index private void prepare(SuffixNode currNode){for(int i=0;i<currNode.children.size();i++){SuffixNode child = currNode.children.get(i);prepare(child);} if(!currNode.children.isEmpty()){currNode.lb = currNode.children.get(0).lb;currNode.rb = currNode.children.get(currNode.children.size()-1).rb;}else{currNode.lb = currNode.rb = maxk;maxk++;}}public void topDownTraverse(){//prepare lb and rb for each internal nodeprepare(root); topDownTraverse(root);}public void topDownTraverse(SuffixNode currNode){if(!currNode.children.isEmpty())visit(currNode);for(int i=0;i<currNode.children.size();i++){SuffixNode child = currNode.children.get(i);if(!child.children.isEmpty()){topDownTraverse(child);}} }//visit internal nodeprivate void visit(SuffixNode node){String interval = String.format("%d-[%d..%d]", node.pathlen,node.lb,node.rb);if(node.children.size()>0){StringBuilder sb = new StringBuilder();int internalNodes = 0;for(SuffixNode child:node.children){if(!child.children.isEmpty()){internalNodes++;String childInterval = String.format("%d-[%d..%d]", child.pathlen,child.lb,child.rb);sb.append(childInterval);sb.append(",");}}if(internalNodes>0){sb.deleteCharAt(sb.length()-1);System.out.format("%s, children={%s}%n", interval,sb.toString());}else{System.out.format("%s%n", interval);}}else{System.out.format("%s%n", interval);}}//for test purpose onlypublic void printTree(){System.out.format("The suffix tree for S = %s is: %n",this.sb);this.print(0, this.root);}private void print(int level, SuffixNode node){for (int i = 0; i < level; i++) {            System.out.format(" ");        }System.out.format("|");        for (int i = 0; i < level; i++) {        System.out.format("-");        }        System.out.format("%s(%d..%d/%d)%n", node.getString(),node.start,node.end,node.pathlen);        //System.out.format("(%d,%d)%n", node.start,node.end);        for (SuffixNode child : node.children) {        print(level + 1, child);        }}public static void main(String[] args) throws Exception {//test suffix-treeSystem.out.println("****************************");String text = "mississippi#"; //the last char must be unique!TopDownTraverseSuffixTree stree = new TopDownTraverseSuffixTree();stree.buildSuffixTree(text);//stree.printTree(); System.out.format("%nInternal Nodes for text: %s %n",text);stree.topDownTraverse();System.out.println(); System.out.println("****************************");text = "GACCCACCACC#"; //the last char must be unique!stree = new TopDownTraverseSuffixTree();stree.buildSuffixTree(text);//stree.printTree(); System.out.format("Internal Nodes for text: %s %n",text);stree.topDownTraverse();System.out.println(); System.out.println("****************************");text = "abcdefghijklmmnopqrstuvwxyz#"; //the last char must be unique!stree = new TopDownTraverseSuffixTree();stree.buildSuffixTree(text);//stree.printTree(); System.out.format("Internal Nodes for text: %s %n",text);stree.topDownTraverse();System.out.println("****************************");text = "yabbadabbado#"; //the last char must be unique!stree = new TopDownTraverseSuffixTree();stree.buildSuffixTree(text);//stree.printTree(); System.out.format("Internal Nodes for text: %s %n",text);stree.topDownTraverse();System.out.println("****************************");text = "AAAAAAAAAAAAAAAAAAAAAAAAAA#"; //the last char must be unique!stree = new TopDownTraverseSuffixTree();stree.buildSuffixTree(text);//stree.printTree(); System.out.format("Internal Nodes for text: %s %n",text);stree.topDownTraverse();System.out.println("****************************");text = "GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA#"; //the last char must be unique!stree = new TopDownTraverseSuffixTree();stree.buildSuffixTree(text);//stree.printTree(); System.out.format("Internal Nodes for text: %s %n",text);stree.topDownTraverse();}}

测试：

****************************

Internal Nodes for text: mississippi#
0-[0..11], children={1-[1..4],1-[6..7],1-[8..11]}
1-[1..4], children={4-[3..4]}
4-[3..4]
1-[6..7]
1-[8..11], children={2-[8..9],3-[10..11]}
2-[8..9]
3-[10..11]

****************************
Internal Nodes for text: GACCCACCACC#
0-[0..11], children={3-[1..3],1-[4..10]}
3-[1..3]
1-[4..10], children={4-[5..6],2-[7..10]}
4-[5..6]
2-[7..10], children={5-[8..9]}
5-[8..9]

****************************
Internal Nodes for text: abcdefghijklmmnopqrstuvwxyz#
0-[0..27], children={1-[13..14]}
1-[13..14]
****************************
Internal Nodes for text: yabbadabbado#
0-[0..12], children={1-[1..4],1-[5..8],1-[9..10]}
1-[1..4], children={5-[1..2],2-[3..4]}
5-[1..2]
2-[3..4]
1-[5..8], children={3-[5..6],4-[7..8]}
3-[5..6]
4-[7..8]
1-[9..10]
****************************
Internal Nodes for text: AAAAAAAAAAAAAAAAAAAAAAAAAA#
0-[0..26], children={1-[1..26]}
1-[1..26], children={2-[2..26]}
2-[2..26], children={3-[3..26]}
3-[3..26], children={4-[4..26]}
4-[4..26], children={5-[5..26]}
5-[5..26], children={6-[6..26]}
6-[6..26], children={7-[7..26]}
7-[7..26], children={8-[8..26]}
8-[8..26], children={9-[9..26]}
9-[9..26], children={10-[10..26]}
10-[10..26], children={11-[11..26]}
11-[11..26], children={12-[12..26]}
12-[12..26], children={13-[13..26]}
13-[13..26], children={14-[14..26]}
14-[14..26], children={15-[15..26]}
15-[15..26], children={16-[16..26]}
16-[16..26], children={17-[17..26]}
17-[17..26], children={18-[18..26]}
18-[18..26], children={19-[19..26]}
19-[19..26], children={20-[20..26]}
20-[20..26], children={21-[21..26]}
21-[21..26], children={22-[22..26]}
22-[22..26], children={23-[23..26]}
23-[23..26], children={24-[24..26]}
24-[24..26], children={25-[25..26]}
25-[25..26]
****************************
Internal Nodes for text: GGGGGGGGGGGGCGCAAAAGCGAGCAGAGAGAAAAAAAAAAAAAAAAAAAAAA#
0-[0..53], children={1-[1..30],1-[31..34],1-[35..53]}
1-[1..30], children={2-[2..25],2-[26..30]}
2-[2..25], children={3-[3..24]}
3-[3..24], children={4-[4..23]}
4-[4..23], children={5-[5..22]}
5-[5..22], children={6-[6..22]}
6-[6..22], children={7-[7..22]}
7-[7..22], children={8-[8..22]}
8-[8..22], children={9-[9..22]}
9-[9..22], children={10-[10..22]}
10-[10..22], children={11-[11..22]}
11-[11..22], children={12-[12..22]}
12-[12..22], children={13-[13..22]}
13-[13..22], children={14-[14..22]}
14-[14..22], children={15-[15..22]}
15-[15..22], children={16-[16..22]}
16-[16..22], children={17-[17..22]}
17-[17..22], children={18-[18..22]}
18-[18..22], children={19-[19..22]}
19-[19..22], children={20-[20..22]}
20-[20..22], children={21-[21..22]}
21-[21..22]
2-[26..30], children={3-[26..28],3-[29..30]}
3-[26..28], children={5-[27..28]}
5-[27..28]
3-[29..30]
1-[31..34], children={2-[31..32],2-[33..34]}
2-[31..32]
2-[33..34]
1-[35..53], children={2-[35..38],2-[39..42],2-[43..53]}
2-[35..38], children={3-[36..38]}
3-[36..38], children={4-[36..37]}
4-[36..37]
2-[39..42], children={3-[39..40],3-[41..42]}
3-[39..40]
3-[41..42]
2-[43..53], children={3-[44..53]}
3-[44..53], children={4-[45..53]}
4-[45..53], children={5-[46..53]}
5-[46..53], children={6-[47..53]}
6-[47..53], children={7-[48..53]}
7-[48..53], children={8-[49..53]}
8-[49..53], children={9-[50..53]}
9-[50..53], children={10-[51..53]}
10-[51..53], children={11-[52..53]}
11-[52..53]