利用后缀树求多个字符串的最长公共连续子串(Longest Common Substring)

来源:互联网 发布:php transaction 编辑:程序博客网 时间:2024/05/01 09:17

问题:

有N个字符串,计算它们公共的最长连续子串。例如,字符串集合{abcd,bcca,aabc},它们公共的最长连续子串是bc。又如{abcfd,bcabca,aabcf}的公共的最长连续子串是abc。


针对本问题,可以用特殊的字符(该字符必须是N个串中没有出现的字符)将N个字符串连起来形成一个新串,然后构建这个新串的后缀树。比如对字符串集合 {abcd,bcca,aabc},可以连成新串abcd$bcca@aabc%,其中子串之间的分隔符为 '$','@','%'。满足条件的最长子串就是最深的分叉结点,而且该分叉结点下面的后缀的suffix(注意是后缀的后缀!)包含了这三个分隔字符:即至少有一个后缀的suffix为xxx$.....,至少有一个后缀的suffix为yyy@....(这里字符串yyy中没有字符$),至少有一个后缀的suffix为zzz%...(其中字符串zzz中没有$和@字符)。以下代码中,判断分隔字符是否在子树中的函数为 containTerminators()。


利用这种方式生成的后缀树有一个特点:带终结字符的结点都是叶子结点,不可能为内部结点,这一点跟前缀树和不带分隔符的单个子串后缀树是不同的(类似于Ukkonen构造法中末尾添加一个特殊字符将隐式后缀树自动转换成真正的后缀树)。利用这个特点,只需要考虑非叶子结点的情况,因为满足条件的分叉结点不会是叶子结点。

在下面的实现中,首先采用蛮力法insert()构造后缀树;然后调用findLCS()查找最长公共子串。只需要考虑非叶子结点(child.terminal = false)。findLCS函数返回的是currNode的子树中满足条件的LCS。


实现:

import java.util.LinkedList;import java.util.List; /** *  * Find LCS(Longest Common Substring) of N strings (N>=2) *   * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/) * Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php)  *  * @author ljs * 2011-06-29 * */public class LCS {private class SuffixNode {private String key;    private List<SuffixNode> children = new LinkedList<SuffixNode>();        //use "#" for terminal char    private boolean terminal;             public SuffixNode(){        this.key = "";    }    public SuffixNode(String key){    this.key = key;        }        public String toString(){        return this.key + (this.terminal?"#":"") + "(" + children.size() +")";    }   }private SuffixNode root;private String text;//terminators should be ordered according to input stringsprivate char[] terminators;public LCS(String text,char[] terminators){this.text = text;this.terminators = terminators;}private void insert(SuffixNode currNode,String key,int startIndex) throws Exception{boolean done = false;for(int i=0;i<currNode.children.size();i++){SuffixNode child = currNode.children.get(i);//use min(child.key.length, key.length)int len = child.key.length()<key.length()?child.key.length():key.length();int j = 0;for(;j<len;j++){if(key.charAt(j) != child.key.charAt(j)){break;}}if(j==0){//this child doesn't matchany character with the new key//order keys by lexi-orderif(key.charAt(0)<child.key.charAt(0)){//e.g. 
child="e" (currNode="abc")//   abc                     abc//    /  /    =========>      / | ///   e    f   insert "c"     c# e  fSuffixNode node = new SuffixNode(key);currNode.children.add(i,node);node.terminal = true;done = true;break;}else{ //key.charAt(0)>child.key.charAt(0)//don't forget to add the largest new key after iterating all childrencontinue;}}else{//current child's key partially matches with the new key; 0<j<=lenif(j==len){if(key.length()==child.key.length()){if(child.terminal){throw new Exception("Duplicate Key is found when insertion!");}else{//e.g. child="ab"//   ab                    ab#//    /  /    =========>    /   ///   e    f   insert "ab"  e     fchild.terminal = true;}}else if(key.length()>child.key.length()){//e.g. child="ab#"//   ab#                    ab#//    /  /    ==========>    / | / //   e    f   insert "abc"  c# e  fString subkey = key.substring(j);//recursioninsert(child,subkey,startIndex+j);}else{ //key.length()<child.key.length()//e.g. child="abc#"//   abc#                      ab#//    /   /      =========>      /   //   e     f     insert "ab"    c#    //           /  ///                            e    fString childSubkey = child.key.substring(j); //cSuffixNode subChildNode = new SuffixNode(childSubkey);subChildNode.terminal = child.terminal;subChildNode.children = child.children; //inherited from parentchild.key = key;  //abchild.terminal = true;  //ab#child.children = new LinkedList<SuffixNode>();child.children.add(subChildNode);}}else{//0<j<len//e.g. 
child="abc#"//   abc#                     ab//    /  /     ==========>     / ///   e    f   insert "abd"    c#  d# //                           /  ///                          e    f//split at jString childSubkey = child.key.substring(j);  //cString subkey = key.substring(j); //dSuffixNode subChildNode = new SuffixNode(childSubkey);subChildNode.terminal = child.terminal;subChildNode.children = child.children; //inherited from parent//update child's keychild.key = child.key.substring(0,j);//child is not terminal now due to split, it is inherited by subChildNodechild.terminal = false;//Note: no need to merge subChildNodeSuffixNode node = new SuffixNode(subkey);node.terminal = true;child.children = new LinkedList<SuffixNode>();if(subkey.charAt(0)<childSubkey.charAt(0)){child.children.add(node);child.children.add(subChildNode);}else{child.children.add(subChildNode);child.children.add(node);}}done = true;break;}}if(!done){SuffixNode node = new SuffixNode(key);node.terminal = true;currNode.children.add(node);}}public void insert(String suffix,int startIndex) throws Exception{if(suffix == null || suffix.length() == 0) return;if(root==null){root = new SuffixNode();}insert(root,suffix,startIndex);}//build a suffix-tree for a string of textpublic void buildSuffixTree() throws Exception{for(int i=0;i<text.length();i++){this.insert(text.substring(i), i);}}//for test purpose onlypublic void printTree(){this.print(0, this.root);}private void print(int level, SuffixNode node){for (int i = 0; i < level; i++) {            System.out.format(" ");        }System.out.format("|");        for (int i = 0; i < level; i++) {        System.out.format("-");        }        if (node.terminal)        System.out.format("%s#%n", node.key);        else        System.out.format("%s%n", node.key);        for (SuffixNode child : node.children) {        print(level + 1, child);        }}public String findLCS(){return findLCS(root);}//return the longest substring starting with current node (but not 
including currNode.key)private String findLCS(SuffixNode currNode){int maxDepth = currNode.key.length();int currDepth = currNode.key.length();String longestSubstrSuffix = "";for(int i=0;i<currNode.children.size();i++){SuffixNode child = currNode.children.get(i);if(!child.terminal){int depth = currDepth + child.key.length();//terminators are unique, so terminal child is excludedboolean containsTerminators = containTerminators(child);if(containsTerminators){if(depth > maxDepth){maxDepth = depth;longestSubstrSuffix =  child.key;}String longestChildSubstrSuffix = findLCS(child);if(longestChildSubstrSuffix.length()>0){ //not a part of LCS if longestChildSubstrSuffix's lenght is 0int maxChildDepth = longestChildSubstrSuffix.length() + depth;if(maxChildDepth > maxDepth){maxDepth = maxChildDepth;//the substring is relative to currNodelongestSubstrSuffix = child.key + longestChildSubstrSuffix;}}}}}return longestSubstrSuffix;}private boolean containTerminators(SuffixNode currNode){boolean[] done = new boolean[terminators.length];return containTerminators(currNode,done);}private boolean containTerminators(SuffixNode currNode,boolean[] done){boolean allDone = false;for(int i=0;i<currNode.children.size();i++){SuffixNode child = currNode.children.get(i);if(child.terminal){//Note: here the order of terminator is important for(int j=0;j<terminators.length;j++){int pos = child.key.indexOf(terminators[j]);if(pos>=0){done[j]=true;break;}}}else{containTerminators(child,done);}allDone = true;for(int j=0;j<done.length;j++){if(!done[j]){allDone = false;break;}}if(allDone)break;}return allDone;}public static void main(String[] args) throws Exception {System.out.println("****************************");System.out.format("LCS for 3 strings:{abc,bcabca,aabcf}%n");String text = "abc$bcabca@aabcf%";LCS strie = new LCS(text,new char[]{'

,'@','%'});strie.buildSuffixTree();//strie.printTree();String longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);System.out.println("****************************");System.out.format("LCS for 3 strings:{abcd,bcca,aabc}%n");text = "abcd$bcca@aabc%";strie = new LCS(text,new char[]{'
,'@','%'});strie.buildSuffixTree();//strie.printTree();longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);System.out.println("****************************");System.out.format("LCS for 2 strings:{asdfldkjxlfjax123456789abckljddfdfe123456789ees, xsdldkjalfla123456789abcfleur123456789ljafa}%n");text = "asdfldkjxlfjax123456789abckljddfdfe123456789ees$xsdldkjalfla123456789abcfleur123456789ljafa@";strie = new LCS(text,new char[]{'
,'@'});strie.buildSuffixTree();//strie.printTree();longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);System.out.println("****************************");System.out.format("LCS for 4 strings:{abcd,abce,abd,bdess}%n");text = "abcd$abce@abd%bdess&";strie = new LCS(text,new char[]{'
,'@','%','&'});strie.buildSuffixTree();//strie.printTree();longestSubstr = strie.findLCS();System.out.format("%s%n", longestSubstr);}}

测试输出:

****************************
LCS for 3 strings:{abc,bcabca,aabcf}
abc
****************************
LCS for 3 strings:{abcd,bcca,aabc}
bc
****************************
LCS for 2 strings:{asdfldkjxlfjax123456789abckljddfdfe123456789ees, xsdldkjalfla123456789abcfleur123456789ljafa}
123456789abc
****************************
LCS for 4 strings:{abcd,abce,abd,bdess}
b