sequence alignment

来源:互联网 发布:淘宝发空包平台 编辑:程序博客网 时间:2024/05/17 08:32

# this is also called 'edit distance'.# consider sequence X = (x1,x2,..,xm) and Y=(y1,y2, ..., yn);# an alignment is a subset A belongs to {1,...,m} * {1, .., n}# for any (i,j) and (i',j'): i != i', j!= j', i < i' -> j<j'# cost function: the number of unmatched pairs + sum(matching values (if two chars are equal, is should be zeros else 1)).# c(i,j) minimum cost of align x1,...,xi and y1,...,yj,  we need to  find c(m,n)# case 1: match the last , c(i,j) = c(i-1, j-1) + matching_values(i, j)# case 2: leaving xi unmmached, c(i,j) = c(i-1, j) + 1# case 3: leavinf yj unmanched, c(i,j) = c(i, j-1) + 1# c(i,j) = min(case1. case2. case3)# base case# c(0,j) = j# c(i,0) = i# when compute c(m,n) back track from (m,n) to (0,0) and record the pathdef alignment_value(char1, char2):    if char1 == char2:        return 0    else:        return 1.5 # better choose a number which can be distinct to 1 def sequence_align(s1, s2):    """    s1, s2 are strings    """    m = len(s1)    n = len(s2)    cost = [[0 for _ in range(n+1)] for _ in range(m+1)]    # intialize values    for i in range(n+1):        cost[0][i] = i    for i in range(m+1):        cost[i][0] = i    # compute cost    for r in range(1, m+1):        for c in range(1, n+1):            cost[r][c] = min([cost[r-1][c-1] + alignment_value(s1[r-1], s2[c-1]), cost[r-1][c] + 1, cost[r][c-1] + 1])        # output s1,s2    output_s1 = []    output_s2 = []    cost_pos = (m,n)    while cost_pos != (0,0):        r,c = cost_pos        min_val = min([cost[r][c-1], cost[r-1][c], cost[r-1][c-1]])        if min_val == cost[r-1][c-1]:            output_s1.append(s1[r-1])            output_s2.append(s2[c-1])            cost_pos = (r-1,c-1)        elif min_val == cost[r-1][c]:            output_s1.append(s1[r-1])            output_s2.append("-")            cost_pos = (r-1, c)        elif min_val == cost[r][c-1]:            output_s1.append("-")            output_s2.append(s2[c-1])            cost_pos = (r, c-1)    output_s1.reverse()    output_s2.reverse()    return ("".join(output_s1), "".join(output_s2))if __name__ == "__main__":    s1 = "snowy"    s2 = "sunny"    new_s1, new_s2 = sequence_align(s1,s2)    print(new_s1 + "\n" + new_s2)    # s-nowy    # sun-ny 


原创粉丝点击