[10] | 1 | %EDITDISTM Edit Distance Matrix between Strings |
---|
| 2 | % |
---|
| 3 | % D = EDITDISTM (A,B,COST,SC) |
---|
| 4 | % |
---|
| 5 | % INPUT |
---|
| 6 | % A Cell structure of N strings |
---|
| 7 | % B Cell structure of M strings (optional; default: B = A) |
---|
| 8 | % COST Vector of edit costs, COST = [cost_ins cost_del cost_sub] |
---|
| 9 | % (optional; default: all cost equal 1, COST = [1 1 1]) |
---|
| 10 | % SC Scale (1) or not (0) the computed distance to [0,1] by |
---|
| 11 | % max {K*COST(3)+(L-K)*COST(1), K*cost(2)+L*COST(1),... |
---|
| 12 | % L*COST(2)+K*COST(1), K*cost(3)+(L-K)*COST(2)]}, where |
---|
| 13 | % K =length(A{i}) and L = length(B{j}) and K > L. |
---|
| 14 | % (optional; default: 0, i.e. no scaling) |
---|
| 15 | % |
---|
| 16 | % OUTPUT |
---|
| 17 | % D NxM Dissimilarity matrix |
---|
| 18 | % |
---|
| 19 | % DESCRIPTION |
---|
| 20 | % A and B are either sets of cell structures of strings or strings. |
---|
| 21 | % The edit distance is defined for strings of arbitrary lengths. It finds |
---|
| 22 | % the minimum total cost of transforming one string into another, given |
---|
| 23 | % the costs of insertion, deletion and substitution. |
---|
| 24 | % |
---|
| 25 | % Dynamic programming is used to compute the edit distance. Assume we are |
---|
| 26 | % given two strings S and T of the lengths K and L, respectively. An |
---|
| 27 | % (K+1)x(L+1) array M is filled with partial costs such that M(K,L) |
---|
| 28 | % yields the edit distance D(S,T). The definition of the entries of M is |
---|
| 29 | % recursive: |
---|
| 30 | % M(0,j) = j*C_del, j=0,1,...,L |
---|
| 31 | % M(i,0) = i*C_ins, i=0,1,...,K |
---|
| 32 | % For the pairs (i,j), use |
---|
| 33 | % M(i,j) = min {M(i-1,j)+C_del, M(i,j-1)+C_ins, M(i-1,j-1)+C_subs(S(i)~=T(j))) |
---|
| 34 | % |
---|
| 35 | % DEFAULT |
---|
| 36 | % B = A |
---|
| 37 | % COST = [1 1 1] |
---|
| 38 | % SC = 1 |
---|
| 39 | % |
---|
| 40 | |
---|
| 41 | % Copyright: Elzbieta Pekalska, ela.pekalska@googlemail.com |
---|
| 42 | % Faculty EWI, Delft University of Technology and |
---|
| 43 | % School of Computer Science, University of Manchester |
---|
| 44 | |
---|
| 45 | |
---|
| 46 | function d = editdistm(a,b,cost,scale) |
---|
| 47 | if nargin < 4, |
---|
| 48 | scale = 0; |
---|
| 49 | end |
---|
| 50 | if nargin < 3, |
---|
| 51 | cost = [1 1 1]; |
---|
| 52 | end |
---|
| 53 | if nargin < 2, |
---|
| 54 | b = a; |
---|
| 55 | end |
---|
| 56 | |
---|
| 57 | if length(cost) ~= 3, |
---|
| 58 | error('Wrong vector of edit costs.'); |
---|
| 59 | end |
---|
| 60 | if any(cost) < 0, |
---|
| 61 | error('Edit costs should be nonnegative.'); |
---|
| 62 | end |
---|
| 63 | |
---|
| 64 | if ~iscell(a), |
---|
| 65 | if isstr(a) | (~isstr(a) & min(size(a)) == 1) |
---|
| 66 | a = {a}; |
---|
| 67 | else |
---|
| 68 | error('A is improper.'); |
---|
| 69 | end |
---|
| 70 | end |
---|
| 71 | if ~iscell(b), |
---|
| 72 | if isstr(b) | (~isstr(b) & min(size(b)) == 1) |
---|
| 73 | b = {b}; |
---|
| 74 | else |
---|
| 75 | error('B is improper.'); |
---|
| 76 | end |
---|
| 77 | end |
---|
| 78 | |
---|
| 79 | |
---|
| 80 | m = length(a); |
---|
| 81 | n = length(b); |
---|
| 82 | |
---|
| 83 | d = zeros(m,n); |
---|
| 84 | if scale == 1, |
---|
| 85 | for i=1:m |
---|
| 86 | al = length(a{i}); |
---|
| 87 | for j=1:n |
---|
| 88 | bl = length(b{j}); |
---|
| 89 | if bl < al, h = bl; bl = al; al = h; end |
---|
| 90 | scale = max([al*cost(3)+(bl-al)*cost(1), al*cost(2)+bl*cost(1),... |
---|
| 91 | bl*cost(2)+al*cost(1), al*cost(3)+(bl-al)*cost(2)]); |
---|
| 92 | d(i,j) = editd(a{i},b{j},cost)/scale; |
---|
| 93 | end |
---|
| 94 | end |
---|
| 95 | elseif scale == 0, |
---|
| 96 | for i=1:m |
---|
| 97 | for j=1:n |
---|
| 98 | d(i,j) = editd(a{i},b{j},cost); |
---|
| 99 | end |
---|
| 100 | end |
---|
| 101 | else |
---|
| 102 | ; |
---|
| 103 | end |
---|
| 104 | return; |
---|
| 105 | |
---|
| 106 | |
---|
| 107 | |
---|
| 108 | |
---|
| 109 | function d = editd (s,t,cost) |
---|
| 110 | m = length(s); |
---|
| 111 | n = length(t); |
---|
| 112 | |
---|
| 113 | if isstr(s), |
---|
| 114 | s = ['a',s]; |
---|
| 115 | t = ['a',t]; |
---|
| 116 | else |
---|
| 117 | if size(s,1) > size(s,2) |
---|
| 118 | s = [s(1); s]; |
---|
| 119 | t = [s(1); t]; |
---|
| 120 | else |
---|
| 121 | s = [s(1) s]; |
---|
| 122 | t = [s(1) t]; |
---|
| 123 | end |
---|
| 124 | end |
---|
| 125 | |
---|
| 126 | M = zeros(m+1,n+1); |
---|
| 127 | M(1,1) = 0; |
---|
| 128 | M(1,2:n+1) = (1:n) *cost(1); % Insertions |
---|
| 129 | M(2:m+1,1) = (1:m)'*cost(2); % Deletions |
---|
| 130 | |
---|
| 131 | for i=2:m+1 |
---|
| 132 | for j=2:n+1 |
---|
| 133 | M(i,j) = min ([M(i-1,j)+cost(2), M(i,j-1)+cost(1), M(i-1,j-1)+cost(3)*(s(i) ~= t(j))]); |
---|
| 134 | end |
---|
| 135 | end |
---|
| 136 | d = M(m+1,n+1); |
---|