1 | %EDITDISTM Edit Distance Matrix between Strings |
---|
2 | % |
---|
3 | % D = EDITDISTM (A,B,COST,SC) |
---|
4 | % |
---|
5 | % INPUT |
---|
6 | % A Cell structure of N strings |
---|
7 | % B Cell structure of M strings (optional; default: B = A) |
---|
8 | % COST Vector of edit costs, COST = [cost_ins cost_del cost_sub] |
---|
9 | % (optional; default: all cost equal 1, COST = [1 1 1]) |
---|
10 | % SC Scale (1) or not (0) the computed distance to [0,1] by |
---|
11 | % max {K*COST(3)+(L-K)*COST(1), K*cost(2)+L*COST(1),... |
---|
12 | % L*COST(2)+K*COST(1), K*cost(3)+(L-K)*COST(2)]}, where |
---|
13 | % K =length(A{i}) and L = length(B{j}) and K > L. |
---|
14 | % (optional; default: 0, i.e. no scaling) |
---|
15 | % |
---|
16 | % OUTPUT |
---|
17 | % D NxM Dissimilarity matrix |
---|
18 | % |
---|
19 | % DESCRIPTION |
---|
20 | % A and B are either sets of cell structures of strings or strings. |
---|
21 | % The edit distance is defined for strings of arbitrary lengths. It finds |
---|
22 | % the minimum total cost of transforming one string into another, given |
---|
23 | % the costs of insertion, deletion and substitution. |
---|
24 | % |
---|
25 | % Dynamic programming is used to compute the edit distance. Assume we are |
---|
26 | % given two strings S and T of the lengths K and L, respectively. An |
---|
27 | % (K+1)x(L+1) array M is filled with partial costs such that M(K,L) |
---|
28 | % yields the edit distance D(S,T). The definition of the entries of M is |
---|
29 | % recursive: |
---|
30 | % M(0,j) = j*C_del, j=0,1,...,L |
---|
31 | % M(i,0) = i*C_ins, i=0,1,...,K |
---|
32 | % For the pairs (i,j), use |
---|
33 | % M(i,j) = min {M(i-1,j)+C_del, M(i,j-1)+C_ins, M(i-1,j-1)+C_subs(S(i)~=T(j))) |
---|
34 | % |
---|
35 | % DEFAULT |
---|
36 | % B = A |
---|
37 | % COST = [1 1 1] |
---|
38 | % SC = 1 |
---|
39 | % |
---|
40 | |
---|
41 | % Copyright: Elzbieta Pekalska, ela.pekalska@googlemail.com |
---|
42 | % Faculty EWI, Delft University of Technology and |
---|
43 | % School of Computer Science, University of Manchester |
---|
44 | |
---|
45 | |
---|
46 | function d = editdistm(a,b,cost,scale) |
---|
47 | if nargin < 4, |
---|
48 | scale = 0; |
---|
49 | end |
---|
50 | if nargin < 3, |
---|
51 | cost = [1 1 1]; |
---|
52 | end |
---|
53 | if nargin < 2, |
---|
54 | b = a; |
---|
55 | end |
---|
56 | |
---|
57 | if length(cost) ~= 3, |
---|
58 | error('Wrong vector of edit costs.'); |
---|
59 | end |
---|
60 | if any(cost) < 0, |
---|
61 | error('Edit costs should be nonnegative.'); |
---|
62 | end |
---|
63 | |
---|
64 | if ~iscell(a), |
---|
65 | if isstr(a) | (~isstr(a) & min(size(a)) == 1) |
---|
66 | a = {a}; |
---|
67 | else |
---|
68 | error('A is improper.'); |
---|
69 | end |
---|
70 | end |
---|
71 | if ~iscell(b), |
---|
72 | if isstr(b) | (~isstr(b) & min(size(b)) == 1) |
---|
73 | b = {b}; |
---|
74 | else |
---|
75 | error('B is improper.'); |
---|
76 | end |
---|
77 | end |
---|
78 | |
---|
79 | |
---|
80 | m = length(a); |
---|
81 | n = length(b); |
---|
82 | |
---|
83 | d = zeros(m,n); |
---|
84 | if scale == 1, |
---|
85 | for i=1:m |
---|
86 | al = length(a{i}); |
---|
87 | for j=1:n |
---|
88 | bl = length(b{j}); |
---|
89 | if bl < al, h = bl; bl = al; al = h; end |
---|
90 | scale = max([al*cost(3)+(bl-al)*cost(1), al*cost(2)+bl*cost(1),... |
---|
91 | bl*cost(2)+al*cost(1), al*cost(3)+(bl-al)*cost(2)]); |
---|
92 | d(i,j) = editd(a{i},b{j},cost)/scale; |
---|
93 | end |
---|
94 | end |
---|
95 | elseif scale == 0, |
---|
96 | for i=1:m |
---|
97 | for j=1:n |
---|
98 | d(i,j) = editd(a{i},b{j},cost); |
---|
99 | end |
---|
100 | end |
---|
101 | else |
---|
102 | ; |
---|
103 | end |
---|
104 | return; |
---|
105 | |
---|
106 | |
---|
107 | |
---|
108 | |
---|
109 | function d = editd (s,t,cost) |
---|
110 | m = length(s); |
---|
111 | n = length(t); |
---|
112 | |
---|
113 | if isstr(s), |
---|
114 | s = ['a',s]; |
---|
115 | t = ['a',t]; |
---|
116 | else |
---|
117 | if size(s,1) > size(s,2) |
---|
118 | s = [s(1); s]; |
---|
119 | t = [s(1); t]; |
---|
120 | else |
---|
121 | s = [s(1) s]; |
---|
122 | t = [s(1) t]; |
---|
123 | end |
---|
124 | end |
---|
125 | |
---|
126 | M = zeros(m+1,n+1); |
---|
127 | M(1,1) = 0; |
---|
128 | M(1,2:n+1) = (1:n) *cost(1); % Insertions |
---|
129 | M(2:m+1,1) = (1:m)'*cost(2); % Deletions |
---|
130 | |
---|
131 | for i=2:m+1 |
---|
132 | for j=2:n+1 |
---|
133 | M(i,j) = min ([M(i-1,j)+cost(2), M(i,j-1)+cost(1), M(i-1,j-1)+cost(3)*(s(i) ~= t(j))]); |
---|
134 | end |
---|
135 | end |
---|
136 | d = M(m+1,n+1); |
---|