source: distools/editdistm.m @ 100

Last change on this file since 100 was 10, checked in by bduin, 14 years ago
File size: 3.1 KB
RevLine 
[10]1%EDITDISTM  Edit Distance Matrix between Strings
2%
3%   D = EDITDISTM (A,B,COST,SC)
4%
5% INPUT
6%   A     Cell structure of N strings
7%   B     Cell structure of M strings (optional; default: B = A)
8%   COST  Vector of edit costs, COST = [cost_ins cost_del cost_sub]
9%         (optional; default: all cost equal 1, COST = [1 1 1])
10%   SC    Scale (1) or not (0) the computed distance to [0,1] by
11%         max {K*COST(3)+(L-K)*COST(1), K*cost(2)+L*COST(1),...
12%         L*COST(2)+K*COST(1), K*cost(3)+(L-K)*COST(2)]}, where
13%         K =length(A{i}) and L = length(B{j}) and K > L.
14%         (optional; default: 0, i.e. no scaling)
15%
16% OUTPUT
17%   D     NxM Dissimilarity matrix
18%
19% DESCRIPTION
20% A and B are either sets of cell structures of strings or strings.
21% The edit distance is defined for strings of arbitrary lengths. It finds
22% the minimum total cost of transforming one string into another, given
23% the costs of insertion, deletion and substitution.
24%
25% Dynamic programming is used to compute the edit distance. Assume we are
26% given two strings S and T of the lengths K and L, respectively. An
27% (K+1)x(L+1) array M is filled with partial costs such that M(K,L)
28% yields the edit distance D(S,T). The definition of the entries of M is
29% recursive:
30%   M(0,j) = j*C_del,  j=0,1,...,L
31%   M(i,0) = i*C_ins,  i=0,1,...,K
32% For the pairs (i,j), use
33%   M(i,j) = min {M(i-1,j)+C_del, M(i,j-1)+C_ins, M(i-1,j-1)+C_subs(S(i)~=T(j)))
34%
35% DEFAULT
36%   B    = A
37%   COST = [1 1 1]
38%   SC   = 1
39%
40
41% Copyright: Elzbieta Pekalska, ela.pekalska@googlemail.com
42% Faculty EWI, Delft University of Technology and
43% School of Computer Science, University of Manchester
44
45
46function d = editdistm(a,b,cost,scale)
47if nargin < 4,
48  scale = 0;
49end
50if nargin < 3,
51  cost = [1 1 1];
52end
53if nargin < 2,
54  b = a;
55end
56
57if length(cost) ~= 3,
58  error('Wrong vector of edit costs.');
59end
60if any(cost) < 0,
61  error('Edit costs should be nonnegative.');
62end
63
64if ~iscell(a),
65  if isstr(a) | (~isstr(a) & min(size(a)) == 1)
66    a = {a};
67  else
68    error('A is improper.');
69  end
70end
71if ~iscell(b),
72  if isstr(b) | (~isstr(b) & min(size(b)) == 1)
73    b = {b};
74  else
75    error('B is improper.');
76  end
77end
78
79
80m = length(a);
81n = length(b);
82
83d = zeros(m,n);
84if scale == 1,
85  for i=1:m
86    al = length(a{i});
87    for j=1:n
88      bl = length(b{j});
89      if bl < al, h = bl; bl = al; al = h; end
90      scale = max([al*cost(3)+(bl-al)*cost(1), al*cost(2)+bl*cost(1),...
91                   bl*cost(2)+al*cost(1), al*cost(3)+(bl-al)*cost(2)]);
92      d(i,j) = editd(a{i},b{j},cost)/scale;
93    end
94  end
95elseif scale == 0,
96  for i=1:m
97    for j=1:n
98      d(i,j) = editd(a{i},b{j},cost);
99    end
100  end
101else
102  ;
103end
104return;
105
106
107
108
109function d = editd (s,t,cost)
110m = length(s);
111n = length(t);
112
113if isstr(s),
114  s = ['a',s];
115  t = ['a',t];
116else
117  if size(s,1) > size(s,2)
118    s = [s(1); s];
119    t = [s(1); t];
120  else
121    s = [s(1) s];
122    t = [s(1) t];
123  end
124end
125
126M = zeros(m+1,n+1);
127M(1,1) = 0;
128M(1,2:n+1) = (1:n) *cost(1);    % Insertions
129M(2:m+1,1) = (1:m)'*cost(2);    % Deletions
130
131for i=2:m+1
132  for j=2:n+1
133    M(i,j) = min ([M(i-1,j)+cost(2), M(i,j-1)+cost(1), M(i-1,j-1)+cost(3)*(s(i) ~= t(j))]);
134  end
135end
136d = M(m+1,n+1);
Note: See TracBrowser for help on using the repository browser.