source: distools/bindistm.m @ 118

Last change on this file since 118 was 10, checked in by bduin, 14 years ago
File size: 5.3 KB
Line 
1%BINDISTM Dissimilarity Matrix between Binary Vectors
2%
3%   D = BINDISTM(A,B,TYPE)
4%
5% INPUT
6%   A     NxK Binary matrix or dataset
7%   B     MxK Binary matrix or dataset (optional; default: B=A)
8%   TYPE        Type of the similarity S (optional; default: 'J'):
9%           'SM', 'Simple-match':   (a+d)/(a+b+c+d)
10%           'RR', 'Russel-Rao':     a/(a+b+c+d)
11%           'J',  'Jaccard':        a/(a+b+c)
12%           'D',  'Dice':           a/(a+0.5*(b+c))
13%           'SS', 'Sokal-Sneath':   (a+d)/(a+0.5*(b+c)+d)
14%           'RT', 'Rogers-Tanimoto':(a+d)/(a+2*(b+c)+d)
15%           'K',  'Kulczynski':     0.5*(a/(a+b) + a/(a+c))
16%           'A1', 'Anderberg1':     a/(a+2*(b+c))
17%           'A2', 'Anderberg2':     0.5*(a/(a+b) + a/(a+c) + d/(c+d) + d/(b+d))
18%           'H',  'Hamman':         ((a+d)-(b+c))/(a+b+c+d)
19%           'Y',  'Yule':           (a*d -b*c)/(a*d+b*c)
20%           'P1', 'Pearson1':       (a*d)/sqrt((a+b)*(a+c)*(b+d)*(c+d))
21%           'P2', 'Pearson2':       (a*d-b*c)/sqrt((a+b)*(a+c)*(b+d)*(c+d))
22%           'O',  'Ochiai':         a/sqrt((a+b)*(a+c))
23%         The distance D is computed as D=sqrt(1-S).
24%
25%         Type of distance:
26%           'HG', 'Hamming':        (b+c)
27%           'EU', 'Euclidean':      sqrt(b+c)
28%           'VAR','Variance':       0.25*(b+c)/(a+b+c+d)
29%           'BC', 'Bray-Curtis':    (b+c)/(2*a+b+c)
30%           'SD', 'Size-diff':      (b-c)^2/(a+b+c+d)^2
31%           'PD', 'Pattern-diff':   b*c/(a+b+c+d)^2
32%           'SHD','Shape-diff':     ((a+b+c+d)*(b+c)-(b-c)^2)/(a+b+c+d)^2
33%
34% OUTPUT
35%   D           NxM Dissimilarity matrix or dataset
36%
37% DESCRIPTION
38% Distance between sets of binary vectors, A and B.
39% The distances which are non-metric: 'K','A2','Y','BC','SD','PD' and 'SHD'.
40% The distances which do not have a Euclidean behaviour: 'SS','K','A2','Y','HG',
41% 'VAR','BC','SD','PD' and 'SHD'. If for the similarity S defined above, D is
42% computed as D=1-S, then the following distances are non-metric: 'D','SS','K',
43% 'A2','Y','P1','P2',and 'O', and all of the distances are non-Euclidean.
44%
45% NOTE
46% In some cases the operations may be undefined such as 0/0. This results
47% in NANs which are replaced here by zeros.
48%
49% If A and B are datasets, then D is a dataset as well with the labels defined
50% by the labels of A and the feature labels defined by the labels of B. If A is
51% not a dataset, but a matrix of doubles, then D is also a matrix of doubles.
52%
53% DEFAULT
54%   B    = A
55%   TYPE = 'J'
56%
57% REFERENCE
58% J.Gower, Metric and Euclidean Properties of Dissimilarity Coefficients.
59% Journal of Classification, no.5, 5-48, 1986.
60%
61
62% Copyright: Elzbieta Pekalska, ela.pekalska@googlemail.com
63% Faculty EWI, Delft University of Technology and
64% School of Computer Science, University of Manchester
65
66
67
function D = bindistm(A,B,type)
% Implementation notes
%   For every pair (row i of A, row j of B) the four matrices Aij, Bij, Cij
%   and Dij hold the counts of positions where the pair is (1,1), (1,0),
%   (0,1) and (0,0), respectively. Their sum equals the number of columns,
%   so a+b+c+d in the formulas is replaced by the column count.
%   Similarity types fill S and are converted by D = sqrt(1-S); distance
%   types fill D directly.

if nargin < 3,
  type = 'J';
end

% Short-circuit OR: B must not be evaluated when it was not supplied.
if nargin < 2 || isempty(B),
  B = A;
end

isda = isdataset(A);
isdb = isdataset(B);
% Unary plus: presumably extracts the raw data from a dataset (no-op on doubles).
a = +A;
b = +B;

ca = size(a,2);
cb = size(b,2);
if ca ~= cb,
  error ('Matrices should have equal numbers of columns.');
end

% Test all elements at once; any() on a matrix would only reduce per column
% and the IF would then require every column to contain a non-binary value.
if any(a(:)~=0 & a(:)~=1) || any(b(:)~=0 & b(:)~=1),
  error('Data should be binary.');
end

% Pairwise contingency counts.
Aij = a*b';           % both 1
Bij = a*(1-b)';       % 1 in A, 0 in B
Cij = (1-a)*b';       % 0 in A, 1 in B
Dij = (1-a)*(1-b)';   % both 0

D = [];
switch lower(type)
  % --- Distances, computed directly ---
  case {'hg','hamming'}
    D = (Bij+Cij);
  case {'eu','euclidean'}
    D = sqrt(Bij+Cij);
  case {'var','variance'}
    D = 0.25*(Bij+Cij)/ca;
  case {'bc','bray-curtis'}
    D = (Bij+Cij)./(2*Aij+Bij+Cij);
  case {'sd','size-diff'}
    D = (Bij-Cij).^2./ca^2;
  case {'pd','pattern-diff'}
    D = Bij.*Cij./ca^2;
  case {'shd','shape-diff'}
    D = (ca*(Bij+Cij)-(Bij-Cij).^2)./ca^2;
  % --- Similarities, converted to distances after the switch ---
  case {'sm','simple-match'}
    S = (Aij+Dij) ./ ca;
  case {'rr','russel-rao'}
    S = Aij ./ ca;
  case {'j','jaccard'}
    S = Aij ./ (Aij+Bij+Cij);
  case {'d','dice'}
    S = Aij ./ (Aij+0.5*(Bij+Cij));
  case {'ss','sokal-sneath'}
    S = (Aij +Dij)./ (Aij + 0.5*(Bij+Cij) + Dij);
  case {'a1','anderberg1'}
    S = Aij./ (Aij + 2*(Bij+Cij));
  case {'rt','rogers-tanimoto'}
    S = (Aij +Dij)./ (Aij + 2*(Bij+Cij)+Dij);
  case {'k','kulczynski'}
    S = 0.5*(Aij./ (Aij + Bij) + Aij./ (Aij + Cij));
  case {'a2','anderberg2'}
    S = 0.5*(Aij./ (Aij + Bij) + Aij./ (Aij + Cij) + Dij./ (Cij + Dij) + Dij./ (Bij + Dij) );
  case {'h','hamman'}
    S = ((Aij + Dij) - (Bij + Cij))/ca;
  case {'y','yule'}
    S = (Aij .* Dij  - Bij .* Cij) ./ (Aij .* Dij  + Bij .* Cij);
  case {'p1','pearson1'}
    S = (Aij .* Dij) ./ sqrt((Aij + Bij) .* (Aij + Cij).*(Bij + Dij).*(Cij + Dij));
  case {'p2','pearson2'}
    S = (Aij .* Dij - Bij .* Cij) ./ sqrt((Aij + Bij) .* (Aij + Cij).*(Bij + Dij).*(Cij + Dij));
  case {'o','ochiai'}
    % Element-wise division; '/' would solve a linear system instead.
    S = Aij ./ sqrt((Aij + Bij) .* (Aij + Cij));
  otherwise
    error('Wrong type.');
end

% D is still empty when a similarity type was selected.
if isempty(D),
  D = sqrt(1 - S);
end

% Replace potential NaNs (e.g. from 0/0) by zeros
D(isnan(D)) = 0;

% Check numerical inaccuracy: values below eps are set to zero
D(D<eps) = 0;

% Set object labels and feature labels
if xor(isda, isdb),
  prwarning(1,'One matrix is a dataset and the other not. ')
end
if isda,
  if isdb,
    D = setdata(A,D,getlab(B));
  else
    D = setdata(A,D);
  end
  D.name = 'Distance matrix';
  if ~isempty(A.name)
    D.name = [D.name ' for ' A.name];
  end
end
return
Note: See TracBrowser for help on using the repository browser.