source: distools/chardmat.m @ 21

Last change on this file since 21 was 10, checked in by bduin, 14 years ago
File size: 8.4 KB
Line 
1%CHARDMAT Characterization of a square, labeled dissimilarity matrix
2%
3%  [C,D_OUT] = CHARDMAT(D,Ntr,Nsubeucl)
4%
5% Characterizes a square (dis)similarity dataset D.
6% D_OUT is the symmetric, normalized dissimilarity dataset D. If D is
7% a similarity dataset it is converted to dissimilarities first.
8% The following fields are returned in the structure C.
9% name       - dataset name as stored by PRTools or read command
10% desc       - dataset description as stored by read command
11% link       - web links as stored by read command
12% ref        - references as stored by read command
13% asym       - asymmetry, 2*|D-D'|./(|D|+|D'|)
14% size       - number of objects
15% classes    - number of classes
16% clsizes    - vector with class sizes
17% type       - 'dis' for dissimilarities, 'sim' for similarities
18%
19%         all following items are computed for a transformation
20%         of D by MAKESYM and DISNORM (make average distance 1),
21%         similarities are first transformed into dissimilarities
22%         by d(i,j) = sqrt(d(i,i) + d(j,j) - d(i,j) - d(j,i))
23%
24% within_mean- average within class dissimilarity
25% between_mean- average between class dissimilarity
26% pe_mapping - Pseudo-Euclidean mapping as computed by PE_EM
27% signature  - 2 component vector with # of positive and negative
28%              eigenvalues obtained during the PE embedding
29% eigenvalues- the eigenvalues obtained during the PE embedding,
30%              see PE_EM for their ranking
31% nef         - Negative Eigen Fraction (sum of absolute negative
32%               eigenvalues divided by sum of all absolute eigenvalues)
33% ner         - Negative Eigen Ratio (- largest negative eigenvalue
34%               divided by largest positive eigenvalue)
35% trineq      - fraction of triangle inequality violations
36%
37%         the following characteristics refer to a set of five spaces:
38%         - Pseudo-Euclidean space based on a full embedding. Distances
39%           in this space are identical to D.
40%         - Associated space, the same vector spaces, but now treated as
41%           an Euclidean space
42%         - Positive space based on the positive eigenvalues only
43%         - Negative space based on the negative eigenvalues only
44%         - Corrected space based on an embedding of sqrt(D.^2+2*Lmin)
45%           in which Lmin is the absolute values of the largest negative
46%           eigenvalue. The result is a proper Euclidean space.
47%
48% loo_a       - leave-one-out nearest neighbor errors for all five
49%               embedded spaces
50% loo_d       - leave-one-out nearest neighbor errors for the dissimilarity
51%               spaces related to the above five embedded spaces
52% lcurve_a    - nearest neighbor learning curves for the five embedded
53%               spaces
54% lcurve_d    - nearest neighbor learning curves for the five dissimilarity
55%               spaces
56% anames      - names of the five embedded spaces, useful for annotation
57% dnames      - names of the five dissimilarity spaces
58%
59% Ntr (default 200) and Nsubeucl (default 50) control numbers of trials to
60% estimate the fraction of triangle violations and the accuracy of the
61% subeuclidean curves.
62
63
function [c,d] = chardmat(d,Ntr,Nsubeucl,makefigs)
% Characterize the square, labeled (dis)similarity dataset D.
%
%   d        - input dataset; on return the symmetrized, normalized
%              dissimilarity dataset (similarities are converted first)
%   Ntr      - number of trials handed to CHECKTR for the estimate of the
%              fraction of triangle inequality violations (default 200)
%   Nsubeucl - passed on to SHOW_FIGS (default 50)
%   makefigs - if true MAKE_FIGS(d,c) is called, otherwise
%              SHOW_FIGS(c,d,Nsubeucl) (default 0)
%   c        - structure with the characterization results

  % Defaults for omitted or empty optional arguments.
  if nargin < 4 | isempty(makefigs), makefigs = 0; end
  if nargin < 3 | isempty(Nsubeucl), Nsubeucl = 50; end
  if nargin < 2 | isempty(Ntr), Ntr = 200; end

  % Called without output argument; presumably errors if d is not a
  % dataset -- TODO confirm against the PRTools ISDATASET.
  isdataset(d);

  datname = getname(d);
  discheck(d,[],1);
  m = size(d,1);
  nclass = getsize(d,3);

  % Descriptive fields, computed on the data as supplied by the caller.
  c.name = datname;
  c.desc = getuser(d,'desc');
  c.link = getuser(d,'link');
  c.ref = getuser(d,'ref');
  c.asym = asymmetry(d);
  c.size = m;
  c.classes  = nclass;
  c.clsizes  = classsizes(d);

  if discheck(d);
    c.type = 'dis';
  else
    c.type = 'sim';
    d = dissimt(d,'sim2dis');
  end

  % we now have a dissimilarity matrix with positive distances

  d = makesym(d);  % make it symmetric now
  d = d*disnorm(d);

  % Average within-class dissimilarity per class (diagonal excluded from
  % the pair count). Classes with fewer than two objects have no
  % within-class pairs: leave uc(j) = 0 so that 0/0 = NaN does not
  % propagate into the weighted means below (their weight is 0 anyway).
  uc = zeros(1,nclass);
  for j=1:c.classes
    nj = c.clsizes(j);
    if nj > 1
      dj = +selcdat(d,j);
      uc(j) = sum(dj(:))/(nj*(nj-1));
    end
  end
  npairs = c.clsizes'.^2-c.clsizes';   % ordered within-class pairs per class
  within_sum = uc*npairs;              % total within-class dissimilarity
  c.within_mean = within_sum/(sum(c.clsizes.^2) - m);

  % After normalization the off-diagonal entries are taken to sum to
  % m*(m-1); what is not within-class is between-class.
  ud = (m*(m-1) - within_sum) / (m*(m-1) - sum(c.clsizes.^2) + m);
  c.between_mean = ud;

  [nef,ner,w] = checkeucl(d);
  c.pe_mapping  = w;
  c.signature   = getsig(w);
  c.eigenvalues = getdata(w,'eval');
  c.nef      = nef;
  c.ner      = ner;
  c.trineq   = checktr(d,Ntr);   % honour the caller's Ntr (was fixed at 200)

  [A D] = disspaces(d,w);
  nspaces = length(A);
  c.loo_a = zeros(1,nspaces); % LOO NN errors embedding spaces
  c.loo_d = zeros(1,nspaces); % LOO NN errors dis spaces
  c.lcurve_a = cell(1,nspaces);  % NN Learning curves embedding spaces
  c.lcurve_d = cell(1,nspaces);  % NN Learning curves dis spaces
  c.anames = cell(1,nspaces);    % names embedded spaces
  c.dnames = cell(1,nspaces);    % names dis spaces
  t = sprintf('Compute %i learning curves: ',nspaces*2);
  prwaitbar(nspaces*2,t);
  for j=1:nspaces
    c.loo_a(j) = nne(D{j});
    ddj = distm(D{j});           % distances between the rows of D{j}
    c.loo_d(j) = nne(ddj);
    prwaitbar(nspaces*2,j*2-1,[t int2str(j*2-1)]);
    c.lcurve_a{j} = nnerr(D{j});
    c.lcurve_a{j}.names = getname(A{j});
    c.anames{j} = getname(A{j});
    prwaitbar(nspaces*2,j*2,[t int2str(j*2)]);
    c.lcurve_d{j} = nnerr(ddj);
    c.lcurve_d{j}.names = getname(D{j});
    c.dnames{j} = getname(D{j});
  end
  prwaitbar(0);

  if makefigs
    make_figs(d,c);
  else
    show_figs(c,d,Nsubeucl)
  end

return
149
function show_figs(c,d,Nsubeucl)
% SHOW_FIGS Display the characterization figures for a dissimilarity dataset
%
%   c        - characterization structure as built by CHARDMAT; the fields
%              pe_mapping, eigenvalues, lcurve_a, lcurve_d and clsizes
%              are used
%   d        - the dissimilarity dataset that was characterized
%   Nsubeucl - second argument of CHECKSUBEUCL (default 50)
%
% All existing figures are removed (DELFIGS) before the new ones are made.

  if nargin < 3 | isempty(Nsubeucl)
    Nsubeucl = 50;
  end

  fonts = 12;
  m = size(d,1);
  nclass = getsize(d,3);
  d        % no semicolon: echoes the dataset in the command window
  delfigs

  % The dissimilarity matrix as a gray value image
  figure; imagesc(+d);
  colormap gray
  axis off;
  axis square
  title('Dissimilarity Matrix');
  fontsize(fonts);

  % Objects mapped on the first two dimensions of the PE mapping
  figure; scatterd(d*c.pe_mapping(:,1:2));
  title('Scatterplot on first two positive eigenvectors')
  xlabel('Eigenvector 1');
  ylabel('Eigenvector 2');
  fontsize(fonts);

  figure; plotspectrum(c.eigenvalues);
  fontsize(fonts);

  % Learning curves: first four embedded spaces + first dissimilarity space
  figure; plote([c.lcurve_a(1:4) c.lcurve_d(1)],[],char('k-','r-','b-','m-','k--'));
  limit_xaxis(m);
  fontsize(fonts);

  % Learning curves: first four dissimilarity spaces + first embedded space
  if isfield(c,'lcurve_d')
    figure; plote([c.lcurve_d(1:4) c.lcurve_a(1)],[],char('k--','r--','b--','m--','k-'));
    limit_xaxis(m);
    fontsize(fonts);
  end

  % Negative eigen fraction and ratio as returned by CHECKEUCL(d,'all')
  [nef,ner,N] = checkeucl(d,'all');
  figure;
  semilogx(N,nef);
  linewidth(2); fontsize(fonts);
  limit_xaxis(m);
  xlabel('Subset size')
  ylabel('Fraction')
  title('Negative Eigen Fraction')

  figure; semilogx(N,ner);
  linewidth(2); fontsize(fonts);
  limit_xaxis(m);
  xlabel('Subset size')
  ylabel('Fraction')
  title('Negative Eigen Ratio')

  % Plot nep up to one element past the first entry equal to 1.
  figure;
  nep = checksubeucl(d,Nsubeucl);
  n = find(nep==1,1);
  if isempty(n), n = 0; end
  % Clamp: when the first 1 is the last element, n+1 would run past the end.
  plot(nep(1:min(n+1,numel(nep))));
  linewidth(2); fontsize(fonts);
  xlabel('Subset size')
  ylabel('Fraction')
  title('Fraction of Non-Euclidean Subsets')

  % Histogram of all distances, overlaid by the within-class distances (red)
  figure;
  dmax = max(+d(:));
  bins = [0:0.05:ceil(dmax*20)/20];   % bin centers, shared by both histograms
  hist(+d(:),bins);
  dc = zeros(sum(c.clsizes.^2),1);    % collects all within-class entries
  n = 0;
  for j=1:nclass
    nn = c.clsizes(j)^2;
    dj = +selcdat(d,j);
    dc(n+1:n+nn) = dj(:);
    n = n+nn;
  end
  hold on
  hist(+dc(:),bins);
  h = get(gca,'Children');
  set(h(1),'facecolor',[1 0 0]);      % h(1): presumably the histogram drawn last -- TODO confirm
  V = axis;
  axis([-0.1 dmax 0 V(4)]);
  fontsize(fonts)
  legend('between class','within class');
  title('Histogram of normalized distances')

  showfigs

return

function limit_xaxis(m)
% LIMIT_XAXIS Set the upper x-limit of the current axes to m+1 and put
% ticks at those of 1, 10, 100, 1000 that do not exceed m.

  V = axis; V(2) = m+1; axis(V);
  ticks = [1 10 100 1000];
  ticks = ticks(ticks <= m);
  set(gca,'xtick',ticks);
  set(gca,'xticklabel',ticks);

return
251 
252 
253%FSAVE Save current figure as eps and fig
254%
255%    FSAVE <dir,fig_name>
256
function fsave(datdir,file)
% Save the current figure as <datdir>/<file> in eps format (with tiff
% preview, cmyk colors) and as a Matlab fig file.
%
%   datdir - directory in which the files are written
%   file   - base file name
%
% The figure is temporarily set to 900 x 600 pixels for the export; its
% original units and position are restored afterwards, also when the
% export fails.

  file = fullfile(datdir,file);

  % Capture the handle once, so that all calls below address the same
  % figure even if the current figure changes during the export.
  fig = gcf;
  uns = get(fig,'units');
  pos = get(fig,'position');
  set(fig,'units','pixels');
  set(fig,'position',[1 1 900 600]);

  try
    exportfig(fig,file,'format','eps','preview','tiff','color','cmyk')
    %exportfig(fig,file,'format','png','color','cmyk')
    %exportfig(fig,file,'format','jpeg100','color','cmyk')
    saveas(fig,file,'fig')
  catch err
    % Put the figure back before reporting the failure.
    % Units first: pos was read in the original units.
    set(fig,'units',uns);
    set(fig,'position',pos);
    rethrow(err);
  end

  set(fig,'units',uns);
  set(fig,'position',pos);

return
274
Note: See TracBrowser for help on using the repository browser.