[10] | 1 | %CHARDMAT Characterization of a square, labeled dissimilarity matrix
|
---|
| 2 | %
|
---|
| 3 | % [C,D_OUT] = CHARDMAT(D,Ntr,Nsubeucl)
|
---|
| 4 | %
|
---|
| 5 | % Characterizes a square (dis)similarity dataset D.
|
---|
| 6 | % D_OUT is the symmetric, normalized dissimilarity dataset D. If D is
|
---|
| 7 | % a similarity dataset it is converted to dissimilarities first.
|
---|
| 8 | % The following fields are returned in the structure C.
|
---|
| 9 | % name - dataset name as stored by PRTools or read command
|
---|
| 10 | % desc - dataset description as stored by read command
|
---|
| 11 | % link - web links as stored by read command
|
---|
| 12 | % ref - references as stored by read command
|
---|
| 13 | % asym - asymmetry, 2*|D-D'|./(|D|+|D'|)
|
---|
| 14 | % size - number of objects
|
---|
| 15 | % classes - number of classes
|
---|
| 16 | % clsizes - vector with class sizes
|
---|
| 17 | % type - 'dis' for dissimilarities, 'sim' for similarities
|
---|
| 18 | %
|
---|
| 19 | % all following items are computed for a transformation
|
---|
| 20 | % of D by MAKESYM and DISNORM (make average distance 1),
|
---|
| 21 | % similarities are first transformed into dissimilarities
|
---|
| 22 | % by d(i,j) = sqrt(d(i,i) + d(j,j) - d(i,j) - d(j,i))
|
---|
| 23 | %
|
---|
| 24 | % within_mean- average within class dissimilarity
|
---|
| 25 | % between_mean- average between class dissimilarity
|
---|
| 26 | % pe_mapping - Pseudo-Euclidean mapping as computed by PE_EM
|
---|
| 27 | % signature - 2 component vector with # of positive and negative
|
---|
| 28 | % eigenvalues obtained during the PE embedding
|
---|
| 29 | % eigenvalues- the eigenvalues obtained during the PE embedding,
|
---|
| 30 | % see PE_EM for their ranking
|
---|
| 31 | % nef - Negative Eigen Fraction (sum of absolute negative
|
---|
| 32 | % eigenvalues divided by sum of all absolute eigenvalues)
|
---|
| 33 | % ner - Negative Eigen Ratio (- largest negative eigenvalue
|
---|
| 34 | % divided by largest positive eigenvalue)
|
---|
| 35 | % trineq - fraction of triangle inequality violations
|
---|
| 36 | %
|
---|
| 37 | % the following characteristics refer to a set of five spaces:
|
---|
| 38 | % - Pseudo-Euclidean space based on a full embedding. Distances
|
---|
| 39 | % in this space are identical to D.
|
---|
| 40 | % - Associated space, the same vector spaces, but now treated as
|
---|
| 41 | % an Euclidean space
|
---|
| 42 | % - Positive space based on the positive eigenvalues only
|
---|
| 43 | % - Negative space based on the negative eigenvalues only
|
---|
| 44 | % - Corrected space based on an embedding of sqrt(D.^2+2*Lmin)
|
---|
| 45 | % in which Lmin is the absolute value of the largest negative
|
---|
| 46 | % eigenvalue. The result is a proper Euclidean space.
|
---|
| 47 | %
|
---|
| 48 | % loo_a - leave-one-out nearest neighbor errors for all five
|
---|
| 49 | % embedded spaces
|
---|
| 50 | % loo_d - leave-one-out nearest neighbor errors for the dissimilarity
|
---|
| 51 | % spaces related to the above five embedded spaces
|
---|
| 52 | % lcurve_a - nearest neighbor learning curves for the five embedded
|
---|
| 53 | % spaces
|
---|
| 54 | % lcurve_d - nearest neighbor learning curves for the five dissimilarity
|
---|
| 55 | % spaces
|
---|
| 56 | % anames - names of the five embedded spaces, useful for annotation
|
---|
| 57 | % dnames - names of the five dissimilarity spaces
|
---|
| 58 | %
|
---|
| 59 | % Ntr (default 200) and Nsubeucl (default 50) control numbers of trials to
|
---|
| 60 | % estimate the fraction of triangle violations and the accuracy of the
|
---|
| 61 | % subeuclidean curves.
|
---|
| 62 |
|
---|
| 63 |
|
---|
function [c,d] = chardmat(d,Ntr,Nsubeucl,makefigs)
% Compute the characterization structure C of the square (dis)similarity
% dataset D and return the symmetrized, normalized dissimilarities as the
% second output.
%
%   d        - square, labeled PRTools (dis)similarity dataset
%   Ntr      - number of trials handed to CHECKTR (default 200)
%   Nsubeucl - handed to SHOW_FIGS for CHECKSUBEUCL (default 50)
%   makefigs - if true, MAKE_FIGS(d,c) is called instead of SHOW_FIGS
%              (default 0)

if nargin < 4, makefigs = 0; end
if nargin < 3 || isempty(Nsubeucl), Nsubeucl = 50; end
if nargin < 2 || isempty(Ntr), Ntr = 200; end

% Called without output argument: the return value is not used here.
% NOTE(review): presumably ISDATASET raises an error itself for
% non-dataset input when called like this -- confirm against PRTools.
isdataset(d);

datname = getname(d);
discheck(d,[],1);
m = size(d,1);
nclass = getsize(d,3);

% --- descriptive fields, taken from the dataset as supplied -------------
c.name    = datname;
c.desc    = getuser(d,'desc');
c.link    = getuser(d,'link');
c.ref     = getuser(d,'ref');
c.asym    = asymmetry(d);
c.size    = m;
c.classes = nclass;
c.clsizes = classsizes(d);

if discheck(d)
    c.type = 'dis';
else
    c.type = 'sim';
    d = dissimt(d,'sim2dis');
end

% we now have a dissimilarity matrix with positive distances

d = makesym(d);         % make it symmetric now
d = d*disnorm(d);       % normalize (average distance 1, see file header)

% --- average within / between class dissimilarities ---------------------
uc = zeros(1,nclass);
for j = 1:nclass
    nj = c.clsizes(j);
    % A class with a single object has no within-class pairs. Skipping it
    % avoids the 0/0 = NaN that would otherwise propagate into both means
    % (its weight nj^2-nj below is zero anyway).
    if nj > 1
        dj = +selcdat(d,j);
        uc(j) = sum(dj(:))/(nj*(nj-1));
    end
end
% Total within-class dissimilarity: per-class means weighted by their
% number of ordered off-diagonal pairs.
within_sum = uc*(c.clsizes'.^2-c.clsizes');
c.within_mean = within_sum/(sum(c.clsizes.^2) - m);

% m*(m-1) is used as the total sum of all off-diagonal entries, i.e. this
% relies on the normalization above.
ud = (m*(m-1) - within_sum) / (m*(m-1) - sum(c.clsizes.^2) + m);
c.between_mean = ud;

% --- pseudo-Euclidean embedding and non-Euclidean measures --------------
[nef,ner,w] = checkeucl(d);
c.pe_mapping  = w;
c.signature   = getsig(w);
c.eigenvalues = getdata(w,'eval');
c.nef         = nef;
c.ner         = ner;
c.trineq      = checktr(d,Ntr);   % honour the caller-supplied trial count

% --- nearest neighbour errors and learning curves per space -------------
[A D] = disspaces(d,w);
nspaces = length(A);
c.loo_a    = zeros(1,nspaces);  % LOO NN errors embedding spaces
c.loo_d    = zeros(1,nspaces);  % LOO NN errors dis spaces
c.lcurve_a = cell(1,nspaces);   % NN Learning curves embedding spaces
c.lcurve_d = cell(1,nspaces);   % NN Learning curves dis spaces
c.anames   = cell(1,nspaces);   % names embedded spaces
c.dnames   = cell(1,nspaces);   % names dis spaces
t = sprintf('Compute %i learning curves: ',nspaces*2);
prwaitbar(nspaces*2,t);
for j = 1:nspaces
    c.loo_a(j) = nne(D{j});
    ddj = distm(D{j});
    c.loo_d(j) = nne(ddj);
    prwaitbar(nspaces*2,j*2-1,[t int2str(j*2-1)]);
    c.lcurve_a{j} = nnerr(D{j});
    c.lcurve_a{j}.names = getname(A{j});
    c.anames{j} = getname(A{j});
    prwaitbar(nspaces*2,j*2,[t int2str(j*2)]);
    c.lcurve_d{j} = nnerr(ddj);
    c.lcurve_d{j}.names = getname(D{j});
    c.dnames{j} = getname(D{j});
end
prwaitbar(0);

if makefigs
    % NOTE(review): MAKE_FIGS is not defined in the visible part of this
    % file; it has to be available elsewhere on the path.
    make_figs(d,c);
else
    show_figs(c,d,Nsubeucl)
end

return
|
---|
| 149 |
|
---|
function show_figs(c,d,Nsubeucl)
% Show the characterization C of the normalized dissimilarity dataset D
% in a series of figures.
%
%   c        - structure as built by CHARDMAT
%   d        - symmetric, normalized dissimilarity dataset
%   Nsubeucl - second argument for CHECKSUBEUCL (default 50)

if nargin < 3 || isempty(Nsubeucl)
    Nsubeucl = 50;
end

fonts = 12;
m = size(d,1);
nclass = getsize(d,3);
delfigs

% Figure: the dissimilarity matrix as a gray-value image
figure; imagesc(+d);
colormap gray
axis off;
axis square
title('Dissimilarity Matrix');
fontsize(fonts);

% Figure: scatterplot of the first two dimensions of the PE mapping
figure; scatterd(d*c.pe_mapping(:,1:2));
title('Scatterplot on first two positive eigenvectors')
xlabel('Eigenvector 1');
ylabel('Eigenvector 2');
fontsize(fonts);

% Figure: eigenvalue spectrum of the embedding
figure; plotspectrum(c.eigenvalues);
fontsize(fonts);

% Figure: learning curves of the first four embedded spaces, with the
% first dissimilarity space curve as reference
figure; plote([c.lcurve_a(1:4) c.lcurve_d(1)],[],char('k-','r-','b-','m-','k--'));
set_size_axis(m);
fontsize(fonts);

% Figure: learning curves of the first four dissimilarity spaces, with the
% first embedded space curve as reference
if isfield(c,'lcurve_d')
    figure; plote([c.lcurve_d(1:4) c.lcurve_a(1)],[],char('k--','r--','b--','m--','k-'));
    set_size_axis(m);
    fontsize(fonts);
end

% Figures: negative eigen fraction and ratio as a function of subset size
[nef,ner,N] = checkeucl(d,'all');
figure;
semilogx(N,nef);
linewidth(2); fontsize(fonts);
set_size_axis(m);
xlabel('Subset size')
ylabel('Fraction')
title('Negative Eigen Fraction')

figure; semilogx(N,ner);
linewidth(2); fontsize(fonts);
set_size_axis(m);
xlabel('Subset size')
ylabel('Fraction')
title('Negative Eigen Ratio')

% Figure: fraction of non-Euclidean subsets, plotted up to one position
% past the first entry that equals 1.
figure;
nep = checksubeucl(d,Nsubeucl);
n = min(find(nep==1));
% NOTE(review): when no entry equals 1 only the first value is plotted;
% this is kept from the original -- confirm it is intended.
if isempty(n), n = 0; end
last = min(n+1,numel(nep));   % n+1 would exceed nep if the first 1 is its last entry
plot(nep(1:last));
linewidth(2); fontsize(fonts);
xlabel('Subset size')
ylabel('Fraction')
title('Fraction of Non-Euclidean Subsets')

% Figure: histogram of all entries of d, overlaid (in red) with the
% histogram of the within-class blocks only.
figure;
bins = [0:0.05:ceil(max(+d(:)*20))/20];
hist(+d(:),bins);
dc = zeros(sum(c.clsizes.^2),1);
n = 0;
for j=1:nclass
    nn = c.clsizes(j)^2;
    dj = +selcdat(d,j);
    dc(n+1:n+nn) = dj(:);
    n = n+nn;
end
hold on
hist(+dc(:),bins);
h = get(gca,'Children');
set(h(1),'facecolor',[1 0 0]);   % h(1) is the most recently added child
V = axis;
axis([-0.1 max(+d(:)) 0 V(4)]);
fontsize(fonts)
legend('between class','within class');
title('Histogram of normalized distances')

showfigs


function set_size_axis(m)
% Clip the x-axis of the current axes at m+1 and label it with those of
% the ticks 1, 10, 100, 1000 that do not exceed m.

V = axis; V(2) = m+1; axis(V);
ticks = [1 10 100 1000];
ticks = ticks(ticks <= m);
set(gca,'xtick',ticks);
set(gca,'xticklabel',ticks);
|
---|
| 251 |
|
---|
| 252 |
|
---|
%FSAVE Save current figure as eps and fig
%
% FSAVE <dir,fig_name>
%
% The current figure is temporarily resized to 900 x 600 pixels for the
% export; its original units and position are put back afterwards.

function fsave(datdir,file)

file = fullfile(datdir,file);

% Remember units and position together, then switch to the export layout.
% Units are set before position so the position is interpreted in pixels.
saved = get(gcf,{'units','position'});
set(gcf,'units','pixels','position',[1 1 900 600]);

exportfig(gcf,file,'format','eps','preview','tiff','color','cmyk')
%exportfig(gcf,file,'format','png','color','cmyk')
%exportfig(gcf,file,'format','jpeg100','color','cmyk')
saveas(gcf,file,'fig')

% Restore in the same order: units first, then the position saved in them.
set(gcf,{'units','position'},saved);

return
|
---|
| 274 |
|
---|