%CHARDMAT Characterization of a square, labeled dissimilarity matrix
%
%   [C,D_OUT] = CHARDMAT(D,Ntr,Nsubeucl)
%
% Characterizes a square (dis)similarity dataset D.
% D_OUT is the symmetric, normalized dissimilarity dataset D. If D is
% a similarity dataset it is converted to dissimilarities first.
% The following fields are returned in the structure C.
%   name        - dataset name as stored by PRTools or read command
%   desc        - dataset description as stored by read command
%   link        - web links as stored by read command
%   ref         - references as stored by read command
%   asym        - asymmetry, 2*|D-D'|./(|D|+|D'|)
%   size        - number of objects
%   classes     - number of classes
%   clsizes     - vector with class sizes
%   type        - 'dis' for dissimilarities, 'sim' for similarities
%
%   all following items are computed for a transformation
%   of D by MAKESYM and DISNORM (make average distance 1),
%   similarities are first transformed into dissimilarities
%   by d(i,j) = sqrt(d(i,i) + d(j,j) - d(i,j) - d(j,i))
%
%   within_mean - average within class dissimilarity
%   between_mean- average between class dissimilarity
%   pe_mapping  - Pseudo-Euclidean mapping as computed by PE_EM
%   signature   - 2 component vector with # of positive and negative
%                 eigenvalues obtained during the PE embedding
%   eigenvalues - the eigenvalues obtained during the PE embedding,
%                 see PE_EM for their ranking
%   nef         - Negative Eigen Fraction (sum of absolute negative
%                 eigenvalues divided by sum of all absolute eigenvalues)
%   ner         - Negative Eigen Ratio (- largest negative eigenvalue
%                 divided by largest positive eigenvalue)
%   trineq      - fraction of triangle inequality violations
%
%   the following characteristics refer to a set of five spaces:
%   - Pseudo-Euclidean space based on a full embedding. Distances
%     in this space are identical to D.
%   - Associated space, the same vector space, but now treated as
%     a Euclidean space
%   - Positive space based on the positive eigenvalues only
%   - Negative space based on the negative eigenvalues only
%   - Corrected space based on an embedding of sqrt(D.^2+2*Lmin)
%     in which Lmin is the absolute value of the largest negative
%     eigenvalue. The result is a proper Euclidean space.
%
%   loo_a       - leave-one-out nearest neighbor errors for all five
%                 embedded spaces
%   loo_d       - leave-one-out nearest neighbor errors for the dissimilarity
%                 spaces related to the above five embedded spaces
%   lcurve_a    - nearest neighbor learning curves for the five embedded
%                 spaces
%   lcurve_d    - nearest neighbor learning curves for the five dissimilarity
%                 spaces
%   anames      - names of the five embedded spaces, useful for annotation
%   dnames      - names of the five dissimilarity spaces
%
% Ntr (default 200) and Nsubeucl (default 50) control numbers of trials to
% estimate the fraction of triangle violations and the accuracy of the
% subeuclidean curves.
function [c,d] = chardmat(d,Ntr,Nsubeucl,makefigs)

  % Defaults for omitted or empty arguments. '||' short-circuits, so an
  % argument that was not passed is never evaluated.
  if nargin < 4, makefigs = 0; end
  if nargin < 3 || isempty(Nsubeucl), Nsubeucl = 50; end
  if nargin < 2 || isempty(Ntr), Ntr = 200; end

  % Called without output argument.
  % NOTE(review): presumably raises an error if d is not a dataset -- confirm.
  isdataset(d);
  datname = getname(d);
  % NOTE(review): presumably validates d as a square (dis)similarity
  % matrix; the meaning of the third argument is not visible here -- confirm.
  discheck(d,[],1);

  m      = size(d,1);       % number of objects
  nclass = getsize(d,3);    % number of classes

  % Descriptive fields, taken from the dataset as it was passed in.
  c.name    = datname;
  c.desc    = getuser(d,'desc');
  c.link    = getuser(d,'link');
  c.ref     = getuser(d,'ref');
  c.asym    = asymmetry(d);
  c.size    = m;
  c.classes = nclass;
  c.clsizes = classsizes(d);

  if discheck(d)
    c.type = 'dis';
  else
    c.type = 'sim';
    d = dissimt(d,'sim2dis');
  end

  % we now have a dissimilarity matrix with positive distances
  d = makesym(d);           % make it symmetric now
  d = d*disnorm(d);

  % Average within-class dissimilarity per class. A class with fewer than
  % two objects has no object pairs: leave its average at 0 instead of
  % computing 0/0 = NaN, which would otherwise make both class means NaN.
  uc = zeros(1,nclass);
  for j = 1:c.classes
    nj = c.clsizes(j);
    if nj > 1
      dj = +selcdat(d,j);
      uc(j) = sum(dj(:))/(nj*(nj-1));
    end
  end

  % Number of ordered within-class pairs per class (column vector).
  npairs = c.clsizes'.^2 - c.clsizes';
  c.within_mean  = uc*npairs/(sum(c.clsizes.^2) - m);
  % The total of all off-diagonal dissimilarities is taken as m*(m-1) here
  % (average distance 1 after the normalization above); the remainder after
  % removing the within-class total is divided by the between-class pairs.
  ud = (m*(m-1) - uc*npairs) / (m*(m-1) - sum(c.clsizes.^2) + m);
  c.between_mean = ud;

  % Pseudo-Euclidean embedding and the measures derived from it.
  [nef,ner,w]   = checkeucl(d);
  c.pe_mapping  = w;
  c.signature   = getsig(w);
  c.eigenvalues = getdata(w,'eval');
  c.nef         = nef;
  c.ner         = ner;
  % Use the caller-supplied number of trials (was hard-coded to 200).
  c.trineq      = checktr(d,Ntr);

  [A D] = disspaces(d,w);
  nspaces = length(A);
  c.loo_a    = zeros(1,nspaces); % LOO NN errors embedding spaces
  c.loo_d    = zeros(1,nspaces); % LOO NN errors dis spaces
  c.lcurve_a = cell(1,nspaces);  % NN Learning curves embedding spaces
  c.lcurve_d = cell(1,nspaces);  % NN Learning curves dis spaces
  c.anames   = cell(1,nspaces);  % names embedded spaces
  c.dnames   = cell(1,nspaces);  % names dis spaces

  t = sprintf('Compute %i learning curves: ',nspaces*2);
  prwaitbar(nspaces*2,t);
  for j = 1:nspaces
    c.loo_a(j) = nne(D{j});
    ddj = distm(D{j});
    c.loo_d(j) = nne(ddj);
    prwaitbar(nspaces*2,j*2-1,[t int2str(j*2-1)]);
    c.lcurve_a{j} = nnerr(D{j});
    c.lcurve_a{j}.names = getname(A{j});
    c.anames{j} = getname(A{j});
    prwaitbar(nspaces*2,j*2,[t int2str(j*2)]);
    c.lcurve_d{j} = nnerr(ddj);
    c.lcurve_d{j}.names = getname(D{j});
    c.dnames{j} = getname(D{j});
  end
  prwaitbar(0);

  if makefigs
    % make_figs is not defined in the visible part of this file.
    make_figs(d,c);
  else
    show_figs(c,d,Nsubeucl)
  end

return

function show_figs(c,d,Nsubeucl)
% Plot the characterization stored in C for the normalized dissimilarity
% dataset D. Nsubeucl (default 50) is passed on to CHECKSUBEUCL.

  if nargin < 3 || isempty(Nsubeucl)
    Nsubeucl = 50;
  end
  fonts  = 12;
  m      = size(d,1);
  nclass = getsize(d,3);

  d   % no semicolon: d is displayed in the command window (kept as in the original)

  delfigs

  % Dissimilarity matrix as a gray-value image.
  figure;
  imagesc(+d);
  colormap gray
  axis off;
  axis square
  title('Dissimilarity Matrix');
  fontsize(fonts);

  % Objects mapped onto the first two components of the PE mapping.
  figure;
  scatterd(d*c.pe_mapping(:,1:2));
  title('Scatterplot on first two positive eigenvectors')
  xlabel('Eigenvector 1');
  ylabel('Eigenvector 2');
  fontsize(fonts);

  figure;
  plotspectrum(c.eigenvalues);
  fontsize(fonts);

  % Learning curves: first four embedded spaces plus the first
  % dissimilarity space.
  figure;
  plote([c.lcurve_a(1:4) c.lcurve_d(1)],[],char('k-','r-','b-','m-','k--'));
  clip_xaxis(m);
  fontsize(fonts);

  % Learning curves: first four dissimilarity spaces plus the first
  % embedded space.
  if isfield(c,'lcurve_d')
    figure;
    plote([c.lcurve_d(1:4) c.lcurve_a(1)],[],char('k--','r--','b--','m--','k-'));
    clip_xaxis(m);
    fontsize(fonts);
  end

  % Negative eigen fraction and ratio plotted against N as returned by
  % CHECKEUCL with the 'all' option.
  [nef,ner,N] = checkeucl(d,'all');

  figure;
  semilogx(N,nef);
  linewidth(2);
  fontsize(fonts);
  clip_xaxis(m);
  xlabel('Subset size')
  ylabel('Fraction')
  title('Negative Eigen Fraction')

  figure;
  semilogx(N,ner);
  linewidth(2);
  fontsize(fonts);
  clip_xaxis(m);
  xlabel('Subset size')
  ylabel('Fraction')
  title('Negative Eigen Ratio')

  % Fraction of non-Euclidean subsets, plotted up to one element past the
  % first entry that equals 1.
  figure;
  nep = checksubeucl(d,Nsubeucl);
  n = find(nep==1,1);
  if isempty(n), n = 0; end
  % Clamp the upper index: n+1 exceeds the vector when the first 1 is the
  % last element of nep.
  last = min(n+1,numel(nep));
  plot(nep(1:last));
  linewidth(2);
  fontsize(fonts);
  xlabel('Subset size')
  ylabel('Fraction')
  title('Fraction of Non-Euclidean Subsets')

  % Histogram of all dissimilarities, overlaid with the histogram of the
  % within-class dissimilarities (shown in red).
  figure;
  bins = [0:0.05:ceil(max(+d(:)*20))/20];
  hist(+d(:),bins);
  dc = zeros(sum(c.clsizes.^2),1);
  n = 0;
  for j = 1:nclass
    nn = c.clsizes(j)^2;
    dj = +selcdat(d,j);
    dc(n+1:n+nn) = dj(:);
    n = n+nn;
  end
  hold on
  hist(+dc(:),bins);
  h = get(gca,'Children');
  set(h(1),'facecolor',[1 0 0]);
  V = axis;
  axis([-0.1 max(+d(:)) 0 V(4)]);
  fontsize(fonts)
  legend('between class','within class');
  title('Histogram of normalized distances')

  showfigs

return

function clip_xaxis(m)
% Set the upper x-limit of the current axes to m+1 and place x-ticks at
% those of 1, 10, 100, 1000 that do not exceed m.

  V = axis;
  V(2) = m+1;
  axis(V);
  ticks = [1 10 100 1000];
  ticks = ticks(ticks <= m);
  set(gca,'xtick',ticks);
  set(gca,'xticklabel',ticks);

return

%FSAVE Save current figure as eps and fig
%
%   FSAVE