[10] | 1 | %PROTSELFD Forward Prototype Selection for Dissimilarity Matrices
|
---|
| 2 | %
|
---|
| 3 | % [W,E,KOPT] = PROTSELFD(D,K,PAR)
|
---|
| 4 | % W = D*PROTSELFD([],K,PAR)
|
---|
| 5 | %
|
---|
| 6 | % INPUT
|
---|
[30] | 7 | % D Dataset, dissimilarity matrix
|
---|
[10] | 8 | % K Integer, desired number of prototypes
|
---|
[30] | 9 | % PAR 'SUPER' supervised selection using 1NN error on prototypes.
|
---|
| 10 | % 'LOO' - supervised selection using leave-one-out error estimation.
|
---|
| 11 | % 'MAXDIST' - unsupervised selection minimizing the maximum
|
---|
| 12 | % distance to the nearest prototype.
|
---|
| 13 | % 'MEANDIST' - unsupervised selection minimizing the average
|
---|
| 14 | % distance to the nearest prototype.
|
---|
[10] | 15 | %
|
---|
| 16 | % OUTPUT
|
---|
[66] | 17 | % W Selection mapping ('feature selection') or prototype indices.
|
---|
[10] | 18 | % E Error stimate as a function of number of selected prototypes
|
---|
[30] | 19 | % (for supervised selection only reliable for prototype sizes >= class size)
|
---|
| 20 | % KOPT Estimate for best size in avoiding peaking
|
---|
| 21 | % (supervised selection only)
|
---|
[10] | 22 | %
|
---|
| 23 | % DESCRIPTION
|
---|
[30] | 24 | % This procedure for optimizing the representation set of a dissimilarity
|
---|
| 25 | % matrix is based on a greedy, forward selection of prototypes.
|
---|
[10] | 26 | %
|
---|
[30] | 27 | % In case of supervised selection D should be a labeled dataset with
|
---|
| 28 | % prototype labels stored as feature labels. The 1NN error to the nearest
|
---|
| 29 | % prototype is used as a criterion. In case of leave-one-out error
|
---|
| 30 | % estimation it is assumed that the first objects in D correspond with the
|
---|
| 31 | % prototypes.
|
---|
| 32 | %
|
---|
| 33 | % In case K=1 just a single prototype has to be returned, but computing the
|
---|
| 34 | % 1NN error is not possible as all objects are assigned to the same class.
|
---|
| 35 | % In that case the centre object of the largest class will be returned.
|
---|
[10] | 36 | %
|
---|
[30] | 37 | % Note that the search continues untill K prototypes are found. This might
|
---|
| 38 | % be larger than desired due to peaking (overtraining). Therefor an
|
---|
| 39 | % estimate for the optimal number of prototype is returned in KOPT.
|
---|
[10] | 40 | %
|
---|
[30] | 41 | % The prototype selection may be applied by C = B*W(:,1:KSEL), in which B
|
---|
| 42 | % is a dissimilarity matrix based on the same representation set as A (e.g.
|
---|
| 43 | % A itself) and C is a resulting dissimilarity matrix in which the KSEL
|
---|
| 44 | % (e.g. KOPT) best prototypes are selected.
|
---|
| 45 | %
|
---|
| 46 | % In case of unsupervised selection the maximum or the mean distances to
|
---|
| 47 | % the nearest prototype are minimized. These criteria are the same as used
|
---|
[66] | 48 | % in the KCENTRE and KMEDIOD cluster procedures. What is returned now in W
|
---|
| 49 | % is the (ordered) list of prototype indices and not a mapping.
|
---|
[30] | 50 | %
|
---|
[10] | 51 | % REFERENCE
|
---|
| 52 | % E. Pekalska, R.P.W. Duin, and P. Paclik, Prototype selection for
|
---|
| 53 | % dissimilarity-based classification, Pattern Recognition,
|
---|
| 54 | % vol. 39, no. 2, 2006, 189-208.
|
---|
| 55 | %
|
---|
| 56 | % SEE ALSO
|
---|
| 57 | % KNNDC, DISEX_PROTSELFD
|
---|
| 58 |
|
---|
| 59 | % Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
|
---|
| 60 | % Faculty EWI, Delft University of Technology
|
---|
| 61 | % P.O. Box 5031, 2600 GA Delft, The Netherlands
|
---|
| 62 |
|
---|
[30] | 63 | function [R,e,D] = protselfd(D,ksel,type)
|
---|
[10] | 64 |
|
---|
[30] | 65 | if nargin < 2, ksel = []; end
|
---|
| 66 | if nargin < 3, type = []; end
|
---|
[10] | 67 |
|
---|
[30] | 68 | if nargin < 1 || isempty(D) % allow for D*protselfd([],pars)
|
---|
| 69 | R = mapping(mfilename,'untrained',{ksel,type});
|
---|
[10] | 70 | R = setname(R,'Forward Prototype Sel');
|
---|
| 71 | return
|
---|
| 72 | end
|
---|
| 73 |
|
---|
[30] | 74 | switch lower(type)
|
---|
| 75 | case {'loo','LOO','super','SUPER','',''}
|
---|
| 76 | [R,e,D,J,nlab,clab] = protselfd(D,ksel,type);
|
---|
| 77 | case {'maxdist','meandist'}
|
---|
[66] | 78 | [R,e] = protselfd_unsuper(D,ksel,type);
|
---|
[30] | 79 | otherwise
|
---|
| 80 | error('Unknown selection type')
|
---|
| 81 | end
|
---|
| 82 |
|
---|
| 83 | return
|
---|
| 84 |
|
---|
| 85 |
|
---|
| 86 | function [R,e,D,J,nlab,clab] = protselfd_super_init(D,ksel,par)
|
---|
| 87 | % this routine takes care of the initialisation of supervised selection
|
---|
| 88 |
|
---|
| 89 | isdataset(D);
|
---|
[10] | 90 | [m,k,c] = getsize(D);
|
---|
| 91 | if isempty(ksel), ksel = k; end
|
---|
| 92 | if strcmp(par,'loo') | strcmp(par,'LOO')
|
---|
| 93 | if k > m
|
---|
| 94 | error('More rows than columns expected for dissimilarity matrix')
|
---|
| 95 | end
|
---|
| 96 | discheck(D(1:k,:));
|
---|
| 97 | D(1:k,:) = D(1:k,:) + 1e100*eye(k); % get rid of diagonal for LOO
|
---|
| 98 | end
|
---|
| 99 |
|
---|
| 100 | %Initialise by the centre of the largest class
|
---|
| 101 | cc = classsizes(D);
|
---|
| 102 | [cmax,n] = max(cc); % n is the largest class
|
---|
| 103 | lablist = getlablist(D);
|
---|
| 104 | nlab = getnlab(D);
|
---|
| 105 | clab = renumlab(getfeatlab(D),lablist);
|
---|
| 106 | R = find(nlab == n);
|
---|
| 107 | C = find(clab == n);
|
---|
| 108 | dd = +D(R,C);
|
---|
| 109 | [dmin,rmin] = sort(dd,1); % find one but most remote object
|
---|
| 110 | [dmin,cmin] = min(dmin(end-1,:)); % find prototype for which this is minimum
|
---|
| 111 | R = C(cmin);
|
---|
| 112 |
|
---|
| 113 | e = zeros(1,ksel);
|
---|
| 114 | [nlab,clab] = renumlab(getlabels(D),getfeatlab(D));
|
---|
| 115 | [dd,J] = min(+D(:,R),[],2);
|
---|
| 116 | e(1) = sum(clab(R(J)) ~= nlab);
|
---|
| 117 |
|
---|
| 118 | if ksel > 1
|
---|
| 119 | % this will be a deep recursive call !!!
|
---|
| 120 | prwaitbar(ksel,'Forward prototype selection')
|
---|
[30] | 121 | [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab);
|
---|
[10] | 122 | prwaitbar(0);
|
---|
| 123 | end
|
---|
| 124 | e = e(1:length(+R))/m;
|
---|
| 125 | R = featsel(k,R);
|
---|
| 126 |
|
---|
| 127 | % Find optimal number of prototypes in avoiding peaking
|
---|
| 128 |
|
---|
| 129 | Jopt = find(e==min(e));
|
---|
| 130 | D = floor((Jopt(end)+Jopt(1))/2);
|
---|
| 131 |
|
---|
[30] | 132 | return
|
---|
[10] | 133 |
|
---|
[30] | 134 | function [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab)
|
---|
| 135 |
|
---|
[10] | 136 | [m,k,c] = getsize(D);
|
---|
| 137 | d = +D;
|
---|
| 138 | S = [1:k]; % all candidates
|
---|
| 139 | S(R) = []; % exclude ones we have
|
---|
| 140 | emin = inf;
|
---|
| 141 | dmin = inf;
|
---|
| 142 | r = length(R);
|
---|
| 143 | prwaitbar(ksel,r);
|
---|
| 144 | for j=S % run over all candidates left
|
---|
| 145 | % the following tricky statements finds the nearest neighobor indices n
|
---|
| 146 | % for all objects to their nearest prototype (n=1) or the candidate
|
---|
| 147 | % prototype (n=2). In ds the minimum distances are stored and used for
|
---|
| 148 | % solving ties later.
|
---|
| 149 | [ds,n] = min([d(m*(R(J')'-1)+[1:m]'),d(:,j)],[],2);
|
---|
| 150 | % the labels of the nearest prototypes and the candidates
|
---|
| 151 | cclab = [clab(R(J)') repmat(clab(j),m,1)];
|
---|
| 152 | % compute the nearest neighbor error using the computed n
|
---|
| 153 | ee = sum(cclab(m*(n-1)+[1:m]') ~= nlab);
|
---|
| 154 | de = sum(ds);
|
---|
| 155 | % if better, use it
|
---|
[30] | 156 | if ee < emin || ((ee == emin) && (de < dmin))
|
---|
[10] | 157 | emin = ee;
|
---|
| 158 | jmin = j;
|
---|
| 159 | JJ = [J repmat(r+1,m,1)];
|
---|
| 160 | Jmin = JJ(m*(n-1)+[1:m]');
|
---|
| 161 | Rmin = [R jmin];
|
---|
| 162 | dmin = de;
|
---|
| 163 | end
|
---|
| 164 | end
|
---|
| 165 |
|
---|
[30] | 166 | if emin <= e(r) || 1 % we even continue if emin increases due to peaking
|
---|
[10] | 167 | e(r+1) = emin;
|
---|
| 168 | R = Rmin;
|
---|
| 169 | if (r+1) < ksel
|
---|
[30] | 170 | [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,Jmin,e,nlab,clab);
|
---|
[10] | 171 | end
|
---|
| 172 | end
|
---|
| 173 |
|
---|
[30] | 174 | return
|
---|
| 175 |
|
---|
| 176 | %PROTSELFD_UNSUPER Forward prototype selection
|
---|
| 177 | %
|
---|
| 178 | % N = PROTSELFD_UNSUPER(D,P,CRIT)
|
---|
| 179 | %
|
---|
| 180 | % INPUT
|
---|
| 181 | % D Square dissimilarity matrix, zeros on diagonal
|
---|
| 182 | % P Number of prototypes to be selected
|
---|
[66] | 183 | % CRIT 'maxdist' or 'meandist'
|
---|
[30] | 184 | %
|
---|
| 185 | % OUTPUT
|
---|
| 186 | % N Indices of selected prototypes
|
---|
| 187 | %
|
---|
| 188 | % DESCRIPTION
|
---|
| 189 | % Sort objects given by square dissim matrix D using a greedy approach
|
---|
| 190 | % such that the maximum NN distance from all objects (prototypes)
|
---|
| 191 | % to the first K: max(min(D(:,N(1:K),[],2)) is minimized.
|
---|
| 192 | %
|
---|
| 193 | % This routines tries to sample the objects such that they are evenly
|
---|
| 194 | % spaced judged from their dissimilarities. This may be used as
|
---|
| 195 | % initialisation in KCENTRES. It works reasonably, but not very good.
|
---|
| 196 | %
|
---|
| 197 | % SEE ALSO
|
---|
[66] | 198 | % KCENTRES, KMEDIODS
|
---|
[30] | 199 |
|
---|
| 200 | % Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
|
---|
| 201 | % Faculty EWI, Delft University of Technology
|
---|
| 202 | % P.O. Box 5031, 2600 GA Delft, The Netherlands
|
---|
| 203 |
|
---|
[66] | 204 | function [N,e] = protselfd_unsuper(d,p,crit)
|
---|
[30] | 205 |
|
---|
| 206 | d = +d;
|
---|
| 207 | [m,k] = size(d);
|
---|
| 208 | if isempty(crit), crit = 'max'; end
|
---|
| 209 | if nargin < 2 || isempty(p), p = k; end
|
---|
| 210 | L = 1:k;
|
---|
| 211 | N = zeros(1,p);
|
---|
| 212 | switch crit
|
---|
| 213 | case 'maxdist'
|
---|
| 214 | [~,n] = min(max(d)); % this is the first (central) prototype
|
---|
| 215 | case 'meandist'
|
---|
| 216 | [~,n] = min(mean(d)); % this is the first (central) prototype
|
---|
[10] | 217 | end
|
---|
[30] | 218 | e = d(:,n); % store here the distances to the nearest prototype (dNNP)
|
---|
| 219 | f = min(d,repmat(e,1,k)); % replace distances that are larger than dNNP by dNNP
|
---|
| 220 | N(1) = n; % ranking of selected prototypes
|
---|
| 221 | L(n) = []; % candidate prototypes (all not yet selected objects)
|
---|
| 222 |
|
---|
| 223 | for j=2:p % extend prototype set
|
---|
| 224 | switch crit % select the next prototype out of candidates in L
|
---|
| 225 | case 'maxdist'
|
---|
| 226 | [~,n] = min(max(f(:,L)));
|
---|
| 227 | case 'meandist'
|
---|
| 228 | [~,n] = min(mean(f(:,L)));
|
---|
| 229 | end
|
---|
| 230 | e = min([d(:,L(n)) e],[],2); % update dNNP
|
---|
| 231 | f = min(d,repmat(e,1,k)); % update replacement of distances that are larger
|
---|
| 232 | % than dNNP by dNNP
|
---|
| 233 | N(j) = L(n); % update list of selected prototypes
|
---|
| 234 | L(n) = []; % update list of candidate prototypes
|
---|
| 235 | end
|
---|
| 236 |
|
---|