%PROTSELFD Forward Prototype Selection for Dissimilarity Matrices
%
% [W,E,KOPT] = PROTSELFD(D,K,PAR)
% W = D*PROTSELFD([],K,PAR)
%
% INPUT
%   D     Dataset, square dissimilarity matrix
%   K     Integer, desired number of prototypes
%   PAR   'LOO' - leave-one-out option. This should be used if
%         the objects are related to themselves. If D is not square,
%         it is assumed that the first sets of objects in columns and
%         rows match.
%         'ALL' - use all objects (default).
%
% OUTPUT
%   W     Selection mapping ('feature selection')
%   E     Error estimate as a function of the number of selected prototypes
%         (only reliable for prototype set sizes >= class size)
%   KOPT  Estimate of the best number of prototypes for avoiding peaking
%
% DESCRIPTION
% This procedure for optimizing the representation set of a
% dissimilarity matrix is based on a greedy, forward selection of
% prototypes, using the leave-one-out error estimate of the 1NN rule
% as a criterion. As this is computed on the given distances in
% D, the procedure only needs sorting and counting and is
% thereby fast. In case K=1 just a single prototype has to be returned,
% but the 1NN error cannot be computed as all objects are assigned
% to the same class. In that case the centre object of the largest class
% is returned.
%
% Note that the search continues until K prototypes are found.
% This might be more than desired due to peaking (curse of
% dimensionality, overtraining). Therefore an estimate of the
% optimal number of prototypes is returned in KOPT.
%
% The prototype selection may be applied by C = B*W(:,1:KSEL),
% in which B is a dissimilarity matrix based on the same
% representation set as D (e.g. D itself) and C is the resulting
% dissimilarity matrix in which the KSEL (e.g. KOPT) best prototypes
% are selected.
%
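% EXAMPLE
% A minimal usage sketch (assuming D is a labeled, square PRTools
% dissimilarity dataset; the chosen numbers are illustrative):
%
%   [W,E,KOPT] = protselfd(D,20,'loo'); % greedily select up to 20 prototypes
%   C = D*W(:,1:KOPT);                  % keep only the KOPT best prototypes
%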
% REFERENCE
% E. Pekalska, R.P.W. Duin, and P. Paclik, Prototype selection for
% dissimilarity-based classification, Pattern Recognition,
% vol. 39, no. 2, 2006, 189-208.
%
% SEE ALSO
% KNNDC, DISEX_PROTSELFD

% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands

%

function [R,e,D,J,nlab,clab] = protselfd(D,ksel,par,J,e,nlab,clab)

if nargin < 2, ksel = []; end
if nargin < 3 | isempty(par), par = 'all'; end

if nargin < 4   % user call

  if nargin < 1 | isempty(D)  % allow for D*protselfd([],pars)
    R = mapping(mfilename,'untrained',{ksel,par});
    R = setname(R,'Forward Prototype Sel');
    return
  end

  [m,k,c] = getsize(D);
  if isempty(ksel), ksel = k; end
  if strcmp(par,'loo') | strcmp(par,'LOO')
    if k > m
      error('More rows than columns expected for dissimilarity matrix')
    end
    discheck(D(1:k,:));
    D(1:k,:) = D(1:k,:) + 1e100*eye(k); % get rid of diagonal for LOO
  end

  % Initialise by the centre of the largest class
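  % (the centre is the object of that class whose distance to its
  % one-but-most-remote class member is smallest, see below)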
  cc = classsizes(D);
  [cmax,n] = max(cc);               % n is the largest class
  lablist = getlablist(D);
  nlab = getnlab(D);
  clab = renumlab(getfeatlab(D),lablist);
  R = find(nlab == n);
  C = find(clab == n);
  dd = +D(R,C);
  [dmin,rmin] = sort(dd,1);         % find one but most remote object
  [dmin,cmin] = min(dmin(end-1,:)); % find prototype for which this is minimum
  R = C(cmin);

  e = zeros(1,ksel);
  [nlab,clab] = renumlab(getlabels(D),getfeatlab(D));
  [dd,J] = min(+D(:,R),[],2);
  e(1) = sum(clab(R(J)) ~= nlab);

  if ksel > 1
    % this will be a deep recursive call !!!
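    % (each recursion level adds a single prototype until ksel is reached)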
    prwaitbar(ksel,'Forward prototype selection')
    [R,e,D,J,nlab,clab] = protselfd(D,ksel,R,J,e,nlab,clab);
    prwaitbar(0);
  end
  e = e(1:length(+R))/m;
  R = featsel(k,R);

  % Find the optimal number of prototypes for avoiding peaking

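  % Take the centre of the plateau of minimum error; the result is
  % returned in the third output argument (stored here in D).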
  Jopt = find(e==min(e));
  D = floor((Jopt(end)+Jopt(1))/2);

  % done!

else  % internal call, parameters may have another meaning!

  R = par;                 % prototypes found so far
  [m,k,c] = getsize(D);
  d = +D;
  S = [1:k];               % all candidates
  S(R) = [];               % exclude the ones we already have
  emin = inf;
  dmin = inf;
  r = length(R);
  prwaitbar(ksel,r);
  for j=S                  % run over all remaining candidates
    % the following tricky statement finds the nearest neighbor indices n
    % for all objects to their nearest prototype (n=1) or the candidate
    % prototype (n=2). In ds the minimum distances are stored and used for
    % solving ties later.
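    % d(m*(R(J')'-1)+[1:m]') uses linear indexing to pick d(i,R(J(i)))
    % for every object i, i.e. its distance to its current nearest prototype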
    [ds,n] = min([d(m*(R(J')'-1)+[1:m]'),d(:,j)],[],2);
    % the labels of the nearest prototypes and the candidates
    cclab = [clab(R(J)') repmat(clab(j),m,1)];
    % compute the nearest neighbor error using the computed n
    ee = sum(cclab(m*(n-1)+[1:m]') ~= nlab);
    de = sum(ds);
    % if better, use it
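    % (lower 1NN error wins; ties are broken by the smaller sum of
    % nearest neighbor distances)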
    if ee < emin | ((ee == emin) & (de < dmin))
      emin = ee;
      jmin = j;
      JJ = [J repmat(r+1,m,1)];
      Jmin = JJ(m*(n-1)+[1:m]');
      Rmin = [R jmin];
      dmin = de;
    end
  end

  if emin <= e(r) | 1   % we even continue if emin increases due to peaking
    e(r+1) = emin;
    R = Rmin;
    if (r+1) < ksel
      [R,e,D,J,nlab,clab] = protselfd(D,ksel,R,Jmin,e,nlab,clab);
    end
  end

end