source: distools/protselfd.m @ 145

Last change on this file since 145 was 79, checked in by bduin, 11 years ago
File size: 8.2 KB
RevLine 
[10]1%PROTSELFD Forward Prototype Selection for Dissimilarity Matrices
2%
3%   [W,E,KOPT] = PROTSELFD(D,K,PAR)
4%    W  = D*PROTSELFD([],K,PAR)
5%
6% INPUT
[30]7%   D     Dataset, dissimilarity matrix
[10]8%   K     Integer, desired number of prototypes
[30]9%   PAR   'SUPER' supervised selection using 1NN error on prototypes.
10%         'LOO' - supervised selection using leave-one-out error estimation.
11%         'MAXDIST' - unsupervised selection minimizing the maximum
12%                     distance to the nearest prototype.
13%         'MEANDIST' - unsupervised selection minimizing the average
14%                      distance to the nearest prototype.
[10]15%
16% OUTPUT
[66]17%   W     Selection mapping ('feature selection') or prototype indices.
[10]18%   E     Error stimate as a function of number of selected prototypes
[30]19%         (for supervised selection only reliable for prototype sizes >= class size)
20%   KOPT  Estimate for best size in avoiding peaking
21%         (supervised selection only)
[10]22%
23% DESCRIPTION
[30]24% This procedure for optimizing the representation set of a dissimilarity
25% matrix is based on a greedy, forward selection of prototypes.
[10]26%
[30]27% In case of supervised selection D should be a labeled dataset with
28% prototype labels stored as feature labels. The 1NN error to the nearest
29% prototype is used as a criterion. In case of leave-one-out error
30% estimation it is assumed that the first objects in D correspond with the
31% prototypes.
32%
33% In case K=1 just a single prototype has to be returned, but computing the
34% 1NN error is not possible as all objects are assigned to the same class.
35% In that case the centre object of the largest class will be returned.
[10]36%
[30]37% Note that the search continues untill K prototypes are found. This might
38% be larger than desired due to peaking (overtraining). Therefor an
39% estimate for the optimal number of prototype is returned in KOPT.
[10]40%
[30]41% The prototype selection may be applied by C = B*W(:,1:KSEL), in which B
42% is a dissimilarity matrix based on the same representation set as A (e.g.
43% A itself) and C is a resulting dissimilarity matrix in which the KSEL
44% (e.g. KOPT) best prototypes are selected.
45%
46% In case of unsupervised selection the maximum or the mean distances to
47% the nearest prototype are minimized. These criteria are the same as used
[66]48% in the KCENTRE and KMEDIOD cluster procedures. What is returned now in W
49% is the (ordered) list of prototype indices and not a mapping.
[30]50%
[10]51% REFERENCE
52% E. Pekalska, R.P.W. Duin, and P. Paclik, Prototype selection for
53% dissimilarity-based classification, Pattern Recognition,
54% vol. 39, no. 2, 2006, 189-208.
55%
56% SEE ALSO
57% KNNDC, DISEX_PROTSELFD
58
59% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
60% Faculty EWI, Delft University of Technology
61% P.O. Box 5031, 2600 GA Delft, The Netherlands
62
[30]63function [R,e,D] = protselfd(D,ksel,type)
[10]64
[30]65  if nargin < 2, ksel = []; end
66  if nargin < 3, type = []; end
[10]67
[30]68  if nargin < 1 || isempty(D)  % allow for D*protselfd([],pars)
[79]69    R = prmapping(mfilename,'untrained',{ksel,type});
[10]70    R = setname(R,'Forward Prototype Sel');
71    return
72  end
73 
[30]74  switch lower(type)
75    case {'loo','LOO','super','SUPER','',''}
76      [R,e,D,J,nlab,clab] = protselfd(D,ksel,type);
77    case {'maxdist','meandist'}
[66]78      [R,e] = protselfd_unsuper(D,ksel,type);
[30]79    otherwise
80      error('Unknown selection type')
81  end
82     
83return 
84
85
86function [R,e,D,J,nlab,clab] = protselfd_super_init(D,ksel,par)
87% this routine takes care of the initialisation of supervised selection
88
89  isdataset(D);
[10]90  [m,k,c] = getsize(D);
91  if isempty(ksel), ksel = k; end
92  if strcmp(par,'loo') | strcmp(par,'LOO')
93    if k > m
94      error('More rows than columns expected for dissimilarity matrix')
95    end
96    discheck(D(1:k,:));
97    D(1:k,:) = D(1:k,:) + 1e100*eye(k); % get rid of diagonal for LOO
98  end
99 
100  %Initialise by the centre of the largest class
101  cc = classsizes(D);
102  [cmax,n] = max(cc); % n is the largest class
103  lablist = getlablist(D);
104  nlab = getnlab(D);
105  clab = renumlab(getfeatlab(D),lablist);
106  R = find(nlab == n);
107  C = find(clab == n);
108  dd = +D(R,C);
109  [dmin,rmin] = sort(dd,1); % find one but most remote object
110  [dmin,cmin] = min(dmin(end-1,:)); % find prototype for which this is minimum
111  R = C(cmin);
112 
113  e = zeros(1,ksel);
114  [nlab,clab] = renumlab(getlabels(D),getfeatlab(D));
115  [dd,J] = min(+D(:,R),[],2);
116  e(1) = sum(clab(R(J)) ~= nlab);
117 
118        if ksel > 1
119                % this will be a deep recursive call !!!
120                prwaitbar(ksel,'Forward prototype selection')
[30]121                [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab);
[10]122                prwaitbar(0);
123        end
124  e = e(1:length(+R))/m;
125  R = featsel(k,R);
126 
127  % Find optimal number of prototypes in avoiding peaking
128 
129  Jopt = find(e==min(e));
130  D = floor((Jopt(end)+Jopt(1))/2);
131 
[30]132return
[10]133 
[30]134function [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab)
135
[10]136  [m,k,c] = getsize(D);
137  d = +D;
138  S = [1:k];   % all candidates
139  S(R) = [];   % exclude ones we have
140  emin = inf;
141  dmin = inf;
142  r = length(R);
143  prwaitbar(ksel,r);
144  for j=S      % run over all candidates left
145                % the following tricky statements finds the nearest neighobor indices n
146                % for all objects to their nearest prototype (n=1) or the candidate
147                % prototype (n=2). In ds the minimum distances are stored and used for
148                % solving ties later.
149    [ds,n] = min([d(m*(R(J')'-1)+[1:m]'),d(:,j)],[],2);
150                % the labels of the nearest prototypes and the candidates
151    cclab = [clab(R(J)') repmat(clab(j),m,1)];
152                % compute the nearest neighbor error using the computed n
153    ee = sum(cclab(m*(n-1)+[1:m]') ~= nlab);
154    de = sum(ds);
155                % if better, use it
[30]156    if ee < emin || ((ee == emin) && (de < dmin))
[10]157      emin = ee;
158      jmin = j;
159      JJ = [J repmat(r+1,m,1)];
160      Jmin = JJ(m*(n-1)+[1:m]');
161      Rmin = [R jmin];
162      dmin = de;
163    end
164  end
165
[30]166  if emin <= e(r) || 1 % we even continue if emin increases due to peaking
[10]167    e(r+1) = emin;
168    R = Rmin;
169    if (r+1) < ksel
[30]170                        [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,Jmin,e,nlab,clab);
[10]171    end
172  end
173 
[30]174return
175
176%PROTSELFD_UNSUPER Forward prototype selection
177%
178%               N = PROTSELFD_UNSUPER(D,P,CRIT)
179%
180% INPUT
181%   D     Square dissimilarity matrix, zeros on diagonal
182%   P     Number of prototypes to be selected
[66]183%   CRIT  'maxdist' or 'meandist'
[30]184%
185% OUTPUT
186%   N     Indices of selected prototypes
187%
188% DESCRIPTION
189% Sort objects given by square dissim matrix D using a greedy approach
190% such that the maximum NN distance from all objects (prototypes)
191% to the first K: max(min(D(:,N(1:K),[],2)) is minimized.
192%
193% This routines tries to sample the objects such that they are evenly
194% spaced judged from their dissimilarities. This may be used as
195% initialisation in KCENTRES. It works reasonably, but not very good.
196%
197% SEE ALSO
[66]198% KCENTRES, KMEDIODS
[30]199
200% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
201% Faculty EWI, Delft University of Technology
202% P.O. Box 5031, 2600 GA Delft, The Netherlands
203
[66]204function [N,e] = protselfd_unsuper(d,p,crit)
[30]205
206d = +d;
207[m,k] = size(d);
208if isempty(crit), crit = 'max'; end
209if nargin < 2 || isempty(p), p = k; end
210L = 1:k;
211N = zeros(1,p);
212switch crit
213  case 'maxdist'
214    [~,n] = min(max(d));    % this is the first (central) prototype
215  case 'meandist'
216    [~,n] = min(mean(d));   % this is the first (central) prototype
[10]217end
[30]218e = d(:,n);                 % store here the distances to the nearest prototype (dNNP)
219f = min(d,repmat(e,1,k));   % replace distances that are larger than dNNP by dNNP
220N(1) = n;                   % ranking of selected prototypes
221L(n) = [];                  % candidate prototypes (all not yet selected objects)
222
223for j=2:p                   % extend prototype set
224  switch crit               % select the next prototype out of candidates in L
225    case 'maxdist'
226      [~,n] = min(max(f(:,L)));
227    case 'meandist'
228      [~,n] = min(mean(f(:,L)));
229  end
230  e = min([d(:,L(n)) e],[],2);   % update dNNP
231  f = min(d,repmat(e,1,k));      % update replacement of distances that are larger
232                                 % than dNNP by dNNP
233  N(j) = L(n);                   % update list of selected prototypes
234  L(n) = [];                     % update list of candidate prototypes
235end
236
Note: See TracBrowser for help on using the repository browser.