source: distools/protselfd.m @ 40

Last change on this file since 40 was 30, checked in by bduin, 12 years ago

unsupervised criteria included in protselfd

File size: 8.0 KB
Line 
1%PROTSELFD Forward Prototype Selection for Dissimilarity Matrices
2%
3%   [W,E,KOPT] = PROTSELFD(D,K,PAR)
4%    W  = D*PROTSELFD([],K,PAR)
5%
6% INPUT
7%   D     Dataset, dissimilarity matrix
8%   K     Integer, desired number of prototypes
9%   PAR   'SUPER' supervised selection using 1NN error on prototypes.
10%         'LOO' - supervised selection using leave-one-out error estimation.
11%         'MAXDIST' - unsupervised selection minimizing the maximum
12%                     distance to the nearest prototype.
13%         'MEANDIST' - unsupervised selection minimizing the average
14%                      distance to the nearest prototype.
15%
16% OUTPUT
17%   W     Selection mapping ('feature selection')
18%   E     Error stimate as a function of number of selected prototypes
19%         (for supervised selection only reliable for prototype sizes >= class size)
20%   KOPT  Estimate for best size in avoiding peaking
21%         (supervised selection only)
22%
23% DESCRIPTION
24% This procedure for optimizing the representation set of a dissimilarity
25% matrix is based on a greedy, forward selection of prototypes.
26%
27% In case of supervised selection D should be a labeled dataset with
28% prototype labels stored as feature labels. The 1NN error to the nearest
29% prototype is used as a criterion. In case of leave-one-out error
30% estimation it is assumed that the first objects in D correspond with the
31% prototypes.
32%
33% In case K=1 just a single prototype has to be returned, but computing the
34% 1NN error is not possible as all objects are assigned to the same class.
35% In that case the centre object of the largest class will be returned.
36%
37% Note that the search continues untill K prototypes are found. This might
38% be larger than desired due to peaking (overtraining). Therefor an
39% estimate for the optimal number of prototype is returned in KOPT.
40%
41% The prototype selection may be applied by C = B*W(:,1:KSEL), in which B
42% is a dissimilarity matrix based on the same representation set as A (e.g.
43% A itself) and C is a resulting dissimilarity matrix in which the KSEL
44% (e.g. KOPT) best prototypes are selected.
45%
46% In case of unsupervised selection the maximum or the mean distances to
47% the nearest prototype are minimized. These criteria are the same as used
48% in the KCENTRE and KMEDIOD cluster procedures.
49%
50% REFERENCE
51% E. Pekalska, R.P.W. Duin, and P. Paclik, Prototype selection for
52% dissimilarity-based classification, Pattern Recognition,
53% vol. 39, no. 2, 2006, 189-208.
54%
55% SEE ALSO
56% KNNDC, DISEX_PROTSELFD
57
58% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
59% Faculty EWI, Delft University of Technology
60% P.O. Box 5031, 2600 GA Delft, The Netherlands
61
62%
63
64function [R,e,D] = protselfd(D,ksel,type)
65
66  if nargin < 2, ksel = []; end
67  if nargin < 3, type = []; end
68
69  if nargin < 1 || isempty(D)  % allow for D*protselfd([],pars)
70    R = mapping(mfilename,'untrained',{ksel,type});
71    R = setname(R,'Forward Prototype Sel');
72    return
73  end
74 
75  switch lower(type)
76    case {'loo','LOO','super','SUPER','',''}
77      [R,e,D,J,nlab,clab] = protselfd(D,ksel,type);
78    case {'maxdist','meandist'}
79      R = protselfd_unsuper(D,ksel,type);
80    otherwise
81      error('Unknown selection type')
82  end
83     
84return 
85
86
87function [R,e,D,J,nlab,clab] = protselfd_super_init(D,ksel,par)
88% this routine takes care of the initialisation of supervised selection
89
90  isdataset(D);
91  [m,k,c] = getsize(D);
92  if isempty(ksel), ksel = k; end
93  if strcmp(par,'loo') | strcmp(par,'LOO')
94    if k > m
95      error('More rows than columns expected for dissimilarity matrix')
96    end
97    discheck(D(1:k,:));
98    D(1:k,:) = D(1:k,:) + 1e100*eye(k); % get rid of diagonal for LOO
99  end
100 
101  %Initialise by the centre of the largest class
102  cc = classsizes(D);
103  [cmax,n] = max(cc); % n is the largest class
104  lablist = getlablist(D);
105  nlab = getnlab(D);
106  clab = renumlab(getfeatlab(D),lablist);
107  R = find(nlab == n);
108  C = find(clab == n);
109  dd = +D(R,C);
110  [dmin,rmin] = sort(dd,1); % find one but most remote object
111  [dmin,cmin] = min(dmin(end-1,:)); % find prototype for which this is minimum
112  R = C(cmin);
113 
114  e = zeros(1,ksel);
115  [nlab,clab] = renumlab(getlabels(D),getfeatlab(D));
116  [dd,J] = min(+D(:,R),[],2);
117  e(1) = sum(clab(R(J)) ~= nlab);
118 
119        if ksel > 1
120                % this will be a deep recursive call !!!
121                prwaitbar(ksel,'Forward prototype selection')
122                [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab);
123                prwaitbar(0);
124        end
125  e = e(1:length(+R))/m;
126  R = featsel(k,R);
127 
128  % Find optimal number of prototypes in avoiding peaking
129 
130  Jopt = find(e==min(e));
131  D = floor((Jopt(end)+Jopt(1))/2);
132 
133return
134 
135function [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab)
136
137  [m,k,c] = getsize(D);
138  d = +D;
139  S = [1:k];   % all candidates
140  S(R) = [];   % exclude ones we have
141  emin = inf;
142  dmin = inf;
143  r = length(R);
144  prwaitbar(ksel,r);
145  for j=S      % run over all candidates left
146                % the following tricky statements finds the nearest neighobor indices n
147                % for all objects to their nearest prototype (n=1) or the candidate
148                % prototype (n=2). In ds the minimum distances are stored and used for
149                % solving ties later.
150    [ds,n] = min([d(m*(R(J')'-1)+[1:m]'),d(:,j)],[],2);
151                % the labels of the nearest prototypes and the candidates
152    cclab = [clab(R(J)') repmat(clab(j),m,1)];
153                % compute the nearest neighbor error using the computed n
154    ee = sum(cclab(m*(n-1)+[1:m]') ~= nlab);
155    de = sum(ds);
156                % if better, use it
157    if ee < emin || ((ee == emin) && (de < dmin))
158      emin = ee;
159      jmin = j;
160      JJ = [J repmat(r+1,m,1)];
161      Jmin = JJ(m*(n-1)+[1:m]');
162      Rmin = [R jmin];
163      dmin = de;
164    end
165  end
166
167  if emin <= e(r) || 1 % we even continue if emin increases due to peaking
168    e(r+1) = emin;
169    R = Rmin;
170    if (r+1) < ksel
171                        [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,Jmin,e,nlab,clab);
172    end
173  end
174 
175return
176
177%PROTSELFD_UNSUPER Forward prototype selection
178%
179%               N = PROTSELFD_UNSUPER(D,P,CRIT)
180%
181% INPUT
182%   D     Square dissimilarity matrix, zeros on diagonal
183%   P     Number of prototypes to be selected
184%   CRIT  'dist' or 'centre'
185%
186% OUTPUT
187%   N     Indices of selected prototypes
188%
189% DESCRIPTION
190% Sort objects given by square dissim matrix D using a greedy approach
191% such that the maximum NN distance from all objects (prototypes)
192% to the first K: max(min(D(:,N(1:K),[],2)) is minimized.
193%
194% This routines tries to sample the objects such that they are evenly
195% spaced judged from their dissimilarities. This may be used as
196% initialisation in KCENTRES. It works reasonably, but not very good.
197%
198% SEE ALSO
199% KCENTRES
200
201% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
202% Faculty EWI, Delft University of Technology
203% P.O. Box 5031, 2600 GA Delft, The Netherlands
204
205function N = protselfd_unsuper(d,p,crit)
206
207d = +d;
208[m,k] = size(d);
209if isempty(crit), crit = 'max'; end
210if nargin < 2 || isempty(p), p = k; end
211L = 1:k;
212N = zeros(1,p);
213switch crit
214  case 'maxdist'
215    [~,n] = min(max(d));    % this is the first (central) prototype
216  case 'meandist'
217    [~,n] = min(mean(d));   % this is the first (central) prototype
218end
219e = d(:,n);                 % store here the distances to the nearest prototype (dNNP)
220f = min(d,repmat(e,1,k));   % replace distances that are larger than dNNP by dNNP
221N(1) = n;                   % ranking of selected prototypes
222L(n) = [];                  % candidate prototypes (all not yet selected objects)
223
224for j=2:p                   % extend prototype set
225  switch crit               % select the next prototype out of candidates in L
226    case 'maxdist'
227      [~,n] = min(max(f(:,L)));
228    case 'meandist'
229      [~,n] = min(mean(f(:,L)));
230  end
231  e = min([d(:,L(n)) e],[],2);   % update dNNP
232  f = min(d,repmat(e,1,k));      % update replacement of distances that are larger
233                                 % than dNNP by dNNP
234  N(j) = L(n);                   % update list of selected prototypes
235  L(n) = [];                     % update list of candidate prototypes
236end
237
Note: See TracBrowser for help on using the repository browser.