source: distools/protselfd.m @ 12

Last change on this file since 12 was 10, checked in by bduin, 14 years ago
File size: 5.1 KB
RevLine 
[10]1%PROTSELFD Forward Prototype Selection for Dissimilarity Matrices
2%
3%   [W,E,KOPT] = PROTSELFD(D,K,PAR)
4%    W  = D*PROTSELFD([],K,PAR)
5%
6% INPUT
7%   D     Dataset, square dissimilarity matrix
8%   K     Integer, desired number of prototypes
9%   PAR  'LOO' - leave-one-out option. This should be used if
10%          the objects are related to themselves. If D is not square,
11%          it is assumed that the first sets of objects in columns and
12%          rows match.
13%        'ALL' - use all objects (default).
14%
15% OUTPUT
16%   W     Selection mapping ('feature selection')
17%   E     Error stimate as a function of number of selected prototypes
18%         (only reliable for prototype sizes >= class size)
19%   KOPT  Estimate for best size in avoiding peaking
20%
21% DESCRIPTION
22% This procedure for optimizing the representation set of a
23% dissimilarity matrix is based on a greedy, forward selection of
24% prototypes using the leave-one-out error estimate of the 1NN rule
25% as a criterion. As this is computed on the given distances in
26% D, the procedure is based on sorting and counting only and is
27% thereby fast. In case K=1 just a single prototype has to be returned,
28% but computing the 1NN error is not possible as all objects are assigned
29% to the same class. In that case the centre object of the largest class
30% will be returned.
31%
32% Note that the search continues untill K prototypes are found.
33% This might be larger than desired due to peaking (curse of
34% dimensionality, overtraining). Therefor an estimate for the
35% optimal number of prototype is returned in KOPT.
36%
37% The prototype selection may be applied by C = B*W(:,1:KSEL),
38% in which B is a dissimilarity matrix based on the same
39% representation set as A (e.g. A itself) and C is a resulting
40% dissimilarity matrix in which the KSEL (e.g. KOPT) best prototypes
41% are selected.
42%
43% REFERENCE
44% E. Pekalska, R.P.W. Duin, and P. Paclik, Prototype selection for
45% dissimilarity-based classification, Pattern Recognition,
46% vol. 39, no. 2, 2006, 189-208.
47%
48% SEE ALSO
49% KNNDC, DISEX_PROTSELFD
50
51% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
52% Faculty EWI, Delft University of Technology
53% P.O. Box 5031, 2600 GA Delft, The Netherlands
54
55%
56
57function [R,e,D,J,nlab,clab] = protselfd(D,ksel,par,J,e,nlab,clab)
58
59if nargin < 2, ksel = []; end
60if nargin < 3 | isempty(par), par = 'all'; end
61
62if nargin < 4 % user call
63 
64  if nargin < 1 | isempty(D)  % allow for D*protselfd([],pars)
65    R = mapping(mfilename,'untrained',{ksel,par});
66    R = setname(R,'Forward Prototype Sel');
67    return
68  end
69 
70  [m,k,c] = getsize(D);
71  if isempty(ksel), ksel = k; end
72  if strcmp(par,'loo') | strcmp(par,'LOO')
73    if k > m
74      error('More rows than columns expected for dissimilarity matrix')
75    end
76    discheck(D(1:k,:));
77    D(1:k,:) = D(1:k,:) + 1e100*eye(k); % get rid of diagonal for LOO
78  end
79 
80  %Initialise by the centre of the largest class
81  cc = classsizes(D);
82  [cmax,n] = max(cc); % n is the largest class
83  lablist = getlablist(D);
84  nlab = getnlab(D);
85  clab = renumlab(getfeatlab(D),lablist);
86  R = find(nlab == n);
87  C = find(clab == n);
88  dd = +D(R,C);
89  [dmin,rmin] = sort(dd,1); % find one but most remote object
90  [dmin,cmin] = min(dmin(end-1,:)); % find prototype for which this is minimum
91  R = C(cmin);
92 
93  e = zeros(1,ksel);
94  [nlab,clab] = renumlab(getlabels(D),getfeatlab(D));
95  [dd,J] = min(+D(:,R),[],2);
96  e(1) = sum(clab(R(J)) ~= nlab);
97 
98        if ksel > 1
99                % this will be a deep recursive call !!!
100                prwaitbar(ksel,'Forward prototype selection')
101                [R,e,D,J,nlab,clab] = protselfd(D,ksel,R,J,e,nlab,clab);
102                prwaitbar(0);
103        end
104  e = e(1:length(+R))/m;
105  R = featsel(k,R);
106 
107  % Find optimal number of prototypes in avoiding peaking
108 
109  Jopt = find(e==min(e));
110  D = floor((Jopt(end)+Jopt(1))/2);
111 
112  % done!
113 
114else  % internal call, parameters may have another meaning!
115 
116  R = par;  % prototypes sofar
117  [m,k,c] = getsize(D);
118  d = +D;
119  S = [1:k];   % all candidates
120  S(R) = [];   % exclude ones we have
121  emin = inf;
122  dmin = inf;
123  r = length(R);
124  prwaitbar(ksel,r);
125  for j=S      % run over all candidates left
126                % the following tricky statements finds the nearest neighobor indices n
127                % for all objects to their nearest prototype (n=1) or the candidate
128                % prototype (n=2). In ds the minimum distances are stored and used for
129                % solving ties later.
130    [ds,n] = min([d(m*(R(J')'-1)+[1:m]'),d(:,j)],[],2);
131                % the labels of the nearest prototypes and the candidates
132    cclab = [clab(R(J)') repmat(clab(j),m,1)];
133                % compute the nearest neighbor error using the computed n
134    ee = sum(cclab(m*(n-1)+[1:m]') ~= nlab);
135    de = sum(ds);
136                % if better, use it
137    if ee < emin | ((ee == emin) & (de < dmin))
138      emin = ee;
139      jmin = j;
140      JJ = [J repmat(r+1,m,1)];
141      Jmin = JJ(m*(n-1)+[1:m]');
142      Rmin = [R jmin];
143      dmin = de;
144    end
145  end
146
147  if emin <= e(r) | 1 % we even continue if emin increases due to peaking
148    e(r+1) = emin;
149    R = Rmin;
150    if (r+1) < ksel
151                        [R,e,D,J,nlab,clab] = protselfd(D,ksel,R,Jmin,e,nlab,clab);
152    end
153  end
154 
155end
Note: See TracBrowser for help on using the repository browser.