1 | %PROTSELFD Forward Prototype Selection for Dissimilarity Matrices
|
---|
2 | %
|
---|
3 | % [W,E,KOPT] = PROTSELFD(D,K,PAR)
|
---|
4 | % W = D*PROTSELFD([],K,PAR)
|
---|
5 | %
|
---|
6 | % INPUT
|
---|
7 | % D Dataset, square dissimilarity matrix
|
---|
8 | % K Integer, desired number of prototypes
|
---|
9 | % PAR 'LOO' - leave-one-out option. This should be used if
|
---|
10 | % the objects are related to themselves. If D is not square,
|
---|
11 | % it is assumed that the first sets of objects in columns and
|
---|
12 | % rows match.
|
---|
13 | % 'ALL' - use all objects (default).
|
---|
14 | %
|
---|
15 | % OUTPUT
|
---|
16 | % W Selection mapping ('feature selection')
|
---|
17 | % E Error estimate as a function of number of selected prototypes
|
---|
18 | % (only reliable for prototype sizes >= class size)
|
---|
19 | % KOPT Estimate for best size in avoiding peaking
|
---|
20 | %
|
---|
21 | % DESCRIPTION
|
---|
22 | % This procedure for optimizing the representation set of a
|
---|
23 | % dissimilarity matrix is based on a greedy, forward selection of
|
---|
24 | % prototypes using the leave-one-out error estimate of the 1NN rule
|
---|
25 | % as a criterion. As this is computed on the given distances in
|
---|
26 | % D, the procedure is based on sorting and counting only and is
|
---|
27 | % thereby fast. In case K=1 just a single prototype has to be returned,
|
---|
28 | % but computing the 1NN error is not possible as all objects are assigned
|
---|
29 | % to the same class. In that case the centre object of the largest class
|
---|
30 | % will be returned.
|
---|
31 | %
|
---|
32 | % Note that the search continues until K prototypes are found.
|
---|
33 | % This might be larger than desired due to peaking (curse of
|
---|
34 | % dimensionality, overtraining). Therefore an estimate for the
|
---|
35 | % optimal number of prototypes is returned in KOPT.
|
---|
36 | %
|
---|
37 | % The prototype selection may be applied by C = B*W(:,1:KSEL),
|
---|
38 | % in which B is a dissimilarity matrix based on the same
|
---|
39 | % representation set as D (e.g. D itself) and C is a resulting
|
---|
40 | % dissimilarity matrix in which the KSEL (e.g. KOPT) best prototypes
|
---|
41 | % are selected.
|
---|
42 | %
|
---|
43 | % REFERENCE
|
---|
44 | % E. Pekalska, R.P.W. Duin, and P. Paclik, Prototype selection for
|
---|
45 | % dissimilarity-based classification, Pattern Recognition,
|
---|
46 | % vol. 39, no. 2, 2006, 189-208.
|
---|
47 | %
|
---|
48 | % SEE ALSO
|
---|
49 | % KNNDC, DISEX_PROTSELFD
|
---|
50 |
|
---|
51 | % Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
|
---|
52 | % Faculty EWI, Delft University of Technology
|
---|
53 | % P.O. Box 5031, 2600 GA Delft, The Netherlands
|
---|
54 |
|
---|
55 | %
|
---|
56 |
|
---|
function [R,e,D,J,nlab,clab] = protselfd(D,ksel,par,J,e,nlab,clab)
% This entry point serves two calling modes, distinguished by NARGIN.
%
% User call (fewer than 4 inputs):
%     [W,E,KOPT] = protselfd(D,K,PAR)
%   D     dissimilarity dataset (empty D returns an untrained mapping)
%   ksel  requested number of prototypes; defaults to the number of columns
%         of D and is limited to that number
%   par   'loo' (any case) or anything else, which is treated as 'all'
%   Outputs: R is the FEATSEL mapping over the selected columns, e holds the
%   error per number of prototypes (error counts divided by the number of
%   rows), and the third output (D) is overwritten with KOPT.
%
% Internal, recursive call (4 or more inputs):
%     [R,e,D,J,nlab,clab] = protselfd(D,ksel,R,J,e,nlab,clab)
%   par   here carries R, the column indices of the prototypes found so far
%   J     per object, the position in R of its nearest prototype
%   e     error counts found so far; e(i) belongs to the first i prototypes
%   nlab  numeric labels of the objects (rows)
%   clab  numeric labels of the candidate prototypes (columns)
%   Each call adds one prototype and recurses until ksel are found, so the
%   recursion depth grows with ksel.

if nargin < 2, ksel = []; end
if nargin < 3 || isempty(par), par = 'all'; end

if nargin < 4 % user call

    if nargin < 1 || isempty(D) % allow for D*protselfd([],pars)
        R = mapping(mfilename,'untrained',{ksel,par});
        R = setname(R,'Forward Prototype Sel');
        return
    end

    [m,k,c] = getsize(D);
    if isempty(ksel), ksel = k; end
    % There are only k candidate columns. Asking for more would make the
    % internal call run out of candidates, so limit the request.
    ksel = min(ksel,k);

    if strcmpi(par,'loo')
        if k > m
            error('More rows than columns expected for dissimilarity matrix')
        end
        discheck(D(1:k,:));
        D(1:k,:) = D(1:k,:) + 1e100*eye(k); % get rid of diagonal for LOO
    end

    % Initialise by the centre of the largest class
    cc = classsizes(D);
    [cmax,n] = max(cc);              % n is the largest class
    lablist = getlablist(D);
    nlab = getnlab(D);
    clab = renumlab(getfeatlab(D),lablist);
    R = find(nlab == n);             % rows (objects) of the largest class
    C = find(clab == n);             % columns (candidates) of the largest class
    dd = +D(R,C);
    dmin = sort(dd,1);               % per candidate, distances in ascending order
    % Use the one but most remote object. If the class has a single object
    % there is no such row, so use the only row instead of indexing row 0.
    nrow = size(dmin,1);
    [dmin,cmin] = min(dmin(max(nrow-1,1),:)); % find prototype for which this is minimum
    R = C(cmin);

    e = zeros(1,ksel);
    [nlab,clab] = renumlab(getlabels(D),getfeatlab(D));
    [dd,J] = min(+D(:,R),[],2);      % R is a single column here, so J is all ones
    e(1) = sum(clab(R(J)) ~= nlab);  % error count using the first prototype only

    if ksel > 1
        % this will be a deep recursive call !!!
        prwaitbar(ksel,'Forward prototype selection')
        [R,e,D,J,nlab,clab] = protselfd(D,ksel,R,J,e,nlab,clab);
        prwaitbar(0);
    end
    e = e(1:length(+R))/m;           % convert error counts to fractions of m
    R = featsel(k,R);

    % Find optimal number of prototypes in avoiding peaking:
    % the midpoint between the first and the last minimum of e.

    Jopt = find(e==min(e));
    D = floor((Jopt(end)+Jopt(1))/2); % third output now holds KOPT

    % done!

else % internal call, parameters may have another meaning!

    R = par;       % prototypes sofar
    [m,k,c] = getsize(D);
    d = +D;
    S = [1:k];     % all candidates
    S(R) = [];     % exclude ones we have
    if isempty(S)
        % No candidates left: return what has been found so far.
        return
    end
    emin = inf;
    dmin = inf;
    r = length(R);
    prwaitbar(ksel,r);
    for j=S        % run over all candidates left
        % the following tricky statements finds the nearest neighbor indices n
        % for all objects to their nearest prototype (n=1) or the candidate
        % prototype (n=2). In ds the minimum distances are stored and used for
        % solving ties later.
        [ds,n] = min([d(m*(R(J')'-1)+[1:m]'),d(:,j)],[],2);
        % the labels of the nearest prototypes and the candidates
        cclab = [clab(R(J)') repmat(clab(j),m,1)];
        % compute the nearest neighbor error using the computed n
        ee = sum(cclab(m*(n-1)+[1:m]') ~= nlab);
        de = sum(ds);
        % if better (fewer errors, or equal errors and a smaller sum of
        % nearest distances), use it
        if ee < emin || ((ee == emin) && (de < dmin))
            emin = ee;
            jmin = j;
            JJ = [J repmat(r+1,m,1)];
            Jmin = JJ(m*(n-1)+[1:m]'); % nearest-prototype positions after adding j
            Rmin = [R jmin];
            dmin = de;
        end
    end

    % we even continue if emin increases due to peaking
    e(r+1) = emin;
    R = Rmin;
    if (r+1) < ksel
        [R,e,D,J,nlab,clab] = protselfd(D,ksel,R,Jmin,e,nlab,clab);
    end

end