Context Navigation

source: distools/protselfd.m @ 74

Last change on this file since 74 was 66, checked in by bduin, 11 years ago

File size: 8.2 KB

Line
1	%PROTSELFD Forward Prototype Selection for Dissimilarity Matrices
2	%
3	% [W,E,KOPT] = PROTSELFD(D,K,PAR)
4	% W = D*PROTSELFD([],K,PAR)
5	%
6	% INPUT
7	% D Dataset, dissimilarity matrix
8	% K Integer, desired number of prototypes
9	% PAR 'SUPER' supervised selection using 1NN error on prototypes.
10	% 'LOO' - supervised selection using leave-one-out error estimation.
11	% 'MAXDIST' - unsupervised selection minimizing the maximum
12	% distance to the nearest prototype.
13	% 'MEANDIST' - unsupervised selection minimizing the average
14	% distance to the nearest prototype.
15	%
16	% OUTPUT
17	% W Selection mapping ('feature selection') or prototype indices.
18	% E Error stimate as a function of number of selected prototypes
19	% (for supervised selection only reliable for prototype sizes >= class size)
20	% KOPT Estimate for best size in avoiding peaking
21	% (supervised selection only)
22	%
23	% DESCRIPTION
24	% This procedure for optimizing the representation set of a dissimilarity
25	% matrix is based on a greedy, forward selection of prototypes.
26	%
27	% In case of supervised selection D should be a labeled dataset with
28	% prototype labels stored as feature labels. The 1NN error to the nearest
29	% prototype is used as a criterion. In case of leave-one-out error
30	% estimation it is assumed that the first objects in D correspond with the
31	% prototypes.
32	%
33	% In case K=1 just a single prototype has to be returned, but computing the
34	% 1NN error is not possible as all objects are assigned to the same class.
35	% In that case the centre object of the largest class will be returned.
36	%
37	% Note that the search continues untill K prototypes are found. This might
38	% be larger than desired due to peaking (overtraining). Therefor an
39	% estimate for the optimal number of prototype is returned in KOPT.
40	%
41	% The prototype selection may be applied by C = B*W(:,1:KSEL), in which B
42	% is a dissimilarity matrix based on the same representation set as A (e.g.
43	% A itself) and C is a resulting dissimilarity matrix in which the KSEL
44	% (e.g. KOPT) best prototypes are selected.
45	%
46	% In case of unsupervised selection the maximum or the mean distances to
47	% the nearest prototype are minimized. These criteria are the same as used
48	% in the KCENTRE and KMEDIOD cluster procedures. What is returned now in W
49	% is the (ordered) list of prototype indices and not a mapping.
50	%
51	% REFERENCE
52	% E. Pekalska, R.P.W. Duin, and P. Paclik, Prototype selection for
53	% dissimilarity-based classification, Pattern Recognition,
54	% vol. 39, no. 2, 2006, 189-208.
55	%
56	% SEE ALSO
57	% KNNDC, DISEX_PROTSELFD
58
59	% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
60	% Faculty EWI, Delft University of Technology
61	% P.O. Box 5031, 2600 GA Delft, The Netherlands
62
63	function [R,e,D] = protselfd(D,ksel,type)
64
65	if nargin < 2, ksel = []; end
66	if nargin < 3, type = []; end
67
68	if nargin < 1 \|\| isempty(D) % allow for D*protselfd([],pars)
69	R = mapping(mfilename,'untrained',{ksel,type});
70	R = setname(R,'Forward Prototype Sel');
71	return
72	end
73
74	switch lower(type)
75	case {'loo','LOO','super','SUPER','',''}
76	[R,e,D,J,nlab,clab] = protselfd(D,ksel,type);
77	case {'maxdist','meandist'}
78	[R,e] = protselfd_unsuper(D,ksel,type);
79	otherwise
80	error('Unknown selection type')
81	end
82
83	return
84
85
86	function [R,e,D,J,nlab,clab] = protselfd_super_init(D,ksel,par)
87	% this routine takes care of the initialisation of supervised selection
88
89	isdataset(D);
90	[m,k,c] = getsize(D);
91	if isempty(ksel), ksel = k; end
92	if strcmp(par,'loo') \| strcmp(par,'LOO')
93	if k > m
94	error('More rows than columns expected for dissimilarity matrix')
95	end
96	discheck(D(1:k,:));
97	D(1:k,:) = D(1:k,:) + 1e100*eye(k); % get rid of diagonal for LOO
98	end
99
100	%Initialise by the centre of the largest class
101	cc = classsizes(D);
102	[cmax,n] = max(cc); % n is the largest class
103	lablist = getlablist(D);
104	nlab = getnlab(D);
105	clab = renumlab(getfeatlab(D),lablist);
106	R = find(nlab == n);
107	C = find(clab == n);
108	dd = +D(R,C);
109	[dmin,rmin] = sort(dd,1); % find one but most remote object
110	[dmin,cmin] = min(dmin(end-1,:)); % find prototype for which this is minimum
111	R = C(cmin);
112
113	e = zeros(1,ksel);
114	[nlab,clab] = renumlab(getlabels(D),getfeatlab(D));
115	[dd,J] = min(+D(:,R),[],2);
116	e(1) = sum(clab(R(J)) ~= nlab);
117
118	if ksel > 1
119	% this will be a deep recursive call !!!
120	prwaitbar(ksel,'Forward prototype selection')
121	[R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab);
122	prwaitbar(0);
123	end
124	e = e(1:length(+R))/m;
125	R = featsel(k,R);
126
127	% Find optimal number of prototypes in avoiding peaking
128
129	Jopt = find(e==min(e));
130	D = floor((Jopt(end)+Jopt(1))/2);
131
132	return
133
134	function [R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,J,e,nlab,clab)
135
136	[m,k,c] = getsize(D);
137	d = +D;
138	S = [1:k]; % all candidates
139	S(R) = []; % exclude ones we have
140	emin = inf;
141	dmin = inf;
142	r = length(R);
143	prwaitbar(ksel,r);
144	for j=S % run over all candidates left
145	% the following tricky statements finds the nearest neighobor indices n
146	% for all objects to their nearest prototype (n=1) or the candidate
147	% prototype (n=2). In ds the minimum distances are stored and used for
148	% solving ties later.
149	[ds,n] = min([d(m*(R(J')'-1)+[1:m]'),d(:,j)],[],2);
150	% the labels of the nearest prototypes and the candidates
151	cclab = [clab(R(J)') repmat(clab(j),m,1)];
152	% compute the nearest neighbor error using the computed n
153	ee = sum(cclab(m*(n-1)+[1:m]') ~= nlab);
154	de = sum(ds);
155	% if better, use it
156	if ee < emin \|\| ((ee == emin) && (de < dmin))
157	emin = ee;
158	jmin = j;
159	JJ = [J repmat(r+1,m,1)];
160	Jmin = JJ(m*(n-1)+[1:m]');
161	Rmin = [R jmin];
162	dmin = de;
163	end
164	end
165
166	if emin <= e(r) \|\| 1 % we even continue if emin increases due to peaking
167	e(r+1) = emin;
168	R = Rmin;
169	if (r+1) < ksel
170	[R,e,D,J,nlab,clab] = protselfd_super(D,ksel,R,Jmin,e,nlab,clab);
171	end
172	end
173
174	return
175
176	%PROTSELFD_UNSUPER Forward prototype selection
177	%
178	% N = PROTSELFD_UNSUPER(D,P,CRIT)
179	%
180	% INPUT
181	% D Square dissimilarity matrix, zeros on diagonal
182	% P Number of prototypes to be selected
183	% CRIT 'maxdist' or 'meandist'
184	%
185	% OUTPUT
186	% N Indices of selected prototypes
187	%
188	% DESCRIPTION
189	% Sort objects given by square dissim matrix D using a greedy approach
190	% such that the maximum NN distance from all objects (prototypes)
191	% to the first K: max(min(D(:,N(1:K),[],2)) is minimized.
192	%
193	% This routines tries to sample the objects such that they are evenly
194	% spaced judged from their dissimilarities. This may be used as
195	% initialisation in KCENTRES. It works reasonably, but not very good.
196	%
197	% SEE ALSO
198	% KCENTRES, KMEDIODS
199
200	% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
201	% Faculty EWI, Delft University of Technology
202	% P.O. Box 5031, 2600 GA Delft, The Netherlands
203
204	function [N,e] = protselfd_unsuper(d,p,crit)
205
206	d = +d;
207	[m,k] = size(d);
208	if isempty(crit), crit = 'max'; end
209	if nargin < 2 \|\| isempty(p), p = k; end
210	L = 1:k;
211	N = zeros(1,p);
212	switch crit
213	case 'maxdist'
214	[~,n] = min(max(d)); % this is the first (central) prototype
215	case 'meandist'
216	[~,n] = min(mean(d)); % this is the first (central) prototype
217	end
218	e = d(:,n); % store here the distances to the nearest prototype (dNNP)
219	f = min(d,repmat(e,1,k)); % replace distances that are larger than dNNP by dNNP
220	N(1) = n; % ranking of selected prototypes
221	L(n) = []; % candidate prototypes (all not yet selected objects)
222
223	for j=2:p % extend prototype set
224	switch crit % select the next prototype out of candidates in L
225	case 'maxdist'
226	[~,n] = min(max(f(:,L)));
227	case 'meandist'
228	[~,n] = min(mean(f(:,L)));
229	end
230	e = min([d(:,L(n)) e],[],2); % update dNNP
231	f = min(d,repmat(e,1,k)); % update replacement of distances that are larger
232	% than dNNP by dNNP
233	N(j) = L(n); % update list of selected prototypes
234	L(n) = []; % update list of candidate prototypes
235	end
236

Note: See TracBrowser for help on using the repository browser.

Download in other formats: