Context Navigation

source: distools/clevald.m @ 20

Last change on this file since 20 was 20, checked in by bduin, 13 years ago
updates for handling soft labels
File size: 8.6 KB

Rev	Line
%CLEVALD Classifier evaluation (learning curve) for dissimilarity data
%
%   E = CLEVALD(D,CLASSF,TRAINSIZES,REPSIZE,NREPS,T,TESTFUN)
%
% INPUT
%   D           Square dissimilarity dataset
%   CLASSF      Classifiers to be evaluated (cell array)
%   TRAINSIZES  Vector of training set sizes, used to generate subsets of D
%               (default [2,3,5,7,10,15,20,30,50,70,100]). TRAINSIZES is per
%               class unless D has no priors set or has soft labels.
%   REPSIZE     Representation set size per class (>=1), or fraction (<1)
%               of the training set (default: the entire training set)
%   NREPS       Number of repetitions (default 1)
%   T           Test dataset (default [], use remaining samples in D)
%   TESTFUN     Mapping, evaluation function (default classification error)
%
% OUTPUT
%   E           Error structure (see PLOTE) containing training and test
%               errors
%
% DESCRIPTION
% Generates at random, for all class sizes defined in TRAINSIZES, training
% sets out of the dissimilarity dataset D. The representation set is either
% equal to the training set (REPSIZE = []), or a fraction of it (REPSIZE < 1)
% or a random subset of it of a given size (REPSIZE >= 1). This set is used
% for training the untrained classifiers CLASSF. The resulting trained
% classifiers are tested on the training objects and on the left-over test
% objects, or, if supplied, the testset T. This procedure is then repeated
% NREPS times. The default test routine is classification error estimation
% by TESTC([],'crisp').
%
% The returned structure E contains several fields for annotating the plot
% produced by PLOTE. They may be changed by the users. Removal of the field
% 'apperror' (RMFIELD(E,'apperror')) suppresses the drawing of the error
% curves for the training set.
%
% Training set generation is done "without replacement" (objects are drawn
% by a random permutation) and such that for each run the larger training
% sets include the smaller ones and that for all classifiers the same
% training sets are used.
%
% This function uses the RAND random generator and is thereby reproducible
% if its seed is reset (see RAND).
% If CLASSF uses RANDN, its seed should be reset as well.
%
% SEE ALSO
% MAPPINGS, DATASETS, CLEVAL, TESTC, PLOTE

% R.P.W. Duin, r.p.w.duin@prtools.org
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands

function e = clevald(a,classf,learnsizes,repsize,nreps,t,testfun)

  prtrace(mfilename);

  % Fill in defaults for omitted or empty arguments.
  if (nargin < 7) || isempty(testfun)
    testfun = testc([],'crisp');
  end
  if (nargin < 6)
    t = [];
  end
  if (nargin < 5) || isempty(nreps)
    nreps = 1;
  end
  if (nargin < 4)
    repsize = [];
  end
  if (nargin < 3) || isempty(learnsizes)
    learnsizes = [2,3,5,7,10,15,20,30,50,70,100];
  end
  if ~iscell(classf), classf = {classf}; end

  % Assert that all is right.
  isdataset(a); issquare(a); ismapping(classf{1});
  if (~isempty(t)), isdataset(t); end

  % Work on a row vector from here on, so that the warning messages below
  % can be concatenated regardless of the orientation supplied by the caller.
  learnsizes = learnsizes(:)';

  % Remove requested sizes that are too large. With priors and crisp labels
  % the sizes are per class and bounded by the smallest class; otherwise
  % they hold for the entire dataset and are bounded by the number of objects.
  [m,k,c] = getsize(a);
  if ~isempty(a,'prior') && islabtype(a,'crisp')
    classs = true;
    mc = classsizes(a);
    toolarge = find(learnsizes >= min(mc));
    if (~isempty(toolarge))
      prwarning(2,['training set class sizes ' num2str(learnsizes(toolarge)) ...
        ' larger than the minimal class size; removed them']);
      learnsizes(toolarge) = [];
    end
  else
    if islabtype(a,'crisp') && isempty(a,'prior')
      prwarning(1,['No priors found in dataset, class frequencies are used.' ...
        newline ' Training set sizes hold for entire dataset']);
    end
    classs = false;
    toolarge = find(learnsizes >= m);
    if (~isempty(toolarge))
      prwarning(2,['training set sizes ' num2str(learnsizes(toolarge)) ...
        ' larger than number of objects; removed them']);
      learnsizes(toolarge) = [];
    end
  end

  % Without any remaining size there is nothing to evaluate; fail with a
  % clear message instead of an index error further down.
  if isempty(learnsizes)
    error('No training set sizes left: all requested sizes are too large for this dataset.');
  end

  nlearns  = length(learnsizes);
  maxlearn = max(learnsizes);

  % Fill the error structure.

  nw = length(classf(:));
  datname = getname(a);

  e.n        = nreps;
  e.error    = zeros(nw,nlearns);
  e.std      = zeros(nw,nlearns);
  e.apperror = zeros(nw,nlearns);
  e.appstd   = zeros(nw,nlearns);
  e.xvalues  = learnsizes;
  if classs
    e.xlabel = 'Training set size per class';
  else
    e.xlabel = 'Training set size';
  end
  e.names = [];
  if (nreps > 1)
    e.ylabel= ['Averaged error (' num2str(nreps) ' experiments)'];
  elseif (nreps == 1)
    e.ylabel = 'Error';
  else
    error('Number of repetitions NREPS should be >= 1.');
  end
  if (~isempty(datname))
    if isempty(repsize)
      e.title = [datname ', Rep. Set = Train Set'];
    elseif repsize < 1
      e.title = [datname ', Rep. size = ' num2str(repsize) ' Train size'];
    else
      e.title = [datname ', Rep. size = ' num2str(repsize) ' per class'];
    end
  end
  if (learnsizes(end)/learnsizes(1) > 20)
    e.plot = 'semilogx';    % If range too large, use a log-plot for X.
  end

  % Report progress.

  s1 = sprintf('cleval: %i classifiers: ',nw);
  prwaitbar(nw,s1);

  % Store the seed, to reset the random generator later for different
  % classifiers.

  seed = rand('state');

  % Loop over all classifiers (with index WI).

  for wi = 1:nw

    if (~isuntrained(classf{wi}))
      error('Classifiers should be untrained.')
    end
    name = getname(classf{wi});
    e.names = char(e.names,name);
    prwaitbar(nw,wi,[s1 name]);

    % E1 collects the test errors, E0 the apparent (training set) errors;
    % one row per repetition, one column per training set size.

    e1 = zeros(nreps,nlearns);
    e0 = zeros(nreps,nlearns);

    % Take care that classifiers use same training set.

    rand('state',seed); seed2 = seed;

    % For NREPS repetitions...

    s2 = sprintf('cleval: %i repetitions: ',nreps);
    prwaitbar(nreps,s2);

    for i = 1:nreps

      prwaitbar(nreps,i,[s2 int2str(i)]);

      % Draw the randomly permuted object indices JR for this repetition.
      % The generator state is restored before and saved after each draw,
      % so that calls to RAND made by the classifiers do not disturb the
      % sequence of training sets.

      if classs

        % Per class: JR(CI,:) holds the indices of the objects of class CI.
        JR = zeros(c,maxlearn);

        for ci = 1:c

          JC = findnlab(a,ci);

          % Necessary for reproducible training sets: set the seed and store
          % it after generation, so that next time we will use the previous one.
          rand('state',seed2);

          JD = JC(randperm(mc(ci)));
          JR(ci,:) = JD(1:maxlearn)';
          seed2 = rand('state');
        end

      elseif islabtype(a,'crisp')

        rand('state',seed2); % get seed for reproducible training sets
        % generate indices for the entire dataset taking care that in
        % the first 2c objects we have 2 objects for every class
        [a1,a2,I1,I2] = gendat(a,2*ones(1,c));
        JD = randperm(m-2*c);
        JR = [I1;I2(JD)];
        seed2 = rand('state'); % save seed for reproducible training sets

      else % soft labels

        rand('state',seed2); % get seed for reproducible training sets
        JR = randperm(m);
        seed2 = rand('state'); % save seed for reproducible training sets

      end

      s3 = sprintf('cleval: %i sizes: ',nlearns);
      prwaitbar(nlearns,s3);

      for j = 1:nlearns

        nj = learnsizes(j);

        prwaitbar(nlearns,j,[s3 int2str(j) ' (' int2str(nj) ')']);

        % J will contain the object indices of this training set,
        % R the indices (columns of A) of the representation set.

        J = [];
        R = [];

        if classs
          for ci = 1:c
            J = [J;JR(ci,1:nj)'];
            if isempty(repsize)
              R = [R JR(ci,1:nj)];
            elseif repsize < 1
              R = [R JR(ci,1:ceil(repsize*nj))];
            else
              R = [R JR(ci,1:min(nj,repsize))];
            end
          end
        else
          J = JR(1:nj);
          if isempty(repsize)
            % Representation set equals the training set; restrict it to
            % the first NJ objects so no test objects are represented.
            R = JR(1:nj);
          elseif repsize < 1
            R = JR(1:ceil(repsize*nj));
          else
            R = JR(1:min(nj,repsize));
          end
        end

        trainset = a(J,R);
        trainset = setprior(trainset,getprior(trainset,0));
        w = trainset*classf{wi};          % Use right classifier.
        e0(i,j) = trainset*w*testfun;     % Apparent error.

        if (isempty(t))
          % Don't use training set for testing: keep the remaining objects.
          isTest = true(m,1);
          isTest(J) = false;
          Jt = find(isTest);
          testset = a(Jt,R);
        else
          testset = t(:,R);
        end
        testset = setprior(testset,getprior(testset,0));
        e1(i,j) = testset*w*testfun;      % Test error.

      end
      prwaitbar(0);

    end
    prwaitbar(0);

    % Calculate average error and standard deviation for this classifier
    % (or set the latter to zero if there's been just 1 repetition).

    e.error(wi,:)    = mean(e1,1);
    e.apperror(wi,:) = mean(e0,1);
    if (nreps == 1)
      e.std(wi,:)    = zeros(1,size(e.std,2));
      e.appstd(wi,:) = zeros(1,size(e.appstd,2));
    else
      e.std(wi,:)    = std(e1)/sqrt(nreps);
      e.appstd(wi,:) = std(e0)/sqrt(nreps);
    end
  end
  prwaitbar(0);

  % The first element is the empty string [], remove it.
  e.names(1,:) = [];

return
	300

Note: See TracBrowser for help on using the repository browser.

Download in other formats: