%CLEVALD Classifier evaluation (learning curve) for dissimilarity data
%
%   E = CLEVALD(D,CLASSF,TRAINSIZES,REPSIZE,NREPS,T,TESTFUN)
%
% INPUT
%   D           Square dissimilarity dataset
%   CLASSF      Classifiers to be evaluated (cell array)
%   TRAINSIZES  Vector of training set sizes, used to generate subsets of D
%               (default [2,3,5,7,10,15,20,30,50,70,100]). TRAINSIZES is
%               given per class, unless D has no priors set or has soft
%               labels.
%   REPSIZE     Representation set size per class (>=1), or a fraction of
%               the training set (<1) (default: the entire training set)
%   NREPS       Number of repetitions (default 1)
%   T           Test dataset (default [], use the remaining samples of D)
%   TESTFUN     Mapping used as evaluation function (default:
%               classification error)
%
% OUTPUT
%   E           Error structure (see PLOTE) containing training and test
%               errors
%
% DESCRIPTION
% Generates at random, for all training set sizes defined in TRAINSIZES,
% training sets out of the dissimilarity dataset D. The representation set
% is either equal to the training set (REPSIZE = []), a fraction of it
% (REPSIZE < 1), or a random subset of it of a given size per class
% (REPSIZE > 1). This set is used for training the untrained classifiers
% CLASSF. The resulting trained classifiers are tested on the training
% objects and on the left-over test objects, or, if supplied, on the test
% set T. This procedure is repeated NREPS times. The default test routine
% is classification error estimation by TESTC([],'crisp').
%
% The returned structure E contains several fields for annotating the plot
% produced by PLOTE. They may be changed by the user. Removing the field
% 'apperror' (E = RMFIELD(E,'apperror')) suppresses the plotting of the
% error curves for the training set.
%
% Training sets are generated without replacement, such that for each run
% the larger training sets include the smaller ones, and such that all
% classifiers are trained on the same training sets.
%
% This function uses the RAND random generator; its results are thereby
% reproducible if the seed of RAND is reset (see RAND). If CLASSF uses
% RANDN, the seed of RANDN should be reset as well.
%
% SEE ALSO
% MAPPINGS, DATASETS, CLEVAL, TESTC, PLOTE
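% EXAMPLE
% A minimal usage sketch. The banana dataset, the Euclidean dissimilarity
% measure and the classifiers below are illustrative choices, not
% prescribed by this routine:
%
%   a = gendatb([50 50]);                % two-class banana dataset
%   d = a*proxm(a,'d',2);                % square Euclidean dissimilarity matrix
%   e = clevald(d,{fisherc,knnc([],1)},[3 5 10 20],[],5);
%   plote(e);                            % plot the learning curves
%
%   % Use a representation set of half the training set size:
%   e = clevald(d,{fisherc},[5 10 20],0.5,5);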
% R.P.W. Duin, r.p.w.duin@prtools.org
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands

function e = clevald(a,classf,learnsizes,repsize,nreps,t,testfun)

  prtrace(mfilename);

  if (nargin < 7) | isempty(testfun)
    testfun = testc([],'crisp');
  end;
  if (nargin < 6)
    t = [];
  end;
  if (nargin < 5) | isempty(nreps);
    nreps = 1;
  end;
  if (nargin < 4)
    repsize = [];
  end
  if (nargin < 3) | isempty(learnsizes);
    learnsizes = [2,3,5,7,10,15,20,30,50,70,100];
  end;
  if ~iscell(classf), classf = {classf}; end

  % Assert that all is right.
  isdataset(a); issquare(a); ismapping(classf{1});
  if (~isempty(t)), isdataset(t); end

  % Remove requested class sizes that are larger than the size of the
  % smallest class.

  [m,k,c] = getsize(a);
  if ~isempty(a,'prior') & islabtype(a,'crisp')
    classs = true;
    mc = classsizes(a);
    toolarge = find(learnsizes >= min(mc));
    if (~isempty(toolarge))
      prwarning(2,['training set class sizes ' num2str(learnsizes(toolarge)) ...
                   ' larger than the minimal class size; removed them']);
      learnsizes(toolarge) = [];
    end
  else
    if islabtype(a,'crisp') & isempty(a,'prior')
      prwarning(1,['No priors found in dataset, class frequencies are used.' ...
                   newline '    Training set sizes hold for the entire dataset']);
    end
    classs = false;
    toolarge = find(learnsizes >= m);
    if (~isempty(toolarge))
      prwarning(2,['training set sizes ' num2str(learnsizes(toolarge)) ...
                   ' larger than the number of objects; removed them']);
      learnsizes(toolarge) = [];
    end
  end
  learnsizes = learnsizes(:)';

  % Fill the error structure.

  nw = length(classf(:));
  datname = getname(a);

  e.n        = nreps;
  e.error    = zeros(nw,length(learnsizes));
  e.std      = zeros(nw,length(learnsizes));
  e.apperror = zeros(nw,length(learnsizes));
  e.appstd   = zeros(nw,length(learnsizes));
  e.xvalues  = learnsizes(:)';
  if classs
    e.xlabel = 'Training set size per class';
  else
    e.xlabel = 'Training set size';
  end
  e.names    = [];
  if (nreps > 1)
    e.ylabel = ['Averaged error (' num2str(nreps) ' experiments)'];
  elseif (nreps == 1)
    e.ylabel = 'Error';
  else
    error('Number of repetitions NREPS should be >= 1.');
  end;
  if (~isempty(datname))
    if isempty(repsize)
      e.title = [datname ', Rep. Set = Train Set'];
    elseif repsize < 1
      e.title = [datname ', Rep. size = ' num2str(repsize) ' Train size'];
    else
      e.title = [datname ', Rep. size = ' num2str(repsize) ' per class'];
    end
  end
  if (learnsizes(end)/learnsizes(1) > 20)
    e.plot = 'semilogx'; % If the range is large, use a log plot for X.
  end

  % Report progress.
  s1 = sprintf('clevald: %i classifiers: ',nw);
  prwaitbar(nw,s1);

  % Store the seed, to reset the random generator later for different
  % classifiers.
  seed = rand('state');

  % Loop over all classifiers (with index WI).
  for wi = 1:nw

    if (~isuntrained(classf{wi}))
      error('Classifiers should be untrained.')
    end
    name = getname(classf{wi});
    e.names = char(e.names,name);
    prwaitbar(nw,wi,[s1 name]);

    % E1 will contain the test error estimates, E0 the apparent
    % (training set) error estimates.
    e1 = zeros(nreps,length(learnsizes));
    e0 = zeros(nreps,length(learnsizes));

    % Take care that all classifiers use the same training sets.
    rand('state',seed);
    seed2 = seed;

    % For NREPS repetitions...
    s2 = sprintf('clevald: %i repetitions: ',nreps);
    prwaitbar(nreps,s2);
    for i = 1:nreps
      prwaitbar(nreps,i,[s2 int2str(i)]);

      % Store the randomly permuted indices of samples of class CI to use
      % in this training set in JR(CI,:).
      if classs
        JR = zeros(c,max(learnsizes));
        for ci = 1:c
          JC = findnlab(a,ci);
          % Necessary for reproducible training sets: set the seed and
          % store it after generation, so that next time we will use the
          % previous one.
          rand('state',seed2);
          JD = JC(randperm(mc(ci)));
          JR(ci,:) = JD(1:max(learnsizes))';
          seed2 = rand('state');
        end
      elseif islabtype(a,'crisp')
        rand('state',seed2); % Get seed for reproducible training sets.
        % Generate indices for the entire dataset, taking care that the
        % first 2*C objects contain 2 objects of every class.
        [a1,a2,I1,I2] = gendat(a,2*ones(1,c));
        JD = randperm(m-2*c);
        JR = [I1;I2(JD)];
        seed2 = rand('state'); % Save seed for reproducible training sets.
      else % Soft labels.
        rand('state',seed2); % Get seed for reproducible training sets.
        JR = randperm(m);
        seed2 = rand('state'); % Save seed for reproducible training sets.
      end

      li = 0; % Index of training set.
      nlearns = length(learnsizes);
      s3 = sprintf('clevald: %i sizes: ',nlearns);
      prwaitbar(nlearns,s3);
      for j = 1:nlearns
        nj = learnsizes(j);
        prwaitbar(nlearns,j,[s3 int2str(j) ' (' int2str(nj) ')']);
        li = li + 1;

        % J will contain the indices for this training set, R those for
        % the representation set.
        J = [];
        R = [];
        if classs
          for ci = 1:c
            J = [J;JR(ci,1:nj)'];
            if isempty(repsize)
              R = [R JR(ci,1:nj)];
            elseif repsize < 1
              R = [R JR(ci,1:ceil(repsize*nj))];
            else
              R = [R JR(ci,1:min(nj,repsize))];
            end
          end;
        else
          J = JR(1:nj);
          if isempty(repsize)
            R = J;
          elseif repsize < 1
            R = JR(1:ceil(repsize*nj));
          else
            R = JR(1:min(nj,repsize));
          end
        end;

        trainset = a(J,R);
        trainset = setprior(trainset,getprior(trainset,0));
        w = trainset*classf{wi}; % Use the right classifier.
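        % Evaluate the trained classifier W. Note that both the training
        % objects and the test objects are represented by their
        % dissimilarities to the representation set R only, and that the
        % priors are set to the class frequencies (GETPRIOR(...,0))
        % before testing.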
        e0(i,li) = trainset*w*testfun;
        if (isempty(t))
          Jt = ones(m,1);
          Jt(J) = zeros(size(J));
          Jt = find(Jt); % Don't use the training set for testing.
          testset = a(Jt,R);
          testset = setprior(testset,getprior(testset,0));
          e1(i,li) = testset*w*testfun;
        else
          testset = t(:,R);
          testset = setprior(testset,getprior(testset,0));
          e1(i,li) = testset*w*testfun;
        end

      end
      prwaitbar(0);

    end
    prwaitbar(0);

    % Calculate the average error and the standard error of the mean for
    % this classifier (or set the latter to zero if there has been just
    % one repetition).
    e.error(wi,:)    = mean(e1,1);
    e.apperror(wi,:) = mean(e0,1);
    if (nreps == 1)
      e.std(wi,:)    = zeros(1,size(e.std,2));
      e.appstd(wi,:) = zeros(1,size(e.appstd,2));
    else
      e.std(wi,:)    = std(e1)/sqrt(nreps);
      e.appstd(wi,:) = std(e0)/sqrt(nreps);
    end

  end
  prwaitbar(0);

  % The first row of E.NAMES is empty (a side effect of initializing it
  % with []); remove it.
  e.names(1,:) = [];

return