source: distools/genddat.m @ 58

Last change on this file since 58 was 28, checked in by bduin, 13 years ago
File size: 3.8 KB
RevLine 
[10]1%GENDDAT Generate random training and test sets for dissimilarity data
2%
3%       [DTR,DTE,I,J] = GENDDAT(D,M,K)
4%
5% INPUT
6%       D       NxN dissimilarity dataset
7%       M       Cx1 vector of class sizes or frequencies, or a single number / frequency in(0,1)
8%       K       Cx1 vector of class sizes or frequencies, or a single number / frequency in(0,1)
9%                       (optional, default: K = M)
10%
11% OUTPUT
12%   DTR Training dissimilarity dataset
13%   DTE Test dissimilarity dataset
14%       I       Indices of the training objects
15%       J       Indices of the test objects
16%
17% DESCRIPTION
18% Generates random training and test sets from a square dissimilarity dataset D.
19% Feature labels and object labels of D should be equal. Note that M and K can be
20% either scalars or vectors with as many components as classes (=C), defining
21% specific sizes/fractions per class.
22%
23% By default, all training objects are used as the representation set (K = M). If M is a Cx1 vector,
24% then GENDDAT selects at random M(i) vectors out of the i-th class in D and stores them
25% in the dataset DTR of the size [sum(M)]x[sum(M)]. The remaining  objects are stored in
26% [N-sum(M)]x[sum(M)] dissimilarity data DTE. Classes are ordered by using RENUMLAB(GETLAB(D)).
27% If M is a scalar, then M objects in total (given by number or frequency) are selected
28% at random according to the class priors. I and J are the indices of the training
29% and testing objects, respectively.
30%
31% If K is provided, then the first K(i) training objects of the i-th class are used
32% for the representation set. Note that K(i) can be a frequency.
33%
34% DEFAULT
35%       K = M
36%
37% EXAMPLE
38% Let D be 100 x 100 dataset with two classes [40 60] and class priors [0.4 0.6].
39% 1)    [DTR,DTE] = GENDDAT(D,0.6)
40% DTR is 60x60 and DTE is 40x60. 60% of objects of the first class (24 in total)
41% and 60% of the objects of the second class (36 in total) are selected for DTR.
42%
43% 2) [DTR,DTE] = GENDDAT(D,0.6,0.1)
44% DTR is 60x6 and DTE is 40x6. 60% of objects of the first class (24 in total)
45% and 60% of the objects of the second class (36 in total) are selected for training.
46% From that, 10% of the first training objects per class are selected for the
47% representation set. 10% of 24 rounds to 2 objects for the first class, while
48% 10% of 36 rounds to 4 objects for the second class.
49%
50% SEE ALSO
51% DATASETS, RENUMLAB
52%
53
54% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org, and
55% Elzbieta Pekalska, ela.pekalska@googlemail.com
56% Faculty EWI, Delft University of Technology, and
57% School of Computer Science, University of Manchester
58
59
function [DTR,DTE,Itr,Ite] = genddat(D,m,k)
% Split the square dissimilarity dataset D into a training part DTR and a
% test part DTE. The objects are drawn by GENDAT according to M; the columns
% of both outputs are restricted to the representation objects chosen by K.
%
%   D    dissimilarity dataset, or a cell array of such datasets
%   m    class sizes / fractions handed to GENDAT for the training set
%   k    (optional) per-class number or fraction in (0,1) of the training
%        objects used as representation set; empty means all of them
%
%   DTR  D(Itr,R), with R the representation objects
%   DTE  D(Ite,R)
%   Itr  indices of the training objects, representation objects first
%   Ite  indices of the test objects

  if nargin < 3, k = []; end

  % If input is a cell array of datasets, apply this procedure
  % to the individual datasets.
  if iscell(D)
    % Preallocate the real outputs (not scratch variables), so that they
    % have the same shape as D and are defined even when D is empty.
    DTR = cell(size(D));
    DTE = cell(size(D));
    Itr = cell(size(D));
    Ite = cell(size(D));
    seed = randreset;
    for j = 1:numel(D)
      % NOTE(review): presumably restores the random state so that every
      % dataset receives the same split -- confirm against RANDRESET.
      randreset(seed);
      [DTR{j},DTE{j},Itr{j},Ite{j}] = feval(mfilename,D{j},m,k);
    end
    return;
  end

  [n,nk,c] = getsize(D);
  nlab     = getnlab(D);
  discheck(D,[],1);   % allow for similarities

  if ~isempty(k)
    % Expand a scalar K to one entry per class.
    if length(k) == 1
      k = k*ones(1,c);
    elseif length(k) ~= c
      error('Vector length of the number of objects should equal the number of classes.')
    end

    % K must be either all non-negative integers or all fractions in (0,1).
    % Negative integers are rejected explicitly: K(1:negative) below would
    % otherwise silently produce an empty representation set.
    if ~(all(k == round(k) & k >= 0)) & ~(all(k > 0 & k < 1))
      error('K should be given either by integers or frequencies in (0,1).')
    end
  end
  if ~(all(m == round(m))) & ~(all(m > 0 & m < 1))
    error('M should be given either by integers or frequencies in (0,1).')
  end

  % Let GENDAT split a dataset whose only feature is the object index
  % 1..N (labelled like D); the data of the two parts are then used as
  % index vectors into D.
  [ja,jb] = gendat(dataset([1:n]',nlab),m);
  ja = +ja;
  jb = +jb;

  % Collect, class by class, the positions within ja of the
  % representation objects.
  J = [];
  for j = 1:c
    K = find(nlab(ja)==j);
    if ~isempty(k)
      % A value below 1 is a fraction of this class' training objects.
      if k(j) < 1
        k(j) = round(k(j)*length(K));
      end
      if k(j) > length(K)
        error('Requested size of the representation set is not possible.')
      end
      K = K(1:k(j));
    end
    J = [J; K(:)];
  end
  % Training objects that are not part of the representation set.
  M = setdiff([1:length(ja)]',J);

  % Representation objects come first in the training index list.
  Itr = [ja(J); ja(M)];
  Ite = jb;
  DTR = D(Itr,ja(J));
  DTE = D(Ite,ja(J));
126
Note: See TracBrowser for help on using the repository browser.