[10] | 1 | %GENDDAT Generate random training and test sets for dissimilarity data |
---|
| 2 | % |
---|
| 3 | % [DTR,DTE,I,J] = GENDDAT(D,M,K) |
---|
| 4 | % |
---|
| 5 | % INPUT |
---|
| 6 | % D NxN dissimilarity dataset |
---|
| 7 | % M Cx1 vector of class sizes or frequencies, or a single number / frequency in(0,1) |
---|
| 8 | % K Cx1 vector of class sizes or frequencies, or a single number / frequency in(0,1) |
---|
| 9 | % (optional, default: K = M) |
---|
| 10 | % |
---|
| 11 | % OUTPUT |
---|
| 12 | % DTR Training dissimilarity dataset |
---|
| 13 | % DTE Test dissimilarity dataset |
---|
| 14 | % I Indices of the training objects |
---|
| 15 | % J Indices of the test objects |
---|
| 16 | % |
---|
| 17 | % DESCRIPTION |
---|
| 18 | % Generates random training and test sets from a square dissimilarity dataset D. |
---|
| 19 | % Feature labels and object labels of D should be equal. Note that M and K can be |
---|
| 20 | % either scalars or vectors with as many components as classes (=C), defining |
---|
| 21 | % specific sizes/fractions per class. |
---|
| 22 | % |
---|
| 23 | % By default, all training objects are used as the representation set (K=M). If M is a Cx1 vector, |
---|
| 24 | % then GENDDAT selects at random M(i) vectors out of the i-th class in D and stores them |
---|
| 25 | % in the dataset DTR of the size [sum(M)]x[sum(M)]. The remaining objects are stored in |
---|
| 26 | % [N-sum(M)]x[sum(M)] dissimilarity data DTE. Classes are ordered by using RENUMLAB(GETLAB(D)). |
---|
| 27 | % If M is a scalar, then M objects in total (given by number or frequency) are selected |
---|
| 28 | % at random according to the class priors. I and J are the indices of the training |
---|
| 29 | % and testing objects, respectively. |
---|
| 30 | % |
---|
| 31 | % If K is provided, then the first K(i) training objects of the i-th class are used |
---|
| 32 | % for the representation set. Note that K(i) can be a frequency. |
---|
| 33 | % |
---|
| 34 | % DEFAULT |
---|
| 35 | % K = M |
---|
| 36 | % |
---|
| 37 | % EXAMPLE |
---|
| 38 | % Let D be 100 x 100 dataset with two classes [40 60] and class priors [0.4 0.6]. |
---|
| 39 | % 1) [DTR,DTE] = GENDDAT(D,0.6) |
---|
| 40 | % DTR is 60x60 and DTE is 40x60. 60% of objects of the first class (24 in total) |
---|
| 41 | % and 60% of the objects of the second class (36 in total) are selected for DTR. |
---|
| 42 | % |
---|
| 43 | % 2) [DTR,DTE] = GENDDAT(D,0.6,0.1) |
---|
| 44 | % DTR is 60x6 and DTE is 40x6. 60% of objects of the first class (24 in total) |
---|
| 45 | % and 60% of the objects of the second class (36 in total) are selected for training. |
---|
| 46 | % From that, 10% of the first training objects per class are selected for the |
---|
| 47 | % representation set. 10% of 24 rounds to 2 objects for the first class, while |
---|
| 48 | % 10% of 36 rounds to 4 objects for the second class. |
---|
| 49 | % |
---|
| 50 | % SEE ALSO |
---|
| 51 | % DATASETS, RENUMLAB |
---|
| 52 | % |
---|
| 53 | |
---|
| 54 | % Copyright: R.P.W. Duin, r.p.w.duin@prtools.org, and |
---|
| 55 | % Elzbieta Pekalska, ela.pekalska@googlemail.com |
---|
| 56 | % Faculty EWI, Delft University of Technology, and |
---|
| 57 | % School of Computer Science, University of Manchester |
---|
| 58 | |
---|
| 59 | |
---|
function [DTR,DTE,Itr,Ite] = genddat(D,m,k)
% Split a square dissimilarity dataset into a random training part and the
% remaining test part, both expressed against a representation set that is
% drawn from the training objects.
%
%   D    Square dataset; checked with DISCHECK (similarities are allowed).
%   m    Training set size(s), forwarded unchanged to GENDAT: integers or
%        frequencies in (0,1).
%   k    Optional representation set size(s): a scalar (applied to every
%        class) or one value per class, given either as non-negative
%        integers or as frequencies in (0,1) of the per-class training
%        size. When omitted or empty, all training objects are used.
%
%   DTR  D(Itr,R), where R are the representation objects.
%   DTE  D(Ite,R).
%   Itr  Indices into D of the training objects; the representation
%        objects come first, followed by the other training objects.
%   Ite  Indices into D of the test objects (second output of GENDAT).
%
% Errors are raised when K has the wrong length, mixes integers with
% frequencies, contains negative values, or asks for more objects than a
% class holds in the training set.

if nargin < 2
  error('The training set size M should be specified.')
end

[n,nk,c] = getsize(D);
nlab = getnlab(D);
discheck(D,[],1);   % allow for similarities

% An explicitly empty K means the same as an omitted one: K = M.
if nargin < 3 || isempty(k)
  k = [];
else
  if length(k) == 1
    k = k*ones(1,c);          % same size/frequency for every class
  elseif length(k) ~= c
    error('Vector length of the number of objects should equal the number of classes.')
  end

  % K is either all counts or all frequencies. Negative "counts" are
  % rejected here; otherwise they would be mistaken for frequencies below
  % and silently give an empty selection for that class.
  kv = k(:);
  isCount = all(kv == round(kv)) && all(kv >= 0);
  isFreq  = all(kv > 0 & kv < 1);
  if ~isCount && ~isFreq
    error('K should be given either by integers or frequencies in (0,1).')
  end
end

mv = m(:);
if ~all(mv == round(mv)) && ~all(mv > 0 & mv < 1)
  error('M should be given either by integers or frequencies in (0,1).')
end

% Let GENDAT split the object indices 1..n (labelled as in D) at random;
% unary plus extracts the plain index values from the returned datasets.
[ja,jb] = gendat(dataset([1:n]',nlab),m);
ja = +ja;
jb = +jb;

% J collects, class by class, the positions within ja of the objects that
% form the representation set.
trlab = nlab(ja);
J = [];
for j = 1:c
  K = find(trlab == j);
  if ~isempty(k)
    if k(j) < 1
      % Frequency: convert to a count relative to this class's training size.
      k(j) = round(k(j)*length(K));
    end
    if k(j) > length(K)
      error('Requested size of the representation set is not possible.')
    end
    K = K(1:k(j));            % the first k(j) training objects of class j
  end
  J = [J; K(:)];
end
% Training objects that are not part of the representation set.
M = setdiff([1:length(ja)]',J);

Itr = [ja(J); ja(M)];
Ite = jb;
DTR = D(Itr,ja(J));
DTE = D(Ite,ja(J));
---|
| 109 | |
---|