function [A,Nxi,A2] = createA(X,y,rtype,par,seed)
% [A,NXI,A2] = CREATEA(X,Y,RTYPE,PAR,SEED)
%
% Create the data matrix A containing (a subset of) all pairwise
% difference vectors between the negative and positive examples in data
% matrix X (labels Y must be -1/+1). Each row of A has the form
%    X(i_neg,:) - X(j_pos,:).
% Because the full set of pairwise differences can become huge, it can be
% reduced by choosing an appropriate RTYPE:
%
% RTYPE 'full'   use all constraints
%       'subs'   randomly subsample PAR constraints (with replacement)
%       'subk'   randomly subsample ceil(PAR*N) constraints, N = size(X,1)
%       'knn'    use the PAR nearest neighbors in the other class
%       'xval'   keep PAR constraints for training, return the remaining
%                ones in A2 (e.g. to optimize C)
%       'xvalk'  like 'xval' but keep PAR*N training constraints
%       'kmeans' represent each class by PAR k-means prototypes first
%       'randk'  subsample objects to get about PAR*(Npos+Nneg) constraints
%
% The SEED is optional; it seeds the random sampling for reproducibility.
%
% Output:
%   A    (Nxi x dim) matrix of difference-vector constraints
%   Nxi  number of constraints (rows) in A
%   A2   held-out constraints ('xval'/'xvalk' only, [] otherwise)

if nargin<5
	seed = [];
end
% If a seed is defined, set it. NOTE: rand('state',...) is the legacy
% generator interface; it is kept so that old seeded runs stay
% reproducible (the modern replacement is rng(seed)).
if ~isempty(seed)
	rand('state',seed);
end

A2 = [];

% Class indices and sizes:
Ineg = find(y==-1);
Ipos = find(y==+1);
Nneg = length(Ineg);
Npos = length(Ipos);

% Depending on the reduction type:
switch rtype
case 'full'   % take all the possibilities:
	[A,Nxi] = allPairDiffs(X,Ineg,Ipos);

case 'subk'   % subsample, not a fixed number but par times the number
	% of training objects:
	Nxi = ceil(par*size(X,1));
	A = samplePairDiffs(X,Ineg,Ipos,Nxi);

case 'subs'   % subsample a fixed number of possibilities:
	Nxi = par;
	A = samplePairDiffs(X,Ineg,Ipos,Nxi);

case 'knn'    % only use the par nearest neighbors in the other class
	% Guard against out-of-bounds neighbor requests (the original code
	% crashed with an obscure indexing error here):
	if (par>Npos) || (par>Nneg)
		error('createA: par=%d exceeds the smallest class size (%d) for rtype ''knn''.',...
			par,min(Npos,Nneg));
	end
	Nxi = ceil((Nneg+Npos)*par);
	A = zeros(Nxi,size(X,2));
	% first process all the neg. examples:
	D = sqeucldistm(X(Ineg,:),X(Ipos,:));
	[dummy,I] = sort(D,2);
	row = 0;
	for i=1:Nneg
		for j=1:par
			row = row+1;
			A(row,:) = X(Ineg(i),:) - X(Ipos(I(i,j)),:);
		end
	end
	% then all the pos. examples (transpose D, no need to recompute):
	[dummy,I] = sort(D',2);
	for i=1:Npos
		for j=1:par
			row = row+1;
			A(row,:) = X(Ineg(I(i,j)),:) - X(Ipos(i),:);
		end
	end

case 'randk'  % randomly chosen objects such that you end up with about
	% par*(Npos+Nneg) constraints:
	q = sqrt(par*(Npos+Nneg)/(Npos*Nneg));
	qpos = ceil(q*Npos);
	qneg = ceil(q*Nneg);
	% select the neg. examples, then the pos. examples:
	I = randperm(Nneg); In = Ineg(I(1:qneg));
	I = randperm(Npos); Ip = Ipos(I(1:qpos));
	[A,Nxi] = allPairDiffs(X,In,Ip);

case 'xval'   % take all possibilities and hold part out for testing:
	[A,Nxi] = allPairDiffs(X,Ineg,Ipos);
	[A,A2,Nxi] = splitConstraints(A,par);

case 'xvalk'  % like 'xval', but the training size scales with the data:
	[A,Nxi] = allPairDiffs(X,Ineg,Ipos);
	[A,A2,Nxi] = splitConstraints(A,par*size(X,1));

case 'kmeans' % represent each class by par k-means prototypes:
	wp = kmeans_dd(X(Ipos,:),0.1,par);
	wn = kmeans_dd(X(Ineg,:),0.1,par);
	Xp = wp.data.w;
	Xn = wn.data.w;
	% all prototype-pair differences (neg minus pos):
	[A,Nxi] = allPairDiffs([Xn;Xp],1:size(Xn,1),size(Xn,1)+(1:size(Xp,1)));

otherwise
	error('Type %s is not defined',rtype);
end

return

function [A,Nxi] = allPairDiffs(X,In,Ip)
% All pairwise differences X(In(i),:)-X(Ip(j),:); the positive index j
% runs fastest, matching the original row ordering.
Nn = length(In);
Np = length(Ip);
Nxi = Nn*Np;
A = zeros(Nxi,size(X,2));
row = 0;
for i=1:Nn
	for j=1:Np
		row = row+1;
		A(row,:) = X(In(i),:) - X(Ip(j),:);
	end
end
return

function A = samplePairDiffs(X,Ineg,Ipos,Nxi)
% Nxi randomly sampled (with replacement) neg-minus-pos difference
% vectors. The positive indices are drawn before the negative ones to
% keep the random stream identical to the original implementation.
Np = length(Ipos);
Nn = length(Ineg);
Ip = floor(Np*rand(Nxi,1))+1;
In = floor(Nn*rand(Nxi,1))+1;
A = zeros(Nxi,size(X,2));
for i=1:Nxi
	A(i,:) = X(Ineg(In(i)),:) - X(Ipos(Ip(i)),:);
end
return

function [A,A2,Nxi] = splitConstraints(A,par)
% Randomly split the constraint rows of A: par rows for training (A),
% the rest for evaluation (A2), capped at 100000 rows so that a really
% huge evaluation set stays manageable.
I = randperm(size(A,1));
if par>=size(A,1)
	warning('More constraints requested than available (%d and %d)',par,size(A,1));
	disp('Now using half for training and testing');
	par = ceil(size(A,1)/2);
end
Mega = 100000;
if length(I)-par>Mega
	A2 = A(I((par+1):(par+Mega)),:);
else
	A2 = A(I((par+1):end),:);
end
A = A(I(1:par),:);
Nxi = par;
return