function [A,Nxi,A2] = createA(X,y,rtype,par,seed)
% [A,NXI,A2] = CREATEA(X,Y,RTYPE,PAR,SEED)
%
% Create the data matrix A containing pairwise difference vectors between
% the objects in data matrix X (with corresponding labels Y, -1/+1).
% Because the set of ALL pairwise difference vectors can become huge, it
% can be subsampled by choosing an appropriate RTYPE:
%
% RTYPE  'full'    use all constraints
%        'subs'    randomly subsample PAR constraints
%        'subk'    randomly subsample PAR times the number of training
%                  objects as constraints
%        'knn'     use the PAR nearest neighbors in the other class
%        'xval'    subsample PAR constraints and use the remaining
%                  constraints to optimize C
%        'xvalk'   subsample PAR times the number of training objects and
%                  use the remaining constraints to optimize C
%        'kmeans'  use k-means clustering with k=PAR prototypes per class
%        'randk'   subsample objects to get PAR*(Npos+Nneg) constraints
%
% The optional SEED is the seed for the random sampling. A contains one
% difference vector per row, NXI is the number of rows (one slack
% variable per constraint) and A2 contains the held-out constraints for
% the 'xval'/'xvalk' options (empty otherwise).
%
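% Example (illustrative calls only; the 3-NN setting, the 500-constraint
% budget and the seed 42 below are arbitrary choices, not defaults):
%   [A,Nxi]    = createA(X,y,'knn',3);        % 3 nearest neighbors per object
%   [A,Nxi,A2] = createA(X,y,'xval',500,42);  % 500 constraints, seed 42,
%                                             % remaining constraints in A2
%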
if nargin<5
    seed = [];
end
% If a seed is defined, set it:
if ~isempty(seed)
    rand('state',seed);
end
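% (Note: rand('state',...) is the legacy seeding interface; newer MATLAB
% releases provide rng(seed) as the documented replacement.)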

A2 = [];
%---create A for optauc

k = size(X,2);   % number of features

% count the objects per class (this determines how many xi's we expect):
Ineg = find(y==-1);
Ipos = find(y==+1);
Nneg = length(Ineg);
Npos = length(Ipos);

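% Each row of A is one difference vector x_neg - x_pos between a negative
% and a positive object; presumably each row later acts as one ranking
% constraint in the AUC optimization, with one slack variable xi per row
% (hence the name Nxi for the number of rows).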
% depending on the reduction type:
switch rtype
case 'full'   % take all the possibilities:
    Nxi = Nneg*Npos;
    A = zeros(Nxi,k);
    % run over all possibilities:
    dummyk = 0;
    for i=1:Nneg
        for j=1:Npos
            dummyk = dummyk+1;
            A(dummyk,:) = X(Ineg(i),:) - X(Ipos(j),:);
        end
    end
case 'subk'   % subsample the possibilities; not a fixed number, but
    % PAR times the number of training objects:
    Nxi = ceil(par*size(X,1));
    A = zeros(Nxi,k);
    % draw Nxi random positive/negative index pairs (with replacement):
    Ip = floor(Npos*rand(Nxi,1))+1;
    In = floor(Nneg*rand(Nxi,1))+1;
    for i=1:Nxi
        A(i,:) = X(Ineg(In(i)),:) - X(Ipos(Ip(i)),:);
    end
case 'subs'   % subsample a fixed number PAR of the possibilities:
    Nxi = par;
    A = zeros(Nxi,k);
    Ip = floor(Npos*rand(Nxi,1))+1;
    In = floor(Nneg*rand(Nxi,1))+1;
    for i=1:Nxi
        A(i,:) = X(Ineg(In(i)),:) - X(Ipos(Ip(i)),:);
    end
case 'knn'   % only use the PAR nearest neighbors in the other class
    Nxi = ceil((Nneg+Npos)*par);
    A = zeros(Nxi,k);
    % first process all the neg. examples:
    D = sqeucldistm(X(Ineg,:),X(Ipos,:));   % squared Euclidean distances
    [dummy,I] = sort(D,2);
    dummyk = 0;
    for i=1:Nneg
        for j=1:par
            thispos = I(i,j);
            dummyk = dummyk+1;
            A(dummyk,:) = X(Ineg(i),:) - X(Ipos(thispos),:);
        end
    end
    % then process all the pos. examples:
    D = D';   % (no need to recompute D)
    [dummy,I] = sort(D,2);
    for i=1:Npos
        for j=1:par
            thisneg = I(i,j);
            dummyk = dummyk+1;
            A(dummyk,:) = X(Ineg(thisneg),:) - X(Ipos(i),:);
        end
    end
case 'randk'   % randomly chosen objects such that you get about
    % PAR*(Npos+Nneg) constraints.
    % Taking a fraction q of each class gives (q*Npos)*(q*Nneg)
    % constraints; solving q^2*Npos*Nneg = PAR*(Npos+Nneg) for q:
    q = sqrt(par*(Npos+Nneg)/(Npos*Nneg));
    qpos = ceil(q*Npos); qneg = ceil(q*Nneg);
    Nxi = qpos*qneg;
    A = zeros(Nxi,k);

    % first select the neg. examples:
    I = randperm(Nneg); In = Ineg(I(1:qneg));
    % then select the pos. examples:
    I = randperm(Npos); Ip = Ipos(I(1:qpos));
    % run over all possibilities:
    dummyk = 0;
    for i=1:qneg
        for j=1:qpos
            dummyk = dummyk+1;
            A(dummyk,:) = X(In(i),:) - X(Ip(j),:);
        end
    end
case 'xval'   % take all the possibilities and keep part for evaluation:
    Nxi = Nneg*Npos;
    A = zeros(Nxi,k);
    % run over all possibilities:
    dummyk = 0;
    for i=1:Nneg
        for j=1:Npos
            dummyk = dummyk+1;
            A(dummyk,:) = X(Ineg(i),:) - X(Ipos(j),:);
        end
    end
    % use PAR constraints for training and return the rest in A2 for
    % evaluation (e.g. to optimize C):
    I = randperm(Nxi);
    if par>=size(A,1)
        warning('More constraints requested than available (%d and %d)',par,size(A,1));
        disp('Now using half for training and half for testing');
        par = ceil(size(A,1)/2);
    end
    % if the data is really huge, also limit the evaluation set:
    Mega = 100000;
    if length(I)-par > Mega
        A2 = A(I((par+1):(par+Mega)),:);
    else
        A2 = A(I((par+1):end),:);
    end
    A = A(I(1:par),:);
    Nxi = par;
case 'xvalk'   % like 'xval', but request PAR times the number of objects:
    par = ceil(par*size(X,1));   % round up so par can be used as a count
    Nxi = Nneg*Npos;
    A = zeros(Nxi,k);
    % run over all possibilities:
    dummyk = 0;
    for i=1:Nneg
        for j=1:Npos
            dummyk = dummyk+1;
            A(dummyk,:) = X(Ineg(i),:) - X(Ipos(j),:);
        end
    end
    % use part of the constraints for training and return the rest in A2
    % for evaluation (e.g. to optimize C):
    I = randperm(Nxi);
    if par>=size(A,1)
        warning('More constraints requested than available (%d and %d)',par,size(A,1));
        disp('Now using half for training and half for testing');
        par = ceil(size(A,1)/2);
    end
    % if the data is really huge, also limit the evaluation set:
    Mega = 100000;
    if length(I)-par > Mega
        A2 = A(I((par+1):(par+Mega)),:);
    else
        A2 = A(I((par+1):end),:);
    end
    A = A(I(1:par),:);
    Nxi = par;
case 'kmeans'   % represent each class by PAR cluster centers
    % (kmeans_dd is the k-means routine from the dd_tools toolbox; the
    % cluster centers are read from the data field of the returned mapping)
    wp = kmeans_dd(X(Ipos,:),0.1,par);
    wn = kmeans_dd(X(Ineg,:),0.1,par);
    Xp = wp.data.w;
    Xn = wn.data.w;
    Nxi = par*par;
    A = zeros(Nxi,k);
    dummyk = 0;
    for i=1:par
        for j=1:par
            dummyk = dummyk + 1;
            A(dummyk,:) = Xn(i,:) - Xp(j,:);
        end
    end
otherwise
    error('Type %s is not defined',rtype);
end

return