source: prextra/createA.m

Last change on this file was 5, checked in by bduin, 14 years ago
File size: 5.0 KB
Line 
function [A,Nxi,A2] = createA(X,y,rtype,par,seed)
% [A,Nxi,A2] = CREATEA(X,Y,RTYPE,PAR,SEED)
%
% Create the data matrix containing pairwise difference vectors
% (negative object minus positive object) taken from data matrix X,
% whose rows are labeled by Y (-1/+1).
% Because the number of ALL pairwise difference vectors (Nneg*Npos) can
% become huge, the set can be reduced by choosing an appropriate RTYPE.
%
%  RTYPE 'full'   use all constraints
%        'subs'   randomly subsample PAR constraints
%        'subk'   randomly subsample ceil(PAR*N) constraints, where N is
%                 the number of rows of X
%        'knn'    use the PAR nearest neighbors in the other class
%        'xval'   randomly keep PAR constraints, return the remaining
%                 ones (at most 100000) in A2, e.g. to optimize C
%        'xvalk'  as 'xval', but keep ceil(PAR*N) constraints
%        'kmeans' use k-means clustering with k=PAR per class and take the
%                 differences between the cluster centers
%        'randk'  subsample objects to get about PAR*(Npos+Nneg)
%                 constraints
%
% Input:
%   X      data matrix, one object per row
%   Y      label per row of X; only rows labeled -1 or +1 are used
%   RTYPE  reduction type (string), see above
%   PAR    parameter of the reduction type (not used by 'full')
%   SEED   optional; when given and not empty it is used to seed the
%          random generator through rand('state',SEED)
%
% Output:
%   A      matrix with one difference vector per row
%   Nxi    number of constraints (for 'knn' computed as
%          ceil((Nneg+Npos)*PAR))
%   A2     held-out difference vectors ('xval' and 'xvalk'), otherwise []

if nargin<5
    seed = [];
end
% If a seed is defined, set it:
if ~isempty(seed)
    rand('state',seed);
end

A2 = [];

k = size(X,2);

% indices and sizes of both classes:
Ineg = find(y==-1);
Ipos = find(y==+1);
Nneg = length(Ineg);
Npos = length(Ipos);

% depending on the reduction type
switch rtype
case 'full'  % take all the possibilities:
    Nxi = Nneg*Npos;
    A = pairdiffs(X,Ineg,X,Ipos,k);
case 'subk'  % subsample, not a fixed number but PAR times the number
             % of training objects:
    Nxi = ceil(par*size(X,1));
    A = randomdiffs(X,Ineg,Ipos,Nxi,k);
case 'subs'  % subsample a fixed number of possibilities:
    Nxi = par;
    A = randomdiffs(X,Ineg,Ipos,Nxi,k);
case 'knn'  % only use the PAR nearest neighbors in the other class
    % Without this check the loops below fail with an index-out-of-bounds
    % error as soon as PAR exceeds the size of the other class.
    if (Nneg>0 && par>Npos) || (Npos>0 && par>Nneg)
        error('createA:tooManyNeighbors',...
            'Cannot find %d neighbors in classes of size %d and %d',...
            par,Nneg,Npos);
    end
    Nxi = ceil((Nneg+Npos)*par);
    A = zeros(Nxi,k);
    % first process all the neg. examples:
    D = sqeucldistm(X(Ineg,:),X(Ipos,:));
    [dummy,I] = sort(D,2);
    row = 0;
    for i=1:Nneg
        for j=1:par
            row = row+1;
            A(row,:) = X(Ineg(i),:)-X(Ipos(I(i,j)),:);
        end
    end
    % then do all the pos. examples:
    D = D';  % (no need to recompute D)
    [dummy,I] = sort(D,2);
    for i=1:Npos
        for j=1:par
            row = row+1;
            A(row,:) = -X(Ipos(i),:)+X(Ineg(I(i,j)),:);
        end
    end
case 'randk'  % randomly chosen objs such that you have about
              % PAR*(Npos+Nneg) constraints
    q = sqrt(par*(Npos+Nneg)/(Npos*Nneg));
    % never request more objects than a class contains (happens for q>1):
    qpos = min(Npos,ceil(q*Npos));
    qneg = min(Nneg,ceil(q*Nneg));
    Nxi = qpos*qneg;

    % first select the neg. examples:
    I = randperm(Nneg); In = Ineg(I(1:qneg));
    % then select the pos. examples:
    I = randperm(Npos); Ip = Ipos(I(1:qpos));
    % run over all possibilities:
    A = pairdiffs(X,In,X,Ip,k);
case 'xval'  % take all the possibilities and use part for testing:
    A = pairdiffs(X,Ineg,X,Ipos,k);
    [A,A2,Nxi] = splitconstraints(A,par);
case 'xvalk'  % as 'xval', with the number of constraints relative to
              % the number of objects:
    % round up (as in 'subk'): a fractional count gives non-integer
    % subscripts in the split
    par = ceil(par*size(X,1));
    A = pairdiffs(X,Ineg,X,Ipos,k);
    [A,A2,Nxi] = splitconstraints(A,par);
case 'kmeans'
    wp = kmeans_dd(X(Ipos,:),0.1,par);
    wn = kmeans_dd(X(Ineg,:),0.1,par);
    Xp = wp.data.w;
    Xn = wn.data.w;
    Nxi = par*par;
    A = pairdiffs(Xn,1:par,Xp,1:par,k);
otherwise
    error('createA:unknownType','Type %s is not defined',rtype);
end

return

function A = pairdiffs(Xa,Ia,Xb,Ib,k)
% All differences Xa(Ia(i),:)-Xb(Ib(j),:), one per row with k columns;
% the index j into Ib runs fastest.
na = length(Ia);
nb = length(Ib);
A = zeros(na*nb,k);
row = 0;
for i=1:na
    for j=1:nb
        row = row+1;
        A(row,:) = Xa(Ia(i),:)-Xb(Ib(j),:);
    end
end

return

function A = randomdiffs(X,Ineg,Ipos,Nxi,k)
% Nxi differences between a randomly drawn negative and a randomly drawn
% positive object (drawn with replacement).
A = zeros(Nxi,k);
% the positive indices are drawn first, then the negative ones:
Ip = floor(length(Ipos)*rand(Nxi,1))+1;
In = floor(length(Ineg)*rand(Nxi,1))+1;
for i=1:Nxi
    A(i,:) = X(Ineg(In(i)),:)-X(Ipos(Ip(i)),:);
end

return

function [A,A2,Nxi] = splitconstraints(A,par)
% Randomly split the rows of A: par rows are kept in A, the remaining
% rows (at most 100000) are returned in A2 for evaluation.
Nall = size(A,1);
I = randperm(Nall);
if par>=Nall
    warning('createA:tooManyConstraints',...
        'More constraints requested than available (%d and %d)',par,Nall);
    disp('Now using half for training and testing');
    par = ceil(Nall/2);
end
% if data is really really huge, then subsample more...
Mega = 100000;
if Nall-par>Mega
    A2 = A(I((par+1):(par+Mega)),:);
else
    A2 = A(I((par+1):end),:);
end
A = A(I(1:par),:);
Nxi = par;

return
Note: See TracBrowser for help on using the repository browser.