function [w,returnA,r2] = auclpm(x, C, rtype, par, unitnorm, usematlab)
%AUCLPM Find linear mapping with optimized AUC
%
%     W = AUCLPM(X, C, RTYPE, PAR)
%
% Optimize the AUC on dataset X with regularization parameter C. This is
% done by finding the weights W for which the ordering of the objects,
% mapped onto the line defined by W, is optimal. That means that objects
% from class +1 are always mapped above objects from the -1 class. This
% results in a constraint for each (+1,-1) pair of objects, so the number
% of constraints becomes very large. The AUC constraints can be
% subsampled in different ways:
%
%     RTYPE     PAR
%     'full',   -   use all constraints
%     'subs',   N   subsample just N constraints
%     'subk',   k   subsample just k*#trainobj. constraints
%     'knn'     k   use only the k nearest neighbors
%     'xval'    N   subsample just N constraints and use the rest to
%                   optimize C (this version can be very slow)
%     'xvalk'   k   subsample k*#trainobj and use remaining constraints
%                   to optimize C
%     'kmeans'  k   use k-means clustering with k=PAR
%     'randk'       subsample objects to get PAR*(Npos+Nneg) constraints
%
%     W = AUCLPM(X, C, RTYPE, PAR, UNITNORM)
%
% Finally, per default the difference vectors are normalized to unit
% length. If you don't like that, set UNITNORM to 0.
%
% Default: RTYPE='subk'
%          PAR = 1.0;
prtrace(mfilename);

% Fill in defaults for the missing inputs.
if (nargin < 6), usematlab = 0; end
if (nargin < 5), unitnorm = 1; end
if (nargin < 4), par = 1.00; end
if (nargin < 3), rtype = 'subk'; end
if (nargin < 2)
  prwarning(3,'Lambda set to ten');
  C = 10;
end

% Construct the descriptive name of this mapping.
if unitnorm
  mapname = sprintf('AUC-LP %s (%s) 1norm',rtype,num2str(par));
else
  mapname = sprintf('AUC-LP %s (%s)',rtype,num2str(par));
end

% No data given: return an untrained mapping.
if (nargin < 1) | (isempty(x))
  w = mapping(mfilename,{C,rtype,par,unitnorm,usematlab});
  w = setname(w,mapname);
  return
end

if ~ismapping(C)   % train the mapping

  % Check and unpack the dataset.
  islabtype(x,'crisp');
  isvaldset(x,1,2);   % at least 1 object per class, 2 classes
  [n,k,c] = getsize(x);
  % Sanity-check the parameter:
  if (par <= 0)
    error('Parameter ''par'' should be larger than zero');
  end

  % Only the two-class case is supported.
  if (c ~= 2)
    error('Only a two-class classifier is implemented');
  end

  lablist = getlablist(x);
  dim = size(x,2);

  % Create the target values (+1 and -1). Make an exception for a
  % one-class or MIL dataset: prefer a class labeled 'target', then
  % 'positive'; otherwise simply take the first class as target.
  tcl = strmatch('target',lablist);
  if isempty(tcl)          % no target class defined
    tcl = strmatch('positive',lablist);
    if isempty(tcl)        % no positive class defined either
      tcl = 1;
    end
  end
  y = 2*(getnlab(x)==tcl) - 1;
  tlab = lablist(tcl,:);

  % Work on the plain data matrix; this makes the mapping much faster.
  X = +x; clear x;

  % Create the constraint matrix A for the AUC optimization, using a
  % fixed seed; the random state is saved and restored around it.
  oldstate = rand('state');
  seed = 0;
  [A,Nxi,Aval] = createA(X,y,rtype,par,seed);
  rand('state',oldstate);

  if unitnorm
    % Normalize each row of A to unit length. Zero-length rows are
    % skipped (when labels are flipped, terrible things can happen).
    len = sqrt(sum(A.*A,2));
    nz = find(len~=0);
    A(nz,:) = A(nz,:)./repmat(len(nz,:),1,size(A,2));
    if ~isempty(Aval)
      % Give the validation constraints the same treatment.
      len = sqrt(sum(Aval.*Aval,2));
      nz = find(len~=0);
      Aval(nz,:) = Aval(nz,:)./repmat(len(nz,:),1,size(Aval,2));
    end
  end

  % Keep the plain constraint matrix to evaluate the training AUC later.
  A0 = A;
  % The weights are split in a positive and a negative part, so the
  % negated columns must be present in the constraints:
  A = [A -A];
  % Take also care of the slack variables xi:
  A = [A -speye(Nxi)];

  %--- right-hand side b
  b = -ones(Nxi,1);   % no zeros, otherwise we get w=0
                      % (the constraint is changed here to <= -1)
  %--- lower bound constraints
  lb = zeros(2*k+Nxi,1);

  % Should we run over a range of Cs?
  if ~isempty(Aval)
    M = 25;
    xtr = zeros(M,1);
    xval = zeros(M,1);
    C = logspace(-3,3,M);
    % Run over all the Cs:
    for i=1:length(C)
      %--- objective f for this C: L1 weight norm plus slack penalty
      f = [ones(2*k,1); repmat(C(i),Nxi,1)];
      %--- solve the linear program
      if (exist('glpk')>0) & ~usematlab
        [z,fmin,status]=glpk(f,A,b,lb,[],repmat('U',Nxi,1),...
          repmat('C',size(f,1),1),1);
      elseif (exist('glpkmex')>0) & ~usematlab
        [z,fmin,status]=glpkmex(1,f,A,b,repmat('U',Nxi,1),...
          lb,[],repmat('C',size(f,1),1));
      else
        opts = optimset('Display','off','LargeScale','on','Diagnostics','off');
        z = linprog(f,A,b,[],[],lb,[],[],opts);
      end
      % Fraction of satisfied validation constraints (= AUC):
      constr = Aval*(z(1:k)-z(k+1:2*k));
      I = find(constr<0);
      if ~isempty(I)
        xval(i) = length(I)/size(constr,1);
      end
      % Idem for the training constraints:
      constr = A0*(z(1:k)-z(k+1:2*k));
      I = find(constr<0);
      if ~isempty(I)
        xtr(i) = length(I)/size(constr,1);
      end
    end
    % Optionally hand the AUC curves back to the caller.
    if nargout>1
      returnA = xval;
      if nargout>2
        r2 = xtr;
      end
    end
    % Pick the C with the best validation AUC.
    [bestauc,besti] = max(xval);
    C = C(besti);
    message(4,'Optimum C = %f\n',C);
  end

  %--- final objective f with the (possibly optimized) C
  f = [ones(2*k,1); repmat(C,Nxi,1)];
  %--- solve the final linear program
  if (exist('glpkmex')>0) & ~usematlab
    prwarning(7,'Use glpkmex');
    param.msglev=0;
    [z,fmin,status,xtra]=glpkmex(1,f,A,b,repmat('U',Nxi,1),...
      lb,[],repmat('C',size(f,1),1),param);
    alpha = xtra.lambda;
  else
    [z,fmin,exitflag,outp,alpha] = linprog(f,A,b,[],[],lb);
  end

  %--- extract the parameters from the solution vector
  u = z(1:k); u = u(:);
  v = z(k+1:2*k); v = v(:);
  zeta = z(2*k+1:2*k+Nxi); zeta = zeta(:);

  % Find out how sparse the result is:
  rel = (abs(u-v)>0);
  nr = sum(rel);

  % And store the results:
  W.u = u;   % the ultimate weights
  W.v = v;
  W.alpha = alpha;
  W.zeta = zeta;
  W.nr = nr;
  W.rel = rel;
  W.C = C;
  w = mapping(mfilename,'trained',W,tlab,dim,1);
  w = setname(w,sprintf('%s %s',mapname,rtype));

else
  % Evaluate the classifier on new data:
  W = getdata(C);
  n = size(x,1);

  % The mapping is linear:
  out = x*(W.u-W.v);

  % Put it nicely in a prtools dataset. (I am not really sure what I
  % should output, I decided to give a 1D output:)
  w = setdat(x,out,C);

end

return
---|