1 | %PR_DOWNLOAD_UCI Load UCI data and convert to PRTools format
|
---|
2 | %
|
---|
3 | % [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
|
---|
4 | %
|
---|
5 | % INPUT
|
---|
6 | % UCIDIR Name of desired UCI repository directory
|
---|
7 | % DNAMS Character cell array of UCI data file names to be downloaded
|
---|
8 | % or some full urls.
|
---|
9 | % OPTIONS Structure with options for parsing, see PR_DOWNLOAD
|
---|
10 | % OPTIONS may also be a cell array with options, one for every
|
---|
11 | % data file. Common fields may be defined in an additional
|
---|
12 | % element of the cell array.
|
---|
13 | %
|
---|
14 | % DESCRIPTION
|
---|
15 | % This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
|
---|
16 | % and converts them into PRTools datasets. The downloaded files are stored
|
---|
17 | % as .dat-files, the PRTools datasets as .mat-files in the directory of
|
---|
18 | % this routine. The file names of the calling routine are used. Various
|
---|
19 | % annotations are stored in the user-field of the PRTools datasets.
|
---|
20 | %
|
---|
21 | % This routine also accepts an old, undocumented format.
|
---|
22 | %
|
---|
23 | % EXAMPLE
|
---|
24 | % opt.nheadlines = 5;
|
---|
25 | % [a,b] = pr_download_uci('Image+Segmentation', ...
|
---|
26 | % {'segmentation.data','segmentation.test'},opt);
|
---|
27 | %
|
---|
28 | % SEE ALSO <a href="http://prtools.tudelft.nl/prtools/">PRTools Guide</a>
|
---|
29 |
|
---|
30 | % Copyright: R.P.W. Duin
|
---|
31 |
|
---|
function varargout = pr_download_uci(name,varargin)
% Download one or more data files of the UCI data set NAME, convert them by
% PR_DOWNLOAD and return one dataset per file. In case of multiple files
% and a single requested output the vertically concatenated set is returned.
%
%   name      name of the UCI repository directory (appended to the UCI
%             datasets url by PARSELINK)
%   varargin  {DNAMS,OPTIONS}, see the help text of this file. Calls with
%             more than three arguments, or with a character third
%             argument, are forwarded to PR_DOWNLOAD_UCI_OLD.

%% make sure there is a data subdir next to this routine
persistent DATADIREXISTS
if isempty(DATADIREXISTS)
  datasubdir = fullfile(fileparts(which(mfilename)),'data');
  if exist(datasubdir,'dir') ~= 7
    mkdir(datasubdir);
  end
  DATADIREXISTS = true;
end

%% handle old format
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

if iscell(opt)
  if numel(opt) == nfiles+1
    % multiple structures to combine, opt{end} holds the common fields
    for n=1:nfiles
      % Start from the common options and copy the file specific fields
      % on top of them, field by field. Building the result by
      % struct(M{:}) is not safe: a cell valued option would create a
      % struct array and a field that is present in both structures
      % would be passed twice.
      merged = opt{end};
      if isstruct(opt{n})
        fields = fieldnames(opt{n});
        for i=1:numel(fields)
          merged.(fields{i}) = opt{n}.(fields{i});
        end
      end
      opt{n} = merged;
    end
  elseif numel(opt) ~= nfiles
    error('Number op option-structures is wrong')
  end
else
  opt = repmat({opt},1,nfiles+1);
end
% Now opt{n} should correspond to ucinames{n}

% Decide now whether the combined dataset has to be saved. The loop over
% the files below forces the matfile field of opt{j} to false, which
% would hide the user's choice when opt{end} is the options structure of
% the last file (no separate element with common options given).
savecombined = nfiles > 1 && ...
  (~isfield(opt{end},'matfile') || opt{end}.matfile);

%% where to store: names of data file, mat file and directory
% NOTE: callername and callerdir inspect the call stack and have to be
% called directly from this function.
comname = callername;  % filenames
datadir = callerdir;   % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout = cell(1,nfiles);
% might be too large, will be corrected

%% if matfiles available, use them
[varargout{:}] = pr_loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% get UCI info
data = parselink(name);
url = data.url;

%% Handle for all data files
for j=1:nfiles
  uciname = ucinames{j};
  % strncmp instead of indexing uciname(1:4): names shorter than four
  % characters are valid file names and should not raise an index error
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if nfiles > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  % saving of the mat-file is done here, after the labels are set, so
  % pr_download is told not to save it
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile = false;
  % NOTE(review): field name spelled as in the original code, presumably
  % the spelling expected by pr_download - confirm. A user supplied value
  % is overwritten.
  opt{j}.delimeter= ',';
  opt{j} = fielddef(opt{j},'dsetname',callername);
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user');        % store dataset info
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last one
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user');        % store dataset info
  if savecombined
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1   % just combined set is requested
    varargout{1} = a;
  end
end
|
---|
143 |
|
---|
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Old, positional calling format. The arguments following NAME are, in
% this order: ucinames, form, prname, siz, nhead, misvalchar, delchar,
% cfeat, nosave. One dataset is returned for every data file. Datasets
% are stored as mat-files in the data subdirectory of this routine and
% are reloaded from there if available.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
  setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

% headerlines to be skipped. The default is constructed after ucinames is
% converted into a cell array, such that it has one element per file and
% not one per character of a single file name.
nhead = setdefaults({nhead},zeros(1,nfiles));

if isempty(cfeat)
  cfeat = repmat({[]},1,nfiles);
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,nfiles);
end

% NOTE: callername inspects the call stack and has to be called directly
% from this function.
prname = setdefaults({prname},callername(1));
prname = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,nfiles);
end

varargout = cell(1,nfiles);
anynew = false;
filenames = cell(1,nfiles);
for j=1:nfiles
  uciname = ucinames{j};
  if nfiles > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(thisdir,'data',dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it
    a = file2dset([filenames{j} '.mat']);
  else
    if ~exist('data','var')
      % get UCI info, just once, at the first file to be downloaded
      data = parselink(name);
      if ~data.misval % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % strncmp instead of indexing uciname(1:4): names shorter than four
    % characters should not raise an index error
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are given by a separate file on the UCI site
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user');      % store dataset info
    a = setname(a,dataname);         % set dataset name
    save([filenames{j} '.mat'],'a'); % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  for j=1:nfiles
    a = varargout{j};
    if ~nosave
      % store the aligned version
      save(filenames{j},'a');
    end
  end
end
|
---|
239 |
|
---|
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% data.link   : url of the particular data set pages
% data.info   : url of the data set info page
% data.url    : url of the data set data files (excluding the filename, as
%               there might be more files and their names are irregular)
% data.desc   : the problem info as given in the abstract, '' if the page
%               has no abstract
% data.misval : true/false for missing values, [] if not found
% data.type   : feature types (categorical / integer / real)
%
% The page is parsed by searching for fixed markers and fixed offsets.
% The first occurrence of a marker is used. An error is raised if the
% download section of the page cannot be found.

base = 'http://archive.ics.uci.edu/ml/';
link = [base 'datasets/' link];
desc = urlread(link);
ndesc = numel(desc);

% urls of data directory and info page: taken from the quoted strings
% in the first 251 characters of the download section
k = strfind(desc,'Download');
if isempty(k)
  error('pr_download_uci:parselink', ...
    'No download section found on page %s',link);
end
s = desc(k(1):min(k(1)+250,ndesc));
q = strfind(s,'"');
if numel(q) < 6
  error('pr_download_uci:parselink', ...
    'Download section of page %s cannot be parsed',link);
end
url  = [base s(q(1)+4:q(2)-1)];
info = [base s(q(5)+3:q(6)-1)];

% abstract: text between the marker and the next '</p>'. The default
% makes sure that data.desc can be set for pages without an abstract.
descr = '';
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% feature types: comma separated list, converted into a char array with
% one row per type
type = '';
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(k)
  chunk = desc(k(1)+64:min(k(1)+150,ndesc));
  kend = strfind(chunk,'</p');
  if ~isempty(kend)
    fields = textscan(chunk(1:kend(1)-1),'%s','delimiter',',');
    type = char(fields{1});
  end
end

% missing values: 'No' or 'Yes' at a fixed offset from the marker
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  p = k1(1)+53;
  if p+1 <= ndesc && strcmp(desc(p:p+1),'No')
    misval = false;
  elseif p+2 <= ndesc && strcmp(desc(p:p+2),'Yes')
    misval = true;
  end
end

data.link   = link;
data.info   = info;
data.url    = url;
data.desc   = descr;
data.misval = misval;
data.type   = type;
|
---|
285 |
|
---|
function s = fielddef(s,field,x)
% Set the default X for the field FIELD of the structure S: the field is
% created with value X if it does not exist, an existing field is left
% untouched.
if isfield(s,field)
  return
end
s.(field) = x;
|
---|
290 |
|
---|
function name = callername(n)
%% Name of a function on the call stack
% Returns the name stored in entry N+3 of the stack reported by dbstack,
% as seen from inside this function. N defaults to 0. The empty matrix
% is returned if the stack has fewer than N+3 entries.
if nargin < 1
  n = 0;
end
stack = dbstack;
depth = n+3;
name = [];
if length(stack) >= depth
  name = stack(depth).name;
end
|
---|
300 |
|
---|
function dirname = callerdir(n)
%% Directory of a function on the call stack
% Returns the directory of the function stored in entry N+3 of the stack
% reported by dbstack, as seen from inside this function. N defaults to 0.
% The present working directory is returned if the stack has fewer than
% N+3 entries.
if nargin < 1, n=0; end
ss = dbstack;
% test against n+3, the entry that is used below, so that n > 0 cannot
% index beyond the end of the stack
if length(ss) < n+3
  % no caller, commandline call
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end
|
---|