%PR_DOWNLOAD_UCI Load UCI data and convert to PRTools format
%
%   [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
%
% INPUT
%   UCIDIR   Name of desired UCI repository directory
%   DNAMS    Character cell array of UCI data file names to be downloaded
%            or some full urls.
%   OPTIONS  Structure with options for parsing, see PR_DOWNLOAD
%            OPTIONS may also be a cell array with options, one for every
%            data file. Common fields may be defined in an additional
%            (last) element of the cell array.
%
% DESCRIPTION
% This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
% and converts them into PRTools datasets. The downloaded files are stored
% as .dat-files, the PRTools datasets as .mat-files in the subdirectory
% 'data' of the directory of the calling routine (or of the current
% directory for calls from the command line). The name of the calling
% routine is used as file name. Various annotations are stored in the
% user-field of the PRTools datasets.
%
% This routine also accepts an old, undocumented format.
%
% EXAMPLE
% opt.nheadlines = 5;
% [a,b] = pr_download_uci('Image+Segmentation', ...
%         {'segmentation.data','segmentation.test'},opt);
%
% SEE ALSO <a href="http://37steps.com/prtools">PRTools Guide</a>

% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
31 |
|
---|
function varargout = pr_download_uci(name,varargin)
% Main entry: download one or more data files of the UCI data set NAME,
% convert each of them into a PRTools dataset and return them. In case of
% multiple files and a single requested output the vertically
% concatenated dataset is returned.

%% handle old format
% more than three inputs, or a character third input, selects the old,
% undocumented calling convention
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

if iscell(opt)
  if numel(opt) == nfiles+1
    % multiple structures to combine, opt{end} holds the common fields.
    % Merge field by field: fields of the file specific structure override
    % the common ones. (Building the structure by struct(M{:}) fails for
    % fields occurring in both structures and for cell-valued fields.)
    common = opt{end};
    if ~isstruct(common)
      common = struct;
    end
    for n=1:nfiles
      merged = common;
      if isstruct(opt{n})
        fnames = fieldnames(opt{n});
        for k=1:numel(fnames)
          merged.(fnames{k}) = opt{n}.(fnames{k});
        end
      end
      opt{n} = merged;
    end
  elseif numel(opt) ~= nfiles
    error('Number op option-structures is wrong')
  end
else
  % a single option set, replicate it for all files plus a common copy
  opt = repmat({opt},1,nfiles+1);
end
% Now opt{n} should correspond to ucinames{n}

% Decide now whether the combined dataset has to be saved: the matfile
% field of the individual option sets is overwritten in the loop below.
savecombined = ~isfield(opt{end},'matfile') || opt{end}.matfile;

%% where to store: names of data file, mat file and directory
comname = callername;   % filenames
datadir = callerdir;    % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout = cell(1,nfiles);

%% if matfiles available, use them
[varargout{:}] = loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% get UCI info
data = parselink(name);
url  = data.url;

%% Handle for all data files
for j=1:nfiles
  uciname = ucinames{j};
  if strncmp(uciname,'http',4)   % safe for names shorter than 4 chars
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if nfiles > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  opt{j}.dsetname = dataname;
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  % the mat-file is saved below, after labels and annotation are set
  opt{j}.matfile = false;
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user');    % store dataset info
  a = setname(a,dataname);       % set dataset name
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature for the labels
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user');    % store dataset info
  a = setname(a,comname);        % set dataset name
  if savecombined
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1 % just combined set is requested
    varargout{1} = a;
  end
end
132 |
|
---|
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Old calling convention with positional arguments:
% (NAME,UCINAMES,FORM,PRNAME,SIZ,NHEAD,MISVALCHAR,DELCHAR,CFEAT,NOSAVE)
% One dataset is returned for every entry of UCINAMES.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
  setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

% convert to cell first, such that numel(ucinames) is the number of files
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);
nhead = setdefaults({nhead},zeros(1,nfiles)); % headerlines to be skipped

if isempty(cfeat)
  cfeat = repmat({[]},1,nfiles);
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,nfiles);
end

prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,nfiles);
end

varargout = cell(1,nfiles);
anynew    = false;
filenames = cell(1,nfiles);
data      = [];   % UCI info, retrieved when the first download is needed
for j=1:nfiles
  uciname = ucinames{j};
  if nfiles > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(thisdir,dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it
    s = load([filenames{j} '.mat']);
    f = fieldnames(s);
    a = getfield(s,f{1});
  else
    if isempty(data)
      % get UCI info. An explicit flag is used instead of exist('data'),
      % as the latter is also true for any file or folder named 'data'.
      data = parselink(name);
      if ~data.misval % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    if strncmp(uciname,'http',4)   % safe for names shorter than 4 chars
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are stored in a separate file, download it
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user');      % store dataset info
    a = setname(a,dataname);         % set dataset name
    save([filenames{j} '.mat'],'a'); % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = dset_align(varargout{:});
  for j=1:nfiles
    a = varargout{j};
    if ~nosave
      save(filenames{j},'a');
    end
  end
end
227 |
|
---|
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% data.link   : url of the particular data set pages
% data.info   : url of the data set info page
% data.url    : url of the data set data files (excluding the filename, as
%               there might be more files and their names are irregular)
% data.desc   : the problem info as given in the abstract ('' if the page
%               has no abstract)
% data.misval : true/false for missing values, [] if unknown
% data.type   : feature types (categorical / integer / real)
%
% The page is parsed by fixed character offsets relative to some keywords.
% In case of multiple keyword matches the first one is used and all index
% ranges are clipped to the page length.

base = 'http://archive.ics.uci.edu/ml/';
link = [base 'datasets/' link];
desc = urlread(link);
npage = numel(desc);

% urls of the data folder and the info page, found in the quoted strings
% following the first occurrence of 'Download'
k = strfind(desc,'Download');
if isempty(k)
  error('No download section found on UCI page: %s',link);
end
s = desc(k(1):min(k(1)+250,npage));
q = strfind(s,'"');
if numel(q) < 6
  error('Unexpected layout of UCI page: %s',link);
end
url  = [base s(q(1)+4:q(2)-1)];
info = [base s(q(5)+3:q(6)-1)];

% abstract: text between 'Abstract</b>' and the next '</p>'
descr = '';
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% feature types: comma separated list following the table header
type = '';
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(k)
  seg = desc(k(1)+64:min(k(1)+150,npage));
  k = strfind(seg,'</p');
  if ~isempty(k) && k(1) > 1
    parts = textscan(seg(1:k(1)-1),'%s','delimiter',',');
    type = char(parts{1});
  end
end

% missing values: 'Yes' or 'No' at a fixed offset after the keyword
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  answer = desc(k1(1)+53:min(k1(1)+55,npage));
  if strncmp(answer,'No',2)
    misval = false;
  elseif strncmp(answer,'Yes',3)
    misval = true;
  end
end

data.link   = link;
data.info   = info;
data.url    = url;
data.desc   = descr;
data.misval = misval;
data.type   = type;
273 |
|
---|
274 |
|
---|
function name = callername(n)
%% Name of the function N+2 levels above this one in the call stack
% The first entry of the stack is this function itself, the second is the
% function that called it. For N = 0 (default) the name of the caller of
% that function is returned. In case the stack is not that deep an empty
% name is returned.
if nargin < 1
  n = 0;
end
stack = dbstack;
level = n+3;
if numel(stack) >= level
  name = stack(level).name;
else
  name = [];
end
284 |
|
---|
function dirname = callerdir(n)
%% Directory of the function N+2 levels above this one in the call stack
% The first entry of the stack is this function itself, the second is the
% function that called it. For N = 0 (default) the directory of the caller
% of that function is returned. In case the stack is not that deep (e.g.
% for a call from the command line) the current directory is returned.
if nargin < 1, n=0; end
ss = dbstack;
if length(ss) < n+3
  % no caller at the requested level, commandline call
  % (test on n+3, consistent with callername and the index used below)
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end