[134] | 1 | %PR_DOWNLOAD_UCI Load UCI data and convert to PRTools format
|
---|
| 2 | %
|
---|
| 3 | % [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
|
---|
| 4 | %
|
---|
| 5 | % INPUT
|
---|
| 6 | % UCIDIR Name of desired UCI repository directory
|
---|
| 7 | % DNAMS Character cell array of UCI data file names to be downloaded
|
---|
| 8 | % or some full urls.
|
---|
| 9 | % OPTIONS Structure with options for parsing, see PR_DOWNLOAD
|
---|
| 10 | % OPTIONS may also be a cell array with options, one for every
|
---|
| 11 | % data file. Common fields may be defined in an additional
|
---|
| 12 | % element of the cell array.
|
---|
| 13 | %
|
---|
| 14 | % DESCRIPTION
|
---|
| 15 | % This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
|
---|
| 16 | % and converts them into PRTools datasets. The downloaded files are stored
|
---|
| 17 | % as .dat-files, the PRTools datasets as .mat-files in the directory of
|
---|
| 18 | % this routine. The file names of the calling routine are used. Various
|
---|
| 19 | % annotations are stored in the user-field of the PRTools datasets.
|
---|
| 20 | %
|
---|
| 21 | % This routine also accepts an old, undocumented format.
|
---|
| 22 | %
|
---|
| 23 | % EXAMPLE
|
---|
| 24 | % opt.nheadlines = 5;
|
---|
| 25 | % [a,b] = pr_download_uci('Image+Segmentation', ...
|
---|
| 26 | % {'segmentation.data','segmentation.test'},opt);
|
---|
| 27 | %
|
---|
| 28 | % SEE ALSO <a href="http://37steps.com/prtools">PRTools Guide</a>
|
---|
| 29 |
|
---|
| 30 | % Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
|
---|
| 31 |
|
---|
function varargout = pr_download_uci(name,varargin)
% Download one or more data files of the UCI data set NAME, convert each
% of them into a dataset and return them (one output per data file). With
% several data files and a single requested output, the vertical
% concatenation of all of them is returned instead.
%
% NAME       UCI repository directory name, passed to PARSELINK.
% VARARGIN   {DNAMS, OPTIONS} as described in the help text of this file,
%            or the arguments of the old call format.

%% handle old format
% More than three arguments, or a character third argument, is treated as
% the old call format and delegated to PR_DOWNLOAD_UCI_OLD.
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end

if iscell(opt)
  if numel(opt) == numel(ucinames)+1
    % One option set per data file plus a trailing set of common fields.
    % Start from the common set and let the file specific fields override
    % it. Copying field by field (instead of rebuilding the structure with
    % STRUCT) allows a field to occur in both sets, keeps cell-valued
    % fields intact and tolerates empty ([]) entries.
    for n=1:numel(opt)-1
      merged = opt{end};
      if isstruct(opt{n})
        fnames = fieldnames(opt{n});
        for i=1:numel(fnames)
          merged.(fnames{i}) = opt{n}.(fnames{i});
        end
      end
      opt{n} = merged;
    end
  elseif numel(opt) ~= numel(ucinames)
    error('Number op option-structures is wrong')
  end
else
  % a single option set (or none): use it for all files and as common set
  opt = repmat({opt},1,numel(ucinames)+1);
end
% Now opt{n} should correspond to ucinames{n}

%% where to store: names of data file, mat file and directory
% CALLERNAME and CALLERDIR inspect the call stack at a fixed depth, so
% they have to be called directly from this function.
comname = callername;     % filenames
datadir = callerdir;      % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout = cell(1,numel(ucinames));
% might be too large, will be corrected

%% if matfiles available, use them
[varargout{:}] = pr_loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% get UCI info
data = parselink(name);
url  = data.url;          % base url of the data files

%% Handle for all data files
for j=1:numel(ucinames)
  uciname = ucinames{j};
  % STRNCMP is false (instead of raising an index error) for names
  % shorter than four characters
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if numel(ucinames) > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  % Saving is done here, after the labels are set, so the download
  % routine is told not to write a mat-file itself.
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile = false;
  % NOTE(review): this overrides any user supplied value; the field name
  % is spelled as in the original, presumably as PR_DOWNLOAD expects it.
  opt{j}.delimeter= ',';
  opt{j} = fielddef(opt{j},'dsetname',callername);
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user');    % store dataset info
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if numel(ucinames) > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user');    % store dataset info
  % NOTE(review): if OPTIONS was a cell array with exactly one entry per
  % data file, opt{end} is the set of the last file, whose matfile field
  % was forced to false above, so the combined set is not saved then.
  opt{end} = fielddef(opt{end},'dsetname',callername);
  if ~isfield(opt{end},'matfile') || opt{end}.matfile
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1 % just combined set is requested
    varargout{1} = a;
  end
end
|
---|
| 135 |
|
---|
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Old call format:
%   PR_DOWNLOAD_UCI(NAME,UCINAMES,FORM,PRNAME,SIZ,NHEAD,MISVALCHAR, ...
%                   DELCHAR,CFEAT,NOSAVE)
% Every data file is loaded from its mat-file if that exists in the
% 'data' sub-directory of this routine, otherwise it is downloaded,
% labeled, annotated and saved there. One output per data file.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
  setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

% headerlines to be skipped; sized after UCINAMES has been wrapped into a
% cell so that a single character name counts as one file
nhead = setdefaults({nhead},zeros(1,nfiles));

% CFEAT: one entry per data file
if isempty(cfeat)
  cfeat = repmat({[]},1,nfiles);
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,nfiles);
end

% CALLERNAME inspects the call stack at a fixed depth: it has to be
% called directly from this function, one level deeper than from the
% main routine.
prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,nfiles);
end

varargout = cell(1,nfiles);
anynew    = false;            % true as soon as a file had to be downloaded
filenames = cell(1,nfiles);
for j=1:nfiles
  uciname = ucinames{j};
  if nfiles > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(fullfile(thisdir,'data'),dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it: take its first variable
    s = load([filenames{j} '.mat']);
    f = fieldnames(s);
    a = s.(f{1});
  else
    if ~exist('data','var')
      % get UCI info, once for all files
      data = parselink(name);
      if ~data.misval  % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % STRNCMP is false (instead of raising an index error) for names
    % shorter than four characters
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are given in a separate file of the same UCI directory
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user');      % store dataset info
    a = setname(a,dataname);         % set dataset name
    % NOTE(review): this save does not depend on NOSAVE, unlike the save
    % after alignment below - confirm that this is intended.
    save([filenames{j} '.mat'],'a'); % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  for j=1:nfiles
    a = varargout{j};
    if ~nosave
      save(filenames{j},'a');
    end
  end
end
|
---|
| 230 |
|
---|
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% LINK is the name of the data set page, relative to
% http://archive.ics.uci.edu/ml/datasets/
%
% data.link   : url of the particular data set page
% data.info   : url of the data set info page
% data.url    : url of the data set data files (excluding the filename, as
%               there might be more files and their names are irregular)
% data.desc   : the problem info as given in the abstract, '' if the page
%               has no abstract
% data.misval : true/false for missing values, [] if unknown
% data.type   : feature types (categorical / integer / real)
%
% The page is parsed by fixed character offsets relative to a few marker
% strings. An error is raised if the download section cannot be found.

link = ['http://archive.ics.uci.edu/ml/datasets/' link];
desc = urlread(link);
npage = numel(desc);

% urls of the data directory and the info page: taken from the quoted
% strings in the first 250 characters following the first 'Download'
k = strfind(desc,'Download');
if isempty(k)
  error('No download section found on UCI page: %s',link);
end
s = desc(k(1):min(k(1)+250,npage));
q = strfind(s,'"');
if numel(q) < 6
  error('Unexpected layout of the download section on UCI page: %s',link);
end
url  = ['http://archive.ics.uci.edu/ml/' s(q(1)+4:q(2)-1)];
info = ['http://archive.ics.uci.edu/ml/' s(q(5)+3:q(6)-1)];

% abstract: text between the 'Abstract</b>' marker and the next '</p>'
descr = '';               % default, used if the page has no abstract
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% feature types: comma separated list, up to the first '</p' in a fixed
% window behind the marker
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if isempty(k)
  type = '';
else
  type = desc(k(1)+64:min(k(1)+150,npage));
end
k = strfind(type,'</p');
if isempty(k)
  type = '';
else
  type = type(1:k(1)-1);
end
type = textscan(type,'%s','delimiter',',');
type = char(type{1});

% missing values: 'No' or 'Yes' at a fixed offset behind the marker
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  flag = desc(k1(1)+53:min(k1(1)+55,npage));
  if strncmp(flag,'No',2)
    misval = false;
  elseif strncmp(flag,'Yes',3)
    misval = true;
  end
end

data.link   = link;
data.info   = info;
data.url    = url;
data.desc   = descr;
data.misval = misval;
data.type   = type;
|
---|
| 276 |
|
---|
function s = fielddef(s,field,x)
% Give S.(FIELD) the default value X. An already existing field, whatever
% its value, is left untouched.
if isfield(s,field)
  return
end
s.(field) = x;
|
---|
[134] | 281 |
|
---|
function name = callername(n)
% Name found on the call stack N+2 levels above this function: for N = 0
% (the default) that is the caller of the function calling CALLERNAME.
% Returns [] if the call stack is not that deep.
if nargin < 1
  n = 0;
end
stack = dbstack;          % stack(1) is CALLERNAME itself
level = n+3;
name  = [];
if length(stack) >= level
  name = stack(level).name;
end
|
---|
| 291 |
|
---|
function dirname = callerdir(n)
% Directory of the function found on the call stack N+2 levels above this
% function: for N = 0 (the default) that is the caller of the function
% calling CALLERDIR. The directory is located by WHICH on the name given
% by DBSTACK. Returns the present working directory if the call stack is
% not that deep.
if nargin < 1, n=0; end
ss = dbstack;             % ss(1) is CALLERDIR itself
if length(ss) < n+3
  % no caller at the requested level (e.g. commandline call). The test
  % uses n+3, the index used below, so N > 0 cannot index out of range.
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end
|
---|