[134] | 1 | %PR_DOWNLOAD_UCI Load UCI data and convert to PRTools format
|
---|
| 2 | %
|
---|
| 3 | % [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
|
---|
| 4 | %
|
---|
| 5 | % INPUT
|
---|
| 6 | % UCIDIR Name of desired UCI repository directory
|
---|
| 7 | % DNAMS Character cell array of UCI data files names to be downloaded
|
---|
| 8 | % or some full urls.
|
---|
| 9 | % OPTIONS Structure with options for parsing, see PR_DOWNLOAD
|
---|
| 10 | % OPTIONS may also be a cell array with options, one for every
|
---|
| 11 | % data file. Common fields may be defined in an additional
|
---|
| 12 | % element of the cell array.
|
---|
| 13 | %
|
---|
| 14 | % DESCRIPTION
|
---|
| 15 | % This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
|
---|
| 16 | % and converts them into PRTools datasets. The downloaded files are stored
|
---|
| 17 | % as .dat-files, the PRTools datasets as .mat-files in the directory of
|
---|
| 18 | % this routine. The file names of the calling routine are used. Various
|
---|
| 19 | % annotations are stored in the user-field of the PRTools datasets.
|
---|
| 20 | %
|
---|
| 21 | % This routine also accepts an old, undocumented format.
|
---|
| 22 | %
|
---|
| 23 | % EXAMPLE
|
---|
| 24 | % opt.nheadlines = 5;
|
---|
| 25 | % [a,b] = pr_download_uci('Image+Segmentation', ...
|
---|
| 26 | % {'segmentation.data','segmentation.test'},opt);
|
---|
| 27 | %
|
---|
[150] | 28 | % SEE ALSO <a href="http://prtools.tudelft.nl/prtools/">PRTools Guide</a>
|
---|
[134] | 29 |
|
---|
[150] | 30 | % Copyright: R.P.W. Duin
|
---|
[134] | 31 |
|
---|
function varargout = pr_download_uci(name,varargin)
% Main entry point. The user documentation is the help block at the top
% of this file.
%
%   NAME      name of the UCI repository directory (passed to parselink)
%   VARARGIN  {DNAMS, OPTIONS} in the current format; anything with more
%             than three inputs, or a char third input, is forwarded to
%             pr_download_uci_old.
%
% One dataset per requested data file is returned. If several files are
% requested and exactly one output is asked for, the vertically
% concatenated dataset is returned instead.
%
% NOTE: callername and callerdir inspect the call stack at a fixed depth,
% so they must be called directly from this function body (not from a
% helper), otherwise they would report the wrong caller.

%% make sure there is a data subdir next to this routine
persistent DATADIREXISTS
if isempty(DATADIREXISTS)
  datasubdir = fullfile(fileparts(which(mfilename)),'data');
  if exist(datasubdir,'dir') ~= 7
    mkdir(datasubdir);
  end
  DATADIREXISTS = true;   % only check once per Matlab session
end

%% handle old format
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

if iscell(opt)
  if numel(opt) == nfiles+1
    % multiple structures to combine, opt{end} is common
    for n=1:nfiles
      % copy the common fields into the other structures
      M = [fieldnames(opt{end})' fieldnames(opt{n})'; ...
           struct2cell(opt{end})' struct2cell(opt{n})'];
      opt{n} = struct(M{:});
    end
  elseif numel(opt) ~= nfiles
    error('Number of option-structures is wrong')
  end
else
  opt = repmat({opt},1,nfiles+1);
end
% Now opt{n} should correspond to ucinames{n}

% Decide now whether the combined dataset has to be saved. The loop below
% forces opt{j}.matfile to false for every data file; when no separate
% common element was supplied, opt{end} is the last data file's options
% and the user's setting would otherwise be lost before it is read.
savecombined = ~isfield(opt{end},'matfile') || opt{end}.matfile;

%% where to store: names of data file, mat file and directory
comname = callername;   % filenames
datadir = callerdir;    % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout = cell(1,nfiles);
% might be too large, will be corrected

%% if matfiles available, use them
[varargout{:}] = pr_loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% get UCI info
data = parselink(name);
url  = data.url;

%% Handle for all data files
for j=1:nfiles
  uciname = ucinames{j};
  % strncmp is safe for names shorter than four characters, whereas
  % indexing uciname(1:4) would raise an index error
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if nfiles > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  % saving of the mat-file is done here, after labeling, so it is
  % switched off in the options handed to pr_download
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile = false;
  opt{j}.delimeter= ',';
  opt{j} = fielddef(opt{j},'dsetname',callername);
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user');   % store dataset info
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature as label
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user');   % store dataset info
  if savecombined
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1   % just combined set is requested
    varargout{1} = a;
  end
end
| 143 |
|
---|
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Handles the old, positional calling format:
%   pr_download_uci(NAME,UCINAMES,FORM,PRNAME,SIZ,NHEAD,MISVALCHAR,
%                   DELCHAR,CFEAT,NOSAVE)
% Returns one dataset per entry of UCINAMES. Datasets are cached as
% mat-files in the 'data' subdirectory of this routine and reused when
% present.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
  setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

% Wrap a single file name first, so that every per-file default below is
% sized by the number of files and not by the length of a char array.
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

nhead = setdefaults({nhead},zeros(1,nfiles));   % headerlines to be skipped

if isempty(cfeat)
  cfeat = repmat({[]},1,nfiles);
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,nfiles);
end

% dataset name: given name, else name of the caller, else the UCI name
prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,nfiles);
end

varargout = cell(1,nfiles);
anynew    = false;            % true as soon as one file had to be downloaded
filenames = cell(1,nfiles);
for j=1:nfiles
  uciname = ucinames{j};
  if nfiles > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(fullfile(thisdir,'data'),dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it
    a = file2dset([filenames{j} '.mat']);
  else
    if ~exist('data','var')
      % get UCI info (only once, for the first file that needs it)
      data = parselink(name);
      if ~data.misval   % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % strncmp is safe for names shorter than four characters, whereas
    % indexing uciname(1:4) would raise an index error
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are stored in a separate file of the same UCI directory
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j})   % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user');        % store dataset info
    a = setname(a,dataname);           % set dataset name
    save([filenames{j} '.mat'],'a');   % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  for j=1:nfiles
    a = varargout{j};
    if ~nosave
      save(filenames{j},'a');
    end
  end
end
| 239 |
|
---|
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% Reads the data set page for LINK and extracts, by fixed text offsets:
% data.link   : url of the particular data set pages
% data.info   : url of the data set info page
% data.url    : url of the data set data files (excluding the filename, as
%               there might be more files and their names are irregular)
% data.desc   : the problem info as given in the abstract ('' if the page
%               has no abstract)
% data.misval : true/false for missing values, [] if it cannot be found
% data.type   : feature types (categorical / integer / real)
%
% An error is raised if the page does not contain the download links.

base = 'http://archive.ics.uci.edu/ml/';
link = [base 'datasets/' link];
desc = urlread(link);
npage = numel(desc);

% The download and info links are the quoted strings following the first
% occurrence of 'Download'.
k = strfind(desc,'Download');
if isempty(k)
  error('pr_download_uci:parselink', ...
    'No download link found on %s',link);
end
s = desc(k(1):min(k(1)+250,npage));   % clamp: page may end early
q = strfind(s,'"');
if numel(q) < 6
  error('pr_download_uci:parselink', ...
    'Unexpected page layout on %s',link);
end
url  = [base s(q(1)+4:q(2)-1)];
info = [base s(q(5)+3:q(6)-1)];

% Abstract: text between 'Abstract</b>' and the next '</p>'. Default to
% empty so that data.desc is always defined.
descr = '';
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% Feature types: comma separated list at a fixed offset after the marker.
type = '';
kt = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(kt)
  type = desc(kt(1)+64:min(kt(1)+150,npage));
  ke = strfind(type,'</p');
  if isempty(ke)
    type = '';
  else
    type = type(1:ke(1)-1);
  end
  if ~isempty(type)
    type = textscan(type,'%s','delimiter',',');
    type = char(type{1});
  end
end

% Missing values: 'No' / 'Yes' at a fixed offset after the marker.
misval = [];
km = strfind(desc,'Missing Values');
if ~isempty(km)
  km = km(1);
  if km+54 <= npage && strcmp(desc(km+53:km+54),'No')
    misval = false;
  elseif km+55 <= npage && strcmp(desc(km+53:km+55),'Yes')
    misval = true;
  end
end

data.link   = link;
data.info   = info;
data.url    = url;
data.desc   = descr;
data.misval = misval;
data.type   = type;
| 285 |
|
---|
function s = fielddef(s,field,x)
% Set a default: give S the field FIELD with value X, unless S already
% has a field of that name, in which case S is returned untouched.
if isfield(s,field)
  return
end
s.(field) = x;
[134] | 290 |
|
---|
function name = callername(n)
%% Name of the function found N+3 levels up the call stack
% Level 1 of the stack is this function itself. N defaults to 0.
% Returns [] when the stack is not that deep.
if nargin < 1
  n = 0;
end
stack = dbstack;
level = n+3;
if length(stack) >= level
  name = stack(level).name;
else
  name = [];
end
| 300 |
|
---|
function dirname = callerdir(n)
%% Directory of the function found N+3 levels up the call stack
% Level 1 of the stack is this function itself. N defaults to 0.
% Returns the current directory (pwd) when the stack is not that deep.
if nargin < 1, n = 0; end
ss = dbstack;
% The guard has to use the same depth as the index below; a fixed depth
% of 3 would allow an out-of-range index for n > 0.
if length(ss) < n+3
  % no caller, commandline call
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end