source: prdatasets/pr_download_uci.m @ 145

Last change on this file since 145 was 142, checked in by bduin, 5 years ago

Updated collection of datasets

File size: 8.7 KB
Line 
1%PR_DOWNLOAD_UCI  Load UCI data and convert to PRTools format
2%
3%   [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
4%
5% INPUT
6%   UCIDIR   Name of desired UCI repository directory
7%   DNAMS    Character cell array of UCI data file names to be downloaded
8%            or some full urls.
9%   OPTIONS  Structure with options for parsing, see PR_DOWNLOAD
10%            OPTIONS may also be a cell array with options, one for every
11%            data file. Common fields may be defined in an additional
12%            element of the cell array.
13%
14% DESCRIPTION
15% This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
16% and converts them into PRTools datasets. The downloaded files are stored
17% as .dat-files, the PRTools datasets as .mat-files in the directory of
18% this routine. The file names of the calling routine are used. Various
19% annotations are stored in the user-field of the PRTools datasets.
20%
21% This routine also accepts an old, undocumented format.
22%
23% EXAMPLE
24% opt.nheadlines = 5;
25% [a,b] = pr_download_uci('Image+Segmentation', ...
26%         {'segmentation.data','segmentation.test'},opt);
27%
28% SEE ALSO <a href="http://37steps.com/prtools">PRTools Guide</a>
29
30% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
31
function varargout = pr_download_uci(name,varargin)
%%
% Entry point. NAME is the UCI repository directory, VARARGIN holds the
% data file names and the parse options (see the help text of this file).
% One output is produced per data file; if several files are given and
% only one output is requested, the vertically concatenated set is returned.

%% handle old format
% More than three inputs, or a character third input, identifies the old
% positional calling convention; delegate it completely.
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

% Normalise opt into a cell array of nfiles+1 option sets:
% opt{1:nfiles} per data file, opt{end} the common set.
if iscell(opt)
  if numel(opt) == nfiles+1
    % common fields in opt{end} are copied into every file specific set;
    % file specific fields take precedence over common ones
    for n=1:nfiles
      opt{n} = uci_merge_opts(opt{end},opt{n});
    end
  elseif numel(opt) == nfiles
    % no common set given: append an empty one, so opt{end} is never the
    % (modified) option set of the last data file
    opt{end+1} = [];
  else
    error('Number of option-structures is wrong')
  end
else
  opt = repmat({opt},1,nfiles+1);
end
% Now opt{n} corresponds to ucinames{n}, opt{end} holds common options

%% where to store: names of data file, mat file and directory
% callername / callerdir inspect the call stack relative to THIS function,
% so they have to be called from here and not from a helper.
caller  = callername; % name of calling routine, empty for command line
comname = caller;     % filenames
datadir = callerdir;  % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout  = cell(1,nfiles);
% might be more than nargout, surplus outputs are not returned

%% if matfiles available, use them
[varargout{:}] = pr_loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% get UCI info
data = parselink(name);
url = data.url;

%% Handle for all data files
for j=1:nfiles
  uciname = ucinames{j};
  % strncmp is safe for names shorter than four characters
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if nfiles > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  % remember whether a mat-file is wanted, but save it here (after the
  % labels are set) instead of inside pr_download
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile  = false;
  opt{j}.delimeter= ','; % NOTE(review): overrules a user supplied delimeter
  opt{j} = fielddef(opt{j},'dsetname',caller);
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user'); % store dataset info
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature as label
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user'); % store dataset info
  if ~isfield(opt{end},'matfile') || opt{end}.matfile
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1 % just combined set is requested
    varargout{1} = a;
  end
end

function merged = uci_merge_opts(common,specific)
%% Overlay the fields of SPECIFIC on the fields of COMMON.
% Both inputs may be empty. Fields present in both take the value of
% SPECIFIC. Field values are copied as they are (a cell valued field is
% not expanded into a structure array).
merged = common;
if isempty(merged)
  merged = struct();
end
if isempty(specific)
  return
end
names = fieldnames(specific);
for k=1:numel(names)
  merged.(names{k}) = specific.(names{k});
end
135
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Positional inputs after NAME: data file names, format, PRTools name,
% sizes, number of header lines, missing value character, delimiter,
% label feature(s) or label file name(s), and the nosave flag.
% One output is produced per data file.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
          setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

% wrap a single file name first, so all per-file defaults below are sized
% by the number of files and not by the length of the name
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

nhead = setdefaults({nhead},zeros(1,nfiles)); % headerlines to be skipped
if isscalar(nhead)
  % a single value holds for all files
  nhead = repmat(nhead,1,nfiles);
end

if isempty(cfeat)
  cfeat = repmat({[]},1,nfiles);
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,nfiles);
end

% dataset name: given name, else name of the routine that called
% pr_download_uci, else the lower case UCI name
prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,nfiles);
elseif isscalar(siz)
  % a single value holds for all files
  siz = repmat(siz,1,nfiles);
end

varargout = cell(1,nfiles);
anynew    = false;
filenames = cell(1,nfiles);
for j=1:nfiles
  uciname = ucinames{j};
  if nfiles > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(fullfile(thisdir,'data'),dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it (first variable stored in it)
    s = load([filenames{j} '.mat']);
    f = fieldnames(s);
    a = s.(f{1});
  else
    if ~exist('data','var')
      % get UCI info, once for all files
      data = parselink(name);
      if ~data.misval % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % strncmp is safe for names shorter than four characters
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are to be read from a separate file
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user'); % store dataset info
    a = setname(a,dataname);    % set dataset name
    if ~nosave
      % honour nosave here as well, not only after alignment
      save([filenames{j} '.mat'],'a'); % save it
    end
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  if ~nosave
    % store the aligned versions
    for j=1:nfiles
      a = varargout{j};
      save(filenames{j},'a');
    end
  end
end
230
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% data.link : url of the particular data set page
% data.info : url of the data set info page
% data.url  : url of the data set data files (excluding the filename, as
%              there might be more files and their names are irregular)
% data.desc : the problem info as given in the abstract, empty if the
%              page has no abstract
% data.misval : true/false for missing values, empty if unknown
% data.type : feature types (categorical / integer / real)
%
% The page is scanned for fixed markers and fixed character offsets
% relative to them. For every marker the first occurrence is used.

base = 'http://archive.ics.uci.edu/ml/';
link = [base 'datasets/' link];
desc = urlread(link);

% data folder and info links are taken from the quoted strings that
% follow the first 'Download' marker
k = strfind(desc,'Download');
if isempty(k)
  error('No download section found on UCI page %s',link);
end
s = desc(k(1):min(k(1)+250,end));
q = strfind(s,'"');
if numel(q) < 6
  error('Download links could not be parsed from UCI page %s',link);
end
url  = [base s(q(1)+4:q(2)-1)];
info = [base s(q(5)+3:q(6)-1)];

% abstract: text between the 'Abstract</b>' marker and the next '</p>'
descr = '';
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% feature types: comma separated list ending at the next '</p'
typestr = '';
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(k)
  seg = desc(k(1)+64:min(k(1)+150,end));
  kend = strfind(seg,'</p');
  if ~isempty(kend)
    typestr = seg(1:kend(1)-1);
  end
end
type = textscan(typestr,'%s','delimiter',',');
type = char(type{1});

% missing values: 'No' or 'Yes' at a fixed offset after the marker
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  p = k1(1)+53;
  answer = desc(p:min(p+2,end));
  if strncmp(answer,'No',2)
    misval = false;
  elseif strcmp(answer,'Yes')
    misval = true;
  end
end

data.link = link;
data.info = info;
data.url  = url;
data.desc = descr;
data.misval = misval;
data.type = type;
276
function s = fielddef(s,field,x)
%% Set s.(field) to the default x, unless s already has this field.
% An existing field is left untouched, whatever its value.
if isfield(s,field)
  return
end
s.(field) = x;
281
function name = callername(n)
%% Name of a routine higher up in the call stack.
% Returns the name stored in entry n+3 of the dbstack list as seen from
% inside this function (n defaults to 0). Returns [] if the stack has
% fewer entries.
if nargin < 1
  n = 0;
end
stack = dbstack;
level = n+3;
name  = [];
if length(stack) >= level
  name = stack(level).name;
end
291
function dirname = callerdir(n)
%% Directory of a routine higher up in the call stack.
% Returns the directory of the file that WHICH finds for the name stored
% in entry n+3 of the dbstack list as seen from inside this function
% (n defaults to 0). Returns the current directory if the stack has fewer
% entries, e.g. for a call from the command line.
if nargin < 1, n=0; end
ss = dbstack;
if length(ss) < n+3
  % no such caller (the check uses the same depth as the index below,
  % so n > 0 cannot index beyond the stack)
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end
Note: See TracBrowser for help on using the repository browser.