source: prdatasets/pr_download_uci.m @ 141

Last change on this file since 141 was 135, checked in by bduin, 5 years ago
File size: 8.5 KB
RevLine 
[134]1%PR_DOWNLOAD_UCI  Load UCI data and convert to PRTools format
2%
3%   [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
4%
5% INPUT
6%   UCIDIR   Name of desired UCI repository directory
7%   DNAMS    Character cell array of UCI data files names to be downloaded
8%            or some full urls.
9%   OPTIONS  Structure with options for parsing, see PR_DOWNLOAD
10%            OPTIONS may also be a cell array with options, one for every
11%            data file. Common fields may be defined in an additional
12%            element of the cell array.
13%
14% DESCRIPTION
15% This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
16% and converts them into PRTools datasets. The downloaded files are stored
17% as .dat-files, the PRTools datasets as .mat-files in the directory of
18% this routine. The file names of the calling routine are used. Various
19% annotations are stored in the user-field of the PRTools datasets.
20%
21% This routine also accepts an old, undocumented format.
22%
23% EXAMPLE
24% opt.nheadlines = 5;
25% [a,b] = pr_download_uci('Image+Segmentation', ...
26%         {'segmentation.data','segmentation.test'},opt);
27%
28% SEE ALSO <a href="http://37steps.com/prtools">PRTools Guide</a>
29
30% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
31
function varargout = pr_download_uci(name,varargin)
%% Download UCI data files and convert them to PRTools datasets.
% NAME is the UCI repository directory name; VARARGIN holds the data file
% names and the parse options (or the arguments of the old calling format,
% which are forwarded to PR_DOWNLOAD_UCI_OLD).
% One dataset is returned per data file. When several files are given and
% exactly one output is requested, the vertically concatenated set is
% returned instead.

%% handle old format
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

if iscell(opt)
  if numel(opt) == nfiles+1
    % multiple structures to combine, opt{end} is common
    common = opt{end};
    for n = 1:nfiles
      % Start from the common fields and let the file-specific fields
      % override them. A field-by-field copy (instead of struct(M{:}))
      % tolerates fields present in both structures and keeps cell-valued
      % options intact.
      merged = common;
      if isstruct(opt{n})
        fnames = fieldnames(opt{n});
        for i = 1:numel(fnames)
          merged.(fnames{i}) = opt{n}.(fnames{i});
        end
      end
      opt{n} = merged;
    end
  elseif numel(opt) ~= nfiles
    error('Number op option-structures is wrong')
  end
else
  opt = repmat({opt},1,nfiles+1);
end
% Now opt{n} should correspond to ucinames{n}

% Decide now whether the combined set is to be saved: the loop below
% overwrites the matfile field of opt{1:nfiles}, which includes opt{end}
% when no separate common structure was supplied.
savecombined = ~isfield(opt{end},'matfile') || opt{end}.matfile;

%% where to store: names of data file, mat file and directory
comname = callername; % filenames
datadir = callerdir;  % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout  = cell(1,nfiles);
% might be too large, will be corrected

%% if matfiles available, use them
[varargout{:}] = loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% get UCI info
data = parselink(name);
url = data.url;

%% Handle for all data files
for j = 1:nfiles
  uciname = ucinames{j};
  % strncmp is false (instead of throwing) for names shorter than 4 chars
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if nfiles > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  opt{j}.dsetname = dataname;
  % remember the requested save policy, but let this routine do the saving
  % so that the stored dataset includes user info, name and labels
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile  = false;
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user'); % store dataset info
  a = setname(a,dataname);    % set dataset name
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature as labels
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user'); % store dataset info
  a = setname(a,comname);    % set dataset name
  if savecombined
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1 % just combined set is requested
    varargout{1} = a;
  end
end
133
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Old calling format: VARARGIN holds, in order, the data file names, the
% parse format, the PRTools file name, the data sizes, the number of header
% lines, the missing value character, the delimiter, the label feature(s)
% or label file(s), and a no-save flag. One dataset is returned per data
% file; datasets are cached as mat-files next to this routine.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
          setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

if ~iscell(ucinames)
  ucinames = {ucinames};
end
% Size the default only after ucinames is a cell array; numel of a char
% file name would count characters instead of files.
nhead = setdefaults({nhead},zeros(1,numel(ucinames))); % headerlines to be skipped

if isempty(cfeat)
  cfeat = repmat({[]},1,numel(ucinames));
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,numel(ucinames));
end

prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,numel(ucinames));
end

varargout = cell(1,numel(ucinames));
anynew    = false;
filenames = cell(1,numel(ucinames));
for j=1:numel(ucinames)
  uciname = ucinames{j};
  if numel(ucinames) > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(thisdir,dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it
    s = load([filenames{j} '.mat']);
    f = fieldnames(s);
    a = s.(f{1});
  else
    % Restrict the check to workspace variables: a plain exist('data')
    % is also true for a file or folder called 'data' on the path, which
    % would skip the parsing and leave url undefined.
    if ~exist('data','var')
      % get UCI info
      data = parselink(name);
      if ~data.misval % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % strncmp is false (instead of throwing) for names shorter than 4 chars
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are given in a separate file of the same UCI directory
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user'); % store dataset info
    a = setname(a,dataname);    % set dataset name
    % NOTE(review): this save ignores the nosave flag, unlike the save
    % after alignment below -- confirm whether that is intended.
    save([filenames{j} '.mat'],'a'); % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && numel(ucinames) > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = dset_align(varargout{:});
  for j=1:numel(ucinames)
    a = varargout{j};
    if ~nosave
      save(filenames{j},'a');
    end
  end
end
228
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% LINK is the name of the data set page below
% http://archive.ics.uci.edu/ml/datasets/. The page is read and scraped at
% fixed character offsets from a few marker strings.
% data.link : url of the particular data set pages
% data.info : url of the data set info page
% data.url  : url of the data set data files (excluding the filename, as
%              there might be more files and their names are irregular)
% data.desc : the problem info as given in the abstract ('' if not found)
% data.misval : true/false for missing values ([] if not recognised)
% data.type : feature types (categorical / integer / real)

link = ['http://archive.ics.uci.edu/ml/datasets/' link];
desc = urlread(link);
npage = numel(desc);

% The data and info links are the quoted strings following 'Download'
k = strfind(desc,'Download');
if isempty(k)
  error('pr_download_uci:parselink','No download section found on %s',link);
end
s = desc(k(1):min(k(1)+250,npage)); % clamp: page may end within 250 chars
q = strfind(s,'"');
if numel(q) < 6
  error('pr_download_uci:parselink','No download links found on %s',link);
end
url = ['http://archive.ics.uci.edu/ml/' s(q(1)+4:q(2)-1)];
info = ['http://archive.ics.uci.edu/ml/' s(q(5)+3:q(6)-1)];

% Abstract text: from just after the marker up to the closing </p>.
% Default to '' so that data.desc is defined when no abstract is found.
descr = '';
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% Attribute types: comma separated list ending at the next '</p'
type = '';
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(k)
  cand = desc(k(1)+64:min(k(1)+150,npage));
  kend = strfind(cand,'</p');
  if ~isempty(kend) && kend(1) > 1
    type = textscan(cand(1:kend(1)-1),'%s','delimiter',',');
    type = char(type{1});
  end
end

% Missing values: 'Yes' or 'No' at a fixed offset from the marker
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  answer = desc(k1(1)+53:min(k1(1)+55,npage));
  if strncmp(answer,'No',2)
    misval = false;
  elseif strncmp(answer,'Yes',3)
    misval = true;
  end
end

data.link = link;
data.info = info;
data.url  = url;
data.desc = descr;
data.misval = misval;
data.type = type;
274
275
function name = callername(n)
%% Name found at level n+3 of the call stack
% Returns the name field of entry n+3 of the stack reported by dbstack, or
% [] when the stack holds fewer than n+3 entries. N defaults to 0.
if nargin < 1
  n = 0;
end
stack = dbstack;
level = n+3;
name  = [];
if length(stack) >= level
  name = stack(level).name;
end
285
function dirname = callerdir(n)
%% Directory of the routine found at level n+3 of the call stack
% Returns the directory of the file that WHICH reports for the name in
% entry n+3 of the stack returned by dbstack. When the stack holds fewer
% than n+3 entries (e.g. a command line call) the current directory is
% returned. N defaults to 0.
if nargin < 1, n=0; end
ss = dbstack;
% Guard on n+3 (not a fixed 3) so that ss(n+3) is never out of range
if length(ss) < n+3
  % no caller at that level, e.g. commandline call
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end
Note: See TracBrowser for help on using the repository browser.