source: prdatasets/pr_download_uci.m @ 134

Last change on this file since 134 was 134, checked in by bduin, 5 years ago
File size: 8.5 KB
Line 
1%PR_DOWNLOAD_UCI  Load UCI data and convert to PRTools format
2%
3%   [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
4%
5% INPUT
6%   UCIDIR   Name of desired UCI repository directory
7%   DNAMS    Character cell array of UCI data files names to be downloaded
8%            or some full urls.
9%   OPTIONS  Structure with options for parsing, see PR_DOWNLOAD
10%            OPTIONS may also be a cell array with options, one for every
11%            data file. Common fields may be defined in an additional
12%            element of the cell array.
13%
14% DESCRIPTION
15% This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
16% and converts them into PRTools datasets. The downloaded files are stored
17% as .dat-files, the PRTools datasets as .mat-files in the directory of
18% this routine. The file names of the calling routine are used. Various
19% annotations are stored in the user-field of the PRTools datasets.
20%
21% This routine also accepts an old, undocumented format.
22%
23% EXAMPLE
24% opt.nheadlines = 5;
25% [a,b] = pr_download_uci('Image+Segmentation', ...
26%         {'segmentation.data','segmentation.test'},opt);
27%
28% SEE ALSO <a href="http://37steps.com/prtools">PRTools Guide</a>
29
30% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
31
function varargout = pr_download_uci(name,varargin)
%% Download UCI data files and return them as PRTools datasets.
% NAME      name of the UCI repository directory; it is passed to PARSELINK
%           to find the base url of the data files.
% VARARGIN  {DNAMS,OPTIONS} in the current format. Calls with more than
%           three arguments, or with a character third argument, are
%           forwarded unchanged to PR_DOWNLOAD_UCI_OLD.
% VARARGOUT one dataset per data file. When several files are given and a
%           single output is requested, the vertically concatenated set is
%           returned instead.

%% handle old format
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

if iscell(opt)
  if numel(opt) == nfiles+1
    % multiple structures to combine, opt{end} holds the common fields.
    % Merge field by field: a field given for a specific file overrides the
    % common one, and cell-valued fields are kept as they are. (Building
    % the result with STRUCT(names,values,...) fails on duplicate field
    % names and expands cell values into struct arrays.)
    common = opt{end};
    for n = 1:nfiles
      merged = common;
      if isstruct(opt{n})
        fnames = fieldnames(opt{n});
        for i = 1:numel(fnames)
          merged.(fnames{i}) = opt{n}.(fnames{i});
        end
      end
      opt{n} = merged;
    end
  elseif numel(opt) ~= nfiles
    error('Number of option-structures is wrong')
  end
else
  % a single option set: use it for every file and as the common entry
  opt = repmat({opt},1,nfiles+1);
end
% Now opt{n} should correspond to ucinames{n}


%% where to store: names of data file, mat file and directory
% CALLERNAME / CALLERDIR inspect the call stack relative to this function,
% so they have to be called from here directly, not from a helper.
comname = callername; % filenames
datadir = callerdir;  % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout  = cell(1,nfiles);
% might be too large, will be corrected

%% if matfiles available, use them
[varargout{:}] = loadmatfile(comname);
if ~isempty(varargout{1}), return; end


%% get UCI info
data = parselink(name);
url = data.url;

%% Handle for all data files
for j=1:nfiles
  uciname = ucinames{j};
  % strncmp is false (instead of raising an index error) for names
  % shorter than four characters
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if nfiles > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  opt{j}.dsetname = dataname;
  % remember whether a mat-file is wanted, but let this routine write it
  % (after labels, name and user field are set) instead of pr_download
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile  = false;
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user'); % store dataset info
  a = setname(a,dataname);    % set dataset name
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature as labels
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user'); % store dataset info
  a = setname(a,comname);    % set dataset name
  % NOTE(review): when OPTIONS is a cell array with exactly one entry per
  % file, opt{end} is the entry of the last file, whose matfile field was
  % set to false in the loop above, so the combined set is not saved in
  % that case. Confirm whether this is intended.
  if ~isfield(opt{end},'matfile') || opt{end}.matfile
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1 % just combined set is requested
    varargout{1} = a;
  end
end
132
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Positional arguments after NAME, in order: data file names, format,
% PRTools name, sizes, number of header lines, missing value character
% (default '?'), delimiter (default ','), label feature(s) or label file
% name(s), and a nosave flag (default false).
% Returns one dataset per data file.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
          setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

% headerlines to be skipped; sized after UCINAMES has become a cell array,
% so the default has one entry per file rather than one per character
nhead = setdefaults({nhead},zeros(1,nfiles));

if isempty(cfeat)
  cfeat = repmat({[]},1,nfiles);
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,nfiles);
end

% CALLERNAME(1) looks one stack level further up than CALLERNAME, which
% accounts for the extra call level of this subfunction. It must be called
% from here directly.
prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,nfiles);
end

varargout = cell(1,nfiles);
anynew    = false;
filenames = cell(1,nfiles);
data      = [];   % UCI info, retrieved on first need
for j=1:nfiles
  uciname = ucinames{j};
  if nfiles > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(thisdir,dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it
    s = load([filenames{j} '.mat']);
    f = fieldnames(s);
    a = s.(f{1});
  else
    % An explicit empty initial value is tested here: exist('data') without
    % the 'var' qualifier is also nonzero for a folder or file called
    % 'data', which would skip the parsing and leave URL undefined.
    if isempty(data)
      % get UCI info
      data = parselink(name);
      if ~data.misval % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % strncmp is false (instead of raising an index error) for names
    % shorter than four characters
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are in a separate file on the same UCI directory
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user'); % store dataset info
    a = setname(a,dataname);    % set dataset name
    % NOTE(review): this save is done regardless of NOSAVE; only the save
    % after alignment below honours it. Confirm whether this is intended.
    save([filenames{j} '.mat'],'a'); % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = dset_align(varargout{:});
  for j=1:nfiles
    a = varargout{j};
    if ~nosave
      save(filenames{j},'a');
    end
  end
end
227
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% LINK is the name of the data set page; it is appended to
% http://archive.ics.uci.edu/ml/datasets/ and the page is read by URLREAD.
%
% data.link : url of the particular data set pages
% data.info : url of the data set info page
% data.url  : url of the data set data files (excluding the filename, as
%              there might be more files and their names are irregular)
% data.desc : the problem info as given in the abstract, '' if the page
%              has no abstract marker
% data.misval : true/false for missing values, [] if it cannot be found
% data.type : feature types (categorical / integer / real)
%
% The page is parsed by searching for fixed marker strings and taking text
% at fixed offsets from them. An error is raised when the download section
% cannot be located.

base = 'http://archive.ics.uci.edu/ml/';
link = [base 'datasets/' link];
desc = urlread(link);
ndesc = numel(desc);

% download section: the quoted strings following the first 'Download'
k = strfind(desc,'Download');
if isempty(k)
  error('No download section found on UCI page: %s',link);
end
s = desc(k(1):min(k(1)+250,ndesc));  % clamp the window to the page length
q = strfind(s,'"');
if numel(q) < 6
  error('Unexpected layout of download section on UCI page: %s',link);
end
url  = [base s(q(1)+4:q(2)-1)];
info = [base s(q(5)+3:q(6)-1)];

% abstract: text between 'Abstract</b>' (plus two characters) and the
% next '</p>'. Defaults to '' so that data.desc can always be assigned.
descr = '';
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% attribute characteristics: comma separated list in a fixed window after
% the marker, terminated by '</p'
type = '';
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(k)
  type = desc(min(k(1)+64,ndesc+1):min(k(1)+150,ndesc));
end
k = strfind(type,'</p');
if isempty(k)
  type = '';
else
  type = type(1:k(1)-1);
end
type = textscan(type,'%s','delimiter',',');
type = char(type{1});

% missing values: 'No' or 'Yes' at a fixed offset after the marker
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  flag = desc(min(k1(1)+53,ndesc+1):min(k1(1)+55,ndesc));
  if strncmp(flag,'No',2)
    misval = false;
  elseif strncmp(flag,'Yes',3)
    misval = true;
  end
end

data.link = link;
data.info = info;
data.url  = url;
data.desc = descr;
data.misval = misval;
data.type = type;
273
274
function name = callername(n)
%% Name of a function higher up in the call stack.
% The DBSTACK entry at position N+3 is returned (N defaults to 0). Entry 1
% is CALLERNAME itself and entry 2 the routine that called it, so N = 0
% gives the caller of that routine. NAME is [] when the stack is not deep
% enough.
if nargin < 1
  n = 0;
end
stack = dbstack;
level = n + 3;
if numel(stack) >= level
  name = stack(level).name;
else
  name = [];
end
284
function dirname = callerdir(n)
%% Directory of a function higher up in the call stack.
% The DBSTACK entry at position N+3 is located with WHICH and its folder
% is returned (N defaults to 0). Entry 1 is CALLERDIR itself and entry 2
% the routine that called it, so N = 0 gives the caller of that routine.
% The current directory is returned when the stack is not deep enough.
if nargin < 1, n=0; end
ss = dbstack;
% The bound depends on N, like in CALLERNAME: a fixed bound of 3 would
% index beyond the stack for N > 0.
if length(ss) < n+3
  % no caller at this level, e.g. commandline call
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end
Note: See TracBrowser for help on using the repository browser.