source: prdatasets/pr_download_uci.m @ 151

Last change on this file since 151 was 151, checked in by bduin, 5 years ago
File size: 8.8 KB
Line 
1%PR_DOWNLOAD_UCI  Load UCI data and convert to PRTools format
2%
3%   [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
4%
5% INPUT
6%   UCIDIR   Name of desired UCI repository directory
7%   DNAMS    Cell array with the names of the UCI data files to be
8%            downloaded. Entries may also be full urls.
9%   OPTIONS  Structure with options for parsing, see PR_DOWNLOAD
10%            OPTIONS may also be a cell array with options, one for every
11%            data file. Common fields may be defined in an additional
12%            element of the cell array.
13%
14% DESCRIPTION
15% This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
16% and converts them into PRTools datasets. The downloaded files are stored
17% as .dat-files, the PRTools datasets as .mat-files in the directory of
18% this routine. The file names of the calling routine are used. Various
19% annotations are stored in the user-field of the PRTools datasets.
20%
21% This routine also accepts an old, undocumented format.
22%
23% EXAMPLE
24% opt.nheadlines = 5;
25% [a,b] = pr_download_uci('Image+Segmentation', ...
26%         {'segmentation.data','segmentation.test'},opt);
27%
28% SEE ALSO <a href="http://prtools.tudelft.nl/prtools/">PRTools Guide</a>
29
30% Copyright: R.P.W. Duin
31
function varargout = pr_download_uci(name,varargin)
%%
% Main entry point. NAME is the UCI repository directory; VARARGIN holds
% the data file names and the parse options (new format), or the argument
% list of the old, undocumented format which is forwarded unchanged to
% PR_DOWNLOAD_UCI_OLD.

%% handle old format
% More than three inputs, or a character third input, identifies the old
% calling convention.
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
% The file name defaults to <lower(name)>.data, the options to [].
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end

if iscell(opt)
  if numel(opt) == numel(ucinames)+1
    % multiple structures to combine, opt{end} holds the common fields
    for n=1:numel(opt)-1
      % copy the common fields into the other structures; the fields of
      % opt{n} are listed after the common ones
      M = [fieldnames(opt{end})' fieldnames(opt{n})'; ...
           struct2cell(opt{end})' struct2cell(opt{n})'];
      opt{n} = struct(M{:});
    end
  elseif numel(opt) ~= numel(ucinames)
    error('Number op option-structures is wrong')
  end
else
  % a single option set: use it for every file and as the common set
  opt = repmat({opt},1,numel(ucinames)+1);
end
% Now opt{n} should correspond to ucinames{n}
% NOTE(review): if exactly numel(ucinames) option sets are supplied there
% is no separate common element, so opt{end} used below for the combined
% set is the (modified) option set of the last file - confirm intended.

%% where to store: names of data file, mat file and directory
comname = callername; % filenames
datadir = callerdir;  % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout  = cell(1,numel(ucinames));
% might be too large, will be corrected

%% if matfiles available, use them
% A non-empty first output of pr_loadmatfile is taken as success.
[varargout{:}] = pr_loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% get UCI info
data = parselink(name);
url = data.url;

%% Handle all data files
for j=1:numel(ucinames)
  uciname = ucinames{j};
  % strncmp instead of uciname(1:4): names shorter than four characters
  % must not raise an index error
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if numel(ucinames) > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  % remember whether the user wants a mat-file, then switch the option
  % off for pr_download: saving is done here, after the labels are set
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile  = false;
  opt{j}.delimeter= ',';
  opt{j} = fielddef(opt{j},'dsetname',callername);
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user'); % store dataset info
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature as labels
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if numel(ucinames) > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user'); % store dataset info
  opt{end} = fielddef(opt{end},'dsetname',callername);
  if ~isfield(opt{end},'matfile') || opt{end}.matfile
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1 % just combined set is requested
    varargout{1} = a;
  end
end
135
function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Old calling format. The optional inputs are, in this order:
% data file names, format, PRTools name, sizes, number of header lines,
% missing value character, delimiter character, label feature(s) or label
% file name(s), and a no-save flag. One dataset per data file is returned.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
          setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

if ~iscell(ucinames)
  ucinames = {ucinames};
end
% Default header line counts are determined after UCINAMES is wrapped in a
% cell, so there is one entry per file and not one per character of a
% single file name.
nhead = setdefaults({nhead},zeros(1,numel(ucinames))); % headerlines to be skipped

if isempty(cfeat)
  cfeat = repmat({[]},1,numel(ucinames));
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,numel(ucinames));
end

prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,numel(ucinames));
end

varargout = cell(1,numel(ucinames));
anynew    = false;   % becomes true as soon as a file had to be downloaded
filenames = cell(1,numel(ucinames));
for j=1:numel(ucinames)
  uciname = ucinames{j};
  if numel(ucinames) > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(fullfile(thisdir,'data'),dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it
    a = file2dset([filenames{j} '.mat']);
  else
    if ~exist('data','var')
      % get UCI info, just once for all files
      data = parselink(name);
      if ~data.misval % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % strncmp instead of uciname(1:4): names shorter than four characters
    % must not raise an index error
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are stored in a separate file in the same UCI directory
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user'); % store dataset info
    a = setname(a,dataname);    % set dataset name
    % NOTE(review): this save is not guarded by NOSAVE, only the re-save
    % after alignment below is - confirm intended.
    save([filenames{j} '.mat'],'a'); % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && numel(ucinames) > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  for j=1:numel(ucinames)
    a = varargout{j};
    if ~nosave
      save(filenames{j},'a');
    end
  end
end
231
function data = parselink(link)
%% Parse info from a particular UCI ML data set
% INPUT
%   link        : name of the data set page below
%                 http://archive.ics.uci.edu/ml/datasets/
% OUTPUT, structure with fields
%   data.link   : url of the particular data set page
%   data.info   : url of the data set info page
%   data.url    : url of the data set data files (excluding the filename,
%                 as there might be more files and their names are irregular)
%   data.desc   : the problem info as given in the abstract, '' if the
%                 page has no abstract
%   data.misval : true/false for missing values, [] if unknown
%   data.type   : feature types (categorical / integer / real)
%
% The page is scraped at fixed offsets from fixed marker strings. Only the
% first occurrence of every marker is used.

root = 'http://archive.ics.uci.edu/ml/';
link = [root 'datasets/' link];
desc = urlread(link);
npage = numel(desc);

% download section: the quoted strings following the first 'Download'
k = strfind(desc,'Download');
if isempty(k)
  error('pr_download_uci:parselink', ...
        'No download section found on %s',link);
end
s = desc(k(1):min(k(1)+250,npage));
q = strfind(s,'"');
if numel(q) < 6
  error('pr_download_uci:parselink', ...
        'Unexpected layout of the download section on %s',link);
end
url  = [root s(q(1)+4:q(2)-1)];
info = [root s(q(5)+3:q(6)-1)];

% abstract: text between 'Abstract</b>' and the next '</p>'
descr = '';   % defined even if the page has no abstract
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% feature types: comma separated list up to the next '</p'
type = '';
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(k)
  type = desc(k(1)+64:min(k(1)+150,npage));
  kend = strfind(type,'</p');
  if isempty(kend)
    type = '';
  else
    type = type(1:kend(1)-1);
  end
end
type = textscan(type,'%s','delimiter',',');
type = char(type{1});

% missing values: 'Yes' or 'No' at a fixed offset from the marker
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  answer = desc(k1(1)+53:end);   % empty if the offset is beyond the page
  if strncmp(answer,'No',2)
    misval = false;
  elseif strncmp(answer,'Yes',3)
    misval = true;
  end
end

data.link = link;
data.info = info;
data.url  = url;
data.desc = descr;
data.misval = misval;
data.type = type;
277
function s = fielddef(s,field,x)
% Give field FIELD of S the default value X. An existing field is left
% untouched.
if isfield(s,field)
  return
end
s.(field) = x;
282
function name = callername(n)
%% Name of the function found N+3 levels up the dbstack call stack
% (level 1 is this function itself). N defaults to 0. Returns [] if the
% stack is not that deep.
if nargin < 1
  n = 0;
end
stack = dbstack;
level = n+3;
if numel(stack) >= level
  name = stack(level).name;
else
  name = [];
end
292
function dirname = callerdir(n)
%% Directory of the function found N+3 levels up the dbstack call stack
% (level 1 is this function itself). N defaults to 0. Returns the current
% directory if the stack is not that deep.
if nargin < 1, n=0; end
ss = dbstack;
% compare against n+3 (as callername does) and not against 3, otherwise
% ss(n+3) is indexed out of bounds for n > 0 on a shallow stack
if length(ss) < n+3
  % no caller at this level, e.g. commandline call
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end
Note: See TracBrowser for help on using the repository browser.