source: prdatasets/pr_download_uci.m @ 155

Last change on this file since 155 was 153, checked in by bduin, 5 years ago
File size: 9.0 KB
Line 
%PR_DOWNLOAD_UCI  Load UCI data and convert to PRTools format
%
%   [A,B, ...] = PR_DOWNLOAD_UCI(UCIDIR,DNAMS,OPTIONS)
%
% INPUT
%   UCIDIR   Name of desired UCI repository directory
%   DNAMS    Character cell array of UCI data file names to be downloaded
%            or some full urls.
%   OPTIONS  Structure with options for parsing, see PR_DOWNLOAD
%            OPTIONS may also be a cell array with options, one for every
%            data file. Common fields may be defined in an additional
%            element of the cell array.
%
% DESCRIPTION
% This routine loads data sets from the <a href="http://archive.ics.uci.edu/ml/datasets/">UCI Machine Learning Repository</a>
% and converts them into PRTools datasets. The downloaded files are stored
% as .dat-files, the PRTools datasets as .mat-files in the directory of
% this routine. The file names of the calling routine are used. Various
% annotations are stored in the user-field of the PRTools datasets.
%
% This routine also accepts an old, undocumented format.
%
% EXAMPLE
% opt.nheadlines = 5;
% [a,b] = pr_download_uci('Image+Segmentation', ...
%         {'segmentation.data','segmentation.test'},opt);
%
% SEE ALSO <a href="http://prtools.tudelft.nl/prtools/">PRTools Guide</a>

% Copyright: R.P.W. Duin

function varargout = pr_download_uci(name,varargin)
% Entry point: download the requested UCI data files of repository
% directory NAME, convert them with PR_DOWNLOAD and return one dataset per
% file (or the combined dataset when a single output is requested).

%% make sure there is a data subdir next to this routine
persistent DATADIREXISTS
if isempty(DATADIREXISTS)
  datasubdir = fullfile(fileparts(which(mfilename)),'data');
  if exist(datasubdir,'dir') ~= 7
    mkdir(datasubdir);
  end
  DATADIREXISTS = true;
end

%% handle old format
% More than two extra arguments, or a character second extra argument,
% identifies the old positional calling convention.
if nargin > 3 || (nargin == 3 && ischar(varargin{2}))
  varargout = cell(1,nargout);
  [varargout{:}] = pr_download_uci_old(name,varargin{:});
  return
end

%% get inputs: data files (ucinames) and parse options
[ucinames,opt] = setdefaults(varargin,{[lower(name) '.data']},[]);
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

if iscell(opt)
  if numel(opt) == nfiles+1
    % One structure per file plus a trailing structure with common fields.
    % Copy field by field: building the result with struct(M{:}) fails on
    % fields present in both structures and expands cell-valued fields
    % into struct arrays. File-specific values override common ones.
    for n=1:numel(opt)-1
      merged = opt{end};
      if isstruct(opt{n})
        fnames = fieldnames(opt{n});
        for k=1:numel(fnames)
          merged.(fnames{k}) = opt{n}.(fnames{k});
        end
      end
      opt{n} = merged;
    end
  elseif numel(opt) ~= nfiles
    error('Number of option-structures is wrong')
  end
else
  % a single option set: use it for every file and as the common set
  opt = repmat({opt},1,nfiles+1);
end
% Now opt{n} corresponds to ucinames{n}

%% where to store: names of data file, mat file and directory
% callername / callerdir inspect the call stack relative to this function,
% so they have to be called from here directly and not from a helper.
comname = callername; % filenames
datadir = callerdir;  % directory
if isempty(comname)
  % call from command line
  comname = name;
  datadir = pwd;
end
datadir = fullfile(datadir,'data');
varargout = cell(1,nfiles);
% might be larger than nargout, surplus outputs are ignored

%% if matfiles available, use them
[varargout{:}] = pr_loadmatfile(comname);
if ~isempty(varargout{1}), return; end

%% make sure the storage directory exists before anything is written
% (only the data directory next to this routine is created above, the
% caller's directory may be a different one)
if exist(datadir,'dir') ~= 7
  mkdir(datadir);
end

%% get UCI info
data = parselink(name);
url = data.url;

%% Handle all data files
for j=1:nfiles
  uciname = ucinames{j};
  % strncmp is safe for names shorter than four characters
  if strncmp(uciname,'http',4)
    % full url given, use it
    data.url = uciname;
  else
    % construct url from UCI info
    data.url = [url uciname];
  end
  if nfiles > 1
    dataname = [comname '_' num2str(j)];
  else
    dataname = comname;
  end
  % remember whether a mat-file is wanted, but let this routine do the
  % saving (after labeling) instead of pr_download
  savemat = ~isfield(opt{j},'matfile') || opt{j}.matfile;
  opt{j}.matfile  = false;
  opt{j}.delimeter= ',';
  opt{j} = fielddef(opt{j},'dsetname',callername);
  a = pr_download(data.url,fullfile(datadir,dataname),opt{j});
  a = setuser(a,data,'user'); % store dataset info
  if ~isfield(opt{j},'labfeat') || isempty(opt{j}.labfeat)
    % no label feature specified: use the last feature as label
    a = feat2lab(a,size(a,2));
  end
  if savemat
    save(fullfile(datadir,dataname),'a');
  end
  varargout{j} = a;
end

%% combine them
if nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  a = vertcat(varargout{:});
  a = setuser(a,data,'user'); % store dataset info
  opt{end} = fielddef(opt{end},'dsetname',callername);
  % NOTE(review): when exactly one option set per file was supplied,
  % opt{end} is the set of the last file and its matfile field has been
  % set to false in the loop above, so the combined set is not saved in
  % that case. Confirm whether this is intended.
  if ~isfield(opt{end},'matfile') || opt{end}.matfile
    save(fullfile(datadir,comname),'a');
  end
  if nargout == 1 % just combined set is requested
    varargout{1} = a;
  end
end

function varargout = pr_download_uci_old(name,varargin)
%% take care of old definition
% Old positional interface. The extra arguments are, in order:
%   ucinames, form, prname, siz, nhead, misvalchar, delchar, cfeat, nosave
% One dataset is returned for every entry of ucinames.
[ucinames,form,prname,siz,nhead,misvalchar,delchar,cfeat,nosave] = ...
          setdefaults(varargin,{[lower(name) '.data']},[],[],[],[],'?',',',[],false);

% Wrap a single name first, so that everything below is sized by the number
% of files and not by the number of characters in a name.
if ~iscell(ucinames)
  ucinames = {ucinames};
end
nfiles = numel(ucinames);

nhead = setdefaults({nhead},zeros(1,nfiles)); % headerlines to be skipped
if isscalar(nhead)
  % one value given: use it for all files
  nhead = repmat(nhead,1,nfiles);
end

if isempty(cfeat)
  cfeat = repmat({[]},1,nfiles);
end
if ~iscell(cfeat)
  cfeat = repmat({cfeat},1,nfiles);
end

% callername inspects the call stack relative to this function, so it has
% to be called from here directly; level 1 skips pr_download_uci itself.
prname  = setdefaults({prname},callername(1));
prname  = setdefaults({prname},lower(name));
thisdir = fileparts(which(mfilename));

if isempty(siz)
  % no sizes given, make all 0
  siz = zeros(1,nfiles);
elseif isscalar(siz)
  % one value given: use it for all files
  siz = repmat(siz,1,nfiles);
end

varargout = cell(1,nfiles);
anynew    = false;
filenames = cell(1,nfiles);
data      = [];  % UCI info, retrieved only when a download is needed
for j=1:nfiles
  uciname = ucinames{j};
  if nfiles > 1
    dataname = [prname '_' num2str(j)];
  else
    dataname = prname;
  end
  filenames{j} = fullfile(fullfile(thisdir,'data'),dataname);
  if exist([filenames{j} '.mat'],'file') == 2
    % if mat-file is available, use it
    a = file2dset([filenames{j} '.mat']);
  else
    if isempty(data)
      % get UCI info
      data = parselink(name);
      if isequal(data.misval,false) % avoid checking missing values if not needed
        misvalchar = [];
      end
      url = data.url;
    end
    % strncmp is safe for names shorter than four characters
    if strncmp(uciname,'http',4)
      % full url given, use it
      data.url = uciname;
    else
      % construct url from UCI info
      data.url = [url uciname];
    end
    % do the real work
    a = pr_download(data.url,filenames{j},siz(j),nhead(j),form,misvalchar,delchar,true);
    if ischar(cfeat{j})
      % labels are stored in a separate file of the same repository
      labfile = [filenames{j} '_lab'];
      % old call to pr_download
      labs = +pr_download([url cfeat{j}],labfile,0,[],[],[],[],true);
      if isempty(labs) || size(labs,1) ~= size(a,1)
        warning(['No correct label file found: ' [url cfeat{j}]]);
      else
        delete([labfile '.dat']);
        a = setlabels(a,+labs);
      end
    else
      if isempty(cfeat{j}) % find labels and use them
        a = feat2lab(a,size(a,2));
      elseif cfeat{j} ~= 0
        a = feat2lab(a,cfeat{j});
      end
    end
    a = setuser(a,data,'user'); % store dataset info
    a = setname(a,dataname);    % set dataset name
    % NOTE(review): this save is not guarded by nosave, only the re-save
    % after alignment below is. Confirm whether that is intended.
    save([filenames{j} '.mat'],'a'); % save it
    anynew = true;
  end
  varargout{j} = a;
end

if anynew && nfiles > 1
  % multiple datasets loaded, alignment might be needed
  [varargout{:}] = pr_dset_align(varargout{:});
  if ~nosave
    % store the aligned versions
    for j=1:nfiles
      a = varargout{j};
      save(filenames{j},'a');
    end
  end
end

function data = parselink(link)
%% Parse info from a particular UCI ML data set
% INPUT
%   link        : name of the data set page, appended to the UCI base url
% OUTPUT (fields of data)
%   data.link   : url of the particular data set page
%   data.info   : url of the data set info page
%   data.url    : url of the data set data files (excluding the filename, as
%                 there might be more files and their names are irregular)
%   data.desc   : the problem info as given in the abstract, '' if no
%                 abstract is found
%   data.misval : true/false for missing values, [] if unknown
%   data.type   : feature types (categorical / integer / real)
%
% The page is scraped at fixed character offsets from marker strings. All
% windows are clipped to the page length and only the first occurrence of
% a marker is used, so an unexpected page does not cause index errors.

base = 'http://archive.ics.uci.edu/ml/';
link = [base 'datasets/' link];
desc = urlread(link);
npage = numel(desc);

% urls of the data folder and the info page follow the 'Download' marker
k = strfind(desc,'Download');
if isempty(k)
  error(['No download section found on UCI page: ' link])
end
s = desc(k(1):min(k(1)+250,npage));
q = strfind(s,'"');
if numel(q) < 6
  error(['Unexpected layout of UCI page: ' link])
end
url  = [base s(q(1)+4:q(2)-1)];
info = [base s(q(5)+3:q(6)-1)];

% abstract: text between 'Abstract</b>' (plus two characters) and the
% first '</p>' that follows it
descr = '';
k1 = strfind(desc,'Abstract</b>');
if ~isempty(k1)
  k1 = k1(1);
  k2 = strfind(desc(k1+12:end),'</p>');
  if ~isempty(k2)
    descr = desc(k1+14:k1+k2(1)+10);
  end
end

% attribute types: comma separated list, terminated by '</p'
type = '';
k = strfind(desc,'Attribute Characteristics:</b></p></td>');
if ~isempty(k)
  t = desc(k(1)+64:min(k(1)+150,npage));
  kend = strfind(t,'</p');
  if ~isempty(kend)
    t = t(1:kend(1)-1);
  else
    t = '';   % no terminator inside the window: nothing usable
  end
  if ~isempty(t)
    c = textscan(t,'%s','delimiter',',');
    type = char(c{1});
  end
end

% missing values: 'No' or 'Yes' at a fixed offset from the marker
misval = [];
k1 = strfind(desc,'Missing Values');
if ~isempty(k1)
  answer = desc(k1(1)+53:min(k1(1)+55,npage));
  if strncmp(answer,'No',2)
    misval = false;
  elseif strncmp(answer,'Yes',3)
    misval = true;
  end
end

data.link = link;
data.info = info;
data.url  = url;
data.desc = descr;
data.misval = misval;
data.type = type;

function s = fielddef(s,field,x)
%% Give S the field FIELD with value X, unless S already has that field
if isfield(s,field)
  return   % existing value is kept
end
s.(field) = x;

function name = callername(n)
%% Name of the function N levels above the caller of the routine that
% calls CALLERNAME, taken from the call stack; [] if the stack is too short
if nargin < 1
  n = 0;
end
stack = dbstack;
% stack(1) is callername itself, stack(2) the routine calling it
level = n+3;
name = [];
if length(stack) >= level
  name = stack(level).name;
end

function dirname = callerdir(n)
%% Directory of the function N levels above the caller of the routine that
% calls CALLERDIR, taken from the call stack. The current directory is
% returned if the stack has no such level (e.g. a command line call).
if nargin < 1, n=0; end
ss = dbstack;
% ss(1) is callerdir itself, ss(2) the routine calling it. The requested
% level n+3 has to exist, otherwise ss(n+3) is an index error for n > 0.
if length(ss) < n+3
  % no caller, commandline call
  dirname = pwd;
else
  dirname = fileparts(which(ss(n+3).name));
end
Note: See TracBrowser for help on using the repository browser.