source: prdatasets/pr_download.m @ 159

Last change on this file since 159 was 155, checked in by bduin, 5 years ago
File size: 15.3 KB
Line 
1%PR_DOWNLOAD Load or download data and create dataset
2%
3%   [A,NEW] = PR_DOWNLOAD(URL,DATFILE,OPTIONS)
4%
5% INPUT
6%   URL        URL of character file to be downloaded
7%   DATFILE    Desired name of downloaded and uncompressed file
8%              Default: name of the url-file, extended by .dat
9%   OPTIONS    Structure with options used for parsing and constructing
10%              a PRTools dataset
11%
12% OUTPUT
13%   A          Dataset
14%   NEW        Logical, TRUE if a new dataset has been created, FALSE if an
15%              existing mat-file has been found and used.
16%
17% DESCRIPTION
18% This routine facilitates downloading of character based datasets. DATFILE
19% will be the name (or path with name) in which the URL is downloaded. If
20% needed the URL file is unzipped and/or untarred first. After parsing a
21% PRTools dataset is constructed, stored in a mat-file (optional) and
22% returned. The name of the mat-file is DATFILE extended by .mat.
23%
24% The directory specified in DATFILE, or if not supplied, the directory and
25% the name of the calling routine, will be used for storing files in a
26% subdirectory 'data'. If the mat-file already exists it will be loaded and
27% returned in A (no new download and parsing). If DATFILE already exists it
28% will be used (no new download).
29%
30% OPTIONS should be a structure with the below fields, to be supplied in
31% lower case. Missing fields are replaced by the given defaults.
32%
33%   SIZE       = [];    Size of data to be downloaded, in MB. Not needed,
34%                       just used to warn the user.
35%   PARSE      = TRUE;  If FALSE, parsing is skipped. Just downloading and
36%                       uncompression. A will be empty.
37%   PARSEFUN   = [];    A handle of a user supplied parsing function. This
38%                       function should operate on DATFILE (first parameter,
39%                       substituted by PR_DOWNLOAD) and return a PRTools
40%                       dataset. If PARSEFUN is not given, default parsing
41%                       using PR_READDATASET will be used.
42%   PARSEPARS  = {};    Cell array with additional parameters for PARSEFUN.
43%   FORMAT     = [];    Needed for default parsing, see PR_READDATASET.
44%   NHEADLINES = 0;     Needed for default parsing, see PR_READDATASET.
45%   MISVALCHAR = '?';   Data characters to be replaced by NaN
46%   MISVALUE   = [];    Data values to be replaced by NaN
47%   DELIMETER  = ' ';   Needed for default parsing, see PR_READDATASET.
48%   EXTENSION  = 'dat'; Extension to be used for downloaded DATFILE.
49%   MATFILE    = TRUE;  If FALSE, the dataset A will not be saved.
50%   LABFEAT    = [];    Feature found in DATFILE and to be used as class
51%                       label, see FEAT2LAB.
52%   FEATS      = [];    Columns of dataset to be used as features.
53%   FEATNAMES  = [];    Desired feature names of dataset A, see SETFEATLAB.
54%   CLASSNAMES = [];    Class names to be stored in A, see SETLABLIST.
55%   USER       = [];    Additional information to be stored in the
56%                       user-field of A, see SETUSER.
57%   LINK       = [];    Link for more information on the dataset.
58%   DESC       = [];    Short description of the dataset.
59%   DSETNAME   = [];    Desired name of the dataset A.
60%
61%
62% EXAMPLE
63%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
64%  opt.extension = 'dat'; % create iris.dat
65%  opt.labfeat   = 5;     % use feature 5 for labeling
66%  opt.matfile   = false; % don't create a mat-file
67%  c = pr_download(url,[],opt) % load Iris dataset from UCI and parse
68%
69% SEE ALSO
70% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB
71
72% Copyright: R.P.W. Duin
73
74%%
function [a,new] = pr_download(url,datname,varargin)
% Main routine. In order it:
%   1. makes sure the 'data' subdirectory next to this m-file exists,
%   2. forwards calls with a non-empty numeric third argument to the
%      legacy implementation PR_DOWNLOAD_OLD,
%   3. returns the content of an existing mat-file (NEW = false),
%   4. downloads and, if needed, decompresses URL into DATFILE,
%   5. parses DATFILE into a dataset, sets its fields and saves it
%      (NEW = true only if the mat-file is written).

%% make sure there is a data subdir
persistent DATADIREXISTS
if isempty(DATADIREXISTS)
  datasubdir = fullfile(fileparts(which(mfilename)),'data');
  if exist(datasubdir,'dir') ~= 7
    mkdir(datasubdir);
  end
  DATADIREXISTS = true;
end

%% outputs have to be defined on every return path
% (with OPTIONS.parse = false nothing else assigns A)
a = [];
new = false;

%%
if nargin >= 3
  % this can be removed when all mfiles in prdatasets call the new version
  % of pr_download_uci
  if ~isstruct(varargin{1}) && ~isempty(varargin{1}) && isnumeric(varargin{1})
    [a,new] = pr_download_old(url,datname,varargin{:});
    return
  else
    opt = varargin{1};
  end
end

if nargin < 3, opt = []; end
if nargin < 2, datname = []; end

opt = download_opt(opt);  % set defaults where necessary

%% find directory to be used
if isempty(datname)
  datname = pr_callername;
  dirname = fullfile(fileparts(which(datname)),'data');
else
  [dirname,datname] = fileparts(datname);
end

%% set all necessary filenames
[~,urlname,urlext] = fileparts(url);
if isempty(datname)
  % will only be empty if called from command line
  datname = urlname;
  dirname = pwd;
end
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.' opt.extension]; % name of datfile to be created
urlfile = fullfile(dirname,urlname);   % temp file for download
datfile = fullfile(dirname,datname);   % unpacked urlfile
matfile = fullfile(dirname,matname);   % final matfile

%% load mat-file if it exist
if exist(matfile,'file') == 2
  s = prload(matfile);
  f = fieldnames(s);
  a = s.(f{1});           % the first stored variable is the dataset
  return  % we are done!!
end

%% download the data file  if it doesn't exist
if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download(urlname,opt.size);

  if ~usejava('jvm') && isunix
    % no Java, so urlwrite is not available: use wget. The arguments are
    % quoted as paths may hold blanks and URLs shell characters (&, ;)
    stat = unix(['wget -q -O "' urlfile '" "' url '"']);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end
 
  % assume file is created, uncompress if needed
  % fileparts only returns the last extension, so a name like x.tar.gz
  % has to be recognised from the full file name
  istar = any(strcmp(urlext,{'.tar','.tgz'})) || ...
          ~isempty(regexp(urlname,'\.tar\.gz$','once'));
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip "' urlfile '" -d "' datfile '"']);
    else
      unzip(urlfile,datfile);
    end
  elseif istar
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % decompression delivers a directory named DATFILE: replace it by the
  % first entry it contains
  if exist(datfile,'dir') == 7
    entries = dir(datfile);
    % '.' and '..' are not guaranteed to be the first two entries
    entries = entries(~ismember({entries.name},{'.','..'}));
    if isempty(entries)
      error(['No files found after decompression of: ' urlname])
    end
    copyfile(fullfile(datfile,entries(1).name),[datfile 'tmp']);
    rmdir(datfile,'s');   % remove the directory with all its content
    copyfile([datfile 'tmp'],datfile);
    delete([datfile 'tmp']);
  end

  % delete compressed file
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

if ~opt.parse
  % no parsing desired, we are done (A is returned empty)
  return
end

%% datfile should now be there, read and convert to dataset 
disp('Parsing ...')
if isempty(opt.parsefun)
  a = pr_readdataset(datfile,opt.nheadlines,opt.delimeter, ...
                   opt.misvalchar,opt.format);
else
  % user defined parsing
  a = opt.parsefun(datfile,opt.parsepars{:});
end

%% set dataset fields
if ~isempty(opt.labfeat) && opt.labfeat > 0
  a = feat2lab(a,opt.labfeat);
end
if ~isempty(opt.classnames)
  a = setlablist(a,opt.classnames);
end
if ~isempty(opt.feats)
  a = a(:,opt.feats);
end
if ~isempty(opt.featnames)
  a = setfeatlab(a,opt.featnames);
end
if ~isempty(opt.misvalue)
  J = find(a==opt.misvalue);
  a(J) = NaN;
end
if ~isempty(opt.user)
  a = setuser(a,opt.user);
end
if ~isempty(opt.link)
  a = setuser(a,opt.link,'link');
end
if ~isempty(opt.desc)
  a = setuser(a,opt.desc,'desc');
end
if ~isempty(opt.dsetname)
  a = setname(a,opt.dsetname);
else
  a = setname(a,pr_callername);
end

%% save if desired
if opt.matfile
  save(matfile,'a');
  new = true;
end

return
235
236
function ask_download(urlname,datsize)
%% user controlled downloading
% Announces the download of URLNAME. If the global ASK is true (set to
% true here when it is empty) and DATSIZE, the size in MB, is given and
% larger than 1, the user is asked for permission first. Any answer other
% than an empty one or 'y' raises the error 'No dataset'.
  global ASK
  if isempty(ASK), ASK = true; end
 
  if ASK && ~isempty(datsize) && datsize > 1 % ask only if datsize has been set
    siz = ['(' num2str(datsize) ' MB)'];
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    if ~isempty(q) && ~strcmp(q,'y')
      error('No dataset')
    end
    % siz already holds the formatted '(n MB)' text, don't wrap it again
    disp(['Downloading ' urlname ' ' siz ' ....'])
  else
    disp(['Downloading ' urlname ' ....'])
  end
 
return
254
function opt = download_opt(opt_given)
%% Merge user supplied options with the defaults
% OPT_GIVEN is empty or a structure holding a subset of the fields listed
% below. The returned OPT holds all fields, with the values of OPT_GIVEN
% where supplied and the defaults elsewhere. An error is raised if
% OPT_GIVEN is non-empty and not a structure, or has unknown fields.
  opt.size       = [];
  opt.parse      = true;
  opt.parsefun   = [];
  opt.parsepars  = {};
  opt.format     = [];
  opt.nheadlines = 0;
  opt.misvalchar = '?';
  opt.misvalue   = [];
  opt.delimeter  = ' ';
  opt.extension  = 'dat';
  opt.matfile    = true;
  opt.labfeat    = [];
  opt.feats      = [];
  opt.featnames  = '';
  opt.classnames = '';
  opt.user       = [];
  opt.dsetname   = '';
  opt.link       = '';
  opt.desc       = '';

  if (~isempty(opt_given))
    fall = fieldnames(opt);
    % comma separated list of the valid names for the error messages
    valid = sprintf('%s, ',fall{:});
    valid = valid(1:end-2);
    if (~isstruct(opt_given))
      error(['OPTIONS should be a structure with at least one of the following fields: ' valid '.']);
    end
    fn = fieldnames(opt_given);
    unknown = fn(~ismember(fn,fall));
    if (~isempty(unknown))
      wrong = sprintf('%s, ',unknown{:});
      wrong = wrong(1:end-2);
      error(['Wrong field names: ' wrong '; valid field names are: ' valid])
    end
    for i = 1:length(fn)
      opt.(fn{i}) = opt_given.(fn{i});
    end
  end
 
return
298
function [a,new] = pr_download_old(url,varargin)
%% This is the old version of pr_download, to be called from the old
%  version of pr_download_uci only (inside it). It can be removed when all
%  mfiles in prdataset make the new call to  pr_download_uci
%
%PR_DOWNLOAD Load or download data and create dataset
%
%   [A,NEW] = PR_DOWNLOAD(URL,FILE,SIZE,NHEAD,FORMAT,MISVALCHAR,DELCHAR,NOSAVE)
%
% INPUT
%   URL          URL of character file to be downloaded
%   FILE         Filename to download (default: name of the url-file)
%   SIZE         Size of data to be downloaded in Mbytes (default 1)
%   NHEAD        # of headerlines to skip (default 0)
%   FORMAT       String defining the format, one character per column
%                (default, automatic)
%   MISVALCHAR   Character used for missing values (default '?')
%   DEL          Character delimiter used in the file (default ',')
%   NOSAVE       Logical, if TRUE A will not be saved, default FALSE
%
% OUTPUT
%   A            Unlabeled dataset
%   NEW          FALSE if A was loaded from an existing mat-file, TRUE
%                otherwise
%
% DESCRIPTION
% This routine facilitates downloading of character based datasets. FILE
% should be the name (or path with name) in which the URL is downloaded. If
% needed the URL file is unzipped and/or untarred first. If FILE already
% exists it is used (no downloading). The file is parsed by TEXTSCAN using
% the format given in FORMAT (see TEXTSCAN) and the delimiter specified in
% DEL. If FORMAT is not given an attempt is made to derive it
% automatically from the first non-header line.
%
% In case a mat-file name [FILE '.mat'] is found it will be used instead of
% downloading.
%
% Columns (features) given as characters (the '%s' fields in FORMAT) will
% be stored as text based features. They will be replaced by indices to a
% set of strings stored in the corresponding feature domain (see
% SETFEATDOM). Use FEAT2LAB to use such a feature for labeling the dataset,
% see the below example.
%
% EXAMPLE
%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
%  c = pr_download(url,'iris.dat',[]); % load Iris dataset from UCI
%  % the labels are set as string (char) features in c(:,5)
%  a = feat2lab(c,5);  % use feature 5 for labeling
%
% SEE ALSO
% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB

% Copyright: R.P.W. Duin
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands


[~,urlname,urlext] = fileparts(url);
[datname,siz,nhead,form,misval,del,nosave] = setdefaults(varargin,urlname,1,0,[],'?',',',false);

[dirname,datname] = fileparts(datname);
if isempty(dirname)
  % no path given: store next to this m-file
  dirname = fileparts(which(mfilename));
end
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.dat']; % name of datfile to be created
urlfile = fullfile(dirname,urlname); % temp file for download
datfile = fullfile(dirname,datname); % unpacked urlfile
matfile = fullfile(dirname,matname); % final matfile

new = true;                          % if matfile exists, use it
if exist(matfile,'file') == 2
  s = load(matfile);
  f = fieldnames(s);
  a = s.(f{1});
  new = false;
  return
end

if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download_old(siz);
  if isempty(siz) || siz == 0
    disp(['Downloading ' urlname ' ....'])
  else
    disp(['Downloading ' urlname ' (' num2str(siz) ' MB) ....'])
  end

  % download into urlfile
  if ~usejava('jvm') && isunix
    % no Java, so urlwrite is not available: use wget. The arguments are
    % quoted as paths may hold blanks and URLs shell characters (&, ;)
    stat = unix(['wget -q -O "' urlfile '" "' url '"']);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end
 
  % assume file is created, uncompress if needed
  % fileparts only returns the last extension, so a name like x.tar.gz
  % has to be recognised from the full file name
  istar = any(strcmp(urlext,{'.tar','.tgz'})) || ...
          ~isempty(regexp(urlname,'\.tar\.gz$','once'));
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip "' urlfile '" -d "' datfile '"']);
    else
      unzip(urlfile,datfile);
    end
  elseif istar
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % decompression delivers a directory named DATFILE: replace it by the
  % first entry it contains
  if exist(datfile,'dir') == 7
    entries = dir(datfile);
    % '.' and '..' are not guaranteed to be the first two entries
    entries = entries(~ismember({entries.name},{'.','..'}));
    if isempty(entries)
      error(['No files found after decompression of: ' urlname])
    end
    copyfile(fullfile(datfile,entries(1).name),[datfile 'tmp']);
    rmdir(datfile,'s');   % remove the directory with all its content
    copyfile([datfile 'tmp'],datfile);
    delete([datfile 'tmp']);
  end

  % delete compressed file
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

% datfile should now be there, read and parse it
fid = fopen(datfile);
if fid < 0
  error(['Unable to open data file: ' datfile])
end
if isempty(form)        % if no format given ...
  for j=1:nhead+1
    s = fgetl(fid);     % derive it from the first nonheader line
  end
  if ~ischar(s)         % fgetl returns -1 at end of file
    fclose(fid);
    error(['No data line found in: ' datfile])
  end
  s = mytextscan(s,'c',del,0); % use all %s for time being
  form = getform(s);    % convert fields to %n where appropriate
  fseek(fid,0,-1);      % restart
end

disp('Parsing ...')
% read everything as strings, conversion is done by pr_cell2dset
% (mytextscan closes fid)
c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
a = pr_cell2dset(c,form,misval);

if ~nosave % don't save if not needed (e.g. called by pr_download_uci)
  save(matfile,'a');
end

return
448
function ask_download_old(size)
% Ask the user for permission to download SIZE MB of data.
% Nothing is asked if the global ASK is false, or if SIZE is empty or not
% larger than 1. An empty ASK is set to true. Any answer other than an
% empty one or 'y' raises the error 'Dataset not found'.

  global ASK
  if isempty(ASK), ASK = true; end

  if ASK
    if isempty(size) || ~(size > 1)
      return
    end
    sizetxt = ['(' num2str(size) ' MB)'];
    reply = input(['Dataset is not available, OK to download ' sizetxt ' [y]/n ?'],'s');
    if isempty(reply) || strcmp(reply,'y')
      return
    end
    error('Dataset not found')
  end

return
467
function form = getform(s)
% Derive a format string from the fields of a single data line.
% S is the cell array returned by MYTEXTSCAN for one line: S{1} holds the
% fields as strings. FORM gets one character per field: 'n' for fields
% that are blank or a numeric literal (optional sign, digits with
% optional decimal point, optional exponent), 'c' for all others.
%
% Testing for a complete numeric literal instead of for a set of allowed
% characters avoids that fields like 'e', '-' or '2012-01-05' are taken
% as numbers.
s = char(s{1});                 % one field per row, padded by blanks
nfield = size(s,1);
form = repmat('n',1,nfield);
numpat = '^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$';
for j=1:nfield
  field = strtrim(s(j,:));
  if ~isempty(field) && isempty(regexp(field,numpat,'once'))
    form(j) = 'c';
  end
end
477
function s = mytextscan(fid,forms,del,nhead)
% Call TEXTSCAN with a format given as one character per field.
%   FID    file identifier or a string to be scanned
%   FORMS  characters like 'nnc', one per field; every character is
%          preceded by '%' and 'c' is replaced by 's'
%   DEL    delimiter, a blank selects the TEXTSCAN default
%   NHEAD  number of header lines to skip
% If FID is not a string it is closed after scanning.

spec = blanks(2*numel(forms));
spec(1:2:end) = '%';
spec(2:2:end) = forms;
spec = strrep(spec,'c','s');

if del == ' '
  scanargs = {'Headerlines',nhead};
else
  scanargs = {'Delimiter',del,'Headerlines',nhead};
end
s = textscan(fid,spec,scanargs{:});

if ~ischar(fid)
  fclose(fid);
end
Note: See TracBrowser for help on using the repository browser.