source: prdatasets/pr_download.m @ 153

Last change on this file since 153 was 153, checked in by bduin, 5 years ago
File size: 15.3 KB
Line 
1%PR_DOWNLOAD Load or download data and create dataset
2%
3%   [A,NEW] = PR_DOWNLOAD(URL,DATFILE,OPTIONS)
4%
5% INPUT
6%   URL        URL of character file to be downloaded
7%   DATFILE    Desired name of downloaded and uncompressed file
8%              Default: name of the url-file, extended by .dat
9%   OPTIONS    Structure with options used for parsing and constructing
10%              a PRTools dataset
11%
12% OUTPUT
13%   A          Dataset
14%   NEW        Logical, TRUE if a new dataset has been created, FALSE if an
15%              existing mat-file has been found and used.
16%
17% DESCRIPTION
18% This routine facilitates downloading of character based datasets. DATFILE
19% will be the name (or path with name) in which the URL is downloaded. If
20% needed the URL file is unzipped and/or untarred first. After parsing a
21% PRTools dataset is constructed, stored in a mat-file (optional) and
22% returned. The name of the mat-file is DATFILE extended by .mat.
23%
24% The directory specified in DATFILE, or if not supplied, the directory and
25% the name of the calling routine, will be used for storing files in a
26% subdirectory 'data'. If the mat-file already exists it will be loaded and
27% returned in A (no new download and parsing). If DATFILE already exists it
28% will be used (no new download).
29%
30% OPTIONS should be a structure with the below fields, to be supplied in
31% lower case. Missing fields are replaced by the given defaults.
32%
33%   SIZE       = [];    Size of data to be downloaded, in MB. Not needed,
34%                       just used to warn the user.
35%   PARSE      = TRUE;  If FALSE, parsing is skipped. Just downloading and
36%                       uncompression. A will be empty.
37%   PARSEFUN   = [];    A handle of a user supplied parsing function. This
38%                       function should operate on DATFILE (first parameter,
39%                       substituted by PR_DOWNLOAD) and return a PRTools
40%                       dataset. If PARSEFUN is not given, default parsing
41%                       using PR_READDATASET will be used.
42%   PARSEPARS  = {};    Cell array with additional parameters for PARSEFUN.
43%   FORMAT     = [];    Needed for default parsing, see PR_READDATASET.
44%   NHEADLINES = 0;     Needed for default parsing, see PR_READDATASET.
45%   MISVALCHAR = '?';   Data characters to be replaced by NaN
46%   MISVALUE   = [];    Data values to be replaced by NaN
47%   DELIMETER  = ' ';   Needed for default parsing, see PR_READDATASET.
48%   EXTENSION  = 'dat'; Extension to be used for downloaded DATFILE.
49%   MATFILE    = TRUE;  If FALSE, the dataset A will not be saved.
50%   LABFEAT    = [];    Feature found in DATFILE and to be used as class
51%                       label, see FEAT2LAB.
52%   FEATS      = [];    Columns of dataset to be used as features.
53%   FEATNAMES  = [];    Desired feature names of dataset A, see SETFEATLAB.
54%   CLASSNAMES = [];    Class names to be stored in A, see SETLABLIST.
55%   USER       = [];    Additional information to be stored in the
56%                       user-field of A, see SETUSER.
57%   LINK       = [];    Link for more information in the dataset.
58%   DESC       = [];    Short description of the dataset.
59%   DSETNAME   = [];    Desired name of the dataset A.
60%
61%
62% EXAMPLE
63%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
64%  opt.extension = 'dat'; % create iris.dat
65%  opt.labfeat   = 5;     % use feature 5 for labeling
66%  opt.matfile   = false; % don't create a mat-file
67%  c = pr_download(url,[],opt) % load Iris dataset from UCI and parse
68%
69% SEE ALSO
70% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB
71
72% Copyright: R.P.W. Duin
73
74%%
function [a,new] = pr_download(url,datname,varargin)
% Main entry point: load an existing mat-file, or download URL, unpack it
% into DATFILE, parse it into a dataset and (optionally) save it.
%
%   url       URL of the file to download
%   datname   name (optionally with path) of the unpacked data file
%   varargin  either an OPTIONS structure (new calling convention) or the
%             positional parameters of the old calling convention
%
%   a         dataset, [] when parsing is switched off
%   new       true only when a new mat-file has been written by this call

%% make sure there is a data subdir next to this m-file
persistent DATADIREXISTS
if isempty(DATADIREXISTS)
  datasubdir = fullfile(fileparts(which(mfilename)),'data');
  if exist(datasubdir,'dir') ~= 7
    mkdir(datasubdir);
  end
  DATADIREXISTS = true;
end

%% outputs must always be defined, also on the early returns below
a   = [];
new = false;

%% handle the old calling convention and missing arguments
if nargin >= 3
  % this can be removed when all mfiles in prdatasets call the new version
  % of pr_download_uci
  if ~isstruct(varargin{1}) && ~isempty(varargin{1}) && isnumeric(varargin{1})
    [a,new] = pr_download_old(url,datname,varargin{:});
    return
  else
    opt = varargin{1};
  end
end

if nargin < 3, opt = []; end
if nargin < 2, datname = []; end

opt = download_opt(opt);  % set defaults where necessary

%% find directory to be used
if isempty(datname)
  datname = pr_callername;
  dirname = fullfile(fileparts(which(datname)),'data');
else
  [dirname,datname] = fileparts(datname);
end

%% set all necessary filenames
[~,urlname,urlext] = fileparts(url);
if isempty(datname)
  % will only be empty if called from command line
  datname = urlname;
  dirname = pwd;
end
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.' opt.extension]; % name of datfile to be created
urlfile = fullfile(dirname,urlname);   % temp file for download
datfile = fullfile(dirname,datname);   % unpacked urlfile
matfile = fullfile(dirname,matname);   % final matfile

%% load mat-file if it exists
if exist(matfile,'file') == 2
  s = prload(matfile);
  f = fieldnames(s);
  a = s.(f{1});           % first variable stored in the mat-file
  return  % we are done!!
end

%% download the data file if it doesn't exist
if exist(datfile,'file') ~= 2
  ask_download(urlname,opt.size);
  fetch_datfile(url,urlname,urlext,urlfile,datfile);
end

if ~opt.parse
  % no parsing desired, we are done; a is returned empty
  return
end

%% datfile should now be there, read and convert to dataset
disp('Parsing ...')
if isempty(opt.parsefun)
  a = pr_readdataset(datfile,opt.nheadlines,opt.delimeter, ...
                   opt.misvalchar,opt.format);
else
  % user defined parsing
  a = opt.parsefun(datfile,opt.parsepars{:});
end

%% set dataset fields
if ~isempty(opt.labfeat) && opt.labfeat > 0
  a = feat2lab(a,opt.labfeat);
end
if ~isempty(opt.classnames)
  a = setlablist(a,opt.classnames);
end
if ~isempty(opt.feats)
  a = a(:,opt.feats);
end
if ~isempty(opt.featnames)
  a = setfeatlab(a,opt.featnames);
end
if ~isempty(opt.misvalue)
  J = find(a==opt.misvalue);
  a(J) = NaN;
end
if ~isempty(opt.user)
  a = setuser(a,opt.user);
end
if ~isempty(opt.link)
  a = setuser(a,opt.link,'link');
end
if ~isempty(opt.desc)
  a = setuser(a,opt.desc,'desc');
end
if ~isempty(opt.dsetname)
  a = setname(a,opt.dsetname);
else
  a = setname(a,pr_callername);
end

%% save if desired
if opt.matfile
  save(matfile,'a');
  new = true;   % NEW is only raised when the mat-file has been written
end

return

function fetch_datfile(url,urlname,urlext,urlfile,datfile)
%% Download URL into URLFILE and convert it into the plain file DATFILE.
%  Archives are unpacked; the first entry of the archive becomes DATFILE.

  % single-quote a string for use in a unix shell command line
  shq = @(str) ['''' strrep(str,'''','''\''''') ''''];

  if ~usejava('jvm') && isunix
    % no Java available: urlwrite cannot be used, fall back on wget
    stat = unix(['wget -q -O ' shq(urlfile) ' ' shq(url)]);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % fileparts only returns the last extension, so '.tar.gz' has to be
  % detected on the full name and before the plain '.gz' case
  istargz = ~isempty(regexp(urlname,'\.tar\.gz$','once'));

  % assume file is created, uncompress if needed
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip ' shq(urlfile) ' -d ' shq(datfile)]);
    else
      unzip(urlfile,datfile);
    end
  elseif istargz || strcmp(urlext,'.tar') || strcmp(urlext,'.tgz')
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % decompression creates a directory named DATFILE: replace it by its
  % first entry
  if exist(datfile,'dir') == 7
    entries = dir(datfile);
    names = {entries.name};
    names = names(~ismember(names,{'.','..'}));  % do not rely on dir order
    if isempty(names)
      error(['No data found after decompression of ' urlname])
    end
    tmpfile = [datfile 'tmp'];
    movefile(fullfile(datfile,names{1}),tmpfile);
    rmdir(datfile,'s');     % also removes remaining files and subdirs
    movefile(tmpfile,datfile);
  end

  % delete the downloaded (compressed) file
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end

return
235
236
function ask_download(urlname,datsize)
%% user controlled downloading
%
%   urlname   name of the file that is going to be downloaded (for display)
%   datsize   size of the download in MB, [] or 0 if unknown
%
% If the global ASK is set and DATSIZE is given, the user is asked for
% permission; any answer other than 'y' or just <return> raises an error.
% Finally a progress message is displayed.
  global ASK
  if isempty(ASK)
    % not initialised yet, use the same default as ask_download_old
    ASK = true;
  end

  sizeknown = ~isempty(datsize) && datsize ~= 0;
  if sizeknown
    siz = ['(' num2str(datsize) ' MB)'];
  else
    siz = '';
  end

  if ASK && ~isempty(datsize) % ask only if datsize has been set
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    if ~isempty(q) && ~strcmp(q,'y')
      error('No dataset')
    end
  end

  if sizeknown
    % siz is already formatted, do not wrap it a second time
    disp(['Downloading ' urlname ' ' siz ' ....'])
  else
    disp(['Downloading ' urlname ' ....'])
  end

return
262
function opt = download_opt(opt_given)
%% Merge user supplied options with the defaults
%
%   opt_given   [] or a structure holding a subset of the option fields
%   opt         structure with all option fields, defaults filled in
%
% An error is raised if OPT_GIVEN is not a structure or if it contains
% field names that are not known options.
  opt.size       = [];
  opt.parse      = true;
  opt.parsefun   = [];
  opt.parsepars  = {};
  opt.format     = [];
  opt.nheadlines = 0;
  opt.misvalchar = '?';
  opt.misvalue   = [];
  opt.delimeter  = ' ';
  opt.extension  = 'dat';
  opt.matfile    = true;
  opt.labfeat    = [];
  opt.feats      = [];
  opt.featnames  = '';
  opt.classnames = '';
  opt.user       = [];
  opt.dsetname   = '';
  opt.link       = '';
  opt.desc       = '';

  if ~isempty(opt_given)
    % comma separated list of valid field names, used in error messages
    fall = fieldnames(opt);
    ff = sprintf('%s, ',fall{:});
    ff = ff(1:end-2);            % strip the trailing ', '

    if ~isstruct(opt_given)
      error(['OPTIONS should be a structure with at least one of the following fields: ' ff '.']);
    end
    fn = fieldnames(opt_given);
    if ~all(ismember(fn,fall))
      error(['Wrong field names; valid field names are: ' ff])
    end
    % overwrite the defaults by the supplied values
    for i = 1:length(fn)
      opt.(fn{i}) = opt_given.(fn{i});
    end
  end

return
306
function [a,new] = pr_download_old(url,varargin)
%% This is the old version of pr_download, to be called from the old
%  version of pr_download_uci only (inside it). It can be removed when all
%  mfiles in prdataset make the new call to  pr_download_uci
%
%PR_DOWNLOAD Load or download data and create dataset
%
%   A = PR_DOWNLOAD(URL,FILE,SIZE,NHEAD,FORMAT,MISVALCHAR,DELCHAR,NOSAVE)
%
% INPUT
%   URL          URL of character file to be downloaded
%   FILE         Filename to download
%   SIZE         Size of data to be downloaded in Mbytes
%   NHEAD        # of headerlines to skip
%   FORMAT       String or cell array defining the format
%                (default, automatic)
%   MISVALCHAR   Character used for missing values
%   DEL          Character delimiter used in the file (default ',')
%   NOSAVE       Logical, if TRUE A will not be saved, default FALSE
%
% OUTPUT
%   A            Unlabeled dataset
%   NEW          Logical, FALSE if an existing mat-file has been loaded,
%                TRUE otherwise
%
% DESCRIPTION
% This routine facilitates downloading of character based datasets. FILE
% should be the name (or path with name) in which the URL is downloaded. If
% needed the URL file is unzipped and/or untarred first. If FILE already
% exists it is used (no downloading). The file is parsed by TEXTSCAN using
% the format given in FORMAT (see TEXTSCAN) and the delimiter specified in
% DEL. If FORMAT is not given an attempt is made to derive it
% automatically.
%
% In case a mat-file name [FILE '.mat'] is found it will be used instead of
% downloading.
%
% Columns (features) given as characters (the '%s' fields in FORMAT) will
% be stored as text based features. They will be replaced by indices to a
% set of strings stored in the corresponding feature domain (see
% SETFEATDOM). Use FEAT2LAB to use such a feature for labeling the dataset,
% see the below example.
%
% EXAMPLE
%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
%  c = pr_download(url,'iris.dat',[]); % load Iris dataset from UCI
%  % the labels are set as string (char) features in c(:,5)
%  a = feat2lab(c,5);  % use feature 5 for labeling
%
% SEE ALSO
% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB

% Copyright: R.P.W. Duin
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands


[~,urlname,urlext] = fileparts(url);
[datname,siz,nhead,form,misval,del,nosave] = setdefaults(varargin,urlname,1,0,[],'?',',',false);

[dirname,datname] = fileparts(datname);
if isempty(dirname)
  % no path given: store next to this m-file
  dirname = fileparts(which(mfilename));
end
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.dat']; % name of datfile to be created
urlfile = fullfile(dirname,urlname); % temp file for download
datfile = fullfile(dirname,datname); % unpacked urlfile
matfile = fullfile(dirname,matname); % final matfile

new = true;                          % if matfile exists, use it
if exist(matfile,'file') == 2
  s = load(matfile);
  f = fieldnames(s);
  a = s.(f{1});
  new = false;
  return
end

if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download_old(siz);
  if isempty(siz) || siz == 0
    disp(['Downloading ' urlname ' ....'])
  else
    disp(['Downloading ' urlname ' (' num2str(siz) ' MB) ....'])
  end
  fetch_datfile_old(url,urlname,urlext,urlfile,datfile);
end

% datfile should now be there, read and parse it
fid = fopen(datfile);
if fid < 0
  error(['Could not open data file: ' datfile])
end
if isempty(form)        % if no format given ...
  for j=1:nhead+1
    s = fgetl(fid);     % derive it from the first nonheader line
  end
  if ~ischar(s)
    % fgetl returns -1 at end of file: nothing to derive a format from
    fclose(fid);
    error(['No data lines found in: ' datfile])
  end
  s = mytextscan(s,'c',del,0); % use all %s for time being
  form = getform(s);    % convert fields to %n where appropriate
  fseek(fid,0,-1);      % restart
end

disp('Parsing ...')
c = mytextscan(fid,strrep(form,'n','s'),del,nhead); % also closes fid
a = pr_cell2dset(c,form,misval);

if ~nosave % don't save if not needed (e.g. called by pr_download_uci)
  save(matfile,'a');
end

return

function fetch_datfile_old(url,urlname,urlext,urlfile,datfile)
%% Download URL into URLFILE and convert it into the plain file DATFILE.
%  Archives are unpacked; the first entry of the archive becomes DATFILE.

  % single-quote a string for use in a unix shell command line
  shq = @(str) ['''' strrep(str,'''','''\''''') ''''];

  if ~usejava('jvm') && isunix
    % no Java available: urlwrite cannot be used, fall back on wget
    stat = unix(['wget -q -O ' shq(urlfile) ' ' shq(url)]);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % fileparts only returns the last extension, so '.tar.gz' has to be
  % detected on the full name and before the plain '.gz' case
  istargz = ~isempty(regexp(urlname,'\.tar\.gz$','once'));

  % assume file is created, uncompress if needed
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip ' shq(urlfile) ' -d ' shq(datfile)]);
    else
      unzip(urlfile,datfile);
    end
  elseif istargz || strcmp(urlext,'.tar') || strcmp(urlext,'.tgz')
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % decompression creates a directory named DATFILE: replace it by its
  % first entry
  if exist(datfile,'dir') == 7
    entries = dir(datfile);
    names = {entries.name};
    names = names(~ismember(names,{'.','..'}));  % do not rely on dir order
    if isempty(names)
      error(['No data found after decompression of ' urlname])
    end
    tmpfile = [datfile 'tmp'];
    movefile(fullfile(datfile,names{1}),tmpfile);
    rmdir(datfile,'s');     % also removes remaining files and subdirs
    movefile(tmpfile,datfile);
  end

  % delete the downloaded (compressed) file
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end

return
456
function ask_download_old(datsize)
%% Ask the user for permission to download (old calling convention)
%
%   datsize   size of the download in MB, [] if unknown
%
% The global ASK controls whether the question is posed; it is set to
% true when it has not been initialised yet. Any answer other than 'y' or
% just <return> raises an error.

  global ASK
  if isempty(ASK), ASK = true; end

  if ASK
    % size annotation for the prompt, empty if the size is unknown
    siz = '';
    if ~isempty(datsize)
      siz = ['(' num2str(datsize) ' MB)'];
    end
    prompt = ['Dataset is not available, OK to download ' siz ' [y]/n ?'];
    answer = input(prompt,'s');
    if ~isempty(answer) && ~strcmp(answer,'y')
      error('Dataset not found')
    end
  end

return
477
function form = getform(s)
%% Derive a format string from the fields of a single data line
%
%   s      cell holding in s{1} the fields of the line, as a cell array of
%          strings (or as a character array, one field per row)
%   form   character row with one entry per field: 'n' for a numeric
%          field, 'c' for a text field
%
% A field is numeric if it is a plain decimal number with an optional sign,
% fraction and exponent. Empty fields are treated as numeric.
fields = s{1};
if ischar(fields)
  fields = cellstr(fields);
end

% complete-number pattern; a mere set of allowed characters would accept
% fields like 'E' or '-' and, by the range +-. , also the comma
numpat = '^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$';

form = repmat('n',1,numel(fields));
for j=1:numel(fields)
  field = strtrim(fields{j});
  if ~isempty(field) && isempty(regexp(field,numpat,'once'))
    form(j) = 'c';
  end
end
487
function s = mytextscan(fid,forms,del,nhead)
%% Call TEXTSCAN using a compact format description
%
%   fid     file identifier or a string to be scanned
%   forms   string with one conversion character per field, 'c' is
%           handled as 's'
%   del     delimiter, a space selects the TEXTSCAN default
%   nhead   number of header lines to skip
%   s       cell array as returned by TEXTSCAN
%
% If FID is not a string it is closed after scanning.

% put a '%' in front of every conversion character
conv = strrep(forms,'c','s');
spec = reshape([repmat('%',1,numel(forms)); conv(:).'],1,[]);

% only pass a delimiter if it is something else than spaces
pars = {'Headerlines',nhead};
onlyspaces = ~isempty(del) && all(del(:) == ' ');
if ~onlyspaces
  pars = [{'Delimiter',del} pars];
end
s = textscan(fid,spec,pars{:});

if ~ischar(fid)
  fclose(fid);
end
Note: See TracBrowser for help on using the repository browser.