source: prdatasets/pr_download.m @ 136

Last change on this file since 136 was 136, checked in by bduin, 5 years ago
File size: 15.2 KB
Line 
1%PR_DOWNLOAD Load or download data and create dataset
2%
3%   [A,NEW] = PR_DOWNLOAD(URL,DATFILE,OPTIONS)
4%
5% INPUT
6%   URL        URL of character file to be downloaded
7%   DATFILE    Desired name of downloaded and uncompressed file
8%              Default: name of the url-file, extended by .dat
9%   OPTIONS    Structure with options used for parsing and constructing
10%              a PRTools dataset
11%
12% OUTPUT
13%   A          Dataset
14%   NEW        Logical, TRUE if a new dataset has been created, FALSE if an
15%              existing mat-file has been found and used.
16%
17% DESCRIPTION
18% This routine facilitates downloading of character based datasets. DATFILE
19% will be the name (or path with name) in which the URL is downloaded. If
20% needed the URL file is unzipped and/or untarred first. After parsing a
21% PRTools dataset is constructed, stored in a mat-file (optional) and
22% returned. The name of the mat-file is DATFILE extended by .mat.
23%
24% The directory specified in DATFILE, or if not supplied, the directory and
25% the name of the calling routine, will be used for storing files in a
26% subdirectory 'data'. If the mat-file already exists it will be loaded and
27% returned in A (no new download and parsing). If DATFILE already exists it
28% will be used (no new download).
29%
30% OPTIONS should be a structure with the below fields, to be supplied in
31% lower case. Missing fields are replaced by the given defaults.
32%
33%   SIZE       = [];    Size of data to be downloaded, in MB. Not needed,
34%                       just used to warn the user.
35%   PARSE      = TRUE;  If FALSE, parsing is skipped. Just downloading and
36%                       uncompression. A will be empty.
37%   PARSEFUN   = [];    A handle of a user supplied parsing function. This
38%                       function should operate on DATFILE (first parameter,
39%                       substituted by PR_DOWNLOAD) and return a PRTools
40%                       dataset. If PARSEFUN is not given, default parsing
41%                       using PR_READDATASET will be used.
42%   PARSEPARS  = {};    Cell array with additional parameters for PARSEFUN.
43%   FORMAT     = [];    Needed for default parsing, see PR_READDATASET.
44%   NHEADLINES = 0;     Needed for default parsing, see PR_READDATASET.
45%   MISVALCHAR = '?';   Data characters to be replaced by NaN
46%   MISVALUE   = [];    Data values to be replaced by NaN
47%   DELIMETER  = ' ';   Needed for default parsing, see PR_READDATASET.
48%   EXTENSION  = 'dat'; Extension to be used for downloaded DATFILE.
49%   MATFILE    = TRUE;  If FALSE, the dataset A will not be saved.
50%   LABFEAT    = [];    Feature found in DATFILE and to be used as class
51%                       label, see FEAT2LAB.
52%   FEATS      = [];    Columns of dataset to be used as features.
53%   FEATNAMES  = [];    Desired feature names of dataset A, see SETFEATLAB.
54%   CLASSNAMES = [];    Class names to be stored in A, see SETLABLIST.
55%   USER       = [];    Additional information to be stored in the
56%                       user-field of A, see SETUSER.
57%   LINK       = [];    Link for more information in the dataset.
58%   DESC       = [];    Short description of the dataset.
59%   DSETNAME   = [];    Desired name of the dataset A.
60%
61%
62% EXAMPLE
63%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
64%  opt.extension = 'dat'; % create iris.dat
65%  opt.labfeat   = 5;     % use feature 5 for labeling
66%  opt.matfile   = false; % don't create a mat-file
67%  c = pr_download(url,[],opt) % load Iris dataset from UCI and parse
68%
69% SEE ALSO
70% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB
71
72% Copyright: R.P.W. Duin
73
74%%
function [a,new] = pr_download(url,datname,varargin)
% Load a dataset from an existing mat-file, or download URL, uncompress it,
% parse it into a dataset and (optionally) store it as a mat-file.
%
%   url      URL of the file to be downloaded
%   datname  desired name (optionally with path) of the unpacked data file
%   varargin OPTIONS structure (see download_opt); a non-empty numeric third
%            argument selects the old calling convention (pr_download_old)
%
%   a        dataset, [] if parsing is switched off (opt.parse = false)
%   new      true if a dataset was parsed and saved in a new mat-file

if nargin >= 3
  % this can be removed when all mfiles in prdatasets call the new version
  % of pr_download_uci
  if ~isstruct(varargin{1}) && ~isempty(varargin{1}) && isnumeric(varargin{1})
    [a,new] = pr_download_old(url,datname,varargin{:});
    return
  else
    opt = varargin{1};
  end
end

if nargin < 3, opt = []; end
if nargin < 2, datname = []; end

opt = download_opt(opt);  % set defaults where necessary

% make sure both outputs are defined on every return path
a   = [];
new = false;

%% find directory to be used
if isempty(datname)
  datname = callername;
  if isempty(datname)
    % called from the command line: no caller to derive a directory from,
    % name and directory are set below from the url and pwd
    dirname = '';
  else
    dirname = fullfile(fileparts(which(datname)),'data');
  end
else
  [dirname,datname] = fileparts(datname);
end

%% set all necessary filenames
[~,urlbase,urlext] = fileparts(url);
if isempty(datname)
  % will only be empty if called from command line
  datname = urlbase;
  dirname = pwd;
end
urlname = [urlbase urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.' opt.extension]; % name of datfile to be created
urlfile = fullfile(dirname,urlname);   % temp file for download
datfile = fullfile(dirname,datname);   % unpacked urlfile
matfile = fullfile(dirname,matname);   % final matfile

%% load mat-file if it exists
if exist(matfile,'file') == 2
  s = prload(matfile);
  f = fieldnames(s);
  a = s.(f{1});
  return  % we are done!!
end

%% download the data file if it doesn't exist
if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download(urlname,opt.size);

  % the target directory (e.g. the 'data' subdirectory) may not exist yet
  if ~isempty(dirname) && exist(dirname,'dir') ~= 7
    mkdir(dirname);
  end

  if ~usejava('jvm') && isunix
    % quote file name and url: they may contain spaces or shell characters
    stat = unix(['wget -q -O "' urlfile '" "' url '"']);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % fileparts never returns '.tar.gz' as extension: for 'x.tar.gz' it
  % returns name 'x.tar' and extension '.gz', so test the name as well
  istargz = strcmpi(urlext,'.gz') && numel(urlbase) > 4 && ...
            strcmpi(urlbase(end-3:end),'.tar');

  % assume file is created, uncompress if needed
  % delete compressed file
  if strcmpi(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip "' urlfile '" -d "' datfile '"']);
    else
      unzip(urlfile,datfile);
    end
  elseif strcmpi(urlext,'.tar') || strcmpi(urlext,'.tgz') || istargz
    disp('Decompression ....')
    untar(urlfile,datfile);          % untar also handles gzipped tar files
  elseif strcmpi(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % the uncompress routines create a directory named datfile; replace it
  % by the first regular file found inside
  if exist(datfile,'dir') == 7
    items = dir(datfile);
    items = items(~[items.isdir]);   % skips '.', '..' and subdirectories
    if isempty(items)
      error(['No data file found after decompression of ' urlname])
    end
    tmpfile = [datfile 'tmp'];
    copyfile(fullfile(datfile,items(1).name),tmpfile);
    rmdir(datfile,'s');              % also removes possible subdirectories
    movefile(tmpfile,datfile);
  end
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

if ~opt.parse
  % no parsing desired, we are done (a is empty)
  return
end

%% datfile should now be there, read and convert to dataset
disp('Parsing ...')
if isempty(opt.parsefun)
  a = pr_readdataset(datfile,opt.nheadlines,opt.delimeter, ...
                   opt.misvalchar,opt.format);
else
  % user defined parsing
  a = opt.parsefun(datfile,opt.parsepars{:});
end

%% set dataset fields
if ~isempty(opt.labfeat) && opt.labfeat > 0
  a = feat2lab(a,opt.labfeat);
end
if ~isempty(opt.classnames)
  a = setlablist(a,opt.classnames);
end
if ~isempty(opt.feats)
  a = a(:,opt.feats);
end
if ~isempty(opt.featnames)
  a = setfeatlab(a,opt.featnames);
end
if ~isempty(opt.misvalue)
  J = find(a==opt.misvalue);
  a(J) = NaN;
end
if ~isempty(opt.user)
  a = setuser(a,opt.user);
end
if ~isempty(opt.link)
  a = setuser(a,opt.link,'link');
end
if ~isempty(opt.desc)
  a = setuser(a,opt.desc,'desc');
end
if ~isempty(opt.dsetname)
  a = setname(a,opt.dsetname);
else
  a = setname(a,callername);
end

%% save if desired
% NOTE(review): new is only set when a mat-file is written, so it stays
% false for opt.matfile = false although a dataset was newly created.
% Kept as is, confirm against the callers before changing.
if opt.matfile
  save(matfile,'a');
  new = true;
end

return
224
225
function ask_download(urlname,size)
%% user controlled downloading
% Ask the user for permission to download (only if the global ASK is true
% or not set) and report the start of the download.
%
%   urlname  name of the file to be downloaded, used in the message
%   size     size of the download in MB, [] or 0 if unknown
%
% An error is raised if the user does not agree.
  global ASK
  if isempty(ASK)
    ASK = true;
  end

  % format the size once, it is used in the question and in the message
  if ~isempty(size) && size ~= 0
    siz = ['(' num2str(size) ' MB)'];
  else
    siz = '';
  end

  if ASK
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    % an empty answer means yes (default), also accept 'Y' and 'yes'
    if ~isempty(q) && ~any(strcmpi(strtrim(q),{'y','yes'}))
      error('No dataset')
    end
  end

  if isempty(siz)
    disp(['Downloading ' urlname ' ....'])
  else
    disp(['Downloading ' urlname ' ' siz ' ....'])
  end

return
254
function opt = download_opt(opt_given)
%% Merge user supplied options with the defaults
%
%   opt_given  [] or a structure with some of the fields listed below
%   opt        structure with all fields, user values override defaults
%
% An error is raised if opt_given is not a structure or if it contains
% unknown field names.
  opt.size       = [];
  opt.parse      = true;
  opt.parsefun   = [];
  opt.parsepars  = {};
  opt.format     = [];
  opt.nheadlines = 0;
  opt.misvalchar = '?';
  opt.misvalue   = [];
  opt.delimeter  = ' ';
  opt.extension  = 'dat';
  opt.matfile    = true;
  opt.labfeat    = [];
  opt.feats      = [];
  opt.featnames  = '';
  opt.classnames = '';
  opt.user       = [];
  opt.dsetname   = '';
  opt.link       = '';
  opt.desc       = '';

  if ~isempty(opt_given)
    % comma separated list of valid names for the error messages
    fall  = fieldnames(opt);
    valid = sprintf('%s, ',fall{:});
    valid = valid(1:end-2);          % remove trailing ', '

    if ~isstruct(opt_given)
      error(['OPTIONS should be a structure with one or more of the following fields: ' valid '.']);
    end
    fn    = fieldnames(opt_given);
    known = ismember(fn,fall);
    if ~all(known)
      wrong = sprintf('%s, ',fn{~known});
      wrong = wrong(1:end-2);
      error(['Wrong field names (' wrong '); valid field names are: ' valid])
    end
    for i = 1:length(fn)
      opt.(fn{i}) = opt_given.(fn{i});
    end
  end

return
298
function name = callername
%% Name of the routine that called the function calling CALLERNAME
% The third entry of the call stack is used (1: callername, 2: its caller,
% 3: the caller of that one). Returns [] if the stack has fewer entries.
stack = dbstack;
if numel(stack) >= 3
  name = stack(3).name;
else
  name = [];
end
307
function [a,new] = pr_download_old(url,varargin)
%% This is the old version of pr_download, to be called from the old
%  version of pr_download_uci only (inside it). It can be removed when all
%  mfiles in prdataset make the new call to  pr_download_uci
%
%PR_DOWNLOAD Load or download data and create dataset
%
%   A = PR_DOWNLOAD(URL,FILE,SIZE,NHEAD,FORMAT,MISVALCHAR,DELCHAR,NOSAVE)
%
% INPUT
%   URL          URL of character file to be downloaded
%   FILE         Filename to download
%   SIZE         Size of data to be downloaded in Mbytes
%   NHEAD        # of headerlines to skip
%   FORMAT       String or cell array defining the format
%                (default, automatic)
%   MISVALCHAR   Character used for missing values
%   DEL          Character delimiter used in the file (default ',')
%   NOSAVE       Logical, if TRUE A will not be saved, default FALSE
%
% OUTPUT
%   A            Unlabeled dataset
%   NEW          Logical, FALSE if an existing mat-file has been loaded
%
% DESCRIPTION
% This routine facilitates downloading of character based datasets. FILE
% should be the name (or path with name) in which the URL is downloaded. If
% needed the URL file is unzipped and/or untarred first. If FILE already
% exists it is used (no downloading). The file is parsed by TEXTSCAN using
% the format given in FORMAT (see TEXTSCAN) and the delimiter specified in
% DEL. If FORMAT is not given an attempt is made to derive it
% automatically.
%
% In case a mat-file name [FILE '.mat'] is found it will be used instead of
% downloading.
%
% Columns (features) given as characters (the '%s' fields in FORMAT) will
% be stored as text based features. They will be replaced by indices to a
% set of strings stored in the corresponding feature domain (see
% SETFEATDOM). Use FEAT2LAB to use such a feature for labeling the dataset,
% see the below example.
%
% EXAMPLE
%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
%  c = pr_download(url,'iris.dat',[]); % load Iris dataset from UCI
%  % the labels are set as string (char) features in c(:,5)
%  a = feat2lab(c,5);  % use feature 5 for labeling
%
% SEE ALSO
% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB

% Copyright: R.P.W. Duin
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands


[~,urlbase,urlext] = fileparts(url);
[datname,siz,nhead,form,misval,del,nosave] = setdefaults(varargin,urlbase,1,0,[],'?',',',false);

[dirname,datname] = fileparts(datname);
if isempty(dirname)
  dirname = fileparts(which(mfilename));
  % dirname = pwd;
end
urlname = [urlbase urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.dat']; % name of datfile to be created
urlfile = fullfile(dirname,urlname); % temp file for download
datfile = fullfile(dirname,datname); % unpacked urlfile
matfile = fullfile(dirname,matname); % final matfile

new = true;                          % if matfile exists, use it
if exist(matfile,'file') == 2
  s = load(matfile);
  f = fieldnames(s);
  a = s.(f{1});
  new = false;
  return
end

if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download_old(siz);
  if isempty(siz) || siz == 0
    disp(['Downloading ' urlname ' ....'])
  else
    disp(['Downloading ' urlname ' (' num2str(siz) ' MB) ....'])
  end

  % download into urlfile
  if ~usejava('jvm') && isunix
    % quote file name and url: they may contain spaces or shell characters
    stat = unix(['wget -q -O "' urlfile '" "' url '"']);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % fileparts never returns '.tar.gz' as extension: for 'x.tar.gz' it
  % returns name 'x.tar' and extension '.gz', so test the name as well
  istargz = strcmpi(urlext,'.gz') && numel(urlbase) > 4 && ...
            strcmpi(urlbase(end-3:end),'.tar');

  % assume file is created, uncompress if needed
  % delete compressed file
  if strcmpi(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip "' urlfile '" -d "' datfile '"']);
    else
      unzip(urlfile,datfile);
    end
  elseif strcmpi(urlext,'.tar') || strcmpi(urlext,'.tgz') || istargz
    disp('Decompression ....')
    untar(urlfile,datfile);          % untar also handles gzipped tar files
  elseif strcmpi(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % the uncompress routines create a directory named datfile; replace it
  % by the first regular file found inside
  if exist(datfile,'dir') == 7
    items = dir(datfile);
    items = items(~[items.isdir]);   % skips '.', '..' and subdirectories
    if isempty(items)
      error(['No data file found after decompression of ' urlname])
    end
    tmpfile = [datfile 'tmp'];
    copyfile(fullfile(datfile,items(1).name),tmpfile);
    rmdir(datfile,'s');              % also removes possible subdirectories
    movefile(tmpfile,datfile);
  end
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

% datfile should now be there, read and parse it
fid = fopen(datfile);
if fid < 0
  error(['Unable to open data file: ' datfile])
end
if isempty(form)        % if no format given ...
  for j=1:nhead+1
    s = fgetl(fid);     % derive it from the first nonheader line
  end
  if ~ischar(s)         % fgetl returns -1 at end of file
    fclose(fid);
    error(['No data lines found in data file: ' datfile])
  end
  s = mytextscan(s,'c',del,0); % use all %s for time being
  form = getform(s);    % convert fields to %n where appropriate
  fseek(fid,0,-1);      % restart
end

disp('Parsing ...')
c = mytextscan(fid,strrep(form,'n','s'),del,nhead); % closes fid
a = cell2dset(c,form,misval);

if ~nosave % don't save if not needed (e.g. called by pr_download_uci)
  save(matfile,'a');
end

return
457
function ask_download_old(size)
%% Ask permission for downloading (old calling convention)
% Nothing is asked if the global ASK is false. If ASK is not set it is
% initialised to true. An error is raised if the user answers anything
% else than 'y' or an empty line.
%
%   size   size of the download in MB, shown in the question if not empty

  global ASK
  if isempty(ASK), ASK = true; end

  if ~ASK
    return
  end

  if isempty(size)
    siz = '';
  else
    siz = ['(' num2str(size) ' MB)'];
  end

  answer = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
  if ~(isempty(answer) || strcmp(answer,'y'))
    error('Dataset not found')
  end

return
478
function form = getform(s)
% Derive a format string from the fields of a scanned text line.
%
%   s     cell array as returned by mytextscan, s{1} holds the fields
%   form  character row with one character per field: 'n' for fields that
%         only contain characters allowed by the expression below, 'c'
%         for all other fields
fields  = char(s{1});              % one field per row, padded by spaces
nfields = size(fields,1);
form    = repmat('n',1,nfields);   % start with all fields numeric
k = 0;
while k < nfields
  k = k + 1;
  % NOTE(review): '+-.' inside the brackets is a range, so it also
  % accepts a comma - confirm that this is intended
  hit = regexp(fields(k,:),'[^0-9+-.eE ]','once');
  if ~isempty(hit)
    form(k) = 'c';
  end
end
488
function s = mytextscan(fid,forms,del,nhead)
% Call TEXTSCAN with a format given as a string of field characters.
%
%   fid    file identifier or a character string to be scanned
%   forms  format characters, one per field, e.g. 'ncs'; 'c' is read as 's'
%   del    delimiter; a space selects the default whitespace handling of
%          TEXTSCAN
%   nhead  number of header lines to skip
%   s      cell array returned by TEXTSCAN
%
% If fid is a file identifier the file is closed after scanning, also if
% scanning fails.

% put a '%' in front of every format character
form = repmat('%%',1,numel(forms));
form(2:2:end) = forms;
forms = strrep(form,'c','s');
try
  if del == ' '
    s = textscan(fid,forms,'Headerlines',nhead);
  else
    s = textscan(fid,forms,'Delimiter',del,'Headerlines',nhead);
  end
catch err
  % don't leave the file open; only close identifiers that are really open
  % so that the original error is not masked
  if ~ischar(fid) && any(fopen('all') == fid)
    fclose(fid);
  end
  rethrow(err);
end
if ~ischar(fid)
  fclose(fid);
end
Note: See TracBrowser for help on using the repository browser.