source: prdatasets/pr_download.m @ 154

Last change on this file since 154 was 154, checked in by bduin, 5 years ago
File size: 15.2 KB
Line 
1%PR_DOWNLOAD Load or download data and create dataset
2%
3%   [A,NEW] = PR_DOWNLOAD(URL,DATFILE,OPTIONS)
4%
5% INPUT
6%   URL        URL of character file to be downloaded
7%   DATFILE    Desired name of downloaded and uncompressed file
8%              Default: name of the url-file, extended by .dat
9%   OPTIONS    Structure with options used for parsing and constructing
10%              a PRTools dataset
11%
12% OUTPUT
13%   A          Dataset
14%   NEW        Logical, TRUE if a new dataset has been created, FALSE if an
15%              existing mat-file has been found and used.
16%
17% DESCRIPTION
18% This routine facilitates downloading of character based datasets. DATFILE
19% will be the name (or path with name) in which the URL is downloaded. If
20% needed the URL file is unzipped and/or untarred first. After parsing a
21% PRTools dataset is constructed, stored in a mat-file (optional) and
22% returned. The name of the mat-file is DATFILE extended by .mat.
23%
24% The directory specified in DATFILE, or if not supplied, the directory and
25% the name of the calling routine, will be used for storing files in a
26% subdirectory 'data'. If the mat-file already exists it will be loaded and
27% returned in A (no new download and parsing). If DATFILE already exists it
28% will be used (no new download).
29%
30% OPTIONS should be a structure with the below fields, to be supplied in
31% lower case. Missing fields are replaced by the given defaults.
32%
33%   SIZE       = [];    Size of data to be downloaded, in MB. Not needed,
34%                       just used to warn the user.
35%   PARSE      = TRUE;  If FALSE, parsing is skipped. Just downloading and
36%                       uncompression. A will be empty.
37%   PARSEFUN   = [];    A handle of a user supplied parsing function. This
38%                       function should operate on DATFILE (first parameter,
39%                       substituted by PR_DOWNLOAD) and return a PRTools
40%                       dataset. If PARSEFUN is not given, default parsing
41%                       using PR_READDATASET will be used.
42%   PARSEPARS  = {};    Cell array with additional parameters for PARSEFUN.
43%   FORMAT     = [];    Needed for default parsing, see PR_READDATASET.
44%   NHEADLINES = 0;     Needed for default parsing, see PR_READDATASET.
45%   MISVALCHAR = '?';   Data characters to be replaced by NaN
46%   MISVALUE   = [];    Data values to be replaced by NaN
47%   DELIMETER  = ' ';   Needed for default parsing, see PR_READDATASET.
48%   EXTENSION  = 'dat'; Extension to be used for downloaded DATFILE.
49%   MATFILE    = TRUE;  If FALSE, the dataset A will not be saved.
50%   LABFEAT    = [];    Feature found in DATFILE and to be used as class
51%                       label, see FEAT2LAB.
52%   FEATS      = [];    Columns of dataset to be used as features.
53%   FEATNAMES  = [];    Desired feature names of dataset A, see SETFEATLAB.
54%   CLASSNAMES = [];    Class names to be stored in A, see SETLABLIST.
55%   USER       = [];    Additional information to be stored in the
56%                       user-field of A, see SETUSER.
57%   LINK       = [];    Link for more information in the dataset.
58%   DESC       = [];    Short description of the dataset.
59%   DSETNAME   = [];    Desired name of the dataset A.
60%
61%
62% EXAMPLE
63%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
64%  opt.extension = 'dat'; % create iris.dat
65%  opt.labfeat   = 5;     % use feature 5 for labeling
66%  opt.matfile   = false; % don't create a mat-file
67%  c = pr_download(url,[],opt) % load Iris dataset from UCI and parse
68%
69% SEE ALSO
70% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB
71
72% Copyright: R.P.W. Duin
73
74%%
function [a,new] = pr_download(url,datname,varargin)

% Main entry point. Returns the dataset stored in an existing mat-file when
% there is one; otherwise downloads URL (unless the data file is already
% present), unpacks it, parses it into a dataset, applies the OPTIONS fields
% and optionally saves the result as a mat-file.
% A is [] when parsing is switched off. NEW is set to true only when a
% mat-file has been written by this call.

%% make sure there is a data subdir next to this m-file
persistent DATADIREXISTS
if isempty(DATADIREXISTS)
  datasubdir = fullfile(fileparts(which(mfilename)),'data');
  if exist(datasubdir,'dir') ~= 7
    mkdir(datasubdir);
  end
  DATADIREXISTS = true;
end

%% dispatch old-style calls (third argument is a non-empty numeric SIZE)
if nargin >= 3
  % this can be removed when all mfiles in prdatasets call the new version
  % of pr_download_uci
  if ~isstruct(varargin{1}) && ~isempty(varargin{1}) && isnumeric(varargin{1})
    [a,new] = pr_download_old(url,datname,varargin{:});
    return
  else
    opt = varargin{1};
  end
end

if nargin < 3, opt = []; end
if nargin < 2, datname = []; end

opt = download_opt(opt);  % set defaults where necessary

% define the outputs up front, so every return path (e.g. opt.parse false)
% hands back assigned values
a   = [];
new = false;

%% find directory to be used
if isempty(datname)
  datname = pr_callername;
  dirname = fullfile(fileparts(which(datname)),'data');
else
  [dirname,datname] = fileparts(datname);
end

%% set all necessary filenames
[~,urlname,urlext] = fileparts(url);
if isempty(datname)
  % will only be empty if called from command line
  datname = urlname;
  dirname = pwd;
end
% fileparts splits 'x.tar.gz' into name 'x.tar' and extension '.gz', so a
% gzipped tarball has to be recognised from the tail of the name
istargz = strcmp(urlext,'.gz') && numel(urlname) >= 4 && ...
          strcmp(urlname(end-3:end),'.tar');
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.' opt.extension]; % name of datfile to be created
urlfile = fullfile(dirname,urlname);   % temp file for download
datfile = fullfile(dirname,datname);   % unpacked urlfile
matfile = fullfile(dirname,matname);   % final matfile

%% load mat-file if it exists
if exist(matfile,'file') == 2
  s = prload(matfile);
  f = fieldnames(s);
  a = s.(f{1});   % the first stored variable is taken as the dataset
  return  % we are done!!
end

%% download the data file if it doesn't exist
if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download(urlname,opt.size);

  % the target folder may not exist yet (e.g. 'data' next to the caller)
  if ~isempty(dirname) && exist(dirname,'dir') ~= 7
    mkdir(dirname);
  end

  % single-quote shell arguments so that characters like '&', '?' or
  % spaces in the url or in paths are not interpreted by the shell
  shq = @(str) ['''' strrep(str,'''','''\''''') ''''];

  if ~usejava('jvm') && isunix
    stat = unix(['wget -q -O ' shq(urlfile) ' ' shq(url)]);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % assume file is created, uncompress if needed
  % delete compressed file
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip ' shq(urlfile) ' -d ' shq(datfile)]);
    else
      unzip(urlfile,datfile);
    end
  elseif istargz || strcmp(urlext,'.tar') || strcmp(urlext,'.tgz')
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end
  if exist(datfile,'dir') == 7
    % the archive was unpacked into a folder named datfile: replace that
    % folder by the first regular file found in it
    dirn = dir(datfile);
    dirn = dirn(~[dirn.isdir]);   % skips '.', '..' and subfolders
    if isempty(dirn)
      error(['No data file found after unpacking ' urlname])
    end
    movefile(fullfile(datfile,dirn(1).name),[datfile 'tmp']);
    rmdir(datfile,'s');
    movefile([datfile 'tmp'],datfile);
  end
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

if ~opt.parse
  % no parsing desired, we are done (a is empty)
  return
end

%% datfile should now be there, read and convert to dataset
disp('Parsing ...')
if isempty(opt.parsefun)
  a = pr_readdataset(datfile,opt.nheadlines,opt.delimeter, ...
                   opt.misvalchar,opt.format);
else
  % user defined parsing
  a = opt.parsefun(datfile,opt.parsepars{:});
end

%% set dataset fields
if ~isempty(opt.labfeat) && opt.labfeat > 0
  a = feat2lab(a,opt.labfeat);
end
if ~isempty(opt.classnames)
  a = setlablist(a,opt.classnames);
end
if ~isempty(opt.feats)
  a = a(:,opt.feats);
end
if ~isempty(opt.featnames)
  a = setfeatlab(a,opt.featnames);
end
if ~isempty(opt.misvalue)
  J = find(a==opt.misvalue);
  a(J) = NaN;
end
if ~isempty(opt.user)
  a = setuser(a,opt.user);
end
if ~isempty(opt.link)
  a = setuser(a,opt.link,'link');
end
if ~isempty(opt.desc)
  a = setuser(a,opt.desc,'desc');
end
if ~isempty(opt.dsetname)
  a = setname(a,opt.dsetname);
else
  a = setname(a,pr_callername);
end

%% save if desired
if opt.matfile
  save(matfile,'a');
  new = true;
end

return
235
236
function ask_download(urlname,datsize)
%% user controlled downloading
% Announces the download of URLNAME. If the global ASK is true (or has not
% been set) and DATSIZE (in MB) is given and larger than 1, the user is
% asked for confirmation first. Any answer other than an empty one or 'y'
% raises the error 'No dataset'.
  global ASK

  % an unset global is empty and cannot be used as an operand of &&;
  % treat it as true, the default used by ask_download_old
  doask = ASK;
  if isempty(doask)
    doask = true;
  end

  if doask && ~isempty(datsize) && datsize > 1 % ask only if datsize has been set
    siz = ['(' num2str(datsize) ' MB)'];
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    if ~isempty(q) && ~strcmp(q,'y')
      error('No dataset')
    end
    % siz already holds the formatted '(N MB)' text
    disp(['Downloading ' urlname ' ' siz ' ....'])
  else
    disp(['Downloading ' urlname ' ....'])
  end

return
253
function opt = download_opt(opt_given)
%% Merge user supplied options with the defaults
% OPT_GIVEN is either empty or a structure whose field names are a subset of
% the default field names set below. The returned structure holds all
% default fields, overwritten by the values found in OPT_GIVEN. An error is
% raised if OPT_GIVEN is not a structure or holds unknown field names.
  opt.size       = [];
  opt.parse      = true;
  opt.parsefun   = [];
  opt.parsepars  = {};
  opt.format     = [];
  opt.nheadlines = 0;
  opt.misvalchar = '?';
  opt.misvalue   = [];
  opt.delimeter  = ' ';
  opt.extension  = 'dat';
  opt.matfile    = true;
  opt.labfeat    = [];
  opt.feats      = [];
  opt.featnames  = '';
  opt.classnames = '';
  opt.user       = [];
  opt.dsetname   = '';
  opt.link       = '';
  opt.desc       = '';

  if (~isempty(opt_given))
    fall = fieldnames(opt);
    % comma separated list of the valid names, used in both error messages
    ff = sprintf('%s, ',fall{:});
    ff = ff(1:end-2);
    if (~isstruct(opt_given))
      error(['OPTIONS should be a structure with at least one of the following fields: ' ff '.']);
    end
    fn = fieldnames(opt_given);
    known = ismember(fn,fall);
    if (~all(known))
      bad = fn(~known);
      fbad = sprintf('%s, ',bad{:});
      error(['Wrong field names (' fbad(1:end-2) '); valid field names are: ' ff])
    end
    for i = 1:length(fn)
      opt.(fn{i}) = opt_given.(fn{i});
    end
  end

return
297
function [a,new] = pr_download_old(url,varargin)
%% This is the old version of pr_download, to be called from the old
%  version of pr_download_uci only (inside it). It can be removed when all
%  mfiles in prdataset make the new call to  pr_download_uci
%
%PR_DOWNLOAD Load or download data and create dataset
%
%   A = PR_DOWNLOAD(URL,FILE,SIZE,NHEAD,FORMAT,MISVALCHAR,DELCHAR,NOSAVE)
%
% INPUT
%   URL          URL of character file to be downloaded
%   FILE         Filename to download
%   SIZE         Size of data to be downloaded in Mbytes
%   NHEAD        # of headerlines to skip
%   FORMAT       String or cell array defining the format
%                (default, automatic)
%   MISVALCHAR   Character used for missing values
%   DEL          Character delimiter used in the file (default ',')
%   NOSAVE       Logical, if TRUE A will not be saved, default FALSE
%
% OUTPUT
%   A            Unlabeled dataset
%   NEW          Logical, FALSE if an existing mat-file has been loaded,
%                TRUE otherwise
%
% DESCRIPTION
% This routine facilitates downloading of character based datasets. FILE
% should be the name (or path with name) in which the URL is downloaded. If
% needed the URL file is unzipped and/or untarred first. If FILE already
% exists it is used (no downloading). The file is parsed by TEXTSCAN using
% the format given in FORMAT (see TEXTSCAN) and the delimiter specified in
% DEL. If FORMAT is not given an attempt is made to derive it
% automatically.
%
% In case a mat-file name [FILE '.mat'] is found it will be used instead of
% downloading.
%
% Columns (features) given as characters (the '%s' fields in FORMAT) will
% be stored as text based features. They will be replaced by indices to a
% set of strings stored in the corresponding feature domain (see
% SETFEATDOM). Use FEAT2LAB to use such a feature for labeling the dataset,
% see the below example.
%
% EXAMPLE
%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
%  c = pr_download(url,'iris.dat',[]); % load Iris dataset from UCI
%  % the labels are set as string (char) features in c(:,5)
%  a = feat2lab(c,5);  % use feature 5 for labeling
%
% SEE ALSO
% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB

% Copyright: R.P.W. Duin
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands


[~,urlname,urlext] = fileparts(url);
[datname,siz,nhead,form,misval,del,nosave] = setdefaults(varargin,urlname,1,0,[],'?',',',false);

% fileparts splits 'x.tar.gz' into name 'x.tar' and extension '.gz', so a
% gzipped tarball has to be recognised from the tail of the name
istargz = strcmp(urlext,'.gz') && numel(urlname) >= 4 && ...
          strcmp(urlname(end-3:end),'.tar');

[dirname,datname] = fileparts(datname);
if isempty(dirname)
  dirname = fileparts(which(mfilename));
  % dirname = pwd;
end
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.dat']; % name of datfile to be created
urlfile = fullfile(dirname,urlname); % temp file for download
datfile = fullfile(dirname,datname); % unpacked urlfile
matfile = fullfile(dirname,matname); % final matfile

new = true;                          % if matfile exists, use it
if exist(matfile,'file') == 2
  s = load(matfile);
  f = fieldnames(s);
  a = s.(f{1});   % the first stored variable is taken as the dataset
  new = false;
  return
end

if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download_old(siz);
  if isempty(siz) || siz == 0
    disp(['Downloading ' urlname ' ....'])
  else
    disp(['Downloading ' urlname ' (' num2str(siz) ' MB) ....'])
  end

  % single-quote shell arguments so that characters like '&', '?' or
  % spaces in the url or in paths are not interpreted by the shell
  shq = @(str) ['''' strrep(str,'''','''\''''') ''''];

  if ~usejava('jvm') && isunix
    stat = unix(['wget -q -O ' shq(urlfile) ' ' shq(url)]);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % assume file is created, uncompress if needed
  % delete compressed file
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if ~usejava('jvm') && isunix
      unix(['unzip ' shq(urlfile) ' -d ' shq(datfile)]);
    else
      unzip(urlfile,datfile);
    end
  elseif istargz || strcmp(urlext,'.tar') || strcmp(urlext,'.tgz')
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end
  if exist(datfile,'dir') == 7
    % the archive was unpacked into a folder named datfile: replace that
    % folder by the first regular file found in it
    dirn = dir(datfile);
    dirn = dirn(~[dirn.isdir]);   % skips '.', '..' and subfolders
    if isempty(dirn)
      error(['No data file found after unpacking ' urlname])
    end
    movefile(fullfile(datfile,dirn(1).name),[datfile 'tmp']);
    rmdir(datfile,'s');
    movefile([datfile 'tmp'],datfile);
  end
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

% datfile should now be there, read and parse it
fid = fopen(datfile);
if fid < 0
  error(['Unable to open data file: ' datfile])
end
if isempty(form)        % if no format given ...
  for j=1:nhead+1
    s = fgetl(fid);     % derive it from the first nonheader line
  end
  if ~ischar(s)
    % fgetl returns -1 at end of file: no data line after the header
    fclose(fid);
    error(['No data lines found in: ' datfile])
  end
  s = mytextscan(s,'c',del,0); % use all %s for time being
  form = getform(s);    % convert fields to %n where appropriate
  fseek(fid,0,-1);      % restart
end

disp('Parsing ...')
% all columns are read as strings here; mytextscan closes fid
c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
a = pr_cell2dset(c,form,misval);

if ~nosave % don't save if not needed (e.g. called by pr_download_uci)
  save(matfile,'a');
end

return
447
function ask_download_old(size)
% Confirmation step of the old download routine.
% The global ASK is set to true when it is still empty. If ASK is true and
% SIZE (in MB) is given and larger than 1, the user is asked whether the
% download may proceed; any reply other than an empty one or 'y' raises
% the error 'Dataset not found'.

  global ASK
  if isempty(ASK)
    ASK = true;
  end

  confirm = false;
  if ASK
    confirm = ~isempty(size) && size > 1;
  end

  if confirm
    sizetxt = ['(' num2str(size) ' MB)'];
    prompt = ['Dataset is not available, OK to download ' sizetxt ' [y]/n ?'];
    reply = input(prompt,'s');
    if ~(isempty(reply) || strcmp(reply,'y'))
      error('Dataset not found')
    end
  end

return
466
function form = getform(s)
% Derive a format string from the fields of a single data line.
% S{1} holds the fields as strings. The result has one character per field:
% 'n' if the field consists only of digits, '+', '-', '.', 'e', 'E' and
% blanks, 'c' otherwise.
s = char(s{1});   % char matrix, one (blank padded) field per row
form = repmat('n',1,size(s,1));
for j=1:size(s,1)
  % the hyphen is placed last in the class so that it is a literal; written
  % as '+-.' it forms a range that also accepts ','
  if ~isempty(regexp(s(j,:),'[^0-9+.eE -]','once'))
    form(j) = 'c';
  end
end
476
function s = mytextscan(fid,forms,del,nhead)
% Wrapper around TEXTSCAN.
% FORMS holds one format character per column ('c' is read as 's'); each
% of them is prefixed by '%' to build the TEXTSCAN format. DEL is passed as
% delimiter unless it is a blank, NHEAD as the number of header lines.
% FID may be a file identifier or a character string to be scanned; a file
% identifier is closed before returning.

ncol = numel(forms);
spec = char(zeros(1,2*ncol));
spec(1:2:end) = '%';      % odd positions: conversion markers
spec(2:2:end) = forms;    % even positions: the format characters
spec = strrep(spec,'c','s');

scanargs = {'Delimiter',del,'Headerlines',nhead};
if del == ' '
  % blank separated data: no explicit delimiter is passed
  scanargs = scanargs(3:4);
end
s = textscan(fid,spec,scanargs{:});

if ischar(fid)
  return
end
fclose(fid);
Note: See TracBrowser for help on using the repository browser.