source: prdatasets/pr_download.m @ 145

Last change on this file since 145 was 137, checked in by bduin, 5 years ago
File size: 15.1 KB
Line 
1%PR_DOWNLOAD Load or download data and create dataset
2%
3%   [A,NEW] = PR_DOWNLOAD(URL,DATFILE,OPTIONS)
4%
5% INPUT
6%   URL        URL of character file to be downloaded
7%   DATFILE    Desired name of downloaded and uncompressed file
8%              Default: name of the url-file, extended by .dat
9%   OPTIONS    Structure with options used for parsing and constructing
10%              a PRTools dataset
11%
12% OUTPUT
13%   A          Dataset
14%   NEW        Logical, TRUE if a new dataset has been created, FALSE if an
15%              existing mat-file has been found and used.
16%
17% DESCRIPTION
18% This routine facilitates downloading of character based datasets. DATFILE
19% will be the name (or path with name) in which the URL is downloaded. If
20% needed the URL file is unzipped and/or untarred first. After parsing a
21% PRTools dataset is constructed, stored in a mat-file (optional) and
22% returned. The name of the mat-file is DATFILE extended by .mat.
23%
24% The directory specified in DATFILE, or if not supplied, the directory and
25% the name of the calling routine, will be used for storing files in a
26% subdirectory 'data'. If the mat-file already exists it will be loaded and
27% returned in A (no new download and parsing). If DATFILE already exists it
28% will be used (no new download).
29%
30% OPTIONS should be a structure with the below fields, to be supplied in
31% lower case. Missing fields are replaced by the given defaults.
32%
33%   SIZE       = [];    Size of data to be downloaded, in MB. Not needed,
34%                       just used to warn the user.
35%   PARSE      = TRUE;  If FALSE, parsing is skipped. Just downloading and
36%                       uncompression. A will be empty.
37%   PARSEFUN   = [];    A handle of a user supplied parsing function. This
38%                       function should operate on DATFILE (first parameter,
39%                       substituted by PR_DOWNLOAD) and return a PRTools
40%                       dataset. If PARSEFUN is not given, default parsing
41%                       using PR_READDATASET will be used.
42%   PARSEPARS  = {};    Cell array with additional parameters for PARSEFUN.
43%   FORMAT     = [];    Needed for default parsing, see PR_READDATASET.
44%   NHEADLINES = 0;     Needed for default parsing, see PR_READDATASET.
45%   MISVALCHAR = '?';   Data characters to be replaced by NaN
46%   MISVALUE   = [];    Data values to be replaced by NaN
47%   DELIMETER  = ' ';   Needed for default parsing, see PR_READDATASET.
48%   EXTENSION  = 'dat'; Extension to be used for downloaded DATFILE.
49%   MATFILE    = TRUE;  If FALSE, the dataset A will not be saved.
50%   LABFEAT    = [];    Feature found in DATFILE and to be used as class
51%                       label, see FEAT2LAB.
52%   FEATS      = [];    Columns of dataset to be used as features.
53%   FEATNAMES  = [];    Desired feature names of dataset A, see SETFEATLAB.
54%   CLASSNAMES = [];    Class names to be stored in A, see SETLABLIST.
55%   USER       = [];    Additional information to be stored in the
56%                       user-field of A, see SETUSER.
57%   LINK       = [];    Link for more information in the dataset.
58%   DESC       = [];    Short description of the dataset.
59%   DSETNAME   = [];    Desired name of the dataset A.
60%
61%
62% EXAMPLE
63%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
64%  opt.extension = 'dat'; % create iris.dat
65%  opt.labfeat   = 5;     % use feature 5 for labeling
66%  opt.matfile   = false; % don't create a mat-file
67%  c = pr_download(url,[],opt) % load Iris dataset from UCI and parse
68%
69% SEE ALSO
70% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB
71
72% Copyright: R.P.W. Duin
73
74%%
function [a,new] = pr_download(url,datname,varargin)
% Main entry point: load an existing mat-file, or download, unpack, parse
% and (optionally) save a dataset. See the help text at the top of the file
% for the meaning of URL, DATFILE and the OPTIONS fields.

% Give both outputs a value up front so that every return path (e.g. the
% 'no parsing' one) leaves them defined.
a   = [];
new = false;

if nargin >= 3
  % this can be removed when all mfiles in prdatasets call the new version
  % of pr_download_uci
  if ~isempty(varargin{1}) && isnumeric(varargin{1})
    % old calling convention: third argument is the numeric SIZE
    [a,new] = pr_download_old(url,datname,varargin{:});
    return
  else
    opt = varargin{1};
  end
end

if nargin < 3, opt = []; end
if nargin < 2, datname = []; end

opt = download_opt(opt);  % set defaults where necessary

%% find directory to be used
if isempty(datname)
  datname = pr_callername;
  dirname = fullfile(fileparts(which(datname)),'data');
else
  [dirname,datname] = fileparts(datname);
end

%% set all necessary filenames
[~,urlname,urlext] = fileparts(url);
if isempty(datname)
  % will only be empty if called from command line
  datname = urlname;
  dirname = pwd;
end
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.' opt.extension]; % name of datfile to be created
urlfile = fullfile(dirname,urlname);   % temp file for download
datfile = fullfile(dirname,datname);   % unpacked urlfile
matfile = fullfile(dirname,matname);   % final matfile

% fileparts only returns the last extension ('.gz'), so a '.tar.gz'
% archive has to be recognised from the complete file name.
istargz = ~isempty(regexp(urlname,'\.tar\.gz$','once'));

%% load mat-file if it exists
if exist(matfile,'file') == 2
  s = prload(matfile);
  f = fieldnames(s);
  a = s.(f{1});           % first variable stored in the mat-file
  return  % we are done!!
end

%% download the data file if it doesn't exist
if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download(urlname,opt.size);

  % Make sure the target directory is there; otherwise the download fails
  % and is reported as an unreachable server.
  if ~isempty(dirname) && exist(dirname,'dir') ~= 7
    mkdir(dirname);
  end

  % Without a JVM on unix, fall back on command line tools. Arguments are
  % single-quoted so that blanks or shell meta characters in paths and
  % URLs cannot break (or extend) the command.
  useshell = ~usejava('jvm') && isunix;
  shq = @(str) ['''' strrep(str,'''','''\''''') ''''];

  if useshell
    stat = unix(['wget -q -O ' shq(urlfile) ' ' shq(url)]);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % assume file is created, uncompress if needed
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if useshell
      unix(['unzip ' shq(urlfile) ' -d ' shq(datfile)]);
    else
      unzip(urlfile,datfile);
    end
  elseif istargz || strcmp(urlext,'.tar') || strcmp(urlext,'.tgz')
    % must be tested before plain '.gz', as '.tar.gz' also ends in '.gz'
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % The uncompress routines create a folder named datfile. Replace that
  % folder by the first item found inside it.
  if exist(datfile,'dir') == 7
    listing = dir(datfile);
    names   = {listing.name};
    names   = names(~ismember(names,{'.','..'})); % position of . and .. is not guaranteed
    if isempty(names)
      error(['No files found after decompression of ' urlname])
    end
    tmpfile = [datfile 'tmp'];
    movefile(fullfile(datfile,names{1}),tmpfile);
    rmdir(datfile,'s');      % also removes remaining files and sub-folders
    movefile(tmpfile,datfile);
  end

  % delete the downloaded (compressed) file
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

if ~opt.parse
  % no parsing desired, we are done; a is empty and new is false
  return
end

%% datfile should now be there, read and convert to dataset
disp('Parsing ...')
if isempty(opt.parsefun)
  a = pr_readdataset(datfile,opt.nheadlines,opt.delimeter, ...
                   opt.misvalchar,opt.format);
else
  % user defined parsing
  a = opt.parsefun(datfile,opt.parsepars{:});
end

%% set dataset fields
if ~isempty(opt.labfeat) && opt.labfeat > 0
  a = feat2lab(a,opt.labfeat);
end
if ~isempty(opt.classnames)
  a = setlablist(a,opt.classnames);
end
if ~isempty(opt.feats)
  a = a(:,opt.feats);
end
if ~isempty(opt.featnames)
  a = setfeatlab(a,opt.featnames);
end
if ~isempty(opt.misvalue)
  J = find(a==opt.misvalue);
  a(J) = NaN;
end
if ~isempty(opt.user)
  a = setuser(a,opt.user);
end
if ~isempty(opt.link)
  a = setuser(a,opt.link,'link');
end
if ~isempty(opt.desc)
  a = setuser(a,opt.desc,'desc');
end
if ~isempty(opt.dsetname)
  a = setname(a,opt.dsetname);
else
  a = setname(a,pr_callername);
end

%% save if desired
% NOTE(review): NEW is only set to true when the mat-file is written, while
% the help text describes it as 'a new dataset has been created'. Left as
% is because callers may rely on it - confirm the intended meaning.
if opt.matfile
  save(matfile,'a');
  new = true;
end

return
224
225
function ask_download(urlname,datsize)
%% user controlled downloading
%
%   URLNAME   Name of the file to be downloaded, used in the messages
%   DATSIZE   Size of the download in MB, may be empty or 0 (unknown)
%
% If the global ASK is set (or has never been initialised) and DATSIZE is
% given, the user is asked for confirmation; any answer other than an
% empty one or 'y' raises the error 'No dataset'. A progress message is
% displayed in all other cases.
  global ASK

  % An uninitialised global is empty and cannot be used in a && expression;
  % treat it as 'ask', like ask_download_old does.
  doask = isempty(ASK) || ASK;

  if ~isempty(datsize) && datsize ~= 0
    siz = ['(' num2str(datsize) ' MB)'];
  else
    siz = '';
  end

  if doask && ~isempty(datsize) % ask only if datsize has been set
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    if ~isempty(q) && ~strcmp(q,'y')
      error('No dataset')
    end
  end

  % siz is already formatted, so it is inserted as it is
  if isempty(siz)
    disp(['Downloading ' urlname ' ....'])
  else
    disp(['Downloading ' urlname ' ' siz ' ....'])
  end

return
251
function opt = download_opt(opt_given)
%% Merge the user supplied options with the defaults
%
%   OPT_GIVEN  Empty, or a structure holding a subset of the fields below
%   OPT        Structure with all fields, user values overriding defaults
%
% An error is raised if OPT_GIVEN is not a structure or if it contains a
% field that is not listed below.
  opt.size       = [];
  opt.parse      = true;
  opt.parsefun   = [];
  opt.parsepars  = {};
  opt.format     = [];
  opt.nheadlines = 0;
  opt.misvalchar = '?';
  opt.misvalue   = [];
  opt.delimeter  = ' ';
  opt.extension  = 'dat';
  opt.matfile    = true;
  opt.labfeat    = [];
  opt.feats      = [];
  opt.featnames  = '';
  opt.classnames = '';
  opt.user       = [];
  opt.dsetname   = '';
  opt.link       = '';
  opt.desc       = '';

  if isempty(opt_given)
    return   % nothing supplied, defaults only
  end

  % comma separated list of the valid names, used in both error messages
  fall = fieldnames(opt);
  ff   = sprintf('%s, ',fall{:});
  ff   = ff(1:end-2);

  if ~isstruct(opt_given)
    error(['OPTIONS should be a structure with at least one of the following fields: ' ff]);
  end

  fn = fieldnames(opt_given);
  if ~all(ismember(fn,fall))
    error(['Wrong field names; valid field names are: ' ff])
  end

  % copy the user values over the defaults
  for i = 1:length(fn)
    opt.(fn{i}) = opt_given.(fn{i});
  end

return
295
function [a,new] = pr_download_old(url,varargin)
%% This is the old version of pr_download, to be called from the old
%  version of pr_download_uci only (inside it). It can be removed when all
%  mfiles in prdataset make the new call to  pr_download_uci
%
%PR_DOWNLOAD Load or download data and create dataset
%
%   A = PR_DOWNLOAD(URL,FILE,SIZE,NHEAD,FORMAT,MISVALCHAR,DELCHAR,NOSAVE)
%
% INPUT
%   URL          URL of character file to be downloaded
%   FILE         Filename to download
%   SIZE         Size of data to be downloaded in Mbytes
%   NHEAD        # of headerlines to skip
%   FORMAT       String or cell array defining the format
%                (default, automatic)
%   MISVALCHAR   Character used for missing values
%   DEL          Character delimiter used in the file (default ',')
%   NOSAVE       Logical, if TRUE A will not be saved, default FALSE
%
% OUTPUT
%   A            Unlabeled dataset
%   NEW          Logical, FALSE if A was loaded from an existing mat-file,
%                TRUE otherwise
%
% DESCRIPTION
% This routine facilitates downloading of character based datasets. FILE
% should be the name (or path with name) in which the URL is downloaded. If
% needed the URL file is unzipped and/or untarred first. If FILE already
% exists it is used (no downloading). The file is parsed by TEXTSCAN using
% the format given in FORMAT (see TEXTSCAN) and the delimiter specified in
% DEL. If FORMAT is not given an attempt is made to derive it
% automatically.
%
% In case a mat-file name [FILE '.mat'] is found it will be used instead of
% downloading.
%
% Columns (features) given as characters (the '%s' fields in FORMAT) will
% be stored as text based features. They will be replaced by indices to a
% set of strings stored in the corresponding feature domain (see
% SETFEATDOM). Use FEAT2LAB to use such a feature for labeling the dataset,
% see the below example.
%
% EXAMPLE
%  url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data';
%  c = pr_download(url,'iris.dat',[]); % load Iris dataset from UCI
%  % the labels are set as string (char) features in c(:,5)
%  a = feat2lab(c,5);  % use feature 5 for labeling
%
% SEE ALSO
% DATASETS, SETFEATDOM, GETFEATDOM, FEAT2LAB

% Copyright: R.P.W. Duin
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands


[~,urlname,urlext] = fileparts(url);
[datname,siz,nhead,form,misval,del,nosave] = setdefaults(varargin,urlname,1,0,[],'?',',',false);

[dirname,datname] = fileparts(datname);
if isempty(dirname)
  dirname = fileparts(which(mfilename));
  % dirname = pwd;
end
urlname = [urlname urlext]; % name of file to be downloaded
matname = [datname '.mat']; % name of mat-file to be created
datname = [datname '.dat']; % name of datfile to be created
urlfile = fullfile(dirname,urlname); % temp file for download
datfile = fullfile(dirname,datname); % unpacked urlfile
matfile = fullfile(dirname,matname); % final matfile

% fileparts only returns the last extension ('.gz'), so a '.tar.gz'
% archive has to be recognised from the complete file name.
istargz = ~isempty(regexp(urlname,'\.tar\.gz$','once'));

new = true;                          % if matfile exists, use it
if exist(matfile,'file') == 2
  s = load(matfile);
  f = fieldnames(s);
  a = s.(f{1});
  new = false;
  return
end

if exist(datfile,'file') ~= 2        % if datfile does not exist ...
  ask_download_old(siz);
  if isempty(siz) || siz == 0
    disp(['Downloading ' urlname ' ....'])
  else
    disp(['Downloading ' urlname ' (' num2str(siz) ' MB) ....'])
  end

  % Without a JVM on unix, fall back on command line tools. Arguments are
  % single-quoted so that blanks or shell meta characters in paths and
  % URLs cannot break (or extend) the command.
  useshell = ~usejava('jvm') && isunix;
  shq = @(str) ['''' strrep(str,'''','''\''''') ''''];

  if useshell
    stat = unix(['wget -q -O ' shq(urlfile) ' ' shq(url)]);
    status = (stat == 0);
  else
    [~,status] = urlwrite(url,urlfile);
  end
  if status == 0
    error(['Server unreachable or file not found: ' url])
  end

  % assume file is created, uncompress if needed
  if strcmp(urlext,'.zip')
    disp('Decompression ....')
    if useshell
      unix(['unzip ' shq(urlfile) ' -d ' shq(datfile)]);
    else
      unzip(urlfile,datfile);
    end
  elseif istargz || strcmp(urlext,'.tar') || strcmp(urlext,'.tgz')
    % must be tested before plain '.gz', as '.tar.gz' also ends in '.gz'
    disp('Decompression ....')
    untar(urlfile,datfile);
  elseif strcmp(urlext,'.gz')
    disp('Decompression ....')
    gunzip(urlfile,datfile);
  elseif ~strcmp(urlfile,datfile)
    copyfile(urlfile,datfile)
  end

  % The uncompress routines create a folder named datfile. Replace that
  % folder by the first item found inside it.
  if exist(datfile,'dir') == 7
    listing = dir(datfile);
    names   = {listing.name};
    names   = names(~ismember(names,{'.','..'})); % position of . and .. is not guaranteed
    if isempty(names)
      error(['No files found after decompression of ' urlname])
    end
    tmpfile = [datfile 'tmp'];
    movefile(fullfile(datfile,names{1}),tmpfile);
    rmdir(datfile,'s');      % also removes remaining files and sub-folders
    movefile(tmpfile,datfile);
  end

  % delete the downloaded (compressed) file
  if ~strcmp(urlfile,datfile)
    delete(urlfile);
  end
end

% datfile should now be there, read and parse it
fid = fopen(datfile);
if fid < 0
  error(['Unable to open file: ' datfile])
end
if isempty(form)        % if no format given ...
  for j=1:nhead+1
    s = fgetl(fid);     % derive it from the first nonheader line
  end
  if ~ischar(s)
    % fgetl returns -1 at end of file: there is no data line to inspect
    fclose(fid);
    error(['No data found in file: ' datfile])
  end
  s = mytextscan(s,'c',del,0); % use all %s for time being
  form = getform(s);    % convert fields to %n where appropriate
  fseek(fid,0,-1);      % restart
end

disp('Parsing ...')
% read everything as strings; mytextscan closes fid
c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
a = pr_cell2dset(c,form,misval);

if ~nosave % don't save if not needed (e.g. called by pr_download_uci)
  save(matfile,'a');
end

return
445
function ask_download_old(size)
% Ask the user for permission to download (old calling convention).
%
%   SIZE   Size of the download in MB, shown in the question if not empty
%
% The global ASK is set to true if it has not been initialised. If ASK is
% set, the user is prompted; any answer other than an empty one or 'y'
% raises the error 'Dataset not found'.

  global ASK
  if isempty(ASK)
    ASK = true;
  end

  % size information for the prompt, empty if the size is unknown
  if isempty(size)
    sizetext = '';
  else
    sizetext = ['(' num2str(size) ' MB)'];
  end

  if ASK
    prompt = ['Dataset is not available, OK to download ' sizetext ' [y]/n ?'];
    answer = input(prompt,'s');
    refused = ~isempty(answer) && ~strcmp(answer,'y');
    if refused
      error('Dataset not found')
    end
  end

return
466
function form = getform(s)
% Derive a format string from the fields of one parsed text line.
%
%   S      Cell holding, as first element, the cell array of field strings
%   FORM   Character row with one entry per field: 'n' for fields made up
%          of number characters only, 'c' for all other fields
fields  = char(s{1});           % one field per row, padded with blanks
nfields = size(fields,1);
istext  = false(1,nfields);
for k = 1:nfields
  % NOTE(review): '+-.' inside the brackets is a character range, so it
  % also accepts ',' next to '+', '-' and '.' - confirm this is intended.
  istext(k) = ~isempty(regexp(fields(k,:),'[^0-9+-.eE ]','once'));
end
form = repmat('n',1,nfields);
form(istext) = 'c';
476
function s = mytextscan(fid,forms,del,nhead)
% Call TEXTSCAN with a format given as a row of single type characters.
%
%   FID     File identifier or character string to be scanned
%   FORMS   Type characters, one per field, e.g. 'nsc'; 'c' is read as 's'
%   DEL     Delimiter; for a blank the TEXTSCAN default is used
%   NHEAD   Number of header lines to skip
%   S       Cell array returned by TEXTSCAN
%
% If FID is not a character string it is closed after scanning.

% put a '%' in front of every type character
spec = repmat('%%',1,numel(forms));
spec(2:2:end) = forms;
spec = strrep(spec,'c','s');

% only pass a delimiter if it differs from a blank
if del == ' '
  delpars = {};
else
  delpars = {'Delimiter',del};
end
s = textscan(fid,spec,delpars{:},'Headerlines',nhead);

if ~ischar(fid)
  fclose(fid);
end
Note: See TracBrowser for help on using the repository browser.