source: prdatasets/adult.m @ 153

Last change on this file since 153 was 150, checked in by bduin, 5 years ago
File size: 2.0 KB
RevLine 
%ADULT Income exceeding $50K/yr based on census data
%PRTools UCI dataset import, 32561+16281 objects, 14 mixed features, 2 classes
%
%  [TRAIN,TEST] = ADULT(VAL)
%   TRAIN_TEST  = ADULT(VAL)
%
%INPUT
%  VAL         Missing-value treatment, passed on to MISVAL.
%              Default: 'remove'.
%
%OUTPUT
%  TRAIN       Dataset built from the UCI file adult.data
%  TEST        Dataset built from the UCI file adult.test
%  TRAIN_TEST  Single-output form, as returned by the PRTools mat-file
%              helpers (PR_LOADMATFILE / PR_SAVEMATFILE)
%
%DESCRIPTION
%This command downloads one of the UCI data sets, converts it into PRTools
%format and stores it locally for future use. Consult the <a href="http://archive.ics.uci.edu/ml/datasets/Adult">related website</a>
%for further information. Please make the appropriate references in
%publications that make use of this dataset.
%
%This dataset contains a number of categorical features with N > 2
%categories. They may be converted to N new real features by CAT2REAL.
%
%The dataset has missing values. By default all objects with missing values
%are removed. Use VAL=NaN to avoid this. For other options see MISVAL.
%
%SEE ALSO <a href="http://prtools.tudelft.nl/prtools/">PRTools Guide</a>, <a href="http://archive.ics.uci.edu/ml/">UCI Website</a>
%PRTOOLS, DATASETS, SETFEATDOM, FEATTYPES, CAT2REAL, MISVAL

% Copyright: R.P.W. Duin

function varargout = adult(val)

% Default treatment of missing values: drop the affected objects.
if nargin < 1
  val = 'remove';
end

% Try to fetch previously stored mat-files first.
varargout = cell(1,nargout);
[varargout{:}] = pr_loadmatfile;
if isempty(varargout{1})
  % no matfiles found, create them

  % Feature names as listed by UCI for the Adult data set
  % ('marital-status' was previously misspelled as 'martial-status').
  f = textscan(['age workclass fnlwgt education education-num ' ...
                'marital-status occupation relationship race sex ' ...
                'capital-gain capital-loss hours-per-week native-country'],'%s');

  % Settings shared by both files.
  opt.labfeat     = 15;    % column holding the class label
  opt.featnames   = f(1);
  opt.matfile     = false; % we save the datasets below
  opt.dsetname    = 'Census Income Original';
  % Settings for the individual files: adult.data is read with no head
  % lines, adult.test with one head line.
  opt1.nheadlines = 0;
  opt2.nheadlines = 1;

  % download, parse
  [a,b] = pr_download_uci('adult',{'adult.data','adult.test'},{opt1,opt2,opt});

  % classnames of b are wrong: replace its label list by the one of a
  a = remclass(a);
  b = remclass(b);
  b = setlablist(b,getlablist(a));
  a = setname(a,'Census Income Original');
  b = setname(b,'Census Income Original');

  % save separate and combined files
  [varargout{:}] = pr_savematfile(a,b);

end

% Apply the requested missing-value treatment to every returned dataset.
varargout = varargout*misval(val);
57
Note: See TracBrowser for help on using the repository browser.