[142] | 1 | %ADULT Income exceeding $50K/yr based on census data
|
---|
| 2 | %PRTools UCI dataset import, 32561+16281 objects, 14 mixed features, 2 classes
|
---|
| 3 | %
|
---|
| 4 | % [TRAIN,TEST] = ADULT(VAL)
|
---|
| 5 | % TRAIN_TEST = ADULT(VAL)
|
---|
| 6 | %
|
---|
| 7 | %DESCRIPTION
|
---|
| 8 | %This command downloads one of the UCI data sets, converts it into PRTools
|
---|
| 9 | %format and stores it locally for future use. Consult the <a href="http://archive.ics.uci.edu/ml/datasets/Adult">related website</a>
|
---|
| 10 | %for further information. Please make the appropriate references in
|
---|
| 11 | %publications that make use of this dataset.
|
---|
| 12 | %
|
---|
| 13 | %This dataset contains a number of categorical features with N > 2
|
---|
| 14 | %categories. They may be converted to N new real features by CAT2REAL.
|
---|
| 15 | %
|
---|
| 16 | %Dataset has missing values. By default all objects with missing values are
|
---|
| 17 | %removed. Use VAL=NaN to avoid this. For other options see MISVAL.
|
---|
| 18 | %
|
---|
[150] | 19 | %SEE ALSO <a href="http://prtools.tudelft.nl/prtools/">PRTools Guide</a>, <a href="http://archive.ics.uci.edu/ml/">UCI Website</a>
|
---|
[142] | 20 | %PRTOOLS, DATASETS, SETFEATDOM, FEATTYPES, CAT2REAL, MISVAL
|
---|
| 21 |
|
---|
[150] | 22 | % Copyright: R.P.W. Duin
|
---|
[142] | 23 |
|
---|
function varargout = adult(val)
% Load the UCI Adult (census income) data as PRTools datasets.
%
% VAL is handed to MISVAL, and the resulting mapping is applied to every
% returned dataset. When VAL is omitted it defaults to 'remove'.
%
% The number of outputs requested by the caller is forwarded unchanged to
% PR_LOADMATFILE and, when the data still has to be built, PR_SAVEMATFILE.

if nargin < 1
  val = 'remove';
end

% One output slot per requested output; first try the locally stored files.
% NOTE(review): with nargout == 0 this cell starts empty and varargout{1} is
% read below -- confirm the [varargout{:}] assignment populates it.
varargout = cell(1,nargout);
[varargout{:}] = pr_loadmatfile;
if isempty(varargout{1})
  % no matfiles found, create them
  % define settings
  % Feature names in column order; the sixth attribute is 'marital-status'
  % (it was previously misspelled 'martial-status').
  f = textscan('age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country','%s');
  opt.labfeat     = 15;       % presumably the column that holds the class label -- TODO confirm
  opt.featnames   = f(1);
  opt1.nheadlines = 0;        % first option set passed to the download (listed with adult.data)
  opt2.nheadlines = 1;        % second option set passed to the download (listed with adult.test)
  opt.matfile     = false;    % we save the datasets below
  opt.dsetname    = 'Census Income Original';

  % download, parse
  [a,b] = pr_download_uci('adult',{'adult.data','adult.test'},{opt1,opt2,opt});
  % classnames of b are wrong: run REMCLASS on both sets and then give b
  % the label list of a
  a = remclass(a);
  b = remclass(b);
  b = setlablist(b,getlablist(a));
  a = setname(a,'Census Income Original');
  b = setname(b,'Census Income Original');
  % save separate and combined files
  [varargout{:}] = pr_savematfile(a,b);

end

% Apply the requested missing-value handling to all outputs.
varargout = varargout*misval(val);
|
---|
| 57 |
|
---|