%ADULT Income exceeding $50K/yr based on census data
%PRTools UCI dataset import, 32561+16281 objects, 14 mixed features, 2 classes
%
%  [TRAIN,TEST] = ADULT(VAL)
%  TRAIN_TEST   = ADULT(VAL)
%
%DESCRIPTION
%This command downloads one of the UCI data sets, converts it into PRTools
%format and stores it locally for future use. Consult the <a href="http://archive.ics.uci.edu/ml/datasets/Adult">related website</a>
%for further information. Please make the appropriate references in
%publications that make use of this dataset.
%
%This dataset contains a number of categorical features with N > 2
%categories. They may be converted to N new real features by CAT2REAL.
%
%Dataset has missing values. By default all objects with missing values are
%removed. Use VAL=NaN to avoid this. For other options see MISVAL.
%
%SEE ALSO <a href="http://prtools.tudelft.nl/prtools/">PRTools Guide</a>, <a href="http://archive.ics.uci.edu/ml/">UCI Website</a>
%PRTOOLS, DATASETS, SETFEATDOM, FEATTYPES, CAT2REAL, MISVAL

% Copyright: R.P.W. Duin

function varargout = adult(val)

% Load the UCI Adult (census income) data as PRTools dataset(s).
%
%   val        - missing-value option handed to MISVAL; defaults to
%                'remove' when the function is called without arguments.
%   varargout  - whatever PR_LOADMATFILE / PR_SAVEMATFILE return for the
%                requested number of outputs, after MISVAL(val) is applied.

if nargin<1
  val = 'remove';
end

% First try the locally stored mat-files; an empty first output means
% nothing was found and the data has to be downloaded and converted.
varargout = cell(1,nargout);
[varargout{:}] = pr_loadmatfile;
if isempty(varargout{1})
  % no matfiles found, create them
  % define settings
  % Feature names in column order. The sixth name was spelled
  % 'martial-status'; the attribute is 'marital-status'.
  f = textscan('age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country','%s');
  opt.labfeat   = 15;      % column 15 holds the class label
  opt.featnames = f(1);
  % Per-file settings: opt1 goes with adult.data, opt2 with adult.test.
  % NOTE(review): nheadlines is presumably the number of leading lines to
  % skip in each file - confirm against pr_download_uci.
  opt1.nheadlines = 0;
  opt2.nheadlines = 1;
  opt.matfile  = false;    % we save the datasets below
  opt.dsetname = 'Census Income Original';

  % download, parse
  [a,b] = pr_download_uci('adult',{'adult.data','adult.test'},{opt1,opt2,opt});
  % classnames of b are wrong: clean up both label lists and give b the
  % label list of a so that the two sets use the same class names
  a = remclass(a);
  b = remclass(b);
  b = setlablist(b,getlablist(a));
  a = setname(a,'Census Income Original');
  b = setname(b,'Census Income Original');
  % save separate and combined files
  [varargout{:}] = pr_savematfile(a,b);

end

% Apply the requested missing-value handling to every returned dataset.
varargout = varargout*misval(val);