1 | %PR_CELL2DSET Construct dataset from a cell array having one feature per cell.
|
---|
2 | %
|
---|
3 | % A = PR_CELL2DSET(C,F,M,L)
|
---|
4 | %
|
---|
5 | % INPUT
|
---|
6 | % C Cell array with one cell per feature. Categorical features should
|
---|
7 | % be given by strings in a character array or as cellstrings (one
|
---|
8 | % object per cell) stored in the feature cell.
|
---|
9 | % Numerical features may either be stored as character arrays, cell
|
---|
10 | % strings or as a numeric vector.
|
---|
11 | % In case C is a 2D cell array columns are first combined into a
|
---|
12 | % cellstring or a vector.
|
---|
13 | % F Format field (e.g. 'ccncnn') for distinguishing cell character
|
---|
14 | % arrays with categorical data ('c') from numeric data ('n').
|
---|
15 | % M Optional character array with symbols used for missing values. The
|
---|
16 | % empty string ('') will always be interpreted as a missing value.
|
---|
17 | % L Labels that may be used for labelling A (numbers or strings, see
|
---|
18 | % PRDATASET). (optional)
|
---|
19 | %
|
---|
20 | % OUTPUT
|
---|
21 | % A Dataset with categorical features numerically coded as indices in
|
---|
22 | % the DATA field, pointing to the list of category names stored in the
|
---|
23 | % FEATDOM field, see SETFEATDOM.
|
---|
24 | %
|
---|
25 | % DESCRIPTION
|
---|
26 | % This routine is an alternative for PRDATASET in case data is given by
|
---|
27 | % strings or numeric characters. The parameter F is optional. If not given
|
---|
28 | % all character arrays and cell strings are interpreted as categorical.
|
---|
29 | %
|
---|
30 | % Missing data (empty strings) will be coded as NaNs.
|
---|
31 | %
|
---|
32 | % SEE ALSO <a href="http://prtools.tudelft.nl/prtools">PRTools Guide</a>
|
---|
33 | % DATASETS, SETFEATDOM, FEATTYPES, DSET2CELL, CAT2DSET
|
---|
34 |
|
---|
35 | % Copyright: R.P.W. Duin
|
---|
36 |
|
---|
function a = pr_cell2dset(c,varargin)
% Build a dataset from a cell array with one feature per cell, or from a
% 2D cell array with one value per cell (objects in rows, features in
% columns). String features are either coded as categories or converted
% to numbers, depending on the format string F.

% Optional arguments, in order: format string F, missing value symbols M
% and labels L. NOTE(review): SETDEFAULTS presumably substitutes the
% listed defaults ('', [], []) for arguments that are not supplied --
% confirm against SETDEFAULTS.
[f,misval,lab] = setdefaults(varargin,'',[],[]);

if ~iscell(c)
  error('Cell array expected')
end
if isempty(c)
  % c{1} is dereferenced below; report the problem explicitly instead of
  % failing there with an indexing error.
  error('Non-empty cell array expected')
end

if min(size(c)) > 1 % 2D cell array, make 1D
  ncols = size(c,2);
  cc    = cell(1,ncols);
  % Format derived from the cell contents. It is kept separate from a
  % user supplied F: appending to F made it longer than the number of
  % features, so every call with F and a 2D cell array was rejected.
  fauto = repmat('n',1,ncols);
  for j=1:ncols
    try
      % column that can be concatenated into a single array
      cc{1,j} = cell2mat(c(:,j));
    catch
      try
        % column of strings that cannot be concatenated directly:
        % combine them into a (padded) character array
        cc{1,j}  = char(c(:,j));
        fauto(j) = 'c';
      catch
        % column that cannot be combined at all: all values missing
        cc{1,j} = NaN(size(c,1),1);
      end
    end
  end
  c = cc;
  if isempty(f)
    f = fauto;       % no format given by the user: use the derived one
  end
end

m = size(c{1},1);    % number of objects
k = numel(c);        % number of features
a = zeros(m,k);      % numeric data will be stored here
if isempty(f)
  f = repmat('x',1,k);  % indicates: no format given
elseif numel(f) ~= k
  error('format string has wrong size')
end

fdom = cell(1,k);    % space for feature domains
for j=1:k            % run over all features
  if size(c{j},1) ~= m
    error('Not the same number of objects for all features')
  end
  if ischar(c{j}) || iscell(c{j})
    cc = cellstr(c{j});
    % replace every user defined missing value symbol by the empty
    % string, which is the marker used for missing values below
    for i=1:numel(misval)
      cc(strcmp(misval(i),cc)) = {''};
    end
    missing = strcmp('',cc);   % positions of the missing values
    switch f(j)
      case {'x','c'}
        % interpret as category data, use renumlab for coding
        [a(:,j),fdom{j}] = renumlab(cc);
        a(missing,j) = NaN;
      case 'n'
        cc(missing) = {'NaN'};   % put a 'NaN' for missing
        % NOTE(review): STR2NUM evaluates its argument, so strings from
        % an untrusted source can execute code. Consider STR2DOUBLE if
        % only plain numbers have to be supported.
        vals = str2num(char(cc)); %#ok<ST2NM>
        if numel(vals) ~= m
          % STR2NUM returns [] if the conversion fails and a matrix if a
          % string holds more than one number
          error('Feature %d cannot be converted to one number per object',j)
        end
        a(:,j) = vals;
      otherwise
        error('Wrong format found')
    end
  else
    % numeric feature: copy as it is
    a(:,j) = c{j};
  end
end

a = prdataset(a,lab);
a = setfeatdom(a,fdom);