source: prdatasets/pr_cell2dset.m @ 155

Last change on this file since 155 was 137, checked in by bduin, 5 years ago
File size: 3.4 KB
Line 
%PR_CELL2DSET Construct dataset from a cell array having one feature per cell.
%
%   A = PR_CELL2DSET(C,F,M,L)
%
% INPUT
%   C  Cell array with one cell per feature. Categorical features should
%      be given by strings in a character array or as cellstrings (one
%      object per cell) stored in the feature cell.
%      Numerical features may either be stored as character arrays, cell
%      strings or as a numeric vector.
%      In case C is a 2D cell array (objects by features) every column is
%      first combined into a single vector or character array.
%   F  Format field (e.g. 'ccncnn') for distinguishing cell character
%      arrays with categorical data ('c') from numeric data ('n'). It
%      should have one character per feature. (optional)
%   M  Optional character array with symbols used for missing values. Each
%      character of M is treated as a separate symbol. The empty string
%      ('') will always be interpreted as a missing value.
%   L  Labels that may be used for labelling A (numbers or strings, see
%      PRDATASET). (optional)
%
% OUTPUT
%   A  Dataset with categorical features numerically coded as indices in
%      the DATA field, pointing to the list of category names stored in the
%      FEATDOM field, see SETFEATDOM.
%
% DESCRIPTION
% This routine is an alternative for PRDATASET in case data is given by
% strings or numeric characters. The parameter F is optional. If not given
% and C is a 1D cell array, all character arrays and cell strings are
% interpreted as categorical. If not given and C is a 2D cell array, a
% format is derived per column: columns that can be concatenated by CELL2MAT
% are handled as numeric ('n'), other columns that can be converted by CHAR
% are handled as categorical ('c'), and columns for which both fail are
% replaced by NaNs.
%
% Missing data (empty strings) will be coded as NaN.
%
% Strings of numeric features are converted by STR2DOUBLE. They are not
% evaluated as expressions. An error is raised if a non-missing string of a
% numeric feature cannot be converted into a number.
%
% SEE ALSO <a href="http://prtools.tudelft.nl/prtools">PRTools Guide</a>
% DATASETS, SETFEATDOM, FEATTYPES, DSET2CELL, CAT2DSET

% Copyright: R.P.W. Duin

function a = pr_cell2dset(c,varargin)

[f,misval,lab] = setdefaults(varargin,'',[],[]);

if ~iscell(c)
  error('Cell array expected')
end

if min(size(c)) > 1       % 2D cell array, make 1D
  nfeat = size(c,2);
  cc    = cell(1,nfeat);
  fauto = repmat('n',1,nfeat);  % format derived from the cell contents
  for j=1:nfeat
    try
      % NOTE(review): a column of equal-length strings can presumably also
      % be concatenated here and is then handled as numeric; supply F to
      % force another interpretation - confirm.
      cc{1,j} = cell2mat(c(:,j));
    catch
      try
        cc{1,j}  = char(c(:,j));
        fauto(j) = 'c';
      catch
        % neither numeric nor character data: treat all values as missing
        cc{1,j} = NaN(size(c,1),1);
      end
    end
  end
  % Use the derived format only if the caller did not supply one. Appending
  % it to a user format would double its length and make the size check
  % below fail for every 2D cell array.
  if isempty(f)
    f = fauto;
  end
  c = cc;
end

m = size(c{1},1);         % number of objects
k = numel(c);             % number of features
a = zeros(m,k);           % numeric data will be stored here
if ~isempty(f)            % decode format statement
  if numel(f) ~= k
    error('format string has wrong size')
  end
else
  f = repmat('x',1,k);    % indicates: no format given
end

fdom = cell(1,k);         % space for feature domains
for j=1:k                 % run over all features
  if size(c{j},1) ~= m
    error('Not the same number of objects for all features')
  end
  if ischar(c{j}) || iscell(c{j})
    cc = cellstr(c{j});
    if ~isempty(misval)
      for i=1:numel(misval) % replace user defined missing value symbols
        cc(strcmp(misval(i),cc)) = {''};
      end
    end
    ismis = strcmp('',cc);  % flags missing values
    if (f(j) == 'x') || (f(j) == 'c')
      % interpret as category data, use renumlab for coding
      [a(:,j),fdom{j}] = renumlab(cc);
      a(ismis,j) = NaN;
    elseif f(j) == 'n'
      % STR2DOUBLE never evaluates its input (STR2NUM does) and returns
      % one value per string, NaN for empty or unconvertible strings.
      x = str2double(cc);
      isbad = isnan(x(:)) & ~ismis(:) & ~strcmpi(strtrim(cc(:)),'nan');
      if any(isbad)
        error('Non-numeric string found in numeric feature %d',j)
      end
      a(:,j) = x(:);
    else
      error('Wrong format found')
    end
  else
    a(:,j) = c{j};
  end
end

a = prdataset(a,lab);
a = setfeatdom(a,fdom);
Note: See TracBrowser for help on using the repository browser.