Changeset 97


Ignore:
Timestamp:
09/24/14 16:04:03 (10 years ago)
Author:
bduin
Message:
 
Location:
prdatasets
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • prdatasets/car.m

    r92 r97  
    1717fl = {'buying price' 'maintenance price' 'nr of doors' 'nr of persons' 'luggage boot' 'safety'};
    1818
    19 [a,strvals] = pr_readdataset('car.data',ones(7,1));
    20 labs = strvals{end}(a(:,end));
    21 for i=1:6
    22         featdom{i} = strvcat(strvals{i}');
    23 end
    24 x = pr_dataset(a(:,1:(end-1)), labs);
     19x = pr_readdataset('car.data',0,',',[],'ccccccc',7)
    2520x = setfeatlab(x,fl);
    26 x = setfeatdom(x,featdom);
    2721x = setuser(x,user);
    2822x = setname(x,'Car');
  • prdatasets/pr_readdataset.m

    r92 r97  
    1 function [x,strvals] = pr_readdataset(fname,strtype)
    2 %     [X,STRVALS] = PR_READDATASET(FNAME)
    3 %
    4 % Read the dataset from the text file FNAME. It can process categorical
    5 % features, or features for which categories are given in text. A matrix
    6 % X is returned containing the numerical values, or integers. The
    7 % integers point to the entry in STRVALS containing for each
    8 % (categorical) feature its string members.
    9 %
    10 %     X = PR_READDATASET(FNAME,STRTYPE)
    11 %
    12 % The user can supply a vector STRTYPE that indicates for each feature
    13 % if it is numerical (0) or string/categorical (1).
    14 %
    15 %     X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
    16 %
    17 % For datasets that have a strange delimiter (not comma or space), you
    18 % have to supply it.
    19 if nargin<3
    20         delimiter = ',';
    21 end
    22 if nargin<2
    23         strtype = [];
    24 end
    25 
    26 % try to open the file
    27 [fid,message] = fopen(fname,'r');
    28 if fid==-1
    29         disp(message)
    30         error('I cannot open file %s.',fname);
    31 end
    32 % get the first line:
    33 dline = fgetl(fid);
    34 % check if the delimiter is present:
    35 I = find(dline==delimiter);
    36 if isempty(I)
    37         delimiter = ' ';
    38         I = find(dline==delimiter);
    39         if isempty(I)
    40                 error('Cannot determine the delimiter');
    41         end
    42 end
    43 
    44 % now run over all elements in the line:
    45 I = [0 I length(dline)+1];
    46 w = {};
    47 for i=1:length(I)-1
    48         w{i} = dline((I(i)+1):(I(i+1)-1));
    49 end
    50 
    51 % remove the empty entries:
    52 I = zeros(length(w),1);
    53 for i=1:length(w)
    54         if isempty(w{i})
    55                 I(i) = 1;
    56         end
    57 end
    58 w(find(I)) = [];
    59 n = length(w);
    60 x = [];
    61 
    62 % see if we have strings or numbers, and put the result in the matrix:
    63 strvals = {};
    64 if isempty(strtype)
    65         for i=1:n
    66                 num = str2double(w{i});
    67                 if isnan(num)   % the feature is string
    68                         strtype(i) = 1;   % remember that it is a string
    69                         strvals{i}{1} = w{i}; % put it to the collection
    70                         x(1,i) = 1;
    71                 else               % feature is a number, life is simple
    72                         strtype(i) = 0;
    73                         x(1,i) = num;
    74                 end
    75         end
    76 else
    77         for i=1:n
    78                 strtype(i) = 1;   % remember that it is a string
    79                 strvals{i}{1} = w{i}; % put it to the collection
    80                 x(1,i) = 1;
    81         end
    82 end
    83 % now run over the other lines:
    84 nrx = 1;
    85 while 1
    86         dline = fgetl(fid);
    87         if ~ischar(dline), break, end  %end of file...
    88 
    89         % now process this line:
    90         nrx = nrx+1;
    91         % find delimiters again:
    92         I = find(dline==delimiter);
    93         % cut out the words:
    94         I = [0 I length(dline)+1];
    95         w = {};
    96         for i=1:length(I)-1
    97                 w{i} = dline((I(i)+1):(I(i+1)-1));
    98         end
    99         % remove the empty entries:
    100         I = zeros(length(w),1);
    101         for i=1:length(w)
    102                 if isempty(w{i})
    103                         I(i) = 1;
    104                 end
    105         end
    106         w(find(I)) = [];
    107         % check:
    108         if length(w)~=n
    109                 error('I cannot find enough values on line %d.',nrx);
    110         end
    111         % fill the values in the matrix
    112         for i=1:n
    113                 if strtype(i)==0 % we have a number:
    114                         tmp = str2double(w{i});
    115                         if isnan(tmp)
    116                                 error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
    117                         end
    118                         x(nrx,i) = tmp;
    119                 else
    120                         % we have to find matching strings for feature i:
    121                         I = strmatch(w{i},strvals{i});
    122                         if ~isempty(I)  % it is found
    123                                 x(nrx,i) = I;
    124                         else  % we have to add this entry:
    125                                 x(nrx,i) = length(strvals{i})+1;
    126                                 strvals{i}{end+1} = w{i};
    127                         end
    128                 end
    129         end
    130 end
    131 
    132 fclose(fid);
     1%PR_READDATASET Convert text file into PRTools dataset
     2%
     3%   A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
     4%
     5%INPUT
     6%  FILE       - filename
     7%  NHEAD      - number of headerlines to be skipped, default 0
     8%  DELIM      - delimiter characters, default ' '
     9%  MISVAL     - character used for missing values, default '?'
     10%  FORMAT     - format needed for interpreting feature types of columns.
     11%               default is determined from first line, e.g. 'nncc' for two
     12%               numeric and two categorical features, see SETFEATDOM and
     13%               CELL2DSET
     14%  NLAB       - feature to be interpreted as class label, default [].
     15%
     16%OUTPUT
     17%  A          - PRTools dataset
     18%
     19%SEE ALSO
     20%DATASETS, SETFEATDOM, CELL2DSET
     21
     22% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
     23
     24function a = pr_readdataset(file,varargin)
     25
     26  [nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);
     27
     28  [fid,msg] = fopen(file);
     29  if fid < 1
     30        error(msg)
     31  end
     32  if isempty(form)        % if no format given ...
     33    for j=1:nhead+1
     34      s = fgetl(fid);     % derive it from the first nonheader line
     35    end       
     36    s = mytextscan(s,'c',del,0); % use all %s for time being
     37    form = getform(s);    % convert fields to %n where appropriate
     38    fseek(fid,0,-1);      % restart
     39  end
     40  c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
     41  a = cell2dset(c,form,misval);
     42  if ~isempty(flab)
     43    a = feat2lab(a,flab);
     44  end
     45 
     46return
     47
     48function s = mytextscan(fid,forms,del,nhead)
     49  form = repmat('%%',1,numel(forms));
     50  form(2:2:end) = forms;
     51  forms = strrep(form,'c','s');
     52  if del == ' '
     53    s = textscan(fid,forms,'Headerlines',nhead);
     54  else
     55    s = textscan(fid,forms,'Delimiter',del,'Headerlines',nhead);
     56  end
     57  if ~ischar(fid);
     58    fclose(fid);
     59  end
     60return
     61
     62function form = getform(s)
     63  s = char(s{1});
     64  form = repmat('n',1,size(s,1));
     65  for j=1:size(s,1)
     66    if ~isempty(regexp(s(j,:),'[^0-9+-.eE ]','once'))
     67      form(j) = 'c';
     68    end
     69  end
     70return
     71
     72
     73% function [x,strvals] = pr_readdataset(fname,strtype)
     74% %     [X,STRVALS] = PR_READDATASET(FNAME)
     75% %
     76% % Read the dataset from the text file FNAME. It can process categorical
     77% % features, or features for which categories are given in text. A matrix
     78% % X is returned containing the numerical values, or integers. The
     79% % integers point to the entry in STRVALS containing for each
     80% % (categorical) feature its string members.
     81% %
     82% %     X = PR_READDATASET(FNAME,STRTYPE)
     83% %
     84% % The user can supply a vector STRTYPE that indicates for each feature
     85% % if it is numerical (0) or string/categorical (1).
     86% %
     87% %     X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
     88% %
     89% % For datasets that have a strange delimiter (not comma or space), you
     90% % have to supply it.
     91% if nargin<3
     92%       delimiter = ',';
     93% end
     94% if nargin<2
     95%       strtype = [];
     96% end
     97%
     98% % try to open the file
     99% [fid,message] = fopen(fname,'r');
     100% if fid==-1
     101%       disp(message)
     102%       error('I cannot open file %s.',fname);
     103% end
     104% % get the first line:
     105% dline = fgetl(fid);
     106% % check if the delimiter is present:
     107% I = find(dline==delimiter);
     108% if isempty(I)
     109%       delimiter = ' ';
     110%       I = find(dline==delimiter);
     111%       if isempty(I)
     112%               error('Cannot determine the delimiter');
     113%       end
     114% end
     115%
     116% % now run over all elements in the line:
     117% I = [0 I length(dline)+1];
     118% w = {};
     119% for i=1:length(I)-1
     120%       w{i} = dline((I(i)+1):(I(i+1)-1));
     121% end
     122%
     123% % remove the empty entries:
     124% I = zeros(length(w),1);
     125% for i=1:length(w)
     126%       if isempty(w{i})
     127%               I(i) = 1;
     128%       end
     129% end
     130% w(find(I)) = [];
     131% n = length(w);
     132% x = [];
     133%
     134% % see if we have strings or numbers, and put the result in the matrix:
     135% strvals = {};
     136% if isempty(strtype)
     137%       for i=1:n
     138%               num = str2double(w{i});
     139%               if isnan(num)   % the feature is string
     140%                       strtype(i) = 1;   % remember that it is a string
     141%                       strvals{i}{1} = w{i}; % put it to the collection
     142%                       x(1,i) = 1;
     143%               else               % feature is a number, life is simple
     144%                       strtype(i) = 0;
     145%                       x(1,i) = num;
     146%               end
     147%       end
     148% else
     149%       for i=1:n
     150%               strtype(i) = 1;   % remember that it is a string
     151%               strvals{i}{1} = w{i}; % put it to the collection
     152%               x(1,i) = 1;
     153%       end
     154% end
     155% % now run over the other lines:
     156% nrx = 1;
     157% while 1
     158%       dline = fgetl(fid);
     159%       if ~ischar(dline), break, end  %end of file...
     160%
     161%       % now process this line:
     162%       nrx = nrx+1;
     163%       % find delimiters again:
     164%       I = find(dline==delimiter);
     165%       % cut out the words:
     166%       I = [0 I length(dline)+1];
     167%       w = {};
     168%       for i=1:length(I)-1
     169%               w{i} = dline((I(i)+1):(I(i+1)-1));
     170%       end
     171%       % remove the empty entries:
     172%       I = zeros(length(w),1);
     173%       for i=1:length(w)
     174%               if isempty(w{i})
     175%                       I(i) = 1;
     176%               end
     177%       end
     178%       w(find(I)) = [];
     179%       % check:
     180%       if length(w)~=n
     181%               error('I cannot find enough values on line %d.',nrx);
     182%       end
     183%       % fill the values in the matrix
     184%       for i=1:n
     185%               if strtype(i)==0 % we have a number:
     186%                       tmp = str2double(w{i});
     187%                       if isnan(tmp)
     188%                               error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
     189%                       end
     190%                       x(nrx,i) = tmp;
     191%               else
     192%                       % we have to find matching strings for feature i:
     193%                       I = strmatch(w{i},strvals{i});
     194%                       if ~isempty(I)  % it is found
     195%                               x(nrx,i) = I;
     196%                       else  % we have to add this entry:
     197%                               x(nrx,i) = length(strvals{i})+1;
     198%                               strvals{i}{end+1} = w{i};
     199%                       end
     200%               end
     201%       end
     202% end
     203%
     204% fclose(fid);
Note: See TracChangeset for help on using the changeset viewer.