Context Navigation

← Previous Changeset
Next Changeset →

Changeset 97

Timestamp:

09/24/14 16:04:03 (11 years ago)

Author:

bduin

Message:

Location:

prdatasets

Files:

: 2 edited

car.m (modified) (1 diff)
pr_readdataset.m (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

prdatasets/car.m

-                      r92
+                      r97
 fl = {'buying price' 'maintenance price' 'nr of doors' 'nr of persons' 'luggage boot' 'safety'};
+[a,strvals] = pr_readdataset('car.data',ones(7,1));
+labs = strvals{end}(a(:,end));
+for i=1:6
+        featdom{i} = strvcat(strvals{i}');
+end
+x = pr_dataset(a(:,1:(end-1)), labs);
+% Read all seven comma-separated columns as categorical ('c') features and
+% use column 7 as the class label. The trailing semicolon keeps the
+% resulting dataset from being echoed to the command window.
+x = pr_readdataset('car.data',0,',',[],'ccccccc',7);
 x = setfeatlab(x,fl);
-x = setfeatdom(x,featdom);
 x = setuser(x,user);
 x = setname(x,'Car');

prdatasets/pr_readdataset.m

-                      r92
+                      r97
+function [x,strvals] = pr_readdataset(fname,strtype,delimiter)
+%PR_READDATASET Read a delimited text file into a numeric matrix
+%
+%     [X,STRVALS] = PR_READDATASET(FNAME)
+%
+% Read the dataset from the text file FNAME. It can process categorical
+% features, or features for which categories are given in text. A matrix
+% X is returned containing the numerical values, or integers. The
+% integers point to the entry in STRVALS containing for each
+% (categorical) feature its string members. When STRTYPE is not given,
+% the type of each feature is derived from the first line of the file.
+%
+%     X = PR_READDATASET(FNAME,STRTYPE)
+%
+% The user can supply a vector STRTYPE that indicates for each feature
+% if it is numerical (0) or string/categorical (1). It must have one
+% entry per value found on the first line.
+%
+%     X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
+%
+% For datasets that have a strange delimiter (not comma or space), you
+% have to supply it. DELIMITER is a single character, default ','. If it
+% does not occur on the first line, a space is tried instead.
+%
+% An error is raised if the file cannot be opened, if no delimiter is
+% found on the first line, if a line does not hold the same number of
+% values as the first line, or if a numerical feature holds a value that
+% cannot be converted to a number.
+if nargin<3 || isempty(delimiter)
+        delimiter = ',';
+end
+if nargin<2
+        strtype = [];
+end
+% try to open the file
+[fid,message] = fopen(fname,'r');
+if fid==-1
+        disp(message)
+        error('I cannot open file %s.',fname);
+end
+% close the file when this function is left, also through an error
+closer = onCleanup(@() fclose(fid));
+% get the first line:
+dline = fgetl(fid);
+% check if the delimiter is present, otherwise fall back to a space:
+if isempty(find(dline==delimiter,1))
+        delimiter = ' ';
+        if isempty(find(dline==delimiter,1))
+                error('Cannot determine the delimiter');
+        end
+end
+% cut the first line into words; it defines the number of features:
+w = splitwords(dline,delimiter);
+n = length(w);
+x = [];
+strvals = {};
+% without a user supplied STRTYPE the first line decides on the types
+autodetect = isempty(strtype);
+if ~autodetect && numel(strtype)~=n
+        error('STRTYPE has %d entries, but the first line holds %d values.',numel(strtype),n);
+end
+for i=1:n
+        if autodetect
+                % anything that is not a number is a string feature
+                strtype(i) = isnan(str2double(w{i}));
+        end
+        if strtype(i)==0  % feature is a number, life is simple
+                num = str2double(w{i});
+                if isnan(num)
+                        error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},1);
+                end
+                x(1,i) = num;
+        else              % string: it is the first member of its collection
+                strvals{i}{1} = w{i};
+                x(1,i) = 1;
+        end
+end
+% now run over the other lines:
+nrx = 1;
+while 1
+        dline = fgetl(fid);
+        if ~ischar(dline), break, end  %end of file...
+        % now process this line:
+        nrx = nrx+1;
+        w = splitwords(dline,delimiter);
+        % check:
+        if length(w)~=n
+                error('I cannot find enough values on line %d.',nrx);
+        end
+        % fill the values in the matrix
+        for i=1:n
+                if strtype(i)==0 % we have a number:
+                        tmp = str2double(w{i});
+                        if isnan(tmp)
+                                error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
+                        end
+                        x(nrx,i) = tmp;
+                else
+                        % find the exactly matching string for feature i
+                        % (a prefix match would mix up e.g. 'med' and 'medium'):
+                        I = find(strcmp(w{i},strvals{i}),1);
+                        if ~isempty(I)  % it is found
+                                x(nrx,i) = I;
+                        else  % we have to add this entry:
+                                x(nrx,i) = length(strvals{i})+1;
+                                strvals{i}{end+1} = w{i};
+                        end
+                end
+        end
+end
+return
+function w = splitwords(dline,delimiter)
+% Cut DLINE at every occurrence of DELIMITER and drop the empty words.
+I = [0 find(dline==delimiter) length(dline)+1];
+w = cell(1,length(I)-1);
+for i=1:length(I)-1
+        w{i} = dline((I(i)+1):(I(i+1)-1));
+end
+w(cellfun('isempty',w)) = [];
+return
+%PR_READDATASET Convert text file into PRTools dataset
+%
+%   A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
+%
+%INPUT
+%  FILE       - filename
+%  NHEAD      - number of headerlines to be skipped, default 0
+%  DELIM      - delimiter characters, default ' '
+%  MISVAL     - character used for missing values, default '?'
+%  FORMAT     - format needed for interpreting feature types of columns.
+%               default is determined from first line, e.g. 'nncc' for two
+%               numeric and two categorical features, see SETFEATDOM and
+%               CELL2DSET
+%  NLAB       - feature to be interpreted as class label, default [].
+%
+%OUTPUT
+%  A          - PRTools dataset
+%
+%DESCRIPTION
+% The file is read as text columns, which are converted into a dataset by
+% CELL2DSET. If NLAB is given, FEAT2LAB is applied to the result. An error
+% is raised if FILE cannot be opened, or if FORMAT is not given and the
+% file holds no line after the NHEAD header lines to derive it from.
+%
+%SEE ALSO
+%DATASETS, SETFEATDOM, CELL2DSET
+% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
+function a = pr_readdataset(file,varargin)
+  [nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);
+  [fid,msg] = fopen(file);
+  if fid < 1
+        error(msg)
+  end
+  if isempty(form)        % if no format given ...
+    for j=1:nhead+1
+      s = fgetl(fid);     % derive it from the first nonheader line
+    end
+    if ~ischar(s)         % fgetl returns -1 once the end of file is hit
+      fclose(fid);        % do not leave the file open behind the error
+      error('File %s holds no data line after %d header line(s).',file,nhead);
+    end
+    s = mytextscan(s,'c',del,0); % use all %s for time being
+    form = getform(s);    % convert fields to %n where appropriate
+    fseek(fid,0,-1);      % restart
+  end
+  % read every column as text; mytextscan closes fid when it is done
+  c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
+  a = cell2dset(c,form,misval);
+  if ~isempty(flab)
+    a = feat2lab(a,flab);
+  end
+return
+function s = mytextscan(fid,forms,del,nhead)
+% S = MYTEXTSCAN(FID,FORMS,DEL,NHEAD)
+%
+% Wrapper around TEXTSCAN. FID is an open file identifier or a string to
+% be scanned. FORMS holds one character per column, e.g. 'ncc'; each
+% character is preceded by a '%' and 'c' is replaced by 's' to build the
+% TEXTSCAN format. DEL is the delimiter, NHEAD the number of header lines
+% to skip. If FID is not a string it is closed after reading.
+  form = repmat('%%',1,numel(forms));   % '%%' is two characters per column
+  form(2:2:end) = forms;                % turn it into e.g. '%n%c%c'
+  forms = strrep(form,'c','s');
+  % A delimiter of spaces only is left to the default whitespace handling
+  % of TEXTSCAN. The ISCHAR test keeps a cell array delimiter from
+  % breaking the comparison; it is handed to TEXTSCAN unchanged.
+  if ischar(del) && ~isempty(del) && all(del(:) == ' ')
+    s = textscan(fid,forms,'Headerlines',nhead);
+  else
+    s = textscan(fid,forms,'Delimiter',del,'Headerlines',nhead);
+  end
+  if ~ischar(fid)
+    fclose(fid);
+  end
+return
+function form = getform(s)
+% FORM = GETFORM(S)
+%
+% Derive the column format from the fields of a single line. S is a cell
+% whose first element holds the fields as strings, one per column. FORM
+% is a character row with 'n' for fields that consist of digits, sign,
+% decimal point, exponent characters and spaces only, and 'c' for all
+% other fields.
+  s = char(s{1});                  % one row per field, padded by spaces
+  form = repmat('n',1,size(s,1));
+  for j=1:size(s,1)
+    % The hyphen is placed last in the set to make it a literal; between
+    % '+' and '.' it would define a range that also accepts a comma.
+    if ~isempty(regexp(s(j,:),'[^0-9+.eE -]','once'))
+      form(j) = 'c';
+    end
+  end
+return
+% function [x,strvals] = pr_readdataset(fname,strtype)
+% %     [X,STRVALS] = PR_READDATASET(FNAME)
+% %
+% % Read the dataset from the text file FNAME. It can process categorical
+% % features, or features for which categories are given in text. A matrix
+% % X is returned containing the numerical values, or integers. The
+% % integers point to the entry in STRVALS containing for each
+% % (categorical) feature its string members.
+% %
+% %     X = PR_READDATASET(FNAME,STRTYPE)
+% %
+% % The user can supply a vector STRTYPE that indicates for each feature
+% % if it is numerical (0) or string/categorical (1).
+% %
+% %     X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
+% %
+% % For datasets that have a strange delimiter (not comma or space), you
+% % have to supply it.
+% if nargin<3
+%       delimiter = ',';
+% end
+% if nargin<2
+%       strtype = [];
+% end
+%
+% % try to open the file
+% [fid,message] = fopen(fname,'r');
+% if fid==-1
+%       disp(message)
+%       error('I cannot open file %s.',fname);
+% end
+% % get the first line:
+% dline = fgetl(fid);
+% % check if the delimiter is present:
+% I = find(dline==delimiter);
+% if isempty(I)
+%       delimiter = ' ';
+%       I = find(dline==delimiter);
+%       if isempty(I)
+%               error('Cannot determine the delimiter');
+%       end
+% end
+%
+% % now run over all elements in the line:
+% I = [0 I length(dline)+1];
+% w = {};
+% for i=1:length(I)-1
+%       w{i} = dline((I(i)+1):(I(i+1)-1));
+% end
+%
+% % remove the empty entries:
+% I = zeros(length(w),1);
+% for i=1:length(w)
+%       if isempty(w{i})
+%               I(i) = 1;
+%       end
+% end
+% w(find(I)) = [];
+% n = length(w);
+% x = [];
+%
+% % see if we have strings or numbers, and put the result in the matrix:
+% strvals = {};
+% if isempty(strtype)
+%       for i=1:n
+%               num = str2double(w{i});
+%               if isnan(num)   % the feature is string
+%                       strtype(i) = 1;   % remember that it is a string
+%                       strvals{i}{1} = w{i}; % put it to the collection
+%                       x(1,i) = 1;
+%               else               % feature is a number, life is simple
+%                       strtype(i) = 0;
+%                       x(1,i) = num;
+%               end
+%       end
+% else
+%       for i=1:n
+%               strtype(i) = 1;   % remember that it is a string
+%               strvals{i}{1} = w{i}; % put it to the collection
+%               x(1,i) = 1;
+%       end
+% end
+% % now run over the other lines:
+% nrx = 1;
+% while 1
+%       dline = fgetl(fid);
+%       if ~ischar(dline), break, end  %end of file...
+%
+%       % now process this line:
+%       nrx = nrx+1;
+%       % find delimiters again:
+%       I = find(dline==delimiter);
+%       % cut out the words:
+%       I = [0 I length(dline)+1];
+%       w = {};
+%       for i=1:length(I)-1
+%               w{i} = dline((I(i)+1):(I(i+1)-1));
+%       end
+%       % remove the empty entries:
+%       I = zeros(length(w),1);
+%       for i=1:length(w)
+%               if isempty(w{i})
+%                       I(i) = 1;
+%               end
+%       end
+%       w(find(I)) = [];
+%       % check:
+%       if length(w)~=n
+%               error('I cannot find enough values on line %d.',nrx);
+%       end
+%       % fill the values in the matrix
+%       for i=1:n
+%               if strtype(i)==0 % we have a number:
+%                       tmp = str2double(w{i});
+%                       if isnan(tmp)
+%                               error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
+%                       end
+%                       x(nrx,i) = tmp;
+%               else
+%                       % we have to find matching strings for feature i:
+%                       I = strmatch(w{i},strvals{i});
+%                       if ~isempty(I)  % it is found
+%                               x(nrx,i) = I;
+%                       else  % we have to add this entry:
+%                               x(nrx,i) = length(strvals{i})+1;
+%                               strvals{i}{end+1} = w{i};
+%                       end
+%               end
+%       end
+% end
+%
+% fclose(fid);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 97

Legend:

prdatasets/car.m

prdatasets/pr_readdataset.m

Download in other formats: