%PR_READDATASET Convert text file into PRTools dataset
%
%   A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
%
%INPUT
%  FILE       - filename
%  NHEAD      - number of headerlines to be skipped, default 0
%  DELIM      - delimiter characters, default ' '
%  MISVAL     - character used for missing values, default '?'
%  FORMAT     - format needed for interpreting feature types of columns.
%               default is determined from first line, e.g. 'nncc' for two
%               numeric and two categorical features, see SETFEATDOM and
%               CELL2DSET
%  NLAB       - feature to be interpreted as class label, default [].
%
%OUTPUT
%  A          - PRTools dataset
%
%SEE ALSO
%DATASETS, SETFEATDOM, CELL2DSET

% Copyright: R.P.W. Duin

function a = pr_readdataset(file,varargin)
  % Read a delimited text file and convert it into a PRTools dataset.
  %
  % FILE is the name of the text file; the optional arguments in VARARGIN
  % are, in order, NHEAD, DELIM, MISVAL, FORMAT and NLAB as described in
  % the help text of this file. The file is opened here and is closed
  % again on every exit path, including errors.

  % NOTE(review): SETDEFAULTS is a toolbox helper; it presumably replaces
  % missing or empty optional arguments by the listed defaults - confirm.
  [nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);

  [fid,msg] = fopen(file);
  if fid < 1
  	error(msg)
  end
  if isempty(form)        % if no format given ...
    for j=1:nhead+1
      s = fgetl(fid);     % derive it from the first nonheader line
      if ~ischar(s)       % fgetl returns -1 (not char) at end-of-file
        fclose(fid);
        error('PRTools:pr_readdataset:noData', ...
          'File ''%s'' has no data line after %d header line(s).',file,nhead);
      end
    end
    try
      s = mytextscan(s,'c',del,0); % use all %s for time being
      form = getform(s);           % convert fields to 'n' where appropriate
    catch err
      fclose(fid);        % do not leak the handle if the first line cannot be parsed
      rethrow(err);
    end
    fseek(fid,0,-1);      % restart
  end
  try
    % All columns are read as strings; numeric conversion is left to CELL2DSET.
    c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
  catch err
    % mytextscan closes the file on success; make sure it is also closed
    % when scanning failed (only if the handle is still open).
    if any(fid == fopen('all'))
      fclose(fid);
    end
    rethrow(err);
  end
  a = cell2dset(c,form,misval);
  if ~isempty(flab)
    % turn the requested feature into class labels
    a = feat2lab(a,flab);
  end
  
return

function s = mytextscan(fid,forms,del,nhead)
  % Scan a file or a character string with TEXTSCAN.
  %
  %   FID   - numeric file identifier, or a character string to be scanned
  %   FORMS - one character per column (e.g. 'ccs'); every character is
  %           prefixed by '%' and 'c' is mapped to 's'
  %   DEL   - delimiter(s): a character array or a cell array of strings.
  %           Empty or all-blank means TEXTSCAN's default white-space
  %           splitting.
  %   NHEAD - number of header lines to skip
  %   S     - cell array as returned by TEXTSCAN
  %
  % If FID is a file identifier the file is closed before returning,
  % also when TEXTSCAN raises an error.

  % interleave '%' with the format characters: 'cc' -> '%c%c' -> '%s%s'
  spec = repmat('%%',1,numel(forms));
  spec(2:2:end) = forms;
  spec = strrep(spec,'c','s');

  % A plain "del == ' '" is element-wise: it fails for cell arrays and is
  % false for empty input, so test explicitly.
  usewhite = isempty(del) || (ischar(del) && all(del(:) == ' '));

  try
    if usewhite
      s = textscan(fid,spec,'Headerlines',nhead);
    else
      s = textscan(fid,spec,'Delimiter',del,'Headerlines',nhead);
    end
  catch err
    if ~ischar(fid)
      % best effort: FID may itself be invalid, which must not mask
      % the original error
      try
        fclose(fid);
      catch
      end
    end
    rethrow(err);
  end
  if ~ischar(fid)
    fclose(fid);
  end
return

function form = getform(s)
  % Derive a column format string from the fields of one text line.
  %
  %   S    - cell whose first element holds the fields of the line
  %          (as returned by TEXTSCAN with '%s')
  %   FORM - character row with one entry per field: 'n' when the field
  %          consists only of digits, '+', '-', '.', 'e', 'E' and blanks,
  %          'c' otherwise

  % CHAR pads shorter fields with blanks, hence the blank in the class below
  fields = char(s{1});
  nfields = size(fields,1);
  form = repmat('n',1,nfields);
  for j=1:nfields
    % The hyphen is escaped on purpose: an unescaped '+-.' is a range that
    % also contains ',', which would classify fields like '1,5' as numeric.
    if ~isempty(regexp(fields(j,:),'[^0-9+\-.eE ]','once'))
      form(j) = 'c';
    end
  end
return


% function [x,strvals] = pr_readdataset(fname,strtype)
% %     [X,STRVALS] = PR_READDATASET(FNAME)
% %
% % Read the dataset from the text file FNAME. It can process categorical
% % features, or features for which categories are given in text. A matrix
% % X is returned containing the numerical values, or integers. The
% % integers point to the entry in STRVALS containing for each
% % (categorical) feature its string members.
% %
% %     X = PR_READDATASET(FNAME,STRTYPE)
% %
% % The user can supply a vector STRTYPE that indicates for each feature
% % if it is numerical (0) or string/categorical (1).
% %
% %     X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
% %
% % For datasets that have a strange delimiter (not comma or space), you
% % have to supply it.
% if nargin<3
% 	delimiter = ',';
% end
% if nargin<2
% 	strtype = [];
% end
% 
% % try to open the file
% [fid,message] = fopen(fname,'r');
% if fid==-1
% 	disp(message)
% 	error('I cannot open file %s.',fname);
% end
% % get the first line:
% dline = fgetl(fid);
% % check if the delimiter is present:
% I = find(dline==delimiter);
% if isempty(I)
% 	delimiter = ' ';
% 	I = find(dline==delimiter);
% 	if isempty(I)
% 		error('Cannot determine the delimiter');
% 	end
% end
% 
% % now run over all elements in the line:
% I = [0 I length(dline)+1];
% w = {};
% for i=1:length(I)-1
% 	w{i} = dline((I(i)+1):(I(i+1)-1));
% end
% 
% % remove the empty entries:
% I = zeros(length(w),1);
% for i=1:length(w)
% 	if isempty(w{i})
% 		I(i) = 1;
% 	end
% end
% w(find(I)) = [];
% n = length(w);
% x = [];
% 
% % see if we have strings or numbers, and put the result in the matrix:
% strvals = {};
% if isempty(strtype)
% 	for i=1:n
% 		num = str2double(w{i});
% 		if isnan(num)   % the feature is string
% 			strtype(i) = 1;   % remember that it is a string
% 			strvals{i}{1} = w{i}; % put it to the collection
% 			x(1,i) = 1;
% 		else               % feature is a number, life is simple
% 			strtype(i) = 0;
% 			x(1,i) = num;
% 		end
% 	end
% else
% 	for i=1:n
% 		strtype(i) = 1;   % remember that it is a string
% 		strvals{i}{1} = w{i}; % put it to the collection
% 		x(1,i) = 1;
% 	end
% end
% % now run over the other lines:
% nrx = 1;
% while 1
% 	dline = fgetl(fid);
% 	if ~ischar(dline), break, end  %end of file...
% 
% 	% now process this line:
% 	nrx = nrx+1;
% 	% find delimiters again:
% 	I = find(dline==delimiter);
% 	% cut out the words:
% 	I = [0 I length(dline)+1];
% 	w = {};
% 	for i=1:length(I)-1
% 		w{i} = dline((I(i)+1):(I(i+1)-1));
% 	end
% 	% remove the empty entries:
% 	I = zeros(length(w),1);
% 	for i=1:length(w)
% 		if isempty(w{i})
% 			I(i) = 1;
% 		end
% 	end
% 	w(find(I)) = [];
% 	% check:
% 	if length(w)~=n
% 		error('I cannot find enough values on line %d.',nrx);
% 	end
% 	% fill the values in the matrix
% 	for i=1:n
% 		if strtype(i)==0 % we have a number:
% 			tmp = str2double(w{i});
% 			if isnan(tmp)
% 				error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
% 			end
% 			x(nrx,i) = tmp;
% 		else
% 			% we have to find matching strings for feature i:
% 			I = strmatch(w{i},strvals{i});
% 			if ~isempty(I)  % it is found
% 				x(nrx,i) = I;
% 			else  % we have to add this entry:
% 				x(nrx,i) = length(strvals{i})+1;
% 				strvals{i}{end+1} = w{i};
% 			end
% 		end
% 	end
% end
% 
% fclose(fid);