%PR_READDATASET Convert text file into PRTools dataset
%
%   A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
%
%INPUT
%   FILE   - filename
%   NHEAD  - number of headerlines to be skipped, default 0
%   DELIM  - delimiter characters, default ' '
%   MISVAL - character used for missing values, default '?'
%   FORMAT - format needed for interpreting feature types of columns.
%            default is determined from the first non-header line, e.g.
%            'nncc' for two numeric and two categorical features, see
%            SETFEATDOM and CELL2DSET
%   NLAB   - feature to be interpreted as class label, default [].
%
%OUTPUT
%   A      - PRTools dataset
%
%SEE ALSO
%DATASETS, SETFEATDOM, CELL2DSET

% Copyright: R.P.W. Duin

function a = pr_readdataset(file,varargin)

  [nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);

  % All file access is done in READFIELDS, which guarantees that the file
  % is closed again, also when reading fails.
  [c,form] = readfields(file,nhead,del,form);

  a = cell2dset(c,form,misval);
  if ~isempty(flab)
    a = feat2lab(a,flab);
  end

return

%READFIELDS Read all columns of FILE as text
%
%   [C,FORM] = READFIELDS(FILE,NHEAD,DEL,FORM)
%
% Opens FILE, derives the column format FORM from the first non-header
% line if FORM is empty, and reads all columns as strings into the cell
% array C (as returned by TEXTSCAN). The file is closed on return and on
% error.
function [c,form] = readfields(file,nhead,del,form)

  [fid,msg] = fopen(file);
  if fid < 1
    error(msg)
  end
  % Close the file whenever this function is left, normally or by error.
  closer = onCleanup(@() fclose(fid));

  if isempty(form)                  % if no format given ...
    for j=1:nhead+1
      s = fgetl(fid);               % derive it from the first nonheader line
    end
    if ~ischar(s)
      % fgetl returns -1 at end-of-file: nothing to derive a format from
      error('File %s contains no data line after %d header line(s).',file,nhead);
    end
    s = mytextscan(s,'c',del,0);    % split the line into its text fields
    form = getform(s);              % 'n' for numeric, 'c' for other fields
    fseek(fid,0,-1);                % restart
  end
  % every column is read as text (%s)
  c = mytextscan(fid,strrep(form,'n','s'),del,nhead);

return

%MYTEXTSCAN Call TEXTSCAN with a format given as one character per column
%
%   S = MYTEXTSCAN(SRC,FORMS,DEL,NHEAD)
%
% SRC is a file identifier or a character string. FORMS holds one
% conversion character per column, e.g. 'ssc' becomes '%s%s%s' ('c' is
% read as 's'). An all-blank DEL selects the default white-space
% splitting of TEXTSCAN. SRC is not closed here.
function s = mytextscan(src,forms,del,nhead)

  spec = repmat('%%',1,numel(forms));  % two percent characters per column
  spec(2:2:end) = forms;               % overwrite every second one
  spec = strrep(spec,'c','s');

  if ischar(del) && ~isempty(del) && all(del == ' ')
    s = textscan(src,spec,'Headerlines',nhead);
  else
    s = textscan(src,spec,'Delimiter',del,'Headerlines',nhead);
  end

return

%GETFORM Derive the column format from the fields of a single line
%
%   FORM = GETFORM(S)
%
% S is the TEXTSCAN result for one line read with a single '%s'. FORM
% gets one character per field: 'n' if the field consists of digits,
% signs, dots, exponent characters and blanks only, 'c' otherwise.
function form = getform(s)

  fields = char(s{1});                 % one (blank padded) row per field
  form = repmat('n',1,size(fields,1));
  for j=1:size(fields,1)
    % The hyphen is escaped: an unescaped '+-.' is the range '+' to '.',
    % which would accept a comma as part of a number.
    if ~isempty(regexp(fields(j,:),'[^0-9+\-.eE ]','once'))
      form(j) = 'c';
    end
  end

return

% ------------------------------------------------------------------------
% Older implementation, commented out.
% ------------------------------------------------------------------------
% function [x,strvals] = pr_readdataset(fname,strtype)
% %      [X,STRVALS] = PR_READDATASET(FNAME)
% %
% % Read the dataset from the text file FNAME. It can process categorical
% % features, or features for which categories are given in text. A matrix
% % X is returned containing the numerical values, or integers. The
% % integers point to the entry in STRVALS containing for each
% % (categorical) feature its string members.
% %
% %      X = PR_READDATASET(FNAME,STRTYPE)
% %
% % The user can supply a vector STRTYPE that indicates for each feature
% % if it is numerical (0) or string/categorical (1).
% %
% %      X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
% %
% % For datasets that have a strange delimiter (not comma or space), you
% % have to supply it.
% if nargin<3
%    delimiter = ',';
% end
% if nargin<2
%    strtype = [];
% end
%
% % try to open the file
% [fid,message] = fopen(fname,'r');
% if fid==-1
%    disp(message)
%    error('I cannot open file %s.',fname);
% end
% % get the first line:
% dline = fgetl(fid);
% % check if the delimiter is present:
% I = find(dline==delimiter);
% if isempty(I)
%    delimiter = ' ';
%    I = find(dline==delimiter);
%    if isempty(I)
%       error('Cannot determine the delimiter');
%    end
% end
%
% % now run over all elements in the line:
% I = [0 I length(dline)+1];
% w = {};
% for i=1:length(I)-1
%    w{i} = dline((I(i)+1):(I(i+1)-1));
% end
%
% % remove the empty entries:
% I = zeros(length(w),1);
% for i=1:length(w)
%    if isempty(w{i})
%       I(i) = 1;
%    end
% end
% w(find(I)) = [];
% n = length(w);
% x = [];
%
% % see if we have strings or numbers, and put the result in the matrix:
% strvals = {};
% if isempty(strtype)
%    for i=1:n
%       num = str2double(w{i});
%       if isnan(num) % the feature is string
%          strtype(i) = 1; % remember that it is a string
%          strvals{i}{1} = w{i}; % put it to the collection
%          x(1,i) = 1;
%       else % feature is a number, life is simple
%          strtype(i) = 0;
%          x(1,i) = num;
%       end
%    end
% else
%    for i=1:n
%       strtype(i) = 1; % remember that it is a string
%       strvals{i}{1} = w{i}; % put it to the collection
%       x(1,i) = 1;
%    end
% end
% % now run over the other lines:
% nrx = 1;
% while 1
%    dline = fgetl(fid);
%    if ~ischar(dline), break, end %end of file...
%
%    % now process this line:
%    nrx = nrx+1;
%    % find delimiters again:
%    I = find(dline==delimiter);
%    % cut out the words:
%    I = [0 I length(dline)+1];
%    w = {};
%    for i=1:length(I)-1
%       w{i} = dline((I(i)+1):(I(i+1)-1));
%    end
%    % remove the empty entries:
%    I = zeros(length(w),1);
%    for i=1:length(w)
%       if isempty(w{i})
%          I(i) = 1;
%       end
%    end
%    w(find(I)) = [];
%    % check:
%    if length(w)~=n
%       error('I cannot find enough values on line %d.',nrx);
%    end
%    % fill the values in the matrix
%    for i=1:n
%       if strtype(i)==0 % we have a number:
%          tmp = str2double(w{i});
%          if isnan(tmp)
%             error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
%          end
%          x(nrx,i) = tmp;
%       else
%          % we have to find matching strings for feature i:
%          I = strmatch(w{i},strvals{i});
%          if ~isempty(I) % it is found
%             x(nrx,i) = I;
%          else % we have to add this entry:
%             x(nrx,i) = length(strvals{i})+1;
%             strvals{i}{end+1} = w{i};
%          end
%       end
%    end
% end
%
% fclose(fid);