Context Navigation

source: prdatasets/pr_readdataset.m @ 98

Last change on this file since 98 was 97, checked in by bduin, 10 years ago

File size: 5.2 KB

Rev	Line
[97]	1	%PR_READDATASET Convert text file into PRTools dataset
[92]	2	%
[97]	3	% A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
[92]	4	%
[97]	5	%INPUT
	6	% FILE - filename
	7	% NHEAD - number of headerlines to be skipped, default 0
	8	% DELIM - delimiter characters, default ' '
	9	% MISVAL - character used for missing values, default '?'
	10	% FORMAT - format needed for interpreting feature types of columns.
	11	% default is determined from first line, e.g. 'nncc' for two
	12	% numeric and two categorical features, see SETFEATDOM and
	13	% CELL2DSET
	14	% NLAB - feature to be interpreted as class label, default [].
[92]	15	%
[97]	16	%OUTPUT
	17	% A - PRTools dataset
[92]	18	%
[97]	19	%SEE ALSO
	20	%DATASETS, SETFEATDOM, CELL2DSET
[92]	21
[97]	22	% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
[92]	23
function a = pr_readdataset(file,varargin)

  % Read a delimited text file and convert it into a dataset.
  % Optional arguments, in order: number of header lines to skip,
  % delimiter, missing-value character, column format string and the
  % feature to be used as label. Defaults: 0, ' ', '?', [], [].
  [nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);

  [fid,msg] = fopen(file);
  if fid < 1
    error(msg)
  end

  if isempty(form)                 % no format given: derive it from the data
    try
      for j = 1:nhead+1
        s = fgetl(fid);            % ends up holding the first non-header line
        if ~ischar(s)              % fgetl returns -1 once the file is exhausted
          error('pr_readdataset:noData', ...
            'File %s has no data line after %d header line(s).',file,nhead);
        end
      end
      s = mytextscan(s,'c',del,0); % split that line into string fields
      form = getform(s);           % 'n' for numeric-looking fields, else 'c'
      if isempty(form)
        error('pr_readdataset:noFormat', ...
          'Cannot derive a column format: first data line of %s is empty.',file);
      end
      fseek(fid,0,-1);             % rewind: the full read starts from the top
    catch err
      fclose(fid);                 % fid is still open here; do not leak it
      rethrow(err);
    end
  end

  % Read every column as a string ('n' -> 's'); the numeric conversion is
  % left to cell2dset. mytextscan closes fid after reading.
  c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
  a = cell2dset(c,form,misval);
  if ~isempty(flab)
    a = feat2lab(a,flab);
  end

return
[92]	47
function s = mytextscan(fid,forms,del,nhead)
  % Wrapper around TEXTSCAN.
  %   fid   - open file identifier, or a character string to be scanned
  %   forms - one type letter per column, e.g. 'ssc'; 'c' is read as '%s'
  %   del   - delimiter; a blank means "use textscan's default whitespace"
  %   nhead - number of header lines to skip
  % Returns the cell array produced by textscan. If fid is a file
  % identifier (not a string) the file is closed before returning, also
  % when textscan fails.

  % Build the textscan format: a '%' in front of every type letter.
  form = repmat('%%',1,numel(forms));
  form(2:2:end) = forms;
  forms = strrep(form,'c','s');

  % Same outcome as the plain test "del == ' '" for char input, but does
  % not throw for a cell array of delimiters (which textscan accepts).
  usewhite = ~iscell(del) && ~isempty(del) && all(del(:) == ' ');
  isfile = ~ischar(fid);

  try
    if usewhite
      s = textscan(fid,forms,'Headerlines',nhead);
    else
      s = textscan(fid,forms,'Delimiter',del,'Headerlines',nhead);
    end
  catch err
    % Close the file before propagating the error, but only if fid really
    % refers to an open file (it may be an invalid identifier).
    if isfile && isnumeric(fid) && isscalar(fid) && any(fid == fopen('all'))
      fclose(fid);
    end
    rethrow(err);
  end

  if isfile
    fclose(fid);
  end
return
[92]	61
function form = getform(s)
  % Derive a column format string from the fields of one text line.
  %   s    - cell as returned by textscan with a single '%s' conversion:
  %          s{1} holds the fields of the line as strings
  %   form - row of characters, one per field: 'n' if the field contains
  %          only digits, '+', '-', '.', 'e', 'E' or blanks, otherwise 'c'
  tokens = s{1};
  if ischar(tokens)
    tokens = cellstr(tokens);
  end

  % The hyphen is placed last in the class so that it is a literal '-'.
  % Written as '+-.' it would denote the range '+' to '.', which also
  % contains ',' and would make fields like '1,5' look numeric.
  nonnum = regexp(tokens,'[^0-9+.eE -]','once');

  form = repmat('n',1,numel(tokens));
  form(~cellfun('isempty',nonnum)) = 'c';
return
	71
	72
	73	% function [x,strvals] = pr_readdataset(fname,strtype)
	74	% % [X,STRVALS] = PR_READDATASET(FNAME)
	75	% %
	76	% % Read the dataset from the text file FNAME. It can process categorical
	77	% % features, or features for which categories are given in text. A matrix
	78	% % X is returned containing the numerical values, or integers. The
	79	% % integers point to the entry in STRVALS containing for each
	80	% % (categorical) feature its string members.
	81	% %
	82	% % X = PR_READDATASET(FNAME,STRTYPE)
	83	% %
	84	% % The user can supply a vector STRTYPE that indicates for each feature
	85	% % if it is numerical (0) or string/categorical (1).
	86	% %
	87	% % X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
	88	% %
	89	% % For datasets that have a strange delimiter (not comma or space), you
	90	% % have to supply it.
	91	% if nargin<3
	92	% delimiter = ',';
	93	% end
	94	% if nargin<2
	95	% strtype = [];
	96	% end
	97	%
	98	% % try to open the file
	99	% [fid,message] = fopen(fname,'r');
	100	% if fid==-1
	101	% disp(message)
	102	% error('I cannot open file %s.',fname);
	103	% end
	104	% % get the first line:
	105	% dline = fgetl(fid);
	106	% % check if the delimiter is present:
	107	% I = find(dline==delimiter);
	108	% if isempty(I)
	109	% delimiter = ' ';
	110	% I = find(dline==delimiter);
	111	% if isempty(I)
	112	% error('Cannot determine the delimiter');
	113	% end
	114	% end
	115	%
	116	% % now run over all elements in the line:
	117	% I = [0 I length(dline)+1];
	118	% w = {};
	119	% for i=1:length(I)-1
	120	% w{i} = dline((I(i)+1):(I(i+1)-1));
	121	% end
	122	%
	123	% % remove the empty entries:
	124	% I = zeros(length(w),1);
	125	% for i=1:length(w)
	126	% if isempty(w{i})
	127	% I(i) = 1;
	128	% end
	129	% end
	130	% w(find(I)) = [];
	131	% n = length(w);
	132	% x = [];
	133	%
	134	% % see if we have strings or numbers, and put the result in the matrix:
	135	% strvals = {};
	136	% if isempty(strtype)
	137	% for i=1:n
	138	% num = str2double(w{i});
	139	% if isnan(num) % the feature is string
	140	% strtype(i) = 1; % remember that it is a string
	141	% strvals{i}{1} = w{i}; % put it to the collection
	142	% x(1,i) = 1;
	143	% else % feature is a number, life is simple
	144	% strtype(i) = 0;
	145	% x(1,i) = num;
	146	% end
	147	% end
	148	% else
	149	% for i=1:n
	150	% strtype(i) = 1; % remember that it is a string
	151	% strvals{i}{1} = w{i}; % put it to the collection
	152	% x(1,i) = 1;
	153	% end
	154	% end
	155	% % now run over the other lines:
	156	% nrx = 1;
	157	% while 1
	158	% dline = fgetl(fid);
	159	% if ~ischar(dline), break, end %end of file...
	160	%
	161	% % now process this line:
	162	% nrx = nrx+1;
	163	% % find delimiters again:
	164	% I = find(dline==delimiter);
	165	% % cut out the words:
	166	% I = [0 I length(dline)+1];
	167	% w = {};
	168	% for i=1:length(I)-1
	169	% w{i} = dline((I(i)+1):(I(i+1)-1));
	170	% end
	171	% % remove the empty entries:
	172	% I = zeros(length(w),1);
	173	% for i=1:length(w)
	174	% if isempty(w{i})
	175	% I(i) = 1;
	176	% end
	177	% end
	178	% w(find(I)) = [];
	179	% % check:
	180	% if length(w)~=n
	181	% error('I cannot find enough values on line %d.',nrx);
	182	% end
	183	% % fill the values in the matrix
	184	% for i=1:n
	185	% if strtype(i)==0 % we have a number:
	186	% tmp = str2double(w{i});
	187	% if isnan(tmp)
	188	% error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
	189	% end
	190	% x(nrx,i) = tmp;
	191	% else
	192	% % we have to find matching strings for feature i:
	193	% I = strmatch(w{i},strvals{i});
	194	% if ~isempty(I) % it is found
	195	% x(nrx,i) = I;
	196	% else % we have to add this entry:
	197	% x(nrx,i) = length(strvals{i})+1;
	198	% strvals{i}{end+1} = w{i};
	199	% end
	200	% end
	201	% end
	202	% end
	203	%
	204	% fclose(fid);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: