Context Navigation

source: prdatasets/pr_readdataset.m @ 104

Last change on this file since 104 was 97, checked in by bduin, 10 years ago

File size: 5.2 KB

Line
1	%PR_READDATASET Convert text file into PRTools dataset
2	%
3	% A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
4	%
5	%INPUT
6	% FILE - filename
7	% NHEAD - number of header lines to be skipped, default 0
8	% DELIM - delimiter characters, default ' '
9	% MISVAL - character used for missing values, default '?'
10	% FORMAT - format needed for interpreting feature types of columns.
11	% By default it is derived from the first non-header line, e.g.
12	% 'nncc' for two numeric and two categorical features, see
13	% SETFEATDOM and CELL2DSET
14	% NLAB - feature to be interpreted as class label, default [].
15	%
16	%OUTPUT
17	% A - PRTools dataset
18	%
19	%SEE ALSO
20	%DATASETS, SETFEATDOM, CELL2DSET
21
22	% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
23
function a = pr_readdataset(file,varargin)

% Read the text file FILE and convert it into a PRTools dataset.
%
% Optional arguments (in order): NHEAD, DELIM, MISVAL, FORMAT, NLAB, with
% the defaults passed to SETDEFAULTS below (0, ' ', '?', [], []).
% NOTE(review): SETDEFAULTS is defined elsewhere; presumably it fills in
% the listed default for every argument the caller omitted -- confirm.
[nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);

[fid,msg] = fopen(file);
if fid < 1
    error(msg)
end

if isempty(form)                    % no format given ...
    % ... derive it from the first non-header line
    for j = 1:nhead+1
        s = fgetl(fid);
        if ~ischar(s)
            % fgetl returns -1 at end-of-file: the file has fewer than
            % NHEAD+1 lines. Close the handle before reporting, otherwise
            % it would stay open after the error.
            fclose(fid);
            error('pr_readdataset:noData', ...
                'File %s contains no data line after %d header line(s).', ...
                file, nhead);
        end
    end
    s = mytextscan(s,'c',del,0);    % split the line into string fields
    form = getform(s);              % 'n' for numeric fields, 'c' otherwise
    if isempty(form)
        % A blank first data line yields no fields, so no format can be
        % derived from it.
        fclose(fid);
        error('pr_readdataset:noFormat', ...
            'Cannot derive a format: first data line of file %s is empty.', ...
            file);
    end
    fseek(fid,0,-1);                % rewind to the start of the file
end

% Read every column as a string for now; mytextscan closes fid itself.
c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
% NOTE(review): CELL2DSET and FEAT2LAB are defined elsewhere; according to
% the file header they build the dataset from the columns and turn feature
% NLAB into the class label -- confirm against their own documentation.
a = cell2dset(c,form,misval);
if ~isempty(flab)
    a = feat2lab(a,flab);
end

return
47
function s = mytextscan(fid,forms,del,nhead)
% Wrapper around TEXTSCAN that reads every column as a string.
%
%   FID   - open file identifier, or a character array holding the text
%   FORMS - one character per column (e.g. 'ssc'); each character is
%           turned into a '%<char>' conversion and 'c' is mapped to 's'
%   DEL   - delimiter; a single blank (or all blanks) selects TEXTSCAN's
%           default whitespace splitting, anything else is passed on as
%           the 'Delimiter' option (character array or cell array)
%   NHEAD - number of header lines to skip
%
% When FID is a file identifier it is closed before returning, also when
% TEXTSCAN fails.

% Build '%x%y...' : a '%' in front of every format character.
form = repmat('%%',1,numel(forms));
form(2:2:end) = forms;
forms = strrep(form,'c','s');

% Guard the comparison: '==' is undefined for cell-array delimiters, which
% TEXTSCAN does accept. An empty delimiter keeps taking the 'Delimiter'
% branch, as before.
useWhitespace = ischar(del) && ~isempty(del) && all(del(:) == ' ');

try
    if useWhitespace
        s = textscan(fid,forms,'Headerlines',nhead);
    else
        s = textscan(fid,forms,'Delimiter',del,'Headerlines',nhead);
    end
catch err
    % Do not leave the file open when parsing fails.
    if ~ischar(fid)
        fclose(fid);
    end
    rethrow(err);
end

if ~ischar(fid)
    fclose(fid);
end
return
61
function form = getform(s)
% Derive the column format string from the fields of one data line.
%
%   S    - cell whose first element is a cell array of field strings
%   FORM - character row with one entry per field: 'n' when the field
%          consists only of digits, sign, decimal point, exponent letter
%          and blanks, 'c' (categorical) otherwise

s = char(s{1});                     % one field per row, blank padded
form = repmat('n',1,size(s,1));
for j = 1:size(s,1)
    % The hyphen is escaped on purpose: an unescaped '+-.' inside the
    % character class is the range '+' to '.', which also contains ','
    % and would classify fields such as '1,5' as numeric.
    if ~isempty(regexp(s(j,:),'[^0-9+\-.eE ]','once'))
        form(j) = 'c';
    end
end
return
71
72
73	% function [x,strvals] = pr_readdataset(fname,strtype)
74	% % [X,STRVALS] = PR_READDATASET(FNAME)
75	% %
76	% % Read the dataset from the text file FNAME. It can process categorical
77	% % features, or features for which categories are given in text. A matrix
78	% % X is returned containing the numerical values, or integers. The
79	% % integers point to the entry in STRVALS containing for each
80	% % (categorical) feature its string members.
81	% %
82	% % X = PR_READDATASET(FNAME,STRTYPE)
83	% %
84	% % The user can supply a vector STRTYPE that indicates for each feature
85	% % if it is numerical (0) or string/categorical (1).
86	% %
87	% % X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
88	% %
89	% % For datasets that have a strange delimiter (not comma or space), you
90	% % have to supply it.
91	% if nargin<3
92	% delimiter = ',';
93	% end
94	% if nargin<2
95	% strtype = [];
96	% end
97	%
98	% % try to open the file
99	% [fid,message] = fopen(fname,'r');
100	% if fid==-1
101	% disp(message)
102	% error('I cannot open file %s.',fname);
103	% end
104	% % get the first line:
105	% dline = fgetl(fid);
106	% % check if the delimiter is present:
107	% I = find(dline==delimiter);
108	% if isempty(I)
109	% delimiter = ' ';
110	% I = find(dline==delimiter);
111	% if isempty(I)
112	% error('Cannot determine the delimiter');
113	% end
114	% end
115	%
116	% % now run over all elements in the line:
117	% I = [0 I length(dline)+1];
118	% w = {};
119	% for i=1:length(I)-1
120	% w{i} = dline((I(i)+1):(I(i+1)-1));
121	% end
122	%
123	% % remove the empty entries:
124	% I = zeros(length(w),1);
125	% for i=1:length(w)
126	% if isempty(w{i})
127	% I(i) = 1;
128	% end
129	% end
130	% w(find(I)) = [];
131	% n = length(w);
132	% x = [];
133	%
134	% % see if we have strings or numbers, and put the result in the matrix:
135	% strvals = {};
136	% if isempty(strtype)
137	% for i=1:n
138	% num = str2double(w{i});
139	% if isnan(num) % the feature is string
140	% strtype(i) = 1; % remember that it is a string
141	% strvals{i}{1} = w{i}; % put it to the collection
142	% x(1,i) = 1;
143	% else % feature is a number, life is simple
144	% strtype(i) = 0;
145	% x(1,i) = num;
146	% end
147	% end
148	% else
149	% for i=1:n
150	% strtype(i) = 1; % remember that it is a string
151	% strvals{i}{1} = w{i}; % put it to the collection
152	% x(1,i) = 1;
153	% end
154	% end
155	% % now run over the other lines:
156	% nrx = 1;
157	% while 1
158	% dline = fgetl(fid);
159	% if ~ischar(dline), break, end %end of file...
160	%
161	% % now process this line:
162	% nrx = nrx+1;
163	% % find delimiters again:
164	% I = find(dline==delimiter);
165	% % cut out the words:
166	% I = [0 I length(dline)+1];
167	% w = {};
168	% for i=1:length(I)-1
169	% w{i} = dline((I(i)+1):(I(i+1)-1));
170	% end
171	% % remove the empty entries:
172	% I = zeros(length(w),1);
173	% for i=1:length(w)
174	% if isempty(w{i})
175	% I(i) = 1;
176	% end
177	% end
178	% w(find(I)) = [];
179	% % check:
180	% if length(w)~=n
181	% error('I cannot find enough values on line %d.',nrx);
182	% end
183	% % fill the values in the matrix
184	% for i=1:n
185	% if strtype(i)==0 % we have a number:
186	% tmp = str2double(w{i});
187	% if isnan(tmp)
188	% error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
189	% end
190	% x(nrx,i) = tmp;
191	% else
192	% % we have to find matching strings for feature i:
193	% I = strmatch(w{i},strvals{i});
194	% if ~isempty(I) % it is found
195	% x(nrx,i) = I;
196	% else % we have to add this entry:
197	% x(nrx,i) = length(strvals{i})+1;
198	% strvals{i}{end+1} = w{i};
199	% end
200	% end
201	% end
202	% end
203	%
204	% fclose(fid);

Note: See TracBrowser for help on using the repository browser.

Download in other formats: