source: prdatasets/pr_readdataset.m @ 114

Last change on this file since 114 was 97, checked in by bduin, 10 years ago
File size: 5.2 KB
Line 
1%PR_READDATASET Convert text file into PRTools dataset
2%
3%   A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
4%
5%INPUT
6%  FILE       - filename
7%  NHEAD      - number of header lines to be skipped, default 0
8%  DELIM      - delimiter characters, default ' '
9%  MISVAL     - character used for missing values, default '?'
10%  FORMAT     - format needed for interpreting feature types of columns.
11%               Default is derived from the first non-header line, e.g.
12%               'nncc' for two numeric and two categorical features, see
13%               SETFEATDOM and CELL2DSET
14%  NLAB       - feature to be interpreted as class label, default [].
15%
16%OUTPUT
17%  A          - PRTools dataset
18%
19%SEE ALSO
20%DATASETS, SETFEATDOM, CELL2DSET
21
22% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
23
function a = pr_readdataset(file,varargin)

  % Read a delimited text file and convert it into a dataset.
  %
  % Optional arguments, in order, with their defaults:
  %   NHEAD (0), DELIM (' '), MISVAL ('?'), FORMAT ([]), NLAB ([]).
  % All fields are read as strings; the conversion to numeric/categorical
  % features is delegated to CELL2DSET using the format string.
  % Errors if the file cannot be opened, or if no format is given and the
  % file holds no data line after the NHEAD header lines.
  [nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);

  [fid,msg] = fopen(file);
  if fid < 0              % fopen signals failure by returning -1
    error(msg)
  end

  if isempty(form)        % no format given: derive it from the data
    % Skip the header lines; after the loop s holds the first data line.
    for j=1:nhead+1
      s = fgetl(fid);
      if ~ischar(s)       % fgetl returns -1 when the end of file is hit
        % Close the file before failing, so no identifier is leaked, and
        % report the real cause instead of passing -1 on to textscan.
        fclose(fid);
        error('File %s contains no data line after %d header line(s).', ...
              file,nhead);
      end
    end
    s = mytextscan(s,'c',del,0); % split the line into string fields
    form = getform(s);    % one letter per field: 'n' numeric, 'c' categorical
    frewind(fid);         % restart, the full file is read below
  end

  % Read every column as a string ('n' -> 's'; 'c' is mapped to 's' inside
  % mytextscan, which also closes the file).
  c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
  a = cell2dset(c,form,misval);
  if ~isempty(flab)
    a = feat2lab(a,flab); % use feature FLAB as class label
  end

return
47
function s = mytextscan(fid,forms,del,nhead)
  % Call TEXTSCAN on FID (a file identifier or a character string) with
  % one conversion per character of FORMS. Every 'c' in FORMS is read as
  % '%s'. NHEAD header lines are skipped. If FID is not a character
  % string it is closed after reading.

  % Interleave '%' with the type letters, e.g. 'ssc' -> '%s%s%c' -> '%s%s%s'.
  spec = repmat('%',1,2*numel(forms));
  spec(2:2:end) = forms;
  spec = strrep(spec,'c','s');

  % A blank delimiter means: rely on textscan's default field separation.
  if del == ' '
    delopt = {};
  else
    delopt = {'Delimiter',del};
  end
  s = textscan(fid,spec,delopt{:},'Headerlines',nhead);

  % Strings have nothing to close; anything else is treated as an open file.
  if ~ischar(fid)
    fclose(fid);
  end
return
61
function form = getform(s)
  % Derive a format string from the fields in S{1}: one character per
  % field, 'n' if the field holds only number-like characters, 'c'
  % otherwise.
  rows = char(s{1});              % one field per row, blank padded
  nfields = size(rows,1);

  % A field counts as numeric when it has no character outside the set
  % below. NOTE(review): '+-.' inside the brackets is a character range
  % ('+' to '.'), so ',' is accepted as well -- confirm this is intended.
  isnum = true(1,nfields);
  for k = 1:nfields
    isnum(k) = isempty(regexp(rows(k,:),'[^0-9+-.eE ]','once'));
  end

  form = repmat('c',1,nfields);
  form(isnum) = 'n';
return
71
72
73% function [x,strvals] = pr_readdataset(fname,strtype)
74% %     [X,STRVALS] = PR_READDATASET(FNAME)
75% %
76% % Read the dataset from the text file FNAME. It can process categorical
77% % features, or features for which categories are given in text. A matrix
78% % X is returned containing the numerical values, or integers. The
79% % integers point to the entry in STRVALS containing for each
80% % (categorical) feature its string members.
81% %
82% %     X = PR_READDATASET(FNAME,STRTYPE)
83% %
84% % The user can supply a vector STRTYPE that indicates for each feature
85% % if it is numerical (0) or string/categorical (1).
86% %
87% %     X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
88% %
89% % For datasets that have a strange delimiter (not comma or space), you
90% % have to supply it.
91% if nargin<3
92%       delimiter = ',';
93% end
94% if nargin<2
95%       strtype = [];
96% end
97%
98% % try to open the file
99% [fid,message] = fopen(fname,'r');
100% if fid==-1
101%       disp(message)
102%       error('I cannot open file %s.',fname);
103% end
104% % get the first line:
105% dline = fgetl(fid);
106% % check if the delimiter is present:
107% I = find(dline==delimiter);
108% if isempty(I)
109%       delimiter = ' ';
110%       I = find(dline==delimiter);
111%       if isempty(I)
112%               error('Cannot determine the delimiter');
113%       end
114% end
115%
116% % now run over all elements in the line:
117% I = [0 I length(dline)+1];
118% w = {};
119% for i=1:length(I)-1
120%       w{i} = dline((I(i)+1):(I(i+1)-1));
121% end
122%
123% % remove the empty entries:
124% I = zeros(length(w),1);
125% for i=1:length(w)
126%       if isempty(w{i})
127%               I(i) = 1;
128%       end
129% end
130% w(find(I)) = [];
131% n = length(w);
132% x = [];
133%
134% % see if we have strings or numbers, and put the result in the matrix:
135% strvals = {};
136% if isempty(strtype)
137%       for i=1:n
138%               num = str2double(w{i});
139%               if isnan(num)   % the feature is string
140%                       strtype(i) = 1;   % remember that it is a string
141%                       strvals{i}{1} = w{i}; % put it to the collection
142%                       x(1,i) = 1;
143%               else               % feature is a number, life is simple
144%                       strtype(i) = 0;
145%                       x(1,i) = num;
146%               end
147%       end
148% else
149%       for i=1:n
150%               strtype(i) = 1;   % remember that it is a string
151%               strvals{i}{1} = w{i}; % put it to the collection
152%               x(1,i) = 1;
153%       end
154% end
155% % now run over the other lines:
156% nrx = 1;
157% while 1
158%       dline = fgetl(fid);
159%       if ~ischar(dline), break, end  %end of file...
160%
161%       % now process this line:
162%       nrx = nrx+1;
163%       % find delimiters again:
164%       I = find(dline==delimiter);
165%       % cut out the words:
166%       I = [0 I length(dline)+1];
167%       w = {};
168%       for i=1:length(I)-1
169%               w{i} = dline((I(i)+1):(I(i+1)-1));
170%       end
171%       % remove the empty entries:
172%       I = zeros(length(w),1);
173%       for i=1:length(w)
174%               if isempty(w{i})
175%                       I(i) = 1;
176%               end
177%       end
178%       w(find(I)) = [];
179%       % check:
180%       if length(w)~=n
181%               error('I cannot find enough values on line %d.',nrx);
182%       end
183%       % fill the values in the matrix
184%       for i=1:n
185%               if strtype(i)==0 % we have a number:
186%                       tmp = str2double(w{i});
187%                       if isnan(tmp)
188%                               error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
189%                       end
190%                       x(nrx,i) = tmp;
191%               else
192%                       % we have to find matching strings for feature i:
193%                       I = strmatch(w{i},strvals{i});
194%                       if ~isempty(I)  % it is found
195%                               x(nrx,i) = I;
196%                       else  % we have to add this entry:
197%                               x(nrx,i) = length(strvals{i})+1;
198%                               strvals{i}{end+1} = w{i};
199%                       end
200%               end
201%       end
202% end
203%
204% fclose(fid);
Note: See TracBrowser for help on using the repository browser.