source: prdatasets/pr_readdataset.m @ 96

Last change on this file since 96 was 92, checked in by bduin, 10 years ago
File size: 3.0 KB
Line 
function [x,strvals] = pr_readdataset(fname,strtype,delimiter)
%     [X,STRVALS] = PR_READDATASET(FNAME)
%
% Read the dataset from the text file FNAME. Every line of the file is
% one object, every delimiter-separated token on a line is one feature.
% Features can be numerical or categorical (given as text). The matrix X
% (one row per line, one column per feature) contains the numerical
% values, or, for categorical features, integer indices. The index
% X(J,I) of a categorical feature I points to the entry STRVALS{I}{X(J,I)},
% where STRVALS{I} is the cell array with the distinct strings of
% feature I in order of first appearance.
%
% Without STRTYPE the type of each feature is derived from the first
% line: a token that STR2DOUBLE cannot convert is taken as categorical.
%
%     X = PR_READDATASET(FNAME,STRTYPE)
%
% The user can supply a vector STRTYPE that indicates for each feature
% if it is numerical (0) or string/categorical (1). A scalar STRTYPE is
% applied to all features. An empty STRTYPE means 'derive from the first
% line'.
%
%     X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
%
% For datasets that have a strange delimiter (not comma or space), you
% have to supply it. DELIMITER should be a single character (default
% ','). When the delimiter does not occur on the first line, a space is
% tried instead. Empty tokens (for instance caused by repeated
% delimiters) are ignored.
%
% An error is generated when the file cannot be opened or is empty, when
% no delimiter can be found, when a line does not contain the same
% number of values as the first line, or when a numerical feature
% contains text.
if nargin<3
    delimiter = ',';
end
if nargin<2
    strtype = [];
end
if ~ischar(delimiter) || numel(delimiter)~=1
    error('The delimiter should be a single character.');
end

% try to open the file
[fid,message] = fopen(fname,'r');
if fid==-1
    disp(message)
    error('I cannot open file %s.',fname);
end
% make sure the file is closed again, also when we leave with an error:
closefile = onCleanup(@() fclose(fid));

% get the first line:
dline = fgetl(fid);
if ~ischar(dline)
    error('File %s is empty.',fname);
end
% check if the delimiter is present, otherwise fall back to a space:
if ~any(dline==delimiter)
    delimiter = ' ';
    if ~any(dline==delimiter)
        error('Cannot determine the delimiter');
    end
end

% the first line defines the number of features:
w = pr_splitline(dline,delimiter);
n = length(w);

% see if we have strings or numbers:
if isempty(strtype)
    % derive it from the first line
    strtype = zeros(1,n);
    for i=1:n
        strtype(i) = isnan(str2double(w{i}));
    end
else
    % use what the user supplied
    if numel(strtype)==1
        strtype = repmat(strtype,1,n);
    end
    if numel(strtype)~=n
        error('STRTYPE has %d elements, but I find %d features in the file.',...
            numel(strtype),n);
    end
end

% start with an empty collection for each of the string features:
strvals = {};
for i=1:n
    if strtype(i)~=0
        strvals{i} = {};
    end
end

% now run over all the lines, W holds the values of line NRX:
x = [];
nrx = 1;
while 1
    % fill the values in the matrix
    for i=1:n
        if strtype(i)==0 % we have a number:
            tmp = str2double(w{i});
            if isnan(tmp)
                error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
            end
            x(nrx,i) = tmp;
        else
            % we have to find the matching string for feature i. It
            % should be an exact match: 'a' is not the same as 'ab'.
            I = find(strcmp(w{i},strvals{i}),1);
            if ~isempty(I)  % it is found
                x(nrx,i) = I;
            else  % we have to add this entry:
                x(nrx,i) = length(strvals{i})+1;
                strvals{i}{end+1} = w{i};
            end
        end
    end

    % get the next line:
    dline = fgetl(fid);
    if ~ischar(dline), break, end  %end of file...
    nrx = nrx+1;
    w = pr_splitline(dline,delimiter);
    % check:
    if length(w)~=n
        error('I cannot find enough values on line %d.',nrx);
    end
end

% the file is closed by the onCleanup object when we return.

function w = pr_splitline(dline,delimiter)
% Cut the string DLINE at each DELIMITER and return the non-empty pieces
% in the cell array W.
I = [0 find(dline==delimiter) length(dline)+1];
w = cell(1,length(I)-1);
for i=1:length(w)
    w{i} = dline((I(i)+1):(I(i+1)-1));
end
% remove the empty entries:
w(cellfun('isempty',w)) = [];
Note: See TracBrowser for help on using the repository browser.