function [x,strvals] = pr_readdataset(fname,strtype,delimiter)
% [X,STRVALS] = PR_READDATASET(FNAME)
% [X,STRVALS] = PR_READDATASET(FNAME,STRTYPE)
% [X,STRVALS] = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
%
% Read the dataset from the text file FNAME, one object per line and one
% feature per delimited field. Features can be numerical or categorical
% (categories given as text).
%
% INPUT
%   FNAME      Name of the text file to read.
%   STRTYPE    Optional vector with one entry per feature: 0 for a
%              numerical feature, 1 for a string/categorical feature.
%              A scalar is applied to all features. When empty or
%              omitted, the type of each feature is derived from the
%              first line: a field that STR2DOUBLE cannot convert is
%              taken to be categorical.
%   DELIMITER  Optional single character that separates the fields
%              (default ','). When it does not occur in the first line
%              of the file, a space is tried instead.
%
% OUTPUT
%   X          Matrix with one row per line of the file. Numerical
%              features hold their value, categorical features hold an
%              integer index into the corresponding entry of STRVALS.
%   STRVALS    Cell array; STRVALS{i} is a cell array with the distinct
%              strings of categorical feature i, in order of first
%              appearance. Entries of numerical features are empty, and
%              STRVALS is not padded beyond the last categorical feature.
%
% Empty fields (e.g. caused by repeated delimiters) are discarded. An
% error is raised when the file cannot be opened or is empty, when no
% delimiter can be found, when a line does not contain the same number of
% fields as the first line, or when a field of a numerical feature cannot
% be converted to a number.

if nargin<3 || isempty(delimiter)
	delimiter = ',';
end
if nargin<2
	strtype = [];
end
if ~ischar(delimiter) || numel(delimiter)~=1
	error('DELIMITER should be a single character.');
end

% try to open the file
[fid,message] = fopen(fname,'r');
if fid==-1
	disp(message)
	error('I cannot open file %s.',fname);
end
% make sure the file is closed again, also when an error is raised below
closer = onCleanup(@() fclose(fid)); %#ok<NASGU>

% get the first line:
dline = fgetl(fid);
if ~ischar(dline)
	error('File %s is empty.',fname);
end
% check if the delimiter is present, otherwise fall back to a space:
if ~any(dline==delimiter)
	delimiter = ' ';
	if ~any(dline==delimiter)
		error('Cannot determine the delimiter');
	end
end

% the first line defines the number of features:
w = split_fields(dline,delimiter);
n = length(w);

% find out which features are strings and which are numbers:
if isempty(strtype)
	strtype = zeros(1,n);
	for i=1:n
		strtype(i) = isnan(str2double(w{i}));
	end
else
	strtype = strtype(:)';
	if numel(strtype)==1
		% a scalar holds for all features
		strtype = repmat(strtype,1,n);
	elseif numel(strtype)~=n
		error('STRTYPE should have one entry per feature (%d given, %d features found).',...
			numel(strtype),n);
	end
end

% start an empty collection of strings for each categorical feature:
strvals = {};
for i=find(strtype)
	strvals{i} = {};
end

% now run over all the lines, the first one is already in w:
x = [];
nrx = 0;
while 1
	nrx = nrx+1;
	% fill the values in the matrix
	for i=1:n
		if strtype(i)==0 % we have a number:
			tmp = str2double(w{i});
			if isnan(tmp)
				error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
			end
			x(nrx,i) = tmp;
		else
			% find the exactly matching string for feature i (a prefix
			% match would mix up e.g. 'a' and 'ab'):
			I = find(strcmp(w{i},strvals{i}),1);
			if isempty(I) % we have to add this entry:
				strvals{i}{end+1} = w{i};
				I = length(strvals{i});
			end
			x(nrx,i) = I;
		end
	end

	% get the next line:
	dline = fgetl(fid);
	if ~ischar(dline), break, end %end of file...
	w = split_fields(dline,delimiter);
	if length(w)~=n
		error('I cannot find enough values on line %d.',nrx+1);
	end
end

function w = split_fields(dline,delimiter)
% Cut the line DLINE at each DELIMITER and return the non-empty pieces
% in the cell array W.
cuts = [0 find(dline==delimiter) length(dline)+1];
w = cell(1,length(cuts)-1);
for i=1:length(w)
	w{i} = dline((cuts(i)+1):(cuts(i+1)-1));
end
% remove the empty entries:
w(cellfun('isempty',w)) = [];