function [x,strvals] = prreaddataset(fname,strtype,delimiter)
%      [X,STRVALS] = PRREADDATASET(FNAME)
%
% Read the dataset from the text file FNAME. Each line of the file is one
% object (row of X), and the values on a line are separated by a
% delimiter. Features may be numerical or categorical (given as text).
% A matrix X is returned containing the numerical values, or integers.
% For a categorical feature i the integer in X(:,i) is an index into
% STRVALS{i}, a cell array holding the distinct strings of that feature
% in order of first appearance. Entries of STRVALS belonging to
% numerical features are left empty.
%
% When STRTYPE is not given (or empty), the type of each feature is
% derived from the first line of the file: a value that STR2DOUBLE cannot
% convert to a number makes the feature categorical.
%
%      [X,STRVALS] = PRREADDATASET(FNAME,STRTYPE)
%
% The user can supply a vector STRTYPE that indicates for each feature
% if it is numerical (0) or string/categorical (nonzero). Its length
% must equal the number of values on the first line.
%
%      [X,STRVALS] = PRREADDATASET(FNAME,STRTYPE,DELIMITER)
%
% For datasets that have a strange delimiter (not comma or space), you
% have to supply it as a single character. The default is a comma. If
% the delimiter does not occur on the first line, a space is tried
% instead. Empty fields (e.g. caused by repeated delimiters) are dropped.
%
% An error is raised when the file cannot be opened or is empty, when no
% delimiter can be found, when a line does not contain the same number of
% values as the first line, or when a numerical feature holds a value
% that cannot be converted to a number.

if nargin<3 || isempty(delimiter)
    delimiter = ',';
end
if nargin<2
    strtype = [];
end

% try to open the file
[fid,message] = fopen(fname,'r');
if fid==-1
    disp(message)
    error('I cannot open file %s.',fname);
end
% make sure the file is closed again, also when an error is raised below
closer = onCleanup(@() fclose(fid));

% get the first line:
dline = fgetl(fid);
if ~ischar(dline)
    % fgetl returns -1 when there is nothing to read
    error('File %s is empty.',fname);
end
% check if the delimiter is present, otherwise fall back to a space:
if ~any(dline==delimiter)
    delimiter = ' ';
    if ~any(dline==delimiter)
        error('Cannot determine the delimiter');
    end
end

% the first line defines the number of features:
w = prreaddataset_split(dline,delimiter);
n = length(w);

% decide for each feature if it is a number or a string:
if isempty(strtype)
    strtype = zeros(1,n);
    for i=1:n
        strtype(i) = isnan(str2double(w{i}));
    end
elseif numel(strtype)~=n
    error('STRTYPE has %d elements, but the first line contains %d values.',...
        numel(strtype),n);
end

% start with an empty string collection for each categorical feature:
strvals = {};
for i=1:n
    if strtype(i)~=0
        strvals{i} = {};
    end
end

% now run over all lines (the first line is already in dline):
x = [];
nrx = 0;
while ischar(dline)   % fgetl returns -1 at the end of the file
    nrx = nrx+1;
    w = prreaddataset_split(dline,delimiter);
    if length(w)~=n
        error('I cannot find enough values on line %d.',nrx);
    end
    % fill the values in the matrix
    for i=1:n
        if strtype(i)==0 % we have a number:
            tmp = str2double(w{i});
            if isnan(tmp)
                error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
            end
            x(nrx,i) = tmp;
        else
            % find the exactly matching string for feature i (a prefix
            % match would confuse e.g. 'a' and 'ab'):
            I = find(strcmp(w{i},strvals{i}),1);
            if ~isempty(I) % it is found
                x(nrx,i) = I;
            else % we have to add this entry:
                strvals{i}{end+1} = w{i};
                x(nrx,i) = length(strvals{i});
            end
        end
    end
    dline = fgetl(fid);
end

function w = prreaddataset_split(dline,delimiter)
% Cut DLINE into the words between the DELIMITER characters and drop the
% empty words.
cuts = [0 find(dline==delimiter) length(dline)+1];
w = cell(1,length(cuts)-1);
for k=1:length(w)
    w{k} = dline((cuts(k)+1):(cuts(k+1)-1));
end
w(cellfun('isempty',w)) = [];