Changeset 97
- Timestamp:
- 09/24/14 16:04:03 (10 years ago)
- Location:
- prdatasets
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
prdatasets/car.m
r92 r97 17 17 fl = {'buying price' 'maintenance price' 'nr of doors' 'nr of persons' 'luggage boot' 'safety'}; 18 18 19 [a,strvals] = pr_readdataset('car.data',ones(7,1)); 20 labs = strvals{end}(a(:,end)); 21 for i=1:6 22 featdom{i} = strvcat(strvals{i}'); 23 end 24 x = pr_dataset(a(:,1:(end-1)), labs); 19 x = pr_readdataset('car.data',0,',',[],'ccccccc',7) 25 20 x = setfeatlab(x,fl); 26 x = setfeatdom(x,featdom);27 21 x = setuser(x,user); 28 22 x = setname(x,'Car'); -
prdatasets/pr_readdataset.m
%PR_READDATASET Convert text file into PRTools dataset
%
%   A = PR_READDATASET(FILE,NHEAD,DELIM,MISVAL,FORMAT,NLAB)
%
%INPUT
%   FILE   - filename
%   NHEAD  - number of headerlines to be skipped, default 0
%   DELIM  - delimiter characters, default ' '
%   MISVAL - character used for missing values, default '?'
%   FORMAT - format needed for interpreting feature types of columns.
%            default is determined from first line, e.g. 'nncc' for two
%            numeric and two categorical features, see SETFEATDOM and
%            CELL2DSET
%   NLAB   - feature to be interpreted as class label, default [].
%
%OUTPUT
%   A      - PRTools dataset
%
%DESCRIPTION
% The file is read with TEXTSCAN, every column as text, and the resulting
% cell array is handed to CELL2DSET together with FORMAT and MISVAL. When
% FORMAT is empty it is derived from the first line after the NHEAD header
% lines: a field is taken as numeric ('n') only if it consists of the
% characters 0-9 + , - . e E and can be parsed by STR2DOUBLE; any other
% non-blank field makes the column categorical ('c').
%
% An error is raised when FILE cannot be opened, or when FORMAT has to be
% derived and the file holds no line after the NHEAD header lines.
%
%SEE ALSO
%DATASETS, SETFEATDOM, CELL2DSET

% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com

function a = pr_readdataset(file,varargin)

  [nhead,del,misval,form,flab] = setdefaults(varargin,0,' ','?',[],[]);

  [fid,msg] = fopen(file);
  if fid < 0
    error('Cannot open file %s: %s',file,msg);
  end
  % The entry point owns the handle: it is released on every exit path,
  % including errors raised by textscan or cell2dset.
  closer = onCleanup(@() fclose(fid)); %#ok<NASGU>

  if isempty(form)                  % if no format given ...
    for j=1:nhead+1
      s = fgetl(fid);               % derive it from the first nonheader line
    end
    if ~ischar(s)                   % fgetl returns -1 at end of file
      error('File %s has no data line after skipping %d header line(s).', ...
            file,nhead);
    end
    fields = mytextscan(s,'c',del,0);  % split that single line into text fields
    form = getform(fields);            % 'n' for numeric fields, 'c' otherwise
    fseek(fid,0,-1);                   % restart
  end

  % Read all columns as text; the n/c distinction is passed to cell2dset.
  c = mytextscan(fid,strrep(form,'n','s'),del,nhead);
  a = cell2dset(c,form,misval);
  if ~isempty(flab)
    a = feat2lab(a,flab);
  end

return

%MYTEXTSCAN Run TEXTSCAN on a file identifier or on a character string
%
%   S = MYTEXTSCAN(SRC,FORMS,DEL,NHEAD)
%
% SRC is an open file identifier or a string holding the text to scan.
% FORMS holds one type character per column ('c' or 's'); it is expanded
% into the corresponding '%s%s...' TEXTSCAN format. The handle is not
% closed here, that is the responsibility of the caller.
function s = mytextscan(src,forms,del,nhead)

  % Build '%x%x...' by interleaving '%' with the type characters.
  spec = repmat('%%',1,numel(forms));
  spec(2:2:end) = forms;
  spec = strrep(spec,'c','s');

  % A delimiter consisting of spaces only means: use textscan's default
  % white-space splitting.
  if ~isempty(del) && all(del(:) == ' ')
    s = textscan(src,spec,'Headerlines',nhead);
  else
    s = textscan(src,spec,'Delimiter',del,'Headerlines',nhead);
  end

return

%GETFORM Derive the column format string from the fields of one line
%
%   FORM = GETFORM(S)
%
% S is the cell array returned by MYTEXTSCAN for a single line; S{1} holds
% the fields as strings. FORM gets one character per field: 'n' (numeric)
% or 'c' (categorical).
function form = getform(s)

  fields = cellstr(char(s{1}));
  nfields = size(char(s{1}),1);
  form = repmat('n',1,nfields);
  for j=1:nfields
    field = fields{j};
    if isempty(field) || all(field == ' ')
      continue                      % blank field: stays numeric
    end
    % Characters allowed in a number, listed explicitly (the hyphen is put
    % last so that it cannot form a range).
    badchar = ~isempty(regexp(field,'[^0-9+,.eE -]','once'));
    % Allowed characters alone are not sufficient: values like 'e', '-' or
    % '1.2.3' are not numbers, so the field also has to parse.
    if badchar || isnan(str2double(field))
      form(j) = 'c';
    end
  end

return


% Previous implementation (r92), kept for reference.
%
% function [x,strvals] = pr_readdataset(fname,strtype)
% % [X,STRVALS] = PR_READDATASET(FNAME)
% %
% % Read the dataset from the text file FNAME. It can process categorical
% % features, or features for which categories are given in text. A matrix
% % X is returned containing the numerical values, or integers. The
% % integers point to the entry in STRVALS containing for each
% % (categorical) feature its string members.
% %
% %   X = PR_READDATASET(FNAME,STRTYPE)
% %
% % The user can supply a vector STRTYPE that indicates for each feature
% % if it is numerical (0) or string/categorical (1).
% %
% %   X = PR_READDATASET(FNAME,STRTYPE,DELIMITER)
% %
% % For datasets that have a strange delimiter (not comma or space), you
% % have to supply it.
% if nargin<3
%    delimiter = ',';
% end
% if nargin<2
%    strtype = [];
% end
%
% % try to open the file
% [fid,message] = fopen(fname,'r');
% if fid==-1
%    disp(message)
%    error('I cannot open file %s.',fname);
% end
% % get the first line:
% dline = fgetl(fid);
% % check if the delimiter is present:
% I = find(dline==delimiter);
% if isempty(I)
%    delimiter = ' ';
%    I = find(dline==delimiter);
%    if isempty(I)
%       error('Cannot determine the delimiter');
%    end
% end
%
% % now run over all elements in the line:
% I = [0 I length(dline)+1];
% w = {};
% for i=1:length(I)-1
%    w{i} = dline((I(i)+1):(I(i+1)-1));
% end
%
% % remove the empty entries:
% I = zeros(length(w),1);
% for i=1:length(w)
%    if isempty(w{i})
%       I(i) = 1;
%    end
% end
% w(find(I)) = [];
% n = length(w);
% x = [];
%
% % see if we have strings or numbers, and put the result in the matrix:
% strvals = {};
% if isempty(strtype)
%    for i=1:n
%       num = str2double(w{i});
%       if isnan(num)  % the feature is string
%          strtype(i) = 1;  % remember that it is a string
%          strvals{i}{1} = w{i}; % put it to the collection
%          x(1,i) = 1;
%       else  % feature is a number, life is simple
%          strtype(i) = 0;
%          x(1,i) = num;
%       end
%    end
% else
%    for i=1:n
%       strtype(i) = 1;  % remember that it is a string
%       strvals{i}{1} = w{i}; % put it to the collection
%       x(1,i) = 1;
%    end
% end
% % now run over the other lines:
% nrx = 1;
% while 1
%    dline = fgetl(fid);
%    if ~ischar(dline), break, end  %end of file...
%
%    % now process this line:
%    nrx = nrx+1;
%    % find delimiters again:
%    I = find(dline==delimiter);
%    % cut out the words:
%    I = [0 I length(dline)+1];
%    w = {};
%    for i=1:length(I)-1
%       w{i} = dline((I(i)+1):(I(i+1)-1));
%    end
%    % remove the empty entries:
%    I = zeros(length(w),1);
%    for i=1:length(w)
%       if isempty(w{i})
%          I(i) = 1;
%       end
%    end
%    w(find(I)) = [];
%    % check:
%    if length(w)~=n
%       error('I cannot find enough values on line %d.',nrx);
%    end
%    % fill the values in the matrix
%    for i=1:n
%       if strtype(i)==0 % we have a number:
%          tmp = str2double(w{i});
%          if isnan(tmp)
%             error('It seems that feature %d is not numeric (encountered "%s" on line %d).',i,w{i},nrx);
%          end
%          x(nrx,i) = tmp;
%       else
%          % we have to find matching strings for feature i:
%          I = strmatch(w{i},strvals{i});
%          if ~isempty(I) % it is found
%             x(nrx,i) = I;
%          else % we have to add this entry:
%             x(nrx,i) = length(strvals{i})+1;
%             strvals{i}{end+1} = w{i};
%          end
%       end
%    end
% end
%
% fclose(fid);
Note: See TracChangeset
for help on using the changeset viewer.