[89] | 1 | %FLOWCYTO_2012 Load flow-cytometry dataset. |
---|
| 2 | % |
---|
| 3 | % A = FLOWCYTO_2012(TUBE,CLASS,YEAR,FLAG) |
---|
| 4 | % |
---|
| 5 | % INPUT |
---|
| 6 | % TUBE Vector of integers [1,5] pointing to the desired tube. |
---|
| 7 | % Default 3. |
---|
| 8 | % CLASS Vector of integer pointers in the list of class names, |
---|
| 9 | % aneuploid, diploid, multi-aneuploid, tetraploid, multi |
---|
| 10 | % aneuploid, peridiploid. Default [1,2,4] |
---|
| 11 | % YEAR Vector of integers [2005:2012] pointing to the desired years. |
---|
| 12 | % Default: all. |
---|
| 13 | % FLAG 0 - remove all repeated measurements, including first ones |
---|
| 14 | % 1 - take just first of repeated measurements |
---|
| 15 | % 2 - take just last ones of repeated measurements |
---|
| 16 | % |
---|
| 17 | % OUTPUT |
---|
| 18 | % A Dataset. |
---|
| 19 | % |
---|
| 20 | % DESCRIPTION |
---|
| 21 | % These datasets are based on FL3-A DNA flowcytometer histograms from |
---|
| 22 | % breast cancer tissues in 1024 resolution. The initial data have been |
---|
| 23 | % acquired by M. Nap and N. van Rodijnen of the Atrium Medical Center in |
---|
| 24 | % Heerlen, The Netherlands, during 2005-2012, using tubes 1-5 of a DACO |
---|
| 25 | % Galaxy flowcytometer (TUBE = 1-5). |
---|
| 26 | % |
---|
| 27 | % Histograms are labeled in 3 classes: aneuploid (547), diploid (457) and |
---|
| 28 | % tetraploid(152). The numbers report the numbers of histograms of Tube 3. |
---|
| 29 | % The first two and the last two bins (1,2,1023,1024) of every histogram |
---|
| 30 | % are removed as they may contain noise. After that histograms are |
---|
| 31 | % normalized (sum to one) resulting in a dataset with 1020 features. |
---|
| 32 | % |
---|
| 33 | % In case multiple tubes are requested (number of elements in TUBE > 1) the |
---|
| 34 | % corresponding datasets are concatenated, but patients for which a |
---|
| 35 | % histogram is not available for all tubes are removed. |
---|
| 36 | % |
---|
| 37 | % The final dataset is multi-labeled, see MULTI_LABELING, with the |
---|
| 38 | % following names for the label lists: |
---|
| 39 | % 1 default unlabeled |
---|
| 40 | % 2 Number patient number |
---|
| 41 | % 3 Tube Tube number (1:5) |
---|
| 42 | % 4 Class 6 possible histogram labels |
---|
| 43 | % 5 Year year of measurement (2005:2012) |
---|
| 44 | % The actual label list may be changed by CHANGELABLIST. Standard 'Class' |
---|
| 45 | % is returned in A. |
---|
| 46 | % |
---|
| 47 | % SEE ALSO |
---|
| 48 | % PRTOOLS, DATASETS, MULTI_LABELING, CHANGELABLIST |
---|
| 49 | |
---|
| 50 | % Copyright: R.P.W. Duin, r.p.w.duin@37steps.com |
---|
| 51 | |
---|
| 52 | |
---|
function a = flowcyto_2012(varargin)
% Entry point: fetch the stored dataset, reduce it to the requested tubes,
% classes and years, and return the normalized result labeled by 'Class'.

% Defaults: tube 3, classes [1 2 4], years 2005:2012, FLAG 1.
[tubes,classes,years,flag] = setdefaults(varargin,3,[1 2 4],2005:2012,1);

% NOTE: keep this call directly in this function. pr_getdata derives the
% local file name from its caller through a fixed call-stack depth.
a = pr_getdata('http://37steps.com/data/prdatasets/FlowCyto_2012.mat',6,[],'a',1);
% Local alternative to the download:
%s = load(fullfile(fileparts(which(mfilename)),'flowcyto_2012.mat'));
%a = s.a;

ntubes = numel(tubes);

% Split by the 'Tube' label list; the result is indexed by tube number below.
pertube = selclass(a,{},'Tube');

% Filter every requested tube on its own.
selected = cell(1,ntubes);
for t = 1:ntubes
  selected{t} = singletube(pertube{tubes(t)},classes,years,flag);
end
a = vertcat(selected{:});

if ntubes > 1
  % Work on patient numbers and discard small 'Number' classes; per the
  % file header this removes patients lacking a histogram for some tube.
  a = changelablist(a,'Number');
  a = remclass(a,ntubes-1);
  a = changelablist(a,'Class');
end

% Normalization mapping (histograms sum to one, see file header).
a = a*normm;
return
---|
| 73 | |
---|
| 74 | |
---|
function a = singletube(a,classes,years,flag)
% Reduce the dataset of a single tube to the requested CLASSES and YEARS
% and resolve repeated measurements of the same patient according to FLAG:
%   0 - drop every measurement of a patient that occurs more than once
%   1 - keep only the first measurement of such a patient
%   2 - keep only the last measurement of such a patient
% Any other FLAG raises an error. The result is labeled by 'Class'.

a = selclass(a,classes,'Class');
% Years are passed as offsets from 2004 (2005 -> 1, ...); presumably these
% are indices into the 'Year' label list - verify against the stored data.
a = selclass(a,years-2004,'Year');
a = changelablist(a,'Number');
a = remclass(a);

% Pick, for a list of rows belonging to one patient, the rows to discard.
if flag == 0
  reject = @(rows) rows;
elseif flag == 1
  reject = @(rows) rows(2:end);
elseif flag == 2
  reject = @(rows) rows(1:end-1);
else
  error('FLAG has wrong value')
end

patient = getnlab(a);          % patient index of every row
counts  = classsizes(a);       % number of rows per patient
keep    = true(1,size(a,1));
repeated = find(counts>1);     % patients with more than one measurement
for i = 1:numel(repeated)
  rows = find(patient==repeated(i));
  keep(reject(rows)) = false;
end
a = changelablist(a(keep,:),'Class');

return
---|
| 119 | |
---|
| 120 | %PR_GETDATA Loads PRTOOLS dataset for any toolbox |
---|
| 121 | % |
---|
| 122 | % OUT = PR_GETDATA(URL,SIZE,DSET,FIELD,ASK) |
---|
| 123 | % |
---|
| 124 | % Checks the availability of the particular dataset or datafile DSET. By |
---|
| 125 | % default DSET is COMMAND.mat in which COMMAND is the name of the calling |
---|
| 126 | % m-file. If this is not available in the directory of COMMAND the URL will |
---|
| 127 | % be downloaded. If ASK = true (default), the user is asked for approval. |
---|
| 128 | % If given, SIZE (in MByte) is displayed in the request. |
---|
| 129 | % |
---|
| 130 | % If available, the dataset or datafile stored in FIELD is returned in OUT. |
---|
| 131 | % If not, but download was successful OUT is empty, otherwise an error is |
---|
| 132 | % generated. |
---|
| 133 | % |
---|
| 134 | % This is a low-level routine, typically used in COMMAND and not called |
---|
| 135 | % from the command line. COMMAND should take care that a proper mat-file |
---|
| 136 | % is constructed and returned to the user. |
---|
| 137 | % |
---|
| 138 | % SEE ALSO |
---|
| 139 | % DATASETS, DATAFILES |
---|
| 140 | |
---|
| 141 | % Copyright: R.P.W. Duin, r.p.w.duin@37steps.com |
---|
| 142 | |
---|
function out = pr_getdata(varargin)
% Return the content of a locally stored dataset file, downloading it from
% the given URL first when it cannot be loaded from disk.
%
% Arguments (all optional, in this order): URL, SIZE (MByte, only shown in
% the prompt), DSET (file name), FIELD (field to return), ASK (default
% true: ask the user before downloading).
%
% Errors: 'Dataset not found' when the user declines the download,
% 'Download failed' when the download reports failure.
%
% Local variables:
% name   : name of calling routine, might be used for the dataset
% url    : url of dataset
% uname  : dataset name as used in url
% dset   : becomes full path and name of dataset
% ddir   : becomes full path of dataset
% mbytes : dataset size in MByte (not called SIZE, to keep the builtin usable)

% callername looks at a fixed call-stack depth, so it has to be called
% directly from this function and not from a helper.
name = callername;
argin = setdefaults(varargin,[],[],[],[],true);
[url,mbytes,dset,field,ask] = deal(argin{:});
[dummy,uname,ext] = fileparts(url);

% Store the data next to the calling m-file, or in the current directory
% when there is no caller.
if isempty(name)
  ddir = pwd;
else
  ddir = fileparts(which(name));
end

% Default file name: caller name (or name used in the url) plus extension.
if isempty(dset)
  if isempty(name)
    dset = [uname ext];
  else
    dset = [name ext];
  end
end
dset = fullfile(ddir,dset);

out = tryload(dset,field);
if isempty(out)
  if ask
    if ~isempty(mbytes)
      siz = ['(' num2str(mbytes) ' MB)'];
    else
      siz = '';
    end
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    % Empty answer means yes; accept 'y' in either case.
    if ~isempty(q) && ~strcmpi(q,'y')
      error('Dataset not found')
    end
  end
  % download in dir of dset
  ddir = fileparts(dset);
  status = prdownload(url,ddir);
  % Check the outcome before touching the file system, otherwise a failed
  % download surfaces as an unrelated movefile error.
  % NOTE(review): assumes prdownload returns a false status on failure
  % rather than raising an error itself - confirm.
  if ~status
    error('Download failed')
  end
  % get naming consistent, avoid Matlab naming problems with capitals
  % (renamed through a temporary name so a change of case only also works)
  movefile(fullfile(ddir,[uname ext]),[dset 'temp']);
  movefile([dset 'temp'],dset);
  out = tryload(dset,field);
end
---|
| 196 | |
---|
function out = tryload(dset,field)
% Load FIELD from the file DSET. Returns [] when DSET is not an existing
% file. When the loaded content is a structure, the requested FIELD (or the
% first field if FIELD is empty) is returned; any other content is returned
% unchanged.
out = [];
if exist(dset,'file') ~= 2
  return
end

loaded = prload(dset);
if ~isstruct(loaded)
  out = loaded;
  return
end

if isempty(field)
  names = fieldnames(loaded);
  key = names{1};
else
  key = field;
end
out = getfield(loaded,key);
---|
| 212 | |
---|
function name = callername
% Name of the routine that called the caller of this function, taken from
% the third entry of the call stack; [] when the stack is shallower.
% Must be called directly from the function whose caller is wanted.
stack = dbstack;
if numel(stack) >= 3
  name = stack(3).name;
else
  name = [];
end
---|
| 220 | |
---|
| 221 | |
---|