1 | %FLOWCYTO_2012 Load flow-cytometry dataset. |
---|
2 | % |
---|
3 | % A = FLOWCYTO_2012(TUBE,CLASS,YEAR,FLAG) |
---|
4 | % |
---|
5 | % INPUT |
---|
6 | % TUBE Vector of integers [1,5] pointing to the desired tube. |
---|
7 | % Default 3. |
---|
8 | % CLASS Vector of integer pointers in the list of class names, |
---|
9 | % aneuploid, diploid, multi-aneuploid, tetraploid, multi |
---|
10 | % aneuploid, peridiploid. Default [1,2,4] |
---|
11 | % YEAR Vector of integers [2005:2012] pointing to the desired years. |
---|
12 | % Default: all. |
---|
13 | % FLAG 0 - remove all repeated measurements, including first ones |
---|
14 | % 1 - take just first of repeated measurements (default) |
---|
15 | % 2 - take just last ones of repeated measurements |
---|
16 | % |
---|
17 | % OUTPUT |
---|
18 | % A Dataset. |
---|
19 | % |
---|
20 | % DESCRIPTION |
---|
21 | % These datasets are based on FL3-A DNA flowcytometer histograms from |
---|
22 | % breast cancer tissues in 1024 resolution. The initial data have been |
---|
23 | % acquired by M. Nap and N. van Rodijnen of the Atrium Medical Center in |
---|
24 | % Heerlen, The Netherlands, during 2005-2012, using tubes 1-5 of a DACO |
---|
25 | % Galaxy flowcytometer (TUBE = 1-5). |
---|
26 | % |
---|
27 | % Histograms are labeled in 3 classes: aneuploid (547), diploid (457) and |
---|
28 | % tetraploid(152). The numbers report the numbers of histograms of Tube 3. |
---|
29 | % The first two and the last two bins (1,2,1023,1024) of every histogram |
---|
30 | % are removed as they may contain noise. After that histograms are |
---|
31 | % normalized (sum to one) resulting in a dataset with 1020 features. |
---|
32 | % |
---|
33 | % In case multiple tubes are requested (number of elements in TUBE > 1) the |
---|
34 | % corresponding datasets are concatenated, but patients are removed for |
---|
35 | % which not for all tubes a histogram is available. |
---|
36 | % |
---|
37 | % The final dataset is multi-labeled, see MULTI_LABELING, with the |
---|
38 | % following names for the label lists: |
---|
39 | % 1 default unlabeled |
---|
40 | % 2 Number patient number |
---|
41 | % 3 Tube Tube number (1:5) |
---|
42 | % 4 Class 6 possible histogram labels |
---|
43 | % 5 Year year of measurement (2005:2012) |
---|
44 | % The actual label list may be changed by CHANGELABLIST. Standard 'Class' |
---|
45 | % is returned in A. |
---|
46 | % |
---|
47 | % SEE ALSO |
---|
48 | % PRTOOLS, DATASETS, MULTI_LABELING, CHANGELABLIST |
---|
49 | |
---|
50 | % Copyright: R.P.W. Duin, r.p.w.duin@37steps.com |
---|
51 | |
---|
52 | |
---|
function a = flowcyto_2012(varargin)
% Main entry point: fetch the stored dataset, build one sub-dataset per
% requested tube and stack them.
%
%   varargin : optional TUBE, CLASS, YEAR, FLAG; missing ones default to
%              3, [1 2 4], 2005:2012 and 1 respectively.
%   a        : resulting dataset after normalisation by NORMM.

[tubes,classes,years,flag] = setdefaults(varargin,3,[1 2 4],2005:2012,1);
a = pr_getdata('http://37steps.com/data/prdatasets/FlowCyto_2012.mat',6,[],'a',1);
% Alternative kept for reference: load the mat-file next to this m-file.
%s = load(fullfile(fileparts(which(mfilename)),'flowcyto_2012.mat'));
%a = s.a;

% Split by the 'Tube' label list; the result is brace-indexed by tube number.
pertube = selclass(a,{},'Tube');

ntubes = numel(tubes);
parts  = cell(1,ntubes);
for t = 1:ntubes
  parts{t} = singletube(pertube{tubes(t)},classes,years,flag);
end
a = vertcat(parts{:});

if ntubes > 1
  % Work on the patient-number labeling and call REMCLASS with
  % ntubes-1, then restore the 'Class' labeling. Presumably this drops
  % patients that lack a histogram for some requested tube (as the file
  % header describes) -- confirm against REMCLASS.
  a = changelablist(a,'Number');
  a = remclass(a,ntubes-1);
  a = changelablist(a,'Class');
end

a = a*normm;
return
---|
73 | |
---|
74 | |
---|
function a = singletube(a,classes,years,flag)
% Select classes and years for the dataset of a single tube and resolve
% repeated measurements of the same patient number.
%
%   a       : dataset of one tube (in), filtered dataset with the 'Class'
%             label list active (out)
%   classes : indices into the 'Class' label list
%   years   : calendar years; shifted by 2004 to index the 'Year' label list
%   flag    : 0 - drop every patient number that occurs more than once
%             1 - keep only the first occurrence of each such patient number
%             2 - keep only the last occurrence of each such patient number
%
% Raises an error for any other value of FLAG.

a = selclass(a,classes,'Class');
a = selclass(a,years-2004,'Year');
a = changelablist(a,'Number');
a = remclass(a);

% Translate FLAG into a policy first; invalid values fail here.
if flag == 0
  policy = 'none';
elseif flag == 1
  policy = 'first';
elseif flag == 2
  policy = 'last';
else
  error('FLAG has wrong value')
end

nlab   = getnlab(a);            % patient-number label index per object
counts = classsizes(a);         % number of objects per patient number
keep   = true(1,size(a,1));

% Visit every patient number that has more than one measurement.
for p = reshape(find(counts>1),1,[])
  idx = find(nlab==p);
  switch policy
    case 'none'
      drop = idx;
    case 'first'
      drop = idx(2:end);
    case 'last'
      drop = idx(1:end-1);
  end
  keep(drop) = false;
end

a = changelablist(a(keep,:),'Class');

return
---|
119 | |
---|
120 | %PR_GETDATA Loads PRTOOLS dataset for any toolbox |
---|
121 | % |
---|
122 | % OUT = PR_GETDATA(URL,SIZE,DSET,FIELD,ASK) |
---|
123 | % |
---|
124 | % Checks the availability of the particular dataset or datafile DSET. By |
---|
125 | % default DSET is COMMAND.mat in which COMMAND is the name of the calling |
---|
126 | % m-file. If this is not available in the directory of COMMAND the URL will |
---|
127 | % be downloaded. If ASK = true (default), the user is asked for approval. |
---|
128 | % If given, SIZE (in MByte) is displayed in the request. |
---|
129 | % |
---|
130 | % If available, the dataset or datafile stored in FIELD is returned in OUT. |
---|
131 | % If not, but download was successful OUT is empty, otherwise an error is |
---|
132 | % generated. |
---|
133 | % |
---|
134 | % This is a low-level routine, typically used in COMMAND and not called |
---|
135 | % from the command line. COMMAND should take care that a proper mat-file |
---|
136 | % is constructed and returned to the user. |
---|
137 | % |
---|
138 | % SEE ALSO |
---|
139 | % DATASETS, DATAFILES |
---|
140 | |
---|
141 | % Copyright: R.P.W. Duin, r.p.w.duin@37steps.com |
---|
142 | |
---|
function out = pr_getdata(varargin)
% Return the dataset stored in a local mat-file, downloading it first when
% it is not yet available.
%
%   OUT = PR_GETDATA(URL,SIZE,DSET,FIELD,ASK)
%
%   URL   : location the file is downloaded from when it is missing
%   SIZE  : size in MByte, only shown in the confirmation prompt (optional)
%   DSET  : local file name; defaults to the caller's name (or the name in
%           URL when there is no caller) with the extension of URL
%   FIELD : field of the loaded structure to return; first field if empty
%   ASK   : when true (default) the user is asked before downloading
%
% Errors: 'Dataset not found' when the user declines the download,
%         'Download failed' when PRDOWNLOAD reports failure.
%
% Local variables:
%   name  : name of calling routine, might be used for the dataset
%   url   : url of dataset
%   uname : dataset name as used in url
%   dset  : becomes full path and name of dataset
%   ddir  : becomes full path of dataset

name  = callername;
argin = setdefaults(varargin,[],[],[],[],true);
% The size argument is held in 'mbytes' so the builtin SIZE is not shadowed.
[url,mbytes,dset,field,ask] = deal(argin{:});
[dummy,uname,ext] = fileparts(url);

% Store the data next to the calling m-file, or in the current directory
% when there is no caller.
if isempty(name)
  ddir = pwd;
else
  ddir = fileparts(which(name));
end

if isempty(dset)
  if isempty(name)
    dset = [uname ext];
  else
    dset = [name ext];
  end
end
dset = fullfile(ddir,dset);

out = tryload(dset,field);
if isempty(out)
  if ask
    if ~isempty(mbytes)
      siz = ['(' num2str(mbytes) ' MB)'];
    else
      siz = '';
    end
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    % An empty answer means yes; 'y' is accepted in either case.
    if ~isempty(q) && ~strcmpi(strtrim(q),'y')
      error('Dataset not found')
    end
  end
  % download in dir of dset
  status = prdownload(url,fileparts(dset));
  % Check the result before touching the file system: renaming a file
  % that was never downloaded would hide the real cause of the failure.
  if ~status
    error('Download failed')
  end
  % get naming consistent, avoid Matlab naming problems with capitals
  % (two steps via a temporary name so a case-only rename is possible)
  movefile([fullfile(fileparts(dset),uname) ext],[dset 'temp']);
  movefile([dset 'temp'],dset);
  out = tryload(dset,field);
end
---|
196 | |
---|
function out = tryload(dset,field)
% Load DSET when it exists as a file and return its content.
%
%   dset  : file name (with path) to load
%   field : name of the field to return when the loaded result is a
%           structure; when empty the first field is used
%   out   : [] when DSET is not an existing file; otherwise the selected
%           field, or the loaded result itself when it is not a structure

out = [];
if exist(dset,'file') ~= 2
  return
end

loaded = prload(dset);
if ~isstruct(loaded)
  out = loaded;
  return
end

if isempty(field)
  names = fieldnames(loaded);
  field = names{1};
end
out = getfield(loaded,field);
---|
212 | |
---|
function name = callername
% Name of the routine two levels up the call stack (the caller of the
% function that called CALLERNAME), or [] when the stack is not that deep.

stack = dbstack;
if numel(stack) >= 3
  name = stack(3).name;
else
  name = [];
end
---|
220 | |
---|
221 | |
---|