source: prdatasets/flowcyto_2012.m @ 89

Last change on this file since 89 was 89, checked in by bduin, 11 years ago
File size: 6.1 KB
Line 
1%FLOWCYTO_2012 Load flow-cytometry dataset.
2%
3%   A = FLOWCYTO_2012(TUBE,CLASS,YEAR,FLAG)
4%
5% INPUT
6%   TUBE   Vector of integers [1,5] pointing to the desired tube.
7%          Default 3.
8%   CLASS  Vector of integer pointers in the list of class names,
9%          aneuploid, diploid, multi-aneuploid, tetraploid, multi
10%          aneuploid, peridiploid. Default [1,2,4]
11%   YEAR   Vector of integers [2005:2012] pointing to the desired years.
12%          Default: all.
13%   FLAG   0 - remove all repeated measurements, including first ones
14%          1 - take just first of repeated measurements (default)
15%          2 - take just last ones of repeated measurements
16%
17% OUTPUT
18%   A      Dataset.
19%
20% DESCRIPTION
21% These datasets are based on FL3-A DNA flowcytometer histograms from
22% breast cancer tissues in 1024 resolution. The initial data have been
23% acquired by M. Nap and N. van Rodijnen of the Atrium Medical Center in
24% Heerlen, The Netherlands, during 2005-2012, using tubes 1-5 of a DACO
25% Galaxy flowcytometer (TUBE = 1-5).
26%
27% Histograms are labeled in 3 classes: aneuploid (547), diploid (457) and
28% tetraploid(152). The numbers report the numbers of histograms of Tube 3.
29% The first two and the last two bins (1,2,1023,1024) of every histogram
30% are removed as they may contain noise. After that histograms are
31% normalized (sum to one) resulting in a dataset with 1020 features.
32%
33% In case multiple tubes are requested (number of elements in TUBE > 1) the
34% corresponding datasets are concatenated, but patients for which a
35% histogram is not available for all tubes are removed.
36%
37% The final dataset is multi-labeled, see MULTI_LABELING, with the
38% following names for the label lists:
39% 1  default     unlabeled
40% 2  Number      patient number
41% 3  Tube        Tube number (1:5)
42% 4  Class       6 possible histogram labels
43% 5  Year        year of measurement (2005:2012)
44% The actual label list may be changed by CHANGELABLIST. Standard 'Class'
45% is returned in A.
46%
47% SEE ALSO
48% PRTOOLS, DATASETS, MULTI_LABELING, CHANGELABLIST
49
50% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
51
52
function a = flowcyto_2012(varargin)
% Entry point: fetch the stored dataset, split it per tube, filter every
% requested tube with SINGLETUBE, stack the results and normalize them.

% Defaults: tube 3, classes [1 2 4], years 2005-2012, flag 1.
[tubes,classes,years,flag] = setdefaults(varargin,3,[1 2 4],2005:2012,1);

% NOTE: pr_getdata inspects the call stack to find the name of its caller,
% so it has to be called directly from this function.
a = pr_getdata('http://37steps.com/data/prdatasets/FlowCyto_2012.mat',6,[],'a',1);

% Split on the 'Tube' label list; the result is indexed by tube below.
pertube = selclass(a,{},'Tube');

ntubes   = numel(tubes);
selected = cell(1,ntubes);
for t = 1:ntubes
  selected{t} = singletube(pertube{tubes(t)},classes,years,flag);
end
a = vertcat(selected{:});

if ntubes > 1
  % With 'Number' (patient number) as the active label list, drop every
  % patient having fewer than NTUBES histograms, then switch back to the
  % 'Class' labeling.
  a = changelablist(a,'Number');
  a = remclass(a,ntubes-1);
  a = changelablist(a,'Class');
end

a = a*normm;
return
73
74
function a = singletube(a,classes,years,flag)
% Filter the dataset of a single tube on class and year and resolve
% repeated measurements (several objects with the same patient number)
% according to FLAG:
%   0 - discard all objects of patients with repeated measurements
%   1 - keep only the first object of such patients
%   2 - keep only the last object of such patients
% The result is returned with 'Class' as the active label list.

a = selclass(a,classes,'Class');
a = selclass(a,years-2004,'Year');    % years are stored as 1.. (2005 -> 1)
a = changelablist(a,'Number');
a = remclass(a);

% Choose which of the object indices of a repeated patient are discarded.
if flag == 0
  discard = @(idx) idx;               % all of them
elseif flag == 1
  discard = @(idx) idx(2:end);        % all but the first
elseif flag == 2
  discard = @(idx) idx(1:end-1);      % all but the last
else
  error('FLAG has wrong value')
end

nlab     = getnlab(a);                % patient index of every object
counts   = classsizes(a);             % number of objects per patient
keep     = true(1,size(a,1));
repeated = find(counts>1);            % patients measured more than once
for r = 1:numel(repeated)
  members = find(nlab==repeated(r));
  keep(discard(members)) = false;
end
a = changelablist(a(keep,:),'Class');

return
119
120%PR_GETDATA Loads PRTOOLS dataset for any toolbox
121%
122%               OUT = PR_GETDATA(URL,SIZE,DSET,FIELD,ASK)
123%
124% Checks the availability of the particular dataset or datafile DSET. By
125% default DSET is COMMAND.mat in which COMMAND is the name of the calling
126% m-file. If this is not available in the directory of COMMAND the URL will
127% be downloaded. If ASK = true (default), the user is asked for approval.
128% If given, SIZE (in MByte) is displayed in the request.
129%
130% If available, the dataset or datafile stored in FIELD is returned in OUT.
131% If not, but download was successful OUT is empty, otherwise an error is
132% generated.
133%
134% This is a low-level routine, typically used in COMMAND and not called
135% from the command line. COMMAND should take care that a proper mat-file
136% is constructed and returned to the user.
137%
138% SEE ALSO
139% DATASETS, DATAFILES
140
141% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
142
function out = pr_getdata(varargin)
% Return the data stored in FIELD of a local mat-file, downloading the file
% from URL first when it is not available locally.
%
%   OUT = PR_GETDATA(URL,SIZE,DSET,FIELD,ASK)
%
% Local variables:
%  name   : name of calling routine, might be used for the dataset
%  url    : url of dataset
%  mbsize : size shown in the download request (the SIZE argument)
%  uname  : dataset name as used in url
%  dset   : becomes full path and name of dataset
%  ddir   : becomes full path of dataset
%
% Errors:
%  'Dataset not found' when the user declines the download,
%  'Download failed'   when PRDOWNLOAD reports failure.

name = callername;
argin = setdefaults(varargin,[],[],[],[],true);
% The second argument is stored as MBSIZE to avoid shadowing the builtin SIZE.
[url,mbsize,dset,field,ask] = deal(argin{:});
[dummy,uname,ext] = fileparts(url);

% Directory in which the dataset is looked for and stored.
if isempty(name)
  ddir = pwd;
else
  ddir = fileparts(which(name));
end

% Default file name: caller name (or url name) with the extension of the url.
if isempty(dset)
  if isempty(name)
    dset = [uname ext];
  else
    dset = [name ext];
  end
end
dset = fullfile(ddir,dset);

out = tryload(dset,field);
if isempty(out)
  if ask
    if ~isempty(mbsize)
      siz = ['(' num2str(mbsize) ' MB)'];
    else
      siz = '';
    end
    q = input(['Dataset is not available, OK to download ' siz ' [y]/n ?'],'s');
    % An empty answer counts as 'y'; anything other than 'y' aborts.
    if ~isempty(q) && ~strcmp(q,'y')
      error('Dataset not found')
    end
  end
  % download in dir of dset
  status = prdownload(url,fileparts(dset));
  % Check the result before touching the file system: renaming a file that
  % was never downloaded would fail with an unrelated MOVEFILE error.
  if ~status
    error('Download failed')
  end
  % get naming consistent, avoid Matlab naming problems with capitals
  movefile([fullfile(fileparts(dset),uname) ext],[dset 'temp']);
  movefile([dset 'temp'],dset);
  out = tryload(dset,field);
end
196 
function out = tryload(dset,field)
% Load DSET by PRLOAD if it exists as a file; otherwise return [].
% When the loaded result is a structure, the content of FIELD is returned
% (the first field if FIELD is empty); any other result is returned as is.

out = [];
if exist(dset,'file') ~= 2
  return                      % nothing to load
end

loaded = prload(dset);
if ~isstruct(loaded)
  out = loaded;
  return
end

if isempty(field)
  names = fieldnames(loaded);
  out = getfield(loaded,names{1});
else
  out = getfield(loaded,field);
end
212
function name = callername
% Name of the routine two levels up the call stack, i.e. the routine that
% called the caller of CALLERNAME; empty when the stack is not that deep.
stack = dbstack;
name = [];
if length(stack) >= 3
  name = stack(3).name;
end
220
221
Note: See TracBrowser for help on using the repository browser.