Sleipnir
|
00001 #include "stdafx.h" 00002 #include "cmdline.h" 00003 00004 struct bresult{ 00005 int i; 00006 int j; 00007 float f; 00008 float f2; 00009 }; 00010 00011 struct ascend{ 00012 bool operator()(const bresult& lx, const bresult& rx) const{ 00013 return lx.f < rx.f; 00014 } 00015 }; 00016 00017 struct descend{ 00018 bool operator()(const bresult& lx, const bresult& rx) const{ 00019 return lx.f > rx.f; 00020 } 00021 }; 00022 00023 bool rank_transform(vector< vector< vector<float> > > &mat, 00024 vector<CSeekIntIntMap*> &dm, vector<char> &absent_g, 00025 int numGenes){ 00026 00027 int d, dd, i, j, k; 00028 int numDatasets = mat.size(); 00029 absent_g.clear(); 00030 absent_g.resize(numGenes); 00031 00032 for(i=0; i<numGenes; i++){ 00033 int tot = 0; 00034 if(i%500==0 || i==numGenes-1) 00035 fprintf(stderr, "Gene %d\n", i); 00036 for(d=0; d<numDatasets; d++){ 00037 CSeekIntIntMap *mapG = dm[d]; 00038 if(CSeekTools::IsNaN(mapG->GetForward(i))) continue; 00039 tot+=mat[d][0].size(); //mat[d][0].size() is the number of conditions 00040 } 00041 if(tot==0){ 00042 absent_g[i] = 1; 00043 continue; 00044 } 00045 00046 absent_g[i] = 0; 00047 vector<bresult> a; 00048 a.resize(tot); 00049 00050 k=0; 00051 for(d=0; d<numDatasets; d++){ 00052 if(CSeekTools::IsNaN(dm[d]->GetForward(i))) continue; 00053 for(j=0; j<mat[d][0].size(); j++){ 00054 a[k].i = d; 00055 a[k].j = j; 00056 a[k].f = mat[d][dm[d]->GetForward(i)][j]; 00057 k++; 00058 } 00059 } 00060 00061 sort(a.begin(), a.end(), ascend()); 00062 float b1 = (float) (1.0 - (tot+1.0) / 2.0) / (float) ((tot+1.0)/2.0); 00063 float b2 = (float) (tot-1.0-(tot+1.0)/2.0) / (float)((tot+1.0)/2.0); 00064 00065 for(j=0; j<tot; j++){ 00066 int id1 = a[j].i; 00067 int id2 = a[j].j; 00068 int p = tot+1; 00069 mat[id1][dm[id1]->GetForward(i)][id2] = 00070 ((float)(j+1) - (float) p/2.0) / ((float)p/2.0); 00071 } 00072 a.clear(); 00073 } 00074 return true; 00075 } 00076 00077 bool weight_experiment( 00078 vector<vector<vector<float> > > &mat, vector<CSeekIntIntMap*> &dm, 00079 int &numExperiments, float cut_off_percentage, 00080 vector<utype> &q, vector<bresult> &a){ 00081 00082 int d, dd, j, qi; 00083 numExperiments = 0; 00084 int numDatasets = mat.size(); 00085 for(d=0; d<numDatasets; d++){ 00086 //for(j=0; j<mat[d][0].size(); j++){ 00087 int q_size = 0; 00088 for(qi=0; qi<q.size(); qi++){ 00089 int g = q[qi]; 00090 if(CSeekTools::IsNaN(dm[d]->GetForward(g))) continue; 00091 q_size++; 00092 } 00093 int pq_size = (int)(0.5*q.size()); 00094 if(q_size<pq_size || q_size<2){ 00095 continue; 00096 } 00097 numExperiments+=mat[d][0].size(); 00098 //} 00099 } 00100 00101 a.clear(); 00102 a.resize(numExperiments); 00103 int numThreads=8; 00104 omp_set_num_threads(numThreads); 00105 00106 int k=0; 00107 /* 00108 vector<int> array_i, array_j; 00109 vector<float> array_f, array_f2; 00110 */ 00111 00112 vector< vector<int> > array_i, array_j; 00113 vector< vector<float> > array_f, array_f2; 00114 array_i.resize(numThreads); 00115 array_j.resize(numThreads); 00116 array_f.resize(numThreads); 00117 array_f2.resize(numThreads); 00118 00119 for(j=0; j<numThreads; j++){ 00120 array_i[j] = vector<int>(); 00121 array_j[j] = vector<int>(); 00122 array_f[j] = vector<float>(); 00123 array_f2[j] = vector<float>(); 00124 } 00125 00126 #pragma omp parallel for \ 00127 shared(numExperiments, mat, q, dm, a, array_i, array_j, array_f, array_f2) \ 00128 private(d, j, qi) \ 00129 firstprivate(numDatasets) schedule(dynamic) 00130 for(d=0; d<numDatasets; d++){ 00131 utype tid = omp_get_thread_num(); 00132 for(j=0; j<mat[d][0].size(); j++){ 00133 float mean_query = 0; 00134 float sample_variance = 0; 00135 int q_size = 0; 00136 float significance = 0; 00137 for(qi=0; qi<q.size(); qi++){ 00138 int g= q[qi]; 00139 if(CSeekTools::IsNaN(dm[d]->GetForward(g))) continue; 00140 q_size++; 00141 mean_query+=mat[d][dm[d]->GetForward(g)][j]; 00142 } 00143 int pq_size = (int)(0.5*q.size()); 00144 if(q_size<pq_size || q_size<2) continue; 00145 00146 mean_query/=(float)q_size; 00147 for(qi=0; qi<q.size(); qi++){ 00148 int g=q[qi]; 00149 if(CSeekTools::IsNaN(dm[d]->GetForward(g))) continue; 00150 float diff = mat[d][dm[d]->GetForward(g)][j] - mean_query; 00151 sample_variance+=diff*diff; 00152 } 00153 sample_variance/=(float)q_size; 00154 significance = sqrt((float)q_size)*mean_query/sqrt(sample_variance + 1.0 / 00155 (3*numExperiments*numExperiments)); 00156 00157 array_i[tid].push_back(d); 00158 array_j[tid].push_back(j); 00159 array_f[tid].push_back(significance); 00160 array_f2[tid].push_back(mean_query); 00161 00162 /*array_i.push_back(d); 00163 array_j.push_back(j); 00164 array_f.push_back(significance); 00165 array_f2.push_back(mean_query);*/ 00166 /*a[k].i = d; 00167 a[k].j = j; 00168 a[k].f = sqrt((float)q_size)*mean_query/sqrt(sample_variance + 1.0 / 00169 (3*numExperiments*numExperiments)); 00170 a[k].f2 = mean_query; 00171 k++;*/ 00172 } 00173 } 00174 00175 /* 00176 for(d=0; d<array_i.size(); d++){ 00177 a[k].i = array_i[d]; 00178 a[k].j = array_j[d]; 00179 a[k].f = array_f[d]; 00180 a[k].f2 = array_f2[d]; 00181 k++; 00182 } 00183 */ 00184 00185 00186 for(j=0; j<numThreads; j++){ 00187 for(d=0; d<array_i[j].size(); d++){ 00188 a[k].i = array_i[j][d]; 00189 a[k].j = array_j[j][d]; 00190 a[k].f = array_f[j][d]; 00191 a[k].f2 = array_f2[j][d]; 00192 k++; 00193 } 00194 } 00195 00196 //int nth=(int) ((float) numExperiments * cut_off_percentage); 00197 //std::nth_element(a.begin(), a.begin()+nth, a.end(), descend()); 00198 sort(a.begin(), a.end(), descend()); 00199 return true; 00200 } 00201 00202 bool gene_scoring( 00203 int numGenes, 00204 vector<vector<vector<float> > > &mat, vector<CSeekIntIntMap*> &dm, 00205 vector<bresult> &a, int numExperiments, vector<char> &absent_g, 00206 float cut_off_percentage, vector<float> &gs){ 00207 00208 int i, ii, d, k; 00209 CSeekTools::InitVector(gs, numGenes, (float) -320); 00210 vector<int> tot; 00211 CSeekTools::InitVector(tot, numGenes, (int) 0); 00212 00213 int numDatasets = mat.size(); 00214 CSeekIntIntMap mapG(numGenes); 00215 00216 for(i=0; i<numGenes; i++){ 00217 if(absent_g[i]==1) continue; 00218 mapG.Add(i); 00219 for(d=0; d<numDatasets; d++){ 00220 if(CSeekTools::IsNaN(dm[d]->GetForward(i))) continue; 00221 tot[i]+=mat[d][0].size(); //numConditions in a dataset 00222 } 00223 } 00224 00225 int numThreads=8; 00226 omp_set_num_threads(numThreads); 00227 00228 const vector<utype> &allGenes = mapG.GetAllReverse(); 00229 float cutoff = cut_off_percentage; 00230 int numExp = numExperiments; 00231 00232 if(numExp==0){ 00233 return true; 00234 } 00235 00236 #pragma omp parallel for \ 00237 shared(allGenes, absent_g, a, tot, mat, dm, gs) \ 00238 private(i, ii, k) \ 00239 firstprivate(cutoff, numExp) schedule(dynamic) 00240 //fprintf(stderr, "Number of experiments: %d\n", numExp); 00241 for(ii=0; ii<mapG.GetNumSet(); ii++){ 00242 i = allGenes[ii]; 00243 int num_valid_experiments = 0; 00244 float sc = 0; 00245 //fprintf(stderr, "Gene %d, %d of %d\n", i, ii, mapG.GetNumSet()); 00246 for(k=0; k<tot[i]; k++){ 00247 if(k+1 > (int) (cutoff*(float) numExp)){ 00248 break; 00249 } 00250 int id1 = a[k].i; 00251 int id2 = a[k].j; 00252 00253 if(CSeekTools::IsNaN(dm[id1]->GetForward(i))) continue; 00254 sc+=mat[id1][dm[id1]->GetForward(i)][id2] * a[k].f2; 00255 num_valid_experiments++; 00256 } 00257 //fprintf(stderr, "Valid experiments %d\n", num_valid_experiments); 00258 00259 int valid_cutoff = (int) (cutoff / 2.0 * (float) numExp); 00260 if(num_valid_experiments<valid_cutoff){ 00261 continue; 00262 } 00263 gs[i] = sc / (float)num_valid_experiments; 00264 } 00265 00266 return true; 00267 } 00268 00269 int main(int iArgs, char **aszArgs){ 00270 static const size_t c_iBuffer = 1024; 00271 char acBuffer[c_iBuffer]; 00272 00273 gengetopt_args_info sArgs; 00274 ifstream ifsm; 00275 istream *pistm; 00276 vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery; 00277 utype i, qi, j, k, l; 00278 00279 if(cmdline_parser(iArgs, aszArgs, &sArgs)){ 00280 cmdline_parser_print_help(); 00281 return 1; 00282 } 00283 00284 if( sArgs.input_arg ) { 00285 ifsm.open( sArgs.input_arg ); 00286 pistm = &ifsm; 00287 }else{ 00288 pistm = &cin; 00289 } 00290 00291 map<string, size_t> mapstriGenes; 00292 while( !pistm->eof( ) ) { 00293 pistm->getline( acBuffer, c_iBuffer - 1 ); 00294 acBuffer[ c_iBuffer - 1 ] = 0; 00295 vecstrLine.clear( ); 00296 CMeta::Tokenize( acBuffer, vecstrLine ); 00297 if( vecstrLine.size( ) < 2 ) { 00298 cerr << "Ignoring line: " << acBuffer << endl; 00299 continue; 00300 } 00301 if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) { 00302 cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " 00303 << vecstrLine[ 1 ] << endl; 00304 return 1; 00305 } 00306 i--; 00307 if( vecstrGenes.size( ) <= i ) 00308 vecstrGenes.resize( i + 1 ); 00309 vecstrGenes[ i ] = vecstrLine[ 1 ]; 00310 mapstriGenes[vecstrGenes[i]] = i; 00311 } 00312 00313 //char acBuffer[1024]; 00314 00315 if(sArgs.pcl_flag==1){ 00316 string pcl_dir = sArgs.pcl_dir_arg; 00317 string output_dir = sArgs.dir_out_arg; 00318 vector<string> pcl_list; 00319 vector< vector<string> > vecstrAllQuery; 00320 int numGenes = vecstrGenes.size(); 00321 00322 if(!CSeekTools::ReadMultipleQueries(sArgs.query_arg, vecstrAllQuery)) 00323 return -1; 00324 00325 CSeekTools::ReadListOneColumn(sArgs.pcl_list_arg, pcl_list); 00326 vector< vector< vector<float> > > mat; 00327 vector<CSeekIntIntMap*> dm; 00328 dm.resize(pcl_list.size()); 00329 mat.resize(pcl_list.size()); 00330 00331 for(i=0; i<pcl_list.size(); i++){ 00332 fprintf(stderr, "Reading %d: %s\n", i, pcl_list[i].c_str()); 00333 dm[i] = new CSeekIntIntMap(vecstrGenes.size()); 00334 00335 string pclfile = pcl_dir + "/" + pcl_list[i] + ".bin"; 00336 00337 CPCL pcl; 00338 pcl.Open(pclfile.c_str()); 00339 int totNumExperiments = pcl.GetExperiments() - 2; 00340 00341 vector<utype> presentIndex; 00342 vector<string> presentGeneNames; 00343 for(j=0; j<vecstrGenes.size(); j++){ 00344 utype g = pcl.GetGene(vecstrGenes[j]); 00345 if(CSeekTools::IsNaN(g)) continue; //gene does not exist in the dataset 00346 presentIndex.push_back(g); 00347 presentGeneNames.push_back(vecstrGenes[j]); 00348 dm[i]->Add(j); 00349 } 00350 00351 mat[i].resize(presentIndex.size()); 00352 for(j=0; j<presentIndex.size(); j++) 00353 mat[i][j].resize(totNumExperiments); 00354 00355 for(j=0; j<presentIndex.size(); j++){ 00356 float *val = pcl.Get(presentIndex[j]); 00357 for(k=2; k<pcl.GetExperiments(); k++) 00358 mat[i][j][k-2] = val[k]; 00359 } 00360 00361 } 00362 00363 fprintf(stderr, "Finished loading...\n"); //getchar(); 00364 vector<char> absent_g; 00365 rank_transform(mat, dm, absent_g, vecstrGenes.size()); 00366 fprintf(stderr, "Finished transforming...\n"); //getchar(); 00367 00368 for(i=0; i<vecstrAllQuery.size(); i++){ 00369 vector<utype> q; 00370 vector<char> is_query; 00371 00372 fprintf(stderr, "Query %d begin\n", i); //getchar(); 00373 00374 CSeekTools::InitVector(is_query, numGenes, (char) 0); 00375 for(j=0; j<vecstrAllQuery[i].size(); j++){ 00376 q.push_back(mapstriGenes[vecstrAllQuery[i][j]]); 00377 is_query[mapstriGenes[vecstrAllQuery[i][j]]] = 1; 00378 } 00379 int numExperiments = 0; 00380 vector<bresult> b; 00381 vector<float> gs; 00382 fprintf(stderr, "Query %d finished initializing query\n", i); //getchar(); 00383 weight_experiment(mat, dm, numExperiments, 0.05, q, b); 00384 00385 fprintf(stderr, "Query %d finished weighting\n", i); //getchar(); 00386 gene_scoring(vecstrGenes.size(), mat, dm, b, numExperiments, absent_g, 00387 0.05, gs); 00388 00389 fprintf(stderr, "Query %d finished scoring\n", i); //getchar(); 00390 00391 for(j=0; j<absent_g.size(); j++){ 00392 if(absent_g[j]==1){ 00393 gs[j] = -320; 00394 } 00395 } 00396 00397 00398 fprintf(stderr, "Query %d end\n", i); //getchar(); 00399 00400 sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), i); 00401 CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[i]); 00402 sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), i); 00403 CSeekTools::WriteArray(acBuffer, gs); 00404 00405 /* 00406 fprintf(stdout, "Query %d\n", i); 00407 fprintf(stdout, "Weights:"); 00408 //fprintf(stderr, "Weights:\n"); 00409 for(j=0; j<b.size() && j<(int)(0.05*numExperiments); j++){ 00410 if(j%10==0){ 00411 fprintf(stdout, "\n"); 00412 } 00413 fprintf(stdout, "%.2f ", b[j].f); 00414 //fprintf(stderr, "%d %.2f\n", j, b[j].f); 00415 } 00416 fprintf(stdout, "\n"); 00417 vector<AResultFloat> gsc; 00418 gsc.resize(numGenes); 00419 //fprintf(stderr, "numexp %d gsc %d gs %d\n", numExperiments, gsc.size(), gs.size()); 00420 for(j=0; j<numGenes; j++){ 00421 gsc[j].i = j; 00422 gsc[j].f = gs[j]; 00423 //fprintf(stderr, "Gene %d %.2f\n", j, gsc[j].f); 00424 } 00425 sort(gsc.begin(), gsc.end()); 00426 fprintf(stdout, "Results:\n"); 00427 int kk=0; 00428 for(k=0; k<numGenes && kk<500; k++){ 00429 if(is_query[gsc[k].i]==1) continue; 00430 if(gsc[k].f==-320) continue; 00431 fprintf(stdout, "%s %.5f\n", vecstrGenes[gsc[k].i].c_str(), gsc[k].f); 00432 kk++; 00433 } 00434 */ 00435 /* 00436 fprintf(stderr, "Query %d\n", i); 00437 fprintf(stderr, "Weights:"); 00438 for(j=0; j<(int)(0.05* numExperiments); j++){ 00439 if(j%10==0){ 00440 fprintf(stderr, "\n"); 00441 } 00442 fprintf(stderr, "%.2f ", b[j].f); 00443 } 00444 fprintf(stderr, "\n"); 00445 vector<AResultFloat> gsc; 00446 gsc.resize(numGenes); 00447 for(j=0; j<numGenes; j++){ 00448 gsc[j].i = j; 00449 gsc[j].f = gs[j]; 00450 } 00451 sort(gsc.begin(), gsc.end()); 00452 fprintf(stderr, "Results:\n"); 00453 int kk=0; 00454 for(k=0; k<numGenes && kk<500; k++){ 00455 if(is_query[gsc[k].i]==1) continue; 00456 if(gsc[k].f==-320) continue; 00457 fprintf(stderr, "%d %.5f\n", gsc[k].i, gsc[k].f); 00458 kk++; 00459 } 00460 */ 00461 gs.clear(); 00462 b.clear(); 00463 00464 } 00465 00466 00467 } 00468 00469 00470 }