Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 enum QUERY_MODE{ 00026 SINGLE_QUERY, MULTI_QUERY, MULTI_DWEIGHT 00027 }; 00028 00029 enum METRIC{ 00030 RBP, AVGP, PR, PR_ALL, AUC 00031 }; 00032 00033 bool GetRandom(gsl_rng *r, const vector<AResultFloat> &geneScore, 00034 vector<AResultFloat> &random, vector<char> &excludeGene, 00035 vector<char> &includeGene, vector<utype> &queryGeneID, 00036 const float nan){ 00037 00038 int i, j; 00039 int ss = 0; 00040 00041 vector<char> q; 00042 q.clear(); 00043 CSeekTools::InitVector(q, geneScore.size(), (char) 0); 00044 for(j=0; j<queryGeneID.size(); j++) 00045 q[queryGeneID[j]] = 1; 00046 00047 for(i=0; i<geneScore.size(); i++){ 00048 if(includeGene[geneScore[i].i]==0){ 00049 }else if(q[geneScore[i].i]==1){ 00050 }else if(excludeGene[geneScore[i].i]==1){ 00051 }else{ 00052 ss++; 00053 } 00054 } 00055 00056 float *gs = (float*)malloc(ss*sizeof(float)); 00057 random.clear(); 00058 random.resize(geneScore.size()); 00059 00060 int ii = 0; 00061 for(i=0; i<geneScore.size(); i++){ 00062 if(includeGene[geneScore[i].i]==0){ 00063 }else if(q[geneScore[i].i]==1){ 00064 }else if(excludeGene[geneScore[i].i]==1){ 00065 }else{ 00066 //gs[ii] = geneScore[i].f; 00067 gs[ii] = (float) ii; 00068 ii++; 00069 } 00070 } 00071 00072 gsl_ran_shuffle(r, gs, ss, sizeof(float)); 00073 00074 ii = 0; 00075 for(i=0; i<geneScore.size(); i++){ 00076 random[i].i = geneScore[i].i; 00077 random[i].f = nan; 00078 if(includeGene[geneScore[i].i]==0){ 00079 }else if(q[geneScore[i].i]==1){ 00080 }else if(excludeGene[geneScore[i].i]==1){ 00081 }else{ 00082 random[i].f = gs[ii]; 00083 ii++; 00084 } 00085 } 00086 free(gs); 00087 return true; 00088 } 00089 00090 float RankBiasedPrecision(const float &rate, 00091 const vector<AResultFloat> &sortedScore, const vector<char> &gold_std, 00092 const float &nanVal){ 00093 00094 vector<AResultFloat>::const_iterator iterScore = sortedScore.begin(); 00095 float x = 0; 00096 utype i; 00097 for(i=0; iterScore!=sortedScore.end(); i++, iterScore++){ 00098 //if(iterScore->f == nanVal) continue; 00099 if(gold_std[iterScore->i]==1) x += pow(rate, i); 00100 } 00101 x*=(1.0-rate); 00102 return x; 00103 } 00104 00105 vector<float>* Precision(const vector<AResultFloat> &sortedScore, 00106 const vector<char> &gold_std, const float &nanVal){ 00107 00108 utype i, numPos = 1; 00109 vector<float> *r = new vector<float>(); 00110 00111 vector<AResultFloat>::const_iterator iterScore = sortedScore.begin(); 00112 for(i=0; iterScore!=sortedScore.end(); i++, iterScore++){ 00113 //if(iterScore->f == nanVal) continue; 00114 if(gold_std[iterScore->i]==1){ 00115 //fprintf(stderr, "%d %d\n", numPos, i); 00116 r->push_back((float) numPos++ / (float) (i + 1)); 00117 } 00118 } 00119 r->resize(r->size()); 00120 return r; 00121 } 00122 00123 bool MeanStandardDeviation(const vector<float> &f, float &avg, float &stdev){ 00124 avg = 0; 00125 stdev = 0; 00126 avg = std::accumulate(f.begin(), f.end(), (float) 0); 00127 avg /= (float) f.size(); 00128 vector<float>::const_iterator iterF = f.begin(); 00129 for(; iterF!=f.end(); iterF++) stdev += (*iterF - avg) * (*iterF - avg); 00130 stdev /= (float) f.size(); 00131 stdev = sqrt(stdev); 00132 return true; 00133 } 00134 00135 bool MinMaxQuartile(vector<float> &f, float &min, float &max, float &Q1, 00136 float &Q2, float &Q3){ 00137 min = *min_element(f.begin(), f.end()); 00138 max = *max_element(f.begin(), f.end()); 00139 Q1 = 0; 00140 Q2 = 0; 00141 Q3 = 0; 00142 00143 if(f.size()==1){ 00144 Q1 = f[0]; 00145 Q2 = Q1; 00146 Q3 = Q2; 00147 return true; 00148 } 00149 if(f.size()==2){ 00150 Q1 = f[0]; 00151 Q2 = (f[0] + f[1] ) / 2.0; 00152 Q3 = f[1]; 00153 return true; 00154 } 00155 if(f.size()==3){ 00156 Q1 = (f[0] + f[1] ) / 2.0; 00157 Q2 = f[1]; 00158 Q3 = (f[1] + f[2]) / 2.0; 00159 return true; 00160 } 00161 00162 if(f.size()%2==0){ 00163 int m1 = f.size()/2 - 1; 00164 int m2 = f.size()/2; 00165 std::nth_element(f.begin(), f.begin()+m1, f.end()); 00166 float f1 = f[m1]; 00167 std::nth_element(f.begin(), f.begin()+m2, f.end()); 00168 float f2 = f[m2]; 00169 Q2 = (f1 + f2) / 2.0; 00170 00171 int s1 = m1 + 1; 00172 if(s1%2==0){ 00173 int m3 = s1/2 - 1; 00174 int m4 = s1/2; 00175 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00176 float f3 = f[m3]; 00177 std::nth_element(f.begin(), f.begin()+m4, f.end()); 00178 float f4 = f[m4]; 00179 Q1 = (f3 + f4) / 2.0; 00180 }else{ 00181 int m3 = s1/2; 00182 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00183 float f3 = f[m3]; 00184 Q1 = f3; 00185 } 00186 00187 int s2 = (f.size()-1) - m2 + 1; 00188 if(s2%2==0){ 00189 int m3 = m2 + s2/2 - 1; 00190 int m4 = m2 + s2/2; 00191 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00192 float f3 = f[m3]; 00193 std::nth_element(f.begin(), f.begin()+m4, f.end()); 00194 float f4 = f[m4]; 00195 Q3 = (f3 + f4) / 2.0; 00196 }else{ 00197 int m3 = m2 + s2/2; 00198 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00199 float f3 = f[m3]; 00200 Q3 = f3; 00201 } 00202 00203 }else{ 00204 int m1 = f.size()/2; 00205 std::nth_element(f.begin(), f.begin()+m1, f.end()); 00206 float f1 = f[m1]; 00207 Q2 = f1; 00208 00209 int s1 = m1 - 1 + 1; 00210 if(s1%2==0){ 00211 int m3 = s1/2 - 1; 00212 int m4 = s1/2; 00213 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00214 float f3 = f[m3]; 00215 std::nth_element(f.begin(), f.begin()+m4, f.end()); 00216 float f4 = f[m4]; 00217 Q1 = (f3 + f4) / 2.0; 00218 }else{ 00219 int m3 = s1/2; 00220 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00221 float f3 = f[m3]; 00222 Q1 = f3; 00223 } 00224 00225 int s2 = (f.size()-1) - (m1 + 1) + 1; 00226 if(s2%2==0){ 00227 int m3 = m1 + 1 + s2/2 - 1; 00228 int m4 = m1 + 1 + s2/2; 00229 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00230 float f3 = f[m3]; 00231 std::nth_element(f.begin(), f.begin()+m4, f.end()); 00232 float f4 = f[m4]; 00233 Q3 = (f3 + f4) / 2.0; 00234 }else{ 00235 int m3 = m1 + 1 + s2/2; 00236 std::nth_element(f.begin(), f.begin()+m3, f.end()); 00237 float f3 = f[m3]; 00238 Q3 = f3; 00239 } 00240 } 00241 return true; 00242 } 00243 00244 bool EvaluateOneQuery(const gengetopt_args_info &sArgs, const enum METRIC &met, 00245 const vector<AResultFloat> &sortedGenes, 00246 const vector<char> &goldstdGenePresence, const float &nan, 00247 vector<float> &eval){ 00248 size_t i; 00249 eval.clear(); 00250 //metric 00251 if(met==PR_ALL){ 00252 vector<float> *vf = Precision(sortedGenes, goldstdGenePresence, nan); 00253 for(i=0; i<vf->size(); i++) eval.push_back(vf->at(i)); 00254 delete vf; 00255 return true; 00256 } 00257 fprintf(stderr, "Invalid option!\n"); 00258 return false; 00259 } 00260 00261 00262 bool EvaluateOneQuery(const gengetopt_args_info &sArgs, const enum METRIC &met, 00263 const vector<AResultFloat> &sortedGenes, 00264 const vector<char> &goldstdGenePresence, const float &nan, float &eval){ 00265 size_t i; 00266 //metric 00267 if(met==RBP){ 00268 float rate = sArgs.rbp_p_arg; 00269 float rbp = RankBiasedPrecision(rate, sortedGenes, 00270 goldstdGenePresence, nan); 00271 eval = rbp; 00272 //fprintf(stdout, "%.5f\n", rbp); 00273 return true; 00274 } 00275 else if(met==AVGP || met==PR){ 00276 vector<float> *vf = Precision(sortedGenes, goldstdGenePresence, nan); 00277 /*if(met==PR_ALL){ 00278 for(i=0; i<vf->size()-1; i++){ 00279 fprintf(stdout, "%.5f ", vf->at(i)); 00280 } 00281 fprintf(stdout, "%.5f\n", vf->at(i)); 00282 delete vf; 00283 return true; 00284 }*/ 00285 00286 if(sArgs.x_int_arg==-1 && sArgs.x_per_arg==0){ 00287 fprintf(stderr, "Must specify --x_int or --x_per\n"); 00288 delete vf; 00289 return false; 00290 } 00291 00292 int X = -1; 00293 if(sArgs.x_int_arg!=-1 && sArgs.x_per_arg==0){ 00294 X = sArgs.x_int_arg; 00295 if(X>vf->size()){ 00296 fprintf(stderr, "Error: X is too large (>%d)\n", vf->size()); 00297 delete vf; 00298 return false; 00299 } 00300 } 00301 else if(sArgs.x_int_arg==-1 && sArgs.x_per_arg>0){ 00302 float per = sArgs.x_per_arg; 00303 if(per>1.0){ 00304 fprintf(stderr, "Error: percentage is above 1.0\n"); 00305 delete vf; 00306 return false; 00307 } 00308 X = (int) ( (float) per * (float) vf->size() ); 00309 //fprintf(stderr, "%.5f %d %d", per, vf->size(), X); 00310 X = std::max(1, X); 00311 } 00312 00313 if(met==AVGP){ 00314 float avg = std::accumulate(vf->begin(), vf->begin()+X, (float) 0); 00315 avg /= (float) X; 00316 //fprintf(stdout, "%.5f\n", avg); 00317 eval = avg; 00318 delete vf; 00319 return true; 00320 } 00321 if(met==PR){ 00322 //fprintf(stdout, "%.5f\n", vf->at(X-1)); 00323 eval = vf->at(X-1); 00324 delete vf; 00325 return true; 00326 } 00327 } 00328 00329 fprintf(stderr, "Invalid option!\n"); 00330 return false; 00331 } 00332 00333 void PrintVector(const vector<float> &f){ 00334 vector<float>::const_iterator iterF = f.begin(); 00335 vector<float>::const_iterator end = f.begin() + f.size() - 1; 00336 for(; iterF!=end; iterF++) fprintf(stderr, "%.5f ", *iterF); 00337 fprintf(stderr, "%.5f\n", *iterF); 00338 } 00339 00340 void PrintResult(vector< vector<float> > f){ 00341 int i; 00342 for(i=0; i<f.size(); i++){ 00343 PrintVector(f[i]); 00344 } 00345 } 00346 00347 bool DoAggregate(const gengetopt_args_info &sArgs, const enum METRIC &met, 00348 vector<AResultFloat> *sortedGenes, vector<utype> *queryGeneID, 00349 int listSize, vector<char> *goldstdGenePresence, vector<char> *excludeGene, 00350 vector<char> *includeGene, 00351 vector< vector<float> > &result 00352 00353 ){ 00354 00355 //fprintf(stderr, "Finished reading gene score list\n"); 00356 vector<AResultFloat> *master = NULL; 00357 vector<AResultFloat> master_score; 00358 vector<AResultFloat> master_rank; 00359 utype i, j; 00360 float nan = sArgs.nan_arg; 00361 result.clear(); 00362 00363 vector<char> *q = new vector<char>[listSize]; 00364 for(i=0; i<listSize; i++){ 00365 q[i] = vector<char>(); 00366 CSeekTools::InitVector(q[i], sortedGenes[i].size(), (char) 0); 00367 for(j=0; j<queryGeneID[i].size(); j++) 00368 q[i][queryGeneID[i][j]] = 1; 00369 } 00370 00371 if(sArgs.agg_ranksum_flag==1 || sArgs.agg_scoresum_flag==1){ 00372 //Gold standard must be the same across all queries for this mode!! 00373 //ASSUME THIS IS TRUE 00374 for(i=0; i<listSize; i++){ 00375 for(j=0; j<sortedGenes[0].size(); j++){ 00376 if(q[i][sortedGenes[i][j].i]==1){ 00377 sortedGenes[i][j].f = nan; 00378 } 00379 if(excludeGene[i][sortedGenes[i][j].i]==1){ 00380 sortedGenes[i][j].f = nan; 00381 } 00382 } 00383 } 00384 00385 if(sArgs.agg_scoresum_flag==1){ 00386 master_score.resize(sortedGenes[0].size()); 00387 00388 for(j=0; j<sortedGenes[0].size(); j++){ 00389 master_score[j].i = j; 00390 master_score[j].f = 0.0; 00391 } 00392 00393 for(j=0; j<sortedGenes[0].size(); j++) 00394 for(i=0; i<listSize; i++) 00395 master_score[sortedGenes[i][j].i].f += sortedGenes[i][j].f; 00396 00397 for(j=0; j<sortedGenes[0].size(); j++) 00398 master_score[j].f /= (float)listSize; 00399 00400 sort(master_score.begin(), master_score.end()); 00401 00402 master = &master_score; 00403 } 00404 00405 else if(sArgs.agg_ranksum_flag==1){ 00406 //fprintf(stderr, "Got here 2\n"); 00407 master_rank.resize(sortedGenes[0].size()); 00408 00409 //fprintf(stderr, "Got here 2a\n"); 00410 00411 for(i=0; i<listSize; i++){ 00412 sort(sortedGenes[i].begin(), sortedGenes[i].end()); 00413 //fprintf(stderr, "Sort %d fine\n", i); 00414 } 00415 00416 //fprintf(stderr, "Got here 2b\n"); 00417 00418 for(j=0; j<sortedGenes[0].size(); j++){ 00419 master_rank[j].i = j; 00420 master_rank[j].f = 0; 00421 } 00422 00423 for(i=0; i<listSize; i++) 00424 for(j=0; j<sortedGenes[i].size(); j++) 00425 master_rank[sortedGenes[i][j].i].f += 00426 (float) sortedGenes[i].size() - (float) j; 00427 00428 for(j=0; j<sortedGenes[0].size(); j++) 00429 master_rank[j].f /= (float) listSize; 00430 00431 sort(master_rank.begin(), master_rank.end()); 00432 00433 master = &master_rank; 00434 00435 //fprintf(stderr, "Got here 3\n"); 00436 } 00437 00438 if(met!=PR_ALL){ 00439 float eval; 00440 bool ret = EvaluateOneQuery(sArgs, met, *master, 00441 goldstdGenePresence[0], CMeta::GetNaN(), eval); 00442 if(!ret) return 1; 00443 result.resize(1); 00444 result[0] = vector<float>(); 00445 result[0].push_back(eval); 00446 return 0; 00447 }else{ 00448 vector<float> evalAll; 00449 bool ret = EvaluateOneQuery(sArgs, met, *master, 00450 goldstdGenePresence[0], CMeta::GetNaN(), evalAll); 00451 if(!ret) return 1; 00452 result.resize(1); 00453 result[0] = vector<float>(); 00454 for(i=0; i<evalAll.size(); i++) 00455 result[0].push_back(evalAll[i]); 00456 return 0; 00457 } 00458 } 00459 00460 //Not Rank Sum 00461 vector<float> eval; 00462 vector< vector<float> > vecevalAll; 00463 00464 eval.resize(listSize); 00465 vecevalAll.resize(listSize); 00466 00467 for(i=0; i<listSize; i++){ 00468 for(j=0; j<sortedGenes[i].size(); j++){ 00469 //NEW 00470 if(includeGene[i][sortedGenes[i][j].i]==0){ 00471 sortedGenes[i][j].f = nan; 00472 } 00473 if(q[i][sortedGenes[i][j].i]==1){ 00474 sortedGenes[i][j].f = nan; 00475 } 00476 if(excludeGene[i][sortedGenes[i][j].i]==1){ 00477 sortedGenes[i][j].f = nan; 00478 } 00479 } 00480 00481 sort(sortedGenes[i].begin(), sortedGenes[i].end()); 00482 00483 if(met!=PR_ALL){ 00484 float fEval; 00485 bool ret = EvaluateOneQuery(sArgs, met, sortedGenes[i], 00486 goldstdGenePresence[i], nan, fEval); 00487 if(!ret) return 1; 00488 eval[i] = fEval; 00489 00490 }else{ 00491 vector<float> evalAll; 00492 bool ret = EvaluateOneQuery(sArgs, met, sortedGenes[i], 00493 goldstdGenePresence[i], nan, evalAll); 00494 if(!ret) return 1; 00495 vecevalAll[i] = evalAll; 00496 } 00497 } 00498 00499 if(sArgs.fold_over_random_flag==1){ 00500 const gsl_rng_type *T; 00501 gsl_rng *rnd; 00502 gsl_rng_env_setup(); 00503 T = gsl_rng_default; 00504 rnd = gsl_rng_alloc(T); 00505 00506 vector<AResultFloat> *randomScores = 00507 new vector<AResultFloat>[listSize]; 00508 00509 vector<float> random_eval; 00510 vector< vector<float> > random_vecevalAll; 00511 random_eval.resize(listSize); 00512 random_vecevalAll.resize(listSize); 00513 00514 //should shuffle only within annotated genes 00515 for(i=0; i<listSize; i++){ 00516 GetRandom(rnd, sortedGenes[i], randomScores[i], excludeGene[i], 00517 includeGene[i], queryGeneID[i], nan); 00518 sort(randomScores[i].begin(), randomScores[i].end()); 00519 if(met!=PR_ALL){ 00520 float fEval; 00521 bool ret = EvaluateOneQuery(sArgs, met, randomScores[i], 00522 goldstdGenePresence[i], nan, fEval); 00523 if(!ret) return 1; 00524 random_eval[i] = fEval; 00525 //calculate fold 00526 if(random_eval[i]==eval[i]) eval[i] = 1.0; 00527 else if(random_eval[i]==0) eval[i] = 1.0; 00528 else eval[i] = eval[i] / random_eval[i]; 00529 }else{ 00530 vector<float> evalAll; 00531 bool ret = EvaluateOneQuery(sArgs, met, randomScores[i], 00532 goldstdGenePresence[i], nan, evalAll); 00533 if(!ret) return 1; 00534 random_vecevalAll[i] = evalAll; 00535 int ii=0; 00536 //calculate fold 00537 for(ii=0; ii<random_vecevalAll[i].size(); ii++){ 00538 if(random_vecevalAll[i][ii]==vecevalAll[i][ii]) 00539 vecevalAll[i][ii] = 1.0; 00540 else if(random_vecevalAll[i][ii]==0) 00541 vecevalAll[i][ii] = 1.0; 00542 else 00543 vecevalAll[i][ii] /= random_vecevalAll[i][ii]; 00544 } 00545 } 00546 //fprintf(stderr, "Got here!\n"); 00547 } 00548 //fprintf(stderr, "Got here\n"); 00549 gsl_rng_free(rnd); 00550 delete[] randomScores; 00551 //fprintf(stderr, "Got here 2\n"); 00552 } 00553 00554 if(met!=PR_ALL){ 00555 if(sArgs.agg_avg_flag==1){ 00556 float avg, stdev; 00557 MeanStandardDeviation(eval, avg, stdev); 00558 result.resize(1); 00559 result[0] = vector<float>(); 00560 result[0].push_back(avg); 00561 result[0].push_back(stdev); 00562 return 0; 00563 } 00564 if(sArgs.agg_quartile_flag==1){ 00565 float min, max, Q1, Q2, Q3; 00566 MinMaxQuartile(eval, min, max, Q1, Q2, Q3); 00567 result.resize(2); 00568 result[0] = vector<float>(); 00569 result[0].push_back(min); 00570 result[0].push_back(max); 00571 result[1] = vector<float>(); 00572 result[1].push_back(Q1); 00573 result[1].push_back(Q2); 00574 result[1].push_back(Q3); 00575 return 0; 00576 } 00577 if(sArgs.display_all_flag==1){ 00578 result.resize(eval.size()); 00579 for(i=0; i<eval.size(); i++){ 00580 result[i] = vector<float>(); 00581 result[i].push_back(eval[i]); 00582 } 00583 return 0; 00584 } 00585 00586 }else{ 00587 vector< vector<float> > veceval; 00588 veceval.resize(vecevalAll[0].size()); 00589 for(i=0; i<vecevalAll.size(); i++) 00590 for(j=0; j<vecevalAll[i].size(); j++) 00591 veceval[j].push_back(vecevalAll[i][j]); 00592 00593 if(sArgs.agg_avg_flag==1){ 00594 vector<float> avgAll, stdevAll; 00595 for(i=0; i<veceval.size(); i++){ 00596 float avg, stdev; 00597 MeanStandardDeviation(veceval[i], avg, stdev); 00598 avgAll.push_back(avg); 00599 stdevAll.push_back(stdev); 00600 } 00601 result.resize(2); 00602 result[0] = vector<float>(); 00603 result[1] = vector<float>(); 00604 for(i=0; i<avgAll.size(); i++){ 00605 result[0].push_back(avgAll[i]); 00606 } 00607 for(i=0; i<stdevAll.size(); i++){ 00608 result[1].push_back(stdevAll[i]); 00609 } 00610 return 0; 00611 } 00612 else if(sArgs.agg_quartile_flag==1){ 00613 vector<float> minAll, maxAll, Q1All, Q2All, Q3All; 00614 for(i=0; i<veceval.size(); i++){ 00615 float min, max, Q1, Q2, Q3; 00616 MinMaxQuartile(veceval[i], min, max, Q1, Q2, Q3); 00617 minAll.push_back(min); 00618 maxAll.push_back(max); 00619 Q1All.push_back(Q1); 00620 Q2All.push_back(Q2); 00621 Q3All.push_back(Q3); 00622 } 00623 result.resize(5); 00624 result[0] = vector<float>(); 00625 result[1] = vector<float>(); 00626 result[2] = vector<float>(); 00627 result[3] = vector<float>(); 00628 result[4] = vector<float>(); 00629 for(i=0; i<minAll.size(); i++){ 00630 result[0].push_back(minAll[i]); 00631 } 00632 for(i=0; i<maxAll.size(); i++){ 00633 result[1].push_back(maxAll[i]); 00634 } 00635 for(i=0; i<Q1All.size(); i++){ 00636 result[2].push_back(Q1All[i]); 00637 } 00638 for(i=0; i<Q2All.size(); i++){ 00639 result[3].push_back(Q2All[i]); 00640 } 00641 for(i=0; i<Q3All.size(); i++){ 00642 result[4].push_back(Q3All[i]); 00643 } 00644 return 0; 00645 } 00646 } 00647 00648 return true; 00649 } 00650 00651 int main( int iArgs, char** aszArgs ) { 00652 static const size_t c_iBuffer = 1024; 00653 #ifdef WIN32 00654 pthread_win32_process_attach_np( ); 00655 #endif // WIN32 00656 gengetopt_args_info sArgs; 00657 ifstream ifsm; 00658 istream* pistm; 00659 vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery; 00660 char acBuffer[ c_iBuffer ]; 00661 size_t i, j, k; 00662 00663 00664 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00665 cmdline_parser_print_help( ); 00666 return 1; } 00667 00668 /* reading gene-mapping file */ 00669 ifsm.open(sArgs.input_arg); 00670 pistm = &ifsm; 00671 00672 map<string, size_t> mapstriGenes; 00673 while( !pistm->eof( ) ) { 00674 pistm->getline( acBuffer, c_iBuffer - 1 ); 00675 acBuffer[ c_iBuffer - 1 ] = 0; 00676 vecstrLine.clear( ); 00677 CMeta::Tokenize( acBuffer, vecstrLine ); 00678 if( vecstrLine.size( ) < 2 ) { 00679 //cerr << "Ignoring line: " << acBuffer << endl; 00680 continue; 00681 } 00682 if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) { 00683 cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " 00684 << vecstrLine[ 1 ] << endl; 00685 return 1; 00686 } 00687 i--; 00688 if( vecstrGenes.size( ) <= i ) vecstrGenes.resize( i + 1 ); 00689 vecstrGenes[ i ] = vecstrLine[ 1 ]; 00690 mapstriGenes[vecstrGenes[i]] = i; 00691 } 00692 ifsm.close( ); 00693 00694 enum QUERY_MODE qmode; 00695 if(sArgs.single_flag==1) qmode = SINGLE_QUERY; 00696 else if(sArgs.aggregate_flag==1) qmode = MULTI_QUERY; 00697 else if(sArgs.multi_weight_flag==1) qmode = MULTI_DWEIGHT; 00698 00699 enum METRIC met; 00700 if(sArgs.rbp_flag==1) met = RBP; 00701 else if(sArgs.avgp_flag==1) met = AVGP; 00702 else if(sArgs.pr_flag==1) met = PR; 00703 else if(sArgs.pr_all_flag==1) met = PR_ALL; 00704 else if(sArgs.auc_flag==1) met = AUC; 00705 00706 //fprintf(stderr, "Query mode: %d\n", qmode); 00707 00708 if(qmode==SINGLE_QUERY){ 00709 if(sArgs.display_weight_flag==1){ 00710 vector<float> ww; 00711 CSeekTools::ReadArray(sArgs.weight_arg, ww); 00712 vector<string> vecstrDatasets, vecstrP; 00713 CSeekTools::ReadListTwoColumns(sArgs.dataset_map_arg, 00714 vecstrDatasets, vecstrP); 00715 vector<AResultFloat> sortedDatasets; 00716 sortedDatasets.resize(ww.size()); 00717 for(i=0; i<sortedDatasets.size(); i++){ 00718 sortedDatasets[i].i = i; 00719 sortedDatasets[i].f = ww[i]; 00720 } 00721 sort(sortedDatasets.begin(), sortedDatasets.end()); 00722 for(i=0; i<sortedDatasets.size(); i++){ 00723 fprintf(stderr, "%.2e\t%s\n", sortedDatasets[i].f, 00724 vecstrDatasets[sortedDatasets[i].i].c_str()); 00725 } 00726 return 0; 00727 } 00728 00729 string queryFile = sArgs.query_arg; 00730 vector<string> queryGenes; 00731 CSeekTools::ReadMultiGeneOneLine(queryFile, queryGenes); 00732 vector<utype> queryGeneID; 00733 for(i=0; i<queryGenes.size(); i++) 00734 queryGeneID.push_back(mapstriGenes[queryGenes[i]]); 00735 00736 string genescoreFile = sArgs.gscore_arg; 00737 vector<float> geneScores; 00738 CSeekTools::ReadArray(genescoreFile.c_str(), geneScores); 00739 //float maxScore = *std::max_element(geneScores.begin(), 00740 // geneScores.end()); 00741 00742 float nan = sArgs.nan_arg; 00743 vector<AResultFloat> sortedGenes; 00744 sortedGenes.resize(geneScores.size()); 00745 for(i=0; i<sortedGenes.size(); i++){ 00746 sortedGenes[i].i = i; 00747 sortedGenes[i].f = geneScores[i]; 00748 } 00749 00750 //Query genes themselves have lowest score, to prevent 00751 //them from being counted in PR 00752 for(i=0; i<queryGeneID.size(); i++) 00753 sortedGenes[queryGeneID[i]].f = nan; 00754 00755 sort(sortedGenes.begin(), sortedGenes.end()); 00756 00757 if(sArgs.p_value_flag==1){ 00758 string random_directory = sArgs.random_dir_arg; 00759 int num_random = sArgs.random_num_arg; 00760 int ii, jj; 00761 char ac[256]; 00762 vector<vector<int> > randomRank; 00763 vector<vector<float> > randomSc; 00764 vector<int> geneRank; 00765 00766 randomRank.resize(sortedGenes.size()); 00767 randomSc.resize(sortedGenes.size()); 00768 geneRank.resize(sortedGenes.size()); 00769 for(ii=0; ii<sortedGenes.size(); ii++){ 00770 randomRank[ii].resize(num_random); 00771 randomSc[ii].resize(num_random); 00772 } 00773 00774 for(ii=0; ii<num_random; ii++){ 00775 vector<float> randomScores; 00776 sprintf(ac, "%s/%d.gscore", random_directory.c_str(), ii); 00777 CSeekTools::ReadArray(ac, randomScores); 00778 vector<AResultFloat> sortedRandom; 00779 sortedRandom.resize(randomScores.size()); 00780 for(jj=0; jj<randomScores.size(); jj++){ 00781 sortedRandom[jj].i = jj; 00782 sortedRandom[jj].f = randomScores[jj]; 00783 } 00784 sort(sortedRandom.begin(), sortedRandom.end()); 00785 for(jj=0; jj<randomScores.size(); jj++){ 00786 randomRank[sortedRandom[jj].i][ii] = jj; 00787 randomSc[sortedRandom[jj].i][ii] = sortedRandom[jj].f; 00788 } 00789 } 00790 00791 for(jj=0; jj<geneScores.size(); jj++){ 00792 sort(randomRank[jj].begin(), randomRank[jj].end()); 00793 sort(randomSc[jj].begin(), randomSc[jj].end(), std::greater<float>()); 00794 geneRank[sortedGenes[jj].i] = jj; 00795 } 00796 00797 for(jj=0; jj<geneScores.size(); jj++){ 00798 int gene = sortedGenes[jj].i; 00799 int gene_rank = jj; 00800 float gene_score = sortedGenes[jj].f; 00801 if(gene_score<0) 00802 continue; 00803 vector<int> &rR = randomRank[gene]; 00804 vector<float> &rF = randomSc[gene]; 00805 int kk = 0; 00806 for(kk=0; kk<rR.size(); kk++){ 00807 //if(gene_rank<=rR[kk]){ 00808 if(gene_score>=rF[kk]){ 00809 //float f1 = (float) kk / (float) rR.size(); 00810 fprintf(stderr, "%s\t%d\t%d\t%.5e\t%.5e\n", vecstrGenes[gene].c_str(), 00811 gene_rank, kk, gene_score, randomSc[gene][kk]); 00812 break; 00813 }else if(kk==rR.size()-1){ 00814 //float f1 = (float) kk / (float) rR.size(); 00815 fprintf(stderr, "%s\t%d\t%d\t%.5e\t%.5e\n", vecstrGenes[gene].c_str(), 00816 gene_rank, kk, gene_score, randomSc[gene][kk]); 00817 //fprintf(stderr, "%d %.6f\n", gene_rank, f1); 00818 //fprintf(stderr, "%d %d / 100\n", gene_rank, kk); 00819 } 00820 } 00821 } 00822 00823 return 0; 00824 } 00825 00826 if(sArgs.dislay_only_flag==1){ 00827 for(i=0; i<15000; i++) 00828 fprintf(stderr, "%s\t%.5f\n", 00829 vecstrGenes[sortedGenes[i].i].c_str(), sortedGenes[i].f); 00830 return 0; 00831 } 00832 00833 string goldstdFile = sArgs.goldstd_arg; 00834 vector<string> goldstdGenes; 00835 CSeekTools::ReadMultiGeneOneLine(goldstdFile, goldstdGenes); 00836 vector<char> goldstdGenePresence; 00837 CSeekTools::InitVector(goldstdGenePresence, 00838 vecstrGenes.size(), (char) 0); 00839 00840 for(i=0; i<goldstdGenes.size(); i++) 00841 goldstdGenePresence[mapstriGenes[goldstdGenes[i]]] = 1; 00842 00843 if(met!=PR_ALL){ 00844 float eval; 00845 bool ret = EvaluateOneQuery(sArgs, met, sortedGenes, 00846 goldstdGenePresence, nan, eval); 00847 if(!ret) return 1; 00848 fprintf(stderr, "%.5f\n", eval); 00849 return 0; 00850 }else{ 00851 vector<float> evalAll; 00852 //fprintf(stderr, "Got here"); 00853 bool ret = EvaluateOneQuery(sArgs, met, sortedGenes, 00854 goldstdGenePresence, nan, evalAll); 00855 if(!ret) return 1; 00856 PrintVector(evalAll); 00857 return 0; 00858 } 00859 } 00860 00861 if(qmode == MULTI_QUERY){ 00862 string goldstdList = sArgs.goldstd_list_arg; 00863 vector<string> vecstrList; 00864 CSeekTools::ReadListOneColumn(goldstdList, vecstrList); 00865 vector<char> *goldstdGenePresence = 00866 new vector<char>[vecstrList.size()]; 00867 for(i=0; i<vecstrList.size(); i++){ 00868 vector<string> goldstdGenes; 00869 CSeekTools::ReadMultiGeneOneLine(vecstrList[i], goldstdGenes); 00870 CSeekTools::InitVector(goldstdGenePresence[i], 00871 vecstrGenes.size(), (char) 0); 00872 for(j=0; j<goldstdGenes.size(); j++) 00873 goldstdGenePresence[i][mapstriGenes[goldstdGenes[j]]] = 1; 00874 } 00875 00876 //fprintf(stderr, "Finished reading gold standard list\n"); 00877 00878 string queryList = sArgs.query_list_arg; 00879 vecstrList.clear(); 00880 CSeekTools::ReadListOneColumn(queryList, vecstrList); 00881 vector<utype> *queryGeneID = new vector<utype>[vecstrList.size()]; 00882 for(i=0; i<vecstrList.size(); i++){ 00883 vector<string> queryGenes; 00884 CSeekTools::ReadMultiGeneOneLine(vecstrList[i], queryGenes); 00885 for(j=0; j<queryGenes.size(); j++) 00886 queryGeneID[i].push_back(mapstriGenes[queryGenes[j]]); 00887 } 00888 00889 //fprintf(stderr, "Finished reading query list\n"); 00890 00891 vector<string> exclude_list; 00892 string excl = sArgs.exclude_list_arg; 00893 exclude_list.clear(); 00894 CSeekTools::ReadListOneColumn(excl, exclude_list); 00895 vector<char> *excludeGene = new vector<char>[vecstrList.size()]; 00896 for(i=0; i<exclude_list.size(); i++){ 00897 vector<string> ex; 00898 CSeekTools::ReadMultiGeneOneLine(exclude_list[i], ex); 00899 CSeekTools::InitVector(excludeGene[i], vecstrGenes.size(), (char) 0); 00900 for(j=0; j<ex.size(); j++) 00901 excludeGene[i][mapstriGenes[ex[j]]] = 1; 00902 } 00903 00904 vector<string> include_list; 00905 string incl = sArgs.include_list_arg; 00906 include_list.clear(); 00907 CSeekTools::ReadListOneColumn(incl, include_list); 00908 vector<char> *includeGene = new vector<char>[vecstrList.size()]; 00909 for(i=0; i<include_list.size(); i++){ 00910 vector<string> in; 00911 CSeekTools::ReadMultiGeneOneLine(include_list[i], in, 40000); 00912 CSeekTools::InitVector(includeGene[i], vecstrGenes.size(), (char) 0); 00913 for(j=0; j<in.size(); j++) 00914 includeGene[i][mapstriGenes[in[j]]] = 1; 00915 } 00916 00917 00918 string genescoreList = sArgs.gscore_list_arg; 00919 vecstrList.clear(); 00920 CSeekTools::ReadListOneColumn(genescoreList, vecstrList); 00921 vector<float> *geneScores = new vector<float>[vecstrList.size()]; 00922 vector<AResultFloat> *sortedGenes = 00923 new vector<AResultFloat>[vecstrList.size()]; 00924 float nan = sArgs.nan_arg; 00925 for(i=0; i<vecstrList.size(); i++){ 00926 CSeekTools::ReadArray(vecstrList[i].c_str(), geneScores[i]); 00927 //maxScore[i] = *std::max_element(geneScores[i].begin(), 00928 // geneScores[i].end()); 00929 sortedGenes[i].resize(geneScores[i].size()); 00930 for(j=0; j<sortedGenes[i].size(); j++){ 00931 sortedGenes[i][j].i = j; 00932 sortedGenes[i][j].f = geneScores[i][j]; 00933 if(isnan(geneScores[i][j]) || isinf(geneScores[i][j])){ 00934 sortedGenes[i][j].f = nan; 00935 } 00936 //fprintf(stderr, "%.5f\n", sortedGenes[i][j].f); 00937 } 00938 } 00939 00940 //fprintf(stderr, "Got here"); 00941 00942 vector<vector<float> > result; 00943 DoAggregate(sArgs, met, sortedGenes, queryGeneID, vecstrList.size(), 00944 goldstdGenePresence, excludeGene, includeGene, result); 00945 00946 00947 /* 00948 if(sArgs.fold_over_random_flag==1){ 00949 vector<AResultFloat> *randomScores = new vector<AResultFloat>[vecstrList.size()]; 00950 //should shuffle only within annotated genes 00951 for(i=0; i<vecstrList.size(); i++){ 00952 GetRandom(rnd, sortedGenes[i], randomScores[i], excludeGene[i], 00953 includeGene[i], queryGeneID[i], nan); 00954 } 00955 vector<vector<float> > random_result; 00956 DoAggregate(sArgs, met, randomScores, queryGeneID, vecstrList.size(), 00957 goldstdGenePresence, excludeGene, includeGene, random_result); 00958 00959 vector<vector<float> > fold; 00960 fold.resize(result.size()); 00961 for(i=0; i<result.size(); i++){ 00962 fold[i] = vector<float>(); 00963 fold[i].resize(result[i].size()); 00964 for(j=0; j<result[i].size(); j++){ 00965 if(random_result[i][j]==result[i][j]){ 00966 fold[i][j] = 1.0; 00967 }else if(random_result[i][j]==0){ 00968 fold[i][j] = 1.0; 00969 }else{ 00970 fold[i][j] = result[i][j]/random_result[i][j]; 00971 } 00972 fprintf(stderr, "%.2f %d %d %.2f %.2f\n", fold[i][j], i, j, result[i][j], random_result[i][j]); 00973 } 00974 } 00975 PrintResult(fold); 00976 */ 00977 //}else{ 00978 PrintResult(result); 00979 //} 00980 00981 } 00982 00983 if(qmode == MULTI_DWEIGHT){ 00984 string dweightList = sArgs.dweight_list_arg; 00985 vector<string> vecstrList; 00986 CSeekTools::ReadListOneColumn(dweightList, vecstrList); 00987 int numDataset = 0; 00988 00989 for(j=0; j<vecstrList.size(); j++){ 00990 vector<float> ww; 00991 CSeekTools::ReadArray(vecstrList[j].c_str(), ww); 00992 vector<AResultFloat> sortedDatasets; 00993 sortedDatasets.resize(ww.size()); 00994 numDataset = ww.size(); 00995 for(i=0; i<sortedDatasets.size(); i++){ 00996 sortedDatasets[i].i = i; 00997 sortedDatasets[i].f = ww[i]; 00998 } 00999 sort(sortedDatasets.begin(), sortedDatasets.end()); 01000 /*for(i=0; i<sortedDatasets.size(); i++){ 01001 fprintf(stderr, "%.2e\t%s\n", sortedDatasets[i].f, 01002 vecstrDatasets[sortedDatasets[i].i].c_str()); 01003 }*/ 01004 vector<int> depth; 01005 depth.push_back((int) ((float)numDataset * 0.005)); 01006 depth.push_back((int) ((float)numDataset * 0.01)); 01007 depth.push_back((int) ((float)numDataset * 0.05)); 01008 depth.push_back((int) ((float)numDataset * 0.10)); 01009 depth.push_back((int) ((float)numDataset * 0.20)); 01010 depth.push_back((int) ((float)numDataset * 0.50)); 01011 vector<float> avg; 01012 for(i=0; i<depth.size(); i++){ 01013 float a = 0; 01014 for(k=0; k<depth[i]; k++) 01015 a+=sortedDatasets[k].f; 01016 a /= (float) depth[i]; 01017 avg.push_back(a); 01018 } 01019 for(i=0; i<depth.size(); i++){ 01020 fprintf(stderr, "%.3e\t", avg[i]); 01021 } 01022 fprintf(stderr, "\n"); 01023 } 01024 01025 } 01026 01027 01028 #ifdef WIN32 01029 pthread_win32_process_detach_np( ); 01030 #endif // WIN32 01031 return 0; }