Sleipnir
tools/SeekEvaluator/SeekEvaluator.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 enum QUERY_MODE{
00026     SINGLE_QUERY, MULTI_QUERY, MULTI_DWEIGHT
00027 };
00028 
00029 enum METRIC{
00030     RBP, AVGP, PR, PR_ALL, AUC
00031 };
00032 
00033 bool GetRandom(gsl_rng *r, const vector<AResultFloat> &geneScore, 
00034     vector<AResultFloat> &random, vector<char> &excludeGene,
00035     vector<char> &includeGene, vector<utype> &queryGeneID, 
00036     const float nan){
00037 
00038     int i, j;
00039     int ss = 0;
00040 
00041     vector<char> q;
00042     q.clear();
00043     CSeekTools::InitVector(q, geneScore.size(), (char) 0);
00044     for(j=0; j<queryGeneID.size(); j++)
00045         q[queryGeneID[j]] = 1;
00046 
00047     for(i=0; i<geneScore.size(); i++){
00048         if(includeGene[geneScore[i].i]==0){
00049         }else if(q[geneScore[i].i]==1){
00050         }else if(excludeGene[geneScore[i].i]==1){
00051         }else{
00052             ss++;
00053         }
00054     }
00055 
00056     float *gs = (float*)malloc(ss*sizeof(float));
00057     random.clear();
00058     random.resize(geneScore.size());
00059     
00060     int ii = 0;
00061     for(i=0; i<geneScore.size(); i++){
00062         if(includeGene[geneScore[i].i]==0){
00063         }else if(q[geneScore[i].i]==1){
00064         }else if(excludeGene[geneScore[i].i]==1){
00065         }else{
00066             //gs[ii] = geneScore[i].f;
00067             gs[ii] = (float) ii;
00068             ii++;
00069         }
00070     }
00071 
00072     gsl_ran_shuffle(r, gs, ss, sizeof(float));
00073 
00074     ii = 0;
00075     for(i=0; i<geneScore.size(); i++){
00076         random[i].i = geneScore[i].i;
00077         random[i].f = nan;
00078         if(includeGene[geneScore[i].i]==0){
00079         }else if(q[geneScore[i].i]==1){
00080         }else if(excludeGene[geneScore[i].i]==1){
00081         }else{
00082             random[i].f = gs[ii];
00083             ii++;
00084         }
00085     }
00086     free(gs);
00087     return true;
00088 }
00089 
00090 float RankBiasedPrecision(const float &rate,
00091     const vector<AResultFloat> &sortedScore, const vector<char> &gold_std,
00092     const float &nanVal){
00093 
00094     vector<AResultFloat>::const_iterator iterScore = sortedScore.begin();
00095     float x = 0;
00096     utype i;
00097     for(i=0; iterScore!=sortedScore.end(); i++, iterScore++){
00098         //if(iterScore->f == nanVal) continue;
00099         if(gold_std[iterScore->i]==1) x += pow(rate, i);
00100     }
00101     x*=(1.0-rate);
00102     return x;
00103 }
00104 
00105 vector<float>* Precision(const vector<AResultFloat> &sortedScore,
00106     const vector<char> &gold_std, const float &nanVal){
00107 
00108     utype i, numPos = 1;
00109     vector<float> *r = new vector<float>();
00110 
00111     vector<AResultFloat>::const_iterator iterScore = sortedScore.begin();
00112     for(i=0; iterScore!=sortedScore.end(); i++, iterScore++){
00113         //if(iterScore->f == nanVal) continue;
00114         if(gold_std[iterScore->i]==1){
00115             //fprintf(stderr, "%d %d\n", numPos, i);
00116             r->push_back((float) numPos++ / (float) (i + 1));
00117         }
00118     }
00119     r->resize(r->size());
00120     return r;
00121 }
00122 
00123 bool MeanStandardDeviation(const vector<float> &f, float &avg, float &stdev){
00124     avg = 0;
00125     stdev = 0;
00126     avg = std::accumulate(f.begin(), f.end(), (float) 0);
00127     avg /= (float) f.size();
00128     vector<float>::const_iterator iterF = f.begin();
00129     for(; iterF!=f.end(); iterF++) stdev += (*iterF - avg) * (*iterF - avg);
00130     stdev /= (float) f.size();
00131     stdev = sqrt(stdev);
00132     return true;
00133 }
00134 
00135 bool MinMaxQuartile(vector<float> &f, float &min, float &max, float &Q1,
00136         float &Q2, float &Q3){
00137     min = *min_element(f.begin(), f.end());
00138     max = *max_element(f.begin(), f.end());
00139     Q1 = 0;
00140     Q2 = 0;
00141     Q3 = 0;
00142 
00143     if(f.size()==1){
00144         Q1 = f[0];
00145         Q2 = Q1;
00146         Q3 = Q2;
00147         return true;
00148     }
00149     if(f.size()==2){
00150         Q1 = f[0];
00151         Q2 = (f[0] + f[1] ) / 2.0;
00152         Q3 = f[1];
00153         return true;
00154     }
00155     if(f.size()==3){
00156         Q1 = (f[0] + f[1] ) / 2.0;
00157         Q2 = f[1];
00158         Q3 = (f[1] + f[2]) / 2.0;
00159         return true;
00160     }
00161 
00162     if(f.size()%2==0){
00163         int m1 = f.size()/2 - 1;
00164         int m2 = f.size()/2;
00165         std::nth_element(f.begin(), f.begin()+m1, f.end());
00166         float f1 = f[m1];
00167         std::nth_element(f.begin(), f.begin()+m2, f.end());
00168         float f2 = f[m2];
00169         Q2 = (f1 + f2) / 2.0;
00170 
00171         int s1 = m1 + 1;
00172         if(s1%2==0){
00173             int m3 = s1/2 - 1;
00174             int m4 = s1/2;
00175             std::nth_element(f.begin(), f.begin()+m3, f.end());
00176             float f3 = f[m3];
00177             std::nth_element(f.begin(), f.begin()+m4, f.end());
00178             float f4 = f[m4];
00179             Q1 = (f3 + f4) / 2.0;
00180         }else{
00181             int m3 = s1/2;
00182             std::nth_element(f.begin(), f.begin()+m3, f.end());
00183             float f3 = f[m3];
00184             Q1 = f3;
00185         }
00186 
00187         int s2 = (f.size()-1) - m2 + 1;
00188         if(s2%2==0){
00189             int m3 = m2 + s2/2 - 1;
00190             int m4 = m2 + s2/2;
00191             std::nth_element(f.begin(), f.begin()+m3, f.end());
00192             float f3 = f[m3];
00193             std::nth_element(f.begin(), f.begin()+m4, f.end());
00194             float f4 = f[m4];
00195             Q3 = (f3 + f4) / 2.0;
00196         }else{
00197             int m3 = m2 + s2/2;
00198             std::nth_element(f.begin(), f.begin()+m3, f.end());
00199             float f3 = f[m3];
00200             Q3 = f3;
00201         }
00202 
00203     }else{
00204         int m1 = f.size()/2;
00205         std::nth_element(f.begin(), f.begin()+m1, f.end());
00206         float f1 = f[m1];
00207         Q2 = f1;
00208 
00209         int s1 = m1 - 1 + 1;
00210         if(s1%2==0){
00211             int m3 = s1/2 - 1;
00212             int m4 = s1/2;
00213             std::nth_element(f.begin(), f.begin()+m3, f.end());
00214             float f3 = f[m3];
00215             std::nth_element(f.begin(), f.begin()+m4, f.end());
00216             float f4 = f[m4];
00217             Q1 = (f3 + f4) / 2.0;
00218         }else{
00219             int m3 = s1/2;
00220             std::nth_element(f.begin(), f.begin()+m3, f.end());
00221             float f3 = f[m3];
00222             Q1 = f3;
00223         }
00224 
00225         int s2 = (f.size()-1) - (m1 + 1) + 1;
00226         if(s2%2==0){
00227             int m3 = m1 + 1 + s2/2 - 1;
00228             int m4 = m1 + 1 + s2/2;
00229             std::nth_element(f.begin(), f.begin()+m3, f.end());
00230             float f3 = f[m3];
00231             std::nth_element(f.begin(), f.begin()+m4, f.end());
00232             float f4 = f[m4];
00233             Q3 = (f3 + f4) / 2.0;
00234         }else{
00235             int m3 = m1 + 1 + s2/2;
00236             std::nth_element(f.begin(), f.begin()+m3, f.end());
00237             float f3 = f[m3];
00238             Q3 = f3;
00239         }
00240     }
00241     return true;
00242 }
00243 
00244 bool EvaluateOneQuery(const gengetopt_args_info &sArgs, const enum METRIC &met,
00245     const vector<AResultFloat> &sortedGenes,
00246     const vector<char> &goldstdGenePresence, const float &nan,
00247     vector<float> &eval){
00248     size_t i;
00249     eval.clear();
00250     //metric
00251     if(met==PR_ALL){
00252         vector<float> *vf = Precision(sortedGenes, goldstdGenePresence, nan);
00253         for(i=0; i<vf->size(); i++) eval.push_back(vf->at(i));
00254         delete vf;
00255         return true;
00256     }
00257     fprintf(stderr, "Invalid option!\n");
00258     return false;
00259 }
00260 
00261 
00262 bool EvaluateOneQuery(const gengetopt_args_info &sArgs, const enum METRIC &met,
00263     const vector<AResultFloat> &sortedGenes,
00264     const vector<char> &goldstdGenePresence, const float &nan, float &eval){
00265     size_t i;
00266     //metric
00267     if(met==RBP){
00268         float rate = sArgs.rbp_p_arg;
00269         float rbp = RankBiasedPrecision(rate, sortedGenes,
00270             goldstdGenePresence, nan);
00271         eval = rbp;
00272         //fprintf(stdout, "%.5f\n", rbp);
00273         return true;
00274     }
00275     else if(met==AVGP || met==PR){
00276         vector<float> *vf = Precision(sortedGenes, goldstdGenePresence, nan);
00277         /*if(met==PR_ALL){
00278             for(i=0; i<vf->size()-1; i++){
00279                 fprintf(stdout, "%.5f ", vf->at(i));
00280                 }
00281                 fprintf(stdout, "%.5f\n", vf->at(i));
00282                 delete vf;
00283                 return true;
00284             }*/
00285 
00286         if(sArgs.x_int_arg==-1 && sArgs.x_per_arg==0){
00287             fprintf(stderr, "Must specify --x_int or --x_per\n");
00288             delete vf;
00289             return false;
00290         }
00291 
00292         int X = -1;
00293         if(sArgs.x_int_arg!=-1 && sArgs.x_per_arg==0){
00294             X = sArgs.x_int_arg;
00295             if(X>vf->size()){
00296                 fprintf(stderr, "Error: X is too large (>%d)\n", vf->size());
00297                 delete vf;
00298                 return false;
00299             }
00300         }
00301         else if(sArgs.x_int_arg==-1 && sArgs.x_per_arg>0){
00302             float per = sArgs.x_per_arg;
00303             if(per>1.0){
00304                 fprintf(stderr, "Error: percentage is above 1.0\n");
00305                 delete vf;
00306                 return false;
00307             }
00308             X = (int) ( (float) per * (float) vf->size() );
00309             //fprintf(stderr, "%.5f %d %d", per, vf->size(), X);
00310             X = std::max(1, X);
00311         }
00312 
00313         if(met==AVGP){
00314             float avg = std::accumulate(vf->begin(), vf->begin()+X, (float) 0);
00315             avg /= (float) X;
00316             //fprintf(stdout, "%.5f\n", avg);
00317             eval = avg;
00318             delete vf;
00319             return true;
00320         }
00321         if(met==PR){
00322             //fprintf(stdout, "%.5f\n", vf->at(X-1));
00323             eval = vf->at(X-1);
00324             delete vf;
00325             return true;
00326         }
00327     }
00328 
00329     fprintf(stderr, "Invalid option!\n");
00330     return false;
00331 }
00332 
00333 void PrintVector(const vector<float> &f){
00334     vector<float>::const_iterator iterF = f.begin();
00335     vector<float>::const_iterator end = f.begin() + f.size() - 1;
00336     for(; iterF!=end; iterF++) fprintf(stderr, "%.5f ", *iterF);
00337     fprintf(stderr, "%.5f\n", *iterF);
00338 }
00339 
00340 void PrintResult(vector< vector<float> > f){
00341     int i;
00342     for(i=0; i<f.size(); i++){
00343         PrintVector(f[i]);
00344     }
00345 }
00346 
00347 bool DoAggregate(const gengetopt_args_info &sArgs, const enum METRIC &met, 
00348     vector<AResultFloat> *sortedGenes, vector<utype> *queryGeneID, 
00349     int listSize, vector<char> *goldstdGenePresence, vector<char> *excludeGene,
00350     vector<char> *includeGene, 
00351     vector< vector<float> > &result
00352 
00353     ){
00354 
00355     //fprintf(stderr, "Finished reading gene score list\n");
00356     vector<AResultFloat> *master = NULL;
00357     vector<AResultFloat> master_score;
00358     vector<AResultFloat> master_rank;
00359     utype i, j;
00360     float nan = sArgs.nan_arg;
00361     result.clear();
00362 
00363     vector<char> *q = new vector<char>[listSize];
00364     for(i=0; i<listSize; i++){
00365         q[i] = vector<char>();
00366         CSeekTools::InitVector(q[i], sortedGenes[i].size(), (char) 0);
00367         for(j=0; j<queryGeneID[i].size(); j++)
00368             q[i][queryGeneID[i][j]] = 1;
00369     }
00370 
00371     if(sArgs.agg_ranksum_flag==1 || sArgs.agg_scoresum_flag==1){
00372         //Gold standard must be the same across all queries for this mode!!
00373         //ASSUME THIS IS TRUE
00374         for(i=0; i<listSize; i++){
00375             for(j=0; j<sortedGenes[0].size(); j++){
00376                 if(q[i][sortedGenes[i][j].i]==1){
00377                     sortedGenes[i][j].f = nan;
00378                 }
00379                 if(excludeGene[i][sortedGenes[i][j].i]==1){
00380                     sortedGenes[i][j].f = nan;
00381                 }
00382             }
00383         }
00384 
00385         if(sArgs.agg_scoresum_flag==1){
00386             master_score.resize(sortedGenes[0].size());
00387 
00388             for(j=0; j<sortedGenes[0].size(); j++){
00389                 master_score[j].i = j;
00390                 master_score[j].f = 0.0;
00391             }
00392 
00393             for(j=0; j<sortedGenes[0].size(); j++)
00394                 for(i=0; i<listSize; i++)
00395                     master_score[sortedGenes[i][j].i].f += sortedGenes[i][j].f;
00396 
00397             for(j=0; j<sortedGenes[0].size(); j++)
00398                 master_score[j].f /= (float)listSize;
00399 
00400             sort(master_score.begin(), master_score.end());
00401 
00402             master = &master_score;
00403         }
00404 
00405         else if(sArgs.agg_ranksum_flag==1){
00406             //fprintf(stderr, "Got here 2\n"); 
00407             master_rank.resize(sortedGenes[0].size());
00408 
00409             //fprintf(stderr, "Got here 2a\n"); 
00410 
00411             for(i=0; i<listSize; i++){
00412                 sort(sortedGenes[i].begin(), sortedGenes[i].end());
00413                 //fprintf(stderr, "Sort %d fine\n", i); 
00414             }   
00415 
00416             //fprintf(stderr, "Got here 2b\n"); 
00417 
00418             for(j=0; j<sortedGenes[0].size(); j++){
00419                 master_rank[j].i = j;
00420                 master_rank[j].f = 0;
00421             }
00422 
00423             for(i=0; i<listSize; i++)
00424                 for(j=0; j<sortedGenes[i].size(); j++)
00425                     master_rank[sortedGenes[i][j].i].f +=
00426                         (float) sortedGenes[i].size() - (float) j;
00427 
00428             for(j=0; j<sortedGenes[0].size(); j++)
00429                 master_rank[j].f /= (float) listSize;
00430 
00431             sort(master_rank.begin(), master_rank.end());
00432 
00433             master = &master_rank;
00434 
00435             //fprintf(stderr, "Got here 3\n"); 
00436         }
00437 
00438         if(met!=PR_ALL){
00439             float eval;
00440             bool ret = EvaluateOneQuery(sArgs, met, *master,
00441                 goldstdGenePresence[0], CMeta::GetNaN(), eval);
00442             if(!ret) return 1;
00443             result.resize(1);
00444             result[0] = vector<float>();
00445             result[0].push_back(eval);
00446             return 0;
00447         }else{
00448             vector<float> evalAll;
00449             bool ret = EvaluateOneQuery(sArgs, met, *master,
00450                 goldstdGenePresence[0], CMeta::GetNaN(), evalAll);
00451             if(!ret) return 1;
00452             result.resize(1);
00453             result[0] = vector<float>();
00454             for(i=0; i<evalAll.size(); i++) 
00455                 result[0].push_back(evalAll[i]);
00456             return 0;
00457         }
00458     }
00459 
00460     //Not Rank Sum
00461     vector<float> eval;
00462     vector< vector<float> > vecevalAll;
00463 
00464     eval.resize(listSize);
00465     vecevalAll.resize(listSize);
00466 
00467     for(i=0; i<listSize; i++){
00468         for(j=0; j<sortedGenes[i].size(); j++){
00469             //NEW
00470             if(includeGene[i][sortedGenes[i][j].i]==0){
00471                 sortedGenes[i][j].f = nan;
00472             }
00473             if(q[i][sortedGenes[i][j].i]==1){
00474                 sortedGenes[i][j].f = nan;
00475             }
00476             if(excludeGene[i][sortedGenes[i][j].i]==1){
00477                 sortedGenes[i][j].f = nan;
00478             }
00479         }
00480 
00481         sort(sortedGenes[i].begin(), sortedGenes[i].end());
00482 
00483         if(met!=PR_ALL){
00484             float fEval;
00485             bool ret = EvaluateOneQuery(sArgs, met, sortedGenes[i],
00486                 goldstdGenePresence[i], nan, fEval);
00487             if(!ret) return 1;
00488             eval[i] = fEval;
00489 
00490         }else{
00491             vector<float> evalAll;
00492             bool ret = EvaluateOneQuery(sArgs, met, sortedGenes[i],
00493                 goldstdGenePresence[i], nan, evalAll);
00494             if(!ret) return 1;
00495             vecevalAll[i] = evalAll;
00496         }
00497     }
00498 
00499     if(sArgs.fold_over_random_flag==1){
00500         const gsl_rng_type *T;
00501         gsl_rng *rnd;
00502         gsl_rng_env_setup();
00503         T = gsl_rng_default;
00504         rnd = gsl_rng_alloc(T);
00505 
00506         vector<AResultFloat> *randomScores = 
00507             new vector<AResultFloat>[listSize];
00508 
00509         vector<float> random_eval;
00510         vector< vector<float> > random_vecevalAll;
00511         random_eval.resize(listSize);
00512         random_vecevalAll.resize(listSize);
00513 
00514         //should shuffle only within annotated genes
00515         for(i=0; i<listSize; i++){
00516             GetRandom(rnd, sortedGenes[i], randomScores[i], excludeGene[i], 
00517                 includeGene[i], queryGeneID[i], nan);
00518             sort(randomScores[i].begin(), randomScores[i].end());
00519             if(met!=PR_ALL){
00520                 float fEval;
00521                 bool ret = EvaluateOneQuery(sArgs, met, randomScores[i],
00522                     goldstdGenePresence[i], nan, fEval);
00523                 if(!ret) return 1;
00524                 random_eval[i] = fEval;
00525                 //calculate fold
00526                 if(random_eval[i]==eval[i]) eval[i] = 1.0;
00527                 else if(random_eval[i]==0) eval[i] = 1.0;
00528                 else eval[i] = eval[i] / random_eval[i];
00529             }else{
00530                 vector<float> evalAll;
00531                 bool ret = EvaluateOneQuery(sArgs, met, randomScores[i],
00532                     goldstdGenePresence[i], nan, evalAll);
00533                 if(!ret) return 1;
00534                 random_vecevalAll[i] = evalAll;
00535                 int ii=0; 
00536                 //calculate fold
00537                 for(ii=0; ii<random_vecevalAll[i].size(); ii++){
00538                     if(random_vecevalAll[i][ii]==vecevalAll[i][ii])
00539                         vecevalAll[i][ii] = 1.0;
00540                     else if(random_vecevalAll[i][ii]==0)
00541                         vecevalAll[i][ii] = 1.0;
00542                     else
00543                         vecevalAll[i][ii] /= random_vecevalAll[i][ii];
00544                 }
00545             }
00546             //fprintf(stderr, "Got here!\n");   
00547         }
00548         //fprintf(stderr, "Got here\n");
00549         gsl_rng_free(rnd);
00550         delete[] randomScores;      
00551         //fprintf(stderr, "Got here 2\n");
00552     }
00553 
00554     if(met!=PR_ALL){
00555         if(sArgs.agg_avg_flag==1){
00556             float avg, stdev;
00557             MeanStandardDeviation(eval, avg, stdev);
00558             result.resize(1);
00559             result[0] = vector<float>();
00560             result[0].push_back(avg);
00561             result[0].push_back(stdev);
00562             return 0;
00563         }
00564         if(sArgs.agg_quartile_flag==1){
00565             float min, max, Q1, Q2, Q3;
00566             MinMaxQuartile(eval, min, max, Q1, Q2, Q3);
00567             result.resize(2);
00568             result[0] = vector<float>();
00569             result[0].push_back(min);
00570             result[0].push_back(max);
00571             result[1] = vector<float>();
00572             result[1].push_back(Q1);
00573             result[1].push_back(Q2);
00574             result[1].push_back(Q3);
00575             return 0;
00576         }
00577         if(sArgs.display_all_flag==1){
00578             result.resize(eval.size());
00579             for(i=0; i<eval.size(); i++){
00580                 result[i] = vector<float>();
00581                 result[i].push_back(eval[i]);
00582             }
00583             return 0;
00584         }
00585 
00586     }else{
00587         vector< vector<float> > veceval;
00588         veceval.resize(vecevalAll[0].size());
00589         for(i=0; i<vecevalAll.size(); i++)
00590             for(j=0; j<vecevalAll[i].size(); j++)
00591                 veceval[j].push_back(vecevalAll[i][j]);
00592 
00593         if(sArgs.agg_avg_flag==1){
00594             vector<float> avgAll, stdevAll;
00595             for(i=0; i<veceval.size(); i++){
00596                 float avg, stdev;
00597                 MeanStandardDeviation(veceval[i], avg, stdev);
00598                 avgAll.push_back(avg);
00599                 stdevAll.push_back(stdev);
00600             }
00601             result.resize(2);
00602             result[0] = vector<float>();
00603             result[1] = vector<float>();
00604             for(i=0; i<avgAll.size(); i++){
00605                 result[0].push_back(avgAll[i]);
00606             }
00607             for(i=0; i<stdevAll.size(); i++){
00608                 result[1].push_back(stdevAll[i]);
00609             }
00610             return 0;
00611         }
00612         else if(sArgs.agg_quartile_flag==1){
00613             vector<float> minAll, maxAll, Q1All, Q2All, Q3All;
00614             for(i=0; i<veceval.size(); i++){
00615                 float min, max, Q1, Q2, Q3;
00616                 MinMaxQuartile(veceval[i], min, max, Q1, Q2, Q3);
00617                 minAll.push_back(min);
00618                 maxAll.push_back(max);
00619                 Q1All.push_back(Q1);
00620                 Q2All.push_back(Q2);
00621                 Q3All.push_back(Q3);
00622             }
00623             result.resize(5);
00624             result[0] = vector<float>();
00625             result[1] = vector<float>();
00626             result[2] = vector<float>();
00627             result[3] = vector<float>();
00628             result[4] = vector<float>();
00629             for(i=0; i<minAll.size(); i++){
00630                 result[0].push_back(minAll[i]);
00631             }
00632             for(i=0; i<maxAll.size(); i++){
00633                 result[1].push_back(maxAll[i]);
00634             }
00635             for(i=0; i<Q1All.size(); i++){
00636                 result[2].push_back(Q1All[i]);
00637             }
00638             for(i=0; i<Q2All.size(); i++){
00639                 result[3].push_back(Q2All[i]);
00640             }
00641             for(i=0; i<Q3All.size(); i++){
00642                 result[4].push_back(Q3All[i]);
00643             }
00644             return 0;
00645         }
00646     }
00647 
00648     return true;
00649 }
00650 
00651 int main( int iArgs, char** aszArgs ) {
00652     static const size_t c_iBuffer   = 1024;
00653 #ifdef WIN32
00654     pthread_win32_process_attach_np( );
00655 #endif // WIN32
00656     gengetopt_args_info sArgs;
00657     ifstream            ifsm;
00658     istream*            pistm;
00659     vector<string>      vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery;
00660     char                acBuffer[ c_iBuffer ];
00661     size_t              i, j, k;
00662 
00663 
00664     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00665         cmdline_parser_print_help( );
00666         return 1; }
00667 
00668     /* reading gene-mapping file */
00669     ifsm.open(sArgs.input_arg);
00670     pistm = &ifsm;
00671 
00672     map<string, size_t> mapstriGenes;
00673     while( !pistm->eof( ) ) {
00674         pistm->getline( acBuffer, c_iBuffer - 1 );
00675         acBuffer[ c_iBuffer - 1 ] = 0;
00676         vecstrLine.clear( );
00677         CMeta::Tokenize( acBuffer, vecstrLine );
00678         if( vecstrLine.size( ) < 2 ) {
00679             //cerr << "Ignoring line: " << acBuffer << endl;
00680             continue;
00681         }
00682         if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
00683             cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for "
00684                 << vecstrLine[ 1 ] << endl;
00685             return 1;
00686         }
00687         i--;
00688         if( vecstrGenes.size( ) <= i ) vecstrGenes.resize( i + 1 );
00689         vecstrGenes[ i ] = vecstrLine[ 1 ];
00690         mapstriGenes[vecstrGenes[i]] = i;
00691     }
00692     ifsm.close( );
00693 
00694     enum QUERY_MODE qmode;
00695     if(sArgs.single_flag==1) qmode = SINGLE_QUERY;
00696     else if(sArgs.aggregate_flag==1) qmode = MULTI_QUERY;
00697     else if(sArgs.multi_weight_flag==1) qmode = MULTI_DWEIGHT;
00698 
00699     enum METRIC met;
00700     if(sArgs.rbp_flag==1) met = RBP;
00701     else if(sArgs.avgp_flag==1) met = AVGP;
00702     else if(sArgs.pr_flag==1) met = PR;
00703     else if(sArgs.pr_all_flag==1) met = PR_ALL;
00704     else if(sArgs.auc_flag==1) met = AUC;
00705 
00706     //fprintf(stderr, "Query mode: %d\n", qmode);
00707 
00708     if(qmode==SINGLE_QUERY){
00709         if(sArgs.display_weight_flag==1){
00710             vector<float> ww;
00711             CSeekTools::ReadArray(sArgs.weight_arg, ww);
00712             vector<string> vecstrDatasets, vecstrP;
00713             CSeekTools::ReadListTwoColumns(sArgs.dataset_map_arg, 
00714                 vecstrDatasets, vecstrP);
00715             vector<AResultFloat> sortedDatasets;
00716             sortedDatasets.resize(ww.size());
00717             for(i=0; i<sortedDatasets.size(); i++){
00718                 sortedDatasets[i].i = i;
00719                 sortedDatasets[i].f = ww[i];
00720             }
00721             sort(sortedDatasets.begin(), sortedDatasets.end());
00722             for(i=0; i<sortedDatasets.size(); i++){
00723                 fprintf(stderr, "%.2e\t%s\n", sortedDatasets[i].f, 
00724                     vecstrDatasets[sortedDatasets[i].i].c_str());
00725             }
00726             return 0;
00727         }
00728 
00729         string queryFile = sArgs.query_arg;
00730         vector<string> queryGenes;
00731         CSeekTools::ReadMultiGeneOneLine(queryFile, queryGenes);
00732         vector<utype> queryGeneID;
00733         for(i=0; i<queryGenes.size(); i++)
00734             queryGeneID.push_back(mapstriGenes[queryGenes[i]]);
00735 
00736         string genescoreFile = sArgs.gscore_arg;
00737         vector<float> geneScores;
00738         CSeekTools::ReadArray(genescoreFile.c_str(), geneScores);
00739         //float maxScore = *std::max_element(geneScores.begin(),
00740         //  geneScores.end());
00741 
00742         float nan = sArgs.nan_arg;
00743         vector<AResultFloat> sortedGenes;
00744         sortedGenes.resize(geneScores.size());
00745         for(i=0; i<sortedGenes.size(); i++){
00746             sortedGenes[i].i = i;
00747             sortedGenes[i].f = geneScores[i];
00748         }
00749 
00750         //Query genes themselves have lowest score, to prevent
00751         //them from being counted in PR
00752         for(i=0; i<queryGeneID.size(); i++)
00753             sortedGenes[queryGeneID[i]].f = nan;
00754 
00755         sort(sortedGenes.begin(), sortedGenes.end());
00756 
00757         if(sArgs.p_value_flag==1){
00758             string random_directory = sArgs.random_dir_arg;
00759             int num_random = sArgs.random_num_arg;
00760             int ii, jj;
00761             char ac[256];
00762             vector<vector<int> > randomRank;
00763             vector<vector<float> > randomSc;
00764             vector<int> geneRank;
00765 
00766             randomRank.resize(sortedGenes.size());
00767             randomSc.resize(sortedGenes.size());
00768             geneRank.resize(sortedGenes.size());
00769             for(ii=0; ii<sortedGenes.size(); ii++){
00770                 randomRank[ii].resize(num_random);
00771                 randomSc[ii].resize(num_random);
00772             }
00773 
00774             for(ii=0; ii<num_random; ii++){
00775                 vector<float> randomScores;
00776                 sprintf(ac, "%s/%d.gscore", random_directory.c_str(), ii);
00777                 CSeekTools::ReadArray(ac, randomScores);
00778                 vector<AResultFloat> sortedRandom;
00779                 sortedRandom.resize(randomScores.size());
00780                 for(jj=0; jj<randomScores.size(); jj++){
00781                     sortedRandom[jj].i = jj;
00782                     sortedRandom[jj].f = randomScores[jj];
00783                 }
00784                 sort(sortedRandom.begin(), sortedRandom.end());
00785                 for(jj=0; jj<randomScores.size(); jj++){
00786                     randomRank[sortedRandom[jj].i][ii] = jj;
00787                     randomSc[sortedRandom[jj].i][ii] = sortedRandom[jj].f;
00788                 }
00789             }
00790 
00791             for(jj=0; jj<geneScores.size(); jj++){
00792                 sort(randomRank[jj].begin(), randomRank[jj].end());
00793                 sort(randomSc[jj].begin(), randomSc[jj].end(), std::greater<float>());
00794                 geneRank[sortedGenes[jj].i] = jj;
00795             }
00796 
00797             for(jj=0; jj<geneScores.size(); jj++){
00798                 int gene = sortedGenes[jj].i;
00799                 int gene_rank = jj;
00800                 float gene_score = sortedGenes[jj].f;
00801                 if(gene_score<0) 
00802                     continue;
00803                 vector<int> &rR = randomRank[gene];
00804                 vector<float> &rF = randomSc[gene];
00805                 int kk = 0;
00806                 for(kk=0; kk<rR.size(); kk++){
00807                     //if(gene_rank<=rR[kk]){
00808                     if(gene_score>=rF[kk]){
00809                         //float f1 = (float) kk / (float) rR.size();
00810                         fprintf(stderr, "%s\t%d\t%d\t%.5e\t%.5e\n", vecstrGenes[gene].c_str(),
00811                             gene_rank, kk, gene_score, randomSc[gene][kk]);
00812                         break;
00813                     }else if(kk==rR.size()-1){
00814                         //float f1 = (float) kk / (float) rR.size();
00815                         fprintf(stderr, "%s\t%d\t%d\t%.5e\t%.5e\n", vecstrGenes[gene].c_str(),
00816                             gene_rank, kk, gene_score, randomSc[gene][kk]);
00817                         //fprintf(stderr, "%d %.6f\n", gene_rank, f1);
00818                         //fprintf(stderr, "%d %d / 100\n", gene_rank, kk);
00819                     }
00820                 }
00821             }
00822 
00823             return 0;
00824         }
00825 
00826         if(sArgs.dislay_only_flag==1){
00827             for(i=0; i<15000; i++)
00828                 fprintf(stderr, "%s\t%.5f\n", 
00829                     vecstrGenes[sortedGenes[i].i].c_str(), sortedGenes[i].f);
00830             return 0;
00831         }
00832 
00833         string goldstdFile = sArgs.goldstd_arg;
00834         vector<string> goldstdGenes;
00835         CSeekTools::ReadMultiGeneOneLine(goldstdFile, goldstdGenes);
00836         vector<char> goldstdGenePresence;
00837         CSeekTools::InitVector(goldstdGenePresence,
00838             vecstrGenes.size(), (char) 0);
00839 
00840         for(i=0; i<goldstdGenes.size(); i++)
00841             goldstdGenePresence[mapstriGenes[goldstdGenes[i]]] = 1;
00842 
00843         if(met!=PR_ALL){
00844             float eval;
00845             bool ret = EvaluateOneQuery(sArgs, met, sortedGenes,
00846                 goldstdGenePresence, nan, eval);
00847             if(!ret) return 1;
00848             fprintf(stderr, "%.5f\n", eval);
00849             return 0;
00850         }else{
00851             vector<float> evalAll;
00852             //fprintf(stderr, "Got here");
00853             bool ret = EvaluateOneQuery(sArgs, met, sortedGenes,
00854                 goldstdGenePresence, nan, evalAll);
00855             if(!ret) return 1;
00856             PrintVector(evalAll);
00857             return 0;
00858         }
00859     }
00860 
00861     if(qmode == MULTI_QUERY){
00862         string goldstdList = sArgs.goldstd_list_arg;
00863         vector<string> vecstrList;
00864         CSeekTools::ReadListOneColumn(goldstdList, vecstrList);
00865         vector<char> *goldstdGenePresence =
00866             new vector<char>[vecstrList.size()];
00867         for(i=0; i<vecstrList.size(); i++){
00868             vector<string> goldstdGenes;
00869             CSeekTools::ReadMultiGeneOneLine(vecstrList[i], goldstdGenes);
00870             CSeekTools::InitVector(goldstdGenePresence[i],
00871                 vecstrGenes.size(), (char) 0);
00872             for(j=0; j<goldstdGenes.size(); j++)
00873                 goldstdGenePresence[i][mapstriGenes[goldstdGenes[j]]] = 1;
00874         }
00875 
00876         //fprintf(stderr, "Finished reading gold standard list\n");
00877 
00878         string queryList = sArgs.query_list_arg;
00879         vecstrList.clear();
00880         CSeekTools::ReadListOneColumn(queryList, vecstrList);
00881         vector<utype> *queryGeneID = new vector<utype>[vecstrList.size()];
00882         for(i=0; i<vecstrList.size(); i++){
00883             vector<string> queryGenes;
00884             CSeekTools::ReadMultiGeneOneLine(vecstrList[i], queryGenes);
00885             for(j=0; j<queryGenes.size(); j++)
00886                 queryGeneID[i].push_back(mapstriGenes[queryGenes[j]]);
00887         }
00888 
00889         //fprintf(stderr, "Finished reading query list\n");
00890 
00891         vector<string> exclude_list;
00892         string excl = sArgs.exclude_list_arg;
00893         exclude_list.clear();
00894         CSeekTools::ReadListOneColumn(excl, exclude_list);
00895         vector<char> *excludeGene = new vector<char>[vecstrList.size()];
00896         for(i=0; i<exclude_list.size(); i++){
00897             vector<string> ex;
00898             CSeekTools::ReadMultiGeneOneLine(exclude_list[i], ex);
00899             CSeekTools::InitVector(excludeGene[i], vecstrGenes.size(), (char) 0);
00900             for(j=0; j<ex.size(); j++)
00901                 excludeGene[i][mapstriGenes[ex[j]]] = 1;
00902         }
00903 
00904         vector<string> include_list;
00905         string incl = sArgs.include_list_arg;
00906         include_list.clear();
00907         CSeekTools::ReadListOneColumn(incl, include_list);
00908         vector<char> *includeGene = new vector<char>[vecstrList.size()];
00909         for(i=0; i<include_list.size(); i++){
00910             vector<string> in;
00911             CSeekTools::ReadMultiGeneOneLine(include_list[i], in, 40000);
00912             CSeekTools::InitVector(includeGene[i], vecstrGenes.size(), (char) 0);
00913             for(j=0; j<in.size(); j++)
00914                 includeGene[i][mapstriGenes[in[j]]] = 1;
00915         }
00916 
00917 
00918         string genescoreList = sArgs.gscore_list_arg;
00919         vecstrList.clear();
00920         CSeekTools::ReadListOneColumn(genescoreList, vecstrList);
00921         vector<float> *geneScores = new vector<float>[vecstrList.size()];
00922         vector<AResultFloat> *sortedGenes =
00923             new vector<AResultFloat>[vecstrList.size()];
00924         float nan = sArgs.nan_arg;
00925         for(i=0; i<vecstrList.size(); i++){
00926             CSeekTools::ReadArray(vecstrList[i].c_str(), geneScores[i]);
00927             //maxScore[i] = *std::max_element(geneScores[i].begin(),
00928             //  geneScores[i].end());
00929             sortedGenes[i].resize(geneScores[i].size());
00930             for(j=0; j<sortedGenes[i].size(); j++){
00931                 sortedGenes[i][j].i = j;
00932                 sortedGenes[i][j].f = geneScores[i][j];
00933                 if(isnan(geneScores[i][j]) || isinf(geneScores[i][j])){
00934                     sortedGenes[i][j].f = nan;
00935                 }
00936                 //fprintf(stderr, "%.5f\n", sortedGenes[i][j].f);
00937             }
00938         }
00939 
00940         //fprintf(stderr, "Got here"); 
00941 
00942         vector<vector<float> > result;
00943         DoAggregate(sArgs, met, sortedGenes, queryGeneID, vecstrList.size(),
00944             goldstdGenePresence, excludeGene, includeGene, result);
00945 
00946 
00947         /*
00948         if(sArgs.fold_over_random_flag==1){
00949             vector<AResultFloat> *randomScores = new vector<AResultFloat>[vecstrList.size()];
00950             //should shuffle only within annotated genes
00951             for(i=0; i<vecstrList.size(); i++){
00952                 GetRandom(rnd, sortedGenes[i], randomScores[i], excludeGene[i], 
00953                     includeGene[i], queryGeneID[i], nan);
00954             }
00955             vector<vector<float> > random_result;
00956             DoAggregate(sArgs, met, randomScores, queryGeneID, vecstrList.size(),
00957                 goldstdGenePresence, excludeGene, includeGene, random_result);
00958 
00959             vector<vector<float> > fold;
00960             fold.resize(result.size());
00961             for(i=0; i<result.size(); i++){
00962                 fold[i] = vector<float>();
00963                 fold[i].resize(result[i].size());
00964                 for(j=0; j<result[i].size(); j++){
00965                     if(random_result[i][j]==result[i][j]){
00966                         fold[i][j] = 1.0;
00967                     }else if(random_result[i][j]==0){
00968                         fold[i][j] = 1.0;
00969                     }else{
00970                         fold[i][j] = result[i][j]/random_result[i][j];
00971                     }
00972                     fprintf(stderr, "%.2f %d %d %.2f %.2f\n", fold[i][j], i, j, result[i][j], random_result[i][j]);
00973                 }
00974             }
00975             PrintResult(fold);
00976         */
00977         //}else{
00978         PrintResult(result);    
00979         //}
00980 
00981     }
00982 
00983     if(qmode == MULTI_DWEIGHT){
00984         string dweightList = sArgs.dweight_list_arg;
00985         vector<string> vecstrList;
00986         CSeekTools::ReadListOneColumn(dweightList, vecstrList);
00987         int numDataset = 0;
00988 
00989         for(j=0; j<vecstrList.size(); j++){
00990             vector<float> ww;
00991             CSeekTools::ReadArray(vecstrList[j].c_str(), ww);
00992             vector<AResultFloat> sortedDatasets;
00993             sortedDatasets.resize(ww.size());
00994             numDataset = ww.size();
00995             for(i=0; i<sortedDatasets.size(); i++){
00996                 sortedDatasets[i].i = i;
00997                 sortedDatasets[i].f = ww[i];
00998             }
00999             sort(sortedDatasets.begin(), sortedDatasets.end());
01000             /*for(i=0; i<sortedDatasets.size(); i++){
01001                 fprintf(stderr, "%.2e\t%s\n", sortedDatasets[i].f, 
01002                     vecstrDatasets[sortedDatasets[i].i].c_str());
01003             }*/
01004             vector<int> depth;
01005             depth.push_back((int) ((float)numDataset * 0.005)); 
01006             depth.push_back((int) ((float)numDataset * 0.01));  
01007             depth.push_back((int) ((float)numDataset * 0.05));  
01008             depth.push_back((int) ((float)numDataset * 0.10));  
01009             depth.push_back((int) ((float)numDataset * 0.20));  
01010             depth.push_back((int) ((float)numDataset * 0.50));  
01011             vector<float> avg;
01012             for(i=0; i<depth.size(); i++){
01013                 float a = 0;
01014                 for(k=0; k<depth[i]; k++)
01015                     a+=sortedDatasets[k].f;
01016                 a /= (float) depth[i];
01017                 avg.push_back(a);
01018             }
01019             for(i=0; i<depth.size(); i++){
01020                 fprintf(stderr, "%.3e\t", avg[i]);
01021             }
01022             fprintf(stderr, "\n");
01023         }
01024 
01025     }
01026 
01027 
01028 #ifdef WIN32
01029     pthread_win32_process_detach_np( );
01030 #endif // WIN32
01031     return 0; }