Sleipnir
src/seekweight.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 
00023 #include "seekweight.h"
00024 
00025 
00026 namespace Sleipnir {
00027 
00028 //MIN_REQUIRED is for a given gene A, the minimum query genes required that
00029 //correlate with A in order to count A's query score
00030 bool CSeekWeighter::LinearCombine(vector<utype> &rank,
00031     const vector<utype> &cv_query, CSeekDataset &sDataset,
00032     const utype &MIN_REQUIRED, const bool &bSquareZ){
00033 
00034     CSeekIntIntMap *mapG = sDataset.GetGeneMap();
00035     CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
00036     if(mapQ==NULL) return true;
00037 
00038     if(cv_query.size()==0){
00039         cerr << "cv_query empty" << endl;
00040         return true;
00041     }
00042     utype i, j, g, q;
00043     vector<utype>::const_iterator iter;
00044 
00045     utype iNumGenes = sDataset.GetNumGenes();
00046     utype q_size = cv_query.size();
00047     utype **f = sDataset.GetDataMatrix();
00048 
00049     //rank.resize(iNumGenes);
00050     CSeekTools::InitVector(rank, iNumGenes, (utype) 0);
00051 
00052     /* as long as rank[g] does not overflow, due to too many queries, we are fine
00053      * should control query size to be <100. */
00054     vector<utype> queryPos;
00055     queryPos.resize(q_size);
00056     for(i=0; i<q_size; i++) queryPos[i] = mapQ->GetForward(cv_query[i]);
00057 
00058     sort(queryPos.begin(), queryPos.end());
00059 
00060     vector<utype> offset;
00061     offset.push_back(0);
00062     for(i=1; i<q_size; i++) offset.push_back(queryPos[i] - queryPos[i-1]);
00063     offset.resize(offset.size());
00064 
00065     vector<utype>::iterator iter_g;
00066     vector<utype>::const_iterator iterOffset;
00067     utype **pf;
00068     utype *pp;
00069     utype totNonZero, tmpScore;
00070 
00071     //special case, no MIN_REQUIRED
00072     if(MIN_REQUIRED==1){
00073         if(bSquareZ){ //must be used with cutoff (or else we will mix
00074                     //positive and negative correlations together)
00075             for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 
00076             iter_g++, pf++){
00077                 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]],
00078                 iterOffset = offset.begin(); iterOffset!=offset.end();
00079                 iterOffset++, pp+=(*iterOffset)){
00080                     if((*pp)==0) continue;
00081                     float sc = (float) ((*pp) - 320) / 100.0; 
00082                     sc = fabs(sc) + 1.0; //add one adjustment, suitable if cutoff=0
00083                     tmpScore += (utype) (sc * sc * 100.0 + 320.0);
00084                     ++totNonZero;
00085                 }
00086                 if(totNonZero==0) continue;
00087                 //(*iter_g) = tmpScore / totNonZero;
00088                 (*iter_g) = tmpScore / q_size;
00089             }       
00090         }else{
00091             for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 
00092             iter_g++, pf++){
00093                 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]],
00094                 iterOffset = offset.begin(); iterOffset!=offset.end();
00095                 iterOffset++, pp+=(*iterOffset)){
00096                     if((*pp)==0) continue;
00097                     tmpScore += *pp;
00098                     ++totNonZero;
00099                 }
00100                 if(totNonZero==0) continue;
00101                 //(*iter_g) = tmpScore / totNonZero;
00102                 (*iter_g) = tmpScore / q_size;
00103             }
00104         }
00105     }
00106     else{ //MIN_REQUIRED ENABLED
00107 
00108         if(bSquareZ){
00109             //if the score of a gene to a query (*pp) is 0, it could 
00110             //either mean the gene is absent, or the gene-to-query correlation 
00111             //is below cutoff
00112             for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 
00113             iter_g++, pf++){
00114                 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]],
00115                 iterOffset = offset.begin(); iterOffset!=offset.end();
00116                 iterOffset++, pp+=(*iterOffset)){
00117                     if((*pp)==0) continue;
00118                     float sc = (float) ((*pp) - 320) / 100.0;
00119                     sc = fabs(sc) + 1.0; //add one adjustment, suitable for cutoff=0
00120                     tmpScore += (utype) (sc * sc * 100.0 + 320.0);
00121                     ++totNonZero;
00122                 }
00123                 //if enough query edges passed the cut off 
00124                 if(totNonZero >= MIN_REQUIRED)
00125                     (*iter_g) = tmpScore / totNonZero;
00126                 else
00127                     (*iter_g) = 0;
00128             }
00129         }
00130         else{
00131             for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 
00132             iter_g++, pf++){
00133                 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]],
00134                 iterOffset = offset.begin(); iterOffset!=offset.end();
00135                 iterOffset++, pp+=(*iterOffset)){
00136                     if((*pp)==0) continue;
00137                     tmpScore += *pp;
00138                     ++totNonZero;
00139                 }
00140                 if(totNonZero >= MIN_REQUIRED)
00141                     (*iter_g) = tmpScore / totNonZero;
00142                 else
00143                     (*iter_g) = 0;
00144             }
00145         }
00146     }
00147 
00148     return true;
00149 }
00150 
00151 bool CSeekWeighter::OrderStatisticsPreCompute(){
00152     utype cc, dd, kk;
00153     vector<float> all;
00154     all.resize(1100*600*600);
00155 
00156     utype i = 0;
00157     for(kk=0; kk<22000; kk+=20){
00158         float p = (float) (kk + 1) / 22000;
00159         for(dd=0; dd<3000; dd+=5){
00160             unsigned int rrk = dd + 1;
00161             for(cc=0; cc<3000; cc+=5){
00162                 double gg = gsl_cdf_binomial_Q(dd+1, p, cc+1);
00163                 all[i] = -1.0*log(gg);
00164                 i++;
00165             }
00166         }
00167     }
00168     //fprintf(stderr, "Done calculating\n"); //getchar();
00169 
00170     CSeekTools::WriteArray("/tmp/order_stats.binomial.bin", all);
00171 
00172     //fprintf(stderr, "Done saving\n"); //getchar();
00173 }
00174 
00175 bool CSeekWeighter::OrderStatisticsRankAggregation(const utype &iDatasets,
00176     const utype &iGenes, utype **rank_d, const vector<utype> &counts,
00177     vector<float> &master_rank, const utype &numThreads){
00178 
00179     //vector<float> precompute;
00180     //CSeekTools::ReadArray("/tmp/order_stats.binomial.bin", precompute);
00181     //fprintf(stderr, "Before\n"); getchar();
00182     //OrderStatisticsTest();
00183 
00184     if(rank_d==NULL){
00185         fprintf(stderr, "rank_d is null");
00186         return false;
00187     }
00188 
00189     master_rank.clear();
00190     master_rank.resize(iGenes);
00191     utype i, j, k, dd, d;
00192 
00193     //Zero out genes that are present in few datasets (<50%)
00194     for(k=0; k<iGenes; k++)
00195         if(counts[k]<(int)(0.5*iDatasets))
00196             for(j=0; j<iDatasets; j++) rank_d[j][k] = 0;
00197 
00198     //Hold the normalized rank
00199     float **rank_f =
00200         CSeekTools::Init2DArray(iDatasets, iGenes, (float) 1.1);
00201 
00202     const float DEFAULT_NULL = -320;
00203 
00204     for(j=0; j<iDatasets; j++){
00205         vector<AResult> this_d;
00206         utype numNonZero = 0;
00207         for(k=0; k<iGenes; k++){
00208             if(rank_d[j][k]>0) numNonZero++;
00209         }
00210         if(numNonZero==0){
00211             continue;
00212         }
00213         this_d.resize(numNonZero);
00214         int kk=0;
00215         for(k=0; k<iGenes; k++){
00216             if(rank_d[j][k]<=0) continue;
00217             this_d[kk].i = k;
00218             this_d[kk].f = rank_d[j][k];
00219             kk++;
00220         }
00221         sort(this_d.begin(), this_d.end());
00222         for(k=0; k<numNonZero; k++){
00223             rank_f[j][this_d[k].i] =
00224                 (float) (k+1) / (float) numNonZero;
00225         }
00226         this_d.clear();
00227     }
00228 
00229     //fprintf(stderr, "Done1\n");
00230     vector<gsl_vector_float *> gss;
00231     vector<gsl_permutation *> perms;
00232     vector<gsl_permutation *> rks;
00233 
00234     gss.resize(numThreads);
00235     perms.resize(numThreads);
00236     rks.resize(numThreads);
00237     for(i=0; i<numThreads; i++){
00238         gss[i] = gsl_vector_float_calloc(iDatasets);
00239         perms[i] = gsl_permutation_alloc(iDatasets);
00240         rks[i] = gsl_permutation_alloc(iDatasets);
00241     }
00242 
00243     //gsl_vector_float *gs = gsl_vector_float_calloc(iDatasets);
00244     //gsl_permutation *perm = gsl_permutation_alloc(iDatasets);
00245     //gsl_permutation *rk = gsl_permutation_alloc(iDatasets);
00246     //fprintf(stderr, "Finished allocating\n");
00247 
00248     #pragma omp parallel for \
00249     shared(rank_f, gss, perms, rks) private(k, dd) \
00250     schedule(dynamic)
00251     for(k=0; k<iGenes; k++){
00252         utype tid = omp_get_thread_num();
00253         gsl_vector_float *gs = gss[tid];
00254         gsl_permutation *perm = perms[tid];
00255         gsl_permutation *rk = rks[tid];
00256 
00257         master_rank[k] = DEFAULT_NULL;
00258         if(counts[k]<(int)(0.5*iDatasets)) continue;
00259 
00260         for(dd=0; dd<iDatasets; dd++)
00261             gsl_vector_float_set(gs, dd, rank_f[dd][k]);
00262 
00263         gsl_sort_vector_float_index(perm, gs);
00264         gsl_permutation_inverse(rk, perm);
00265 
00266         float max = DEFAULT_NULL;
00267         int max_rank = -1;
00268         float max_p = -1;
00269         for(dd=0; dd<iDatasets; dd++){
00270             //get the prob of the gene in dset dd
00271             float p = gsl_vector_float_get(gs, dd);
00272             if(p<0 || p>1.0){
00273                 continue;
00274             //  fprintf(stderr, "D%d %.5f\n", dd, p);
00275             }
00276             //get the rank of dset dd across all dsets
00277             unsigned int rrk = rk->data[dd];
00278             double gg = gsl_cdf_binomial_Q(rrk+1, p, counts[k]);
00279             float tmp = -1.0*log(gg);
00280 
00281             //load precomputed value==============
00282             /*int ind_p = (int) (p / (20.0 / 22000.0));
00283             int ind_r = (int) (rrk / 5.0);
00284             int ind_c = (int) (counts[k] / 5.0);
00285             float tmp = precompute[ind_p*600*600 + ind_r*600 + ind_c];
00286             */
00287             //end=================================
00288 
00289             if(isinf(tmp)) tmp = DEFAULT_NULL;
00290             if(tmp>max){
00291                 max = tmp;
00292                 max_rank = rrk;
00293                 max_p = p;
00294             }
00295         }
00296         if(max!=DEFAULT_NULL){
00297             master_rank[k] = max;
00298             //fprintf(stderr, "rank %.5f %.5f\n", max_p, max);
00299         }
00300     }
00301 
00302     CSeekTools::Free2DArray(rank_f);
00303     for(i=0; i<numThreads; i++){
00304         gsl_permutation_free(perms[i]);
00305         gsl_permutation_free(rks[i]);
00306         gsl_vector_float_free(gss[i]);
00307     }
00308     perms.clear();
00309     rks.clear();
00310     gss.clear();
00311 
00312     //gsl_permutation_free(perm);
00313     //gsl_permutation_free(rk);
00314     //gsl_vector_float_free(gs);
00315 
00316     return true;
00317 }
00318 
00319 //for equal weighting, in case user wants to still see
00320 //ordering of datasets, based on distance from the average ranking
00321 bool CSeekWeighter::OneGeneWeighting(CSeekQuery &sQuery, 
00322     CSeekDataset &sDataset, const float &rate, 
00323     const float &percent_required, const bool &bSquareZ,
00324     vector<utype> *rrank, const CSeekQuery *goldStd){
00325 
00326     CSeekIntIntMap *mapG = sDataset.GetGeneMap();
00327     CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
00328     if(mapQ==NULL) return true;
00329 
00330     sDataset.InitializeCVWeight(1);
00331 
00332     utype i, j, qi, qj;
00333 
00334     vector<char> is_query, is_gold;
00335     CSeekTools::InitVector(is_query, sDataset.GetNumGenes(), (char) 0);
00336     CSeekTools::InitVector(is_gold, sDataset.GetNumGenes(), (char) 0);
00337 
00338     vector<utype> &rank = *rrank;
00339 
00340     utype TOP = 1000;
00341     //utype TOP = 0;
00342     vector<AResult> ar;
00343     ar.resize(rank.size());
00344 
00345     const vector<utype> &allQ = sQuery.GetQuery();
00346     vector<utype> query;
00347     utype num_q = 0;
00348     utype num_v = 0;
00349 
00350     const vector<utype> &vi = sQuery.GetCVQuery(qi);
00351 
00352     /* Set up query and gold standard */
00353     for(i=0; i<allQ.size(); i++){
00354         if(CSeekTools::IsNaN(mapQ->GetForward(allQ[i]))) continue;
00355         is_query[allQ[i]] = 1;
00356         query.push_back(allQ[i]);
00357         num_q++;
00358     }
00359 
00360     //assume goldStd must not be null
00361     assert(goldStd!=NULL);
00362     const vector<utype> &allGoldStd = goldStd->GetQuery();
00363     for(i=0; i<allGoldStd.size(); i++){
00364         if(CSeekTools::IsNaN(mapG->GetForward(allGoldStd[i])))
00365             continue;
00366         if(is_query[allGoldStd[i]]==1) continue;
00367         is_gold[allGoldStd[i]] = 1;
00368         num_v++;
00369     }
00370 
00371     if(num_q==0 || num_v==0){
00372         sDataset.SetCVWeight(0, -1);
00373     }else{
00374         /* actual weighting */
00375         float w = 0;
00376         const utype MIN_QUERY_REQUIRED =
00377             max((utype) 1, (utype) (percent_required * query.size()));
00378         bool ret = LinearCombine(rank, query, sDataset,
00379             MIN_QUERY_REQUIRED, bSquareZ);
00380         ret = CSeekPerformanceMeasure::RankBiasedPrecision(rate,
00381             rank, w, is_query, is_gold, *mapG, &ar, TOP);
00382         if(!ret) sDataset.SetCVWeight(0, -1);
00383         else sDataset.SetCVWeight(0, w);
00384     }
00385 
00386     ar.clear();
00387     return true;
00388 }
00389 
00390 bool CSeekWeighter::AverageWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset,
00391     const float &percent_required, const bool &bSquareZ, float &w){
00392 
00393     CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
00394     if(mapQ==NULL) return true;
00395 
00396     utype i, j, qi, qj;
00397 
00398     const vector<utype> &allQ = sQuery.GetQuery();
00399     vector<utype> presentQ;
00400     utype num_q = 0;
00401     for(qi=0; qi<allQ.size(); qi++){
00402         if(CSeekTools::IsNaN(mapQ->GetForward(allQ[qi]))) continue;
00403         num_q++;
00404         presentQ.push_back(allQ[qi]);
00405     }
00406 
00407     w = 0;
00408     const utype MIN_QUERY_REQUIRED =
00409         max((utype) 2, (utype) (percent_required * allQ.size()));
00410 
00411     if(num_q<MIN_QUERY_REQUIRED){
00412         w = -1;
00413         return true;
00414     }
00415 
00416     utype **f = sDataset.GetDataMatrix();
00417     /* as long as rank[g] does not overflow, due to too many queries, we are fine
00418      * should control query size to be <100. */
00419     vector<utype> queryPos;
00420     queryPos.resize(num_q);
00421     for(i=0; i<num_q; i++)
00422         queryPos[i] = mapQ->GetForward(presentQ[i]);
00423 
00424     int pairs = 0;
00425     if(bSquareZ){
00426         for(qi=0; qi<presentQ.size(); qi++){
00427             for(qj=0; qj<presentQ.size(); qj++){
00428                 if(qi==qj) continue;
00429                 float t = (float) f[presentQ[qj]][queryPos[qi]];
00430                 t = (t-320)/100.0;
00431                 t = t * t;
00432                 w += t;
00433                 pairs++;
00434             }
00435         }
00436         w /= (float) pairs;
00437 
00438     }else{
00439         for(qi=0; qi<presentQ.size(); qi++){
00440             for(qj=0; qj<presentQ.size(); qj++){
00441                 if(qi==qj) continue;
00442                 w += (float) f[presentQ[qj]][queryPos[qi]];
00443                 pairs++;
00444             }
00445         }
00446         w /= (float) pairs;
00447         w /= (float) 640;
00448     }
00449     return true;
00450 }
00451 
00452 bool CSeekWeighter::CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset,
00453     const float &rate, const float &percent_required, const bool &bSquareZ,
00454     vector<utype> *rrank, const CSeekQuery *goldStd){
00455 
00456     CSeekIntIntMap *mapG = sDataset.GetGeneMap();
00457     CSeekIntIntMap *mapQ = sDataset.GetQueryMap();
00458     if(mapQ==NULL) return true;
00459 
00460     utype iFold = sQuery.GetNumFold();
00461     sDataset.InitializeCVWeight(iFold);
00462 
00463     utype i, j, qi, qj;
00464 
00465     vector<char> is_query_cross, is_gold;
00466     CSeekTools::InitVector(is_query_cross, sDataset.GetNumGenes(), (char) 0);
00467     CSeekTools::InitVector(is_gold, sDataset.GetNumGenes(), (char) 0);
00468 
00469     vector<utype> &rank = *rrank;
00470 
00471     utype TOP = 1000;
00472     //utype TOP = 0; //disable TOP approximation
00473     vector<AResult> ar;
00474     ar.resize(rank.size());
00475 
00476     const vector<utype> &allQ = sQuery.GetQuery();
00477 
00478     for(qi=0; qi<iFold; qi++){
00479         vector<utype> cv_query;
00480         utype num_q = 0;
00481         utype num_v = 0;
00482 
00483         const vector<utype> &vi = sQuery.GetCVQuery(qi);
00484 
00485         /* Set up query and gold standard */
00486         for(i=0; i<vi.size(); i++){
00487             if(CSeekTools::IsNaN(mapQ->GetForward(vi[i]))) continue;
00488             is_query_cross[vi[i]] = 1;
00489             cv_query.push_back(vi[i]);
00490             num_q++;
00491         }
00492 
00493         if(goldStd==NULL){ //if no custom gold standard
00494             //Use query itself as gold standard
00495             for(i=0; i<allQ.size(); i++){
00496                 if(CSeekTools::IsNaN(mapQ->GetForward(allQ[i]))) continue;
00497                 if(is_query_cross[allQ[i]]==1) continue;
00498                 is_gold[allQ[i]] = 1;
00499                 num_v++;
00500             }
00501         }else{
00502             const vector<utype> &allGoldStd = goldStd->GetQuery();
00503             //Use custom gene-set as gold standard
00504             for(i=0; i<allGoldStd.size(); i++){
00505                 if(CSeekTools::IsNaN(mapG->GetForward(allGoldStd[i])))
00506                     continue;
00507                 if(is_query_cross[allGoldStd[i]]==1) continue;
00508                 is_gold[allGoldStd[i]] = 1;
00509                 num_v++;
00510             }
00511         }
00512 
00513         if(num_q==0 || num_v==0){
00514             sDataset.SetCVWeight(qi, -1);
00515             //printf("num_q %d or num_v %d\n", num_q, num_v);
00516         }else{
00517             /* actual weighting */
00518             float w = 0;
00519             const utype MIN_QUERY_REQUIRED =
00520                 max((utype) 1, (utype) (percent_required * cv_query.size()));
00521             bool ret = LinearCombine(rank, cv_query, sDataset,
00522                 MIN_QUERY_REQUIRED, bSquareZ);
00523             ret = CSeekPerformanceMeasure::RankBiasedPrecision(rate,
00524                 rank, w, is_query_cross, is_gold, *mapG, &ar, TOP);
00525             //fprintf(stderr, "Weight %.5f\n", w);
00526             //ret = CSeekPerformanceMeasure::AveragePrecision(
00527             //  rank, w, is_query_cross, is_gold, *mapG, &ar);
00528             if(!ret) sDataset.SetCVWeight(qi, -1);
00529             else sDataset.SetCVWeight(qi, w);
00530             //printf("Weight: %.5f\n", w);
00531         }
00532 
00533         /* Reset query and gold standard */
00534         for(i=0; i<vi.size(); i++){
00535             if(CSeekTools::IsNaN(mapQ->GetForward(vi[i]))) continue;
00536             is_query_cross[vi[i]] = 0;
00537         }
00538 
00539         if(goldStd==NULL){
00540             for(i=0; i<allQ.size(); i++){
00541                 if(CSeekTools::IsNaN(mapQ->GetForward(allQ[i]))) continue;
00542                 is_gold[allQ[i]]=0;
00543             }
00544         }else{
00545             const vector<utype> &allGoldStd = goldStd->GetQuery();
00546             for(i=0; i<allGoldStd.size(); i++){
00547                 if(CSeekTools::IsNaN(mapG->GetForward(allGoldStd[i])))
00548                     continue;
00549                 is_gold[allGoldStd[i]] = 0;
00550             }
00551         }
00552 
00553     }
00554 
00555     ar.clear();
00556 
00557     return true;
00558 }
00559 
00560 }