Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 00023 #include "seekweight.h" 00024 00025 00026 namespace Sleipnir { 00027 00028 //MIN_REQUIRED is for a given gene A, the minimum query genes required that 00029 //correlate with A in order to count A's query score 00030 bool CSeekWeighter::LinearCombine(vector<utype> &rank, 00031 const vector<utype> &cv_query, CSeekDataset &sDataset, 00032 const utype &MIN_REQUIRED, const bool &bSquareZ){ 00033 00034 CSeekIntIntMap *mapG = sDataset.GetGeneMap(); 00035 CSeekIntIntMap *mapQ = sDataset.GetQueryMap(); 00036 if(mapQ==NULL) return true; 00037 00038 if(cv_query.size()==0){ 00039 cerr << "cv_query empty" << endl; 00040 return true; 00041 } 00042 utype i, j, g, q; 00043 vector<utype>::const_iterator iter; 00044 00045 utype iNumGenes = sDataset.GetNumGenes(); 00046 utype q_size = cv_query.size(); 00047 utype **f = sDataset.GetDataMatrix(); 00048 00049 //rank.resize(iNumGenes); 00050 CSeekTools::InitVector(rank, iNumGenes, (utype) 0); 00051 00052 /* as long as rank[g] does not overflow, due to too many queries, we are fine 00053 * should control query size to be <100. */ 00054 vector<utype> queryPos; 00055 queryPos.resize(q_size); 00056 for(i=0; i<q_size; i++) queryPos[i] = mapQ->GetForward(cv_query[i]); 00057 00058 sort(queryPos.begin(), queryPos.end()); 00059 00060 vector<utype> offset; 00061 offset.push_back(0); 00062 for(i=1; i<q_size; i++) offset.push_back(queryPos[i] - queryPos[i-1]); 00063 offset.resize(offset.size()); 00064 00065 vector<utype>::iterator iter_g; 00066 vector<utype>::const_iterator iterOffset; 00067 utype **pf; 00068 utype *pp; 00069 utype totNonZero, tmpScore; 00070 00071 //special case, no MIN_REQUIRED 00072 if(MIN_REQUIRED==1){ 00073 if(bSquareZ){ //must be used with cutoff (or else we will mix 00074 //positive and negative correlations together) 00075 for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 00076 iter_g++, pf++){ 00077 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]], 00078 iterOffset = offset.begin(); iterOffset!=offset.end(); 00079 iterOffset++, pp+=(*iterOffset)){ 00080 if((*pp)==0) continue; 00081 float sc = (float) ((*pp) - 320) / 100.0; 00082 sc = fabs(sc) + 1.0; //add one adjustment, suitable if cutoff=0 00083 tmpScore += (utype) (sc * sc * 100.0 + 320.0); 00084 ++totNonZero; 00085 } 00086 if(totNonZero==0) continue; 00087 //(*iter_g) = tmpScore / totNonZero; 00088 (*iter_g) = tmpScore / q_size; 00089 } 00090 }else{ 00091 for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 00092 iter_g++, pf++){ 00093 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]], 00094 iterOffset = offset.begin(); iterOffset!=offset.end(); 00095 iterOffset++, pp+=(*iterOffset)){ 00096 if((*pp)==0) continue; 00097 tmpScore += *pp; 00098 ++totNonZero; 00099 } 00100 if(totNonZero==0) continue; 00101 //(*iter_g) = tmpScore / totNonZero; 00102 (*iter_g) = tmpScore / q_size; 00103 } 00104 } 00105 } 00106 else{ //MIN_REQUIRED ENABLED 00107 00108 if(bSquareZ){ 00109 //if the score of a gene to a query (*pp) is 0, it could 00110 //either mean the gene is absent, or the gene-to-query correlation 00111 //is below cutoff 00112 for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 00113 iter_g++, pf++){ 00114 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]], 00115 iterOffset = offset.begin(); iterOffset!=offset.end(); 00116 iterOffset++, pp+=(*iterOffset)){ 00117 if((*pp)==0) continue; 00118 float sc = (float) ((*pp) - 320) / 100.0; 00119 sc = fabs(sc) + 1.0; //add one adjustment, suitable for cutoff=0 00120 tmpScore += (utype) (sc * sc * 100.0 + 320.0); 00121 ++totNonZero; 00122 } 00123 //if enough query edges passed the cut off 00124 if(totNonZero >= MIN_REQUIRED) 00125 (*iter_g) = tmpScore / totNonZero; 00126 else 00127 (*iter_g) = 0; 00128 } 00129 } 00130 else{ 00131 for(iter_g=rank.begin(), pf = &f[0]; iter_g!=rank.end(); 00132 iter_g++, pf++){ 00133 for(totNonZero=0, tmpScore = 0, pp = &(*pf)[queryPos[0]], 00134 iterOffset = offset.begin(); iterOffset!=offset.end(); 00135 iterOffset++, pp+=(*iterOffset)){ 00136 if((*pp)==0) continue; 00137 tmpScore += *pp; 00138 ++totNonZero; 00139 } 00140 if(totNonZero >= MIN_REQUIRED) 00141 (*iter_g) = tmpScore / totNonZero; 00142 else 00143 (*iter_g) = 0; 00144 } 00145 } 00146 } 00147 00148 return true; 00149 } 00150 00151 bool CSeekWeighter::OrderStatisticsPreCompute(){ 00152 utype cc, dd, kk; 00153 vector<float> all; 00154 all.resize(1100*600*600); 00155 00156 utype i = 0; 00157 for(kk=0; kk<22000; kk+=20){ 00158 float p = (float) (kk + 1) / 22000; 00159 for(dd=0; dd<3000; dd+=5){ 00160 unsigned int rrk = dd + 1; 00161 for(cc=0; cc<3000; cc+=5){ 00162 double gg = gsl_cdf_binomial_Q(dd+1, p, cc+1); 00163 all[i] = -1.0*log(gg); 00164 i++; 00165 } 00166 } 00167 } 00168 //fprintf(stderr, "Done calculating\n"); //getchar(); 00169 00170 CSeekTools::WriteArray("/tmp/order_stats.binomial.bin", all); 00171 00172 //fprintf(stderr, "Done saving\n"); //getchar(); 00173 } 00174 00175 bool CSeekWeighter::OrderStatisticsRankAggregation(const utype &iDatasets, 00176 const utype &iGenes, utype **rank_d, const vector<utype> &counts, 00177 vector<float> &master_rank, const utype &numThreads){ 00178 00179 //vector<float> precompute; 00180 //CSeekTools::ReadArray("/tmp/order_stats.binomial.bin", precompute); 00181 //fprintf(stderr, "Before\n"); getchar(); 00182 //OrderStatisticsTest(); 00183 00184 if(rank_d==NULL){ 00185 fprintf(stderr, "rank_d is null"); 00186 return false; 00187 } 00188 00189 master_rank.clear(); 00190 master_rank.resize(iGenes); 00191 utype i, j, k, dd, d; 00192 00193 //Zero out genes that are present in few datasets (<50%) 00194 for(k=0; k<iGenes; k++) 00195 if(counts[k]<(int)(0.5*iDatasets)) 00196 for(j=0; j<iDatasets; j++) rank_d[j][k] = 0; 00197 00198 //Hold the normalized rank 00199 float **rank_f = 00200 CSeekTools::Init2DArray(iDatasets, iGenes, (float) 1.1); 00201 00202 const float DEFAULT_NULL = -320; 00203 00204 for(j=0; j<iDatasets; j++){ 00205 vector<AResult> this_d; 00206 utype numNonZero = 0; 00207 for(k=0; k<iGenes; k++){ 00208 if(rank_d[j][k]>0) numNonZero++; 00209 } 00210 if(numNonZero==0){ 00211 continue; 00212 } 00213 this_d.resize(numNonZero); 00214 int kk=0; 00215 for(k=0; k<iGenes; k++){ 00216 if(rank_d[j][k]<=0) continue; 00217 this_d[kk].i = k; 00218 this_d[kk].f = rank_d[j][k]; 00219 kk++; 00220 } 00221 sort(this_d.begin(), this_d.end()); 00222 for(k=0; k<numNonZero; k++){ 00223 rank_f[j][this_d[k].i] = 00224 (float) (k+1) / (float) numNonZero; 00225 } 00226 this_d.clear(); 00227 } 00228 00229 //fprintf(stderr, "Done1\n"); 00230 vector<gsl_vector_float *> gss; 00231 vector<gsl_permutation *> perms; 00232 vector<gsl_permutation *> rks; 00233 00234 gss.resize(numThreads); 00235 perms.resize(numThreads); 00236 rks.resize(numThreads); 00237 for(i=0; i<numThreads; i++){ 00238 gss[i] = gsl_vector_float_calloc(iDatasets); 00239 perms[i] = gsl_permutation_alloc(iDatasets); 00240 rks[i] = gsl_permutation_alloc(iDatasets); 00241 } 00242 00243 //gsl_vector_float *gs = gsl_vector_float_calloc(iDatasets); 00244 //gsl_permutation *perm = gsl_permutation_alloc(iDatasets); 00245 //gsl_permutation *rk = gsl_permutation_alloc(iDatasets); 00246 //fprintf(stderr, "Finished allocating\n"); 00247 00248 #pragma omp parallel for \ 00249 shared(rank_f, gss, perms, rks) private(k, dd) \ 00250 schedule(dynamic) 00251 for(k=0; k<iGenes; k++){ 00252 utype tid = omp_get_thread_num(); 00253 gsl_vector_float *gs = gss[tid]; 00254 gsl_permutation *perm = perms[tid]; 00255 gsl_permutation *rk = rks[tid]; 00256 00257 master_rank[k] = DEFAULT_NULL; 00258 if(counts[k]<(int)(0.5*iDatasets)) continue; 00259 00260 for(dd=0; dd<iDatasets; dd++) 00261 gsl_vector_float_set(gs, dd, rank_f[dd][k]); 00262 00263 gsl_sort_vector_float_index(perm, gs); 00264 gsl_permutation_inverse(rk, perm); 00265 00266 float max = DEFAULT_NULL; 00267 int max_rank = -1; 00268 float max_p = -1; 00269 for(dd=0; dd<iDatasets; dd++){ 00270 //get the prob of the gene in dset dd 00271 float p = gsl_vector_float_get(gs, dd); 00272 if(p<0 || p>1.0){ 00273 continue; 00274 // fprintf(stderr, "D%d %.5f\n", dd, p); 00275 } 00276 //get the rank of dset dd across all dsets 00277 unsigned int rrk = rk->data[dd]; 00278 double gg = gsl_cdf_binomial_Q(rrk+1, p, counts[k]); 00279 float tmp = -1.0*log(gg); 00280 00281 //load precomputed value============== 00282 /*int ind_p = (int) (p / (20.0 / 22000.0)); 00283 int ind_r = (int) (rrk / 5.0); 00284 int ind_c = (int) (counts[k] / 5.0); 00285 float tmp = precompute[ind_p*600*600 + ind_r*600 + ind_c]; 00286 */ 00287 //end================================= 00288 00289 if(isinf(tmp)) tmp = DEFAULT_NULL; 00290 if(tmp>max){ 00291 max = tmp; 00292 max_rank = rrk; 00293 max_p = p; 00294 } 00295 } 00296 if(max!=DEFAULT_NULL){ 00297 master_rank[k] = max; 00298 //fprintf(stderr, "rank %.5f %.5f\n", max_p, max); 00299 } 00300 } 00301 00302 CSeekTools::Free2DArray(rank_f); 00303 for(i=0; i<numThreads; i++){ 00304 gsl_permutation_free(perms[i]); 00305 gsl_permutation_free(rks[i]); 00306 gsl_vector_float_free(gss[i]); 00307 } 00308 perms.clear(); 00309 rks.clear(); 00310 gss.clear(); 00311 00312 //gsl_permutation_free(perm); 00313 //gsl_permutation_free(rk); 00314 //gsl_vector_float_free(gs); 00315 00316 return true; 00317 } 00318 00319 //for equal weighting, in case user wants to still see 00320 //ordering of datasets, based on distance from the average ranking 00321 bool CSeekWeighter::OneGeneWeighting(CSeekQuery &sQuery, 00322 CSeekDataset &sDataset, const float &rate, 00323 const float &percent_required, const bool &bSquareZ, 00324 vector<utype> *rrank, const CSeekQuery *goldStd){ 00325 00326 CSeekIntIntMap *mapG = sDataset.GetGeneMap(); 00327 CSeekIntIntMap *mapQ = sDataset.GetQueryMap(); 00328 if(mapQ==NULL) return true; 00329 00330 sDataset.InitializeCVWeight(1); 00331 00332 utype i, j, qi, qj; 00333 00334 vector<char> is_query, is_gold; 00335 CSeekTools::InitVector(is_query, sDataset.GetNumGenes(), (char) 0); 00336 CSeekTools::InitVector(is_gold, sDataset.GetNumGenes(), (char) 0); 00337 00338 vector<utype> &rank = *rrank; 00339 00340 utype TOP = 1000; 00341 //utype TOP = 0; 00342 vector<AResult> ar; 00343 ar.resize(rank.size()); 00344 00345 const vector<utype> &allQ = sQuery.GetQuery(); 00346 vector<utype> query; 00347 utype num_q = 0; 00348 utype num_v = 0; 00349 00350 const vector<utype> &vi = sQuery.GetCVQuery(qi); 00351 00352 /* Set up query and gold standard */ 00353 for(i=0; i<allQ.size(); i++){ 00354 if(CSeekTools::IsNaN(mapQ->GetForward(allQ[i]))) continue; 00355 is_query[allQ[i]] = 1; 00356 query.push_back(allQ[i]); 00357 num_q++; 00358 } 00359 00360 //assume goldStd must not be null 00361 assert(goldStd!=NULL); 00362 const vector<utype> &allGoldStd = goldStd->GetQuery(); 00363 for(i=0; i<allGoldStd.size(); i++){ 00364 if(CSeekTools::IsNaN(mapG->GetForward(allGoldStd[i]))) 00365 continue; 00366 if(is_query[allGoldStd[i]]==1) continue; 00367 is_gold[allGoldStd[i]] = 1; 00368 num_v++; 00369 } 00370 00371 if(num_q==0 || num_v==0){ 00372 sDataset.SetCVWeight(0, -1); 00373 }else{ 00374 /* actual weighting */ 00375 float w = 0; 00376 const utype MIN_QUERY_REQUIRED = 00377 max((utype) 1, (utype) (percent_required * query.size())); 00378 bool ret = LinearCombine(rank, query, sDataset, 00379 MIN_QUERY_REQUIRED, bSquareZ); 00380 ret = CSeekPerformanceMeasure::RankBiasedPrecision(rate, 00381 rank, w, is_query, is_gold, *mapG, &ar, TOP); 00382 if(!ret) sDataset.SetCVWeight(0, -1); 00383 else sDataset.SetCVWeight(0, w); 00384 } 00385 00386 ar.clear(); 00387 return true; 00388 } 00389 00390 bool CSeekWeighter::AverageWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset, 00391 const float &percent_required, const bool &bSquareZ, float &w){ 00392 00393 CSeekIntIntMap *mapQ = sDataset.GetQueryMap(); 00394 if(mapQ==NULL) return true; 00395 00396 utype i, j, qi, qj; 00397 00398 const vector<utype> &allQ = sQuery.GetQuery(); 00399 vector<utype> presentQ; 00400 utype num_q = 0; 00401 for(qi=0; qi<allQ.size(); qi++){ 00402 if(CSeekTools::IsNaN(mapQ->GetForward(allQ[qi]))) continue; 00403 num_q++; 00404 presentQ.push_back(allQ[qi]); 00405 } 00406 00407 w = 0; 00408 const utype MIN_QUERY_REQUIRED = 00409 max((utype) 2, (utype) (percent_required * allQ.size())); 00410 00411 if(num_q<MIN_QUERY_REQUIRED){ 00412 w = -1; 00413 return true; 00414 } 00415 00416 utype **f = sDataset.GetDataMatrix(); 00417 /* as long as rank[g] does not overflow, due to too many queries, we are fine 00418 * should control query size to be <100. */ 00419 vector<utype> queryPos; 00420 queryPos.resize(num_q); 00421 for(i=0; i<num_q; i++) 00422 queryPos[i] = mapQ->GetForward(presentQ[i]); 00423 00424 int pairs = 0; 00425 if(bSquareZ){ 00426 for(qi=0; qi<presentQ.size(); qi++){ 00427 for(qj=0; qj<presentQ.size(); qj++){ 00428 if(qi==qj) continue; 00429 float t = (float) f[presentQ[qj]][queryPos[qi]]; 00430 t = (t-320)/100.0; 00431 t = t * t; 00432 w += t; 00433 pairs++; 00434 } 00435 } 00436 w /= (float) pairs; 00437 00438 }else{ 00439 for(qi=0; qi<presentQ.size(); qi++){ 00440 for(qj=0; qj<presentQ.size(); qj++){ 00441 if(qi==qj) continue; 00442 w += (float) f[presentQ[qj]][queryPos[qi]]; 00443 pairs++; 00444 } 00445 } 00446 w /= (float) pairs; 00447 w /= (float) 640; 00448 } 00449 return true; 00450 } 00451 00452 bool CSeekWeighter::CVWeighting(CSeekQuery &sQuery, CSeekDataset &sDataset, 00453 const float &rate, const float &percent_required, const bool &bSquareZ, 00454 vector<utype> *rrank, const CSeekQuery *goldStd){ 00455 00456 CSeekIntIntMap *mapG = sDataset.GetGeneMap(); 00457 CSeekIntIntMap *mapQ = sDataset.GetQueryMap(); 00458 if(mapQ==NULL) return true; 00459 00460 utype iFold = sQuery.GetNumFold(); 00461 sDataset.InitializeCVWeight(iFold); 00462 00463 utype i, j, qi, qj; 00464 00465 vector<char> is_query_cross, is_gold; 00466 CSeekTools::InitVector(is_query_cross, sDataset.GetNumGenes(), (char) 0); 00467 CSeekTools::InitVector(is_gold, sDataset.GetNumGenes(), (char) 0); 00468 00469 vector<utype> &rank = *rrank; 00470 00471 utype TOP = 1000; 00472 //utype TOP = 0; //disable TOP approximation 00473 vector<AResult> ar; 00474 ar.resize(rank.size()); 00475 00476 const vector<utype> &allQ = sQuery.GetQuery(); 00477 00478 for(qi=0; qi<iFold; qi++){ 00479 vector<utype> cv_query; 00480 utype num_q = 0; 00481 utype num_v = 0; 00482 00483 const vector<utype> &vi = sQuery.GetCVQuery(qi); 00484 00485 /* Set up query and gold standard */ 00486 for(i=0; i<vi.size(); i++){ 00487 if(CSeekTools::IsNaN(mapQ->GetForward(vi[i]))) continue; 00488 is_query_cross[vi[i]] = 1; 00489 cv_query.push_back(vi[i]); 00490 num_q++; 00491 } 00492 00493 if(goldStd==NULL){ //if no custom gold standard 00494 //Use query itself as gold standard 00495 for(i=0; i<allQ.size(); i++){ 00496 if(CSeekTools::IsNaN(mapQ->GetForward(allQ[i]))) continue; 00497 if(is_query_cross[allQ[i]]==1) continue; 00498 is_gold[allQ[i]] = 1; 00499 num_v++; 00500 } 00501 }else{ 00502 const vector<utype> &allGoldStd = goldStd->GetQuery(); 00503 //Use custom gene-set as gold standard 00504 for(i=0; i<allGoldStd.size(); i++){ 00505 if(CSeekTools::IsNaN(mapG->GetForward(allGoldStd[i]))) 00506 continue; 00507 if(is_query_cross[allGoldStd[i]]==1) continue; 00508 is_gold[allGoldStd[i]] = 1; 00509 num_v++; 00510 } 00511 } 00512 00513 if(num_q==0 || num_v==0){ 00514 sDataset.SetCVWeight(qi, -1); 00515 //printf("num_q %d or num_v %d\n", num_q, num_v); 00516 }else{ 00517 /* actual weighting */ 00518 float w = 0; 00519 const utype MIN_QUERY_REQUIRED = 00520 max((utype) 1, (utype) (percent_required * cv_query.size())); 00521 bool ret = LinearCombine(rank, cv_query, sDataset, 00522 MIN_QUERY_REQUIRED, bSquareZ); 00523 ret = CSeekPerformanceMeasure::RankBiasedPrecision(rate, 00524 rank, w, is_query_cross, is_gold, *mapG, &ar, TOP); 00525 //fprintf(stderr, "Weight %.5f\n", w); 00526 //ret = CSeekPerformanceMeasure::AveragePrecision( 00527 // rank, w, is_query_cross, is_gold, *mapG, &ar); 00528 if(!ret) sDataset.SetCVWeight(qi, -1); 00529 else sDataset.SetCVWeight(qi, w); 00530 //printf("Weight: %.5f\n", w); 00531 } 00532 00533 /* Reset query and gold standard */ 00534 for(i=0; i<vi.size(); i++){ 00535 if(CSeekTools::IsNaN(mapQ->GetForward(vi[i]))) continue; 00536 is_query_cross[vi[i]] = 0; 00537 } 00538 00539 if(goldStd==NULL){ 00540 for(i=0; i<allQ.size(); i++){ 00541 if(CSeekTools::IsNaN(mapQ->GetForward(allQ[i]))) continue; 00542 is_gold[allQ[i]]=0; 00543 } 00544 }else{ 00545 const vector<utype> &allGoldStd = goldStd->GetQuery(); 00546 for(i=0; i<allGoldStd.size(); i++){ 00547 if(CSeekTools::IsNaN(mapG->GetForward(allGoldStd[i]))) 00548 continue; 00549 is_gold[allGoldStd[i]] = 0; 00550 } 00551 } 00552 00553 } 00554 00555 ar.clear(); 00556 00557 return true; 00558 } 00559 00560 }