Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 #include "statistics.h" 00026 #include "datapair.h" 00027 #include "file.h" 00028 00029 struct GeneStruct { 00030 size_t idx; 00031 float score; 00032 }; 00033 00034 struct SortGeneStruct { 00035 00036 bool operator()(const GeneStruct rOne, const GeneStruct rTwo) const { 00037 return (rOne.score > rTwo.score); 00038 } 00039 }; 00040 00041 class PairStruct { 00042 public: 00043 size_t g1; 00044 size_t g2; 00045 float score; 00046 00047 PairStruct(size_t g1idx, size_t g2idx){ 00048 g1 = g1idx; 00049 g2 = g2idx; 00050 score = 0; 00051 } 00052 }; 00053 00054 struct SortPairStruct { 00055 00056 bool operator()(const PairStruct* rOne, const PairStruct* rTwo) const { 00057 return (rOne->score < rTwo->score); 00058 } 00059 }; 00060 00061 00062 bool open_genepairs(char* szFile, vector<PairStruct*>& vecPairs, CPCL& GDist){ 00063 ifstream istm; 00064 const char* pc; 00065 char* pcTail; 00066 char* acBuf; 00067 string strToken, strCache, strValue; 00068 size_t iOne, iTwo, i; 00069 const char c_acComment[]= "#"; 00070 00071 istm.open( szFile ); 00072 00073 acBuf = new char[ CFile::GetBufferSize() ]; 00074 while( istm.peek( ) != EOF ) { 00075 istm.getline( acBuf, CFile::GetBufferSize() - 1 ); 00076 strToken = CFile::OpenToken( acBuf, &pc ); 00077 if( !strToken.length( ) ) 00078 break; 00079 if( strToken == c_acComment ) 00080 continue; 00081 if( strToken != strCache ) { 00082 strCache = strToken; 00083 00084 iOne = GDist.GetGene(strToken); 00085 if( iOne == -1){ 00086 cerr << "Skipping gene pair, missing gene: " << strToken << endl; 00087 continue; 00088 } 00089 00090 //iOne = MapGene( mapGenes, m_vecstrGenes, strToken ); 00091 /* 00092 if(GDist != NULL){ 00093 iOne = GDist.GetGene(strToken); 00094 if( iOne == -1){ 00095 cerr << "Skipping gene pair, missing gene: " << strToken << endl; 00096 continue; 00097 } 00098 }else{ 00099 iOne = atoi(strToken.c_str()); 00100 } 00101 */ 00102 } 00103 00104 strToken = CFile::OpenToken( pc, &pc ); 00105 if( !strToken.length( ) ) { 00106 delete[] acBuf; 00107 return false; } 00108 00109 iTwo = GDist.GetGene(strToken); 00110 if( iTwo == -1){ 00111 cerr << "Skipping gene pair, missing gene: " << strToken << endl; 00112 continue; 00113 } 00114 //iTwo = MapGene( mapGenes, m_vecstrGenes, strToken ); 00115 /* 00116 if(GDist != NULL){ 00117 iTwo = GDist.GetGene(strToken); 00118 if( iTwo == -1){ 00119 cerr << "Skipping gene pair, missing gene: " << strToken << endl; 00120 continue; 00121 } 00122 }else{ 00123 iTwo = atoi(strToken.c_str()); 00124 } 00125 */ 00126 00127 vecPairs.push_back( new PairStruct(iOne, iTwo) ); 00128 } 00129 delete[] acBuf; 00130 return true; 00131 } 00132 00133 void find_bunch(CPCL& GDist, vector<PairStruct*>& vecPairs, vector<PairStruct*>& vecFinalPairs){ 00134 float C, d, d1, d2, density; 00135 size_t i, j, k, p, Bi, Bj, x; 00136 vector<PairStruct*> result_pairs; 00137 vector<PairStruct*> unresolved_pairs; 00138 00139 result_pairs.resize(vecPairs.size()); 00140 unresolved_pairs.resize(vecPairs.size()); 00141 00142 density = CMeta::GetNaN(); 00143 for(i=0; i < GDist.GetGenes(); i++){ 00144 for(j=0; j < GDist.GetGenes(); j++){ 00145 00146 if(i == j || CMeta::IsNaN(d = GDist.Get(i, j))) 00147 continue; 00148 00149 for(p=0; p < vecPairs.size(); p++){ 00150 if(CMeta::IsNaN(d1 = GDist.Get(vecPairs[p]->g1, i)) || 00151 CMeta::IsNaN(d2 = GDist.Get(j, vecPairs[p]->g2))){ 00152 vecPairs[p]->score = CMeta::GetNaN(); 00153 continue; 00154 } 00155 vecPairs[p]->score = d1 + d2; 00156 } 00157 00158 // DEBUG!!! check if direction is correct 00159 sort(vecPairs.begin(), vecPairs.end(), SortPairStruct()); 00160 00161 C = d; 00162 for(p=0; p < vecPairs.size(); p++){ 00163 // check nan 00164 // DEBUG is this right 00165 if(CMeta::IsNaN(vecPairs[p]->score)) 00166 break; 00167 00168 C += vecPairs[p]->score; 00169 00170 if(C/(p+1.0) < density){ 00171 density = C/(p+1.0); 00172 00173 result_pairs.clear(); 00174 unresolved_pairs.clear(); 00175 00176 // store to B the i,j and pairs up to p 00177 for(x=0; x< vecPairs.size(); x++){ 00178 if( x <= p ) 00179 result_pairs.push_back(vecPairs[x]); 00180 else 00181 unresolved_pairs.push_back(vecPairs[x]); 00182 } 00183 Bi = i; 00184 Bj = j; 00185 } 00186 } 00187 } 00188 } 00189 00190 // now set up results 00191 vecFinalPairs.push_back(new PairStruct(Bi, Bj)); 00192 for(i=0; i < result_pairs.size(); i++){ 00193 vecFinalPairs.push_back(result_pairs[i]); 00194 } 00195 00196 // set up teh resulting pairs for next execution 00197 vecPairs.clear(); 00198 for(j=0; j < unresolved_pairs.size(); j++){ 00199 vecPairs.push_back(unresolved_pairs[j]); 00200 } 00201 } 00202 00203 00204 00205 00206 int main( int iArgs, char** aszArgs ) { 00207 gengetopt_args_info sArgs; 00208 CGenome Genome; 00209 CGenes Genes( Genome ); 00210 ifstream ifsm; 00211 CDat Dat; 00212 CDat Sim; 00213 size_t i, j, k, p; 00214 bool fModified; 00215 float d, d1, d2; 00216 CPCL PCL; 00217 00218 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00219 cmdline_parser_print_help( ); 00220 return 1; } 00221 00222 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00223 00224 // open query set genes 00225 if( sArgs.query_given ) { 00226 ifsm.open( sArgs.query_arg ); 00227 if( !Genes.Open( ifsm ) ) { 00228 cerr << "Could not open: " << sArgs.query_arg << endl; 00229 return 1; } 00230 ifsm.close( ); } 00231 00232 cerr << "Num genes: " << Genes.GetGenes() << endl; 00233 00234 // open similarity 00235 if( sArgs.sim_given ) { 00236 if( !Sim.Open( sArgs.sim_arg, sArgs.memmap_flag ) ) { 00237 cerr << "Could not open: " << sArgs.sim_arg << endl; 00238 return 1; } 00239 00240 if( sArgs.normalize_flag ){ 00241 Sim.Normalize( CDat::ENormalizeMinMax ); 00242 cerr << "Normalize: " << sArgs.sim_arg << endl; 00243 } 00244 } 00245 00246 // open tissue expression matrix 00247 if( sArgs.tissue_given ){ 00248 if (!PCL.Open(sArgs.tissue_arg, sArgs.skip_arg)) { 00249 cerr << "Could not open: " << sArgs.tissue_arg << endl; 00250 return 1; 00251 } 00252 } 00253 00254 vector<size_t> qidxs; 00255 vector<GeneStruct> geneweights; 00256 00257 cerr << "check #genes: " << Sim.GetGenes() << endl; 00258 //cerr << "check: " << Sim.GetGene( "9261" ) << endl; 00259 //cerr << "check: " << Sim.GetGene( "1385" ) << endl; 00260 00261 qidxs.resize(Genes.GetGenes()); 00262 for(i=0; i < Genes.GetGenes(); i++){ 00263 //qidxs[i] = Sim.GetGene( Genes.GetGene(i).GetName().c_str() ); 00264 qidxs[i] = Sim.GetGene( Genes.GetGene(i).GetName() ); 00265 //cerr << Genes.GetGene(i).GetName() << "\t" << qidxs[i] << endl; 00266 } 00267 00268 // tissue 00269 vector<size_t> qidxs_pcl; 00270 vector<float> genetissue; 00271 vector<float> tprofile; 00272 vector<size_t> sim2pcl_idx; 00273 if( sArgs.tissue_given ){ 00274 float profilesum; 00275 00276 qidxs_pcl.resize(Genes.GetGenes()); 00277 for(i=0; i < Genes.GetGenes(); i++){ 00278 qidxs_pcl[i] = PCL.GetGene( Genes.GetGene(i).GetName() ); 00279 } 00280 00281 // construct query gene group tissue profile 00282 tprofile.resize( PCL.GetExperiments() ); 00283 for(j=0; j < PCL.GetExperiments(); j++){ 00284 tprofile[j] = 0; 00285 } 00286 00287 profilesum = 0; 00288 for(i=0; i < Genes.GetGenes(); i++){ 00289 if( qidxs_pcl[i] == -1 ) 00290 continue; 00291 for(j=0; j < PCL.GetExperiments(); j++){ 00292 // maybe check for nan 00293 if( ! CMeta::IsNaN(( d = PCL.Get(qidxs_pcl[i], j) )) ){ 00294 tprofile[j] += d; 00295 profilesum += d; 00296 } 00297 } 00298 } 00299 00300 // normalize tissue profile 00301 for(j=0; j < PCL.GetExperiments(); j++){ 00302 tprofile[j] = tprofile[j] / profilesum; 00303 } 00304 00305 genetissue.resize(PCL.GetGenes()); 00306 00307 float tsum; 00308 float profile_sum; 00309 for(i=0; i < PCL.GetGenes(); i++){ 00310 tsum = 0; 00311 profile_sum = 0; 00312 for(j=0; j < PCL.GetExperiments(); j++){ 00313 // check NaN 00314 if( ! CMeta::IsNaN(( d = PCL.Get(i, j) )) ){ 00315 tsum += (tprofile[j] * d); 00316 profile_sum += d; 00317 } 00318 } 00319 genetissue[i] = tsum / profile_sum; 00320 } 00321 00322 if(sArgs.sim_given){ 00323 sim2pcl_idx.resize(Sim.GetGenes()); 00324 for(i = 0; i < Sim.GetGenes(); i++){ 00325 sim2pcl_idx[i] = PCL.GetGene( Sim.GetGene( i ) ); 00326 } 00327 }else{ 00328 // no imilarity matrix just print results 00329 geneweights.resize(PCL.GetGenes()); 00330 for(i=0; i < geneweights.size(); i++){ 00331 geneweights[i].score = genetissue[i]; 00332 geneweights[i].idx = i; 00333 } 00334 // sort genes by weight 00335 sort(geneweights.begin(), geneweights.end(), SortGeneStruct()); 00336 00337 for(i=0; i < PCL.GetGenes(); i++){ 00338 if( CMeta::IsNaN(geneweights[i].score)) 00339 continue; 00340 cout << PCL.GetGene( geneweights[i].idx ) << "\t" << geneweights[i].score << endl; 00341 } 00342 00343 return 0; 00344 } 00345 } 00346 00347 if(sArgs.query_given){ 00348 00349 cerr << "mapped: " << qidxs.size() << endl; 00350 00351 geneweights.resize(Sim.GetGenes()); 00352 for(i=0; i < geneweights.size(); i++){ 00353 geneweights[i].score = 0.0; 00354 geneweights[i].idx = i; 00355 } 00356 00357 for(i=0; i < Sim.GetGenes(); i++){ 00358 for(j=0; j < qidxs.size(); j++){ 00359 if( i == qidxs[j] || qidxs[j] == -1 ) 00360 continue; 00361 if( !CMeta::IsNaN( (d = Sim.Get(i, qidxs[j])))){ 00362 if( sArgs.tissue_given ) 00363 if( sim2pcl_idx[i] != -1 ) 00364 geneweights[i].score += (d * genetissue[sim2pcl_idx[i]]); 00365 else // currently genes only in Sim matrix and not in PCL removed 00366 geneweights[i].score = CMeta::GetNaN( ); 00367 else 00368 geneweights[i].score += d; 00369 } 00370 } 00371 } 00372 00373 for(i=0; i < Genes.GetGenes(); i++){ 00374 geneweights[qidxs[i]].score = CMeta::GetNaN( ); 00375 } 00376 00377 // sort genes by weight 00378 sort(geneweights.begin(), geneweights.end(), SortGeneStruct()); 00379 00380 for(i=0; i < Sim.GetGenes(); i++){ 00381 if( CMeta::IsNaN(geneweights[i].score)) 00382 continue; 00383 cout << Sim.GetGene( geneweights[i].idx ) << "\t" << geneweights[i].score << endl; 00384 } 00385 return 0; 00386 } 00388 00389 CPCL GPCL; 00390 CPCL GDist; 00391 CPCL GNext; 00392 00393 // open gene direct distance matrix 00394 if( sArgs.input_given ){ 00395 if (!GPCL.Open(sArgs.input_arg, sArgs.skip_arg)) { 00396 cerr << "Could not open: " << sArgs.input_arg << endl; 00397 return 1; 00398 } 00399 cerr << "Open: " << sArgs.input_given << endl; 00400 00401 if( sArgs.NegLog_flag ){ 00402 // convert the values to -log(prob) 00403 for(i=0; i < GPCL.GetGenes(); i++){ 00404 for(j=0; j < GPCL.GetGenes(); j++){ 00405 if( i == j ) 00406 GPCL.Set(i, j, CMeta::GetNaN()); 00407 00408 if( CMeta::IsNaN( d = GPCL.Get(i,j) ) ) 00409 continue; 00410 00411 GPCL.Set(i, j, -log(d)); 00412 } 00413 } 00414 } 00415 } 00416 00417 GDist.Open(GPCL); 00418 cerr << "opend1: " << GDist.GetGenes() << endl; 00419 GNext.Open(GPCL); 00420 cerr << "opend2: " << GNext.GetGenes() << endl; 00421 00422 // initialize two matrix 00423 for(i=0; i < GDist.GetGenes(); i++){ 00424 //GDist.Set(i, CMeta::GetNaN()); 00425 for(j=0; j < GDist.GetGenes(); j++){ 00426 if( i == j ) 00427 GNext.Set(i, j, CMeta::GetNaN()); 00428 else 00429 GNext.Set(i, j, i); 00430 } 00431 } 00432 00433 cerr << "WTF" << endl; 00434 00435 for(k=0; k < GDist.GetGenes(); k++){ 00436 00437 if( k % 100 == 0 ) 00438 cerr << "k: " << k << endl; 00439 00440 for(i=0; i < GDist.GetGenes(); i++){ 00441 for(j=0; j < GDist.GetGenes(); j++){ 00442 if( i == j) 00443 continue; 00444 // update distance matrix 00445 // (i,k), (k,j) 00446 d = GDist.Get(i, j); 00447 00448 if( CMeta::IsNaN(d1 = GDist.Get(i, k)) || CMeta::IsNaN(d2 = GDist.Get(k, j)) ) 00449 continue; 00450 00451 if( CMeta::IsNaN(d) || (d1 + d2) < d ){ 00452 GDist.Set( i,j, (d1+d2) ); 00453 00454 // update path matrix 00455 GNext.Set( i,j, GNext.Get(k, j)); 00456 } 00457 } 00458 } 00459 } 00460 00461 cerr << "done" << endl; 00462 00463 if(sArgs.savematrix_flag){ 00464 std::ofstream ofsm; 00465 00466 cerr << "Save output" << endl; 00467 std::stringstream sstmDist; 00468 std::stringstream sstmNext; 00469 00470 std::string path(sArgs.input_arg); 00471 size_t pos = path.find_last_of("/"); 00472 size_t lastindex = path.find_last_of("."); 00473 std::string rawname = path.substr(0, lastindex); 00474 00475 sstmNext << "" << rawname << ".next.bin"; 00476 cerr << "Save next matrix: " << sstmNext.str() << endl; 00477 ofsm.open(sstmNext.str().c_str()); 00478 00479 GNext.SaveBinary(ofsm); 00480 ofsm.close(); 00481 00482 if( !sArgs.savedab_flag ){ 00483 sstmDist << "" << rawname << ".dist.bin"; 00484 00485 ofsm.open(sstmDist.str().c_str()); 00486 cerr << "Save distance matrix: " << sstmDist.str() << endl; 00487 GDist.SaveBinary(ofsm); 00488 ofsm.close(); 00489 }else{ 00490 CDat GDistDat; 00491 GDistDat.Open( GDist.GetGeneNames()); 00492 00493 sstmDist << "" << rawname << ".dist.dab"; 00494 00495 // map PCL to dab 00496 for(i=0; i < GDistDat.GetGenes(); i++) 00497 for(j=(i+1); j < GDistDat.GetGenes(); j++){ 00498 GDistDat.Set(i, j, (GDist.Get(i, j) + GDist.Get(j, i))/2.0 ); 00499 } 00500 00501 cerr << "Save distance matrix: " << sstmDist.str() << endl; 00502 GDistDat.Save(sstmDist.str().c_str()); 00503 } 00504 } 00505 00506 //**************** 00507 //* find steiner tree 00508 00509 if( sArgs.genepairs_given ){ 00510 // read the gene pair 00511 vector<PairStruct*> vecOrigPairs; 00512 vector<PairStruct*> vecPairs; 00513 vector<PairStruct*> vecFinalPairs; 00514 float C, density; 00515 00516 if( ! open_genepairs(sArgs.genepairs_arg, vecOrigPairs, GDist) ){ 00517 cerr << "ERROR: failed to open " << sArgs.genepairs_arg << endl; 00518 return 1; 00519 } 00520 vecPairs.resize(vecOrigPairs.size()); 00521 00522 for(i =0; i < vecOrigPairs.size(); i++){ 00523 vecPairs[i] = vecOrigPairs[i]; 00524 } 00525 00526 // might just make it that if all genes are covered halt 00527 // currently it is if all pairs are covered 00528 while(vecPairs.size() > 0){ 00529 find_bunch(GDist, vecPairs, vecFinalPairs); 00530 } 00531 00532 // now vecFinalPairs = original required pairs + new gene pairs 00533 // print output 00534 } 00535 00536 00537 00538 return 0; 00539 }