Sleipnir
tools/NetMiner/NetMiner.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 #include "statistics.h"
00026 #include "datapair.h"
00027 #include "file.h"
00028 
00029 struct GeneStruct {
00030   size_t idx;
00031   float score;
00032 };
00033 
00034 struct SortGeneStruct {
00035 
00036     bool operator()(const GeneStruct rOne, const GeneStruct rTwo) const {
00037         return (rOne.score > rTwo.score);
00038     }
00039 };
00040 
00041 class PairStruct {
00042 public:
00043   size_t g1;
00044   size_t g2;
00045   float score;
00046   
00047   PairStruct(size_t g1idx, size_t g2idx){
00048     g1 = g1idx;
00049     g2 = g2idx;
00050     score = 0;
00051   }
00052 };
00053 
00054 struct SortPairStruct {
00055   
00056   bool operator()(const PairStruct* rOne, const PairStruct* rTwo) const {
00057     return (rOne->score < rTwo->score);
00058   }
00059 };
00060 
00061 
00062 bool open_genepairs(char* szFile, vector<PairStruct*>& vecPairs, CPCL& GDist){
00063   ifstream istm;
00064   const char*   pc;
00065   char*     pcTail;
00066   char*     acBuf;
00067   string        strToken, strCache, strValue;
00068   size_t        iOne, iTwo, i;
00069   const char c_acComment[]= "#";
00070   
00071   istm.open( szFile );
00072   
00073   acBuf = new char[ CFile::GetBufferSize() ];
00074   while( istm.peek( ) != EOF ) {
00075     istm.getline( acBuf, CFile::GetBufferSize() - 1 );
00076     strToken = CFile::OpenToken( acBuf, &pc );
00077     if( !strToken.length( ) )
00078       break;
00079     if( strToken == c_acComment )
00080       continue;
00081     if( strToken != strCache ) {
00082       strCache = strToken;
00083       
00084       iOne = GDist.GetGene(strToken);
00085       if( iOne == -1){
00086     cerr << "Skipping gene pair, missing gene: " << strToken << endl;
00087     continue;
00088       }
00089       
00090       //iOne = MapGene( mapGenes, m_vecstrGenes, strToken ); 
00091       /*
00092       if(GDist != NULL){
00093     iOne = GDist.GetGene(strToken);
00094     if( iOne == -1){
00095       cerr << "Skipping gene pair, missing gene: " << strToken << endl;
00096       continue;
00097     }
00098       }else{
00099     iOne = atoi(strToken.c_str());
00100       }
00101       */
00102     }
00103     
00104     strToken = CFile::OpenToken( pc, &pc );
00105     if( !strToken.length( ) ) {
00106       delete[] acBuf;
00107       return false; }
00108     
00109     iTwo = GDist.GetGene(strToken);
00110     if( iTwo == -1){
00111       cerr << "Skipping gene pair, missing gene: " << strToken << endl;
00112       continue;
00113     }    
00114     //iTwo = MapGene( mapGenes, m_vecstrGenes, strToken );
00115     /*
00116     if(GDist != NULL){
00117       iTwo = GDist.GetGene(strToken);
00118       if( iTwo == -1){
00119     cerr << "Skipping gene pair, missing gene: " << strToken << endl;
00120     continue;
00121       }
00122     }else{
00123       iTwo = atoi(strToken.c_str());
00124     }
00125     */
00126     
00127     vecPairs.push_back( new PairStruct(iOne, iTwo) );
00128   }
00129   delete[] acBuf;  
00130   return true;
00131 }
00132 
00133 void find_bunch(CPCL& GDist, vector<PairStruct*>& vecPairs, vector<PairStruct*>& vecFinalPairs){
00134   float         C, d, d1, d2, density;
00135   size_t    i, j, k, p, Bi, Bj, x;  
00136   vector<PairStruct*> result_pairs;
00137   vector<PairStruct*> unresolved_pairs;
00138   
00139   result_pairs.resize(vecPairs.size());
00140   unresolved_pairs.resize(vecPairs.size());
00141   
00142   density = CMeta::GetNaN();
00143   for(i=0; i < GDist.GetGenes(); i++){
00144     for(j=0; j < GDist.GetGenes(); j++){
00145       
00146       if(i == j || CMeta::IsNaN(d = GDist.Get(i, j)))
00147     continue;
00148       
00149       for(p=0; p < vecPairs.size(); p++){
00150     if(CMeta::IsNaN(d1 = GDist.Get(vecPairs[p]->g1, i)) ||
00151        CMeta::IsNaN(d2 = GDist.Get(j, vecPairs[p]->g2))){
00152       vecPairs[p]->score = CMeta::GetNaN();
00153       continue;
00154     }
00155     vecPairs[p]->score =  d1 + d2;
00156       }
00157       
00158       // DEBUG!!! check if direction is correct
00159       sort(vecPairs.begin(), vecPairs.end(), SortPairStruct());
00160       
00161       C = d;
00162       for(p=0; p < vecPairs.size(); p++){
00163     // check nan
00164     // DEBUG is this right
00165     if(CMeta::IsNaN(vecPairs[p]->score))
00166       break;
00167     
00168     C += vecPairs[p]->score;
00169     
00170     if(C/(p+1.0) < density){
00171       density = C/(p+1.0);
00172       
00173       result_pairs.clear();   
00174       unresolved_pairs.clear();
00175       
00176       // store to B the i,j and pairs up to p
00177       for(x=0; x< vecPairs.size(); x++){
00178         if( x <= p )
00179           result_pairs.push_back(vecPairs[x]);
00180         else
00181           unresolved_pairs.push_back(vecPairs[x]);
00182       }
00183       Bi = i;
00184       Bj = j;     
00185     }
00186       }      
00187     }
00188   }  
00189   
00190   // now set up results
00191   vecFinalPairs.push_back(new PairStruct(Bi, Bj));  
00192   for(i=0; i < result_pairs.size(); i++){
00193     vecFinalPairs.push_back(result_pairs[i]);
00194   }
00195   
00196   // set up teh resulting pairs for next execution
00197   vecPairs.clear();
00198   for(j=0; j < unresolved_pairs.size(); j++){
00199     vecPairs.push_back(unresolved_pairs[j]);
00200   }
00201 }
00202 
00203 
00204 
00205 
00206 int main( int iArgs, char** aszArgs ) {
00207     gengetopt_args_info sArgs;
00208     CGenome             Genome;
00209     CGenes              Genes( Genome );
00210     ifstream            ifsm;
00211     CDat                Dat;
00212     CDat                Sim;
00213     size_t              i, j, k, p;
00214     bool                fModified;
00215     float d, d1, d2;
00216     CPCL PCL;
00217     
00218     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00219         cmdline_parser_print_help( );
00220         return 1; }
00221 
00222     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00223     
00224     // open query set genes
00225     if( sArgs.query_given ) {
00226       ifsm.open( sArgs.query_arg );
00227         if( !Genes.Open( ifsm ) ) {
00228           cerr << "Could not open: " << sArgs.query_arg << endl;
00229           return 1; }
00230         ifsm.close( ); }
00231     
00232     cerr << "Num genes: " << Genes.GetGenes()  << endl;
00233     
00234     // open similarity
00235     if( sArgs.sim_given ) {
00236       if( !Sim.Open( sArgs.sim_arg, sArgs.memmap_flag ) ) {
00237         cerr << "Could not open: " << sArgs.sim_arg << endl;
00238         return 1; } 
00239 
00240       if( sArgs.normalize_flag ){
00241         Sim.Normalize( CDat::ENormalizeMinMax );
00242         cerr << "Normalize: " << sArgs.sim_arg << endl;
00243       }
00244     }
00245     
00246     // open tissue expression matrix
00247     if( sArgs.tissue_given ){
00248       if (!PCL.Open(sArgs.tissue_arg, sArgs.skip_arg)) {
00249         cerr << "Could not open: " << sArgs.tissue_arg << endl;
00250         return 1;
00251       }
00252     }
00253     
00254     vector<size_t> qidxs;
00255     vector<GeneStruct> geneweights;
00256     
00257     cerr << "check #genes: " << Sim.GetGenes() << endl;
00258     //cerr << "check: " << Sim.GetGene( "9261" ) << endl;
00259     //cerr << "check: " << Sim.GetGene( "1385" ) << endl;
00260 
00261     qidxs.resize(Genes.GetGenes());
00262     for(i=0; i < Genes.GetGenes(); i++){
00263       //qidxs[i] = Sim.GetGene( Genes.GetGene(i).GetName().c_str() );
00264       qidxs[i] = Sim.GetGene( Genes.GetGene(i).GetName() );
00265       //cerr << Genes.GetGene(i).GetName() << "\t" << qidxs[i] << endl;
00266     }
00267     
00268     // tissue 
00269     vector<size_t> qidxs_pcl;
00270     vector<float> genetissue;
00271     vector<float> tprofile;
00272     vector<size_t> sim2pcl_idx;
00273     if( sArgs.tissue_given ){
00274       float profilesum;
00275       
00276       qidxs_pcl.resize(Genes.GetGenes());
00277       for(i=0; i < Genes.GetGenes(); i++){
00278         qidxs_pcl[i] = PCL.GetGene( Genes.GetGene(i).GetName() );
00279       }
00280       
00281       // construct query gene group tissue profile
00282       tprofile.resize( PCL.GetExperiments() );
00283       for(j=0; j < PCL.GetExperiments(); j++){
00284         tprofile[j] = 0;
00285       }
00286       
00287       profilesum = 0;
00288       for(i=0; i < Genes.GetGenes(); i++){              
00289         if( qidxs_pcl[i] == -1 )
00290           continue;     
00291         for(j=0; j < PCL.GetExperiments(); j++){
00292           // maybe check for nan
00293           if( ! CMeta::IsNaN(( d = PCL.Get(qidxs_pcl[i], j) )) ){
00294         tprofile[j] += d; 
00295         profilesum  += d;
00296           }
00297         }
00298       }
00299       
00300       // normalize tissue profile
00301       for(j=0; j < PCL.GetExperiments(); j++){
00302         tprofile[j] = tprofile[j] / profilesum;
00303       }
00304           
00305       genetissue.resize(PCL.GetGenes());
00306       
00307       float tsum;
00308       float profile_sum;
00309       for(i=0; i < PCL.GetGenes(); i++){                
00310         tsum = 0;
00311         profile_sum = 0;
00312         for(j=0; j < PCL.GetExperiments(); j++){          
00313           // check NaN
00314           if( ! CMeta::IsNaN(( d = PCL.Get(i, j) )) ){
00315         tsum += (tprofile[j] * d);      
00316         profile_sum += d;
00317           }
00318         }
00319         genetissue[i] = tsum / profile_sum;
00320       }
00321       
00322       if(sArgs.sim_given){
00323         sim2pcl_idx.resize(Sim.GetGenes());
00324         for(i = 0; i < Sim.GetGenes(); i++){
00325           sim2pcl_idx[i] = PCL.GetGene( Sim.GetGene( i ) );
00326         }
00327       }else{
00328         // no imilarity matrix just print results
00329         geneweights.resize(PCL.GetGenes());
00330         for(i=0; i < geneweights.size(); i++){
00331           geneweights[i].score = genetissue[i];
00332           geneweights[i].idx = i;
00333         }
00334         // sort genes by weight
00335         sort(geneweights.begin(), geneweights.end(), SortGeneStruct());
00336         
00337         for(i=0; i < PCL.GetGenes(); i++){
00338           if( CMeta::IsNaN(geneweights[i].score))
00339         continue;
00340           cout << PCL.GetGene( geneweights[i].idx ) << "\t" << geneweights[i].score << endl;
00341         }
00342 
00343         return 0;
00344       }
00345     }
00346     
00347     if(sArgs.query_given){
00348       
00349       cerr << "mapped: " << qidxs.size() << endl;
00350       
00351       geneweights.resize(Sim.GetGenes());
00352       for(i=0; i < geneweights.size(); i++){
00353         geneweights[i].score = 0.0;
00354         geneweights[i].idx = i;
00355       }
00356       
00357       for(i=0; i < Sim.GetGenes(); i++){
00358         for(j=0; j < qidxs.size(); j++){
00359           if( i == qidxs[j] || qidxs[j] == -1 )
00360         continue;
00361           if( !CMeta::IsNaN( (d = Sim.Get(i, qidxs[j])))){
00362         if( sArgs.tissue_given )
00363           if( sim2pcl_idx[i] != -1 )
00364             geneweights[i].score += (d * genetissue[sim2pcl_idx[i]]);
00365           else // currently genes only in Sim matrix and not in PCL removed       
00366             geneweights[i].score = CMeta::GetNaN( );
00367         else
00368           geneweights[i].score += d;
00369           }
00370         }
00371       }
00372       
00373       for(i=0; i < Genes.GetGenes(); i++){  
00374         geneweights[qidxs[i]].score = CMeta::GetNaN( );
00375       }
00376       
00377       // sort genes by weight
00378       sort(geneweights.begin(), geneweights.end(), SortGeneStruct());
00379       
00380       for(i=0; i < Sim.GetGenes(); i++){
00381         if( CMeta::IsNaN(geneweights[i].score))
00382           continue;
00383         cout << Sim.GetGene( geneweights[i].idx ) << "\t" << geneweights[i].score << endl;
00384       }
00385       return 0;
00386     }
00388     
00389     CPCL GPCL;  
00390     CPCL GDist;
00391     CPCL GNext;
00392     
00393     // open gene direct distance matrix
00394     if( sArgs.input_given ){
00395       if (!GPCL.Open(sArgs.input_arg, sArgs.skip_arg)) {
00396         cerr << "Could not open: " << sArgs.input_arg << endl;
00397         return 1;
00398       }
00399       cerr << "Open: " << sArgs.input_given << endl;
00400       
00401       if( sArgs.NegLog_flag ){
00402         // convert the values to -log(prob)
00403         for(i=0; i < GPCL.GetGenes(); i++){
00404           for(j=0; j < GPCL.GetGenes(); j++){       
00405         if( i == j )
00406           GPCL.Set(i, j, CMeta::GetNaN());
00407         
00408         if( CMeta::IsNaN( d = GPCL.Get(i,j) ) )
00409           continue;       
00410         
00411         GPCL.Set(i, j, -log(d));
00412           }
00413         }
00414       }
00415     }
00416     
00417     GDist.Open(GPCL);
00418     cerr << "opend1: " << GDist.GetGenes() << endl;
00419     GNext.Open(GPCL);
00420     cerr << "opend2: " << GNext.GetGenes() << endl;
00421     
00422     // initialize two matrix
00423     for(i=0; i < GDist.GetGenes(); i++){
00424       //GDist.Set(i, CMeta::GetNaN());
00425       for(j=0; j < GDist.GetGenes(); j++){      
00426         if( i == j )
00427           GNext.Set(i, j, CMeta::GetNaN());
00428         else
00429           GNext.Set(i, j, i);
00430       }
00431     }
00432     
00433     cerr << "WTF" << endl;
00434     
00435     for(k=0; k < GDist.GetGenes(); k++){
00436       
00437       if( k % 100 == 0 )
00438         cerr << "k: " << k << endl;
00439       
00440       for(i=0; i < GDist.GetGenes(); i++){
00441         for(j=0; j < GDist.GetGenes(); j++){
00442           if( i == j)
00443         continue;
00444           // update distance matrix
00445           // (i,k), (k,j)
00446           d = GDist.Get(i, j);
00447           
00448           if( CMeta::IsNaN(d1 = GDist.Get(i, k)) || CMeta::IsNaN(d2 = GDist.Get(k, j)) )
00449         continue;
00450           
00451           if( CMeta::IsNaN(d) || (d1 + d2) < d ){
00452         GDist.Set( i,j, (d1+d2) );
00453         
00454         // update path matrix
00455         GNext.Set( i,j, GNext.Get(k, j));
00456           }
00457         }
00458       }
00459     }
00460     
00461     cerr << "done" << endl;
00462     
00463     if(sArgs.savematrix_flag){
00464       std::ofstream ofsm;
00465       
00466       cerr << "Save output" << endl;
00467       std::stringstream sstmDist;
00468       std::stringstream sstmNext;
00469       
00470       std::string path(sArgs.input_arg);
00471       size_t pos = path.find_last_of("/");
00472       size_t lastindex = path.find_last_of(".");
00473       std::string rawname = path.substr(0, lastindex); 
00474       
00475       sstmNext << "" << rawname << ".next.bin";
00476       cerr << "Save next matrix: " << sstmNext.str() << endl;     
00477       ofsm.open(sstmNext.str().c_str());
00478       
00479       GNext.SaveBinary(ofsm);
00480       ofsm.close();
00481       
00482       if( !sArgs.savedab_flag ){
00483         sstmDist << "" << rawname << ".dist.bin";
00484 
00485         ofsm.open(sstmDist.str().c_str());
00486         cerr << "Save distance matrix: " << sstmDist.str() << endl;   
00487         GDist.SaveBinary(ofsm);
00488         ofsm.close();
00489       }else{
00490         CDat GDistDat;
00491         GDistDat.Open( GDist.GetGeneNames());
00492         
00493         sstmDist << "" << rawname << ".dist.dab";
00494         
00495         // map PCL to dab
00496         for(i=0; i < GDistDat.GetGenes(); i++)
00497           for(j=(i+1); j < GDistDat.GetGenes(); j++){
00498         GDistDat.Set(i, j, (GDist.Get(i, j) + GDist.Get(j, i))/2.0 );       
00499           }
00500         
00501         cerr << "Save distance matrix: " << sstmDist.str() << endl;   
00502         GDistDat.Save(sstmDist.str().c_str());
00503       }
00504     }
00505     
00506     //****************
00507     //* find steiner tree
00508     
00509     if( sArgs.genepairs_given ){
00510       // read the gene pair
00511       vector<PairStruct*> vecOrigPairs;
00512       vector<PairStruct*> vecPairs;   
00513       vector<PairStruct*> vecFinalPairs;
00514       float C, density;
00515       
00516       if( ! open_genepairs(sArgs.genepairs_arg, vecOrigPairs, GDist) ){
00517         cerr << "ERROR: failed to open " << sArgs.genepairs_arg << endl;
00518         return 1;
00519       }
00520       vecPairs.resize(vecOrigPairs.size());
00521       
00522       for(i =0; i < vecOrigPairs.size(); i++){
00523         vecPairs[i] = vecOrigPairs[i];
00524       }
00525       
00526       // might just make it that if all genes are covered halt
00527       // currently it is if all pairs are covered
00528       while(vecPairs.size() > 0){
00529         find_bunch(GDist, vecPairs, vecFinalPairs);     
00530       }
00531       
00532       // now vecFinalPairs = original required pairs + new gene pairs
00533       //  print output
00534     }
00535     
00536     
00537     
00538     return 0; 
00539 }