Sleipnir
tools/SparseNetCombiner/SparseNetCombiner.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 
00024 static const char   c_acDab[]   = ".dab";
00025 static const char   c_acQDab[]   = ".qdab";
00026 static const char   c_acDat[]   = ".dat";
00027 static const char   c_acSVM[]   = ".svm";
00028 static const char   c_acOUT[]   = ".out";
00029 
00030 enum EMethod {
00031     EMethodBegin    = 0,
00032     EMethodMean     = EMethodBegin,
00033     EMethodMax      = EMethodMean + 1,
00034     EMethodQuant        = EMethodMax + 1,
00035     EMethodEnd      = EMethodQuant + 1
00036 };
00037 
00038 static const char*  c_aszMethods[]  = {
00039   "mean", "max", "quant",NULL
00040 };
00041 
00042 
00043 float Percentile(vector<float>& vecVals, float quartile) {
00044   //size_t iOne, iTwo, iSize;
00045   //float d, dFrac;
00046   size_t iSize;
00047   float value, value2;
00048   
00049   iSize = vecVals.size();
00050   if(iSize == 0)
00051     return CMeta::GetNaN();
00052   
00053   if(iSize == 1)
00054     return vecVals[0];
00055   
00056   std::sort(vecVals.begin(), vecVals.end());
00057   
00058   float index = quartile * (iSize + 1);
00059   float remainder = index - (size_t)index;
00060   
00061   index = (size_t)index - 1;
00062   
00063   if(remainder == 0){
00064     return vecVals[(size_t)index];    
00065   }else{
00066     if(remainder > 0.5){
00067       value = vecVals[(size_t)index-1];
00068       value2 = vecVals[(size_t)index];
00069     }else{
00070       value = vecVals[(size_t)index];
00071       value2 = vecVals[(size_t)index+1];
00072     }    
00073     float interpolationValue = (value2 - value) * remainder;
00074     
00075     cerr << value << " " << value2 << " " << remainder  <<endl;    
00076     //float interpolationValue = (value - vecVals[((size_t)index-1)]) * remainder;
00077     //cerr << interpolationValue << endl;
00078     
00079     //return (vecVals[(size_t)index] + interpolationValue);
00080     return (value + interpolationValue);
00081   }
00082 }
00083 
00084 int main( int iArgs, char** aszArgs ) {
00085     gengetopt_args_info sArgs;
00086     int                 iRet;
00087     size_t              i, j, k, l;
00088     size_t    iRuns, inputNums;
00089     float d, dout;
00090     DIR* dp;
00091     struct dirent* ep;
00092     CDat                    DatOut, DatTrack, DatCur;
00093     vector<size_t>              veciGenesCur;   
00094     // store all input file names
00095     vector<string> input_files;
00096     vector<string>      vecstrInputs, vecstrDatasets; 
00097     EMethod     eMethod;
00098     ifstream    ifsm;
00099     vector<float>   vecWeights;
00100     map<string, size_t> mapstriDatasets;
00101     CGenome             Genome;
00102     CGenes              Genes( Genome );
00103     
00104     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00105         cmdline_parser_print_help( );
00106         return 1; }
00107     CMeta Meta( sArgs.verbosity_arg );
00108     
00109     for( eMethod = EMethodBegin; eMethod < EMethodEnd; eMethod = (EMethod)( eMethod + 1 ) )
00110       if( !strcmp( c_aszMethods[eMethod], sArgs.method_arg ) ){
00111         cerr << "combine method: " << c_aszMethods[eMethod] << endl;
00112         break;
00113       }
00114     
00115     // now collect the data files from directory if given
00116     if(sArgs.directory_given){
00117       dp = opendir (sArgs.directory_arg);
00118       i = 0;
00119       if (dp != NULL){
00120         while (ep = readdir (dp)){
00121           if(  strstr(ep->d_name, c_acDab) != NULL ||
00122            strstr(ep->d_name, c_acQDab) != NULL ||
00123            strstr(ep->d_name, c_acDat) != NULL
00124            ){
00125         if(strstr(ep->d_name, c_acSVM) != NULL)
00126           continue;
00127         if(strstr(ep->d_name, c_acOUT) != NULL)
00128           continue;
00129         vecstrInputs.push_back((string)sArgs.directory_arg + "/" + ep->d_name);     
00130         vecstrDatasets.push_back(vecstrInputs[i]);      
00131         mapstriDatasets[ ep->d_name ] = i;
00132         ++i;
00133           }
00134         }
00135         (void) closedir (dp);
00136         
00137         inputNums = vecstrDatasets.size();
00138         
00139         // sort by ASCI 
00140         //std::sort( vecstrInputs.begin(), vecstrInputs.end() );
00141         
00142         cerr << "Number of datasets: " << inputNums << '\n';
00143       }
00144       else{
00145         cerr << "Couldn't open the directory: " << sArgs.directory_arg << '\n';
00146         return 1;
00147       }
00148     }
00149     
00150     // read dataset weight file
00151     if( sArgs.weight_given ) {
00152       static const size_t   c_iBuffer   = 1024;
00153       char                  szBuffer[ c_iBuffer ];
00154       string                strFile;
00155       vector<string>            vecstrLine;
00156       map<string, size_t>::const_iterator   iterDataset;
00157       size_t  numDataset;
00158       vecWeights.resize(vecstrInputs.size( ) );
00159       
00160       // set default weight
00161       for(i=0; i < vecWeights.size(); i++){
00162         vecWeights[i] = 0;
00163       }
00164       
00165       ifsm.clear( );
00166       ifsm.open( sArgs.weight_arg );
00167       if( !ifsm.is_open( ) ) {
00168             cerr << "Could not open: " << sArgs.weight_arg << endl;
00169             return 1;
00170       }
00171       while( !ifsm.eof( ) ) {
00172             ifsm.getline( szBuffer, c_iBuffer - 1 );
00173             szBuffer[ c_iBuffer - 1 ] = 0;
00174             if( !szBuffer[ 0 ] )
00175           continue;
00176             vecstrLine.clear( );
00177             CMeta::Tokenize( szBuffer, vecstrLine );
00178             if( vecstrLine.size( ) != 2 ) {
00179           cerr << "Illegal weight line: " << szBuffer << endl;
00180           return 1;
00181             }
00182         
00183         if( ( iterDataset = mapstriDatasets.find( vecstrLine[ 0 ] ) ) == mapstriDatasets.end( ) )
00184           cerr << "Dataset in weights but not database: " << vecstrLine[ 0 ] << endl;
00185             else          
00186           vecWeights[ iterDataset->second ] = (float)atof( vecstrLine[ 1 ].c_str( ) );
00187       }
00188       ifsm.close( );
00189       
00190       // now only keep datasets with non-zero weighting
00191       vecstrDatasets.clear();
00192       numDataset = 0;
00193       for(i = 0; i < vecstrInputs.size(); ++i){
00194         if(vecWeights[i] == 0)
00195           continue;
00196         vecstrDatasets.push_back(vecstrInputs[i]);
00197         vecWeights[numDataset] = vecWeights[i];
00198         numDataset++;
00199       }
00200       vecstrDatasets.resize(numDataset);
00201       vecWeights.resize(numDataset);
00202       
00203       cerr << "Total number of datasets combining with non-zero weights: " << numDataset << endl;
00204     }
00205     
00208     if( eMethod == EMethodQuant ){
00209       CDatasetCompact Dataset;
00210       vector<float> vecVals;
00211       size_t val;
00212       
00213       if( !Dataset.Open( vecstrDatasets ) ) {
00214         cerr << "Couldn't open input datasets for quantile combine." << endl;
00215         return 1; }
00216       
00217       // initialized the output dab and pair value vector
00218       DatOut.Open(Dataset.GetGeneNames());
00219       vecVals.resize(Dataset.GetExperiments());
00220 
00221       // debug
00222       cerr << "num dataset: " << Dataset.GetExperiments() << endl;
00223       
00224       for( i = 0; i < DatOut.GetGenes(); ++i ){
00225         for( j = i+1; j < DatOut.GetGenes(); ++j ){
00226           // iterate over each dataset
00227           vecVals.clear();
00228           for( k = 0; k < Dataset.GetExperiments(); ++k ){
00229         // debug
00230         //cerr << "Orig val: " << val << endl;
00231         
00232         if( (val = Dataset.GetDiscrete( i, j, k)) == -1 )
00233           continue;     
00234         
00235         // debug
00236         //cerr << "Got val: " << val << endl;
00237         if( sArgs.weight_given ){
00238           val *= vecWeights[k];
00239         }
00240         
00241         vecVals.push_back(val);
00242           }
00243           
00244           if(vecVals.size() < 1)
00245         continue;
00246           
00247           // debug
00248           //cerr << "quant val: " << CStatistics::Percentile(vecVals.begin(), vecVals.end(), sArgs.quantile_arg) << endl;
00249           //cerr << "stats: " << vecVals.begin() << " " << vecVals.end() << " " << endl; //(vecVals.end()-vecVals.begin()) << endl;
00250           //cerr << "stats: " << (vecVals.end()-vecVals.begin()) << endl;        
00251           DatOut.Set(i, j, CStatistics::Percentile(vecVals.begin(), vecVals.end(), sArgs.quantile_arg));
00252           
00253           //DatOut.Set(i, j, Percentile(vecVals, sArgs.quantile_arg));        
00254           //for(size_t t = 0; t < vecVals.size(); t++ )
00255           //    cerr << vecVals[t] << ' ';
00256           //cerr << endl;
00257         }
00258       }
00259       
00260       DatOut.Save( sArgs.output_arg );
00261       return 0;
00262     }
00263     
00264     
00265     // now iterate dat/dab networks
00266     for( i = 0; i < vecstrDatasets.size( ); ++i ) {
00267       
00268       // open first network, we will just add edge weights to this CDat
00269       if( i == 0 ){
00270         if( !DatCur.Open( vecstrDatasets[ i ].c_str() ) ) {
00271           cerr << "Couldn't open: " << vecstrDatasets[ i ] << endl;
00272           return 1; }
00273         
00274         DatOut.Open( DatCur );              
00275         DatTrack.Open( DatCur );
00276         
00277         // this Dat is used to track various values (count, max)
00278         for( j = 0; j < DatTrack.GetGenes( ); ++j )
00279           for( k = ( j + 1 ); k < DatTrack.GetGenes( ); ++k ){
00280         if( CMeta::IsNaN( d = DatCur.Get( j, k)))
00281           continue;
00282         
00283         if( sArgs.weight_given ){
00284           d *= vecWeights[i];
00285           // store weighted value (numerator)
00286           DatOut.Set( j, k, d);
00287         }
00288         
00289         switch( eMethod ) {
00290         case EMethodMax:
00291           DatOut.Set( j, k, d);
00292           break;
00293         case EMethodMean:
00294           // keep track of denominator
00295           DatTrack.Set( j, k, 1.0);
00296         }
00297           }
00298         
00299         cerr << "opened: " << vecstrDatasets[ i ] << endl;
00300         continue;    
00301       }
00302       
00303       if( !DatCur.Open( vecstrDatasets[ i ].c_str() ) ) {
00304         cerr << "Couldn't open: " << vecstrDatasets[ i ] << endl;
00305         return 1; }
00306       cerr << "opened: " << vecstrDatasets[ i ] << endl;
00307       
00308       if( sArgs.map_flag ){
00309         // Get gene index match   
00310         cerr << "inside map flag" << endl;
00311         veciGenesCur.clear();
00312         veciGenesCur.resize(DatOut.GetGenes());
00313         for( l = 0; l < DatOut.GetGenes(); l++){
00314           veciGenesCur[ l ] = DatCur.GetGene( DatOut.GetGene(l) );
00315           if( veciGenesCur[ l ] == -1 ){
00316         cerr << "ERROR: missing gene: " << DatOut.GetGene(l) << ", " << vecstrDatasets[ i ] << endl;
00317         return 1;
00318           }
00319         }
00320       }
00321       
00322       // now add edges to Dat
00323       for( j = 0; j < DatOut.GetGenes( ); ++j )
00324         for( k = ( j + 1 ); k < DatOut.GetGenes( ); ++k ) {
00325           
00326           if( sArgs.map_flag ){
00327         if( CMeta::IsNaN( d = DatCur.Get( veciGenesCur[ j ], veciGenesCur[ k ] ) ) ){
00328           continue;
00329         }
00330           }
00331           else{
00332         if( CMeta::IsNaN( d = DatCur.Get(  j, k ) ) ){
00333           continue;
00334         }
00335           }
00336           
00337           if( sArgs.weight_given )
00338           d *= vecWeights[i];
00339           
00340           switch( eMethod ) {
00341           case EMethodMax:
00342         if( CMeta::IsNaN( (dout = DatOut.Get( j, k ))) || d > dout )
00343           DatOut.Set( j, k, d);
00344         break;
00345         
00346           case EMethodMean:
00347         DatTrack.Set( j, k, DatTrack.Get( j, k ) + 1 );
00348         
00349         if( CMeta::IsNaN( (dout = DatOut.Get( j, k ))) )
00350           DatOut.Set( j, k, d );
00351         else
00352           DatOut.Set( j, k, DatOut.Get( j, k ) + d );          
00353         break;
00354           }
00355         }
00356     }
00357     
00358     switch( eMethod ) {
00359     case EMethodMean:
00360       // now convert sum to mean
00361       for( j = 0; j < DatOut.GetGenes( ); ++j )
00362         for( k = ( j + 1 ); k < DatOut.GetGenes( ); ++k )
00363           DatOut.Set( j, k, DatOut.Get( j, k ) / DatTrack.Get( j, k ) );
00364     }
00365 
00366     // Filter dat
00367     if( sArgs.genes_given ) {
00368       ifsm.clear( );
00369       ifsm.open( sArgs.genes_arg );
00370       if( !Genes.Open( ifsm ) ) {
00371         cerr << "Could not open: " << sArgs.genes_arg << endl;
00372         return 1; }
00373       ifsm.close( ); 
00374       
00375       DatOut.FilterGenes( Genes, CDat::EFilterInclude );
00376     }
00377     
00378     if( sArgs.genee_given ){
00379       cerr << "Filter only edges with genes in: " << sArgs.genee_arg << endl;
00380       if( ! DatOut.FilterGenes( sArgs.genee_arg, CDat::EFilterEdge ) ){
00381         cerr << "ERROR can't open file: " << sArgs.genee_arg << endl;
00382         return 1;
00383       }
00384     }
00385     
00386     DatOut.Save( sArgs.output_arg );
00387     return iRet; 
00388 }