Sleipnir
tools/Edges2Posteriors/Edges2Posteriors.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 #include <vector>
00025 
00026 static const char   c_acDab[]   = ".dab";
00027 static const char   c_acQdab[]  = ".qdab";
00028 static const char   c_acXdsl[]  = ".xdsl";
00029 
00030 int main( int iArgs, char** aszArgs ) {
00031     gengetopt_args_info sArgs;
00032     ifstream            ifsm;
00033     CBayesNetSmile      BNSmile;
00034     CDat                Dat, DatLookup;
00035     istream*            pistm;
00036     vector<size_t>      veciGenes;
00037     CPCL                PCLLookup;
00038     vector<string>      vecstrNodes, vecstrGenes, vecstrDummy;
00039     size_t              i, j, k, l, iOne, iTwo, iGene, nStart, nEnd, num_to_skip;
00040     float               dPrior;
00041     CDataMatrix         MatCPT;
00042     unsigned char       b;
00043     
00044     vector<CBayesNetSmile>  BNSmileList;
00045     vector<float>       dPriorList; 
00046     float    SumPosterior;
00047     DIR* dp;
00048     struct dirent* ep;
00049     
00050     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00051         cmdline_parser_print_help( );
00052         return 1; }
00053     CMeta Meta( sArgs.verbosity_arg );
00054     
00055     // single network xdsl file
00056     if( sArgs.network_given ){
00057       if( !BNSmile.Open( sArgs.network_arg ) ) {
00058         cerr << "Could not open: " << sArgs.network_arg << endl;
00059         return 1; }
00060       
00061       BNSmileList.push_back( BNSmile );
00062     }
00063     // directory directory network xdsl files: used for context average Edge2Post
00064     else if( sArgs.netdir_given ){    
00065       i = 0;
00066       dp = opendir (sArgs.netdir_arg);
00067       if (dp != NULL){
00068         while (ep = readdir (dp)){
00069           if(  strstr(ep->d_name, c_acXdsl) != NULL ){      
00070         BNSmileList.push_back( CBayesNetSmile() );
00071         if( !BNSmileList[i].Open(((string)sArgs.netdir_arg + "/" + ep->d_name).c_str() ) ){
00072           cerr << "Could not open: " << (string)sArgs.netdir_arg + "/" + ep->d_name << endl;
00073           return 1; }
00074         i++;
00075           }
00076         }
00077         (void) closedir (dp);
00078       }
00079       
00080       if ( BNSmileList.size() == 0 ){
00081         cerr << "No xdsl files in given directory:" << sArgs.netdir_arg << endl;
00082         return 1;
00083       }
00084     }
00085     else{
00086       cerr << "Need Network file or directory" << endl;
00087       return 1;
00088     }
00089     
00090     if( !Dat.Open( sArgs.input_arg, !!sArgs.memmap_flag ) ) {
00091         cerr << "Could not open: " << sArgs.input_arg << endl;
00092         return 1; }
00093 
00094         if( sArgs.cutoff_given ) {
00095         DatLookup.Open(Dat.GetGeneNames());
00096                 for( i = 0; i < Dat.GetGenes( ); ++i )
00097                         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00098                                 if( Dat.Get( i, j ) < sArgs.cutoff_arg )
00099                                         Dat.Set( i, j, CMeta::GetNaN( ) );
00100                 else
00101                     DatLookup.Set( i, j, 1);
00102     }
00103     else if( sArgs.lookup_arg ) {
00104         ifsm.open( sArgs.lookup_arg );
00105         pistm = &ifsm; }
00106     else
00107         pistm = &cin;
00108 
00109     if( sArgs.lookup_given && !DatLookup.Open( *pistm, CDat::EFormatText, 1 ) ) {
00110         cerr << "Could not open: " << ( sArgs.lookup_arg ? sArgs.lookup_arg : "stdin" ) << endl;
00111         return 1; }
00112     ifsm.close( );
00113     
00114     for( i = 0; i < DatLookup.GetGenes( ); ++i )
00115         for( j = ( i + 1 ); j < DatLookup.GetGenes( ); ++j )
00116             if( DatLookup.Get( i, j ) == 1 )
00117                 vecstrGenes.push_back( DatLookup.GetGene( i ) + " - " + DatLookup.GetGene( j ) );
00118     cerr << vecstrGenes.size() << endl;
00119     
00120     // Assumes all network xdsl has same datasets!!
00121     BNSmileList[ 0 ].GetNodes( vecstrNodes );
00122     vecstrNodes[ 0 ] = sArgs.input_arg;
00123     
00125     nStart = 0;
00126     nEnd = 0;
00127     num_to_skip = 0;
00128     if( sArgs.start_arg > -1){
00129       nStart = sArgs.start_arg + 1;
00130       num_to_skip = sArgs.start_arg;
00131 
00132       if( nStart == 2 )
00133         vecstrNodes.erase( vecstrNodes.begin() + 1 );
00134       else if( nStart <= vecstrNodes.size() )
00135         vecstrNodes.erase( vecstrNodes.begin() + 1, vecstrNodes.begin() + nStart );
00136     }   
00137     if( sArgs.end_arg > -1 ){
00138       nEnd = (sArgs.end_arg+1) - nStart + 1;
00139       
00140       if( (nEnd+1) < vecstrNodes.size() )
00141         vecstrNodes.erase( (vecstrNodes.begin() + nEnd + 1), vecstrNodes.end() );
00142     }
00143     
00144     PCLLookup.Open( vecstrGenes, vecstrNodes, vecstrDummy );
00145     
00146     veciGenes.resize( DatLookup.GetGenes( ) );
00147     for( i = 0; i < veciGenes.size( ); ++i )
00148         veciGenes[ i ] = Dat.GetGene( DatLookup.GetGene( i ) );
00149     for( iGene = i = 0; i < DatLookup.GetGenes( ); ++i ) {
00150         iOne = veciGenes[ i ];
00151         for( j = ( i + 1 ); j < DatLookup.GetGenes( ); ++j ) {
00152             iTwo = veciGenes[ j ];
00153             if( DatLookup.Get( i, j ) != 1 )
00154                 continue;
00155             if( ( iOne != -1 ) && ( iTwo != -1 ) )
00156                 PCLLookup.Set( iGene, 0, Dat.Get( iOne, iTwo ) );
00157             iGene++; } }
00158 
00159     // store all priors from each network       
00160     dPriorList.resize( BNSmileList.size( ) );
00161     for( i = 0; i < BNSmileList.size( ); ++i ) {
00162       BNSmileList[i].GetCPT( 0, MatCPT );
00163       dPriorList[i] = 1 - MatCPT.Get( 0, 0 );     
00164     }
00165     
00166     for( i = 1; i < vecstrNodes.size( ); ++i ) {
00167         CDataPair   DatCur;
00168         string      strIn;
00169         size_t      iValue;
00170 
00171         cerr << vecstrNodes[ i ] << endl;
00172         strIn = (string)sArgs.directory_arg + "/" + vecstrNodes[ i ] + c_acDab;
00173         if( !DatCur.Open( strIn.c_str( ), false, !!sArgs.memmap_flag ) ) {
00174             strIn = (string)sArgs.directory_arg + "/" + vecstrNodes[ i ] + c_acQdab;
00175             if( !DatCur.Open( strIn.c_str( ), false, !!sArgs.memmap_flag ) ) {
00176                 cerr << "Could not open: " << strIn << endl;
00177                 return 1; }}
00178 
00179         for( j = 0; j < veciGenes.size( ); ++j )
00180             veciGenes[ j ] = DatCur.GetGene( DatLookup.GetGene( j ) );
00181 
00182         // Cache each context's bin posterior prob
00183         // num bins * num contexts
00184         // DatCur.GetValues( ) * BNSmileList.size( );
00185         float CacheBinPost[BNSmileList.size( )][DatCur.GetValues( )];
00186         for(l = 0; l < BNSmileList.size( ); ++l){
00187           for( j = 0; j < DatCur.GetValues( ); ++j ){
00188             CacheBinPost[l][j] = (1 - BNSmileList[l].Evaluate( i + num_to_skip, (unsigned char)j ) - dPriorList[l]);
00189           }
00190         }
00191         
00192         for( iGene = j = 0; j < DatLookup.GetGenes( ); ++j ) {
00193             iOne = veciGenes[ j ];
00194             for( k = ( j + 1 ); k < DatLookup.GetGenes( ); ++k ) {
00195                 if( DatLookup.Get( j, k ) != 1 )
00196                     continue;
00197                 iTwo = veciGenes[ k ];
00198                 iValue = -1;
00199 
00200                 // Use zeros flag if edge is missing and both genes exist in dataset
00201                 iValue = DatCur.Quantize( iOne, iTwo, BNSmileList[0].GetDefault( i + num_to_skip ) );
00202                 
00203                 if( iValue != -1 ){
00204                   // iterate through network xdsl files and average the posteriors
00205                   SumPosterior = 0;
00206                   for( l = 0; l < BNSmileList.size( ); ++l ) {
00207                     SumPosterior += CacheBinPost[l][iValue];
00208                   }
00209                   PCLLookup.Set( iGene, i, SumPosterior / BNSmileList.size() );
00210                 }
00211                 
00212                 iGene++; } } }
00213     PCLLookup.Save( cout );
00214 
00215     return 0; }