Sleipnir
tools/Contexter/Contexter.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_szXDSL[]  = ".xdsl";
00026 static const char   c_szDSL[]   = ".dsl";
00027 
00028 int main_database( const gengetopt_args_info& );
00029 int main_datfile( const gengetopt_args_info& );
00030 size_t count_contexts( const char* );
00031 bool open_contexts( const char*, size_t, size_t, CCompactFullMatrix& );
00032 
00033 int main( int iArgs, char** aszArgs ) {
00034     gengetopt_args_info sArgs;
00035     int                 iRet;
00036 
00037     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00038         cmdline_parser_print_help( );
00039         return 1; }
00040     CMeta Meta( sArgs.verbosity_arg );
00041 
00042     iRet = sArgs.dat_flag ? main_datfile( sArgs ) : main_database( sArgs );
00043 
00044     return iRet; }
00045 
00046 size_t count_contexts( const char* szContexts ) {
00047     static const size_t c_iBuffer   = 1024;
00048     char                acBuffer[ c_iBuffer ];
00049     ifstream            ifsm;
00050     vector<string>      vecstrLine;
00051     size_t              iRet, iCur;
00052 
00053     iRet = -1;
00054     ifsm.open( szContexts );
00055     if( !ifsm.is_open( ) ) {
00056         cerr << "Could not open: " << szContexts << endl;
00057         return -1; }
00058     while( !ifsm.eof( ) ) {
00059         ifsm.getline( acBuffer, c_iBuffer - 1 );
00060         acBuffer[ c_iBuffer - 1 ] = 0;
00061         if( !acBuffer[ 0 ] )
00062             continue;
00063         vecstrLine.clear( );
00064         CMeta::Tokenize( acBuffer, vecstrLine );
00065         if( vecstrLine.empty( ) ) {
00066             cerr << "Invalid line in " << szContexts << ": " << acBuffer << endl;
00067             return -1; }
00068         iCur = atoi( vecstrLine[ 0 ].c_str( ) );
00069         if( ( iRet == -1 ) || ( iCur > iRet ) )
00070             iRet = iCur; }
00071     ifsm.close( );
00072 
00073     return iRet; }
00074 
00075 bool open_contexts( const char* szContexts, size_t iContexts, size_t iGenes,
00076     CCompactFullMatrix& MatContexts ) {
00077     static const size_t c_iBuffer   = 1024;
00078     char                acBuffer[ c_iBuffer ];
00079     ifstream            ifsm;
00080     vector<string>      vecstrLine;
00081 
00082     if( ( iContexts == -1 ) && ( ( iContexts = count_contexts( szContexts ) ) == -1 ) )
00083         return false;
00084 
00085     ifsm.open( szContexts );
00086     if( !ifsm.is_open( ) ) {
00087         cerr << "Could not open: " << szContexts << endl;
00088         return false; }
00089     MatContexts.Initialize( iContexts, iGenes, 2, true );
00090     while( !ifsm.eof( ) ) {
00091         size_t  iContext, iGene;
00092 
00093         ifsm.getline( acBuffer, c_iBuffer - 1 );
00094         acBuffer[ c_iBuffer - 1 ] = 0;
00095         if( !acBuffer[ 0 ] )
00096             continue;
00097         vecstrLine.clear( );
00098         CMeta::Tokenize( acBuffer, vecstrLine );
00099         if( vecstrLine.size( ) != 2 ) {
00100             cerr << "Invalid line in " << szContexts << ": " << acBuffer << endl;
00101             return false; }
00102         iContext = atoi( vecstrLine[ 0 ].c_str( ) ) - 1;
00103         iGene = atoi( vecstrLine[ 1 ].c_str( ) ) - 1;
00104         if( iContext >= MatContexts.GetRows( ) ) {
00105             cerr << "Invalid context on line: " << acBuffer << endl;
00106             return false; }
00107         if( iGene >= MatContexts.GetColumns( ) ) {
00108             cerr << "Invalid gene on line: " << acBuffer << endl;
00109             return false; }
00110         MatContexts.Set( iContext, iGene, 1 ); }
00111     ifsm.close( );
00112 
00113     return true; }
00114 
00115 int main_database( const gengetopt_args_info& sArgs ) {
00116     static const size_t                 c_iBuffer   = 1024;
00117     char                                acBuffer[ c_iBuffer ];
00118     CBayesNetSmile                      BNSmile;
00119     CBayesNetMinimal                    BNDefault;
00120     vector<CBayesNetMinimal>            vecBNs;
00121     ifstream                            ifsm;
00122     istream*                            pistm;
00123     map<size_t, string>                 mapistrBNs;
00124     map<size_t, string>::const_iterator iterBN;
00125     size_t                              i, iMax, iGene;
00126     //CDatabase                         Database;
00127     uint32_t                            iSize;
00128     float*                              adGenes;
00129     CDat                                Dat;
00130     CCompactFullMatrix                  MatContexts;
00131     vector<string>                      vecstrLine;
00132 
00133     bool isNibble = true;
00134     if(sArgs.is_nibble_arg==0){
00135         isNibble = false;
00136     }
00137     CDatabase Database(isNibble);
00138 
00139     if( !Database.Open( sArgs.database_arg ) ) {
00140         cerr << "Could not open: " << sArgs.database_arg << endl;
00141         return 1; }
00142     if( sArgs.minimal_in_flag ) {
00143         ifsm.open( sArgs.networks_arg, ios_base::binary );
00144         if( !BNDefault.Open( ifsm ) ) {
00145             cerr << "Could not read: " << sArgs.networks_arg << endl;
00146             return 1; }
00147         ifsm.read( (char*)&iSize, sizeof(iSize) );
00148         vecBNs.resize( iSize );
00149         for( i = 0; i < vecBNs.size( ); ++i )
00150             if( !vecBNs[ i ].Open( ifsm ) ) {
00151                 cerr << "Could not read: " << sArgs.networks_arg << " (" << i << ")" << endl;
00152                 return 1; }
00153         ifsm.close( ); }
00154     else {
00155         if( sArgs.default_arg && !( BNSmile.Open( sArgs.default_arg ) && BNDefault.Open( BNSmile ) ) ) {
00156             cerr << "Could not open: " << sArgs.default_arg << endl;
00157             return 1; }
00158         BNDefault.SetID( sArgs.default_arg );
00159         if( sArgs.input_arg ) {
00160             ifsm.open( sArgs.input_arg );
00161             pistm = &ifsm; }
00162         else
00163             pistm = &cin;
00164         iMax = 0;
00165         while( !pistm->eof( ) ) {
00166             pistm->getline( acBuffer, c_iBuffer - 1 );
00167             acBuffer[ c_iBuffer - 1 ] = 0;
00168             if( !acBuffer[ 0 ] )
00169                 continue;
00170             vecstrLine.clear( );
00171             CMeta::Tokenize( acBuffer, vecstrLine );
00172             if( vecstrLine.size( ) < 2 ) {
00173                 cerr << "Ignoring line: " << acBuffer << endl;
00174                 continue; }
00175             if( ( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) > iMax )
00176                 iMax = i;
00177             mapistrBNs[ i ] = vecstrLine[ 1 ]; }
00178         if( sArgs.input_arg )
00179             ifsm.close( );
00180         vecBNs.resize( iMax );
00181         for( iterBN = mapistrBNs.begin( ); iterBN != mapistrBNs.end( ); ++iterBN ) {
00182             if( !( BNSmile.Open( ( (string)sArgs.networks_arg + '/' + CMeta::Filename( iterBN->second ) +
00183                 ( sArgs.xdsl_flag ? c_szXDSL : c_szDSL ) ).c_str( ) ) &&
00184                 vecBNs[ iterBN->first - 1 ].Open( BNSmile ) ) ) {
00185                 cerr << "Could not open: " << iterBN->second << endl;
00186                 return 1; }
00187             vecBNs[ iterBN->first - 1 ].SetID( iterBN->second ); } }
00188 
00189     if( sArgs.minimal_out_arg ) {
00190         ofstream    ofsm;
00191 
00192         ofsm.open( sArgs.minimal_out_arg, ios_base::binary );
00193         BNDefault.Save( ofsm );
00194         iSize = (uint32_t)vecBNs.size( );
00195         ofsm.write( (const char*)&iSize, sizeof(iSize) );
00196         for( i = 0; i < vecBNs.size( ); ++i )
00197             vecBNs[ i ].Save( ofsm ); }
00198 
00199     if( !open_contexts( sArgs.contexts_arg, vecBNs.size( ), Database.GetGenes( ), MatContexts ) )
00200         return 1;
00201 
00202     vecstrLine.resize( Database.GetGenes( ) );
00203     for( i = 0; i < vecstrLine.size( ); ++i )
00204         vecstrLine[ i ] = Database.GetGene( i );
00205     Dat.Open( vecstrLine );
00206     adGenes = new float[ Database.GetGenes( ) ];
00207     for( iGene = 0; ( iGene + 1 ) < Database.GetGenes( ); ++iGene ) {
00208         vector<unsigned char>   vecbData;
00209 
00210         if( !Database.Get( iGene, vecbData ) ) {
00211             cerr << "Could not retrieve gene: " << iGene << " (" << Database.GetGene( iGene ) << ")" << endl;
00212             return 1; }
00213         if( !vecBNs[ sArgs.context_arg - 1 ].Evaluate( vecbData, adGenes, Database.GetGenes( ), iGene + 1 ) ) {
00214             cerr << "Inference failed on gene: " << iGene << " (" << Database.GetGene( iGene ) << ")" << endl;
00215             return 1; }
00216 
00217         for( i = ( iGene + 1 ); i < Database.GetGenes( ); ++i )
00218             Dat.Set( iGene, i, adGenes[ i ] ); }
00219     delete[] adGenes;
00220 
00221     return 0; }
00222 
00223 int main_datfile( const gengetopt_args_info& sArgs ) {
00224     static const size_t                 c_iBuffer   = 1024;
00225     char                                acBuffer[ c_iBuffer ];
00226     CDat                                Dat;
00227     size_t                              i, j, iGene, iIn, iOut;
00228     float                               d, dIn, dOut;
00229     CCompactFullMatrix                  MatContexts;
00230     vector<size_t>                      veciGenes;
00231     map<string, size_t>                 mapstriGenes;
00232     map<string, size_t>::const_iterator iterGene;
00233     ifstream                            ifsm;
00234     vector<string>                      vecstrLine, vecstrGenes;
00235 
00236     if( sArgs.input_arg ) {
00237         if( !Dat.Open( sArgs.input_arg, !!sArgs.memmap_flag ) ) {
00238             cerr << "Could not open: " << sArgs.input_arg << endl;
00239             return 1; } }
00240     else {
00241         if( !Dat.Open( cin, CDat::EFormatText ) ) {
00242             cerr << "Could not open DAT" << endl;
00243             return 1; } }
00244 
00245     ifsm.open( sArgs.genes_arg );
00246     if( !ifsm.is_open( ) ) {
00247         cerr << "Could not open: " << sArgs.genes_arg << endl;
00248         return -1; }
00249     while( !ifsm.eof( ) ) {
00250         ifsm.getline( acBuffer, c_iBuffer - 1 );
00251         acBuffer[ c_iBuffer - 1 ] = 0;
00252         if( !acBuffer[ 0 ] )
00253             continue;
00254         vecstrLine.clear( );
00255         CMeta::Tokenize( acBuffer, vecstrLine );
00256         if( vecstrLine.size( ) != 2 ) {
00257             cerr << "Invalid line in " << sArgs.genes_arg << ": " << acBuffer << endl;
00258             return -1; }
00259         vecstrGenes.push_back( vecstrLine[ 1 ] );
00260         mapstriGenes[ vecstrLine[ 1 ] ] = atoi( vecstrLine[ 0 ].c_str( ) ); }
00261     ifsm.close( );
00262 
00263     if( !open_contexts( sArgs.contexts_arg, -1, Dat.GetGenes( ), MatContexts ) )
00264         return false;
00265 
00266     if( sArgs.lookup_arg ) {
00267         veciGenes.resize( Dat.GetGenes( ) );
00268         for( i = 0; i < Dat.GetGenes( ); ++i )
00269             veciGenes[ i ] = ( ( iterGene = mapstriGenes.find( Dat.GetGene( i ) ) ) == mapstriGenes.end( ) ) ?
00270                 -1 : ( iterGene->second - 1 );
00271         if( ( iGene = Dat.GetGene( sArgs.lookup_arg ) ) == -1 ) {
00272             cerr << "Unknown gene: " << sArgs.lookup_arg << endl;
00273             return 1; }
00274         dIn = dOut = 0;
00275         for( iIn = iOut = i = 0; i < Dat.GetGenes( ); ++i ) {
00276             if( ( i == iGene ) || CMeta::IsNaN( d = Dat.Get( iGene, i ) ) || ( veciGenes[ i ] == -1 ) )
00277                 continue;
00278             if( MatContexts.Get( sArgs.context_arg - 1, veciGenes[ i ] ) ) {
00279                 iIn++;
00280                 dIn += d; }
00281             iOut++;
00282             dOut += d; }
00283         cout << Dat.GetGene( iGene ) << '\t' << ( dIn / iIn ) << '\t' << ( dOut / iOut ) << endl; }
00284     else {
00285         veciGenes.resize( vecstrGenes.size( ) );
00286         for( i = 0; i < vecstrGenes.size( ); ++i )
00287             veciGenes[ i ] = Dat.GetGene( vecstrGenes[ i ] );
00288         for( i = 0; i < vecstrGenes.size( ); ++i ) {
00289             if( !( i % 100 ) )
00290                 cerr << "Gene " << i << '/' << vecstrGenes.size( ) << endl;
00291             if( ( iGene = veciGenes[ i ] ) == -1 ) {
00292                 cout << vecstrGenes[ i ] << '\t' << 0 << '\t' << 0 << endl;
00293                 continue; }
00294             dIn = dOut = 0;
00295             for( iIn = iOut = j = 0; j < vecstrGenes.size( ); ++j ) {
00296                 if( ( i == j ) || ( veciGenes[ j ] == -1 ) ||
00297                     CMeta::IsNaN( d = Dat.Get( iGene, veciGenes[ j ] ) ) )
00298                     continue;
00299                 if( MatContexts.Get( sArgs.context_arg - 1, j ) ) {
00300                     iIn++;
00301                     dIn += d; }
00302                 iOut++;
00303                 dOut += d; }
00304             cout << vecstrGenes[ i ] << '\t' << ( dIn / iIn ) << '\t' << ( dOut / iOut ) << endl; } }
00305 
00306     return 0; }