Sleipnir
tools/BNUnraveler/BNUnraveler.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_acDab[]   = ".dab";
00026 
00027 struct SEvaluate {
00028     CBayesNetSmile*         m_pBN;
00029     const CDataPair*        m_pDat;
00030     const CGenes*           m_pGenes;
00031     const CGenes*           m_pGenesIn;
00032     CDat*                   m_pYes;
00033     CDat*                   m_pNo;
00034     const CDataPair*        m_pAnswers;
00035     size_t                  m_iZero;
00036     size_t                  m_iNode;
00037     const vector<size_t>*   m_pveciGenes;
00038     bool                    m_fEverything;
00039     const char*             m_szName;
00040 };
00041 
00042 void* initialize( void* );
00043 void* evaluate( void* );
00044 void* finalize( void* );
00045 
00046 int main( int iArgs, char** aszArgs ) {
00047     gengetopt_args_info     sArgs;
00048     map<string,size_t>      mapZeros;
00049     CDataPair               Answers;
00050     vector<CBayesNetSmile*> vecpBNs;
00051     vector<size_t>          veciZeros, veciGenes;
00052     CGenome                 Genome;
00053     CGenes                  GenesIn( Genome );
00054     size_t                  i, iNode, iThread, iTerm;
00055     vector<CGenes*>         vecpGenes;
00056     CDatasetCompact         Dataset;
00057     vector<pthread_t>       vecpthdThreads;
00058     vector<string>          vecstrNodes, vecstrTmps;
00059     string                  strFile;
00060     vector<CDat*>           vecpYes, vecpNo;
00061     vector<SEvaluate>       vecsData;
00062 
00063 #ifdef WIN32
00064     pthread_win32_process_attach_np( );
00065 #endif // WIN32
00066 
00067     if( cmdline_parser( iArgs, aszArgs, &sArgs ) || !sArgs.inputs_num ) {
00068         cmdline_parser_print_help( );
00069         return 1; }
00070     CMeta Meta( sArgs.verbosity_arg );
00071 
00072     vecpBNs.resize( sArgs.inputs_num );
00073     for( i = 0; i < vecpBNs.size( ); ++i ) {
00074         vecpBNs[ i ] = new CBayesNetSmile( !!sArgs.group_flag );
00075         if( !vecpBNs[ i ]->Open( ( strFile = (string)sArgs.input_arg + '/' + CMeta::Deextension(
00076             CMeta::Basename( sArgs.inputs[ i ] ) ) + ( sArgs.xdsl_flag ? ".xdsl" : ".dsl" ) ).c_str( ) ) ) {
00077             cerr << "Couldn't open: " << strFile << endl;
00078             return 1; } }
00079 
00080     if( sArgs.zeros_arg ) {
00081         ifstream        ifsm;
00082         vector<string>  vecstrZeros;
00083         char            acLine[ 1024 ];
00084 
00085         ifsm.open( sArgs.zeros_arg );
00086         if( !ifsm.is_open( ) ) {
00087             cerr << "Couldn't open: " << sArgs.zeros_arg << endl;
00088             return 1; }
00089         while( !ifsm.eof( ) ) {
00090             ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 );
00091             acLine[ ARRAYSIZE(acLine) - 1 ] = 0;
00092             vecstrZeros.clear( );
00093             CMeta::Tokenize( acLine, vecstrZeros );
00094             if( vecstrZeros.empty( ) )
00095                 continue;
00096             mapZeros[ vecstrZeros[ 0 ] ] = atoi( vecstrZeros[ 1 ].c_str( ) ); } }
00097     vecpBNs[ 0 ]->GetNodes( vecstrNodes );
00098     veciZeros.resize( vecstrNodes.size( ) );
00099     for( i = 0; i < veciZeros.size( ); ++i ) {
00100         map<string,size_t>::const_iterator  iterZero;
00101 
00102         veciZeros[ i ] = ( ( iterZero = mapZeros.find( vecstrNodes[ i ] ) ) == mapZeros.end( ) ) ? -1 :
00103             iterZero->second; }
00104 
00105     if( sArgs.answers_arg && !Answers.Open( sArgs.answers_arg, false, !!sArgs.memmap_flag ) ) {
00106         cerr << "Couldn't open: " << sArgs.answers_arg << endl;
00107         return 1; }
00108     if( sArgs.genome_arg ) {
00109         ifstream    ifsm;
00110         CGenes      Genes( Genome );
00111 
00112         ifsm.open( sArgs.genome_arg );
00113         if( !Genes.Open( ifsm ) ) {
00114             cerr << "Couldn't open: " << sArgs.genome_arg << endl;
00115             return 1; } }
00116     else if( sArgs.answers_arg )
00117         for( i = 0; i < Answers.GetGenes( ); ++i )
00118             Genome.AddGene( Answers.GetGene( i ) );
00119     else {
00120         vector<string>  vecstrFiles;
00121 
00122         vecstrFiles.resize( vecstrNodes.size( ) - 1 );
00123         for( i = 0; i < vecstrFiles.size( ); ++i )
00124             vecstrFiles[ i ] = (string)sArgs.directory_arg + '/' + vecstrNodes[ i + 1 ] + c_acDab;
00125         if( !Dataset.OpenGenes( vecstrFiles ) ) {
00126             cerr << "Couldn't open: " << sArgs.directory_arg << endl;
00127             return 1; }
00128         for( i = 0; i < Dataset.GetGenes( ); ++i )
00129             Genome.AddGene( Dataset.GetGene( i ) ); }
00130     if( sArgs.genes_arg ) {
00131         ifstream    ifsm;
00132 
00133         ifsm.open( sArgs.genes_arg );
00134         if( !GenesIn.Open( ifsm, !sArgs.genome_arg ) ) {
00135             cerr << "Couldn't open: " << sArgs.genes_arg << endl;
00136             return 1; } }
00137 
00138     vecpGenes.resize( sArgs.inputs_num );
00139     vecpYes.resize( vecpGenes.size( ) );
00140     vecpNo.resize( vecpGenes.size( ) );
00141     vecstrTmps.resize( vecpNo.size( ) );
00142     for( i = 0; i < vecpGenes.size( ); ++i ) {
00143         ifstream    ifsm;
00144         char        acTemp[ L_tmpnam + 1 ];
00145 
00146         vecpGenes[ i ]  = new CGenes( Genome );
00147         ifsm.open( sArgs.inputs[ i ] );
00148         if( !vecpGenes[ i ]->Open( ifsm, false ) ) {
00149             cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
00150             return 1; }
00151         vecpYes[ i ] = new CDat( );
00152         vecpYes[ i ]->Open( Genome.GetGeneNames( ), false, ( (string)sArgs.output_arg + '/' +
00153             CMeta::Basename( sArgs.inputs[ i ] ) + c_acDab ).c_str( ) );
00154         vecpNo[ i ] = new CDat( );
00155 #pragma warning( disable : 4996 )
00156         vecstrTmps[ i ] = tmpnam( acTemp );
00157 #pragma warning( default : 4996 )
00158         vecpNo[ i ]->Open( Genome.GetGeneNames( ), false, vecstrTmps[ i ].c_str( ) ); }
00159 
00160     vecsData.resize( vecpGenes.size( ) );
00161     vecpthdThreads.resize( vecsData.size( ) );
00162     for( iTerm = 0; iTerm < vecpGenes.size( ); iTerm += iThread ) {
00163         for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) &&
00164             ( ( iTerm + iThread ) < vecpGenes.size( ) ); ++iThread ) {
00165             i = iTerm + iThread;
00166             vecsData[ i ].m_pBN = vecpBNs[ i ];
00167             vecsData[ i ].m_pGenes = vecpGenes[ i ];
00168             vecsData[ i ].m_pYes = vecpYes[ i ];
00169             vecsData[ i ].m_pNo = vecpNo[ i ];
00170             vecsData[ i ].m_szName = sArgs.inputs[ i ];
00171             if( pthread_create( &vecpthdThreads[ i ], NULL, initialize, &vecsData[ i ] ) ) {
00172                 cerr << "Couldn't create initialization thread: " << sArgs.inputs[ i ] << endl;
00173                 return 1; } }
00174         for( i = 0; i < iThread; ++i )
00175             pthread_join( vecpthdThreads[ iTerm + i ], NULL ); }
00176 
00177     veciGenes.resize( vecpYes[ 0 ]->GetGenes( ) );
00178     for( iNode = 1; iNode < vecstrNodes.size( ); ++iNode ) {
00179         CDataPair   Dat;
00180 
00181         if( !Dat.Open( ( strFile = ( (string)sArgs.directory_arg + '/' + vecstrNodes[ iNode ] +
00182             c_acDab ) ).c_str( ), false, !!sArgs.memmap_flag ) ) {
00183             cerr << "Couldn't open: " << strFile << endl;
00184             return 1; }
00185         for( i = 0; i < veciGenes.size( ); ++i )
00186             veciGenes[ i ] = Dat.GetGene( vecpYes[ 0 ]->GetGene( i ) );
00187         for( iTerm = 0; iTerm < vecpGenes.size( ); iTerm += iThread ) {
00188             for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) &&
00189                 ( ( iTerm + iThread ) < vecpGenes.size( ) ); ++iThread ) {
00190                 i = iTerm + iThread;
00191                 vecsData[ i ].m_pBN = vecpBNs[ i ];
00192                 vecsData[ i ].m_pDat = &Dat;
00193                 vecsData[ i ].m_pGenes = vecpGenes[ i ];
00194                 vecsData[ i ].m_pGenesIn = GenesIn.GetGenes( ) ? &GenesIn : NULL;
00195                 vecsData[ i ].m_pYes = vecpYes[ i ];
00196                 vecsData[ i ].m_pNo = vecpNo[ i ];
00197                 vecsData[ i ].m_pAnswers = Answers.GetGenes( ) ? &Answers : NULL;
00198                 vecsData[ i ].m_iZero = sArgs.zero_flag ? 0 : veciZeros[ iNode ];
00199                 vecsData[ i ].m_iNode = iNode;
00200                 vecsData[ i ].m_pveciGenes = &veciGenes;
00201                 vecsData[ i ].m_fEverything = !!sArgs.everything_flag;
00202                 vecsData[ i ].m_szName = sArgs.inputs[ i ];
00203                 if( pthread_create( &vecpthdThreads[ i ], NULL, evaluate, &vecsData[ i ] ) ) {
00204                     cerr << "Couldn't create evaluation thread: " << sArgs.inputs[ i ] << endl;
00205                     return 1; } }
00206             for( i = 0; i < iThread; ++i )
00207                 pthread_join( vecpthdThreads[ iTerm + i ], NULL ); } }
00208 
00209     for( iTerm = 0; iTerm < vecpGenes.size( ); iTerm += iThread ) {
00210         for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) &&
00211             ( ( iTerm + iThread ) < vecpGenes.size( ) ); ++iThread ) {
00212             i = iTerm + iThread;
00213             vecsData[ i ].m_pBN = vecpBNs[ i ];
00214             vecsData[ i ].m_pYes = vecpYes[ i ];
00215             vecsData[ i ].m_pNo = vecpNo[ i ];
00216             vecsData[ i ].m_szName = sArgs.inputs[ i ];
00217             if( pthread_create( &vecpthdThreads[ i ], NULL, finalize, &vecsData[ i ] ) ) {
00218                 cerr << "Couldn't create finalization thread: " << sArgs.inputs[ i ] << endl;
00219                 return 1; } }
00220         for( i = 0; i < iThread; ++i )
00221             pthread_join( vecpthdThreads[ iTerm + i ], NULL ); }
00222 
00223     for( i = 0; i < vecstrTmps.size( ); ++i )
00224         _unlink( vecstrTmps[ i ].c_str( ) );
00225     for( i = 0; i < vecpGenes.size( ); ++i ) {
00226         delete vecpBNs[ i ];
00227         delete vecpYes[ i ];
00228         delete vecpNo[ i ];
00229         delete vecpGenes[ i ]; }
00230 
00231 #ifdef WIN32
00232     pthread_win32_process_detach_np( );
00233 #endif // WIN32
00234     return 0; }
00235 
00236 void* initialize( void* pData ) {
00237     SEvaluate*  psData;
00238     size_t      i;
00239     float*      adBuffer;
00240 
00241     psData = (SEvaluate*)pData;
00242     adBuffer = new float[ psData->m_pYes->GetGenes( ) ];
00243     for( i = 0; i < psData->m_pNo->GetGenes( ); ++i )
00244         adBuffer[ i ] = CMeta::GetNaN( );
00245     for( i = 0; i < psData->m_pNo->GetGenes( ); ++i ) {
00246         if( !( i % 1000 ) )
00247             cerr << "IN: " << psData->m_szName << ", " << i << endl;
00248         psData->m_pNo->Set( i, adBuffer ); }
00249     for( i = 0; i < psData->m_pYes->GetGenes( ); ++i ) {
00250         if( !( i % 1000 ) )
00251             cerr << "IY: " << psData->m_szName << ", " << i << endl;
00252         psData->m_pYes->Set( i, adBuffer ); }
00253     delete[] adBuffer;
00254 
00255     return NULL; }
00256 
00257 void* evaluate( void* pData ) {
00258     SEvaluate*      psData;
00259     CDataMatrix     MatCPT;
00260     size_t          i, j, iOne, iTwo, iBin, iIndex;
00261     vector<bool>    vecfGenes, vecfGenesIn;
00262     bool            fTermOne, fListOne;
00263     float*          adYes;
00264     float*          adNo;
00265     float           dNo, dYes;
00266 
00267     psData = (SEvaluate*)pData;
00268     psData->m_pBN->GetCPT( 0, MatCPT );
00269     dNo = log( MatCPT.Get( 0, 0 ) );
00270     dYes = log( MatCPT.Get( 1, 0 ) );
00271     adYes = new float[ psData->m_pYes->GetGenes( ) ];
00272     adNo = new float[ psData->m_pNo->GetGenes( ) ];
00273     if( !psData->m_fEverything ) {
00274         vecfGenes.resize( psData->m_pYes->GetGenes( ) );
00275         for( i = 0; i < vecfGenes.size( ); ++i )
00276             vecfGenes[ i ] = psData->m_pGenes->IsGene( psData->m_pYes->GetGene( i ) ); }
00277     if( psData->m_pGenesIn ) {
00278         vecfGenesIn.resize( psData->m_pYes->GetGenes( ) );
00279         for( i = 0; i < vecfGenes.size( ); ++i )
00280             vecfGenesIn[ i ] = psData->m_pGenesIn->IsGene( psData->m_pYes->GetGene( i ) ); }
00281     psData->m_pBN->GetCPT( psData->m_iNode, MatCPT );
00282     for( i = 0; i < psData->m_pYes->GetGenes( ); ++i ) {
00283         if( !( i % 1000 ) )
00284             cerr << "C: " << psData->m_szName << ", " << i << endl;
00285         if( ( ( iOne = (*psData->m_pveciGenes)[ i ] ) == -1 ) && ( psData->m_iZero == -1 ) )
00286             continue;
00287         fTermOne = psData->m_fEverything || vecfGenes[ i ];
00288         fListOne = !psData->m_pGenesIn || vecfGenesIn[ i ];
00289         memcpy( adYes, psData->m_pYes->Get( i ), ( psData->m_pYes->GetGenes( ) - i - 1 ) * sizeof(*adYes) );
00290         memcpy( adNo, psData->m_pNo->Get( i ), ( psData->m_pNo->GetGenes( ) - i - 1 ) * sizeof(*adNo) );
00291         for( j = ( i + 1 ); j < psData->m_pYes->GetGenes( ); ++j ) {
00292             if( ( ( iTwo = (*psData->m_pveciGenes)[ j ] ) == -1 ) && ( psData->m_iZero == -1 ) )
00293                 continue;
00294             if( !( fTermOne || vecfGenes[ j ] ) || !( fListOne || vecfGenesIn[ j ] ) )
00295                 continue;
00296             if( psData->m_pAnswers && CMeta::IsNaN( psData->m_pAnswers->Get( i, j ) ) )
00297                 continue;
00298 
00299             iBin = psData->m_pDat->Quantize( iOne, iTwo, psData->m_iZero );
00300             if( iBin == -1 )
00301                 continue;
00302 
00303             if( CMeta::IsNaN( adYes[ iIndex = ( j - i - 1 ) ] ) ) {
00304                 adYes[ iIndex ] = dYes;
00305                 adNo[ iIndex ] = dNo; }
00306             adNo[ iIndex ] += log( MatCPT.Get( iBin, 0 ) );
00307             adYes[ iIndex ] += log( MatCPT.Get( iBin, 1 ) ); }
00308         psData->m_pNo->Set( i, adNo );
00309         psData->m_pYes->Set( i, adYes ); }
00310     delete[] adYes;
00311     delete[] adNo;
00312 
00313     return NULL; }
00314 
00315 void* finalize( void* pData ) {
00316     SEvaluate*  psData;
00317     size_t      i, j;
00318     float*      adYes;
00319     float*      adNo;
00320     float       dPrior;
00321     CDataMatrix MatCPT;
00322 
00323     psData = (SEvaluate*)pData;
00324     psData->m_pBN->GetCPT( 0, MatCPT );
00325     dPrior = MatCPT.Get( 1, 0 );
00326     adYes = new float[ psData->m_pYes->GetGenes( ) ];
00327     adNo = new float[ psData->m_pNo->GetGenes( ) ];
00328     for( i = 0; i < psData->m_pYes->GetGenes( ); ++i ) {
00329         if( !( i % 1000 ) )
00330             cerr << "F: " << psData->m_szName << ", " << i << endl;
00331         memcpy( adYes, psData->m_pYes->Get( i ), ( psData->m_pYes->GetGenes( ) - i - 1 ) * sizeof(*adYes) );
00332         memcpy( adNo, psData->m_pNo->Get( i ), ( psData->m_pNo->GetGenes( ) - i - 1 ) * sizeof(*adNo) );
00333         for( j = 0; j < ( psData->m_pYes->GetGenes( ) - i - 1 ); ++j )
00334             adYes[ j ] = CMeta::IsNaN( adYes[ j ] ) ? dPrior :
00335                 (float)( 1 / ( 1 + exp( (double)adNo[ j ] - (double)adYes[ j ] ) ) );
00336         psData->m_pYes->Set( i, adYes ); }
00337     delete[] adNo;
00338     delete[] adYes;
00339 
00340     return NULL; }