Sleipnir
tools/BNWeaver/BNWeaver.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_acDab[]   = ".dab";
00026 static const char   c_acQuant[] = ".quant";
00027 
00028 struct SLearn {
00029     CBayesNetSmile*         m_pBN;
00030     const IDataset*         m_pData;
00031     const CGenes*           m_pGenes;
00032     const CGenes*           m_pNegatives;
00033     const CDat*             m_pAnswers;
00034     const CBayesNetSmile*   m_pBNDefault;
00035     const char*             m_szName;
00036     const vector<string>*   m_pvecstrNames;
00037     const vector<size_t>*   m_pveciZeros;
00038     bool                    m_fZero;
00039 };
00040 
00041 void* learn( void* );
00042 
00043 int main( int iArgs, char** aszArgs ) {
00044     gengetopt_args_info                 sArgs;
00045     size_t                              i, j, iTerm, iThread;
00046     map<string,size_t>                  mapZeros;
00047     CDataPair                           Answers;
00048     vector<CBayesNetSmile*>             vecpBNRoots, vecpBNs;
00049     vector<vector<CBayesNetSmile*>* >   vecpvecpBNData;
00050     CBayesNetSmile                      BNDefault;
00051     vector<size_t>                      veciZeros;
00052     CGenome                             Genome;
00053     CDatasetCompact                     Data;
00054     vector<string>                      vecstrDummy;
00055     vector<CGenes*>                     vecpGenes;
00056     string                              strFile;
00057     vector<pthread_t>                   vecpthdThreads;
00058     vector<SLearn>                      vecsData;
00059     CGenes                              GenesNeg( Genome );
00060 
00061 #ifdef WIN32
00062     pthread_win32_process_attach_np( );
00063 #endif // WIN32
00064 
00065     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00066         cmdline_parser_print_help( );
00067         return 1; }
00068     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00069 
00070     if( sArgs.zeros_arg ) {
00071         ifstream        ifsm;
00072         vector<string>  vecstrZeros;
00073         char            acLine[ 1024 ];
00074 
00075         ifsm.open( sArgs.zeros_arg );
00076         if( !ifsm.is_open( ) ) {
00077             cerr << "Couldn't open: " << sArgs.zeros_arg << endl;
00078             return 1; }
00079         while( !ifsm.eof( ) ) {
00080             ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 );
00081             acLine[ ARRAYSIZE(acLine) - 1 ] = 0;
00082             vecstrZeros.clear( );
00083             CMeta::Tokenize( acLine, vecstrZeros );
00084             if( vecstrZeros.empty( ) )
00085                 continue;
00086             mapZeros[ vecstrZeros[ 0 ] ] = atoi( vecstrZeros[ 1 ].c_str( ) ); } }
00087 
00088     if( sArgs.default_arg && !BNDefault.Open( sArgs.default_arg ) ) {
00089         cerr << "Couldn't open: " << sArgs.default_arg << endl;
00090         return 1; }
00091 
00092     if( !Answers.Open( sArgs.answers_arg, false, sArgs.memmap_flag && !sArgs.genex_arg ) ) {
00093         cerr << "Couldn't open: " << sArgs.answers_arg << endl;
00094         return 1; }
00095     if( sArgs.genex_arg && !Answers.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) {
00096         cerr << "Couldn't open: " << sArgs.genex_arg << endl;
00097         return 1; }
00098     if( sArgs.negatives_arg && !GenesNeg.Open( sArgs.negatives_arg ) ) {
00099         cerr << "Couldn't open: " << sArgs.negatives_arg << endl;
00100         return 1; }
00101 
00102     vecpGenes.resize( sArgs.inputs_num );
00103     for( i = 0; i < vecpGenes.size( ); ++i ) {
00104         ifstream    ifsm;
00105 
00106         vecpGenes[ i ]  = new CGenes( Genome );
00107         ifsm.open( sArgs.inputs[ i ] );
00108         if( !vecpGenes[ i ]->Open( ifsm ) ) {
00109             cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl;
00110             return 1; } }
00111     if( !Data.Open( Answers, vecstrDummy, true ) ) {
00112         cerr << "Couldn't open answer set" << endl;
00113         return 1; }
00114     vecstrDummy.push_back( "FR" );
00115     vecpBNRoots.resize( vecpGenes.size( ) );
00116     vecpthdThreads.resize( vecpBNRoots.size( ) );
00117     vecsData.resize( vecpthdThreads.size( ) );
00118     for( iTerm = 0; iTerm < vecpBNRoots.size( ); iTerm += iThread ) {
00119         cerr << "Learning root " << iTerm << '/' << vecpBNRoots.size( ) << endl;
00120         for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) &&
00121             ( ( iTerm + iThread ) < vecpBNRoots.size( ) ); ++iThread ) {
00122             i = iTerm + iThread;
00123             vecsData[ i ].m_pBN = vecpBNRoots[ i ] = new CBayesNetSmile( );
00124             vecsData[ i ].m_pData = &Data;
00125             vecsData[ i ].m_pGenes = vecpGenes[ i ];
00126             vecsData[ i ].m_pNegatives = &GenesNeg;
00127             vecsData[ i ].m_pAnswers = &Answers;
00128             vecsData[ i ].m_pBNDefault = sArgs.default_arg ? &BNDefault : NULL;
00129             vecsData[ i ].m_szName = sArgs.inputs[ i ];
00130             vecsData[ i ].m_pvecstrNames = &vecstrDummy;
00131             vecsData[ i ].m_pveciZeros = &veciZeros;
00132             vecsData[ i ].m_fZero = false;
00133             if( pthread_create( &vecpthdThreads[ i ], NULL, learn, &vecsData[ i ] ) ) {
00134                 cerr << "Couldn't create root thread: " << sArgs.inputs[ i ] << endl;
00135                 return 1; } }
00136         for( i = 0; i < iThread; ++i )
00137             pthread_join( vecpthdThreads[ iTerm + i ], NULL ); }
00138 
00139     FOR_EACH_DIRECTORY_FILE((string)sArgs.directory_arg, strFile)
00140         vector<string>              vecstrNames;
00141         vector<CBayesNetSmile*>*    pvecpBNData;
00142 
00143         if( !CMeta::IsExtension( strFile, c_acQuant ) )
00144             continue;
00145 
00146         i = strFile.rfind( c_acQuant );
00147         vecstrNames.push_back( (string)sArgs.directory_arg + "/" + strFile.substr( 0, i ) + c_acDab );
00148         if( !Data.Open( Answers, vecstrNames, true, sArgs.memmap_flag && !sArgs.randomize_flag ) ) {
00149             cerr << "Couldn't open: " << vecstrNames.back( ) << endl;
00150             return 1; }
00151         if( sArgs.randomize_flag )
00152             Data.Randomize( );
00153         vecstrNames.insert( vecstrNames.begin( ), sArgs.answers_arg );
00154         for( i = 0; i < vecstrNames.size( ); ++i )
00155             vecstrNames[ i ] = CMeta::Filename( CMeta::Deextension( CMeta::Basename(
00156                 vecstrNames[ i ].c_str( ) ) ) );
00157         veciZeros.resize( vecstrNames.size( ) );
00158         for( i = 0; i < veciZeros.size( ); ++i ) {
00159             map<string,size_t>::const_iterator  iterZero;
00160 
00161             veciZeros[ i ] = ( ( iterZero = mapZeros.find( vecstrNames[ i ] ) ) == mapZeros.end( ) ) ? -1 :
00162                 iterZero->second; }
00163         pvecpBNData = new vector<CBayesNetSmile*>( );
00164         pvecpBNData->resize( vecpGenes.size( ) );
00165         for( iTerm = 0; iTerm < vecpBNRoots.size( ); iTerm += iThread ) {
00166             cerr << "Learning term " << iTerm << '/' << vecpBNRoots.size( ) << endl;
00167             for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) &&
00168                 ( ( iTerm + iThread ) < vecpBNRoots.size( ) ); ++iThread ) {
00169                 i = iTerm + iThread;
00170                 vecsData[ i ].m_pBN = (*pvecpBNData)[ i ] = new CBayesNetSmile( );
00171                 vecsData[ i ].m_pData = &Data;
00172                 vecsData[ i ].m_pGenes = vecpGenes[ i ];
00173                 vecsData[ i ].m_pNegatives = &GenesNeg;
00174                 vecsData[ i ].m_pAnswers = &Answers;
00175                 vecsData[ i ].m_pBNDefault = sArgs.default_arg ? &BNDefault : NULL;
00176                 vecsData[ i ].m_szName = sArgs.inputs[ i ];
00177                 vecsData[ i ].m_pvecstrNames = &vecstrNames;
00178                 vecsData[ i ].m_pveciZeros = &veciZeros;
00179                 vecsData[ i ].m_fZero = !!sArgs.zero_flag;
00180                 if( pthread_create( &vecpthdThreads[ i ], NULL, learn, &vecsData[ i ] ) ) {
00181                     cerr << "Couldn't create root thread: " << sArgs.inputs[ i ] << endl;
00182                     return 1; } }
00183             for( i = 0; i < iThread; ++i )
00184                 pthread_join( vecpthdThreads[ iTerm + i ], NULL ); }
00185         vecpvecpBNData.push_back( pvecpBNData ); }
00186 
00187 #ifdef _MSC_VER
00188     FindClose( hSearch );
00189 #else // _MSC_VER
00190     closedir( pDir );
00191 #endif // _MSC_VER
00192 
00193     vecpBNs.resize( vecpvecpBNData.size( ) );
00194     for( i = 0; i < vecpGenes.size( ); ++i ) {
00195         CBayesNetSmile          BNOut;
00196 
00197         for( j = 0; j < vecpBNs.size( ); ++j )
00198             vecpBNs[ j ] = (*vecpvecpBNData[ j ])[ i ];
00199         if( !BNOut.Open( *vecpBNRoots[ i ], vecpBNs ) ) {
00200             cerr << "Couldn't merge networks: " << sArgs.inputs[ i ] << endl;
00201             return 1; }
00202         BNOut.Save( ( (string)sArgs.output_arg + "/" + CMeta::Deextension( CMeta::Basename(
00203             sArgs.inputs[ i ] ) ) + "." + ( sArgs.xdsl_flag ? "x" : "" ) + "dsl" ).c_str( ) ); }
00204 
00205     for( i = 0; i < vecpvecpBNData.size( ); ++i ) {
00206         for( j = 0; j < vecpvecpBNData[ i ]->size( ); ++j )
00207             delete (*vecpvecpBNData[ i ])[ j ];
00208         delete vecpvecpBNData[ i ]; }
00209     for( i = 0; i < vecpBNRoots.size( ); ++i ) {
00210         delete vecpBNRoots[ i ];
00211         delete vecpGenes[ i ]; }
00212 
00213 #ifdef WIN32
00214     pthread_win32_process_detach_np( );
00215 #endif // WIN32
00216     return 0; }
00217 
00218 void* learn( void* pData ) {
00219     CDataFilter DataFilter, DataNegatives;
00220     SLearn*     psData;
00221 
00222     psData = (SLearn*)pData;
00223     if( psData->m_pNegatives->GetGenes( ) ) {
00224         DataNegatives.Attach( psData->m_pData, *psData->m_pNegatives, CDat::EFilterEdge, psData->m_pAnswers );
00225         DataFilter.Attach( &DataNegatives, *psData->m_pGenes, CDat::EFilterTerm, psData->m_pAnswers ); }
00226     else
00227         DataFilter.Attach( psData->m_pData, *psData->m_pGenes, CDat::EFilterTerm, psData->m_pAnswers );
00228     if( !psData->m_pBN->Open( &DataFilter, *psData->m_pvecstrNames, *psData->m_pveciZeros ) ) {
00229         cerr << "Couldn't create base network: " << psData->m_szName << endl;
00230         return NULL; }
00231     if( psData->m_pBNDefault )
00232         psData->m_pBN->SetDefault( *psData->m_pBNDefault );
00233     if( !psData->m_pBN->Learn( &DataFilter, 1, psData->m_fZero ) )
00234         cerr << "Couldn't learn base network: " << psData->m_szName << endl;
00235 
00236     return NULL; }