Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_acDab[] = ".dab"; 00026 static const char c_acQuant[] = ".quant"; 00027 00028 struct SLearn { 00029 CBayesNetSmile* m_pBN; 00030 const IDataset* m_pData; 00031 const CGenes* m_pGenes; 00032 const CGenes* m_pNegatives; 00033 const CDat* m_pAnswers; 00034 const CBayesNetSmile* m_pBNDefault; 00035 const char* m_szName; 00036 const vector<string>* m_pvecstrNames; 00037 const vector<size_t>* m_pveciZeros; 00038 bool m_fZero; 00039 }; 00040 00041 void* learn( void* ); 00042 00043 int main( int iArgs, char** aszArgs ) { 00044 gengetopt_args_info sArgs; 00045 size_t i, j, iTerm, iThread; 00046 map<string,size_t> mapZeros; 00047 CDataPair Answers; 00048 vector<CBayesNetSmile*> vecpBNRoots, vecpBNs; 00049 vector<vector<CBayesNetSmile*>* > vecpvecpBNData; 00050 CBayesNetSmile BNDefault; 00051 vector<size_t> veciZeros; 00052 CGenome Genome; 00053 CDatasetCompact Data; 00054 vector<string> vecstrDummy; 00055 vector<CGenes*> vecpGenes; 00056 string strFile; 00057 vector<pthread_t> vecpthdThreads; 00058 vector<SLearn> vecsData; 00059 CGenes GenesNeg( Genome ); 00060 00061 #ifdef WIN32 00062 pthread_win32_process_attach_np( ); 00063 #endif // WIN32 00064 00065 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00066 cmdline_parser_print_help( ); 00067 return 1; } 00068 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00069 00070 if( sArgs.zeros_arg ) { 00071 ifstream ifsm; 00072 vector<string> vecstrZeros; 00073 char acLine[ 1024 ]; 00074 00075 ifsm.open( sArgs.zeros_arg ); 00076 if( !ifsm.is_open( ) ) { 00077 cerr << "Couldn't open: " << sArgs.zeros_arg << endl; 00078 return 1; } 00079 while( !ifsm.eof( ) ) { 00080 ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 ); 00081 acLine[ ARRAYSIZE(acLine) - 1 ] = 0; 00082 vecstrZeros.clear( ); 00083 CMeta::Tokenize( acLine, vecstrZeros ); 00084 if( vecstrZeros.empty( ) ) 00085 continue; 00086 mapZeros[ vecstrZeros[ 0 ] ] = atoi( vecstrZeros[ 1 ].c_str( ) ); } } 00087 00088 if( sArgs.default_arg && !BNDefault.Open( sArgs.default_arg ) ) { 00089 cerr << "Couldn't open: " << sArgs.default_arg << endl; 00090 return 1; } 00091 00092 if( !Answers.Open( sArgs.answers_arg, false, sArgs.memmap_flag && !sArgs.genex_arg ) ) { 00093 cerr << "Couldn't open: " << sArgs.answers_arg << endl; 00094 return 1; } 00095 if( sArgs.genex_arg && !Answers.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) { 00096 cerr << "Couldn't open: " << sArgs.genex_arg << endl; 00097 return 1; } 00098 if( sArgs.negatives_arg && !GenesNeg.Open( sArgs.negatives_arg ) ) { 00099 cerr << "Couldn't open: " << sArgs.negatives_arg << endl; 00100 return 1; } 00101 00102 vecpGenes.resize( sArgs.inputs_num ); 00103 for( i = 0; i < vecpGenes.size( ); ++i ) { 00104 ifstream ifsm; 00105 00106 vecpGenes[ i ] = new CGenes( Genome ); 00107 ifsm.open( sArgs.inputs[ i ] ); 00108 if( !vecpGenes[ i ]->Open( ifsm ) ) { 00109 cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl; 00110 return 1; } } 00111 if( !Data.Open( Answers, vecstrDummy, true ) ) { 00112 cerr << "Couldn't open answer set" << endl; 00113 return 1; } 00114 vecstrDummy.push_back( "FR" ); 00115 vecpBNRoots.resize( vecpGenes.size( ) ); 00116 vecpthdThreads.resize( vecpBNRoots.size( ) ); 00117 vecsData.resize( vecpthdThreads.size( ) ); 00118 for( iTerm = 0; iTerm < vecpBNRoots.size( ); iTerm += iThread ) { 00119 cerr << "Learning root " << iTerm << '/' << vecpBNRoots.size( ) << endl; 00120 for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) && 00121 ( ( iTerm + iThread ) < vecpBNRoots.size( ) ); ++iThread ) { 00122 i = iTerm + iThread; 00123 vecsData[ i ].m_pBN = vecpBNRoots[ i ] = new CBayesNetSmile( ); 00124 vecsData[ i ].m_pData = &Data; 00125 vecsData[ i ].m_pGenes = vecpGenes[ i ]; 00126 vecsData[ i ].m_pNegatives = &GenesNeg; 00127 vecsData[ i ].m_pAnswers = &Answers; 00128 vecsData[ i ].m_pBNDefault = sArgs.default_arg ? &BNDefault : NULL; 00129 vecsData[ i ].m_szName = sArgs.inputs[ i ]; 00130 vecsData[ i ].m_pvecstrNames = &vecstrDummy; 00131 vecsData[ i ].m_pveciZeros = &veciZeros; 00132 vecsData[ i ].m_fZero = false; 00133 if( pthread_create( &vecpthdThreads[ i ], NULL, learn, &vecsData[ i ] ) ) { 00134 cerr << "Couldn't create root thread: " << sArgs.inputs[ i ] << endl; 00135 return 1; } } 00136 for( i = 0; i < iThread; ++i ) 00137 pthread_join( vecpthdThreads[ iTerm + i ], NULL ); } 00138 00139 FOR_EACH_DIRECTORY_FILE((string)sArgs.directory_arg, strFile) 00140 vector<string> vecstrNames; 00141 vector<CBayesNetSmile*>* pvecpBNData; 00142 00143 if( !CMeta::IsExtension( strFile, c_acQuant ) ) 00144 continue; 00145 00146 i = strFile.rfind( c_acQuant ); 00147 vecstrNames.push_back( (string)sArgs.directory_arg + "/" + strFile.substr( 0, i ) + c_acDab ); 00148 if( !Data.Open( Answers, vecstrNames, true, sArgs.memmap_flag && !sArgs.randomize_flag ) ) { 00149 cerr << "Couldn't open: " << vecstrNames.back( ) << endl; 00150 return 1; } 00151 if( sArgs.randomize_flag ) 00152 Data.Randomize( ); 00153 vecstrNames.insert( vecstrNames.begin( ), sArgs.answers_arg ); 00154 for( i = 0; i < vecstrNames.size( ); ++i ) 00155 vecstrNames[ i ] = CMeta::Filename( CMeta::Deextension( CMeta::Basename( 00156 vecstrNames[ i ].c_str( ) ) ) ); 00157 veciZeros.resize( vecstrNames.size( ) ); 00158 for( i = 0; i < veciZeros.size( ); ++i ) { 00159 map<string,size_t>::const_iterator iterZero; 00160 00161 veciZeros[ i ] = ( ( iterZero = mapZeros.find( vecstrNames[ i ] ) ) == mapZeros.end( ) ) ? -1 : 00162 iterZero->second; } 00163 pvecpBNData = new vector<CBayesNetSmile*>( ); 00164 pvecpBNData->resize( vecpGenes.size( ) ); 00165 for( iTerm = 0; iTerm < vecpBNRoots.size( ); iTerm += iThread ) { 00166 cerr << "Learning term " << iTerm << '/' << vecpBNRoots.size( ) << endl; 00167 for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) && 00168 ( ( iTerm + iThread ) < vecpBNRoots.size( ) ); ++iThread ) { 00169 i = iTerm + iThread; 00170 vecsData[ i ].m_pBN = (*pvecpBNData)[ i ] = new CBayesNetSmile( ); 00171 vecsData[ i ].m_pData = &Data; 00172 vecsData[ i ].m_pGenes = vecpGenes[ i ]; 00173 vecsData[ i ].m_pNegatives = &GenesNeg; 00174 vecsData[ i ].m_pAnswers = &Answers; 00175 vecsData[ i ].m_pBNDefault = sArgs.default_arg ? &BNDefault : NULL; 00176 vecsData[ i ].m_szName = sArgs.inputs[ i ]; 00177 vecsData[ i ].m_pvecstrNames = &vecstrNames; 00178 vecsData[ i ].m_pveciZeros = &veciZeros; 00179 vecsData[ i ].m_fZero = !!sArgs.zero_flag; 00180 if( pthread_create( &vecpthdThreads[ i ], NULL, learn, &vecsData[ i ] ) ) { 00181 cerr << "Couldn't create root thread: " << sArgs.inputs[ i ] << endl; 00182 return 1; } } 00183 for( i = 0; i < iThread; ++i ) 00184 pthread_join( vecpthdThreads[ iTerm + i ], NULL ); } 00185 vecpvecpBNData.push_back( pvecpBNData ); } 00186 00187 #ifdef _MSC_VER 00188 FindClose( hSearch ); 00189 #else // _MSC_VER 00190 closedir( pDir ); 00191 #endif // _MSC_VER 00192 00193 vecpBNs.resize( vecpvecpBNData.size( ) ); 00194 for( i = 0; i < vecpGenes.size( ); ++i ) { 00195 CBayesNetSmile BNOut; 00196 00197 for( j = 0; j < vecpBNs.size( ); ++j ) 00198 vecpBNs[ j ] = (*vecpvecpBNData[ j ])[ i ]; 00199 if( !BNOut.Open( *vecpBNRoots[ i ], vecpBNs ) ) { 00200 cerr << "Couldn't merge networks: " << sArgs.inputs[ i ] << endl; 00201 return 1; } 00202 BNOut.Save( ( (string)sArgs.output_arg + "/" + CMeta::Deextension( CMeta::Basename( 00203 sArgs.inputs[ i ] ) ) + "." + ( sArgs.xdsl_flag ? "x" : "" ) + "dsl" ).c_str( ) ); } 00204 00205 for( i = 0; i < vecpvecpBNData.size( ); ++i ) { 00206 for( j = 0; j < vecpvecpBNData[ i ]->size( ); ++j ) 00207 delete (*vecpvecpBNData[ i ])[ j ]; 00208 delete vecpvecpBNData[ i ]; } 00209 for( i = 0; i < vecpBNRoots.size( ); ++i ) { 00210 delete vecpBNRoots[ i ]; 00211 delete vecpGenes[ i ]; } 00212 00213 #ifdef WIN32 00214 pthread_win32_process_detach_np( ); 00215 #endif // WIN32 00216 return 0; } 00217 00218 void* learn( void* pData ) { 00219 CDataFilter DataFilter, DataNegatives; 00220 SLearn* psData; 00221 00222 psData = (SLearn*)pData; 00223 if( psData->m_pNegatives->GetGenes( ) ) { 00224 DataNegatives.Attach( psData->m_pData, *psData->m_pNegatives, CDat::EFilterEdge, psData->m_pAnswers ); 00225 DataFilter.Attach( &DataNegatives, *psData->m_pGenes, CDat::EFilterTerm, psData->m_pAnswers ); } 00226 else 00227 DataFilter.Attach( psData->m_pData, *psData->m_pGenes, CDat::EFilterTerm, psData->m_pAnswers ); 00228 if( !psData->m_pBN->Open( &DataFilter, *psData->m_pvecstrNames, *psData->m_pveciZeros ) ) { 00229 cerr << "Couldn't create base network: " << psData->m_szName << endl; 00230 return NULL; } 00231 if( psData->m_pBNDefault ) 00232 psData->m_pBN->SetDefault( *psData->m_pBNDefault ); 00233 if( !psData->m_pBN->Learn( &DataFilter, 1, psData->m_fZero ) ) 00234 cerr << "Couldn't learn base network: " << psData->m_szName << endl; 00235 00236 return NULL; }