Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_acDab[] = ".dab"; 00026 00027 struct SEvaluate { 00028 CBayesNetSmile* m_pBN; 00029 const CDataPair* m_pDat; 00030 const CGenes* m_pGenes; 00031 const CGenes* m_pGenesIn; 00032 CDat* m_pYes; 00033 CDat* m_pNo; 00034 const CDataPair* m_pAnswers; 00035 size_t m_iZero; 00036 size_t m_iNode; 00037 const vector<size_t>* m_pveciGenes; 00038 bool m_fEverything; 00039 const char* m_szName; 00040 }; 00041 00042 void* initialize( void* ); 00043 void* evaluate( void* ); 00044 void* finalize( void* ); 00045 00046 int main( int iArgs, char** aszArgs ) { 00047 gengetopt_args_info sArgs; 00048 map<string,size_t> mapZeros; 00049 CDataPair Answers; 00050 vector<CBayesNetSmile*> vecpBNs; 00051 vector<size_t> veciZeros, veciGenes; 00052 CGenome Genome; 00053 CGenes GenesIn( Genome ); 00054 size_t i, iNode, iThread, iTerm; 00055 vector<CGenes*> vecpGenes; 00056 CDatasetCompact Dataset; 00057 vector<pthread_t> vecpthdThreads; 00058 vector<string> vecstrNodes, vecstrTmps; 00059 string strFile; 00060 vector<CDat*> vecpYes, vecpNo; 00061 vector<SEvaluate> vecsData; 00062 00063 #ifdef WIN32 00064 pthread_win32_process_attach_np( ); 00065 #endif // WIN32 00066 00067 if( cmdline_parser( iArgs, aszArgs, &sArgs ) || !sArgs.inputs_num ) { 00068 cmdline_parser_print_help( ); 00069 return 1; } 00070 CMeta Meta( sArgs.verbosity_arg ); 00071 00072 vecpBNs.resize( sArgs.inputs_num ); 00073 for( i = 0; i < vecpBNs.size( ); ++i ) { 00074 vecpBNs[ i ] = new CBayesNetSmile( !!sArgs.group_flag ); 00075 if( !vecpBNs[ i ]->Open( ( strFile = (string)sArgs.input_arg + '/' + CMeta::Deextension( 00076 CMeta::Basename( sArgs.inputs[ i ] ) ) + ( sArgs.xdsl_flag ? ".xdsl" : ".dsl" ) ).c_str( ) ) ) { 00077 cerr << "Couldn't open: " << strFile << endl; 00078 return 1; } } 00079 00080 if( sArgs.zeros_arg ) { 00081 ifstream ifsm; 00082 vector<string> vecstrZeros; 00083 char acLine[ 1024 ]; 00084 00085 ifsm.open( sArgs.zeros_arg ); 00086 if( !ifsm.is_open( ) ) { 00087 cerr << "Couldn't open: " << sArgs.zeros_arg << endl; 00088 return 1; } 00089 while( !ifsm.eof( ) ) { 00090 ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 ); 00091 acLine[ ARRAYSIZE(acLine) - 1 ] = 0; 00092 vecstrZeros.clear( ); 00093 CMeta::Tokenize( acLine, vecstrZeros ); 00094 if( vecstrZeros.empty( ) ) 00095 continue; 00096 mapZeros[ vecstrZeros[ 0 ] ] = atoi( vecstrZeros[ 1 ].c_str( ) ); } } 00097 vecpBNs[ 0 ]->GetNodes( vecstrNodes ); 00098 veciZeros.resize( vecstrNodes.size( ) ); 00099 for( i = 0; i < veciZeros.size( ); ++i ) { 00100 map<string,size_t>::const_iterator iterZero; 00101 00102 veciZeros[ i ] = ( ( iterZero = mapZeros.find( vecstrNodes[ i ] ) ) == mapZeros.end( ) ) ? -1 : 00103 iterZero->second; } 00104 00105 if( sArgs.answers_arg && !Answers.Open( sArgs.answers_arg, false, !!sArgs.memmap_flag ) ) { 00106 cerr << "Couldn't open: " << sArgs.answers_arg << endl; 00107 return 1; } 00108 if( sArgs.genome_arg ) { 00109 ifstream ifsm; 00110 CGenes Genes( Genome ); 00111 00112 ifsm.open( sArgs.genome_arg ); 00113 if( !Genes.Open( ifsm ) ) { 00114 cerr << "Couldn't open: " << sArgs.genome_arg << endl; 00115 return 1; } } 00116 else if( sArgs.answers_arg ) 00117 for( i = 0; i < Answers.GetGenes( ); ++i ) 00118 Genome.AddGene( Answers.GetGene( i ) ); 00119 else { 00120 vector<string> vecstrFiles; 00121 00122 vecstrFiles.resize( vecstrNodes.size( ) - 1 ); 00123 for( i = 0; i < vecstrFiles.size( ); ++i ) 00124 vecstrFiles[ i ] = (string)sArgs.directory_arg + '/' + vecstrNodes[ i + 1 ] + c_acDab; 00125 if( !Dataset.OpenGenes( vecstrFiles ) ) { 00126 cerr << "Couldn't open: " << sArgs.directory_arg << endl; 00127 return 1; } 00128 for( i = 0; i < Dataset.GetGenes( ); ++i ) 00129 Genome.AddGene( Dataset.GetGene( i ) ); } 00130 if( sArgs.genes_arg ) { 00131 ifstream ifsm; 00132 00133 ifsm.open( sArgs.genes_arg ); 00134 if( !GenesIn.Open( ifsm, !sArgs.genome_arg ) ) { 00135 cerr << "Couldn't open: " << sArgs.genes_arg << endl; 00136 return 1; } } 00137 00138 vecpGenes.resize( sArgs.inputs_num ); 00139 vecpYes.resize( vecpGenes.size( ) ); 00140 vecpNo.resize( vecpGenes.size( ) ); 00141 vecstrTmps.resize( vecpNo.size( ) ); 00142 for( i = 0; i < vecpGenes.size( ); ++i ) { 00143 ifstream ifsm; 00144 char acTemp[ L_tmpnam + 1 ]; 00145 00146 vecpGenes[ i ] = new CGenes( Genome ); 00147 ifsm.open( sArgs.inputs[ i ] ); 00148 if( !vecpGenes[ i ]->Open( ifsm, false ) ) { 00149 cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl; 00150 return 1; } 00151 vecpYes[ i ] = new CDat( ); 00152 vecpYes[ i ]->Open( Genome.GetGeneNames( ), false, ( (string)sArgs.output_arg + '/' + 00153 CMeta::Basename( sArgs.inputs[ i ] ) + c_acDab ).c_str( ) ); 00154 vecpNo[ i ] = new CDat( ); 00155 #pragma warning( disable : 4996 ) 00156 vecstrTmps[ i ] = tmpnam( acTemp ); 00157 #pragma warning( default : 4996 ) 00158 vecpNo[ i ]->Open( Genome.GetGeneNames( ), false, vecstrTmps[ i ].c_str( ) ); } 00159 00160 vecsData.resize( vecpGenes.size( ) ); 00161 vecpthdThreads.resize( vecsData.size( ) ); 00162 for( iTerm = 0; iTerm < vecpGenes.size( ); iTerm += iThread ) { 00163 for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) && 00164 ( ( iTerm + iThread ) < vecpGenes.size( ) ); ++iThread ) { 00165 i = iTerm + iThread; 00166 vecsData[ i ].m_pBN = vecpBNs[ i ]; 00167 vecsData[ i ].m_pGenes = vecpGenes[ i ]; 00168 vecsData[ i ].m_pYes = vecpYes[ i ]; 00169 vecsData[ i ].m_pNo = vecpNo[ i ]; 00170 vecsData[ i ].m_szName = sArgs.inputs[ i ]; 00171 if( pthread_create( &vecpthdThreads[ i ], NULL, initialize, &vecsData[ i ] ) ) { 00172 cerr << "Couldn't create initialization thread: " << sArgs.inputs[ i ] << endl; 00173 return 1; } } 00174 for( i = 0; i < iThread; ++i ) 00175 pthread_join( vecpthdThreads[ iTerm + i ], NULL ); } 00176 00177 veciGenes.resize( vecpYes[ 0 ]->GetGenes( ) ); 00178 for( iNode = 1; iNode < vecstrNodes.size( ); ++iNode ) { 00179 CDataPair Dat; 00180 00181 if( !Dat.Open( ( strFile = ( (string)sArgs.directory_arg + '/' + vecstrNodes[ iNode ] + 00182 c_acDab ) ).c_str( ), false, !!sArgs.memmap_flag ) ) { 00183 cerr << "Couldn't open: " << strFile << endl; 00184 return 1; } 00185 for( i = 0; i < veciGenes.size( ); ++i ) 00186 veciGenes[ i ] = Dat.GetGene( vecpYes[ 0 ]->GetGene( i ) ); 00187 for( iTerm = 0; iTerm < vecpGenes.size( ); iTerm += iThread ) { 00188 for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) && 00189 ( ( iTerm + iThread ) < vecpGenes.size( ) ); ++iThread ) { 00190 i = iTerm + iThread; 00191 vecsData[ i ].m_pBN = vecpBNs[ i ]; 00192 vecsData[ i ].m_pDat = &Dat; 00193 vecsData[ i ].m_pGenes = vecpGenes[ i ]; 00194 vecsData[ i ].m_pGenesIn = GenesIn.GetGenes( ) ? &GenesIn : NULL; 00195 vecsData[ i ].m_pYes = vecpYes[ i ]; 00196 vecsData[ i ].m_pNo = vecpNo[ i ]; 00197 vecsData[ i ].m_pAnswers = Answers.GetGenes( ) ? &Answers : NULL; 00198 vecsData[ i ].m_iZero = sArgs.zero_flag ? 0 : veciZeros[ iNode ]; 00199 vecsData[ i ].m_iNode = iNode; 00200 vecsData[ i ].m_pveciGenes = &veciGenes; 00201 vecsData[ i ].m_fEverything = !!sArgs.everything_flag; 00202 vecsData[ i ].m_szName = sArgs.inputs[ i ]; 00203 if( pthread_create( &vecpthdThreads[ i ], NULL, evaluate, &vecsData[ i ] ) ) { 00204 cerr << "Couldn't create evaluation thread: " << sArgs.inputs[ i ] << endl; 00205 return 1; } } 00206 for( i = 0; i < iThread; ++i ) 00207 pthread_join( vecpthdThreads[ iTerm + i ], NULL ); } } 00208 00209 for( iTerm = 0; iTerm < vecpGenes.size( ); iTerm += iThread ) { 00210 for( iThread = 0; ( ( sArgs.threads_arg == -1 ) || ( iThread < (size_t)sArgs.threads_arg ) ) && 00211 ( ( iTerm + iThread ) < vecpGenes.size( ) ); ++iThread ) { 00212 i = iTerm + iThread; 00213 vecsData[ i ].m_pBN = vecpBNs[ i ]; 00214 vecsData[ i ].m_pYes = vecpYes[ i ]; 00215 vecsData[ i ].m_pNo = vecpNo[ i ]; 00216 vecsData[ i ].m_szName = sArgs.inputs[ i ]; 00217 if( pthread_create( &vecpthdThreads[ i ], NULL, finalize, &vecsData[ i ] ) ) { 00218 cerr << "Couldn't create finalization thread: " << sArgs.inputs[ i ] << endl; 00219 return 1; } } 00220 for( i = 0; i < iThread; ++i ) 00221 pthread_join( vecpthdThreads[ iTerm + i ], NULL ); } 00222 00223 for( i = 0; i < vecstrTmps.size( ); ++i ) 00224 _unlink( vecstrTmps[ i ].c_str( ) ); 00225 for( i = 0; i < vecpGenes.size( ); ++i ) { 00226 delete vecpBNs[ i ]; 00227 delete vecpYes[ i ]; 00228 delete vecpNo[ i ]; 00229 delete vecpGenes[ i ]; } 00230 00231 #ifdef WIN32 00232 pthread_win32_process_detach_np( ); 00233 #endif // WIN32 00234 return 0; } 00235 00236 void* initialize( void* pData ) { 00237 SEvaluate* psData; 00238 size_t i; 00239 float* adBuffer; 00240 00241 psData = (SEvaluate*)pData; 00242 adBuffer = new float[ psData->m_pYes->GetGenes( ) ]; 00243 for( i = 0; i < psData->m_pNo->GetGenes( ); ++i ) 00244 adBuffer[ i ] = CMeta::GetNaN( ); 00245 for( i = 0; i < psData->m_pNo->GetGenes( ); ++i ) { 00246 if( !( i % 1000 ) ) 00247 cerr << "IN: " << psData->m_szName << ", " << i << endl; 00248 psData->m_pNo->Set( i, adBuffer ); } 00249 for( i = 0; i < psData->m_pYes->GetGenes( ); ++i ) { 00250 if( !( i % 1000 ) ) 00251 cerr << "IY: " << psData->m_szName << ", " << i << endl; 00252 psData->m_pYes->Set( i, adBuffer ); } 00253 delete[] adBuffer; 00254 00255 return NULL; } 00256 00257 void* evaluate( void* pData ) { 00258 SEvaluate* psData; 00259 CDataMatrix MatCPT; 00260 size_t i, j, iOne, iTwo, iBin, iIndex; 00261 vector<bool> vecfGenes, vecfGenesIn; 00262 bool fTermOne, fListOne; 00263 float* adYes; 00264 float* adNo; 00265 float dNo, dYes; 00266 00267 psData = (SEvaluate*)pData; 00268 psData->m_pBN->GetCPT( 0, MatCPT ); 00269 dNo = log( MatCPT.Get( 0, 0 ) ); 00270 dYes = log( MatCPT.Get( 1, 0 ) ); 00271 adYes = new float[ psData->m_pYes->GetGenes( ) ]; 00272 adNo = new float[ psData->m_pNo->GetGenes( ) ]; 00273 if( !psData->m_fEverything ) { 00274 vecfGenes.resize( psData->m_pYes->GetGenes( ) ); 00275 for( i = 0; i < vecfGenes.size( ); ++i ) 00276 vecfGenes[ i ] = psData->m_pGenes->IsGene( psData->m_pYes->GetGene( i ) ); } 00277 if( psData->m_pGenesIn ) { 00278 vecfGenesIn.resize( psData->m_pYes->GetGenes( ) ); 00279 for( i = 0; i < vecfGenes.size( ); ++i ) 00280 vecfGenesIn[ i ] = psData->m_pGenesIn->IsGene( psData->m_pYes->GetGene( i ) ); } 00281 psData->m_pBN->GetCPT( psData->m_iNode, MatCPT ); 00282 for( i = 0; i < psData->m_pYes->GetGenes( ); ++i ) { 00283 if( !( i % 1000 ) ) 00284 cerr << "C: " << psData->m_szName << ", " << i << endl; 00285 if( ( ( iOne = (*psData->m_pveciGenes)[ i ] ) == -1 ) && ( psData->m_iZero == -1 ) ) 00286 continue; 00287 fTermOne = psData->m_fEverything || vecfGenes[ i ]; 00288 fListOne = !psData->m_pGenesIn || vecfGenesIn[ i ]; 00289 memcpy( adYes, psData->m_pYes->Get( i ), ( psData->m_pYes->GetGenes( ) - i - 1 ) * sizeof(*adYes) ); 00290 memcpy( adNo, psData->m_pNo->Get( i ), ( psData->m_pNo->GetGenes( ) - i - 1 ) * sizeof(*adNo) ); 00291 for( j = ( i + 1 ); j < psData->m_pYes->GetGenes( ); ++j ) { 00292 if( ( ( iTwo = (*psData->m_pveciGenes)[ j ] ) == -1 ) && ( psData->m_iZero == -1 ) ) 00293 continue; 00294 if( !( fTermOne || vecfGenes[ j ] ) || !( fListOne || vecfGenesIn[ j ] ) ) 00295 continue; 00296 if( psData->m_pAnswers && CMeta::IsNaN( psData->m_pAnswers->Get( i, j ) ) ) 00297 continue; 00298 00299 iBin = psData->m_pDat->Quantize( iOne, iTwo, psData->m_iZero ); 00300 if( iBin == -1 ) 00301 continue; 00302 00303 if( CMeta::IsNaN( adYes[ iIndex = ( j - i - 1 ) ] ) ) { 00304 adYes[ iIndex ] = dYes; 00305 adNo[ iIndex ] = dNo; } 00306 adNo[ iIndex ] += log( MatCPT.Get( iBin, 0 ) ); 00307 adYes[ iIndex ] += log( MatCPT.Get( iBin, 1 ) ); } 00308 psData->m_pNo->Set( i, adNo ); 00309 psData->m_pYes->Set( i, adYes ); } 00310 delete[] adYes; 00311 delete[] adNo; 00312 00313 return NULL; } 00314 00315 void* finalize( void* pData ) { 00316 SEvaluate* psData; 00317 size_t i, j; 00318 float* adYes; 00319 float* adNo; 00320 float dPrior; 00321 CDataMatrix MatCPT; 00322 00323 psData = (SEvaluate*)pData; 00324 psData->m_pBN->GetCPT( 0, MatCPT ); 00325 dPrior = MatCPT.Get( 1, 0 ); 00326 adYes = new float[ psData->m_pYes->GetGenes( ) ]; 00327 adNo = new float[ psData->m_pNo->GetGenes( ) ]; 00328 for( i = 0; i < psData->m_pYes->GetGenes( ); ++i ) { 00329 if( !( i % 1000 ) ) 00330 cerr << "F: " << psData->m_szName << ", " << i << endl; 00331 memcpy( adYes, psData->m_pYes->Get( i ), ( psData->m_pYes->GetGenes( ) - i - 1 ) * sizeof(*adYes) ); 00332 memcpy( adNo, psData->m_pNo->Get( i ), ( psData->m_pNo->GetGenes( ) - i - 1 ) * sizeof(*adNo) ); 00333 for( j = 0; j < ( psData->m_pYes->GetGenes( ) - i - 1 ); ++j ) 00334 adYes[ j ] = CMeta::IsNaN( adYes[ j ] ) ? dPrior : 00335 (float)( 1 / ( 1 + exp( (double)adNo[ j ] - (double)adYes[ j ] ) ) ); 00336 psData->m_pYes->Set( i, adYes ); } 00337 delete[] adNo; 00338 delete[] adYes; 00339 00340 return NULL; }