Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 00026 static const char c_acDab[] = ".dab"; 00027 static const char c_acQDab[] = ".qdab"; 00028 00029 int main( int iArgs, char** aszArgs ) { 00030 gengetopt_args_info sArgs; 00031 CDataPair DatOne, DatTwo; 00032 size_t iDatOne, iDatTwo; 00033 size_t i, j, iGeneOne, iGeneTwo, iCountOne, iCountTwo; 00034 size_t iCountJoint, iJoint; 00035 size_t iRuns, inputNums; 00036 float iValueOne, iValueTwo; 00037 vector<size_t> veciGenesOne, veciGenesTwo, veciOne, veciTwo, veciDefaults, veciSizes; 00038 vector<vector<size_t> > vecveciJoint; 00039 vector<string> vecstrInputs, vecstrDatasets; 00040 map<string, size_t> mapZeros; 00041 float dOne, dJoint, dMI, dSubsample, dValueOne, dValueTwo; 00042 size_t non_zero_bins; 00043 DIR* dp; 00044 struct dirent* ep; 00045 const char* fileOne; 00046 const char* fileTwo; 00047 00048 set<string> setstrGenes; 00049 set<string>::const_iterator iterGene; 00050 00051 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00052 cmdline_parser_print_help( ); 00053 return 1; } 00054 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00055 00056 ostream* posm; 00057 ofstream ofsm; 00058 ifstream ifsm; 00059 bool opened_DatOne; 00060 CDat DatLk1; 00061 00062 // open output file 00063 if (sArgs.output_given){ 00064 ofsm.open(sArgs.output_arg); 00065 posm=&ofsm; 00066 } 00067 else{ 00068 posm=&cout; 00069 } 00070 00071 if (sArgs.datasets_given) { 00072 char acLine[ 1024 ]; 00073 vector<string> vecstrTok; 00074 string datstr; 00075 00076 ifsm.open(sArgs.datasets_arg); 00077 if ( !ifsm.is_open() ) { 00078 cerr << "Could not open: " << sArgs.datasets_arg << endl; 00079 return 1; 00080 } 00081 while ( !ifsm.eof() ) { 00082 ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 ); 00083 acLine[ ARRAYSIZE(acLine) -1 ] = 0; 00084 if ( !acLine[0] ) 00085 continue; 00086 vecstrTok.clear(); 00087 //CMeta::Tokenize( acLine, vecstrTok ); 00088 //if ( vecstrTok.size() < 2 ) { 00089 // cerr << "Illegal line: " << acLine << endl; 00090 // return 1; 00091 //} 00092 00093 datstr = string(acLine); 00094 vecstrDatasets.push_back(datstr); 00095 } 00096 } 00097 00098 00099 // now collect the data files from directory if given 00100 if(sArgs.directory_given){ 00101 dp = opendir (sArgs.directory_arg); 00102 if (dp != NULL){ 00103 while (ep = readdir (dp)){ 00104 if( strstr(ep->d_name, c_acDab) != NULL || 00105 strstr(ep->d_name, c_acQDab) != NULL 00106 ){ 00107 vecstrInputs.push_back((string)sArgs.directory_arg + "/" + ep->d_name); 00108 } 00109 } 00110 (void) closedir (dp); 00111 00112 inputNums = vecstrInputs.size(); 00113 00114 // sort by ASCI 00115 std::sort( vecstrInputs.begin(), vecstrInputs.end() ); 00116 00117 cerr << "Number of datasets: " << inputNums << '\n'; 00118 } 00119 else{ 00120 cerr << "Couldn't open the directory: " << sArgs.directory_arg << '\n'; 00121 return 1; 00122 } 00123 } 00124 else{ 00125 inputNums = sArgs.inputs_num; 00126 } 00127 00128 00129 // read in zeros file if given 00130 if( sArgs.zeros_given ) { 00131 ifstream ifsm; 00132 vector<string> vecstrZeros; 00133 char acLine[ 1024 ]; 00134 00135 ifsm.open( sArgs.zeros_arg ); 00136 if( !ifsm.is_open( ) ) { 00137 cerr << "Could not open: " << sArgs.zeros_arg << endl; 00138 return 1; } 00139 while( !ifsm.eof( ) ) { 00140 ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 ); 00141 acLine[ ARRAYSIZE(acLine) - 1 ] = 0; 00142 vecstrZeros.clear( ); 00143 CMeta::Tokenize( acLine, vecstrZeros ); 00144 if( vecstrZeros.empty( ) ) 00145 continue; 00146 mapZeros[ vecstrZeros[ 0 ] ] = atoi( vecstrZeros[ 1 ].c_str( ) ); 00147 } 00148 } 00149 00150 // now set default values if given in zeros file use the given default value 00151 // if not, use random value. A value of -1 for a given dataset index in veciDefaults 00152 // represents to use random values 00153 veciDefaults.resize( inputNums ); 00154 fill( veciDefaults.begin(), veciDefaults.end(), -1); 00155 if( sArgs.zeros_given ) 00156 for( i = 0; i < veciDefaults.size( ); ++i ) { 00157 map<string, size_t>::const_iterator iterZero; 00158 if( sArgs.inputs_num == 0 && 00159 ( iterZero = mapZeros.find( CMeta::Deextension( CMeta::Basename(vecstrInputs[ i ].c_str( ) ) ) ) ) != mapZeros.end( ) 00160 ){ 00161 veciDefaults[ i ] = iterZero->second; 00162 } 00163 else if( sArgs.inputs_num > 0 && 00164 ( iterZero = mapZeros.find( CMeta::Deextension( CMeta::Basename(sArgs.inputs[ i ]) ) ) ) != mapZeros.end( ) 00165 ){ 00166 veciDefaults[ i ] = iterZero->second; 00167 } 00168 } 00169 00170 00171 // open dab to filter edges for each dataset 00172 if( sArgs.edges_given ) { 00173 if( !DatLk1.Open( sArgs.edges_arg ) ) { 00174 cerr << "Could not open: " << sArgs.edges_arg << endl; 00175 return 1; } 00176 } 00177 00178 // now iterate through experiments to calculate MI 00179 for( iDatOne = iRuns = 0; iDatOne < ( sArgs.datasets_given ? vecstrDatasets.size() : inputNums ); ++iDatOne){ 00180 opened_DatOne = false; 00181 for( iDatTwo = ( sArgs.datasets_given ? 0 : iDatOne ); iDatTwo < inputNums; ++iDatTwo, ++iRuns){ 00182 00183 // ok skip runs not in range 00184 if(sArgs.start_arg != -1 && iRuns < sArgs.start_arg ){ 00185 continue; 00186 } 00187 if(sArgs.end_arg != -1 && iRuns >= sArgs.end_arg){ 00188 continue; 00189 } 00190 00191 if (iRuns % 50 == 0) 00192 cerr << "Processing " << iRuns << " among # of datasets: " << inputNums << '\n'; 00193 00194 // have I opned the first Data file? 00195 if(!opened_DatOne){ 00196 // now open first Dat 00197 if(sArgs.datasets_given) { 00198 fileOne = vecstrDatasets[iDatOne].c_str(); 00199 } 00200 else if(sArgs.directory_given){ 00201 fileOne = vecstrInputs[iDatOne].c_str(); 00202 } 00203 else{ 00204 fileOne = sArgs.inputs[ iDatOne ]; 00205 } 00206 00207 if(!DatOne.Open(fileOne, false) ) { 00208 cerr << "Could not open: " << fileOne << '\n'; 00209 return 1; } 00210 00211 // removed edges not wanted 00212 if( sArgs.edges_given ) { 00213 vector<size_t> veciGenesOne; 00214 size_t iOne, iTwo; 00215 float valx; 00216 00217 veciGenesOne.resize( DatOne.GetGenes( ) ); 00218 00219 for( i = 0; i < veciGenesOne.size( ); ++i ) 00220 veciGenesOne[ i ] = DatLk1.GetGene( DatOne.GetGene( i ) ); 00221 00222 for( i = 0; i < DatOne.GetGenes( ); ++i ) { 00223 00224 if( ( iOne = veciGenesOne[ i ] ) == -1 ) { 00225 for( j = ( i + 1 ); j < DatOne.GetGenes( ); ++j ) 00226 DatOne.Set( i, j, CMeta::GetNaN( ) ); 00227 continue; 00228 } 00229 00230 for( j = ( i + 1 ); j < DatOne.GetGenes( ); ++j ){ 00231 if( ( ( iTwo = veciGenesOne[ j ] ) == -1 ) || 00232 CMeta::IsNaN( (valx = DatLk1.Get( iOne, iTwo )) ) ){ 00233 DatOne.Set( i, j, CMeta::GetNaN( ) ); 00234 } 00235 } 00236 } 00237 } 00238 00239 DatOne.Quantize(); 00240 opened_DatOne = true; 00241 00242 veciOne.resize( DatOne.GetValues() ); 00243 00244 // laplace smoothingand also prevent devision by zeros 00245 fill( veciOne.begin( ), veciOne.end( ), 1 ); 00246 iCountOne = DatOne.GetValues(); 00247 00248 // When equal Only calculate self Entropy! 00249 if( iDatTwo == iDatOne ){ 00250 // now get the count distribution of experiment i 00251 for(i = 0; i < DatOne.GetGenes(); ++i){ 00252 for( j = ( i + 1 ); j < DatOne.GetGenes(); ++j ) { 00253 if( !CMeta::IsNaN( iValueOne = DatOne.Get(i, j) ) ){ 00254 veciOne[ (size_t)iValueOne ]++; 00255 iCountOne++; 00256 } 00257 } 00258 } 00259 00260 // compute the Entropy 00261 for( non_zero_bins = 0, dMI = 0,i = 0; i < veciOne.size( ); ++i ) 00262 if( dOne = (float)veciOne[ i ] / iCountOne ){ 00263 dMI += dOne * log( 1 / dOne ); 00264 ++non_zero_bins; 00265 } 00266 00267 //unbiased estimate correction 00268 dMI += ( non_zero_bins - 1 ) / ( 2.0f * ( iCountOne + iCountOne ) ); 00269 dMI /= log( 2.0f ); 00270 00271 *posm << fileOne << '\t' << fileOne << '\t' << dMI << '\n'; 00272 00273 // ok now lets skip to next dataset do some MI calculation 00274 continue; 00275 } 00276 } 00277 else{ 00278 // reset since now you have different number of genes 00279 // laplace smoothingand also prevent devision by zeros 00280 fill( veciOne.begin( ), veciOne.end( ), 1 ); 00281 00282 iCountOne = DatOne.GetValues(); 00283 } 00284 00285 // now open Second Dat 00286 if(sArgs.directory_given){ 00287 fileTwo = vecstrInputs[iDatTwo].c_str( ); 00288 } 00289 else{ 00290 fileTwo = sArgs.inputs[ iDatTwo ]; 00291 } 00292 00293 if( !DatTwo.Open( fileTwo, false) ) { 00294 cerr << "Could not open: " << fileTwo << '\n'; 00295 return 1; } 00296 00297 // removed edges not wanted 00298 if( sArgs.edges_given ) { 00299 vector<size_t> veciGenesTwo; 00300 size_t iOne, iTwo; 00301 float valx; 00302 00303 veciGenesTwo.resize( DatTwo.GetGenes( ) ); 00304 00305 for( i = 0; i < veciGenesTwo.size( ); ++i ) 00306 veciGenesTwo[ i ] = DatLk1.GetGene( DatTwo.GetGene( i ) ); 00307 00308 for( i = 0; i < DatTwo.GetGenes( ); ++i ) { 00309 if( ( iOne = veciGenesTwo[ i ] ) == -1 ) { 00310 for( j = ( i + 1 ); j < DatTwo.GetGenes( ); ++j ) 00311 DatTwo.Set( i, j, CMeta::GetNaN( ) ); 00312 continue; 00313 } 00314 for( j = ( i + 1 ); j < DatTwo.GetGenes( ); ++j ){ 00315 if( ( ( iTwo = veciGenesTwo[ j ] ) == -1 ) || 00316 CMeta::IsNaN( (valx = DatLk1.Get( iOne, iTwo )) ) ){ 00317 DatTwo.Set( i, j, CMeta::GetNaN( ) ); 00318 } 00319 } 00320 } 00321 } 00322 00323 DatTwo.Quantize(); 00324 veciTwo.resize( DatTwo.GetValues() ); 00325 00326 // laplace smoothingand also prevent devision by zeros 00327 fill( veciTwo.begin( ), veciTwo.end( ), 1 ); 00328 iCountTwo = DatTwo.GetValues(); 00329 00330 // create & initialize joint bins 00331 vecveciJoint.resize( veciOne.size( ) ); 00332 for( i = 0; i < vecveciJoint.size( ); ++i ) { 00333 vecveciJoint[ i ].resize( veciTwo.size( ) ); 00334 fill( vecveciJoint[ i ].begin( ), vecveciJoint[ i ].end( ), 0 ); 00335 } 00336 00337 // complie gene list from two dabs 00338 setstrGenes.clear(); 00339 for(i = 0; i < DatOne.GetGenes(); i++) 00340 setstrGenes.insert(DatOne.GetGene(i)); 00341 for(i = 0; i < DatTwo.GetGenes(); i++) 00342 setstrGenes.insert(DatTwo.GetGene(i)); 00343 00344 veciGenesOne.clear(); 00345 veciGenesTwo.clear(); 00346 veciGenesOne.resize(setstrGenes.size()); 00347 veciGenesTwo.resize(setstrGenes.size()); 00348 00349 // store if a gene belongs to a dataset 00350 for( iterGene = setstrGenes.begin( ),i = 0; iterGene != setstrGenes.end( ); ++iterGene,++i ){ 00351 veciGenesOne[ i ] = DatOne.GetGene(*iterGene); 00352 veciGenesTwo[ i ] = DatTwo.GetGene(*iterGene); 00353 } 00354 00355 for( i = iCountOne = iCountJoint = 0; i < setstrGenes.size(); ++i ) { 00356 for( j = i + 1; j < setstrGenes.size(); ++j ) { 00357 00358 // filter Dat with no edges in both dats 00359 if( sArgs.edges_given && 00360 veciGenesOne[ i ] != -1 && 00361 veciGenesOne[ j ] != -1 && 00362 CMeta::IsNaN( DatOne.Get(veciGenesOne[ i ], veciGenesOne[ j ]) ) && 00363 veciGenesTwo[ i ] != -1 && 00364 veciGenesTwo[ j ] != -1 && 00365 CMeta::IsNaN( DatTwo.Get(veciGenesTwo[ i ], veciGenesTwo[ j ]) ) 00366 ){ 00367 continue; 00368 } 00369 00370 // does this gene belong to DatOne? If not, randomize it. 00371 if ( veciGenesOne[ i ] == -1 || 00372 veciGenesOne[ j ] == -1 || 00373 CMeta::IsNaN( iValueOne = DatOne.Get(veciGenesOne[ i ], veciGenesOne[ j ]))){ 00374 00375 if( !sArgs.union_flag ) 00376 continue; 00377 else if( veciDefaults[ iDatOne ] == -1 ) 00378 iValueOne = rand( ) % DatOne.GetValues( ); 00379 else 00380 iValueOne = veciDefaults[ iDatOne ]; 00381 } 00382 00383 // does this gene belong to DatTwo? If not, randomize it. 00384 if(veciGenesTwo[ i ] == -1 || 00385 veciGenesTwo[ j ] == -1 || 00386 CMeta::IsNaN( iValueTwo = DatTwo.Get(veciGenesTwo[ i ], veciGenesTwo[ j ]) )){ 00387 00388 if( !sArgs.union_flag ) 00389 continue; 00390 else if( veciDefaults[ iDatTwo ] == -1 ) 00391 iValueTwo = rand( ) % DatTwo.GetValues( ); 00392 else 00393 iValueTwo = veciDefaults[ iDatTwo ]; 00394 } 00395 00396 iCountJoint++; 00397 iCountTwo++; 00398 iCountOne++; 00399 veciOne[ (size_t)iValueOne ]++; 00400 veciTwo[ (size_t)iValueTwo ]++; 00401 vecveciJoint[ (size_t)iValueOne ][ (size_t)iValueTwo ]++; 00402 } 00403 } 00404 00405 if(iCountJoint > 0){ 00406 // now calculate the MI between the two experiments 00407 for( non_zero_bins = 0, dMI = 0,i = 0; i < veciOne.size( ); ++i ) { 00408 dOne = (float)veciOne[ i ] / iCountOne; 00409 for( j = 0; j < veciTwo.size( ); ++j ) 00410 if( iJoint = vecveciJoint[ i ][ j ] ) { 00411 ++non_zero_bins; 00412 dJoint = (float)iJoint / iCountJoint; 00413 dMI += dJoint * log( dJoint * iCountTwo / dOne / veciTwo[ j ] ); 00414 } 00415 } 00416 00417 //unbiased estimate correction 00418 dMI += ( non_zero_bins - 1 ) / ( 2.0f * ( iCountOne + iCountTwo ) ); 00419 dMI = ( dMI < 0 ) ? 0 : ( dMI / log( 2.0f ) ); 00420 } 00421 else{ 00422 dMI = 0; 00423 } 00424 00425 *posm << fileOne << '\t' << fileTwo << '\t' << dMI << '\n'; 00426 } 00427 } 00428 00429 return 0; 00430 }