Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 struct SSorter { 00026 const vector<size_t>& m_veciSizes; 00027 00028 SSorter( const vector<size_t>& veciSizes ) : m_veciSizes(veciSizes) { } 00029 00030 bool operator()( size_t iOne, size_t iTwo ) const { 00031 00032 return ( m_veciSizes[ iOne ] > m_veciSizes[ iTwo ] ); 00033 } 00034 }; 00035 00036 float find_value( size_t iOne, size_t iTwo, const CDat* pDat ) 00037 { 00038 return ( ( ( iOne == -1 ) || ( iTwo == -1 ) ) ? CMeta::GetNaN( ) : pDat->Get( iOne, iTwo ) ); 00039 } 00040 00041 size_t find_value( float dValue, const CDataPair *pDat, size_t iDefault, bool fRandom ) 00042 { 00043 if( pDat->IsContinuous( ) ) 00044 return 0; 00045 00046 return ( CMeta::IsNaN( dValue ) ? ( fRandom ? ( rand( ) % pDat->GetValues( ) ) : iDefault ) : 00047 pDat->Quantize( dValue ) ); 00048 } 00049 00050 size_t find_value( size_t iOne, size_t iTwo, const CDataPair *pDat, size_t iDefault, bool fRandom ) 00051 { 00052 float fValue = find_value( iOne, iTwo, pDat ); 00053 size_t iValue = find_value( fValue, pDat, iDefault, fRandom ); 00054 return iValue; 00055 } 00056 00057 int main( int iArgs, char** aszArgs ) 00058 { 00059 gengetopt_args_info sArgs; 00060 vector<CDataPair*> vpDatsBigmem; 00061 vector<float*> vecpadVals; 00062 size_t iCountJoint, iJoint, iValueOne, iValueTwo; 00063 vector<size_t> veciDefaults, veciSizes; 00064 vector<string> vecstrInputs, vecstrGenes; 00065 float dSubsample; 00066 map<string, size_t> mapZeros; 00067 IMeasure* pMeasure; 00068 CMeasurePearson Pearson; 00069 CMeasureEuclidean Euclidean; 00070 CMeasureKendallsTau KendallsTau; 00071 CMeasureKolmogorovSmirnov KolmSmir; 00072 CMeasureHypergeometric Hypergeom; 00073 CMeasureQuickPearson PearQuick; 00074 CMeasureInnerProduct InnerProd; 00075 CMeasureBinaryInnerProduct BinInnerProd; 00076 00077 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00078 cmdline_parser_print_help( ); 00079 return 1; 00080 } 00081 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00082 00083 CMeasureSigmoid EuclideanSig( &Euclidean, false, 1.0f / sArgs.inputs_num ); 00084 IMeasure* apMeasures[] = { &Pearson, &EuclideanSig, &KendallsTau, 00085 &KolmSmir, &Hypergeom, &PearQuick, &InnerProd, 00086 &BinInnerProd, NULL 00087 }; 00088 00089 00090 vecstrInputs.resize( sArgs.inputs_num ); 00091 copy( sArgs.inputs, sArgs.inputs + sArgs.inputs_num, vecstrInputs.begin( ) ); 00092 if( sArgs.only_arg == -1 ) { 00093 vector<size_t> veciIndices, veciSizes; 00094 00095 veciSizes.resize( vecstrInputs.size( ) ); 00096 veciIndices.resize( vecstrInputs.size( ) ); 00097 for( size_t i = 0; i < vecstrInputs.size( ); ++i ) { 00098 CDat Dat; 00099 ifstream ifsm; 00100 00101 veciIndices[ i ] = i; 00102 ifsm.open( vecstrInputs[ i ].c_str( ) ); 00103 Dat.OpenGenes( ifsm, true ); 00104 veciSizes[ i ] = Dat.GetGenes( ); 00105 } 00106 sort( veciIndices.begin( ), veciIndices.end( ), SSorter( veciSizes ) ); 00107 CMeta::Permute( vecstrInputs, veciIndices ); 00108 } 00109 { 00110 CDataset Data; 00111 00112 Data.OpenGenes( vecstrInputs ); 00113 vecstrGenes.resize( Data.GetGenes( ) ); 00114 copy( Data.GetGeneNames( ).begin( ), Data.GetGeneNames( ).end( ), vecstrGenes.begin( ) ); 00115 } 00116 veciSizes.resize( vecstrInputs.size( ) ); 00117 for( size_t i = 0; i < veciSizes.size( ); ++i ) { 00118 CDataPair DatSize; 00119 DatSize.OpenQuants( vecstrInputs[ i ].c_str( ) ); 00120 veciSizes[ i ] = DatSize.GetValues( ); 00121 } 00122 00123 pMeasure = NULL; 00124 if( sArgs.distance_arg ) 00125 for( size_t i = 0; apMeasures[ i ]; ++i ) 00126 if( !strcmp( apMeasures[ i ]->GetName( ), sArgs.distance_arg ) ) { 00127 pMeasure = apMeasures[ i ]; 00128 break; 00129 } 00130 00131 if( sArgs.zeros_arg ) { 00132 ifstream ifsm; 00133 vector<string> vecstrZeros; 00134 char acLine[ 1024 ]; 00135 00136 ifsm.open( sArgs.zeros_arg ); 00137 if( !ifsm.is_open( ) ) { 00138 cerr << "Could not open: " << sArgs.zeros_arg << endl; 00139 return 1; 00140 } 00141 while( !ifsm.eof( ) ) { 00142 ifsm.getline( acLine, ARRAYSIZE(acLine) - 1 ); 00143 acLine[ ARRAYSIZE(acLine) - 1 ] = 0; 00144 vecstrZeros.clear( ); 00145 CMeta::Tokenize( acLine, vecstrZeros ); 00146 if( vecstrZeros.empty( ) ) 00147 continue; 00148 mapZeros[ vecstrZeros[ 0 ] ] = atoi( vecstrZeros[ 1 ].c_str( ) ); 00149 } 00150 } 00151 veciDefaults.resize( vecstrInputs.size( ) ); 00152 for( size_t i = 0; i < veciDefaults.size( ); ++i ) { 00153 map<string, size_t>::const_iterator iterZero; 00154 00155 if( ( iterZero = mapZeros.find( CMeta::Deextension( CMeta::Basename( 00156 vecstrInputs[ i ].c_str( ) ) ) ) ) != mapZeros.end( ) ) 00157 veciDefaults[ i ] = iterZero->second; 00158 else 00159 veciDefaults[ i ] = sArgs.randomize_flag ? -1 : 00160 ( sArgs.zero_flag ? 0 : veciSizes[ i ]++ ); 00161 } 00162 00163 dSubsample = ( sArgs.subsample_arg == -1 ) ? 1 : ( 2.0f * sArgs.subsample_arg / vecstrGenes.size( ) / ( vecstrGenes.size( ) - 1 ) ); 00164 vector<pair<string,string> > vecpairstrChosen; 00165 for( size_t i = 0; i < vecstrGenes.size( ); ++i ) { 00166 for( size_t j = ( i + 1 ); j < vecstrGenes.size( ); ++j ) { 00167 if( ( (float)rand( ) / RAND_MAX ) < dSubsample ) { 00168 vecpairstrChosen.push_back(make_pair(vecstrGenes[i], vecstrGenes[j])); 00169 } 00170 } 00171 } 00172 00173 if( sArgs.bigmem_flag ) { 00174 vecpadVals.resize( vecstrInputs.size( ), NULL); 00175 vpDatsBigmem.resize( vecstrInputs.size( ), NULL); 00176 #pragma omp parallel for num_threads(sArgs.threads_arg) 00177 for ( size_t iDat = 0; iDat < vecstrInputs.size( ); ++iDat ) { 00178 CDataPair* pDat = new CDataPair; 00179 #pragma omp critical 00180 { 00181 if( !( pDat->Open( vecstrInputs[ iDat ].c_str( ), false, !!sArgs.memmap_flag ) || 00182 pDat->Open( vecstrInputs[ iDat ].c_str( ), true, !!sArgs.memmap_flag ) ) ) { 00183 cerr << "Could not open: " << vecstrInputs[ iDat ] << endl; 00184 exit(1); 00185 } 00186 } 00187 if ( pMeasure ) { 00188 float* adVals = new float[ vecpairstrChosen.size( ) ]; 00189 for( size_t i = 0; i < vecpairstrChosen.size( ); ++i ) { 00190 pair<string, string> pairChosen = vecpairstrChosen[i]; 00191 size_t iGeneOne = pDat->GetGene( pairChosen.first ); 00192 size_t jGeneOne = pDat->GetGene( pairChosen.second ); 00193 float dValueOne = find_value( iGeneOne, jGeneOne, pDat ); 00194 adVals[ i ] = dValueOne; 00195 } 00196 vecpadVals[ iDat ] = adVals; 00197 delete pDat; 00198 } else { 00199 vpDatsBigmem[ iDat ] = pDat; 00200 } 00201 } 00202 } 00203 00204 00205 //Calculate pairwise measures for dataset 00206 vector<float> vecdMeasures( vecstrInputs.size( ) * ( vecstrInputs.size( ) - 1 ) / 2 + vecstrInputs.size( ), 0); 00207 size_t iCompletedPairs = 0; 00208 size_t iNumDatasets = vecstrInputs.size( ); 00209 for( size_t iDatOne = 0; iDatOne < iNumDatasets; ++iDatOne ) { 00210 cerr << "Processing " << iDatOne + 1 << '/' << iNumDatasets << endl; 00211 CDataPair* DatOne; 00212 if ( sArgs.bigmem_flag ) { 00213 if ( !pMeasure ) { 00214 DatOne = vpDatsBigmem[iDatOne]; 00215 } 00216 } 00217 else { 00218 DatOne = new CDataPair; 00219 if( !( DatOne->Open( vecstrInputs[ iDatOne ].c_str( ), false, !!sArgs.memmap_flag ) || 00220 DatOne->Open( vecstrInputs[ iDatOne ].c_str( ), true, !!sArgs.memmap_flag ) ) ) { 00221 cerr << "Could not open: " << vecstrInputs[ iDatOne ] << endl; 00222 return 1; 00223 } 00224 } 00225 #pragma omp parallel for num_threads(sArgs.threads_arg) 00226 for( size_t iDatTwo = iDatOne; iDatTwo < iNumDatasets; ++iDatTwo ) { //Change loop to also include self pair for all measures, simplifies code a great deal 00227 CDataPair* DatTwo; 00228 vector<vector<size_t> > vecveciJoint; 00229 vector<size_t> veciOne, veciTwo; 00230 size_t iCountOne, iCountTwo; 00231 float dMeasure = 0; 00232 float* adOne; 00233 float* adTwo; 00234 00235 //Load second dataset 00236 if ( sArgs.bigmem_flag ) { 00237 if ( !pMeasure ) { 00238 DatTwo = vpDatsBigmem[iDatTwo]; 00239 } 00240 } 00241 else { 00242 DatTwo = new CDataPair; 00243 if( !( DatTwo->Open( vecstrInputs[ iDatTwo ].c_str( ), false, !!sArgs.memmap_flag ) || 00244 DatTwo->Open( vecstrInputs[ iDatTwo ].c_str( ), true, !!sArgs.memmap_flag ) ) ) { 00245 cerr << "Could not open: " << vecstrInputs[ iDatTwo ] << endl; 00246 exit(1); 00247 } 00248 } 00249 00250 if( pMeasure ) {//Calculate with measure 00251 if ( sArgs.bigmem_flag ) { 00252 adOne = vecpadVals[ iDatOne ]; 00253 adTwo = vecpadVals[ iDatTwo ]; 00254 } 00255 else { 00256 adOne = new float[ vecpairstrChosen.size( ) ]; 00257 adTwo = new float[ vecpairstrChosen.size( ) ]; 00258 for( size_t i = 0; i < vecpairstrChosen.size( ); ++i ) { 00259 pair<string, string> pairChosen = vecpairstrChosen[i]; 00260 size_t iGeneOne = DatOne->GetGene( pairChosen.first ); 00261 size_t jGeneOne = DatOne->GetGene( pairChosen.second ); 00262 size_t iGeneTwo = DatTwo->GetGene( pairChosen.first ); 00263 size_t jGeneTwo = DatTwo->GetGene( pairChosen.second ); 00264 float dValueOne = find_value( iGeneOne, jGeneOne, DatOne ); 00265 float dValueTwo = find_value( iGeneTwo, jGeneTwo, DatTwo ); 00266 adOne[ i ] = dValueOne; 00267 adTwo[ i ] = dValueTwo; 00268 } 00269 } 00270 dMeasure = (float)pMeasure->Measure( adOne, vecpairstrChosen.size( ), adTwo, vecpairstrChosen.size( ) ); 00271 if ( !sArgs.bigmem_flag ) { 00272 delete[] adTwo; 00273 delete[] adOne; 00274 } 00275 } else {//use MI calculation 00276 veciOne.resize( veciSizes[ iDatOne ] ); 00277 fill( veciOne.begin( ), veciOne.end( ), 0 ); 00278 00279 veciTwo.resize( veciSizes[ iDatTwo ] ); 00280 fill( veciTwo.begin( ), veciTwo.end( ), 0 ); 00281 00282 vecveciJoint.resize( veciOne.size( ) ); 00283 for( size_t i = 0; i < vecveciJoint.size( ); ++i ) { 00284 vecveciJoint[ i ].resize( veciTwo.size( ) ); 00285 fill( vecveciJoint[ i ].begin( ), vecveciJoint[ i ].end( ), 0 ); 00286 } 00287 iCountOne = 0; 00288 iCountTwo = 0; 00289 iCountJoint = 0; 00290 for( size_t i = 0; i < vecpairstrChosen.size( ); ++i ) { 00291 pair<string, string> pairChosen = vecpairstrChosen[i]; 00292 size_t iGeneOne = DatOne->GetGene( pairChosen.first ); 00293 size_t jGeneOne = DatOne->GetGene( pairChosen.second ); 00294 size_t iGeneTwo = DatTwo->GetGene( pairChosen.first ); 00295 size_t jGeneTwo = DatTwo->GetGene( pairChosen.second ); 00296 float dValueOne = find_value( iGeneOne, jGeneOne, DatOne ); 00297 float dValueTwo = find_value( iGeneTwo, jGeneTwo, DatTwo ); 00298 //Get information for MI calculation 00299 if( ( iValueTwo = find_value( dValueTwo, DatTwo, veciDefaults[ iDatTwo ], !!sArgs.randomize_flag ) ) != -1 ) { 00300 iCountTwo++; 00301 veciTwo[ iValueTwo ]++; 00302 if( ( iValueOne = find_value( dValueOne, DatOne, veciDefaults[ iDatOne ], !!sArgs.randomize_flag ) ) != -1 ) { 00303 iCountOne++; 00304 iCountJoint++; 00305 vecveciJoint[ iValueOne ][ iValueTwo ]++; 00306 } 00307 } 00308 } 00309 00310 dMeasure = 0; 00311 for( size_t i = 0; i < veciOne.size( ); ++i ) { 00312 float dOne = (float)veciOne[ i ] / iCountOne; 00313 for( size_t j = 0; j < veciTwo.size( ); ++j ) 00314 if( iJoint = vecveciJoint[ i ][ j ] ) { 00315 float dJoint = (float)iJoint / iCountJoint; 00316 dMeasure += dJoint * log( dJoint * iCountTwo / dOne / veciTwo[ j ] ); 00317 } 00318 } 00319 // This corrects for bias introduced by empirical estimation: 00320 // http://robotics.stanford.edu/~gal/Research/Redundancy-Reduction/Neuron_suppl/node2.html 00321 dMeasure -= ( veciOne.size( ) - 1 ) * ( veciTwo.size( ) - 1 ) / ( 2.0f * ( iCountOne + 00322 iCountTwo ) ); 00323 dMeasure = ( dMeasure < 0 ) ? 0 : ( dMeasure / log( 2.0f ) ); 00324 } 00325 00326 //store for later output 00327 vecdMeasures[ iCompletedPairs + iDatTwo - iDatOne ] = dMeasure; 00328 if ( !sArgs.bigmem_flag ) { //Clean up after ourselves if we're not bigmem 00329 if ( iDatOne != iDatTwo ) { //don't delete DatOne 00330 delete DatTwo; 00331 } 00332 } 00333 } 00334 iCompletedPairs += iNumDatasets - iDatOne; 00335 if ( !sArgs.bigmem_flag ) { 00336 delete DatOne; //Clean up after ourselves 00337 } 00338 } 00339 00340 //Now on to output 00341 iCompletedPairs = 0; 00342 if( sArgs.table_flag ) { // Write out table 00343 for ( size_t iDat = 0; iDat < iNumDatasets; ++iDat ) { 00344 cout << '\t' << vecstrInputs[ iDat ]; 00345 } 00346 cout << endl; 00347 for ( size_t iDatOne = 0; iDatOne < iNumDatasets; ++iDatOne) { 00348 cout << vecstrInputs[ iDatOne ]; 00349 for ( size_t iDatTwo = 0; iDatTwo < iNumDatasets; ++iDatTwo) { 00350 if ( iDatTwo < iDatOne ) { 00351 cout << '\t'; 00352 } 00353 else { 00354 cout << '\t' << vecdMeasures[ iCompletedPairs + iDatTwo - iDatOne ]; 00355 } 00356 } 00357 cout << endl; 00358 iCompletedPairs += iNumDatasets - iDatOne; 00359 } 00360 } 00361 else { // write as pairs 00362 for ( size_t iDatOne = 0; iDatOne < iNumDatasets; ++iDatOne) { 00363 for ( size_t iDatTwo = iDatOne; iDatTwo < iNumDatasets; ++iDatTwo) { 00364 cout << vecstrInputs[ iDatOne ] << '\t' << vecstrInputs[ iDatTwo ] << '\t' << vecdMeasures[ iCompletedPairs + iDatTwo - iDatOne ] << endl; 00365 } 00366 iCompletedPairs += iNumDatasets - iDatOne; 00367 } 00368 } 00369 00370 return 0; 00371 }