Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 int cliques( const gengetopt_args_info&, const CDat&, const CDat&, const vector<size_t>& ); 00026 int heavy( const gengetopt_args_info&, CDat&, const CDat&, const vector<size_t>& ); 00027 int heavy2( const gengetopt_args_info&, CDat&, const CDat&, const vector<size_t>& ); 00028 int motifs( const gengetopt_args_info&, CDat& ); 00029 bool connectivity( size_t, const vector<size_t>&, const vector<float>&, const vector<size_t>&, 00030 float, size_t, float, size_t, const CDat&, float&, size_t&, float&, size_t& ); 00031 void max_connectivity( const vector<bool>&, const vector<size_t>&, const vector<float>&, 00032 const vector<size_t>&, float, size_t, float, size_t, size_t, const CDat&, float&, size_t&, float&, 00033 size_t&, float&, size_t& ); 00034 00035 int main( int iArgs, char** aszArgs ) { 00036 gengetopt_args_info sArgs; 00037 CDat Dat, DatKnowns; 00038 vector<size_t> veciGenes, veciKnowns; 00039 size_t i, j; 00040 float d; 00041 int iRet; 00042 00043 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00044 cmdline_parser_print_help( ); 00045 return 1; } 00046 if( sArgs.cutoff_arg < -1e-20 ) 00047 sArgs.cutoff_arg = CMeta::GetNaN( ); 00048 CMeta Meta( sArgs.verbosity_arg ); 00049 00050 if( sArgs.input_arg ) { 00051 if( !Dat.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag && 00052 !sArgs.heavy_arg && !sArgs.cutoff_arg ) ) { 00053 cerr << "Could not open: " << sArgs.input_arg << endl; 00054 return 1; } } 00055 else if( !Dat.Open( cin, CDat::EFormatText ) ) { 00056 cerr << "Could not open input" << endl; 00057 return 1; } 00058 if( sArgs.normalize_flag ) 00059 Dat.Normalize( CDat::ENormalizeSigmoid ); 00060 if( !CMeta::IsNaN( sArgs.cutoff_arg ) ) 00061 for( i = 0; i < Dat.GetGenes( ); ++i ) 00062 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00063 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) && ( d < sArgs.cutoff_arg ) ) 00064 Dat.Set( i, j, CMeta::GetNaN( ) ); 00065 if( sArgs.knowns_arg ) { 00066 if( !DatKnowns.Open( sArgs.knowns_arg, !!sArgs.memmap_flag ) ) { 00067 cerr << "Could not open: " << sArgs.knowns_arg << endl; 00068 return 1; } 00069 veciKnowns.resize( Dat.GetGenes( ) ); 00070 for( i = 0; i < veciKnowns.size( ); ++i ) 00071 veciKnowns[ i ] = DatKnowns.GetGene( Dat.GetGene( i ) ); } 00072 00073 iRet = sArgs.motifs_arg ? motifs( sArgs, Dat ) : ( sArgs.heavy_arg ? 00074 heavy2( sArgs, Dat, DatKnowns, veciKnowns ) : 00075 cliques( sArgs, Dat, DatKnowns, veciKnowns ) ); 00076 00077 return iRet; } 00078 00079 int cliques( const gengetopt_args_info& sArgs, const CDat& Dat, const CDat& DatKnowns, 00080 const vector<size_t>& veciKnowns ) { 00081 vector<pair<vector<size_t>,float> > vecprCliques; 00082 vector<size_t> veciGenes; 00083 float d, dCur; 00084 size_t i, j, iOne, iTwo, iMin; 00085 int iCol; 00086 00087 vecprCliques.resize( sArgs.subgraphs_arg ); 00088 for( iMin = i = 0; i < vecprCliques.size( ); ++i ) { 00089 vecprCliques[ i ].first.resize( sArgs.size_arg ); 00090 vecprCliques[ i ].second = -FLT_MAX; } 00091 veciGenes.resize( sArgs.size_arg ); 00092 for( i = 0; i < veciGenes.size( ); ++i ) 00093 veciGenes[ i ] = i; 00094 while( veciGenes[ 0 ] <= ( Dat.GetGenes( ) - veciGenes.size( ) ) ) { 00095 if( !rand( ) && ( ( (float)rand( ) / RAND_MAX ) < 1e-3 ) ) { 00096 for( i = 0; i < veciGenes.size( ); ++i ) 00097 cerr << veciGenes[ i ] << ','; 00098 cerr << endl; } 00099 for( dCur = 0,i = 0; i < veciGenes.size( ); ++i ) { 00100 iOne = sArgs.knowns_arg ? veciKnowns[ veciGenes[ i ] ] : -1; 00101 for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) { 00102 if( ( iOne != -1 ) && ( ( iTwo = veciKnowns[ veciGenes[ j ] ] ) != -1 ) && 00103 !CMeta::IsNaN( d = DatKnowns.Get( iOne, iTwo ) ) ) 00104 continue; 00105 if( !CMeta::IsNaN( d = Dat.Get( veciGenes[ i ], veciGenes[ j ] ) ) ) 00106 dCur += d; } } 00107 if( dCur > vecprCliques[ iMin ].second ) { 00108 00109 copy( veciGenes.begin( ), veciGenes.end( ), vecprCliques[ iMin ].first.begin( ) ); 00110 vecprCliques[ iMin ].second = dCur; 00111 for( iMin = i = 0; i < vecprCliques.size( ); ++i ) 00112 if( vecprCliques[ i ].second < vecprCliques[ iMin ].second ) 00113 iMin = i; } 00114 for( i = 0; i < veciGenes.size( ); ++i ) 00115 if( ++veciGenes[ veciGenes.size( ) - 1 - i ] < ( Dat.GetGenes( ) - i ) ) { 00116 for( iCol = ( (int)veciGenes.size( ) - (int)i ); ( iCol > 0 ) && 00117 ( (size_t)iCol < veciGenes.size( ) ); ++iCol ) 00118 veciGenes[ iCol ] = veciGenes[ iCol - 1 ] + 1; 00119 break; } } 00120 00121 for( i = 0; i < vecprCliques.size( ); ++i ) { 00122 cout << vecprCliques[ i ].second; 00123 for( j = 0; j < vecprCliques[ i ].first.size( ); ++j ) 00124 cout << '\t' << Dat.GetGene( vecprCliques[ i ].first[ j ] ); 00125 cout << endl; } 00126 00127 return 0; } 00128 00129 struct SSeed { 00130 size_t m_iOne; 00131 size_t m_iTwo; 00132 float m_dIn; 00133 size_t m_iIn; 00134 float m_dTotal; 00135 size_t m_iTotal; 00136 float m_dRatio; 00137 00138 SSeed( size_t iOne, size_t iTwo, float dIn, size_t iIn, float dTotal, size_t iTotal, 00139 float dRatio ) : m_iOne(iOne), m_iTwo(iTwo), m_dIn(dIn), m_iIn(iIn), m_dTotal(dTotal), 00140 m_iTotal(iTotal), m_dRatio(dRatio) { } 00141 00142 bool operator<( const SSeed& sSeed ) const { 00143 00144 return ( m_dRatio < sSeed.m_dRatio ); } 00145 }; 00146 00147 int heavy( const gengetopt_args_info& sArgs, CDat& Dat, const CDat& DatKnowns, 00148 const vector<size_t>& veciKnowns ) { 00149 vector<bool> vecfCluster; 00150 vector<float> vecdConnectivity; 00151 vector<size_t> veciConnectivity, veciCluster; 00152 size_t i, j, iIn, iTotal, iMax, iMaxIn, iMaxTotal; 00153 size_t iClusters; 00154 float d, dMaxIn, dMaxTotal, dIn, dTotal, dRatio; 00155 00156 // veciConnectivity[ i ] contains the number of edges out of gene i 00157 // vecdConnectivity[ i ] contains the sum of edge weights out of gene i 00158 veciConnectivity.resize( Dat.GetGenes( ) ); 00159 vecdConnectivity.resize( Dat.GetGenes( ) ); 00160 for( i = 0; i < veciConnectivity.size( ); ++i ) 00161 for( j = ( i + 1 ); j < veciConnectivity.size( ); ++j ) 00162 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) { 00163 veciConnectivity[ i ]++; 00164 veciConnectivity[ j ]++; 00165 vecdConnectivity[ i ] += d; 00166 vecdConnectivity[ j ] += d; } 00167 00168 cout << sArgs.heavy_arg << endl; 00169 iClusters = 0; 00170 vecfCluster.resize( Dat.GetGenes( ) ); 00171 while( ( sArgs.subgraphs_arg == -1 ) || ( iClusters < (size_t)sArgs.subgraphs_arg ) ) { 00172 priority_queue<SSeed> pqueSeeds; 00173 00174 veciCluster.resize( 1 ); 00175 for( i = 0; i < Dat.GetGenes( ); ++i ) { 00176 veciCluster[ 0 ] = i; 00177 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00178 if( connectivity( j, veciCluster, vecdConnectivity, veciConnectivity, 0, 0, 00179 vecdConnectivity[ i ], veciConnectivity[ i ], Dat, dIn, iIn, dTotal, iTotal ) && 00180 ( ( dRatio = ( ( dIn / iIn ) / ( dTotal / iTotal ) ) ) >= sArgs.specificity_arg ) ) 00181 pqueSeeds.push( SSeed( i, j, dIn, iIn, dTotal, iTotal, dRatio ) ); } 00182 if( pqueSeeds.empty( ) ) 00183 break; 00184 cerr << "Seeds remaining: " << pqueSeeds.size( ) << endl; 00185 00186 do { 00187 const SSeed& sSeed = pqueSeeds.top( ); 00188 00189 fill( vecfCluster.begin( ), vecfCluster.end( ), false ); 00190 vecfCluster[ sSeed.m_iOne ] = vecfCluster[ sSeed.m_iTwo ] = true; 00191 veciCluster.resize( 2 ); 00192 veciCluster[ 0 ] = sSeed.m_iOne; 00193 veciCluster[ 1 ] = sSeed.m_iTwo; 00194 00195 cerr << "Cluster " << iClusters << " seed: " << Dat.GetGene( sSeed.m_iOne ) << ", " << 00196 Dat.GetGene( sSeed.m_iTwo ) << ", " << sSeed.m_dRatio << endl; 00197 dIn = sSeed.m_dIn; 00198 iIn = sSeed.m_iIn; 00199 dTotal = sSeed.m_dTotal; 00200 iTotal = sSeed.m_iTotal; 00201 while( true ) { 00202 cerr << "Cluster " << iClusters << ", " << veciCluster.size( ) << " genes" << endl; 00203 max_connectivity( vecfCluster, veciCluster, vecdConnectivity, veciConnectivity, dIn, 00204 iIn, dTotal, iTotal, -1, Dat, dRatio, iMax, dMaxIn, iMaxIn, dMaxTotal, iMaxTotal ); 00205 if( ( dRatio > 1 ) && ( dRatio >= ( sArgs.heavy_arg * sSeed.m_dRatio ) ) ) { 00206 vecfCluster[ iMax ] = true; 00207 veciCluster.push_back( iMax ); 00208 dIn = dMaxIn; 00209 iIn = iMaxIn; 00210 dTotal = dMaxTotal; 00211 iTotal = iMaxTotal; } 00212 else 00213 break; } 00214 pqueSeeds.pop( ); } 00215 while( !pqueSeeds.empty( ) && ( veciCluster.size( ) < 3 ) ); 00216 if( veciCluster.size( ) < 3 ) 00217 break; 00218 00219 cerr << "Found cluster: " << ( ( dIn / iIn ) / ( dTotal / iTotal ) ) << " (" << dIn << '/' << 00220 iIn << ", " << dTotal << '/' << iTotal << ')' << endl; 00221 iClusters++; 00222 cout << ( ( dIn / iIn ) / ( dTotal / iTotal ) ); 00223 for( i = 0; i < veciCluster.size( ); ++i ) 00224 cout << '\t' << Dat.GetGene( veciCluster[ i ] ); 00225 cout << endl; 00226 cout.flush( ); 00227 00228 dRatio = dIn / iIn; 00229 for( i = 0; i < veciCluster.size( ); ++i ) 00230 for( j = ( i + 1 ); j < veciCluster.size( ); ++j ) 00231 if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) { 00232 dIn = min( dRatio, d ); 00233 Dat.Set( veciCluster[ i ], veciCluster[ j ], d - dIn ); 00234 vecdConnectivity[ veciCluster[ i ] ] -= dIn; 00235 vecdConnectivity[ veciCluster[ j ] ] -= dIn; } } 00236 00237 return 0; } 00238 00239 bool connectivity( size_t iNew, const vector<size_t>& veciCluster, 00240 const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn, 00241 size_t iIn, float dTotal, size_t iTotal, const CDat& Dat, float& dSumIn, size_t& iEdgesIn, 00242 float& dSumTotal, size_t& iEdgesTotal ) { 00243 size_t i; 00244 float d; 00245 bool fRet; 00246 00247 iEdgesTotal = iTotal + veciConnectivity[ iNew ]; 00248 dSumTotal = dTotal + vecdConnectivity[ iNew ]; 00249 iEdgesIn = iIn; 00250 dSumIn = dIn; 00251 fRet = false; 00252 for( i = 0; i < veciCluster.size( ); ++i ) 00253 if( !CMeta::IsNaN( d = Dat.Get( iNew, veciCluster[ i ] ) ) ) { 00254 fRet = true; 00255 iEdgesTotal--; 00256 dSumTotal -= d; 00257 iEdgesIn++; 00258 dSumIn += d; } 00259 00260 return fRet; } 00261 00262 void max_connectivity( const vector<bool>& vecfCluster, const vector<size_t>& veciCluster, 00263 const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn, 00264 size_t iIn, float dTotal, size_t iTotal, size_t iStart, const CDat& Dat, float& dMaxRatio, 00265 size_t& iMax, float& dMaxIn, size_t& iMaxIn, float& dMaxTotal, size_t& iMaxTotal ) { 00266 size_t i, iEdgesIn, iEdgesTotal; 00267 float dRatio, dSumIn, dSumTotal; 00268 00269 dMaxRatio = -FLT_MAX; 00270 for( iMax = 0,i = ( iStart == -1 ) ? 0 : ( iStart + 1 ); i < vecfCluster.size( ); ++i ) { 00271 if( vecfCluster[ i ] || !connectivity( i, veciCluster, vecdConnectivity, veciConnectivity, dIn, 00272 iIn, dTotal, iTotal, Dat, dSumIn, iEdgesIn, dSumTotal, iEdgesTotal ) ) 00273 continue; 00274 dRatio = ( ( dSumIn / iEdgesIn ) / ( dSumTotal / iEdgesTotal ) ); 00275 if( dRatio > dMaxRatio ) { 00276 dMaxRatio = dRatio; 00277 iMax = i; 00278 dMaxIn = dSumIn; 00279 iMaxIn = iEdgesIn; 00280 dMaxTotal = dSumTotal; 00281 iMaxTotal = iEdgesTotal; } } } 00282 00283 int heavy2( const gengetopt_args_info& sArgs, CDat& Dat, const CDat& DatKnowns, 00284 const vector<size_t>& veciKnowns ) { 00285 size_t i, j, iClusters, iMax; 00286 float d, dCur, dMax; 00287 vector<size_t> veciCluster, veciScores; 00288 vector<pair<size_t, size_t> > vecpriiSeeds; 00289 vector<float> vecdScores; 00290 vector<bool> vecfCluster; 00291 00292 for( i = 0; i < Dat.GetGenes( ); ++i ) 00293 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00294 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) && ( d >= sArgs.specificity_arg ) ) 00295 vecpriiSeeds.push_back( pair<size_t, size_t>( i, j ) ); 00296 00297 cout << sArgs.heavy_arg << endl; 00298 iClusters = 0; 00299 veciScores.resize( Dat.GetGenes( ) ); 00300 vecdScores.resize( Dat.GetGenes( ) ); 00301 vecfCluster.resize( Dat.GetGenes( ) ); 00302 while( ( sArgs.subgraphs_arg == -1 ) || ( iClusters < (size_t)sArgs.subgraphs_arg ) ) { 00303 priority_queue<SSeed> pqueSeeds; 00304 bool fHit; 00305 00306 for( i = 0; i < vecpriiSeeds.size( ); ++i ) 00307 if( ( d = Dat.Get( vecpriiSeeds[ i ].first, vecpriiSeeds[ i ].second ) ) >= sArgs.specificity_arg ) 00308 pqueSeeds.push( SSeed( vecpriiSeeds[ i ].first, vecpriiSeeds[ i ].second, d, 1, 0, 0, d ) ); 00309 cerr << "Seeds remaining: " << pqueSeeds.size( ) << endl; 00310 00311 for( fHit = false; !pqueSeeds.empty( ); ) { 00312 const SSeed& sSeed = pqueSeeds.top( ); 00313 00314 fill( vecfCluster.begin( ), vecfCluster.end( ), false ); 00315 vecfCluster[ sSeed.m_iOne ] = vecfCluster [ sSeed.m_iTwo ] = true; 00316 veciCluster.resize( 2 ); 00317 veciCluster[ 0 ] = sSeed.m_iOne; 00318 veciCluster[ 1 ] = sSeed.m_iTwo; 00319 00320 cerr << "Cluster " << iClusters << " seed: " << Dat.GetGene( sSeed.m_iOne ) << ", " << 00321 Dat.GetGene( sSeed.m_iTwo ) << ", " << sSeed.m_dRatio << endl; 00322 for( i = 0; i < Dat.GetGenes( ); ++i ) { 00323 vecdScores[ i ] = 0; 00324 for( veciScores[ i ] = j = 0; j < veciCluster.size( ); ++j ) 00325 if( !CMeta::IsNaN( d = Dat.Get( i, veciCluster[ j ] ) ) ) { 00326 vecdScores[ i ] += d; 00327 veciScores[ i ]++; } } 00328 while( true ) { 00329 cerr << "Cluster " << iClusters << ", " << veciCluster.size( ) << " genes" << endl; 00330 for( dMax = 0,iMax = i = 0; i < Dat.GetGenes( ); ++i ) { 00331 if( vecfCluster[ i ] || !veciScores[ i ] ) 00332 continue; 00333 if( ( dCur = ( vecdScores[ i ] / veciScores[ i ] ) ) > dMax ) { 00334 dMax = dCur; 00335 iMax = i; } } 00336 if( dMax < ( sArgs.heavy_arg * sSeed.m_dRatio ) ) 00337 break; 00338 for( i = 0; i < Dat.GetGenes( ); ++i ) 00339 if( !CMeta::IsNaN( d = Dat.Get( i, iMax ) ) ) { 00340 vecdScores[ i ] += d; 00341 veciScores[ i ]++; } 00342 vecfCluster[ iMax ] = true; 00343 veciCluster.push_back( iMax ); } 00344 if( veciCluster.size( ) < 3 ) { 00345 pqueSeeds.pop( ); 00346 continue; } 00347 00348 fHit = true; 00349 for( dMax = 0, iMax = i = 0; i < veciCluster.size( ); ++i ) 00350 for( j = ( i + 1 ); j < veciCluster.size( ); ++j ) 00351 if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) { 00352 iMax++; 00353 dMax += d; } 00354 00355 cerr << "Found cluster: " << ( dMax /= iMax ) << endl; 00356 iClusters++; 00357 cout << dMax; 00358 for( i = 0; i < veciCluster.size( ); ++i ) 00359 cout << '\t' << Dat.GetGene( veciCluster[ i ] ); 00360 cout << endl; 00361 cout.flush( ); 00362 00363 for( i = 0; i < veciCluster.size( ); ++i ) 00364 for( j = ( i + 1 ); j < veciCluster.size( ); ++j ) 00365 if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) { 00366 dCur = min( dMax, d ); 00367 Dat.Set( veciCluster[ i ], veciCluster[ j ], d - dCur ); } 00368 break; } 00369 if( !fHit ) 00370 break; } 00371 00372 return 0; } 00373 00374 int motifs( const gengetopt_args_info& sArgs, CDat& Dat ) { 00375 size_t i, j, k, iOne, iTwo, iThree; 00376 vector<size_t> veciOne, veciTwo; 00377 vector<bool> vecfSign; 00378 float dOne, dTwo, dThree; 00379 00380 for( i = 0; i < Dat.GetGenes( ); ++i ) 00381 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00382 if( !CMeta::IsNaN( dOne = Dat.Get( i, j ) ) && ( fabs( dOne ) >= sArgs.motifs_arg ) ) { 00383 iOne = ( dOne >= 0 ) ? 1 : 0; 00384 for( k = ( j + 1 ); k < Dat.GetGenes( ); ++k ) 00385 if( !CMeta::IsNaN( dTwo = Dat.Get( j, k ) ) && ( fabs( dTwo ) >= sArgs.motifs_arg ) && 00386 !CMeta::IsNaN( dThree = Dat.Get( i, k ) ) && ( fabs( dThree ) >= sArgs.motifs_arg ) ) { 00387 iTwo = ( dTwo >= 0 ) ? 1 : 0; 00388 iThree = ( dThree >= 0 ) ? 1 : 0; 00389 if( ( iOne + iTwo + iThree ) == 1 ) { 00390 cout << Dat.GetGene( i ) << '\t' << dOne << '\t' << 00391 Dat.GetGene( j ) << '\t' << dTwo << '\t' << 00392 Dat.GetGene( k ) << '\t' << dThree << endl; } } } 00393 00394 return 0; }