Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 int output_cograph( const char*, const CDat&, const CDat&, const CPCL&, float ); 00026 int explore_graphs( const gengetopt_args_info&, const CDat&, const CDat&, const CPCL& ); 00027 bool connectivity( size_t, const vector<size_t>&, const vector<float>&, const vector<size_t>&, 00028 float, size_t, float, size_t, const CDat&, float&, size_t&, float&, size_t& ); 00029 void max_connectivity( const vector<bool>&, const vector<size_t>&, const vector<float>&, 00030 const vector<size_t>&, float, size_t, float, size_t, size_t, const CDat&, float&, size_t&, float&, 00031 size_t&, float&, size_t& ); 00032 void output_datasets( const gengetopt_args_info&, const CDat&, const CDat&, const CPCL&, 00033 const vector<size_t>&, const vector<size_t>& ); 00034 00035 int main( int iArgs, char** aszArgs ) { 00036 gengetopt_args_info sArgs; 00037 CDat DatFunctions, DatData; 00038 CPCL PCLTrusts; 00039 int iRet; 00040 00041 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00042 cmdline_parser_print_help( ); 00043 return 1; } 00044 CMeta Meta( sArgs.verbosity_arg ); 00045 00046 if( !DatFunctions.Open( sArgs.functions_arg ) ) { 00047 cerr << "Could not open: " << sArgs.functions_arg << endl; 00048 return 1; } 00049 if( !DatData.Open( sArgs.datasets_arg ) ) { 00050 cerr << "Could not open: " << sArgs.datasets_arg << endl; 00051 return 1; } 00052 if( !PCLTrusts.Open( sArgs.trusts_arg, sArgs.skip_arg ) ) { 00053 cerr << "Could not open: " << sArgs.trusts_arg << endl; 00054 return 1; } 00055 DatFunctions.Normalize( sArgs.output_arg ? CDat::ENormalizeZScore : CDat::ENormalizeMinMax ); 00056 DatData.Normalize( sArgs.output_arg ? CDat::ENormalizeZScore : CDat::ENormalizeMinMax ); 00057 PCLTrusts.Normalize( sArgs.output_arg ? CPCL::ENormalizeZScore : CPCL::ENormalizeMinMax ); 00058 00059 iRet = sArgs.output_arg ? output_cograph( sArgs.output_arg, DatData, DatFunctions, PCLTrusts, 00060 (float)sArgs.adjust_data_arg ) : 00061 explore_graphs( sArgs, DatData, DatFunctions, PCLTrusts ); 00062 00063 return iRet; } 00064 00065 int output_cograph( const char* szOutput, const CDat& DatData, const CDat& DatFunctions, 00066 const CPCL& PCLTrusts, float dAdjustData ) { 00067 vector<string> vecstrNames; 00068 size_t i, j; 00069 vector<size_t> veciData, veciFunctions; 00070 CDat Dat; 00071 00072 vecstrNames.resize( DatFunctions.GetGenes( ) + DatData.GetGenes( ) ); 00073 for( i = 0; i < DatFunctions.GetGenes( ); ++i ) 00074 vecstrNames[ i ] = DatFunctions.GetGene( i ); 00075 for( i = 0; i < DatData.GetGenes( ); ++i ) 00076 vecstrNames[ DatFunctions.GetGenes( ) + i ] = DatData.GetGene( i ); 00077 Dat.Open( vecstrNames ); 00078 for( i = 0; i < DatFunctions.GetGenes( ); ++i ) 00079 for( j = ( i + 1 ); j < DatFunctions.GetGenes( ); ++j ) 00080 Dat.Set( i, j, DatFunctions.Get( i, j ) ); 00081 for( i = 0; i < DatData.GetGenes( ); ++i ) 00082 for( j = ( i + 1 ); j < DatData.GetGenes( ); ++j ) 00083 Dat.Set( DatFunctions.GetGenes( ) + i, DatFunctions.GetGenes( ) + j, DatData.Get( i, j ) + 00084 dAdjustData ); 00085 00086 veciFunctions.resize( PCLTrusts.GetGenes( ) ); 00087 veciData.resize( PCLTrusts.GetExperiments( ) ); 00088 for( i = 0; i < veciFunctions.size( ); ++i ) { 00089 veciFunctions[ i ] = Dat.GetGene( PCLTrusts.GetGene( i ) ); 00090 if( veciFunctions[ i ] == -1 ) { 00091 cerr << "Unknown function: " << PCLTrusts.GetGene( i ) << endl; 00092 return 1; } } 00093 for( i = 0; i < veciData.size( ); ++i ) { 00094 veciData[ i ] = Dat.GetGene( PCLTrusts.GetExperiment( i ) ); 00095 if( veciData[ i ] == -1 ) 00096 cerr << "Unknown dataset: " << PCLTrusts.GetExperiment( i ) << endl; } 00097 for( i = 0; i < veciFunctions.size( ); ++i ) { 00098 if( veciFunctions[ i ] == -1 ) 00099 continue; 00100 for( j = 0; j < veciData.size( ); ++j ) 00101 if( veciData[ j ] != -1 ) 00102 Dat.Set( veciFunctions[ i ], veciData[ j ], PCLTrusts.Get( i, j ) ); } 00103 Dat.Save( szOutput ); 00104 00105 return 0; } 00106 00107 struct SSeed { 00108 size_t m_iOne; 00109 size_t m_iTwo; 00110 float m_dIn; 00111 size_t m_iIn; 00112 float m_dTotal; 00113 size_t m_iTotal; 00114 float m_dRatio; 00115 00116 SSeed( size_t iOne, size_t iTwo, float dIn, size_t iIn, float dTotal, size_t iTotal, 00117 float dRatio ) : m_iOne(iOne), m_iTwo(iTwo), m_dIn(dIn), m_iIn(iIn), m_dTotal(dTotal), 00118 m_iTotal(iTotal), m_dRatio(dRatio) { } 00119 00120 bool operator<( const SSeed& sSeed ) const { 00121 00122 return ( m_dRatio < sSeed.m_dRatio ); } 00123 }; 00124 00125 int explore_graphs( const gengetopt_args_info& sArgs, const CDat& DatData, const CDat& DatFunctions, 00126 const CPCL& PCLTrusts ) { 00127 vector<bool> vecfCluster; 00128 vector<float> vecdConnectivity; 00129 vector<size_t> veciConnectivity, veciCluster, veciFunctions2Trusts; 00130 size_t i, j, iIn, iTotal, iMax, iMaxIn, iMaxTotal; 00131 size_t iClusters; 00132 float d, dMaxIn, dMaxTotal, dIn, dTotal, dRatio, dHeavy; 00133 CDat Dat; 00134 00135 Dat.Open( DatFunctions ); 00136 dHeavy = (float)( sArgs.heavy_arg ? sArgs.heavy_arg : ( 1 / sArgs.specificity_arg ) ); 00137 veciFunctions2Trusts.resize( DatFunctions.GetGenes( ) ); 00138 for( i = 0; i < veciFunctions2Trusts.size( ); ++i ) 00139 veciFunctions2Trusts[ i ] = PCLTrusts.GetGene( DatFunctions.GetGene( i ) ); 00140 00141 // veciConnectivity[ i ] contains the number of edges out of gene i 00142 // vecdConnectivity[ i ] contains the sum of edge weights out of gene i 00143 veciConnectivity.resize( Dat.GetGenes( ) ); 00144 vecdConnectivity.resize( Dat.GetGenes( ) ); 00145 for( i = 0; i < veciConnectivity.size( ); ++i ) 00146 for( j = ( i + 1 ); j < veciConnectivity.size( ); ++j ) 00147 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) { 00148 veciConnectivity[ i ]++; 00149 veciConnectivity[ j ]++; 00150 vecdConnectivity[ i ] += d; 00151 vecdConnectivity[ j ] += d; } 00152 00153 iClusters = 0; 00154 vecfCluster.resize( Dat.GetGenes( ) ); 00155 while( ( sArgs.subgraphs_arg == -1 ) || ( iClusters < (size_t)sArgs.subgraphs_arg ) ) { 00156 priority_queue<SSeed> pqueSeeds; 00157 00158 veciCluster.resize( 1 ); 00159 for( i = 0; i < Dat.GetGenes( ); ++i ) { 00160 veciCluster[ 0 ] = i; 00161 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00162 if( connectivity( j, veciCluster, vecdConnectivity, veciConnectivity, 0, 0, 00163 vecdConnectivity[ i ], veciConnectivity[ i ], Dat, dIn, iIn, dTotal, iTotal ) && 00164 ( ( dRatio = ( ( dIn / iIn ) / ( dTotal / iTotal ) ) ) >= sArgs.specificity_arg ) ) 00165 pqueSeeds.push( SSeed( i, j, dIn, iIn, dTotal, iTotal, dRatio ) ); } 00166 if( pqueSeeds.empty( ) ) 00167 break; 00168 cerr << "Seeds remaining: " << pqueSeeds.size( ) << endl; 00169 00170 do { 00171 const SSeed& sSeed = pqueSeeds.top( ); 00172 00173 for( i = 0; i < veciCluster.size( ); ++i ) 00174 vecfCluster[ veciCluster[ i ] ] = false; 00175 vecfCluster[ sSeed.m_iOne ] = vecfCluster[ sSeed.m_iTwo ] = true; 00176 veciCluster.resize( 2 ); 00177 veciCluster[ 0 ] = sSeed.m_iOne; 00178 veciCluster[ 1 ] = sSeed.m_iTwo; 00179 00180 cerr << "Cluster " << iClusters << " seed: " << Dat.GetGene( sSeed.m_iOne ) << ", " << 00181 Dat.GetGene( sSeed.m_iTwo ) << ", " << sSeed.m_dRatio << endl; 00182 dIn = sSeed.m_dIn; 00183 iIn = sSeed.m_iIn; 00184 dTotal = sSeed.m_dTotal; 00185 iTotal = sSeed.m_iTotal; 00186 while( true ) { 00187 cerr << "Cluster " << iClusters << ", " << veciCluster.size( ) << " genes" << endl; 00188 max_connectivity( vecfCluster, veciCluster, vecdConnectivity, veciConnectivity, dIn, 00189 iIn, dTotal, iTotal, -1, Dat, dRatio, iMax, dMaxIn, iMaxIn, dMaxTotal, iMaxTotal ); 00190 if( dRatio >= ( dHeavy * sSeed.m_dRatio ) ) { 00191 vecfCluster[ iMax ] = true; 00192 veciCluster.push_back( iMax ); 00193 dIn = dMaxIn; 00194 iIn = iMaxIn; 00195 dTotal = dMaxTotal; 00196 iTotal = iMaxTotal; } 00197 else 00198 break; } 00199 pqueSeeds.pop( ); } 00200 while( !pqueSeeds.empty( ) && ( veciCluster.size( ) < 3 ) ); 00201 if( veciCluster.size( ) < 3 ) 00202 break; 00203 00204 cerr << "Found cluster: " << ( ( dIn / iIn ) / ( dTotal / iTotal ) ) << " (" << dIn << '/' << 00205 iIn << ", " << dTotal << '/' << iTotal << ')' << endl; 00206 if( veciCluster.size( ) >= (size_t)sArgs.size_functions_arg ) { 00207 iClusters++; 00208 cout << ( ( dIn / iIn ) / ( dTotal / iTotal ) ); 00209 for( i = 0; i < veciCluster.size( ); ++i ) 00210 cout << '\t' << Dat.GetGene( veciCluster[ i ] ); 00211 cout << endl; 00212 output_datasets( sArgs, DatData, DatFunctions, PCLTrusts, veciCluster, veciFunctions2Trusts ); 00213 cout.flush( ); } 00214 00215 dRatio = dIn / iIn; 00216 for( i = 0; i < veciCluster.size( ); ++i ) 00217 for( j = ( i + 1 ); j < veciCluster.size( ); ++j ) 00218 if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) { 00219 dIn = min( dRatio, d ); 00220 Dat.Set( veciCluster[ i ], veciCluster[ j ], d - dIn ); 00221 vecdConnectivity[ veciCluster[ i ] ] -= dIn; 00222 vecdConnectivity[ veciCluster[ j ] ] -= dIn; } } 00223 00224 return 0; } 00225 00226 bool connectivity( size_t iNew, const vector<size_t>& veciCluster, 00227 const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn, 00228 size_t iIn, float dTotal, size_t iTotal, const CDat& Dat, float& dSumIn, size_t& iEdgesIn, 00229 float& dSumTotal, size_t& iEdgesTotal ) { 00230 size_t i; 00231 float d; 00232 bool fRet; 00233 00234 iEdgesTotal = iTotal + veciConnectivity[ iNew ]; 00235 dSumTotal = dTotal + vecdConnectivity[ iNew ]; 00236 iEdgesIn = iIn; 00237 dSumIn = dIn; 00238 fRet = false; 00239 for( i = 0; i < veciCluster.size( ); ++i ) 00240 if( !CMeta::IsNaN( d = Dat.Get( iNew, veciCluster[ i ] ) ) ) { 00241 fRet = true; 00242 iEdgesTotal--; 00243 dSumTotal -= d; 00244 iEdgesIn++; 00245 dSumIn += d; } 00246 00247 return fRet; } 00248 00249 void max_connectivity( const vector<bool>& vecfCluster, const vector<size_t>& veciCluster, 00250 const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn, 00251 size_t iIn, float dTotal, size_t iTotal, size_t iStart, const CDat& Dat, float& dMaxRatio, 00252 size_t& iMax, float& dMaxIn, size_t& iMaxIn, float& dMaxTotal, size_t& iMaxTotal ) { 00253 size_t i, iEdgesIn, iEdgesTotal; 00254 float dRatio, dSumIn, dSumTotal; 00255 00256 dMaxRatio = -FLT_MAX; 00257 for( iMax = 0,i = ( iStart == -1 ) ? 0 : ( iStart + 1 ); i < vecfCluster.size( ); ++i ) { 00258 if( vecfCluster[ i ] || !connectivity( i, veciCluster, vecdConnectivity, veciConnectivity, dIn, 00259 iIn, dTotal, iTotal, Dat, dSumIn, iEdgesIn, dSumTotal, iEdgesTotal ) ) 00260 continue; 00261 dRatio = ( ( dSumIn / iEdgesIn ) / ( dSumTotal / iEdgesTotal ) ); 00262 if( dRatio > dMaxRatio ) { 00263 dMaxRatio = dRatio; 00264 iMax = i; 00265 dMaxIn = dSumIn; 00266 iMaxIn = iEdgesIn; 00267 dMaxTotal = dSumTotal; 00268 iMaxTotal = iEdgesTotal; } } } 00269 00270 struct SColink { 00271 size_t m_iDataset; 00272 float m_dWeight; 00273 00274 SColink( size_t iDataset, float dWeight ) : m_iDataset(iDataset), m_dWeight(dWeight) { } 00275 00276 bool operator<( const SColink& sColink ) const { 00277 00278 return ( m_dWeight < sColink.m_dWeight ); } 00279 }; 00280 00281 void output_datasets( const gengetopt_args_info& sArgs, const CDat& DatData, const CDat& DatFunctions, 00282 const CPCL& PCLTrusts, const vector<size_t>& veciCluster, const vector<size_t>& veciFunctions2Trusts ) { 00283 size_t i, j, iFunction; 00284 float d; 00285 vector<size_t> veciData, veciTrusts2Data; 00286 00287 { 00288 priority_queue<SColink> pqueLinks; 00289 vector<float> vecdData; 00290 00291 vecdData.resize( PCLTrusts.GetExperiments( ) ); 00292 for( i = 0; i < veciCluster.size( ); ++i ) { 00293 if( ( iFunction = veciFunctions2Trusts[ veciCluster[ i ] ] ) == -1 ) 00294 continue; 00295 for( j = 0; j < PCLTrusts.GetExperiments( ); ++j ) 00296 if( !CMeta::IsNaN( d = PCLTrusts.Get( iFunction, j ) ) ) 00297 vecdData[ j ] += d; } 00298 00299 for( i = 0; i < vecdData.size( ); ++i ) 00300 pqueLinks.push( SColink( i, vecdData[ i ] ) ); 00301 00302 for( i = 0; ( i < (size_t)sArgs.size_datasets_arg ) && !pqueLinks.empty( ); ++i ) { 00303 cout << '\t' << pqueLinks.top( ).m_dWeight << '\t' << 00304 PCLTrusts.GetExperiment( pqueLinks.top( ).m_iDataset ) << endl; 00305 veciData.push_back( pqueLinks.top( ).m_iDataset ); 00306 pqueLinks.pop( ); } 00307 } 00308 00309 for( i = 0; i < veciCluster.size( ); ++i ) { 00310 if( veciFunctions2Trusts[ veciCluster[ i ] ] == -1 ) 00311 continue; 00312 for( j = ( i + 1 ); j < veciCluster.size( ); ++j ) 00313 if( veciFunctions2Trusts[ veciCluster[ j ] ] != -1 ) 00314 cout << "\t\t" << DatFunctions.GetGene( veciCluster[ i ] ) << '\t' << 00315 DatFunctions.GetGene( veciCluster[ j ] ) << '\t' << 00316 DatFunctions.Get( veciCluster[ i ], veciCluster[ j ] ) << endl; } 00317 for( i = 0; i < veciCluster.size( ); ++i ) { 00318 if( veciFunctions2Trusts[ veciCluster[ i ] ] == -1 ) 00319 continue; 00320 for( j = 0; j < veciData.size( ); ++j ) 00321 cout << "\t\t" << DatFunctions.GetGene( veciCluster[ i ] ) << '\t' << 00322 PCLTrusts.GetExperiment( veciData[ j ] ) << '\t' << 00323 PCLTrusts.Get( veciFunctions2Trusts[ veciCluster[ i ] ], veciData[ j ] ) << endl; } 00324 veciTrusts2Data.resize( PCLTrusts.GetExperiments( ) ); 00325 for( i = 0; i < veciTrusts2Data.size( ); ++i ) 00326 veciTrusts2Data[ i ] = DatData.GetGene( PCLTrusts.GetExperiment( i ) ); 00327 for( i = 0; i < veciData.size( ); ++i ) 00328 for( j = ( i + 1 ); j < veciData.size( ); ++j ) 00329 cout << "\t\t" << PCLTrusts.GetExperiment( veciData[ i ] ) << '\t' << 00330 PCLTrusts.GetExperiment( veciData[ j ] ) << '\t' << 00331 DatData.Get( veciTrusts2Data[ veciData[ i ] ], veciTrusts2Data[ veciData[ j ] ] ) << endl; }