Sleipnir
tools/Funcographer/Funcographer.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 int output_cograph( const char*, const CDat&, const CDat&, const CPCL&, float );
00026 int explore_graphs( const gengetopt_args_info&, const CDat&, const CDat&, const CPCL& );
00027 bool connectivity( size_t, const vector<size_t>&, const vector<float>&, const vector<size_t>&,
00028     float, size_t, float, size_t, const CDat&, float&, size_t&, float&, size_t& );
00029 void max_connectivity( const vector<bool>&, const vector<size_t>&, const vector<float>&,
00030     const vector<size_t>&, float, size_t, float, size_t, size_t, const CDat&, float&, size_t&, float&,
00031     size_t&, float&, size_t& );
00032 void output_datasets( const gengetopt_args_info&, const CDat&, const CDat&, const CPCL&,
00033     const vector<size_t>&, const vector<size_t>& );
00034 
00035 int main( int iArgs, char** aszArgs ) {
00036     gengetopt_args_info sArgs;
00037     CDat                DatFunctions, DatData;
00038     CPCL                PCLTrusts;
00039     int                 iRet;
00040 
00041     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00042         cmdline_parser_print_help( );
00043         return 1; }
00044     CMeta Meta( sArgs.verbosity_arg );
00045 
00046     if( !DatFunctions.Open( sArgs.functions_arg ) ) {
00047         cerr << "Could not open: " << sArgs.functions_arg << endl;
00048         return 1; }
00049     if( !DatData.Open( sArgs.datasets_arg ) ) {
00050         cerr << "Could not open: " << sArgs.datasets_arg << endl;
00051         return 1; }
00052     if( !PCLTrusts.Open( sArgs.trusts_arg, sArgs.skip_arg ) ) {
00053         cerr << "Could not open: " << sArgs.trusts_arg << endl;
00054         return 1; }
00055     DatFunctions.Normalize( sArgs.output_arg ? CDat::ENormalizeZScore : CDat::ENormalizeMinMax );
00056     DatData.Normalize( sArgs.output_arg ? CDat::ENormalizeZScore : CDat::ENormalizeMinMax );
00057     PCLTrusts.Normalize( sArgs.output_arg ? CPCL::ENormalizeZScore : CPCL::ENormalizeMinMax );
00058 
00059     iRet = sArgs.output_arg ? output_cograph( sArgs.output_arg, DatData, DatFunctions, PCLTrusts,
00060             (float)sArgs.adjust_data_arg ) :
00061             explore_graphs( sArgs, DatData, DatFunctions, PCLTrusts );
00062 
00063     return iRet; }
00064 
00065 int output_cograph( const char* szOutput, const CDat& DatData, const CDat& DatFunctions,
00066     const CPCL& PCLTrusts, float dAdjustData ) {
00067     vector<string>  vecstrNames;
00068     size_t          i, j;
00069     vector<size_t>  veciData, veciFunctions;
00070     CDat            Dat;
00071 
00072     vecstrNames.resize( DatFunctions.GetGenes( ) + DatData.GetGenes( ) );
00073     for( i = 0; i < DatFunctions.GetGenes( ); ++i )
00074         vecstrNames[ i ] = DatFunctions.GetGene( i );
00075     for( i = 0; i < DatData.GetGenes( ); ++i )
00076         vecstrNames[ DatFunctions.GetGenes( ) + i ] = DatData.GetGene( i );
00077     Dat.Open( vecstrNames );
00078     for( i = 0; i < DatFunctions.GetGenes( ); ++i )
00079         for( j = ( i + 1 ); j < DatFunctions.GetGenes( ); ++j )
00080             Dat.Set( i, j, DatFunctions.Get( i, j ) );
00081     for( i = 0; i < DatData.GetGenes( ); ++i )
00082         for( j = ( i + 1 ); j < DatData.GetGenes( ); ++j )
00083             Dat.Set( DatFunctions.GetGenes( ) + i, DatFunctions.GetGenes( ) + j, DatData.Get( i, j ) +
00084                 dAdjustData );
00085 
00086     veciFunctions.resize( PCLTrusts.GetGenes( ) );
00087     veciData.resize( PCLTrusts.GetExperiments( ) );
00088     for( i = 0; i < veciFunctions.size( ); ++i ) {
00089         veciFunctions[ i ] = Dat.GetGene( PCLTrusts.GetGene( i ) );
00090         if( veciFunctions[ i ] == -1 ) {
00091             cerr << "Unknown function: " << PCLTrusts.GetGene( i ) << endl;
00092             return 1; } }
00093     for( i = 0; i < veciData.size( ); ++i ) {
00094         veciData[ i ] = Dat.GetGene( PCLTrusts.GetExperiment( i ) );
00095         if( veciData[ i ] == -1 )
00096             cerr << "Unknown dataset: " << PCLTrusts.GetExperiment( i ) << endl; }
00097     for( i = 0; i < veciFunctions.size( ); ++i ) {
00098         if( veciFunctions[ i ] == -1 )
00099             continue;
00100         for( j = 0; j < veciData.size( ); ++j )
00101             if( veciData[ j ] != -1 )
00102                 Dat.Set( veciFunctions[ i ], veciData[ j ], PCLTrusts.Get( i, j ) ); }
00103     Dat.Save( szOutput );
00104 
00105     return 0; }
00106 
00107 struct SSeed {
00108     size_t  m_iOne;
00109     size_t  m_iTwo;
00110     float   m_dIn;
00111     size_t  m_iIn;
00112     float   m_dTotal;
00113     size_t  m_iTotal;
00114     float   m_dRatio;
00115 
00116     SSeed( size_t iOne, size_t iTwo, float dIn, size_t iIn, float dTotal, size_t iTotal,
00117         float dRatio ) : m_iOne(iOne), m_iTwo(iTwo), m_dIn(dIn), m_iIn(iIn), m_dTotal(dTotal),
00118         m_iTotal(iTotal), m_dRatio(dRatio) { }
00119 
00120     bool operator<( const SSeed& sSeed ) const {
00121 
00122         return ( m_dRatio < sSeed.m_dRatio ); }
00123 };
00124 
00125 int explore_graphs( const gengetopt_args_info& sArgs, const CDat& DatData, const CDat& DatFunctions,
00126     const CPCL& PCLTrusts ) {
00127     vector<bool>    vecfCluster;
00128     vector<float>   vecdConnectivity;
00129     vector<size_t>  veciConnectivity, veciCluster, veciFunctions2Trusts;
00130     size_t          i, j, iIn, iTotal, iMax, iMaxIn, iMaxTotal;
00131     size_t          iClusters;
00132     float           d, dMaxIn, dMaxTotal, dIn, dTotal, dRatio, dHeavy;
00133     CDat            Dat;
00134 
00135     Dat.Open( DatFunctions );
00136     dHeavy = (float)( sArgs.heavy_arg ? sArgs.heavy_arg : ( 1 / sArgs.specificity_arg ) );
00137     veciFunctions2Trusts.resize( DatFunctions.GetGenes( ) );
00138     for( i = 0; i < veciFunctions2Trusts.size( ); ++i )
00139         veciFunctions2Trusts[ i ] = PCLTrusts.GetGene( DatFunctions.GetGene( i ) );
00140 
00141     // veciConnectivity[ i ] contains the number of edges out of gene i
00142     // vecdConnectivity[ i ] contains the sum of edge weights out of gene i
00143     veciConnectivity.resize( Dat.GetGenes( ) );
00144     vecdConnectivity.resize( Dat.GetGenes( ) );
00145     for( i = 0; i < veciConnectivity.size( ); ++i )
00146         for( j = ( i + 1 ); j < veciConnectivity.size( ); ++j )
00147             if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) {
00148                 veciConnectivity[ i ]++;
00149                 veciConnectivity[ j ]++;
00150                 vecdConnectivity[ i ] += d;
00151                 vecdConnectivity[ j ] += d; }
00152 
00153     iClusters = 0;
00154     vecfCluster.resize( Dat.GetGenes( ) );
00155     while( ( sArgs.subgraphs_arg == -1 ) || ( iClusters < (size_t)sArgs.subgraphs_arg ) ) {
00156         priority_queue<SSeed>   pqueSeeds;
00157 
00158         veciCluster.resize( 1 );
00159         for( i = 0; i < Dat.GetGenes( ); ++i ) {
00160             veciCluster[ 0 ] = i;
00161             for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00162                 if( connectivity( j, veciCluster, vecdConnectivity, veciConnectivity, 0, 0,
00163                     vecdConnectivity[ i ], veciConnectivity[ i ], Dat, dIn, iIn, dTotal, iTotal ) &&
00164                     ( ( dRatio = ( ( dIn / iIn ) / ( dTotal / iTotal ) ) ) >= sArgs.specificity_arg ) )
00165                     pqueSeeds.push( SSeed( i, j, dIn, iIn, dTotal, iTotal, dRatio ) ); }
00166         if( pqueSeeds.empty( ) )
00167             break;
00168         cerr << "Seeds remaining: " << pqueSeeds.size( ) << endl;
00169 
00170         do {
00171             const SSeed&    sSeed   = pqueSeeds.top( );
00172 
00173             for( i = 0; i < veciCluster.size( ); ++i )
00174                 vecfCluster[ veciCluster[ i ] ] = false;
00175             vecfCluster[ sSeed.m_iOne ] = vecfCluster[ sSeed.m_iTwo ] = true;
00176             veciCluster.resize( 2 );
00177             veciCluster[ 0 ] = sSeed.m_iOne;
00178             veciCluster[ 1 ] = sSeed.m_iTwo;
00179 
00180             cerr << "Cluster " << iClusters << " seed: " << Dat.GetGene( sSeed.m_iOne ) << ", " <<
00181                 Dat.GetGene( sSeed.m_iTwo ) << ", " << sSeed.m_dRatio << endl;
00182             dIn = sSeed.m_dIn;
00183             iIn = sSeed.m_iIn;
00184             dTotal = sSeed.m_dTotal;
00185             iTotal = sSeed.m_iTotal;
00186             while( true ) {
00187                 cerr << "Cluster " << iClusters << ", " << veciCluster.size( ) << " genes" << endl;
00188                 max_connectivity( vecfCluster, veciCluster, vecdConnectivity, veciConnectivity, dIn,
00189                     iIn, dTotal, iTotal, -1, Dat, dRatio, iMax, dMaxIn, iMaxIn, dMaxTotal, iMaxTotal );
00190                 if( dRatio >= ( dHeavy * sSeed.m_dRatio ) ) {
00191                     vecfCluster[ iMax ] = true;
00192                     veciCluster.push_back( iMax );
00193                     dIn = dMaxIn;
00194                     iIn = iMaxIn;
00195                     dTotal = dMaxTotal;
00196                     iTotal = iMaxTotal; }
00197                 else
00198                     break; }
00199             pqueSeeds.pop( ); }
00200         while( !pqueSeeds.empty( ) && ( veciCluster.size( ) < 3 ) );
00201         if( veciCluster.size( ) < 3 )
00202             break;
00203 
00204         cerr << "Found cluster: " << ( ( dIn / iIn ) / ( dTotal / iTotal ) ) << " (" << dIn << '/' <<
00205             iIn << ", " << dTotal << '/' << iTotal << ')' << endl;
00206         if( veciCluster.size( ) >= (size_t)sArgs.size_functions_arg ) {
00207             iClusters++;
00208             cout << ( ( dIn / iIn ) / ( dTotal / iTotal ) );
00209             for( i = 0; i < veciCluster.size( ); ++i )
00210                 cout << '\t' << Dat.GetGene( veciCluster[ i ] );
00211             cout << endl;
00212             output_datasets( sArgs, DatData, DatFunctions, PCLTrusts, veciCluster, veciFunctions2Trusts );
00213             cout.flush( ); }
00214 
00215         dRatio = dIn / iIn;
00216         for( i = 0; i < veciCluster.size( ); ++i )
00217             for( j = ( i + 1 ); j < veciCluster.size( ); ++j )
00218                 if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) {
00219                     dIn = min( dRatio, d );
00220                     Dat.Set( veciCluster[ i ], veciCluster[ j ], d - dIn );
00221                     vecdConnectivity[ veciCluster[ i ] ] -= dIn;
00222                     vecdConnectivity[ veciCluster[ j ] ] -= dIn; } }
00223 
00224     return 0; }
00225 
00226 bool connectivity( size_t iNew, const vector<size_t>& veciCluster,
00227     const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn,
00228     size_t iIn, float dTotal, size_t iTotal, const CDat& Dat, float& dSumIn, size_t& iEdgesIn,
00229     float& dSumTotal, size_t& iEdgesTotal ) {
00230     size_t  i;
00231     float   d;
00232     bool    fRet;
00233 
00234     iEdgesTotal = iTotal + veciConnectivity[ iNew ];
00235     dSumTotal = dTotal + vecdConnectivity[ iNew ];
00236     iEdgesIn = iIn;
00237     dSumIn = dIn;
00238     fRet = false;
00239     for( i = 0; i < veciCluster.size( ); ++i )
00240         if( !CMeta::IsNaN( d = Dat.Get( iNew, veciCluster[ i ] ) ) ) {
00241             fRet = true;
00242             iEdgesTotal--;
00243             dSumTotal -= d;
00244             iEdgesIn++;
00245             dSumIn += d; }
00246 
00247     return fRet; }
00248 
00249 void max_connectivity( const vector<bool>& vecfCluster, const vector<size_t>& veciCluster,
00250     const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn,
00251     size_t iIn, float dTotal, size_t iTotal, size_t iStart, const CDat& Dat, float& dMaxRatio,
00252     size_t& iMax, float& dMaxIn, size_t& iMaxIn, float& dMaxTotal, size_t& iMaxTotal ) {
00253     size_t  i, iEdgesIn, iEdgesTotal;
00254     float   dRatio, dSumIn, dSumTotal;
00255 
00256     dMaxRatio = -FLT_MAX;
00257     for( iMax = 0,i = ( iStart == -1 ) ? 0 : ( iStart + 1 ); i < vecfCluster.size( ); ++i ) {
00258         if( vecfCluster[ i ] || !connectivity( i, veciCluster, vecdConnectivity, veciConnectivity, dIn,
00259             iIn, dTotal, iTotal, Dat, dSumIn, iEdgesIn, dSumTotal, iEdgesTotal ) )
00260             continue;
00261         dRatio = ( ( dSumIn / iEdgesIn ) / ( dSumTotal / iEdgesTotal ) );
00262         if( dRatio > dMaxRatio ) {
00263             dMaxRatio = dRatio;
00264             iMax = i;
00265             dMaxIn = dSumIn;
00266             iMaxIn = iEdgesIn;
00267             dMaxTotal = dSumTotal;
00268             iMaxTotal = iEdgesTotal; } } }
00269 
00270 struct SColink {
00271     size_t  m_iDataset;
00272     float   m_dWeight;
00273 
00274     SColink( size_t iDataset, float dWeight ) : m_iDataset(iDataset), m_dWeight(dWeight) { }
00275 
00276     bool operator<( const SColink& sColink ) const {
00277 
00278         return ( m_dWeight < sColink.m_dWeight ); }
00279 };
00280 
00281 void output_datasets( const gengetopt_args_info& sArgs, const CDat& DatData, const CDat& DatFunctions,
00282     const CPCL& PCLTrusts, const vector<size_t>& veciCluster, const vector<size_t>& veciFunctions2Trusts ) {
00283     size_t          i, j, iFunction;
00284     float           d;
00285     vector<size_t>  veciData, veciTrusts2Data;
00286 
00287     {
00288         priority_queue<SColink> pqueLinks;
00289         vector<float>           vecdData;
00290 
00291         vecdData.resize( PCLTrusts.GetExperiments( ) );
00292         for( i = 0; i < veciCluster.size( ); ++i ) {
00293             if( ( iFunction = veciFunctions2Trusts[ veciCluster[ i ] ] ) == -1 )
00294                 continue;
00295             for( j = 0; j < PCLTrusts.GetExperiments( ); ++j )
00296                 if( !CMeta::IsNaN( d = PCLTrusts.Get( iFunction, j ) ) )
00297                     vecdData[ j ] += d; }
00298 
00299         for( i = 0; i < vecdData.size( ); ++i )
00300             pqueLinks.push( SColink( i, vecdData[ i ] ) );
00301 
00302         for( i = 0; ( i < (size_t)sArgs.size_datasets_arg ) && !pqueLinks.empty( ); ++i ) {
00303             cout << '\t' << pqueLinks.top( ).m_dWeight << '\t' <<
00304                 PCLTrusts.GetExperiment( pqueLinks.top( ).m_iDataset ) << endl;
00305             veciData.push_back( pqueLinks.top( ).m_iDataset );
00306             pqueLinks.pop( ); }
00307     }
00308 
00309     for( i = 0; i < veciCluster.size( ); ++i ) {
00310         if( veciFunctions2Trusts[ veciCluster[ i ] ] == -1 )
00311             continue;
00312         for( j = ( i + 1 ); j < veciCluster.size( ); ++j )
00313             if( veciFunctions2Trusts[ veciCluster[ j ] ] != -1 )
00314                 cout << "\t\t" << DatFunctions.GetGene( veciCluster[ i ] ) << '\t' <<
00315                     DatFunctions.GetGene( veciCluster[ j ] ) << '\t' <<
00316                     DatFunctions.Get( veciCluster[ i ], veciCluster[ j ] ) << endl; }
00317     for( i = 0; i < veciCluster.size( ); ++i ) {
00318         if( veciFunctions2Trusts[ veciCluster[ i ] ] == -1 )
00319             continue;
00320         for( j = 0; j < veciData.size( ); ++j )
00321             cout << "\t\t" << DatFunctions.GetGene( veciCluster[ i ] ) << '\t' <<
00322                 PCLTrusts.GetExperiment( veciData[ j ] ) << '\t' <<
00323                 PCLTrusts.Get( veciFunctions2Trusts[ veciCluster[ i ] ], veciData[ j ] ) << endl; }
00324     veciTrusts2Data.resize( PCLTrusts.GetExperiments( ) );
00325     for( i = 0; i < veciTrusts2Data.size( ); ++i )
00326         veciTrusts2Data[ i ] = DatData.GetGene( PCLTrusts.GetExperiment( i ) );
00327     for( i = 0; i < veciData.size( ); ++i )
00328         for( j = ( i + 1 ); j < veciData.size( ); ++j )
00329             cout << "\t\t" << PCLTrusts.GetExperiment( veciData[ i ] ) << '\t' <<
00330                 PCLTrusts.GetExperiment( veciData[ j ] ) << '\t' <<
00331                 DatData.Get( veciTrusts2Data[ veciData[ i ] ], veciTrusts2Data[ veciData[ j ] ] ) << endl; }