Sleipnir
tools/Cliquer/Cliquer.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 int cliques( const gengetopt_args_info&, const CDat&, const CDat&, const vector<size_t>& );
00026 int heavy( const gengetopt_args_info&, CDat&, const CDat&, const vector<size_t>& );
00027 int heavy2( const gengetopt_args_info&, CDat&, const CDat&, const vector<size_t>& );
00028 int motifs( const gengetopt_args_info&, CDat& );
00029 bool connectivity( size_t, const vector<size_t>&, const vector<float>&, const vector<size_t>&,
00030     float, size_t, float, size_t, const CDat&, float&, size_t&, float&, size_t& );
00031 void max_connectivity( const vector<bool>&, const vector<size_t>&, const vector<float>&,
00032     const vector<size_t>&, float, size_t, float, size_t, size_t, const CDat&, float&, size_t&, float&,
00033     size_t&, float&, size_t& );
00034 
00035 int main( int iArgs, char** aszArgs ) {
00036     gengetopt_args_info sArgs;
00037     CDat                Dat, DatKnowns;
00038     vector<size_t>      veciGenes, veciKnowns;
00039     size_t              i, j;
00040     float               d;
00041     int                 iRet;
00042     
00043     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00044         cmdline_parser_print_help( );
00045         return 1; }
00046     if( sArgs.cutoff_arg < -1e-20 )
00047         sArgs.cutoff_arg = CMeta::GetNaN( );
00048     CMeta Meta( sArgs.verbosity_arg );
00049 
00050     if( sArgs.input_arg ) {
00051         if( !Dat.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag &&
00052             !sArgs.heavy_arg && !sArgs.cutoff_arg ) ) {
00053             cerr << "Could not open: " << sArgs.input_arg << endl;
00054             return 1; } }
00055     else if( !Dat.Open( cin, CDat::EFormatText ) ) {
00056         cerr << "Could not open input" << endl;
00057         return 1; }
00058     if( sArgs.normalize_flag )
00059         Dat.Normalize( CDat::ENormalizeSigmoid );
00060     if( !CMeta::IsNaN( sArgs.cutoff_arg ) )
00061         for( i = 0; i < Dat.GetGenes( ); ++i )
00062             for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00063                 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) && ( d < sArgs.cutoff_arg ) )
00064                     Dat.Set( i, j, CMeta::GetNaN( ) );
00065     if( sArgs.knowns_arg ) {
00066         if( !DatKnowns.Open( sArgs.knowns_arg, !!sArgs.memmap_flag ) ) {
00067             cerr << "Could not open: " << sArgs.knowns_arg << endl;
00068             return 1; }
00069         veciKnowns.resize( Dat.GetGenes( ) );
00070         for( i = 0; i < veciKnowns.size( ); ++i )
00071             veciKnowns[ i ] = DatKnowns.GetGene( Dat.GetGene( i ) ); }
00072 
00073     iRet = sArgs.motifs_arg ? motifs( sArgs, Dat ) : ( sArgs.heavy_arg ?
00074         heavy2( sArgs, Dat, DatKnowns, veciKnowns ) :
00075         cliques( sArgs, Dat, DatKnowns, veciKnowns ) );
00076 
00077     return iRet; }
00078 
00079 int cliques( const gengetopt_args_info& sArgs, const CDat& Dat, const CDat& DatKnowns,
00080     const vector<size_t>& veciKnowns ) {
00081     vector<pair<vector<size_t>,float> > vecprCliques;
00082     vector<size_t>                      veciGenes;
00083     float                               d, dCur;
00084     size_t                              i, j, iOne, iTwo, iMin;
00085     int                                 iCol;
00086 
00087     vecprCliques.resize( sArgs.subgraphs_arg );
00088     for( iMin = i = 0; i < vecprCliques.size( ); ++i ) {
00089         vecprCliques[ i ].first.resize( sArgs.size_arg );
00090         vecprCliques[ i ].second = -FLT_MAX; }
00091     veciGenes.resize( sArgs.size_arg );
00092     for( i = 0; i < veciGenes.size( ); ++i )
00093         veciGenes[ i ] = i;
00094     while( veciGenes[ 0 ] <= ( Dat.GetGenes( ) - veciGenes.size( ) ) ) {
00095         if( !rand( ) && ( ( (float)rand( ) / RAND_MAX ) < 1e-3 ) ) {
00096             for( i = 0; i < veciGenes.size( ); ++i )
00097                 cerr << veciGenes[ i ] << ',';
00098             cerr << endl; }
00099         for( dCur = 0,i = 0; i < veciGenes.size( ); ++i ) {
00100             iOne = sArgs.knowns_arg ? veciKnowns[ veciGenes[ i ] ] : -1;
00101             for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) {
00102                 if( ( iOne != -1 ) && ( ( iTwo = veciKnowns[ veciGenes[ j ] ] ) != -1 ) &&
00103                     !CMeta::IsNaN( d = DatKnowns.Get( iOne, iTwo ) ) )
00104                     continue;
00105                 if( !CMeta::IsNaN( d = Dat.Get( veciGenes[ i ], veciGenes[ j ] ) ) )
00106                     dCur += d; } }
00107         if( dCur > vecprCliques[ iMin ].second ) {
00108 
00109             copy( veciGenes.begin( ), veciGenes.end( ), vecprCliques[ iMin ].first.begin( ) );
00110             vecprCliques[ iMin ].second = dCur;
00111             for( iMin = i = 0; i < vecprCliques.size( ); ++i )
00112                 if( vecprCliques[ i ].second < vecprCliques[ iMin ].second )
00113                     iMin = i; }
00114         for( i = 0; i < veciGenes.size( ); ++i )
00115             if( ++veciGenes[ veciGenes.size( ) - 1 - i ] < ( Dat.GetGenes( ) - i ) ) {
00116                 for( iCol = ( (int)veciGenes.size( ) - (int)i ); ( iCol > 0 ) &&
00117                     ( (size_t)iCol < veciGenes.size( ) ); ++iCol )
00118                     veciGenes[ iCol ] = veciGenes[ iCol - 1 ] + 1;
00119                 break; } }
00120 
00121     for( i = 0; i < vecprCliques.size( ); ++i ) {
00122         cout << vecprCliques[ i ].second;
00123         for( j = 0; j < vecprCliques[ i ].first.size( ); ++j )
00124             cout << '\t' << Dat.GetGene( vecprCliques[ i ].first[ j ] );
00125         cout << endl; }
00126 
00127     return 0; }
00128 
00129 struct SSeed {
00130     size_t  m_iOne;
00131     size_t  m_iTwo;
00132     float   m_dIn;
00133     size_t  m_iIn;
00134     float   m_dTotal;
00135     size_t  m_iTotal;
00136     float   m_dRatio;
00137 
00138     SSeed( size_t iOne, size_t iTwo, float dIn, size_t iIn, float dTotal, size_t iTotal,
00139         float dRatio ) : m_iOne(iOne), m_iTwo(iTwo), m_dIn(dIn), m_iIn(iIn), m_dTotal(dTotal),
00140         m_iTotal(iTotal), m_dRatio(dRatio) { }
00141 
00142     bool operator<( const SSeed& sSeed ) const {
00143 
00144         return ( m_dRatio < sSeed.m_dRatio ); }
00145 };
00146 
00147 int heavy( const gengetopt_args_info& sArgs, CDat& Dat, const CDat& DatKnowns,
00148     const vector<size_t>& veciKnowns ) {
00149     vector<bool>    vecfCluster;
00150     vector<float>   vecdConnectivity;
00151     vector<size_t>  veciConnectivity, veciCluster;
00152     size_t          i, j, iIn, iTotal, iMax, iMaxIn, iMaxTotal;
00153     size_t          iClusters;
00154     float           d, dMaxIn, dMaxTotal, dIn, dTotal, dRatio;
00155 
00156     // veciConnectivity[ i ] contains the number of edges out of gene i
00157     // vecdConnectivity[ i ] contains the sum of edge weights out of gene i
00158     veciConnectivity.resize( Dat.GetGenes( ) );
00159     vecdConnectivity.resize( Dat.GetGenes( ) );
00160     for( i = 0; i < veciConnectivity.size( ); ++i )
00161         for( j = ( i + 1 ); j < veciConnectivity.size( ); ++j )
00162             if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) {
00163                 veciConnectivity[ i ]++;
00164                 veciConnectivity[ j ]++;
00165                 vecdConnectivity[ i ] += d;
00166                 vecdConnectivity[ j ] += d; }
00167 
00168     cout << sArgs.heavy_arg << endl;
00169     iClusters = 0;
00170     vecfCluster.resize( Dat.GetGenes( ) );
00171     while( ( sArgs.subgraphs_arg == -1 ) || ( iClusters < (size_t)sArgs.subgraphs_arg ) ) {
00172         priority_queue<SSeed>   pqueSeeds;
00173 
00174         veciCluster.resize( 1 );
00175         for( i = 0; i < Dat.GetGenes( ); ++i ) {
00176             veciCluster[ 0 ] = i;
00177             for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00178                 if( connectivity( j, veciCluster, vecdConnectivity, veciConnectivity, 0, 0,
00179                     vecdConnectivity[ i ], veciConnectivity[ i ], Dat, dIn, iIn, dTotal, iTotal ) &&
00180                     ( ( dRatio = ( ( dIn / iIn ) / ( dTotal / iTotal ) ) ) >= sArgs.specificity_arg ) )
00181                     pqueSeeds.push( SSeed( i, j, dIn, iIn, dTotal, iTotal, dRatio ) ); }
00182         if( pqueSeeds.empty( ) )
00183             break;
00184         cerr << "Seeds remaining: " << pqueSeeds.size( ) << endl;
00185 
00186         do {
00187             const SSeed&    sSeed   = pqueSeeds.top( );
00188 
00189             fill( vecfCluster.begin( ), vecfCluster.end( ), false );
00190             vecfCluster[ sSeed.m_iOne ] = vecfCluster[ sSeed.m_iTwo ] = true;
00191             veciCluster.resize( 2 );
00192             veciCluster[ 0 ] = sSeed.m_iOne;
00193             veciCluster[ 1 ] = sSeed.m_iTwo;
00194 
00195             cerr << "Cluster " << iClusters << " seed: " << Dat.GetGene( sSeed.m_iOne ) << ", " <<
00196                 Dat.GetGene( sSeed.m_iTwo ) << ", " << sSeed.m_dRatio << endl;
00197             dIn = sSeed.m_dIn;
00198             iIn = sSeed.m_iIn;
00199             dTotal = sSeed.m_dTotal;
00200             iTotal = sSeed.m_iTotal;
00201             while( true ) {
00202                 cerr << "Cluster " << iClusters << ", " << veciCluster.size( ) << " genes" << endl;
00203                 max_connectivity( vecfCluster, veciCluster, vecdConnectivity, veciConnectivity, dIn,
00204                     iIn, dTotal, iTotal, -1, Dat, dRatio, iMax, dMaxIn, iMaxIn, dMaxTotal, iMaxTotal );
00205                 if( ( dRatio > 1 ) && ( dRatio >= ( sArgs.heavy_arg * sSeed.m_dRatio ) ) ) {
00206                     vecfCluster[ iMax ] = true;
00207                     veciCluster.push_back( iMax );
00208                     dIn = dMaxIn;
00209                     iIn = iMaxIn;
00210                     dTotal = dMaxTotal;
00211                     iTotal = iMaxTotal; }
00212                 else
00213                     break; }
00214             pqueSeeds.pop( ); }
00215         while( !pqueSeeds.empty( ) && ( veciCluster.size( ) < 3 ) );
00216         if( veciCluster.size( ) < 3 )
00217             break;
00218 
00219         cerr << "Found cluster: " << ( ( dIn / iIn ) / ( dTotal / iTotal ) ) << " (" << dIn << '/' <<
00220             iIn << ", " << dTotal << '/' << iTotal << ')' << endl;
00221         iClusters++;
00222         cout << ( ( dIn / iIn ) / ( dTotal / iTotal ) );
00223         for( i = 0; i < veciCluster.size( ); ++i )
00224             cout << '\t' << Dat.GetGene( veciCluster[ i ] );
00225         cout << endl;
00226         cout.flush( );
00227 
00228         dRatio = dIn / iIn;
00229         for( i = 0; i < veciCluster.size( ); ++i )
00230             for( j = ( i + 1 ); j < veciCluster.size( ); ++j )
00231                 if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) {
00232                     dIn = min( dRatio, d );
00233                     Dat.Set( veciCluster[ i ], veciCluster[ j ], d - dIn );
00234                     vecdConnectivity[ veciCluster[ i ] ] -= dIn;
00235                     vecdConnectivity[ veciCluster[ j ] ] -= dIn; } }
00236 
00237     return 0; }
00238 
00239 bool connectivity( size_t iNew, const vector<size_t>& veciCluster,
00240     const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn,
00241     size_t iIn, float dTotal, size_t iTotal, const CDat& Dat, float& dSumIn, size_t& iEdgesIn,
00242     float& dSumTotal, size_t& iEdgesTotal ) {
00243     size_t  i;
00244     float   d;
00245     bool    fRet;
00246 
00247     iEdgesTotal = iTotal + veciConnectivity[ iNew ];
00248     dSumTotal = dTotal + vecdConnectivity[ iNew ];
00249     iEdgesIn = iIn;
00250     dSumIn = dIn;
00251     fRet = false;
00252     for( i = 0; i < veciCluster.size( ); ++i )
00253         if( !CMeta::IsNaN( d = Dat.Get( iNew, veciCluster[ i ] ) ) ) {
00254             fRet = true;
00255             iEdgesTotal--;
00256             dSumTotal -= d;
00257             iEdgesIn++;
00258             dSumIn += d; }
00259 
00260     return fRet; }
00261 
00262 void max_connectivity( const vector<bool>& vecfCluster, const vector<size_t>& veciCluster,
00263     const vector<float>& vecdConnectivity, const vector<size_t>& veciConnectivity, float dIn,
00264     size_t iIn, float dTotal, size_t iTotal, size_t iStart, const CDat& Dat, float& dMaxRatio,
00265     size_t& iMax, float& dMaxIn, size_t& iMaxIn, float& dMaxTotal, size_t& iMaxTotal ) {
00266     size_t  i, iEdgesIn, iEdgesTotal;
00267     float   dRatio, dSumIn, dSumTotal;
00268 
00269     dMaxRatio = -FLT_MAX;
00270     for( iMax = 0,i = ( iStart == -1 ) ? 0 : ( iStart + 1 ); i < vecfCluster.size( ); ++i ) {
00271         if( vecfCluster[ i ] || !connectivity( i, veciCluster, vecdConnectivity, veciConnectivity, dIn,
00272             iIn, dTotal, iTotal, Dat, dSumIn, iEdgesIn, dSumTotal, iEdgesTotal ) )
00273             continue;
00274         dRatio = ( ( dSumIn / iEdgesIn ) / ( dSumTotal / iEdgesTotal ) );
00275         if( dRatio > dMaxRatio ) {
00276             dMaxRatio = dRatio;
00277             iMax = i;
00278             dMaxIn = dSumIn;
00279             iMaxIn = iEdgesIn;
00280             dMaxTotal = dSumTotal;
00281             iMaxTotal = iEdgesTotal; } } }
00282 
00283 int heavy2( const gengetopt_args_info& sArgs, CDat& Dat, const CDat& DatKnowns,
00284     const vector<size_t>& veciKnowns ) {
00285     size_t                          i, j, iClusters, iMax;
00286     float                           d, dCur, dMax;
00287     vector<size_t>                  veciCluster, veciScores;
00288     vector<pair<size_t, size_t> >   vecpriiSeeds;
00289     vector<float>                   vecdScores;
00290     vector<bool>                    vecfCluster;
00291 
00292     for( i = 0; i < Dat.GetGenes( ); ++i )
00293         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00294             if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) && ( d >= sArgs.specificity_arg ) )
00295                 vecpriiSeeds.push_back( pair<size_t, size_t>( i, j ) );
00296 
00297     cout << sArgs.heavy_arg << endl;
00298     iClusters = 0;
00299     veciScores.resize( Dat.GetGenes( ) );
00300     vecdScores.resize( Dat.GetGenes( ) );
00301     vecfCluster.resize( Dat.GetGenes( ) );
00302     while( ( sArgs.subgraphs_arg == -1 ) || ( iClusters < (size_t)sArgs.subgraphs_arg ) ) {
00303         priority_queue<SSeed>   pqueSeeds;
00304         bool                    fHit;
00305 
00306         for( i = 0; i < vecpriiSeeds.size( ); ++i )
00307             if( ( d = Dat.Get( vecpriiSeeds[ i ].first, vecpriiSeeds[ i ].second ) ) >= sArgs.specificity_arg )
00308                     pqueSeeds.push( SSeed( vecpriiSeeds[ i ].first, vecpriiSeeds[ i ].second, d, 1, 0, 0, d ) );
00309         cerr << "Seeds remaining: " << pqueSeeds.size( ) << endl;
00310 
00311         for( fHit = false; !pqueSeeds.empty( ); ) {
00312             const SSeed&    sSeed   = pqueSeeds.top( );
00313 
00314             fill( vecfCluster.begin( ), vecfCluster.end( ), false );
00315             vecfCluster[ sSeed.m_iOne ] = vecfCluster [ sSeed.m_iTwo ] = true;
00316             veciCluster.resize( 2 );
00317             veciCluster[ 0 ] = sSeed.m_iOne;
00318             veciCluster[ 1 ] = sSeed.m_iTwo;
00319 
00320             cerr << "Cluster " << iClusters << " seed: " << Dat.GetGene( sSeed.m_iOne ) << ", " <<
00321                 Dat.GetGene( sSeed.m_iTwo ) << ", " << sSeed.m_dRatio << endl;
00322             for( i = 0; i < Dat.GetGenes( ); ++i ) {
00323                 vecdScores[ i ] = 0;
00324                 for( veciScores[ i ] = j = 0; j < veciCluster.size( ); ++j )
00325                     if( !CMeta::IsNaN( d = Dat.Get( i, veciCluster[ j ] ) ) ) {
00326                         vecdScores[ i ] += d;
00327                         veciScores[ i ]++; } }
00328             while( true ) {
00329                 cerr << "Cluster " << iClusters << ", " << veciCluster.size( ) << " genes" << endl;
00330                 for( dMax = 0,iMax = i = 0; i < Dat.GetGenes( ); ++i ) {
00331                     if( vecfCluster[ i ] || !veciScores[ i ] )
00332                         continue;
00333                     if( ( dCur = ( vecdScores[ i ] / veciScores[ i ] ) ) > dMax ) {
00334                         dMax = dCur;
00335                         iMax = i; } }
00336                 if( dMax < ( sArgs.heavy_arg * sSeed.m_dRatio ) )
00337                     break;
00338                 for( i = 0; i < Dat.GetGenes( ); ++i )
00339                     if( !CMeta::IsNaN( d = Dat.Get( i, iMax ) ) ) {
00340                         vecdScores[ i ] += d;
00341                         veciScores[ i ]++; }
00342                 vecfCluster[ iMax ] = true;
00343                 veciCluster.push_back( iMax ); }
00344             if( veciCluster.size( ) < 3 ) {
00345                 pqueSeeds.pop( );
00346                 continue; }
00347 
00348             fHit = true;
00349             for( dMax = 0, iMax = i = 0; i < veciCluster.size( ); ++i )
00350                 for( j = ( i + 1 ); j < veciCluster.size( ); ++j )
00351                     if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) {
00352                         iMax++;
00353                         dMax += d; }
00354 
00355             cerr << "Found cluster: " << ( dMax /= iMax ) << endl;
00356             iClusters++;
00357             cout << dMax;
00358             for( i = 0; i < veciCluster.size( ); ++i )
00359                 cout << '\t' << Dat.GetGene( veciCluster[ i ] );
00360             cout << endl;
00361             cout.flush( );
00362 
00363             for( i = 0; i < veciCluster.size( ); ++i )
00364                 for( j = ( i + 1 ); j < veciCluster.size( ); ++j )
00365                     if( !CMeta::IsNaN( d = Dat.Get( veciCluster[ i ], veciCluster[ j ] ) ) ) {
00366                         dCur = min( dMax, d );
00367                         Dat.Set( veciCluster[ i ], veciCluster[ j ], d - dCur ); }
00368             break; }
00369         if( !fHit )
00370             break; }
00371 
00372     return 0; }
00373 
00374 int motifs( const gengetopt_args_info& sArgs, CDat& Dat ) {
00375     size_t          i, j, k, iOne, iTwo, iThree;
00376     vector<size_t>  veciOne, veciTwo;
00377     vector<bool>    vecfSign;
00378     float           dOne, dTwo, dThree;
00379 
00380     for( i = 0; i < Dat.GetGenes( ); ++i )
00381         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00382             if( !CMeta::IsNaN( dOne = Dat.Get( i, j ) ) && ( fabs( dOne ) >= sArgs.motifs_arg ) ) {
00383                 iOne = ( dOne >= 0 ) ? 1 : 0;
00384                 for( k = ( j + 1 ); k < Dat.GetGenes( ); ++k )
00385                     if( !CMeta::IsNaN( dTwo = Dat.Get( j, k ) ) && ( fabs( dTwo ) >= sArgs.motifs_arg ) &&
00386                         !CMeta::IsNaN( dThree = Dat.Get( i, k ) ) && ( fabs( dThree ) >= sArgs.motifs_arg ) ) {
00387                         iTwo = ( dTwo >= 0 ) ? 1 : 0;
00388                         iThree = ( dThree >= 0 ) ? 1 : 0;
00389                         if( ( iOne + iTwo + iThree ) == 1 ) {
00390                             cout << Dat.GetGene( i ) << '\t' << dOne << '\t' <<
00391                                 Dat.GetGene( j ) << '\t' << dTwo << '\t' <<
00392                                 Dat.GetGene( k ) << '\t' << dThree << endl; } } }
00393 
00394     return 0; }