Sleipnir
tools/Funcaeologist/Funcaeologist.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_szDab[]   = ".dab";
00026 
00027 struct SBackground {
00028     size_t                  m_iCountOne;
00029     size_t                  m_iCountTwo;
00030     size_t                  m_iSizeOne;
00031     size_t                  m_iSizeTwo;
00032     const vector<float>*    m_pvecdOut;
00033     float                   m_dPrior;
00034     const CDat*             m_pDat;
00035     const CDat*             m_pDatWithin;
00036     vector<float>           m_vecdValues;
00037     const vector<size_t>*   m_pveciGenes;
00038 };
00039 
00040 static int MainSet( const gengetopt_args_info& );
00041 static int MainBackground( const gengetopt_args_info& );
00042 static void* BackgroundThread( void* );
00043 static void* BackgroundThreadSingle( void* );
00044 
00045 int main( int iArgs, char** aszArgs ) {
00046 #ifdef WIN32
00047     pthread_win32_process_attach_np( );
00048 #endif // WIN32
00049     gengetopt_args_info sArgs;
00050     int                 iRet;
00051 
00052     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00053         cmdline_parser_print_help( );
00054         return 1; }
00055     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00056 
00057     iRet = ( sArgs.sizes_arg ? MainBackground : MainSet )( sArgs );
00058 #ifdef WIN32
00059     pthread_win32_process_detach_np( );
00060 #endif // WIN32
00061     return iRet; }
00062 
00063 int MainSet( const gengetopt_args_info& sArgs ) {
00064     CGenome         Genome;
00065     CGenes          GenesQuery( Genome );
00066     ifstream        ifsm;
00067     size_t          iFunction, i, j;
00068     vector<size_t>  veciQuery;
00069     float           d;
00070 
00071     if( sArgs.genes_arg )
00072         ifsm.open( sArgs.genes_arg );
00073     if( !GenesQuery.Open( sArgs.genes_arg ? ifsm : cin ) ) {
00074         cerr << "Could not open: " << ( sArgs.genes_arg ? sArgs.genes_arg : "genes" ) << endl;
00075         return 1; }
00076     if( sArgs.genes_arg )
00077         ifsm.close( );
00078 
00079     veciQuery.resize( GenesQuery.GetGenes( ) );
00080     cout << "Function   Score   QueryIn QueryOut    FuncIn  FuncOut" << endl;
00081     for( iFunction = 0; iFunction < sArgs.inputs_num; ++iFunction ) {
00082         CGenes          GenesFunction( Genome );
00083         CDat            Dat;
00084         string          strDab;
00085         float           dFuncIn, dFuncOut, dQueryIn, dQueryOut;
00086         size_t          iFuncIn, iFuncOut, iQueryIn, iQueryOut, iOne, iTwo;
00087         vector<size_t>  veciFunction;
00088 
00089         if( !GenesFunction.Open( sArgs.inputs[ iFunction ] ) ) {
00090             cerr << "Could not open: " << sArgs.inputs[ iFunction ] << endl;
00091             return 1; }
00092         strDab = (string)sArgs.directory_arg + '/' + CMeta::Deextension( CMeta::Basename(
00093             sArgs.inputs[ iFunction ] ) ) + c_szDab;
00094         if( !Dat.Open( strDab.c_str( ), sArgs.memmap_flag && !sArgs.normalize_flag ) ) {
00095             cerr << "Could not open: " << strDab << endl;
00096             return 1; }
00097         if( sArgs.normalize_flag )
00098             Dat.Normalize( CDat::ENormalizeSigmoid );
00099 
00100         for( i = 0; i < veciQuery.size( ); ++i )
00101             veciQuery[ i ] = Dat.GetGene( GenesQuery.GetGene( i ).GetName( ) );
00102         veciFunction.resize( GenesFunction.GetGenes( ) );
00103         for( i = 0; i < veciFunction.size( ); ++i )
00104             veciFunction[ i ] = Dat.GetGene( GenesFunction.GetGene( i ).GetName( ) );
00105 
00106         dFuncIn = dFuncOut = dQueryIn = dQueryOut = 0;
00107         for( iQueryIn = iFuncIn = iFuncOut = i = 0; i < veciFunction.size( ); ++i ) {
00108             if( ( iOne = veciFunction[ i ] ) == -1 )
00109                 continue;
00110             for( j = ( i + 1 ); j < veciFunction.size( ); ++j )
00111                 if( ( ( iTwo = veciFunction[ j ] ) != -1 ) &&
00112                     !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) ) {
00113                     iFuncIn++;
00114                     dFuncIn += d; }
00115             for( j = 0; j < Dat.GetGenes( ); ++j )
00116                 if( !CMeta::IsNaN( d = Dat.Get( iOne, j ) ) ) {
00117                     iFuncOut++;
00118                     dFuncOut += d; }
00119             for( j = 0; j < veciQuery.size( ); ++j )
00120                 if( ( ( iTwo = veciQuery[ j ] ) != -1 ) &&
00121                     !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) ) {
00122                     iQueryIn++;
00123                     dQueryIn += d; } }
00124         for( iQueryOut = i = 0; i < veciQuery.size( ); ++i ) {
00125             if( ( iOne = veciQuery[ i ] ) == -1 )
00126                 continue;
00127             for( j = 0; j < Dat.GetGenes( ); ++j )
00128                 if( !CMeta::IsNaN( d = Dat.Get( iOne, j ) ) ) {
00129                     iQueryOut++;
00130                     dQueryOut += d; } }
00131         dFuncIn /= iFuncIn;
00132         dFuncOut /= iFuncOut;
00133         dQueryIn /= iQueryIn;
00134         dQueryOut /= iQueryOut;
00135 
00136         cout << CMeta::Basename( sArgs.inputs[ iFunction ] ) << '\t' <<
00137             ( dQueryIn / dQueryOut / dFuncIn * dFuncOut ) << '\t' << dQueryIn << '\t' << dQueryOut << '\t' <<
00138             dFuncIn << '\t' << dFuncOut << endl; }
00139 
00140     return 0; }
00141 
00142 int MainBackground( const gengetopt_args_info& sArgs ) {
00143     CDat                Dat, DatWithin;
00144     const CDat*         pDatWithin;
00145     CDataMatrix         MatAves, MatStds, MatPValues;
00146     vector<size_t>      veciSizes, veciGenes;
00147     size_t              i, j, iIndexOne, iIndexTwo, iValue, iChunk;
00148     float               d, dPrior;
00149     vector<float>       vecdOut, vecdValues;
00150     vector<pthread_t>   vecpthdThreads;
00151     vector<SBackground> vecsBackground;
00152     long double         dTmp;
00153 
00154     if( !Dat.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag ) ) {
00155         cerr << "Could not open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) << endl;
00156         return 1; }
00157     if( sArgs.normalize_flag )
00158         Dat.Normalize( CDat::ENormalizeSigmoid );
00159 
00160     {
00161         CPCL    PCLSizes( false );
00162 
00163         if( !PCLSizes.Open( sArgs.sizes_arg, 0 ) ) {
00164             cerr << "Could not open: " << sArgs.sizes_arg << endl;
00165             return 1; }
00166         veciSizes.resize( PCLSizes.GetGenes( ) );
00167         for( i = 0; i < veciSizes.size( ); ++i )
00168             veciSizes[ i ] = atoi( PCLSizes.GetGene( i ).c_str( ) );
00169     }
00170 
00171     if( sArgs.genes_arg ) {
00172         CGenome Genome;
00173         CGenes  Genes( Genome );
00174 
00175         if( !Genes.Open( sArgs.genes_arg ) ) {
00176             cerr << "Could not open: " << sArgs.genes_arg << endl;
00177             return 1; }
00178         for( i = 0; i < Genes.GetGenes( ); ++i )
00179             if( ( j = Dat.GetGene( Genes.GetGene( i ).GetName( ) ) ) != -1 )
00180                 veciGenes.push_back( j ); }
00181     else {
00182         veciGenes.resize( Dat.GetGenes( ) );
00183         for( i = 0; i < veciGenes.size( ); ++i )
00184             veciGenes[ i ] = i; }
00185 
00186     {
00187         vector<long double> vecdTmp;
00188         vector<size_t>      veciTmp;
00189 
00190         vecdTmp.resize( Dat.GetGenes( ) );
00191         fill( vecdTmp.begin( ), vecdTmp.end( ), 0 );
00192         veciTmp.resize( Dat.GetGenes( ) );
00193         fill( veciTmp.begin( ), veciTmp.end( ), 0 );
00194         for( i = 0; i < Dat.GetGenes( ); ++i )
00195             for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00196                 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) {
00197                     veciTmp[ i ]++;
00198                     veciTmp[ j ]++;
00199                     vecdTmp[ i ] += d;
00200                     vecdTmp[ j ] += d; }
00201         vecdOut.resize( vecdTmp.size( ) );
00202         for( i = 0; i < vecdOut.size( ); ++i )
00203             vecdOut[ i ] = (float)( vecdTmp[ i ] / ( veciTmp[ i ] ? veciTmp[ i ] : 1 ) );
00204     }
00205 
00206     if( sArgs.input_within_arg ) {
00207         if( !DatWithin.Open( sArgs.input_within_arg ) ) {
00208             cerr << "Could not open: " << sArgs.input_within_arg << endl;
00209             return 1; }
00210         pDatWithin = &DatWithin; }
00211     else
00212         pDatWithin = &Dat;
00213 
00214     for( dTmp = 0,iChunk = i = 0; i < pDatWithin->GetGenes( ); ++i )
00215         for( j = ( i + 1 ); j < pDatWithin->GetGenes( ); ++j )
00216             if( !CMeta::IsNaN( d = pDatWithin->Get( i, j ) ) ) {
00217                 iChunk++;
00218                 dTmp += d; }
00219     dPrior = (float)( dTmp / iChunk );
00220 
00221     vecpthdThreads.resize( min( sArgs.threads_arg, sArgs.count_arg ) );
00222     vecsBackground.resize( vecpthdThreads.size( ) );
00223     iChunk = 1 + ( ( sArgs.count_arg - 1 ) / vecpthdThreads.size( ) );
00224     vecdValues.resize( ( sArgs.singles_flag ? 1 : sArgs.count_arg ) * iChunk * vecpthdThreads.size( ) );
00225     MatAves.Initialize( veciSizes.size( ), veciSizes.size( ) );
00226     MatAves.Clear( );
00227     MatStds.Initialize( veciSizes.size( ), veciSizes.size( ) );
00228     MatStds.Clear( );
00229     MatPValues.Initialize( veciSizes.size( ), veciSizes.size( ) );
00230     for( iIndexOne = 0; iIndexOne < veciSizes.size( ); ++iIndexOne )
00231         for( iIndexTwo = iIndexOne; iIndexTwo < ( sArgs.singles_flag ? ( iIndexOne + 1 ) : veciSizes.size( ) );
00232             ++iIndexTwo ) {
00233             cerr << veciSizes[ iIndexOne ] << ':' << veciSizes[ iIndexTwo ] << endl;
00234             for( i = 0; i < vecpthdThreads.size( ); ++i ) {
00235                 vecsBackground[ i ].m_dPrior = dPrior;
00236                 vecsBackground[ i ].m_iCountOne = iChunk;
00237                 vecsBackground[ i ].m_iCountTwo = sArgs.count_arg;
00238                 vecsBackground[ i ].m_iSizeOne = veciSizes[ iIndexOne ];
00239                 vecsBackground[ i ].m_iSizeTwo = veciSizes[ iIndexTwo ];
00240                 vecsBackground[ i ].m_pDat = &Dat;
00241                 vecsBackground[ i ].m_pDatWithin = pDatWithin;
00242                 vecsBackground[ i ].m_pvecdOut = &vecdOut;
00243                 vecsBackground[ i ].m_pveciGenes = &veciGenes;
00244                 if( pthread_create( &vecpthdThreads[ i ], NULL, sArgs.singles_flag ? BackgroundThreadSingle:
00245                     BackgroundThread, &vecsBackground[ i ] ) ) {
00246                     cerr << "Couldn't create thread for group: " << i << endl;
00247                     return 1; } }
00248             for( iValue = i = 0; i < vecpthdThreads.size( ); ++i ) {
00249                 pthread_join( vecpthdThreads[ i ], NULL );
00250                 for( j = 0; j < vecsBackground[ i ].m_vecdValues.size( ); ++j )
00251                     vecdValues[ iValue++ ] = vecsBackground[ i ].m_vecdValues[ j ]; }
00252             MatAves.Set( iIndexOne, iIndexTwo, (float)CStatistics::Average( vecdValues ) );
00253             if( sArgs.invgauss_flag ) {
00254                 for( i = 0; i < vecdValues.size( ); ++i )
00255                     MatStds.Get( iIndexOne, iIndexTwo ) += 1 / vecdValues[ i ];
00256                 MatStds.Set( iIndexOne, iIndexTwo, vecdValues.size( ) / ( MatStds.Get( iIndexOne, iIndexTwo ) -
00257                     ( vecdValues.size( ) / MatAves.Get( iIndexOne, iIndexTwo ) ) ) ); }
00258             else
00259                 MatStds.Set( iIndexOne, iIndexTwo, (float)sqrt( CStatistics::Variance( vecdValues,
00260                     MatAves.Get( iIndexOne, iIndexTwo ) ) ) );
00261             MatPValues.Set( iIndexOne, iIndexTwo, (float)CStatistics::Percentile( vecdValues.begin( ),
00262                 vecdValues.end( ), sArgs.percentile_arg ) ); }
00263 
00264     if( sArgs.singles_flag )
00265         for( i = 0; i < veciSizes.size( ); ++i )
00266             cout << veciSizes[ i ] << '\t' << MatAves.Get( i, i ) << '|' << MatStds.Get( i, i ) << '|' <<
00267                 MatPValues.Get( i, i ) << endl;
00268     else {
00269         for( i = 0; i < veciSizes.size( ); ++i )
00270             cout << '\t' << veciSizes[ i ];
00271         cout << endl;
00272         for( i = 0; i < MatAves.GetRows( ); ++i ) {
00273             cout << veciSizes[ i ];
00274             for( j = 0; j < i; ++j )
00275                 cout << '\t';
00276             for( ; j < MatAves.GetColumns( ); ++j ) {
00277                 cout << '\t' << MatAves.Get( i, j ) << '|' << MatStds.Get( i, j ) << '|' <<
00278                     MatPValues.Get( i, j ); }
00279             cout << endl; } }
00280 
00281     return 0; }
00282 
00283 void* BackgroundThread( void* pData ) {
00284     size_t              i, j, iCountOne, iCountTwo, iEdgesOne, iEdgesTwo, iValue;
00285     SBackground*        psData;
00286     vector<size_t>      veciOne, veciTwo;
00287     long double         dBackgroundOne, dBackgroundTwo, dWithinOne, dWithinTwo, dWithin, dBackground, dBetween;
00288     vector<long double> vecdBetweenOne, vecdBetweenTwo;
00289     float               d;
00290 
00291     psData = (SBackground*)pData;
00292     psData->m_vecdValues.resize( psData->m_iCountOne * psData->m_iCountTwo );
00293     veciOne.resize( psData->m_iSizeOne );
00294     veciTwo.resize( psData->m_iSizeTwo );
00295     iEdgesOne = veciOne.size( ) * ( veciOne.size( ) + 1 ) / 2;
00296     iEdgesTwo = veciTwo.size( ) * ( veciTwo.size( ) + 1 ) / 2;
00297     vecdBetweenOne.resize( veciOne.size( ) );
00298     vecdBetweenTwo.resize( veciTwo.size( ) );
00299     for( iValue = iCountOne = 0; iCountOne < psData->m_iCountOne; ++iCountOne ) {
00300         const vector<size_t>&   veciGenes   = *psData->m_pveciGenes;
00301         const vector<float>&    vecdOut     = *psData->m_pvecdOut;
00302         set<size_t>             setiOne;
00303 
00304         while( setiOne.size( ) < veciOne.size( ) )
00305             setiOne.insert( veciGenes[ rand( ) % veciGenes.size( ) ] );
00306         copy( setiOne.begin( ), setiOne.end( ), veciOne.begin( ) );
00307 
00308         dBackgroundOne = dWithinOne = 0;
00309         for( i = 0; i < veciOne.size( ); ++i ) {
00310             dBackgroundOne += vecdOut[ veciOne[ i ] ];
00311             dWithinOne += psData->m_dPrior;
00312             for( j = ( i + 1 ); j < veciOne.size( ); ++j )
00313                 dWithinOne += psData->m_pDatWithin->Get( veciOne[ i ], veciOne[ j ] ); }
00314 
00315         for( iCountTwo = 0; iCountTwo < psData->m_iCountTwo; ++iCountTwo ) {
00316             set<size_t>             setiTwo;
00317 
00318             while( setiTwo.size( ) < veciTwo.size( ) )
00319                 setiTwo.insert( veciGenes[ rand( ) % veciGenes.size( ) ] );
00320             copy( setiTwo.begin( ), setiTwo.end( ), veciTwo.begin( ) );
00321 
00322             dBackgroundTwo = dWithinTwo = 0;
00323             for( i = 0; i < veciTwo.size( ); ++i ) {
00324                 dBackgroundTwo += (*psData->m_pvecdOut)[ veciTwo[ i ] ];
00325                 dWithinTwo += psData->m_dPrior;
00326                 for( j = ( i + 1 ); j < veciTwo.size( ); ++j )
00327                     dWithinTwo += psData->m_pDatWithin->Get( veciTwo[ i ], veciTwo[ j ] ); }
00328 
00329 // Alternative: calculate an average over all edges (weighted) rather than between sets (unweighted)
00330 //          dWithin = ( dWithinOne + dWithinTwo ) / ( iEdgesOne + iEdgesTwo );
00331             dWithin = ( ( dWithinOne / iEdgesOne ) + ( dWithinTwo / iEdgesTwo ) ) / 2;
00332             dBackground = ( dBackgroundOne + dBackgroundTwo ) / ( veciOne.size( ) + veciTwo.size( ) );
00333 // Alternative: calculate between scores by edge rather than node
00334 /*
00335             dBetween = 0;
00336             for( i = 0; i < veciOne.size( ); ++i )
00337                 for( j = 0; j < veciTwo.size( ); ++j )
00338                     dBetween += ( veciOne[ i ] == veciTwo[ j ] ) ? 1 :
00339                         psData->m_pDat->Get( veciOne[ i ], veciTwo[ j ] );
00340             dBetween /= veciOne.size( ) * veciTwo.size( );
00341 */
00342             fill( vecdBetweenOne.begin( ), vecdBetweenOne.end( ), 0 );
00343             fill( vecdBetweenTwo.begin( ), vecdBetweenTwo.end( ), 0 );
00344             for( i = 0; i < veciOne.size( ); ++i )
00345                 for( j = 0; j < veciTwo.size( ); ++j ) {
00346                     d = ( veciOne[ i ] == veciTwo[ j ] ) ? 1 :
00347                         psData->m_pDat->Get( veciOne[ i ], veciTwo[ j ] );
00348                     vecdBetweenOne[ i ] += d;
00349                     vecdBetweenTwo[ j ] += d; }
00350             for( dBetween = 0,i = 0; i < vecdBetweenOne.size( ); ++i )
00351                 dBetween += vecdBetweenOne[ i ] / veciTwo.size( );
00352             for( i = 0; i < vecdBetweenTwo.size( ); ++i )
00353                 dBetween += vecdBetweenTwo[ i ] / veciOne.size( );
00354             dBetween /= veciOne.size( ) + veciTwo.size( );
00355 
00356             psData->m_vecdValues[ iValue++ ] = (float)( psData->m_dPrior * dBetween / dWithin /
00357                 dBackground ); } }
00358 
00359     return NULL; }
00360 
00361 void* BackgroundThreadSingle( void* pData ) {
00362     size_t          i, j, iCount, iEdges;
00363     SBackground*    psData;
00364     vector<size_t>  veciOne;
00365     long double     dWithin, dBackground;
00366     float           d;
00367 
00368     psData = (SBackground*)pData;
00369     psData->m_vecdValues.resize( psData->m_iCountOne );
00370     veciOne.resize( psData->m_iSizeOne );
00371     iEdges = veciOne.size( ) * ( veciOne.size( ) + 1 ) / 2;
00372     for( iCount = 0; iCount < psData->m_iCountOne; ++iCount ) {
00373         const vector<size_t>&   veciGenes   = *psData->m_pveciGenes;
00374         const vector<float>&    vecdOut     = *psData->m_pvecdOut;
00375         set<size_t>             setiOne;
00376 
00377         while( setiOne.size( ) < veciOne.size( ) )
00378             setiOne.insert( veciGenes[ rand( ) % veciGenes.size( ) ] );
00379         copy( setiOne.begin( ), setiOne.end( ), veciOne.begin( ) );
00380 
00381         dBackground = dWithin = 0;
00382         for( i = 0; i < veciOne.size( ); ++i ) {
00383             dBackground += vecdOut[ veciOne[ i ] ];
00384             dWithin += psData->m_dPrior;
00385             for( j = ( i + 1 ); j < veciOne.size( ); ++j )
00386                 if( !CMeta::IsNaN( d = psData->m_pDat->Get( veciOne[ i ], veciOne[ j ] ) ) )
00387                     dWithin += d; }
00388         dBackground /= veciOne.size( );
00389         dWithin /= iEdges;
00390 
00391         psData->m_vecdValues[ iCount ] = (float)( dWithin / dBackground ); }
00392 
00393     return NULL; }