Sleipnir
tools/Dat2Graph/Dat2Graph.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 struct SSorter {
00026     const vector<float>&    m_vecdScores;
00027 
00028     SSorter( const vector<float>& vecdScores ) : m_vecdScores(vecdScores) { }
00029 
00030     bool operator()( size_t iOne, size_t iTwo ) {
00031 
00032         return ( m_vecdScores[iTwo] < m_vecdScores[iOne] ); }
00033 };
00034 
00035 int open_genes( const char* szFile, CGenes& Genes ) {
00036     ifstream    ifsm;
00037 
00038     if( szFile ) {
00039         ifsm.open( szFile );
00040         if( !Genes.Open( ifsm ) ) {
00041             cerr << "Could not open: " << szFile << endl;
00042             return 1; } }
00043 
00044     return 0; }
00045 
00046 int open_values( const char* szFile, vector<float>& vecdValues ) {
00047     static const size_t c_iBuf  = 1024;
00048     char                szBuf[ c_iBuf ];
00049     ifstream            ifsm;
00050 
00051     if( szFile ) {
00052         ifsm.open( szFile );
00053         if( !ifsm.is_open( ) ) {
00054             cerr << "Could not open: " << szFile << endl;
00055             return 1; }
00056         while( ifsm.peek( ) != EOF ) {
00057             ifsm.getline( szBuf, c_iBuf - 1 );
00058             vecdValues.push_back( (float)atof( szBuf ) ); }
00059         ifsm.close( ); }
00060 
00061     return 0; }
00062 
00063 int main( int iArgs, char** aszArgs ) {
00064     gengetopt_args_info sArgs;
00065     ifstream            ifsm;
00066     CDat                Dat, DatNew;
00067     CDat*               pDat;
00068     float               d, dCutoff;
00069     CGenome             Genome;
00070     CGenes              GenesIn( Genome ), GenesEx( Genome ), GenesQr( Genome );
00071     int                 iRet;
00072     size_t              i, j, k;
00073     vector<float>       vecdColors, vecdBorders, vecdWeights;
00074     vector<size_t>      veciQuery;
00075 
00076     if( cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 ) && ( sArgs.config_arg &&
00077         cmdline_parser_configfile( sArgs.config_arg, &sArgs, 0, 0, 1 ) ) ) {
00078         cmdline_parser_print_help( );
00079         return 1; }
00080     CMeta Meta( sArgs.verbosity_arg );
00081 
00082     if( sArgs.features_arg ) {
00083         ifsm.open( sArgs.features_arg );
00084         if( !Genome.Open( ifsm ) ) {
00085             cerr << "Could not open: " << sArgs.features_arg << endl;
00086             return 1; }
00087         ifsm.close( ); }
00088     if( iRet = open_values( sArgs.colors_arg, vecdColors ) )
00089         return iRet;
00090     if( iRet = open_values( sArgs.borders_arg, vecdBorders ) )
00091         return iRet;
00092     if( iRet = open_values( sArgs.genew_arg, vecdWeights ) )
00093         return iRet;
00094     if( iRet = open_genes( sArgs.genes_arg, GenesIn ) )
00095         return iRet;
00096     if( iRet = open_genes( sArgs.genex_arg, GenesEx ) )
00097         return iRet;
00098     if( iRet = open_genes( sArgs.geneq_arg, GenesQr ) )
00099         return iRet;
00100 
00101     if( sArgs.input_arg ) {
00102         if( !Dat.Open( sArgs.input_arg, sArgs.memmap_flag && !( sArgs.normalize_flag ||
00103             sArgs.genes_arg || sArgs.geneq_arg || sArgs.knowns_arg ) ) ) {
00104             cerr << "Couldn't open: " << sArgs.input_arg << endl;
00105             return 1; } }
00106     else if( !Dat.Open( cin, CDat::EFormatText ) ) {
00107         cerr << "Couldn't open input" << endl;
00108         return 1; }
00109     pDat = &Dat;
00110 
00111     veciQuery.resize( pDat->GetGenes( ) );
00112     for( i = 0; i < veciQuery.size( ); ++i )
00113         veciQuery[ i ] = GenesQr.GetGene( pDat->GetGene( i ) );
00114 
00115     dCutoff = (float)( sArgs.cutoff_given ? sArgs.cutoff_arg : -FLT_MAX );
00116     if( GenesIn.GetGenes( ) ) {
00117         vector<size_t>  veciGenes;
00118 
00119         DatNew.Open( GenesIn.GetGeneNames( ) );
00120         veciGenes.resize( DatNew.GetGenes( ) );
00121         for( i = 0; i < veciGenes.size( ); ++i )
00122             veciGenes[ i ] = Dat.GetGene( DatNew.GetGene( i ) );
00123         for( i = 0; i < veciGenes.size( ); ++i ) {
00124             if( veciGenes[ i ] == -1 )
00125                 continue;
00126             for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
00127                 if( veciGenes[ j ] != -1 )
00128                     DatNew.Set( i, j, Dat.Get( veciGenes[ i ], veciGenes[ j ] ) ); }
00129         pDat = &DatNew; }
00130     if( GenesEx.GetGenes( ) )
00131         pDat->FilterGenes( GenesEx, CDat::EFilterExclude );
00132     if( sArgs.normalize_flag )
00133         pDat->Normalize( CDat::ENormalizeSigmoid );
00134     if( GenesQr.GetGenes( ) ) {
00135         if( sArgs.cutoff_given )
00136             for( i = 0; i < pDat->GetGenes( ); ++i )
00137                 for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j )
00138                     if( !CMeta::IsNaN( d = pDat->Get( i, j ) ) && ( d < sArgs.cutoff_arg ) )
00139                         pDat->Set( i, j, CMeta::GetNaN( ) );
00140         if( sArgs.hubs_arg >= 0 ) {
00141             vector<float>   vecdScores;
00142             vector<size_t>  veciIndices;
00143             vector<bool>    vecfHits;
00144 
00145             veciIndices.resize( pDat->GetGenes( ) );
00146             vecdScores.resize( pDat->GetGenes( ) );
00147             vecfHits.resize( pDat->GetGenes( ) );
00148             for( i = 0; i < pDat->GetGenes( ); ++i )
00149                 if( veciQuery[i] == -1 ) {
00150                     for( j = 0; j < pDat->GetGenes( ); ++j )
00151                         if( veciQuery[j] == -1 )
00152                             pDat->Set( i, j, CMeta::GetNaN( ) ); }
00153                 else {
00154                     fill( vecdScores.begin( ), vecdScores.end( ), -FLT_MAX );
00155                     for( j = 0; j < pDat->GetGenes( ); ++j ) {
00156                         if( CMeta::IsNaN( d = pDat->Get( i, j ) ) )
00157                             d = -FLT_MAX;
00158                         vecdScores[j] = d; }
00159                     for( j = 0; j < veciIndices.size( ); ++j )
00160                         veciIndices[j] = j;
00161                     sort( veciIndices.begin( ), veciIndices.end( ), SSorter( vecdScores ) );
00162                     fill( vecfHits.begin( ), vecfHits.end( ), false );
00163                     for( j = 0; j < (size_t)sArgs.hubs_arg; ++j )
00164                         vecfHits[veciIndices[j]] = true;
00165                     for( j = 0; j < pDat->GetGenes( ); ++j )
00166                         if( !vecfHits[j] )
00167                             pDat->Set( i, j, CMeta::GetNaN( ) ); }
00168             pDat->Normalize( CDat::ENormalizeZScore ); }
00169         else if( !strcmp( sArgs.format_arg, "correl" ) ) {
00170             CMeasurePearson MeasurePearson;
00171             float*          adCentroid;
00172             float*          adCur;
00173             size_t          iCur;
00174             vector<size_t>  veciCounts;
00175             vector<float>   vecdScores;
00176 
00177             veciCounts.resize( pDat->GetGenes( ) );
00178             adCentroid = new float[ pDat->GetGenes( ) ];
00179             for( i = 0; i < GenesQr.GetGenes( ); ++i ) {
00180                 if( ( iCur = pDat->GetGene( GenesQr.GetGene( i ).GetName( ) ) ) == -1 )
00181                     continue;
00182                 for( j = 0; j < pDat->GetGenes( ); ++j )
00183                     if( !CMeta::IsNaN( d = pDat->Get( iCur, j ) ) ) {
00184                         adCentroid[ j ] += d;
00185                         veciCounts[ j ]++; } }
00186             for( i = 0; i < pDat->GetGenes( ); ++i )
00187                 adCentroid[ i ] /= veciCounts[ i ];
00188 
00189             vecdScores.resize( pDat->GetGenes( ) );
00190             adCur = new float[ pDat->GetGenes( ) ];
00191             for( i = 0; i < pDat->GetGenes( ); ++i ) {
00192                 for( j = 0; j < pDat->GetGenes( ); ++j )
00193                     adCur[ j ] = pDat->Get( i, j );
00194                 vecdScores[ i ] = (float)MeasurePearson.Measure( adCentroid, pDat->GetGenes( ), adCur,
00195                     pDat->GetGenes( ), IMeasure::EMapNone, NULL, NULL ); }
00196             delete[] adCur;
00197             delete[] adCentroid;
00198             for( i = 0; i < vecdScores.size( ); ++i )
00199                 cout << pDat->GetGene( i ) << '\t' << vecdScores[ i ] << endl; }
00200         else {
00201             if( vecdColors.empty( ) ) {
00202                 vecdColors.resize( pDat->GetGenes( ) );
00203                 fill( vecdColors.begin( ), vecdColors.end( ), 0.5f );
00204                 for( i = 0; i < GenesQr.GetGenes( ); ++i )
00205                     if( ( j = pDat->GetGene( GenesQr.GetGene( i ).GetName( ) ) ) != -1 )
00206                         vecdColors[ j ] = 1; }
00207             pDat->FilterGenes( GenesQr, sArgs.hefalmp_flag ? CDat::EFilterHefalmp : CDat::EFilterPixie,
00208                 sArgs.neighbors_arg, (float)sArgs.edges_arg, !!sArgs.absolute_flag,
00209                 vecdWeights.empty( ) ? NULL : &vecdWeights ); } }
00210     if( sArgs.knowns_arg ) {
00211         CDat            DatKnowns;
00212         vector<size_t>  veciKnowns;
00213         size_t          iOne, iTwo;
00214 
00215         if( !DatKnowns.Open( sArgs.knowns_arg, !!sArgs.memmap_flag ) ) {
00216             cerr << "Could not open: " << sArgs.knowns_arg << endl;
00217             return 1; }
00218         veciKnowns.resize( pDat->GetGenes( ) );
00219         for( i = 0; i < veciKnowns.size( ); ++i )
00220             veciKnowns[ i ] = DatKnowns.GetGene( pDat->GetGene( i ) );
00221         for( i = 0; i < pDat->GetGenes( ); ++i )
00222             if( ( iOne = veciKnowns[ i ] ) != -1 )
00223                 for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j )
00224                     if( ( ( iTwo = veciKnowns[ j ] ) != -1 ) &&
00225                         !CMeta::IsNaN( d = DatKnowns.Get( iOne, iTwo ) ) && ( d > 0 ) )
00226                         pDat->Set( i, j, CMeta::GetNaN( ) ); }
00227 
00228     if( !strcmp( sArgs.format_arg, "dot" ) )
00229         pDat->SaveDOT( cout, dCutoff, &Genome, false, true, vecdColors.empty( ) ? NULL : &vecdColors,
00230             vecdBorders.empty( ) ? NULL : &vecdBorders );
00231     else if( !strcmp( sArgs.format_arg, "gdf" ) )
00232         pDat->SaveGDF( cout, dCutoff );
00233     else if( !strcmp( sArgs.format_arg, "net" ) )
00234         pDat->SaveNET( cout, dCutoff );
00235     else if( !strcmp( sArgs.format_arg, "matisse" ) )
00236         pDat->SaveMATISSE( cout, dCutoff, &Genome );
00237     else if( !strcmp( sArgs.format_arg, "list" ) ) {
00238         map<size_t, float>              mapGenes;
00239         map<size_t, float>::iterator    iterGene;
00240         float                           dCur;
00241 
00242         for( i = 0; i < pDat->GetGenes( ); ++i )
00243             for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j )
00244                 if( !CMeta::IsNaN( d = pDat->Get( i, j ) ) &&
00245                     ( CMeta::IsNaN( dCutoff ) || ( d > dCutoff ) ) ) {
00246                     if( ( k = veciQuery[ i ] ) != -1 ) {
00247                         dCur = d * ( vecdWeights.empty( ) ? 1 : vecdWeights[ k ] );
00248                         if( ( iterGene = mapGenes.find( j ) ) == mapGenes.end( ) )
00249                             mapGenes[ j ] = dCur;
00250                         else
00251                             iterGene->second += dCur; }
00252                     if( ( k = veciQuery[ j ] ) != -1 ) {
00253                         dCur = d * ( vecdWeights.empty( ) ? 1 : vecdWeights[ k ] );
00254                         if( ( iterGene = mapGenes.find( i ) ) == mapGenes.end( ) )
00255                             mapGenes[ i ] = dCur;
00256                         else
00257                             iterGene->second += dCur; } }
00258         for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene )
00259             cout << pDat->GetGene( iterGene->first ) << '\t' << iterGene->second << endl; }
00260     else if( !strcmp( sArgs.format_arg, "dat" ) ) {
00261         for( i = 0; i < pDat->GetGenes( ); ++i )
00262             for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j )
00263                 if( ( d = pDat->Get( i, j ) ) < dCutoff )
00264                     pDat->Set( i, j, CMeta::GetNaN( ) );
00265         pDat->Save( cout, CDat::EFormatText ); }
00266 
00267     return 0; }