Sleipnir
tools/Explainer/Explainer.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_szExclude[]   = "exclude";
00026 static const char   c_szInclude[]   = "include";
00027 static const char   c_szOnly[]      = "only";
00028 static const char   c_szUnknown[]   = "GO:0008150";
00029 
00030 struct SDatum {
00031     float   m_dDiff;
00032     float   m_dData;
00033     float   m_dAnswer;
00034     size_t  m_iOne;
00035     size_t  m_iTwo;
00036 
00037     SDatum( float dAnswer, float dData, size_t iOne, size_t iTwo ) : m_dData(dData), m_dAnswer(dAnswer), m_iOne(iOne), m_iTwo(iTwo) {
00038 
00039         m_dDiff = fabs( dData - dAnswer ); }
00040 };
00041 
00042 struct SSorter {
00043     enum EMode {
00044         EModeDiff,
00045         EModeData,
00046         EModeAnswer
00047     };
00048 
00049     EMode   m_eMode;
00050     bool    m_fReverse;
00051 
00052     SSorter( EMode eMode, bool fReverse ) : m_eMode(eMode), m_fReverse(fReverse) { }
00053 
00054     bool operator()( const SDatum& sOne, const SDatum& sTwo ) const {
00055         bool    fRet;
00056         float   dOne, dTwo;
00057 
00058         switch( m_eMode ) {
00059             case EModeData:
00060                 dOne = sOne.m_dData;
00061                 dTwo = sTwo.m_dData;
00062                 break;
00063 
00064             case EModeAnswer:
00065                 dOne = sOne.m_dAnswer;
00066                 dTwo = sTwo.m_dAnswer;
00067 
00068             default:
00069                 dOne = sOne.m_dDiff;
00070                 dTwo = sTwo.m_dDiff;
00071                 break; }
00072 
00073         return ( m_fReverse ? ( dOne < dTwo ) : ( dTwo < dOne ) ); }
00074 };
00075 
00076 int main( int iArgs, char** aszArgs ) {
00077     CDat                Answers, Data;
00078     gengetopt_args_info sArgs;
00079     vector<size_t>      veciGenes;
00080     size_t              i, j, k, iOne, iTwo, iNumber;
00081     float               dValue, dAnswer;
00082     vector<SDatum>      vecsData;
00083     ifstream            ifsm;
00084     COntologyOBO            GOBP;
00085     CGenome             Genome;
00086     CGene*              pOne;
00087     CGene*              pTwo;
00088     bool                fOne, fTwo;
00089     string              strOne, strTwo;
00090     int                 iRet;
00091     SSorter::EMode      eMode;
00092 
00093     iRet = cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 );
00094     if( sArgs.config_arg )
00095         iRet = cmdline_parser_configfile( sArgs.config_arg, &sArgs, 0, 0, 1 ) && iRet;
00096     if( iRet ) {
00097         cmdline_parser_print_help( );
00098         return iRet; }
00099     CMeta Meta( sArgs.verbosity_arg );
00100 
00101 //  if( !strcmp( c_szInclude, sArgs.unknowns_arg ) || !strcmp( c_szOnly, sArgs.unknowns_arg ) )
00102 //      sArgs.everything_flag = true;
00103 
00104     if( !Answers.Open( sArgs.answers_arg, sArgs.memmap_flag && !sArgs.genes_arg && !sArgs.genet_arg && !sArgs.genex_arg ) ) {
00105         cerr << "Couldn't open: " << sArgs.answers_arg << endl;
00106         return 1; }
00107     if( sArgs.genes_arg && !Answers.FilterGenes( sArgs.genes_arg, CDat::EFilterInclude ) ) {
00108         cerr << "Couldn't open: " << sArgs.genes_arg << endl;
00109         return 1; }
00110     if( sArgs.genet_arg && !Answers.FilterGenes( sArgs.genet_arg, CDat::EFilterTerm ) ) {
00111         cerr << "Couldn't open: " << sArgs.genet_arg << endl;
00112         return 1; }
00113     if( sArgs.genee_arg && !Answers.FilterGenes( sArgs.genee_arg, CDat::EFilterEdge ) ) {
00114         cerr << "Couldn't open: " << sArgs.genee_arg << endl;
00115         return 1; }
00116     if( sArgs.genex_arg && !Answers.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) {
00117         cerr << "Couldn't open: " << sArgs.genex_arg << endl;
00118         return 1; }
00119     if( !Data.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag && !sArgs.invert_flag ) ) {
00120         cerr << "Couldn't open: " << sArgs.input_arg << endl;
00121         return 1; }
00122     if( sArgs.normalize_flag )
00123         Data.Normalize( CDat::ENormalizeMinMax );
00124     if( sArgs.invert_flag )
00125         Data.Invert( );
00126 
00127     if( sArgs.features_arg ) {
00128         ifsm.open( sArgs.features_arg );
00129         if( !Genome.Open( ifsm ) ) {
00130             cerr << "Could not open: " << sArgs.features_arg << endl;
00131             return 1; }
00132         ifsm.close( ); }
00133 
00134     if( sArgs.go_onto_arg ) {
00135         ifstream    ifsmGenes;
00136 
00137         ifsm.clear( );
00138         ifsm.open( sArgs.go_onto_arg );
00139         if( sArgs.go_anno_arg )
00140             ifsmGenes.open( sArgs.go_anno_arg );
00141         if( !GOBP.Open( ifsm, ifsmGenes, Genome, COntologyOBO::c_szBiologicalProcess ) ) {
00142             cerr << "Could not open: " << sArgs.go_onto_arg << ", " << sArgs.go_anno_arg << endl;
00143             return 1; }
00144         ifsm.close( ); }
00145 
00146     if( !strcmp( sArgs.mode_arg, "data" ) )
00147         eMode = SSorter::EModeData;
00148     else if( !strcmp( sArgs.mode_arg, "ans" ) )
00149         eMode = SSorter::EModeAnswer;
00150     else
00151         eMode = SSorter::EModeDiff;
00152 
00153     veciGenes.resize( Data.GetGenes( ) );
00154     for( i = 0; i < Data.GetGenes( ); ++i )
00155         veciGenes[ i ] = Answers.GetGene( Data.GetGene( i ) );
00156     for( i = 0; i < Data.GetGenes( ); ++i ) {
00157         if( !( i % 1000 ) )
00158             cerr << "Gene " << i << '/' << Data.GetGenes( ) << endl;
00159         if( !sArgs.everything_flag && ( ( iOne = veciGenes[ i ] ) == -1 ) )
00160             continue;
00161         for( j = ( i + 1 ); j < Data.GetGenes( ); ++j ) {
00162             if( CMeta::IsNaN( dValue = Data.Get( i, j ) ) )
00163                 continue;
00164             dAnswer = CMeta::GetNaN( );
00165             if( !sArgs.everything_flag && ( ( ( iTwo = veciGenes[ j ] ) == -1 ) ||
00166                 CMeta::IsNaN( dAnswer = Answers.Get( iOne, iTwo ) ) ||
00167                 ( sArgs.positives_flag && ( dAnswer <= 0 ) ) ||
00168                 ( sArgs.negatives_flag && ( dAnswer > 0 ) ) ) )
00169                 continue;
00170             if( ( (float)rand( ) / RAND_MAX ) > sArgs.fraction_arg )
00171                 continue;
00172             if( sArgs.everything_flag && CMeta::IsNaN( dAnswer ) )
00173                 dAnswer = dValue ? ( dValue - ( 1 / dValue ) ) : -FLT_MAX;
00174             vecsData.push_back( SDatum( dAnswer, dValue, i, j ) ); } }
00175     sort( vecsData.begin( ), vecsData.end( ), SSorter( eMode, !!sArgs.reverse_flag ) );
00176 
00177     if( ( ( iNumber = sArgs.count_arg ) < 0 ) || ( iNumber >= vecsData.size( ) ) )
00178         iNumber = vecsData.size( );
00179     for( i = 0; i < iNumber; ++i ) {
00180         const SDatum&   Datum   = vecsData[ i ];
00181 
00182         pOne = pTwo = NULL;
00183         strOne = Data.GetGene( Datum.m_iOne );
00184         strTwo = Data.GetGene( Datum.m_iTwo );
00185         if( Genome.GetGenes( ) ) {
00186             iOne = Genome.GetGene( strOne );
00187             pOne = ( iOne == -1 ) ? NULL : &Genome.GetGene( iOne );
00188             iTwo = Genome.GetGene( strTwo );
00189             pTwo = ( iTwo == -1 ) ? NULL : &Genome.GetGene( iTwo );
00190             fOne = fTwo = true;
00191             if( pOne )
00192                 for( j = 0; j < pOne->GetOntologies( ); ++j )
00193                     if( pOne->GetAnnotations( j ) && ( ( pOne->GetAnnotations( j ) > 1 ) ||
00194                         ( pOne->GetOntology( j )->GetID( pOne->GetAnnotation( j, 0 ) ) != c_szUnknown ) ) ) {
00195                         fOne = false;
00196                         break; }
00197             if( pTwo )
00198                 for( j = 0; j < pTwo->GetOntologies( ); ++j )
00199                     if( pTwo->GetAnnotations( j ) && ( ( pTwo->GetAnnotations( j ) > 1 ) ||
00200                         ( pTwo->GetOntology( j )->GetID( pTwo->GetAnnotation( j, 0 ) ) != c_szUnknown ) ) ) {
00201                         fTwo = false;
00202                         break; }
00203             if( fOne || fTwo ) {
00204                 if( !strcmp( c_szExclude, sArgs.unknowns_arg ) )
00205                     continue; }
00206             else if( !strcmp( c_szOnly, sArgs.unknowns_arg ) )
00207                 continue; }
00208 
00209         cout << strOne << '\t' << strTwo << '\t' << Data.Get( Datum.m_iOne, Datum.m_iTwo ) << '\t';
00210         dAnswer = ( ( ( j = veciGenes[ Datum.m_iOne ] ) == -1 ) || ( ( k = veciGenes[ Datum.m_iTwo ] ) == -1 ) ) ?
00211             CMeta::GetNaN( ) : Answers.Get( j, k );
00212         if( CMeta::IsNaN( dAnswer ) )
00213             cout << '-';
00214         else
00215             cout << dAnswer;
00216         cout << endl;
00217         if( pOne ) {
00218             cout << '\t';
00219             if( pOne->GetSynonyms( ) )
00220                 cout << pOne->GetSynonym( 0 );
00221             cout << '\t' << pOne->GetGloss( ); }
00222         if( Genome.GetGenes( ) )
00223             cout << endl;
00224         if( pTwo ) {
00225             cout << '\t';
00226             if( pTwo->GetSynonyms( ) )
00227                 cout << pTwo->GetSynonym( 0 );
00228             cout << '\t' << pTwo->GetGloss( ); }
00229         if( Genome.GetGenes( ) )
00230             cout << endl; }
00231 
00232     return 0; }