Sleipnir
tools/MEFIT/MEFIT.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 int read_genes( const char*, const char*, CGenome&, vector<string>&, vector<CGenes*>&, CDataPair& );
00026 int write_posteriors( const string&, const CBayesNetSmile&, ostream& );
00027 
00028 int main( int iArgs, char** aszArgs ) {
00029     gengetopt_args_info         sArgs;
00030     CDatasetCompact             Data;
00031     size_t                      i, j, iFunction;
00032     CGenome                     Genome;
00033     ifstream                    ifsm;
00034     CGenes                      GenesIn( Genome ), GenesEx( Genome );
00035     vector<CGenes*>             vecpPositives;
00036     vector<float>               vecdQuants;
00037     vector<string>              vecstrPositives, vecstrIDs, vecstrNodes;
00038     int                         iRet;
00039     ofstream                    ofsmPosteriors;
00040     CBayesNetSmile              BNGlobal;
00041     IMeasure*                   pMeasure;
00042     CMeasurePearson             Pearson;
00043     CMeasureEuclidean           Euclidean;
00044     CMeasureKendallsTau         KendallsTau;
00045     CMeasureKolmogorovSmirnov   KolmSmir;
00046     CMeasureSpearman            Spearman( true );
00047     CMeasureNegate              EuclideanNeg( &Euclidean, false );
00048     CMeasurePearNorm            PearNorm;
00049     IMeasure*                   apMeasures[]    = { &Pearson, &EuclideanNeg, &KendallsTau,
00050         &KolmSmir, &Spearman, &PearNorm, NULL };
00051 
00052     if( cmdline_parser( iArgs, aszArgs, &sArgs ) || !sArgs.inputs_num ) {
00053         cmdline_parser_print_help( );
00054         return 1; }
00055     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00056 
00057     pMeasure = NULL;
00058     for( i = 0; apMeasures[ i ]; ++i )
00059         if( !strcmp( apMeasures[ i ]->GetName( ), sArgs.distance_arg ) ) {
00060             pMeasure = apMeasures[ i ];
00061             break; }
00062     if( !pMeasure ) {
00063         cmdline_parser_print_help( );
00064         return 1; }
00065 
00066     if( sArgs.genes_arg ) {
00067         ifsm.clear( );
00068         ifsm.open( sArgs.genes_arg );
00069         if( !GenesIn.Open( ifsm ) ) {
00070             cerr << "Could not open: " << sArgs.genes_arg << endl;
00071             return 1; }
00072         ifsm.close( ); }
00073     if( sArgs.genex_arg ) {
00074         ifsm.clear( );
00075         ifsm.open( sArgs.genex_arg );
00076         if( !GenesEx.Open( ifsm ) ) {
00077             cerr << "Could not open: " << sArgs.genex_arg << endl;
00078             return 1; }
00079         ifsm.close( ); }
00080 
00081     if( sArgs.bins_arg ) {
00082         static const size_t c_iBuffer   = 1024;
00083         char            acBuffer[ c_iBuffer ];
00084         vector<string>  vecstrQuants;
00085 
00086         ifsm.clear( );
00087         ifsm.open( sArgs.bins_arg );
00088         ifsm.getline( acBuffer, c_iBuffer - 1 );
00089         if( !( ifsm.is_open( ) && acBuffer[ 0 ] ) ) {
00090             cerr << "Could not open: " << sArgs.bins_arg << endl;
00091             return 1; }
00092         ifsm.close( );
00093 
00094         CMeta::Tokenize( acBuffer, vecstrQuants );
00095         vecdQuants.resize( vecstrQuants.size( ) );
00096         for( i = 0; i < vecdQuants.size( ); ++i )
00097             vecdQuants[ i ] = (float)atof( vecstrQuants[ i ].c_str( ) ); }
00098     else {
00099         static const float  c_adQuants[]    = {-1, 0, 1, 2, 3};
00100 
00101         vecdQuants.resize( ARRAYSIZE(c_adQuants) );
00102         copy( c_adQuants, c_adQuants + ARRAYSIZE(c_adQuants), vecdQuants.begin( ) ); }
00103 
00104     {
00105         static const float  c_adQuantsBinary[]  = {0.5, 1};
00106         CDataPair       Answers;
00107         vector<string>  vecstrPCLs, vecstrGenes;
00108 
00109         vecstrPCLs.resize( sArgs.inputs_num );
00110         copy( sArgs.inputs, sArgs.inputs + sArgs.inputs_num, vecstrPCLs.begin( ) );
00111         vecstrIDs.resize( sArgs.inputs_num );
00112         for( i = 0; i < sArgs.inputs_num; ++i )
00113             vecstrIDs[ i ] = CMeta::Basename( sArgs.inputs[ i ] );
00114         if( !BNGlobal.Open( vecstrIDs, vecdQuants.size( ) ) ) {
00115             cerr << "Failed to create a Bayesian network" << endl;
00116             return 1; }
00117 
00118         if( iRet = read_genes( sArgs.related_arg, sArgs.unrelated_arg, Genome, vecstrPositives, vecpPositives,
00119             Answers ) )
00120             return iRet;
00121         Answers.SetQuants( c_adQuantsBinary, ARRAYSIZE(c_adQuantsBinary) );
00122 
00123         if( !Data.Open( GenesIn, GenesEx, Answers, vecstrPCLs, sArgs.skip_arg, pMeasure, vecdQuants ) )
00124             return 1;
00125     }
00126 
00127     cerr << "Learning global network...";
00128     BNGlobal.Learn( &Data, 1, !!sArgs.zero_flag );
00129     if( !BNGlobal.Save( sArgs.global_arg ) ) {
00130         cerr << "Could not save: " << sArgs.global_arg << endl;
00131         return 1; }
00132     cerr << "done" << endl;
00133 
00134     ofsmPosteriors.open( sArgs.trusts_arg );
00135     if( !ofsmPosteriors.is_open( ) ) {
00136         cerr << "Could not open: " << sArgs.trusts_arg << endl;
00137         return 1; }
00138     BNGlobal.GetNodes( vecstrNodes );
00139     for( i = 1; i < vecstrNodes.size( ); ++i )
00140         ofsmPosteriors << '\t' << vecstrNodes[ i ];
00141     ofsmPosteriors << endl;
00142 
00143     for( iFunction = 0; iFunction < vecpPositives.size( ); ++iFunction ) {
00144         CBayesNetSmile  BNFunction;
00145         CDataMask       DataMask;
00146         string          strFile;
00147         CDat            DatOut;
00148         ofstream        ofsm;
00149 
00150         cerr << "Learning function: " << vecstrPositives[ iFunction ] << endl;
00151 
00152         DataMask.Attach( &Data );
00153         DataMask.FilterGenes( *vecpPositives[ iFunction ], CDat::EFilterTerm );
00154         BNFunction.Open( vecstrIDs, vecdQuants.size( ) );
00155         BNFunction.SetDefault( BNGlobal );
00156         BNFunction.Learn( &DataMask, 1, !!sArgs.zero_flag );
00157         strFile = (string)sArgs.output_arg + '/' + vecstrPositives[ iFunction ] + '.' +
00158             ( sArgs.xdsl_flag ? "x" : "" ) + "dsl";
00159         if( !BNFunction.Save( strFile.c_str( ) ) ) {
00160             cerr << "Could not save: " << strFile << endl;
00161             return 1; }
00162         if( iRet = write_posteriors( vecstrPositives[ iFunction ], BNFunction, ofsmPosteriors ) )
00163             return iRet;
00164 
00165         strFile = (string)sArgs.predictions_arg + '/' + vecstrPositives[ iFunction ] + ( sArgs.dab_flag ?
00166             ".dab" : ".dat" );
00167         DatOut.Open( DataMask.GetGeneNames( ) );
00168         if( !BNFunction.Evaluate( &DataMask, DatOut, !!sArgs.zero_flag ) ) {
00169             cerr << "Could not evaluate: " << strFile << endl;
00170             return 1; }
00171         DatOut.Invert( );
00172         if( sArgs.cutoff_arg > 0 )
00173             for( i = 0; i < DatOut.GetGenes( ); ++i )
00174                 for( j = ( i + 1 ); j < DatOut.GetGenes( ); ++j )
00175                     if( DatOut.Get( i, j ) < sArgs.cutoff_arg )
00176                         DatOut.Set( i, j, CMeta::GetNaN( ) );
00177         DatOut.Save( strFile.c_str( ) ); }
00178 
00179     for( i = 0; i < vecpPositives.size( ); ++i )
00180         delete vecpPositives[ i ];
00181 
00182     return 0; }
00183 
00184 int read_genes( const char* szPositives, const char* szNegatives, CGenome& Genome, vector<string>& vecstrPositives,
00185     vector<CGenes*>& vecpPositives, CDataPair& Answers ) {
00186     CGenes*         pGenes;
00187     string          strDir, strFile, strBase;
00188     CDat            DatNegatives;
00189     ifstream        ifsm;
00190     set<string>     setstrGenes;
00191     size_t          i;
00192 
00193     cerr << "Reading related gene pairs...";
00194     strDir = szPositives;
00195     FOR_EACH_DIRECTORY_FILE(strDir, strFile)
00196         strBase = strFile;
00197         strFile = strDir + '/' + strFile;
00198 
00199         ifsm.clear( );
00200         ifsm.open( strFile.c_str( ) );
00201         pGenes = new CGenes( Genome );
00202         if( !pGenes->Open( ifsm ) ) {
00203             cerr << "Could not open: " << strFile << endl;
00204             return 1; }
00205         ifsm.close( );
00206         vecstrPositives.push_back( CMeta::Deextension( strBase ) );
00207         vecpPositives.push_back( pGenes ); }
00208     cerr << "  done" << endl;
00209 
00210     cerr << "Reading unrelated gene pairs...";
00211     ifsm.clear( );
00212     ifsm.open( szNegatives );
00213     if( !DatNegatives.Open( ifsm, CDat::EFormatText, 0 ) ) {
00214         cerr << "Could not open: " << szNegatives << endl;
00215         return 1; }
00216     ifsm.close( );
00217     for( i = 0; i < DatNegatives.GetGenes( ); ++i )
00218         Genome.AddGene( DatNegatives.GetGene( i ) );
00219     cerr << "  done" << endl;
00220 
00221     return ( Answers.Open( DatNegatives, vecpPositives, Genome, true ) ? 0 : 1 ); }
00222 
00223 int write_posteriors( const string& strFunction, const CBayesNetSmile& BNFunction, ostream& ostm ) {
00224     size_t                  i, iNode;
00225     vector<unsigned char>   vecbDatum;
00226     vector<float>           vecdOut;
00227     float                   dPrior;
00228     unsigned char           bValue;
00229     vector<string>          vecstrNodes;
00230 
00231     BNFunction.GetNodes( vecstrNodes );
00232     vecbDatum.resize( vecstrNodes.size( ) );
00233     for( i = 0; i < vecbDatum.size( ); ++i )
00234         vecbDatum[ i ] = 0;
00235 
00236     BNFunction.Evaluate( vecbDatum, vecdOut, false );
00237     dPrior = vecdOut[ vecdOut.size( ) - 1 ];
00238     ostm << strFunction;
00239     for( iNode = 1; iNode < vecstrNodes.size( ); ++iNode ) {
00240         float   d;
00241 
00242         vecbDatum[ iNode - 1 ] = 0;
00243         vecdOut.clear( );
00244         for( bValue = 0; bValue < BNFunction.GetValues( iNode ); ++bValue ) {
00245             vecbDatum[ iNode ] = bValue + 1;
00246             BNFunction.Evaluate( vecbDatum, vecdOut, false ); }
00247 
00248         d = 0;
00249         for( i = 0; i < vecdOut.size( ); ++i )
00250             d += fabs( dPrior - vecdOut[ i ] );
00251         ostm << '\t' << ( d / BNFunction.GetValues( iNode ) ); }
00252     ostm << endl;
00253 
00254     return 0; }