Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 int read_genes( const char*, const char*, CGenome&, vector<string>&, vector<CGenes*>&, CDataPair& ); 00026 int write_posteriors( const string&, const CBayesNetSmile&, ostream& ); 00027 00028 int main( int iArgs, char** aszArgs ) { 00029 gengetopt_args_info sArgs; 00030 CDatasetCompact Data; 00031 size_t i, j, iFunction; 00032 CGenome Genome; 00033 ifstream ifsm; 00034 CGenes GenesIn( Genome ), GenesEx( Genome ); 00035 vector<CGenes*> vecpPositives; 00036 vector<float> vecdQuants; 00037 vector<string> vecstrPositives, vecstrIDs, vecstrNodes; 00038 int iRet; 00039 ofstream ofsmPosteriors; 00040 CBayesNetSmile BNGlobal; 00041 IMeasure* pMeasure; 00042 CMeasurePearson Pearson; 00043 CMeasureEuclidean Euclidean; 00044 CMeasureKendallsTau KendallsTau; 00045 CMeasureKolmogorovSmirnov KolmSmir; 00046 CMeasureSpearman Spearman( true ); 00047 CMeasureNegate EuclideanNeg( &Euclidean, false ); 00048 CMeasurePearNorm PearNorm; 00049 IMeasure* apMeasures[] = { &Pearson, &EuclideanNeg, &KendallsTau, 00050 &KolmSmir, &Spearman, &PearNorm, NULL }; 00051 00052 if( cmdline_parser( iArgs, aszArgs, &sArgs ) || !sArgs.inputs_num ) { 00053 cmdline_parser_print_help( ); 00054 return 1; } 00055 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00056 00057 pMeasure = NULL; 00058 for( i = 0; apMeasures[ i ]; ++i ) 00059 if( !strcmp( apMeasures[ i ]->GetName( ), sArgs.distance_arg ) ) { 00060 pMeasure = apMeasures[ i ]; 00061 break; } 00062 if( !pMeasure ) { 00063 cmdline_parser_print_help( ); 00064 return 1; } 00065 00066 if( sArgs.genes_arg ) { 00067 ifsm.clear( ); 00068 ifsm.open( sArgs.genes_arg ); 00069 if( !GenesIn.Open( ifsm ) ) { 00070 cerr << "Could not open: " << sArgs.genes_arg << endl; 00071 return 1; } 00072 ifsm.close( ); } 00073 if( sArgs.genex_arg ) { 00074 ifsm.clear( ); 00075 ifsm.open( sArgs.genex_arg ); 00076 if( !GenesEx.Open( ifsm ) ) { 00077 cerr << "Could not open: " << sArgs.genex_arg << endl; 00078 return 1; } 00079 ifsm.close( ); } 00080 00081 if( sArgs.bins_arg ) { 00082 static const size_t c_iBuffer = 1024; 00083 char acBuffer[ c_iBuffer ]; 00084 vector<string> vecstrQuants; 00085 00086 ifsm.clear( ); 00087 ifsm.open( sArgs.bins_arg ); 00088 ifsm.getline( acBuffer, c_iBuffer - 1 ); 00089 if( !( ifsm.is_open( ) && acBuffer[ 0 ] ) ) { 00090 cerr << "Could not open: " << sArgs.bins_arg << endl; 00091 return 1; } 00092 ifsm.close( ); 00093 00094 CMeta::Tokenize( acBuffer, vecstrQuants ); 00095 vecdQuants.resize( vecstrQuants.size( ) ); 00096 for( i = 0; i < vecdQuants.size( ); ++i ) 00097 vecdQuants[ i ] = (float)atof( vecstrQuants[ i ].c_str( ) ); } 00098 else { 00099 static const float c_adQuants[] = {-1, 0, 1, 2, 3}; 00100 00101 vecdQuants.resize( ARRAYSIZE(c_adQuants) ); 00102 copy( c_adQuants, c_adQuants + ARRAYSIZE(c_adQuants), vecdQuants.begin( ) ); } 00103 00104 { 00105 static const float c_adQuantsBinary[] = {0.5, 1}; 00106 CDataPair Answers; 00107 vector<string> vecstrPCLs, vecstrGenes; 00108 00109 vecstrPCLs.resize( sArgs.inputs_num ); 00110 copy( sArgs.inputs, sArgs.inputs + sArgs.inputs_num, vecstrPCLs.begin( ) ); 00111 vecstrIDs.resize( sArgs.inputs_num ); 00112 for( i = 0; i < sArgs.inputs_num; ++i ) 00113 vecstrIDs[ i ] = CMeta::Basename( sArgs.inputs[ i ] ); 00114 if( !BNGlobal.Open( vecstrIDs, vecdQuants.size( ) ) ) { 00115 cerr << "Failed to create a Bayesian network" << endl; 00116 return 1; } 00117 00118 if( iRet = read_genes( sArgs.related_arg, sArgs.unrelated_arg, Genome, vecstrPositives, vecpPositives, 00119 Answers ) ) 00120 return iRet; 00121 Answers.SetQuants( c_adQuantsBinary, ARRAYSIZE(c_adQuantsBinary) ); 00122 00123 if( !Data.Open( GenesIn, GenesEx, Answers, vecstrPCLs, sArgs.skip_arg, pMeasure, vecdQuants ) ) 00124 return 1; 00125 } 00126 00127 cerr << "Learning global network..."; 00128 BNGlobal.Learn( &Data, 1, !!sArgs.zero_flag ); 00129 if( !BNGlobal.Save( sArgs.global_arg ) ) { 00130 cerr << "Could not save: " << sArgs.global_arg << endl; 00131 return 1; } 00132 cerr << "done" << endl; 00133 00134 ofsmPosteriors.open( sArgs.trusts_arg ); 00135 if( !ofsmPosteriors.is_open( ) ) { 00136 cerr << "Could not open: " << sArgs.trusts_arg << endl; 00137 return 1; } 00138 BNGlobal.GetNodes( vecstrNodes ); 00139 for( i = 1; i < vecstrNodes.size( ); ++i ) 00140 ofsmPosteriors << '\t' << vecstrNodes[ i ]; 00141 ofsmPosteriors << endl; 00142 00143 for( iFunction = 0; iFunction < vecpPositives.size( ); ++iFunction ) { 00144 CBayesNetSmile BNFunction; 00145 CDataMask DataMask; 00146 string strFile; 00147 CDat DatOut; 00148 ofstream ofsm; 00149 00150 cerr << "Learning function: " << vecstrPositives[ iFunction ] << endl; 00151 00152 DataMask.Attach( &Data ); 00153 DataMask.FilterGenes( *vecpPositives[ iFunction ], CDat::EFilterTerm ); 00154 BNFunction.Open( vecstrIDs, vecdQuants.size( ) ); 00155 BNFunction.SetDefault( BNGlobal ); 00156 BNFunction.Learn( &DataMask, 1, !!sArgs.zero_flag ); 00157 strFile = (string)sArgs.output_arg + '/' + vecstrPositives[ iFunction ] + '.' + 00158 ( sArgs.xdsl_flag ? "x" : "" ) + "dsl"; 00159 if( !BNFunction.Save( strFile.c_str( ) ) ) { 00160 cerr << "Could not save: " << strFile << endl; 00161 return 1; } 00162 if( iRet = write_posteriors( vecstrPositives[ iFunction ], BNFunction, ofsmPosteriors ) ) 00163 return iRet; 00164 00165 strFile = (string)sArgs.predictions_arg + '/' + vecstrPositives[ iFunction ] + ( sArgs.dab_flag ? 00166 ".dab" : ".dat" ); 00167 DatOut.Open( DataMask.GetGeneNames( ) ); 00168 if( !BNFunction.Evaluate( &DataMask, DatOut, !!sArgs.zero_flag ) ) { 00169 cerr << "Could not evaluate: " << strFile << endl; 00170 return 1; } 00171 DatOut.Invert( ); 00172 if( sArgs.cutoff_arg > 0 ) 00173 for( i = 0; i < DatOut.GetGenes( ); ++i ) 00174 for( j = ( i + 1 ); j < DatOut.GetGenes( ); ++j ) 00175 if( DatOut.Get( i, j ) < sArgs.cutoff_arg ) 00176 DatOut.Set( i, j, CMeta::GetNaN( ) ); 00177 DatOut.Save( strFile.c_str( ) ); } 00178 00179 for( i = 0; i < vecpPositives.size( ); ++i ) 00180 delete vecpPositives[ i ]; 00181 00182 return 0; } 00183 00184 int read_genes( const char* szPositives, const char* szNegatives, CGenome& Genome, vector<string>& vecstrPositives, 00185 vector<CGenes*>& vecpPositives, CDataPair& Answers ) { 00186 CGenes* pGenes; 00187 string strDir, strFile, strBase; 00188 CDat DatNegatives; 00189 ifstream ifsm; 00190 set<string> setstrGenes; 00191 size_t i; 00192 00193 cerr << "Reading related gene pairs..."; 00194 strDir = szPositives; 00195 FOR_EACH_DIRECTORY_FILE(strDir, strFile) 00196 strBase = strFile; 00197 strFile = strDir + '/' + strFile; 00198 00199 ifsm.clear( ); 00200 ifsm.open( strFile.c_str( ) ); 00201 pGenes = new CGenes( Genome ); 00202 if( !pGenes->Open( ifsm ) ) { 00203 cerr << "Could not open: " << strFile << endl; 00204 return 1; } 00205 ifsm.close( ); 00206 vecstrPositives.push_back( CMeta::Deextension( strBase ) ); 00207 vecpPositives.push_back( pGenes ); } 00208 cerr << " done" << endl; 00209 00210 cerr << "Reading unrelated gene pairs..."; 00211 ifsm.clear( ); 00212 ifsm.open( szNegatives ); 00213 if( !DatNegatives.Open( ifsm, CDat::EFormatText, 0 ) ) { 00214 cerr << "Could not open: " << szNegatives << endl; 00215 return 1; } 00216 ifsm.close( ); 00217 for( i = 0; i < DatNegatives.GetGenes( ); ++i ) 00218 Genome.AddGene( DatNegatives.GetGene( i ) ); 00219 cerr << " done" << endl; 00220 00221 return ( Answers.Open( DatNegatives, vecpPositives, Genome, true ) ? 0 : 1 ); } 00222 00223 int write_posteriors( const string& strFunction, const CBayesNetSmile& BNFunction, ostream& ostm ) { 00224 size_t i, iNode; 00225 vector<unsigned char> vecbDatum; 00226 vector<float> vecdOut; 00227 float dPrior; 00228 unsigned char bValue; 00229 vector<string> vecstrNodes; 00230 00231 BNFunction.GetNodes( vecstrNodes ); 00232 vecbDatum.resize( vecstrNodes.size( ) ); 00233 for( i = 0; i < vecbDatum.size( ); ++i ) 00234 vecbDatum[ i ] = 0; 00235 00236 BNFunction.Evaluate( vecbDatum, vecdOut, false ); 00237 dPrior = vecdOut[ vecdOut.size( ) - 1 ]; 00238 ostm << strFunction; 00239 for( iNode = 1; iNode < vecstrNodes.size( ); ++iNode ) { 00240 float d; 00241 00242 vecbDatum[ iNode - 1 ] = 0; 00243 vecdOut.clear( ); 00244 for( bValue = 0; bValue < BNFunction.GetValues( iNode ); ++bValue ) { 00245 vecbDatum[ iNode ] = bValue + 1; 00246 BNFunction.Evaluate( vecbDatum, vecdOut, false ); } 00247 00248 d = 0; 00249 for( i = 0; i < vecdOut.size( ); ++i ) 00250 d += fabs( dPrior - vecdOut[ i ] ); 00251 ostm << '\t' << ( d / BNFunction.GetValues( iNode ) ); } 00252 ostm << endl; 00253 00254 return 0; }