Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_szRBF[] = "rbf"; 00026 static const char c_szPolynomial[] = "poly"; 00027 00028 int init_svm( const gengetopt_args_info&, CSVM& ); 00029 int main_one( const gengetopt_args_info&, const CPCLSet&, const CDataset&, const CGenes&, 00030 const CGenes&, const CGenes& ); 00031 int main_many( const gengetopt_args_info&, const CPCLSet&, const CGenes&, const CGenes&, 00032 const CGenes& ); 00033 00034 int main( int iArgs, char** aszArgs ) { 00035 gengetopt_args_info sArgs; 00036 CPCLSet PCLs; 00037 CDataset Data; 00038 vector<string> vecstrInputs; 00039 ifstream ifsm; 00040 CGenome Genome; 00041 CGenes GenesIn( Genome ), GenesEx( Genome ), GenesTm( Genome ); 00042 int iRet; 00043 00044 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00045 cmdline_parser_print_help( ); 00046 return 1; } 00047 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00048 00049 vecstrInputs.resize( sArgs.inputs_num ); 00050 copy( sArgs.inputs, sArgs.inputs + sArgs.inputs_num, vecstrInputs.begin( ) ); 00051 if( sArgs.pcl_flag ) { 00052 if( !PCLs.Open( vecstrInputs, sArgs.skip_arg, CPCL::ENormalizeRow ) ) { 00053 cerr << "Could not open PCLs" << endl; 00054 return 1; } } 00055 else { 00056 if( !Data.Open( vecstrInputs ) ) { 00057 cerr << "Could not open DATs" << endl; 00058 return 1; } } 00059 00060 if( sArgs.genes_arg ) { 00061 ifsm.clear( ); 00062 ifsm.open( sArgs.genes_arg ); 00063 if( !GenesIn.Open( ifsm ) ) { 00064 cerr << "Could not open: " << sArgs.genes_arg << endl; 00065 return 1; } 00066 ifsm.close( ); } 00067 if( sArgs.genex_arg ) { 00068 ifsm.clear( ); 00069 ifsm.open( sArgs.genex_arg ); 00070 if( !GenesEx.Open( ifsm ) ) { 00071 cerr << "Could not open: " << sArgs.genex_arg << endl; 00072 return 1; } 00073 ifsm.close( ); } 00074 if( sArgs.genet_arg ) { 00075 ifsm.clear( ); 00076 ifsm.open( sArgs.genet_arg ); 00077 if( !GenesTm.Open( ifsm ) ) { 00078 cerr << "Could not open: " << sArgs.genet_arg << endl; 00079 return 1; } 00080 ifsm.close( ); } 00081 00082 iRet = sArgs.genewise_flag ? main_many( sArgs, PCLs, GenesIn, GenesEx, GenesTm ) : 00083 main_one( sArgs, PCLs, Data, GenesIn, GenesEx, GenesTm ); 00084 00085 return iRet; } 00086 00087 int init_svm( const gengetopt_args_info& sArgs, CSVM& SVM ) { 00088 ifstream ifsm; 00089 00090 if( sArgs.alphas_arg ) { 00091 ifsm.open( sArgs.alphas_arg ); 00092 if( !SVM.OpenAlphas( ifsm ) ) { 00093 cerr << "Could not open: " << sArgs.alphas_arg << endl; 00094 return 1; } 00095 ifsm.close( ); } 00096 00097 if( !strcmp( sArgs.kernel_arg, c_szRBF ) ) 00098 SVM.SetKernel( CSVM::EKernelRBF ); 00099 else if( !strcmp( sArgs.kernel_arg, c_szPolynomial ) ) 00100 SVM.SetKernel( CSVM::EKernelPolynomial ); 00101 else 00102 SVM.SetKernel( CSVM::EKernelLinear ); 00103 00104 SVM.SetCache( sArgs.cache_arg ); 00105 SVM.SetIterations( sArgs.iterations_arg ); 00106 SVM.SetGamma( sArgs.gamma_arg ); 00107 SVM.SetDegree( sArgs.degree_arg ); 00108 if( sArgs.tradeoff_given ) 00109 SVM.SetTradeoff( sArgs.tradeoff_arg ); 00110 00111 return 0; } 00112 00113 int main_one( const gengetopt_args_info& sArgs, const CPCLSet& PCLs, const CDataset& Data, 00114 const CGenes& GenesIn, const CGenes& GenesEx, const CGenes& GenesTm ) { 00115 CSVM SVM; 00116 CDataPair Answers; 00117 CDat Dat; 00118 vector<string> vecstrInputs; 00119 ofstream ofsm; 00120 ifstream ifsm; 00121 int iRet; 00122 00123 if( iRet = init_svm( sArgs, SVM ) ) 00124 return iRet; 00125 if( sArgs.binary_arg && !sArgs.output_arg ) { 00126 SVM.Learn( sArgs.binary_arg ); 00127 if( sArgs.model_arg ) 00128 ofsm.open( sArgs.model_arg ); 00129 SVM.Save( sArgs.model_arg ? (ostream&)ofsm : cout ); 00130 if( sArgs.model_arg ) 00131 ofsm.close( ); 00132 else 00133 cout.flush( ); } 00134 else if( sArgs.input_arg ) { 00135 if( !Answers.Open( sArgs.input_arg, false ) ) { 00136 cerr << "Could not open: " << sArgs.input_arg << endl; 00137 return 1; } 00138 if( GenesIn.GetGenes( ) ) 00139 Answers.FilterGenes( GenesIn, CDat::EFilterInclude ); 00140 if( GenesEx.GetGenes( ) ) 00141 Answers.FilterGenes( GenesEx, CDat::EFilterExclude ); 00142 if( GenesTm.GetGenes( ) ) 00143 Answers.FilterGenes( GenesTm, CDat::EFilterTerm ); 00144 if( sArgs.pcl_flag ) 00145 SVM.Learn( PCLs, Answers ); 00146 else 00147 SVM.Learn( &Data, Answers ); 00148 if( sArgs.model_arg ) 00149 ofsm.open( sArgs.model_arg ); 00150 SVM.Save( sArgs.model_arg ? (ostream&)ofsm : cout ); 00151 if( sArgs.model_arg ) 00152 ofsm.close( ); 00153 else 00154 cout.flush( ); } 00155 else if( sArgs.model_arg ) { 00156 ifsm.clear( ); 00157 ifsm.open( sArgs.model_arg ); 00158 if( !SVM.Open( ifsm ) ) { 00159 cerr << "Could not open: " << sArgs.model_arg << endl; 00160 return 1; } 00161 00162 if( sArgs.binary_arg ) 00163 SVM.Evaluate( sArgs.binary_arg, Dat ); 00164 else { 00165 const vector<string>& vecstrGenes = sArgs.pcl_flag ? PCLs.GetGeneNames( ) : 00166 Data.GetGeneNames( ); 00167 00168 Dat.Open( vecstrGenes ); 00169 if( GenesIn.GetGenes( ) ) { 00170 if( sArgs.pcl_flag ) 00171 SVM.Evaluate( PCLs, GenesIn, Dat ); 00172 else 00173 SVM.Evaluate( &Data, GenesIn, Dat ); } 00174 else { 00175 if( sArgs.pcl_flag ) 00176 SVM.Evaluate( PCLs, Dat ); 00177 else 00178 SVM.Evaluate( &Data, Dat ); } } 00179 if( sArgs.output_arg ) 00180 Dat.Save( sArgs.output_arg ); } 00181 else { 00182 cmdline_parser_print_help( ); 00183 return 1; } 00184 00185 return 0; } 00186 00187 int main_many( const gengetopt_args_info& sArgs, const CPCLSet& PCLs, const CGenes& GenesIn, 00188 const CGenes& GenesEx, const CGenes& GenesTm ) { 00189 size_t i, j, iGene; 00190 int iRet; 00191 CDataPair Answers; 00192 ofstream ofsm; 00193 CPCL PCL; 00194 CGenes GenesNl( GenesIn.GetGenome( ) ); 00195 00196 if( PCLs.GetPCLs( ) == 0 ) 00197 PCL.Open( cin, sArgs.skip_arg ); 00198 else { 00199 vector<string> vecstrExperiments, vecstrFeatures; 00200 size_t iPCL, iExp; 00201 00202 for( iPCL = 0; iPCL < PCLs.GetPCLs( ); ++iPCL ) 00203 for( iExp = 0; iExp < PCLs.Get( iPCL ).GetExperiments( ); ++iExp ) 00204 vecstrExperiments.push_back( PCLs.Get( iPCL ).GetExperiment( iExp ) ); 00205 vecstrFeatures.resize( PCLs.Get( 0 ).GetFeatures( ) - 1 ); 00206 for( i = 0; i < vecstrFeatures.size( ); ++i ) 00207 vecstrFeatures[ i ] = PCLs.Get( 0 ).GetFeature( i + 1 ); 00208 PCL.Open( PCLs.GetGeneNames( ), vecstrExperiments, vecstrFeatures ); 00209 for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene ) 00210 for( i = iPCL = 0; iPCL < PCLs.GetPCLs( ); ++iPCL ) 00211 for( iExp = 0; iExp < PCLs.Get( iPCL ).GetExperiments( ); ++iExp ) 00212 PCL.Set( iGene, i++, PCLs.Get( iPCL, iGene, iExp ) ); } 00213 if( sArgs.genel_arg ) { 00214 ifstream ifsm; 00215 00216 ifsm.open( sArgs.genel_arg ); 00217 if( !GenesNl.Open( ifsm ) ) { 00218 cerr << "Could not open: " << sArgs.genel_arg << endl; 00219 return 1; } } 00220 00221 if( sArgs.input_arg ) { 00222 vector<size_t> veciGenes; 00223 size_t iTwo; 00224 00225 if( !Answers.Open( sArgs.input_arg, false ) ) { 00226 cerr << "Could not open: " << sArgs.input_arg << endl; 00227 return 1; } 00228 if( GenesIn.GetGenes( ) ) 00229 Answers.FilterGenes( GenesIn, CDat::EFilterInclude ); 00230 if( GenesEx.GetGenes( ) ) 00231 Answers.FilterGenes( GenesEx, CDat::EFilterExclude ); 00232 if( GenesTm.GetGenes( ) ) 00233 Answers.FilterGenes( GenesTm, CDat::EFilterTerm ); 00234 veciGenes.resize( PCL.GetGenes( ) ); 00235 for( i = 0; i < veciGenes.size( ); ++i ) 00236 veciGenes[ i ] = Answers.GetGene( PCL.GetGene( i ) ); 00237 for( i = 0; i < PCL.GetGenes( ); ++i ) { 00238 CSVM SVM; 00239 CGenome Genome; 00240 CGenes GenesPos( Genome ), GenesNeg( Genome ); 00241 vector<string> vecstrPos, vecstrNeg; 00242 float d; 00243 00244 if( !( i % 100 ) ) 00245 cerr << "Processing gene " << i << '/' << PCL.GetGenes( ) << endl; 00246 if( ( ( iGene = veciGenes[ i ] ) == -1 ) || GenesNl.IsGene( PCL.GetGene( i ) ) || 00247 GenesEx.IsGene( PCL.GetGene( i ) ) ) 00248 continue; 00249 for( j = 0; j < PCL.GetGenes( ); ++j ) 00250 if( ( i != j ) && ( ( iTwo = veciGenes[ j ] ) != -1 ) && 00251 !CMeta::IsNaN( d = Answers.Get( iGene, iTwo ) ) ) { 00252 if( d > 0 ) 00253 vecstrPos.push_back( PCL.GetGene( j ) ); 00254 else 00255 vecstrNeg.push_back( PCL.GetGene( j ) ); } 00256 GenesPos.Open( vecstrPos ); 00257 GenesNeg.Open( vecstrNeg ); 00258 00259 if( iRet = init_svm( sArgs, SVM ) ) 00260 return iRet; 00261 SVM.Learn( PCL, GenesPos, GenesNeg ); 00262 if( sArgs.model_arg ) { 00263 ofsm.clear( ); 00264 ofsm.open( ( (string)sArgs.model_arg + "/" + CMeta::Filename( PCL.GetGene( i ) ) + 00265 ".svm" ).c_str( ) ); } 00266 SVM.Save( sArgs.model_arg ? (ostream&)ofsm : cout ); 00267 if( sArgs.model_arg ) 00268 ofsm.close( ); 00269 else 00270 cout.flush( ); } } 00271 else if( sArgs.model_arg ) { 00272 CDat Dat; 00273 00274 if( GenesIn.GetGenes( ) ) 00275 for( i = 0; i < PCL.GetGenes( ); ++i ) 00276 if( !GenesIn.IsGene( PCL.GetGene( i ) ) ) 00277 PCL.MaskGene( i ); 00278 if( GenesEx.GetGenes( ) ) 00279 for( i = 0; i < GenesEx.GetGenes( ); ++i ) 00280 if( ( iGene = PCL.GetGene( GenesIn.GetGene( i ).GetName( ) ) ) != -1 ) 00281 PCL.MaskGene( iGene ); 00282 00283 Dat.Open( PCL.GetGeneNames( ) ); 00284 for( iGene = 0; iGene < Dat.GetGenes( ); ++iGene ) { 00285 CSVM SVM; 00286 ifstream ifsm; 00287 string strFile; 00288 vector<float> vecdScores; 00289 00290 if( !( iGene % 100 ) ) 00291 cerr << "Processing gene " << iGene << '/' << Dat.GetGenes( ) << endl; 00292 ifsm.open( ( strFile = (string)sArgs.model_arg + "/" + CMeta::Filename( 00293 Dat.GetGene( iGene ) ) + ".svm" ).c_str( ) ); 00294 if( !( ifsm.is_open( ) && SVM.Open( ifsm ) ) ) { 00295 cerr << "Could not open: " << strFile << endl; 00296 continue; } 00297 00298 SVM.Evaluate( PCL, vecdScores ); 00299 for( i = j = 0; i < PCL.GetGenes( ); ++i ) 00300 if( !PCL.IsMasked( i ) ) 00301 Dat.Set( iGene, i, vecdScores[ j++ ] ); } 00302 if( sArgs.output_arg ) 00303 Dat.Save( sArgs.output_arg ); 00304 else 00305 Dat.Save( cout, CDat::EFormatText ); } 00306 else { 00307 cmdline_parser_print_help( ); 00308 return 1; } 00309 00310 return 0; }