Sleipnir
tools/SVMer/SVMer.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_szRBF[]           = "rbf";
00026 static const char   c_szPolynomial[]    = "poly";
00027 
00028 int init_svm( const gengetopt_args_info&, CSVM& );
00029 int main_one( const gengetopt_args_info&, const CPCLSet&, const CDataset&, const CGenes&,
00030     const CGenes&, const CGenes& );
00031 int main_many( const gengetopt_args_info&, const CPCLSet&, const CGenes&, const CGenes&,
00032     const CGenes& );
00033 
00034 int main( int iArgs, char** aszArgs ) {
00035     gengetopt_args_info sArgs;
00036     CPCLSet             PCLs;
00037     CDataset            Data;
00038     vector<string>      vecstrInputs;
00039     ifstream            ifsm;
00040     CGenome             Genome;
00041     CGenes              GenesIn( Genome ), GenesEx( Genome ), GenesTm( Genome );
00042     int                 iRet;
00043 
00044     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00045         cmdline_parser_print_help( );
00046         return 1; }
00047     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00048 
00049     vecstrInputs.resize( sArgs.inputs_num );
00050     copy( sArgs.inputs, sArgs.inputs + sArgs.inputs_num, vecstrInputs.begin( ) );
00051     if( sArgs.pcl_flag ) {
00052         if( !PCLs.Open( vecstrInputs, sArgs.skip_arg, CPCL::ENormalizeRow ) ) {
00053             cerr << "Could not open PCLs" << endl;
00054             return 1; } }
00055     else {
00056         if( !Data.Open( vecstrInputs ) ) {
00057             cerr << "Could not open DATs" << endl;
00058             return 1; } }
00059 
00060     if( sArgs.genes_arg ) {
00061         ifsm.clear( );
00062         ifsm.open( sArgs.genes_arg );
00063         if( !GenesIn.Open( ifsm ) ) {
00064             cerr << "Could not open: " << sArgs.genes_arg << endl;
00065             return 1; }
00066         ifsm.close( ); }
00067     if( sArgs.genex_arg ) {
00068         ifsm.clear( );
00069         ifsm.open( sArgs.genex_arg );
00070         if( !GenesEx.Open( ifsm ) ) {
00071             cerr << "Could not open: " << sArgs.genex_arg << endl;
00072             return 1; }
00073         ifsm.close( ); }
00074     if( sArgs.genet_arg ) {
00075         ifsm.clear( );
00076         ifsm.open( sArgs.genet_arg );
00077         if( !GenesTm.Open( ifsm ) ) {
00078             cerr << "Could not open: " << sArgs.genet_arg << endl;
00079             return 1; }
00080         ifsm.close( ); }
00081 
00082     iRet = sArgs.genewise_flag ? main_many( sArgs, PCLs, GenesIn, GenesEx, GenesTm ) :
00083         main_one( sArgs, PCLs, Data, GenesIn, GenesEx, GenesTm );
00084 
00085     return iRet; }
00086 
00087 int init_svm( const gengetopt_args_info& sArgs, CSVM& SVM ) {
00088     ifstream    ifsm;
00089 
00090     if( sArgs.alphas_arg ) {
00091         ifsm.open( sArgs.alphas_arg );
00092         if( !SVM.OpenAlphas( ifsm ) ) {
00093             cerr << "Could not open: " << sArgs.alphas_arg << endl;
00094             return 1; }
00095         ifsm.close( ); }
00096 
00097     if( !strcmp( sArgs.kernel_arg, c_szRBF ) )
00098         SVM.SetKernel( CSVM::EKernelRBF );
00099     else if( !strcmp( sArgs.kernel_arg, c_szPolynomial ) )
00100         SVM.SetKernel( CSVM::EKernelPolynomial );
00101     else
00102         SVM.SetKernel( CSVM::EKernelLinear );
00103 
00104     SVM.SetCache( sArgs.cache_arg );
00105     SVM.SetIterations( sArgs.iterations_arg );
00106     SVM.SetGamma( sArgs.gamma_arg );
00107     SVM.SetDegree( sArgs.degree_arg );
00108     if( sArgs.tradeoff_given )
00109         SVM.SetTradeoff( sArgs.tradeoff_arg );
00110 
00111     return 0; }
00112 
00113 int main_one( const gengetopt_args_info& sArgs, const CPCLSet& PCLs, const CDataset& Data,
00114     const CGenes& GenesIn, const CGenes& GenesEx, const CGenes& GenesTm ) {
00115     CSVM                SVM;
00116     CDataPair           Answers;
00117     CDat                Dat;
00118     vector<string>      vecstrInputs;
00119     ofstream            ofsm;
00120     ifstream            ifsm;
00121     int                 iRet;
00122 
00123     if( iRet = init_svm( sArgs, SVM ) )
00124         return iRet;
00125     if( sArgs.binary_arg && !sArgs.output_arg ) {
00126         SVM.Learn( sArgs.binary_arg );
00127         if( sArgs.model_arg )
00128             ofsm.open( sArgs.model_arg );
00129         SVM.Save( sArgs.model_arg ? (ostream&)ofsm : cout );
00130         if( sArgs.model_arg )
00131             ofsm.close( );
00132         else
00133             cout.flush( ); }
00134     else if( sArgs.input_arg ) {
00135         if( !Answers.Open( sArgs.input_arg, false ) ) {
00136             cerr << "Could not open: " << sArgs.input_arg << endl;
00137             return 1; }
00138         if( GenesIn.GetGenes( ) )
00139             Answers.FilterGenes( GenesIn, CDat::EFilterInclude );
00140         if( GenesEx.GetGenes( ) )
00141             Answers.FilterGenes( GenesEx, CDat::EFilterExclude );
00142         if( GenesTm.GetGenes( ) )
00143             Answers.FilterGenes( GenesTm, CDat::EFilterTerm );
00144         if( sArgs.pcl_flag )
00145             SVM.Learn( PCLs, Answers );
00146         else
00147             SVM.Learn( &Data, Answers );
00148         if( sArgs.model_arg )
00149             ofsm.open( sArgs.model_arg );
00150         SVM.Save( sArgs.model_arg ? (ostream&)ofsm : cout );
00151         if( sArgs.model_arg )
00152             ofsm.close( );
00153         else
00154             cout.flush( ); }
00155     else if( sArgs.model_arg ) {
00156         ifsm.clear( );
00157         ifsm.open( sArgs.model_arg );
00158         if( !SVM.Open( ifsm ) ) {
00159             cerr << "Could not open: " << sArgs.model_arg << endl;
00160             return 1; }
00161 
00162         if( sArgs.binary_arg )
00163             SVM.Evaluate( sArgs.binary_arg, Dat );
00164         else {
00165             const vector<string>&   vecstrGenes = sArgs.pcl_flag ? PCLs.GetGeneNames( ) :
00166                 Data.GetGeneNames( );
00167 
00168             Dat.Open( vecstrGenes );
00169             if( GenesIn.GetGenes( ) ) {
00170                 if( sArgs.pcl_flag )
00171                     SVM.Evaluate( PCLs, GenesIn, Dat );
00172                 else
00173                     SVM.Evaluate( &Data, GenesIn, Dat ); }
00174             else {
00175                 if( sArgs.pcl_flag )
00176                     SVM.Evaluate( PCLs, Dat );
00177                 else
00178                     SVM.Evaluate( &Data, Dat ); } }
00179         if( sArgs.output_arg )
00180             Dat.Save( sArgs.output_arg ); }
00181     else {
00182         cmdline_parser_print_help( );
00183         return 1; }
00184 
00185     return 0; }
00186 
00187 int main_many( const gengetopt_args_info& sArgs, const CPCLSet& PCLs, const CGenes& GenesIn,
00188     const CGenes& GenesEx, const CGenes& GenesTm ) {
00189     size_t      i, j, iGene;
00190     int         iRet;
00191     CDataPair   Answers;
00192     ofstream    ofsm;
00193     CPCL        PCL;
00194     CGenes      GenesNl( GenesIn.GetGenome( ) );
00195 
00196     if( PCLs.GetPCLs( ) == 0 )
00197         PCL.Open( cin, sArgs.skip_arg );
00198     else {
00199         vector<string>  vecstrExperiments, vecstrFeatures;
00200         size_t          iPCL, iExp;
00201 
00202         for( iPCL = 0; iPCL < PCLs.GetPCLs( ); ++iPCL )
00203             for( iExp = 0; iExp < PCLs.Get( iPCL ).GetExperiments( ); ++iExp )
00204                 vecstrExperiments.push_back( PCLs.Get( iPCL ).GetExperiment( iExp ) );
00205         vecstrFeatures.resize( PCLs.Get( 0 ).GetFeatures( ) - 1 );
00206         for( i = 0; i < vecstrFeatures.size( ); ++i )
00207             vecstrFeatures[ i ] = PCLs.Get( 0 ).GetFeature( i + 1 );
00208         PCL.Open( PCLs.GetGeneNames( ), vecstrExperiments, vecstrFeatures );
00209         for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene )
00210             for( i = iPCL = 0; iPCL < PCLs.GetPCLs( ); ++iPCL )
00211                 for( iExp = 0; iExp < PCLs.Get( iPCL ).GetExperiments( ); ++iExp )
00212                     PCL.Set( iGene, i++, PCLs.Get( iPCL, iGene, iExp ) ); }
00213     if( sArgs.genel_arg ) {
00214         ifstream    ifsm;
00215 
00216         ifsm.open( sArgs.genel_arg );
00217         if( !GenesNl.Open( ifsm ) ) {
00218             cerr << "Could not open: " << sArgs.genel_arg << endl;
00219             return 1; } }
00220 
00221     if( sArgs.input_arg ) {
00222         vector<size_t>  veciGenes;
00223         size_t          iTwo;
00224 
00225         if( !Answers.Open( sArgs.input_arg, false ) ) {
00226             cerr << "Could not open: " << sArgs.input_arg << endl;
00227             return 1; }
00228         if( GenesIn.GetGenes( ) )
00229             Answers.FilterGenes( GenesIn, CDat::EFilterInclude );
00230         if( GenesEx.GetGenes( ) )
00231             Answers.FilterGenes( GenesEx, CDat::EFilterExclude );
00232         if( GenesTm.GetGenes( ) )
00233             Answers.FilterGenes( GenesTm, CDat::EFilterTerm );
00234         veciGenes.resize( PCL.GetGenes( ) );
00235         for( i = 0; i < veciGenes.size( ); ++i )
00236             veciGenes[ i ] = Answers.GetGene( PCL.GetGene( i ) );
00237         for( i = 0; i < PCL.GetGenes( ); ++i ) {
00238             CSVM            SVM;
00239             CGenome         Genome;
00240             CGenes          GenesPos( Genome ), GenesNeg( Genome );
00241             vector<string>  vecstrPos, vecstrNeg;
00242             float           d;
00243 
00244             if( !( i % 100 ) )
00245                 cerr << "Processing gene " << i << '/' << PCL.GetGenes( ) << endl;
00246             if( ( ( iGene = veciGenes[ i ] ) == -1 ) || GenesNl.IsGene( PCL.GetGene( i ) ) ||
00247                 GenesEx.IsGene( PCL.GetGene( i ) ) )
00248                 continue;
00249             for( j = 0; j < PCL.GetGenes( ); ++j )
00250                 if( ( i != j ) && ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00251                     !CMeta::IsNaN( d = Answers.Get( iGene, iTwo ) ) ) {
00252                     if( d > 0 )
00253                         vecstrPos.push_back( PCL.GetGene( j ) );
00254                     else
00255                         vecstrNeg.push_back( PCL.GetGene( j ) ); }
00256             GenesPos.Open( vecstrPos );
00257             GenesNeg.Open( vecstrNeg );
00258 
00259             if( iRet = init_svm( sArgs, SVM ) )
00260                 return iRet;
00261             SVM.Learn( PCL, GenesPos, GenesNeg );
00262             if( sArgs.model_arg ) {
00263                 ofsm.clear( );
00264                 ofsm.open( ( (string)sArgs.model_arg + "/" + CMeta::Filename( PCL.GetGene( i ) ) +
00265                     ".svm" ).c_str( ) ); }
00266             SVM.Save( sArgs.model_arg ? (ostream&)ofsm : cout );
00267             if( sArgs.model_arg )
00268                 ofsm.close( );
00269             else
00270                 cout.flush( ); } }
00271     else if( sArgs.model_arg ) {
00272         CDat    Dat;
00273 
00274         if( GenesIn.GetGenes( ) )
00275             for( i = 0; i < PCL.GetGenes( ); ++i )
00276                 if( !GenesIn.IsGene( PCL.GetGene( i ) ) )
00277                     PCL.MaskGene( i );
00278         if( GenesEx.GetGenes( ) )
00279             for( i = 0; i < GenesEx.GetGenes( ); ++i )
00280                 if( ( iGene = PCL.GetGene( GenesIn.GetGene( i ).GetName( ) ) ) != -1 )
00281                     PCL.MaskGene( iGene );
00282 
00283         Dat.Open( PCL.GetGeneNames( ) );
00284         for( iGene = 0; iGene < Dat.GetGenes( ); ++iGene ) {
00285             CSVM            SVM;
00286             ifstream        ifsm;
00287             string          strFile;
00288             vector<float>   vecdScores;
00289 
00290             if( !( iGene % 100 ) )
00291                 cerr << "Processing gene " << iGene << '/' << Dat.GetGenes( ) << endl;
00292             ifsm.open( ( strFile = (string)sArgs.model_arg + "/" + CMeta::Filename(
00293                 Dat.GetGene( iGene ) ) + ".svm" ).c_str( ) );
00294             if( !( ifsm.is_open( ) && SVM.Open( ifsm ) ) ) {
00295                 cerr << "Could not open: " << strFile << endl;
00296                 continue; }
00297 
00298             SVM.Evaluate( PCL, vecdScores );
00299             for( i = j = 0; i < PCL.GetGenes( ); ++i )
00300                 if( !PCL.IsMasked( i ) )
00301                     Dat.Set( iGene, i, vecdScores[ j++ ] ); }
00302         if( sArgs.output_arg )
00303             Dat.Save( sArgs.output_arg );
00304         else
00305             Dat.Save( cout, CDat::EFormatText ); }
00306     else {
00307         cmdline_parser_print_help( );
00308         return 1; }
00309 
00310     return 0; }