Sleipnir
tools/Data2Svm/Data2Svm.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
// Kernel names that main() compares against the kernel command-line argument.
// A value matching neither string selects the linear kernel.
namespace {

const char c_szRBF[] = "rbf";
const char c_szPolynomial[] = "poly";

}
00027 
00028 int main( int iArgs, char** aszArgs ) {
00029     CPCL                Data;
00030     CSVM                SVM;
00031     ifstream            ifsm;
00032     ofstream            ofsm;
00033     CGenome             Genome;
00034     CGenes              Genes( Genome ), GenesEx( Genome );
00035     gengetopt_args_info sArgs;
00036     vector<float>       vecdResults;
00037     size_t              i, j;
00038     float               dAve, dStd;
00039 
00040     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00041         cmdline_parser_print_help( );
00042         return 1; }
00043     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00044 
00045     ifsm.open( sArgs.input_arg );
00046     if( !Data.Open( ifsm, sArgs.skip_arg ) ) {
00047         cerr << "Could not open: " << sArgs.input_arg << endl;
00048         return 1; }
00049     ifsm.close( );
00050     if( sArgs.normalize_flag )
00051         Data.Normalize( CPCL::ENormalizeZScore );
00052     if( sArgs.random_features_flag )
00053         Data.Randomize( );
00054 
00055     if( sArgs.genes_arg ) {
00056         ifsm.clear( );
00057         ifsm.open( sArgs.genes_arg ); }
00058     if( !Genes.Open( sArgs.genes_arg ? ifsm : cin ) ) {
00059         cerr << "Could not open: " << ( sArgs.genes_arg ? sArgs.genes_arg : "gene input" ) << endl;
00060         return 1; }
00061     if( sArgs.genes_arg )
00062         ifsm.close( );
00063 
00064     if( sArgs.genex_arg ) {
00065         ifsm.clear( );
00066         ifsm.open( sArgs.genex_arg );
00067         if( !GenesEx.Open( ifsm ) ) {
00068             cerr << "Could not open: " << sArgs.genex_arg << endl;
00069             return 1; }
00070         ifsm.close( );
00071 
00072         for( i = 0; i < GenesEx.GetGenes( ); ++i )
00073             if( ( j = Data.GetGene( GenesEx.GetGene( i ).GetName( ) ) ) != -1 )
00074                 Data.MaskGene( j ); }
00075 
00076     if( sArgs.alphas_arg ) {
00077         ifsm.clear( );
00078         ifsm.open( sArgs.alphas_arg );
00079         if( !SVM.OpenAlphas( ifsm ) ) {
00080             cerr << "Could not open: " << sArgs.alphas_arg << endl;
00081             return 1; }
00082         ifsm.close( ); }
00083 
00084     if( !strcmp( sArgs.kernel_arg, c_szRBF ) )
00085         SVM.SetKernel( CSVM::EKernelRBF );
00086     else if( !strcmp( sArgs.kernel_arg, c_szPolynomial ) )
00087         SVM.SetKernel( CSVM::EKernelPolynomial );
00088     else
00089         SVM.SetKernel( CSVM::EKernelLinear );
00090 
00091     SVM.SetCache( sArgs.cache_arg );
00092     SVM.SetIterations( sArgs.iterations_arg );
00093     SVM.SetGamma( sArgs.gamma_arg );
00094     SVM.SetDegree( sArgs.degree_arg );
00095     if( sArgs.tradeoff_given )
00096         SVM.SetTradeoff( sArgs.tradeoff_arg );
00097     SVM.SetVerbosity( 0 );
00098 
00099     SVM.Learn( Data, Genes );
00100     if( sArgs.model_arg ) {
00101         ofsm.open( sArgs.model_arg );
00102         SVM.Save( sArgs.model_arg ? (ostream&)ofsm : cout );
00103         ofsm.close( ); }
00104 
00105     if( sArgs.heldout_flag )
00106         for( i = 0; i < Data.GetGenes( ); ++i )
00107             Data.MaskGene( i, !Data.IsMasked( i ) );
00108     SVM.Evaluate( Data, vecdResults );
00109     if( sArgs.random_output_flag )
00110         random_shuffle( vecdResults.begin( ), vecdResults.end( ) );
00111 
00112     dAve = (float)CStatistics::Average( vecdResults );
00113     dStd = (float)sqrt( CStatistics::Variance( vecdResults, dAve ) );
00114     for( i = 0; i < vecdResults.size( ); ++i )
00115         vecdResults[ i ] = ( vecdResults[ i ] - dAve ) / dStd;
00116     for( i = j = 0; i < Data.GetGenes( ); ++i )
00117         if( !Data.IsMasked( i ) )
00118             cout << Data.GetGene( i ) << '\t' << vecdResults[ j++ ] << endl;
00119 
00120     return 0; }