Sleipnir
tools/MCluster/MCluster.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 const char  c_szDab[]   = "dab";
00026 
00027 int main( int iArgs, char** aszArgs ) {
00028     CHierarchy*                 pHier;
00029     CPCL                        PCL, Weights;
00030     const CPCL*                 pPCL;
00031     CDat                        Dat;
00032     size_t                      i, j, k, iGene, iGenes;
00033     float                       d;
00034     ofstream                    ofsm;
00035     ifstream                    ifsm;
00036     vector<size_t>              veciGenes, veciPCL, veciEpsilon;
00037     vector<float>               vecdPCL;
00038     vector<string>              vecstrGenes, vecstrMissing;
00039     vector<bool>                vecfGenes;
00040     gengetopt_args_info         sArgs;
00041     IMeasure*                   pMeasure;
00042     CMeasurePearson             Pearson;
00043     CMeasureEuclidean           Euclidean;
00044     CMeasureKendallsTau         KendallsTau;
00045     CMeasureKolmogorovSmirnov   KolmSmir;
00046     CMeasureSpearman            Spearman( true );
00047     CMeasureNegate              EuclideanNeg( &Euclidean, false );
00048     IMeasure*                   apMeasures[]    = { &Pearson, &EuclideanNeg, &KendallsTau,
00049         &KolmSmir, &Spearman, NULL };
00050 
00051     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00052         cmdline_parser_print_help( );
00053         return 1; }
00054     CMeta Meta( sArgs.verbosity_arg );
00055 
00056     if( sArgs.input_arg ) {
00057         ifsm.open( sArgs.input_arg );
00058         if( !strstr( sArgs.input_arg, c_szDab ) && PCL.Open( ifsm, sArgs.skip_arg ) ) {
00059             ifsm.close( );
00060             cerr << "Opened PCL: " << sArgs.input_arg << endl; }
00061         else {
00062             ifsm.close( );
00063             if( !Dat.Open( sArgs.input_arg ) ) {
00064                 cerr << "Could not open input: " << sArgs.input_arg << endl;
00065                 return 1; }
00066             if( !PCL.Open( cin, sArgs.skip_arg ) ) {
00067                 cerr << "Could not open PCL" << endl;
00068                 return 1; } } }
00069     else if( PCL.Open( cin, sArgs.skip_arg ) )
00070         cerr << "Opened PCL" << endl;
00071     else {
00072         cerr << "Could not open PCL" << endl;
00073         return 1; }
00074 
00075     if( !Dat.GetGenes( ) ) {
00076         pMeasure = NULL;
00077         for( i = 0; apMeasures[ i ]; ++i )
00078             if( !strcmp( apMeasures[ i ]->GetName( ), sArgs.distance_arg ) ) {
00079                 if( ( pMeasure = apMeasures[ i ] ) == &EuclideanNeg )
00080                     sArgs.normalize_flag = true;
00081                 break; }
00082         if( !pMeasure ) {
00083             cmdline_parser_print_help( );
00084             return 1; }
00085 
00086         if( sArgs.weights_arg ) {
00087             ifsm.clear( );
00088             ifsm.open( sArgs.weights_arg );
00089             if( !Weights.Open( ifsm, sArgs.skip_arg ) ) {
00090                 cerr << "Couldn't open: " << sArgs.weights_arg << endl;
00091                 return 1; }
00092             ifsm.close( );
00093 
00094             if( ( Weights.GetExperiments( ) != PCL.GetExperiments( ) ) ||
00095                 ( Weights.GetGenes( ) != PCL.GetGenes( ) ) ) {
00096                 cerr << "Illegal data sizes: " << PCL.GetExperiments( ) << 'x' << PCL.GetGenes( ) << ", " <<
00097                     Weights.GetExperiments( ) << 'x' << Weights.GetGenes( ) << endl;
00098                 return 1; } }
00099 
00100         {
00101             CPCL    Ranks;
00102 
00103             if( pMeasure->IsRank( ) ) {
00104                 Ranks.Open( PCL );
00105                 Ranks.RankTransform( );
00106                 pPCL = &Ranks; }
00107             else
00108                 pPCL = &PCL;
00109             Dat.Open( PCL.GetGeneNames( ) );
00110             for( i = 0; i < PCL.GetGenes( ); ++i )
00111                 for( j = ( i + 1 ); j < PCL.GetGenes( ); ++j )
00112                     Dat.Set( i, j, (float)pMeasure->Measure( pPCL->Get( i ),
00113                         pPCL->GetExperiments( ), pPCL->Get( j ), pPCL->GetExperiments( ),
00114                         IMeasure::EMapCenter, sArgs.weights_arg ? Weights.Get( i ) : NULL,
00115                         sArgs.weights_arg ? Weights.Get( j ) : NULL ) );
00116         } }
00117 
00118     for( i = 0; i < Dat.GetGenes( ); ++i ) {
00119         if( PCL.GetGene( Dat.GetGene( i ) ) == -1 )
00120             vecstrMissing.push_back( Dat.GetGene( i ) );
00121         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00122             if( CMeta::IsNaN( Dat.Get( i, j ) ) )
00123                 Dat.Set( i, j, 0 ); }
00124     if( !vecstrMissing.empty( ) && !PCL.AddGenes( vecstrMissing ) ) {
00125         cerr << "Couldn't reconcile data" << endl;
00126         return 1; }
00127     if( sArgs.normalize_flag )
00128         Dat.Normalize( CDat::ENormalizeMinMax );
00129     if( sArgs.power_arg != 1 ) {
00130         for( i = 0; i < Dat.GetGenes( ); ++i )
00131             for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00132                 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) )
00133                     Dat.Set( i, j, pow( d, (float)sArgs.power_arg ) );
00134         Dat.Normalize( CDat::ENormalizeMinMax ); }
00135     if( sArgs.flip_flag )
00136         Dat.Invert( );
00137 
00138     if( sArgs.epsilon_given ) {
00139         vecfGenes.resize( Dat.GetGenes( ) );
00140         for( i = 0; i < vecfGenes.size( ); ++i )
00141             for( j = ( i + 1 ); j < vecfGenes.size( ); ++j )
00142                 if( Dat.Get( i, j ) > sArgs.epsilon_arg )
00143                     vecfGenes[ i ] = vecfGenes[ j ] = true;
00144         for( iGenes = i = 0; i < vecfGenes.size( ); ++i )
00145             if( vecfGenes[ i ] )
00146                 iGenes++;
00147         veciEpsilon.resize( iGenes );
00148         for( j = i = 0; i < vecfGenes.size( ); ++i )
00149             if( vecfGenes[ i ] )
00150                 veciEpsilon[ j++ ] = i; }
00151     else
00152         iGenes = Dat.GetGenes( );
00153 
00154     pHier = sArgs.epsilon_given ? CClustHierarchical::Cluster( Dat.Get( ), vecfGenes ) :
00155         CClustHierarchical::Cluster( Dat.Get( ) );
00156 
00157     vecdPCL.resize( iGenes );
00158     for( k = i = 0; i < Dat.GetGenes( ); ++i ) {
00159         if( sArgs.epsilon_given && !vecfGenes[ i ] )
00160             continue;
00161         if( ( iGene = PCL.GetGene( Dat.GetGene( i ) ) ) == -1 ) {
00162             vecdPCL[ k++ ] = CMeta::GetNaN( );
00163             continue; }
00164         vecdPCL[ k ] = 0;
00165         for( j = 0; j < PCL.GetExperiments( ); ++j )
00166             vecdPCL[ k ] += PCL.Get( iGene, j );
00167         vecdPCL[ k++ ] /= PCL.GetExperiments( ); }
00168     pHier->SortChildren( vecdPCL );
00169 
00170     pHier->GetGenes( veciGenes );
00171     veciPCL.resize( PCL.GetGenes( ) );
00172     for( i = 0; i < veciGenes.size( ); ++i )
00173         veciPCL[ i ] = PCL.GetGene( Dat.GetGene(
00174             sArgs.epsilon_given ? veciEpsilon[ veciGenes[ i ] ] : veciGenes[ i ] ) );
00175     for( j = 0; j < PCL.GetGenes( ); ++j )
00176         if( ( ( k = Dat.GetGene( PCL.GetGene( j ) ) ) == -1 ) ||
00177             ( sArgs.epsilon_given && !vecfGenes[ k ] ) )
00178             veciPCL[ i++ ] = j;
00179     PCL.SortGenes( veciPCL );
00180     for( i = 0; i < veciGenes.size( ); ++i )
00181         veciPCL[ i ] = veciGenes[ i ];
00182     for( ; i < veciPCL.size( ); ++i )
00183         veciPCL[ i ] = i;
00184     if( sArgs.epsilon_given )
00185         for( i = iGenes; i < PCL.GetGenes( ); ++i )
00186             PCL.MaskGene( i );
00187 
00188     ofsm.open( sArgs.output_arg );
00189     pHier->Save( ofsm, Dat.GetGenes( ) );
00190     ofsm.close( );
00191     pHier->Destroy( );
00192     PCL.Save( cout, &veciPCL );
00193     cout.flush( );
00194 
00195     return 0; }