Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 const char c_szDab[] = "dab"; 00026 00027 int main( int iArgs, char** aszArgs ) { 00028 CHierarchy* pHier; 00029 CPCL PCL, Weights; 00030 const CPCL* pPCL; 00031 CDat Dat; 00032 size_t i, j, k, iGene, iGenes; 00033 float d; 00034 ofstream ofsm; 00035 ifstream ifsm; 00036 vector<size_t> veciGenes, veciPCL, veciEpsilon; 00037 vector<float> vecdPCL; 00038 vector<string> vecstrGenes, vecstrMissing; 00039 vector<bool> vecfGenes; 00040 gengetopt_args_info sArgs; 00041 IMeasure* pMeasure; 00042 CMeasurePearson Pearson; 00043 CMeasureEuclidean Euclidean; 00044 CMeasureKendallsTau KendallsTau; 00045 CMeasureKolmogorovSmirnov KolmSmir; 00046 CMeasureSpearman Spearman( true ); 00047 CMeasureNegate EuclideanNeg( &Euclidean, false ); 00048 IMeasure* apMeasures[] = { &Pearson, &EuclideanNeg, &KendallsTau, 00049 &KolmSmir, &Spearman, NULL }; 00050 00051 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00052 cmdline_parser_print_help( ); 00053 return 1; } 00054 CMeta Meta( sArgs.verbosity_arg ); 00055 00056 if( sArgs.input_arg ) { 00057 ifsm.open( sArgs.input_arg ); 00058 if( !strstr( sArgs.input_arg, c_szDab ) && PCL.Open( ifsm, sArgs.skip_arg ) ) { 00059 ifsm.close( ); 00060 cerr << "Opened PCL: " << sArgs.input_arg << endl; } 00061 else { 00062 ifsm.close( ); 00063 if( !Dat.Open( sArgs.input_arg ) ) { 00064 cerr << "Could not open input: " << sArgs.input_arg << endl; 00065 return 1; } 00066 if( !PCL.Open( cin, sArgs.skip_arg ) ) { 00067 cerr << "Could not open PCL" << endl; 00068 return 1; } } } 00069 else if( PCL.Open( cin, sArgs.skip_arg ) ) 00070 cerr << "Opened PCL" << endl; 00071 else { 00072 cerr << "Could not open PCL" << endl; 00073 return 1; } 00074 00075 if( !Dat.GetGenes( ) ) { 00076 pMeasure = NULL; 00077 for( i = 0; apMeasures[ i ]; ++i ) 00078 if( !strcmp( apMeasures[ i ]->GetName( ), sArgs.distance_arg ) ) { 00079 if( ( pMeasure = apMeasures[ i ] ) == &EuclideanNeg ) 00080 sArgs.normalize_flag = true; 00081 break; } 00082 if( !pMeasure ) { 00083 cmdline_parser_print_help( ); 00084 return 1; } 00085 00086 if( sArgs.weights_arg ) { 00087 ifsm.clear( ); 00088 ifsm.open( sArgs.weights_arg ); 00089 if( !Weights.Open( ifsm, sArgs.skip_arg ) ) { 00090 cerr << "Couldn't open: " << sArgs.weights_arg << endl; 00091 return 1; } 00092 ifsm.close( ); 00093 00094 if( ( Weights.GetExperiments( ) != PCL.GetExperiments( ) ) || 00095 ( Weights.GetGenes( ) != PCL.GetGenes( ) ) ) { 00096 cerr << "Illegal data sizes: " << PCL.GetExperiments( ) << 'x' << PCL.GetGenes( ) << ", " << 00097 Weights.GetExperiments( ) << 'x' << Weights.GetGenes( ) << endl; 00098 return 1; } } 00099 00100 { 00101 CPCL Ranks; 00102 00103 if( pMeasure->IsRank( ) ) { 00104 Ranks.Open( PCL ); 00105 Ranks.RankTransform( ); 00106 pPCL = &Ranks; } 00107 else 00108 pPCL = &PCL; 00109 Dat.Open( PCL.GetGeneNames( ) ); 00110 for( i = 0; i < PCL.GetGenes( ); ++i ) 00111 for( j = ( i + 1 ); j < PCL.GetGenes( ); ++j ) 00112 Dat.Set( i, j, (float)pMeasure->Measure( pPCL->Get( i ), 00113 pPCL->GetExperiments( ), pPCL->Get( j ), pPCL->GetExperiments( ), 00114 IMeasure::EMapCenter, sArgs.weights_arg ? Weights.Get( i ) : NULL, 00115 sArgs.weights_arg ? Weights.Get( j ) : NULL ) ); 00116 } } 00117 00118 for( i = 0; i < Dat.GetGenes( ); ++i ) { 00119 if( PCL.GetGene( Dat.GetGene( i ) ) == -1 ) 00120 vecstrMissing.push_back( Dat.GetGene( i ) ); 00121 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00122 if( CMeta::IsNaN( Dat.Get( i, j ) ) ) 00123 Dat.Set( i, j, 0 ); } 00124 if( !vecstrMissing.empty( ) && !PCL.AddGenes( vecstrMissing ) ) { 00125 cerr << "Couldn't reconcile data" << endl; 00126 return 1; } 00127 if( sArgs.normalize_flag ) 00128 Dat.Normalize( CDat::ENormalizeMinMax ); 00129 if( sArgs.power_arg != 1 ) { 00130 for( i = 0; i < Dat.GetGenes( ); ++i ) 00131 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00132 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) 00133 Dat.Set( i, j, pow( d, (float)sArgs.power_arg ) ); 00134 Dat.Normalize( CDat::ENormalizeMinMax ); } 00135 if( sArgs.flip_flag ) 00136 Dat.Invert( ); 00137 00138 if( sArgs.epsilon_given ) { 00139 vecfGenes.resize( Dat.GetGenes( ) ); 00140 for( i = 0; i < vecfGenes.size( ); ++i ) 00141 for( j = ( i + 1 ); j < vecfGenes.size( ); ++j ) 00142 if( Dat.Get( i, j ) > sArgs.epsilon_arg ) 00143 vecfGenes[ i ] = vecfGenes[ j ] = true; 00144 for( iGenes = i = 0; i < vecfGenes.size( ); ++i ) 00145 if( vecfGenes[ i ] ) 00146 iGenes++; 00147 veciEpsilon.resize( iGenes ); 00148 for( j = i = 0; i < vecfGenes.size( ); ++i ) 00149 if( vecfGenes[ i ] ) 00150 veciEpsilon[ j++ ] = i; } 00151 else 00152 iGenes = Dat.GetGenes( ); 00153 00154 pHier = sArgs.epsilon_given ? CClustHierarchical::Cluster( Dat.Get( ), vecfGenes ) : 00155 CClustHierarchical::Cluster( Dat.Get( ) ); 00156 00157 vecdPCL.resize( iGenes ); 00158 for( k = i = 0; i < Dat.GetGenes( ); ++i ) { 00159 if( sArgs.epsilon_given && !vecfGenes[ i ] ) 00160 continue; 00161 if( ( iGene = PCL.GetGene( Dat.GetGene( i ) ) ) == -1 ) { 00162 vecdPCL[ k++ ] = CMeta::GetNaN( ); 00163 continue; } 00164 vecdPCL[ k ] = 0; 00165 for( j = 0; j < PCL.GetExperiments( ); ++j ) 00166 vecdPCL[ k ] += PCL.Get( iGene, j ); 00167 vecdPCL[ k++ ] /= PCL.GetExperiments( ); } 00168 pHier->SortChildren( vecdPCL ); 00169 00170 pHier->GetGenes( veciGenes ); 00171 veciPCL.resize( PCL.GetGenes( ) ); 00172 for( i = 0; i < veciGenes.size( ); ++i ) 00173 veciPCL[ i ] = PCL.GetGene( Dat.GetGene( 00174 sArgs.epsilon_given ? veciEpsilon[ veciGenes[ i ] ] : veciGenes[ i ] ) ); 00175 for( j = 0; j < PCL.GetGenes( ); ++j ) 00176 if( ( ( k = Dat.GetGene( PCL.GetGene( j ) ) ) == -1 ) || 00177 ( sArgs.epsilon_given && !vecfGenes[ k ] ) ) 00178 veciPCL[ i++ ] = j; 00179 PCL.SortGenes( veciPCL ); 00180 for( i = 0; i < veciGenes.size( ); ++i ) 00181 veciPCL[ i ] = veciGenes[ i ]; 00182 for( ; i < veciPCL.size( ); ++i ) 00183 veciPCL[ i ] = i; 00184 if( sArgs.epsilon_given ) 00185 for( i = iGenes; i < PCL.GetGenes( ); ++i ) 00186 PCL.MaskGene( i ); 00187 00188 ofsm.open( sArgs.output_arg ); 00189 pHier->Save( ofsm, Dat.GetGenes( ) ); 00190 ofsm.close( ); 00191 pHier->Destroy( ); 00192 PCL.Save( cout, &veciPCL ); 00193 cout.flush( ); 00194 00195 return 0; }