Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 enum EShared { 00026 ESharedIgnore = 0, 00027 ESharedDiscard = ESharedIgnore + 1, 00028 ESharedOneOnly = ESharedDiscard + 1 00029 }; 00030 00031 static const char* c_aszShared[] = {"ignore", "discard", "oneonly"}; 00032 00033 int main( int iArgs, char** aszArgs ) { 00034 gengetopt_args_info sArgs; 00035 CDat DatIn; 00036 size_t iF1, iF2, i, j, iOne, iTwo, iCountIn, iCountOut, iSharedOne, iSharedTwo; 00037 CGenome Genome; 00038 vector<CGenes*> vecpGenes; 00039 vector<string> vecstrNames; 00040 vector<vector<size_t> > vecveciGenes; 00041 vector<vector<float> > vecvecdGenes; 00042 float d, dAveIn, dAveOut, dOne, dTwo; 00043 double dCountIn; 00044 ofstream ofsm; 00045 EShared eShared; 00046 00047 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00048 cmdline_parser_print_help( ); 00049 return 1; } 00050 CMeta Meta( sArgs.verbosity_arg ); 00051 00052 for( i = 0; i < ARRAYSIZE(c_aszShared); ++i ) 00053 if( !strcmp( sArgs.shared_arg, c_aszShared[ i ] ) ) 00054 break; 00055 if( i >= ARRAYSIZE(c_aszShared) ) { 00056 cmdline_parser_print_help( ); 00057 return 1; } 00058 eShared = (EShared)i; 00059 00060 if( !DatIn.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag ) ) { 00061 cerr << "Could not open: " << sArgs.input_arg << endl; 00062 return 1; } 00063 if( sArgs.normalize_flag ) 00064 DatIn.Normalize( CDat::ENormalizeSigmoid ); 00065 00066 vecpGenes.resize( sArgs.inputs_num ); 00067 vecstrNames.resize( vecpGenes.size( ) ); 00068 vecveciGenes.resize( vecpGenes.size( ) ); 00069 for( i = 0; i < vecpGenes.size( ); ++i ) { 00070 vecstrNames[ i ] = CMeta::Basename( sArgs.inputs[ i ] ); 00071 while( ( j = vecstrNames[ i ].rfind( '.' ) ) != string::npos ) 00072 vecstrNames[ i ] = vecstrNames[ i ].substr( 0, j ); 00073 vecpGenes[ i ] = new CGenes( Genome ); 00074 if( !vecpGenes[ i ]->Open( sArgs.inputs[ i ] ) ) { 00075 cerr << "Could not open: " << sArgs.inputs[ i ] << endl; 00076 return 1; } 00077 00078 vecveciGenes[ i ].resize( vecpGenes[ i ]->GetGenes( ) ); 00079 for( j = 0; j < vecpGenes[ i ]->GetGenes( ); ++j ) 00080 vecveciGenes[ i ][ j ] = DatIn.GetGene( vecpGenes[ i ]->GetGene( j ).GetName( ) ); } 00081 if( sArgs.weights_arg ) { 00082 CPCL PCLWeights; 00083 00084 vecvecdGenes.resize( vecpGenes.size( ) ); 00085 if( !PCLWeights.Open( sArgs.weights_arg, 0 ) ) { 00086 cerr << "Could not open: " << sArgs.weights_arg << endl; 00087 return 1; } 00088 for( i = 0; i < vecstrNames.size( ); ++i ) { 00089 vecvecdGenes[i].resize( vecveciGenes[i].size( ) ); 00090 if( ( iOne = PCLWeights.GetExperiment( vecstrNames[i] ) ) == -1 ) { 00091 cerr << "Could not find gene set weight: " << vecstrNames[i] << endl; 00092 fill( vecvecdGenes[i].begin( ), vecvecdGenes[i].end( ), 0.0f ); 00093 continue; } 00094 for( j = 0; j < vecpGenes[i]->GetGenes( ); ++j ) 00095 vecvecdGenes[i][j] = ( ( iTwo = PCLWeights.GetGene( vecpGenes[i]->GetGene( j ).GetName( ) ) ) == -1 ) ? 00096 0 : PCLWeights.Get( iTwo, iOne ); } } 00097 00098 { 00099 CDat DatOut; 00100 00101 DatOut.Open( vecstrNames ); 00102 for( iF1 = 0; iF1 < vecveciGenes.size( ); ++iF1 ) 00103 for( iF2 = ( iF1 + 1 ); iF2 < vecveciGenes.size( ); ++iF2 ) { 00104 map<const CGene*, size_t> mappiGenes; 00105 map<const CGene*, size_t>::iterator iterGene; 00106 00107 for( i = 0; i < vecpGenes[ iF1 ]->GetGenes( ); ++i ) 00108 mappiGenes[ &vecpGenes[ iF1 ]->GetGene( i ) ] = 1; 00109 for( i = 0; i < vecpGenes[ iF2 ]->GetGenes( ); ++i ) { 00110 const CGene* pGene = &vecpGenes[ iF2 ]->GetGene( i ); 00111 00112 if( ( iterGene = mappiGenes.find( pGene ) ) == mappiGenes.end( ) ) 00113 mappiGenes[ pGene ] = 1; 00114 else 00115 iterGene->second++; } 00116 00117 dCountIn = dAveIn = 0; 00118 for( i = 0; i < vecveciGenes[ iF1 ].size( ); ++i ) { 00119 if( ( iOne = vecveciGenes[ iF1 ][ i ] ) == -1 ) 00120 continue; 00121 dOne = vecvecdGenes.empty( ) ? 1 : vecvecdGenes[iF1][i]; 00122 iSharedOne = mappiGenes[ &vecpGenes[ iF1 ]->GetGene( i ) ]; 00123 if( ( eShared == ESharedDiscard ) && ( iSharedOne > 1 ) ) 00124 continue; 00125 for( j = 0; j < vecveciGenes[ iF2 ].size( ); ++j ) { 00126 iSharedTwo = mappiGenes[ &vecpGenes[ iF2 ]->GetGene( j ) ]; 00127 switch( eShared ) { 00128 case ESharedDiscard: 00129 if( iSharedTwo > 1 ) 00130 continue; 00131 break; 00132 00133 case ESharedOneOnly: 00134 if( ( iSharedOne > 1 ) && ( iSharedTwo > 1 ) ) 00135 continue; 00136 break; } 00137 00138 if( ( ( iTwo = vecveciGenes[ iF2 ][ j ] ) != -1 ) && 00139 !CMeta::IsNaN( d = DatIn.Get( iOne, iTwo ) ) ) { 00140 dTwo = vecvecdGenes.empty( ) ? 1 : vecvecdGenes[iF2][j]; 00141 dCountIn += dOne * dTwo; 00142 dAveIn += d * dOne * dTwo; } } } 00143 DatOut.Set( iF1, iF2, ( dCountIn >= sArgs.minimum_arg ) ? (float)( dAveIn / dCountIn ) : CMeta::GetNaN( ) ); } 00144 if( sArgs.zscore_flag ) 00145 DatOut.Normalize( CDat::ENormalizeZScore ); 00146 DatOut.Save( sArgs.output_arg ); 00147 } 00148 00149 if( sArgs.colors_arg ) { 00150 ofsm.open( sArgs.colors_arg ); 00151 if( !ofsm.is_open( ) ) { 00152 cerr << "Could not open: " << sArgs.colors_arg << endl; 00153 return 1; } 00154 for( i = 0; i < vecpGenes.size( ); ++i ) { 00155 dAveIn = dAveOut = 0; 00156 for( iCountIn = iCountOut = iF1 = 0; iF1 < vecpGenes[ i ]->GetGenes( ); ++iF1 ) { 00157 if( ( iOne = vecveciGenes[ i ][ iF1 ] ) == -1 ) 00158 continue; 00159 for( iF2 = ( iF1 + 1 ); iF2 < vecpGenes[ i ]->GetGenes( ); ++iF2 ) 00160 if( ( ( iTwo = vecveciGenes[ i ][ iF2 ] ) != -1 ) && 00161 !CMeta::IsNaN( d = DatIn.Get( iOne, iTwo ) ) ) { 00162 //cerr << DatIn.GetGene( iOne ) << '\t' << DatIn.GetGene( iTwo ) << '\t' << d << endl; 00163 iCountIn++; 00164 dAveIn += d; } 00165 for( j = 0; j < DatIn.GetGenes( ); ++j ) 00166 if( !vecpGenes[ i ]->IsGene( DatIn.GetGene( j ) ) && 00167 !CMeta::IsNaN( d = DatIn.Get( iOne, j ) ) ) { 00168 iCountOut++; 00169 dAveOut += d; } } 00170 dAveIn /= iCountIn; 00171 dAveOut /= iCountOut; 00172 ofsm << ( dAveIn - dAveOut ) << '\t' << dAveIn << '\t' << dAveOut << '\t' << 00173 vecstrNames[ i ] << endl; } 00174 ofsm.close( ); } 00175 00176 for( i = 0; i < vecpGenes.size( ); ++i ) 00177 delete vecpGenes[ i ]; 00178 00179 return 0; }