Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 struct SSorter { 00026 const vector<float>& m_vecdScores; 00027 00028 SSorter( const vector<float>& vecdScores ) : m_vecdScores(vecdScores) { } 00029 00030 bool operator()( size_t iOne, size_t iTwo ) { 00031 00032 return ( m_vecdScores[iTwo] < m_vecdScores[iOne] ); } 00033 }; 00034 00035 int open_genes( const char* szFile, CGenes& Genes ) { 00036 ifstream ifsm; 00037 00038 if( szFile ) { 00039 ifsm.open( szFile ); 00040 if( !Genes.Open( ifsm ) ) { 00041 cerr << "Could not open: " << szFile << endl; 00042 return 1; } } 00043 00044 return 0; } 00045 00046 int open_values( const char* szFile, vector<float>& vecdValues ) { 00047 static const size_t c_iBuf = 1024; 00048 char szBuf[ c_iBuf ]; 00049 ifstream ifsm; 00050 00051 if( szFile ) { 00052 ifsm.open( szFile ); 00053 if( !ifsm.is_open( ) ) { 00054 cerr << "Could not open: " << szFile << endl; 00055 return 1; } 00056 while( ifsm.peek( ) != EOF ) { 00057 ifsm.getline( szBuf, c_iBuf - 1 ); 00058 vecdValues.push_back( (float)atof( szBuf ) ); } 00059 ifsm.close( ); } 00060 00061 return 0; } 00062 00063 int main( int iArgs, char** aszArgs ) { 00064 gengetopt_args_info sArgs; 00065 ifstream ifsm; 00066 CDat Dat, DatNew; 00067 CDat* pDat; 00068 float d, dCutoff; 00069 CGenome Genome; 00070 CGenes GenesIn( Genome ), GenesEx( Genome ), GenesQr( Genome ); 00071 int iRet; 00072 size_t i, j, k; 00073 vector<float> vecdColors, vecdBorders, vecdWeights; 00074 vector<size_t> veciQuery; 00075 00076 if( cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 ) && ( sArgs.config_arg && 00077 cmdline_parser_configfile( sArgs.config_arg, &sArgs, 0, 0, 1 ) ) ) { 00078 cmdline_parser_print_help( ); 00079 return 1; } 00080 CMeta Meta( sArgs.verbosity_arg ); 00081 00082 if( sArgs.features_arg ) { 00083 ifsm.open( sArgs.features_arg ); 00084 if( !Genome.Open( ifsm ) ) { 00085 cerr << "Could not open: " << sArgs.features_arg << endl; 00086 return 1; } 00087 ifsm.close( ); } 00088 if( iRet = open_values( sArgs.colors_arg, vecdColors ) ) 00089 return iRet; 00090 if( iRet = open_values( sArgs.borders_arg, vecdBorders ) ) 00091 return iRet; 00092 if( iRet = open_values( sArgs.genew_arg, vecdWeights ) ) 00093 return iRet; 00094 if( iRet = open_genes( sArgs.genes_arg, GenesIn ) ) 00095 return iRet; 00096 if( iRet = open_genes( sArgs.genex_arg, GenesEx ) ) 00097 return iRet; 00098 if( iRet = open_genes( sArgs.geneq_arg, GenesQr ) ) 00099 return iRet; 00100 00101 if( sArgs.input_arg ) { 00102 if( !Dat.Open( sArgs.input_arg, sArgs.memmap_flag && !( sArgs.normalize_flag || 00103 sArgs.genes_arg || sArgs.geneq_arg || sArgs.knowns_arg ) ) ) { 00104 cerr << "Couldn't open: " << sArgs.input_arg << endl; 00105 return 1; } } 00106 else if( !Dat.Open( cin, CDat::EFormatText ) ) { 00107 cerr << "Couldn't open input" << endl; 00108 return 1; } 00109 pDat = &Dat; 00110 00111 veciQuery.resize( pDat->GetGenes( ) ); 00112 for( i = 0; i < veciQuery.size( ); ++i ) 00113 veciQuery[ i ] = GenesQr.GetGene( pDat->GetGene( i ) ); 00114 00115 dCutoff = (float)( sArgs.cutoff_given ? sArgs.cutoff_arg : -FLT_MAX ); 00116 if( GenesIn.GetGenes( ) ) { 00117 vector<size_t> veciGenes; 00118 00119 DatNew.Open( GenesIn.GetGeneNames( ) ); 00120 veciGenes.resize( DatNew.GetGenes( ) ); 00121 for( i = 0; i < veciGenes.size( ); ++i ) 00122 veciGenes[ i ] = Dat.GetGene( DatNew.GetGene( i ) ); 00123 for( i = 0; i < veciGenes.size( ); ++i ) { 00124 if( veciGenes[ i ] == -1 ) 00125 continue; 00126 for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) 00127 if( veciGenes[ j ] != -1 ) 00128 DatNew.Set( i, j, Dat.Get( veciGenes[ i ], veciGenes[ j ] ) ); } 00129 pDat = &DatNew; } 00130 if( GenesEx.GetGenes( ) ) 00131 pDat->FilterGenes( GenesEx, CDat::EFilterExclude ); 00132 if( sArgs.normalize_flag ) 00133 pDat->Normalize( CDat::ENormalizeSigmoid ); 00134 if( GenesQr.GetGenes( ) ) { 00135 if( sArgs.cutoff_given ) 00136 for( i = 0; i < pDat->GetGenes( ); ++i ) 00137 for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j ) 00138 if( !CMeta::IsNaN( d = pDat->Get( i, j ) ) && ( d < sArgs.cutoff_arg ) ) 00139 pDat->Set( i, j, CMeta::GetNaN( ) ); 00140 if( sArgs.hubs_arg >= 0 ) { 00141 vector<float> vecdScores; 00142 vector<size_t> veciIndices; 00143 vector<bool> vecfHits; 00144 00145 veciIndices.resize( pDat->GetGenes( ) ); 00146 vecdScores.resize( pDat->GetGenes( ) ); 00147 vecfHits.resize( pDat->GetGenes( ) ); 00148 for( i = 0; i < pDat->GetGenes( ); ++i ) 00149 if( veciQuery[i] == -1 ) { 00150 for( j = 0; j < pDat->GetGenes( ); ++j ) 00151 if( veciQuery[j] == -1 ) 00152 pDat->Set( i, j, CMeta::GetNaN( ) ); } 00153 else { 00154 fill( vecdScores.begin( ), vecdScores.end( ), -FLT_MAX ); 00155 for( j = 0; j < pDat->GetGenes( ); ++j ) { 00156 if( CMeta::IsNaN( d = pDat->Get( i, j ) ) ) 00157 d = -FLT_MAX; 00158 vecdScores[j] = d; } 00159 for( j = 0; j < veciIndices.size( ); ++j ) 00160 veciIndices[j] = j; 00161 sort( veciIndices.begin( ), veciIndices.end( ), SSorter( vecdScores ) ); 00162 fill( vecfHits.begin( ), vecfHits.end( ), false ); 00163 for( j = 0; j < (size_t)sArgs.hubs_arg; ++j ) 00164 vecfHits[veciIndices[j]] = true; 00165 for( j = 0; j < pDat->GetGenes( ); ++j ) 00166 if( !vecfHits[j] ) 00167 pDat->Set( i, j, CMeta::GetNaN( ) ); } 00168 pDat->Normalize( CDat::ENormalizeZScore ); } 00169 else if( !strcmp( sArgs.format_arg, "correl" ) ) { 00170 CMeasurePearson MeasurePearson; 00171 float* adCentroid; 00172 float* adCur; 00173 size_t iCur; 00174 vector<size_t> veciCounts; 00175 vector<float> vecdScores; 00176 00177 veciCounts.resize( pDat->GetGenes( ) ); 00178 adCentroid = new float[ pDat->GetGenes( ) ]; 00179 for( i = 0; i < GenesQr.GetGenes( ); ++i ) { 00180 if( ( iCur = pDat->GetGene( GenesQr.GetGene( i ).GetName( ) ) ) == -1 ) 00181 continue; 00182 for( j = 0; j < pDat->GetGenes( ); ++j ) 00183 if( !CMeta::IsNaN( d = pDat->Get( iCur, j ) ) ) { 00184 adCentroid[ j ] += d; 00185 veciCounts[ j ]++; } } 00186 for( i = 0; i < pDat->GetGenes( ); ++i ) 00187 adCentroid[ i ] /= veciCounts[ i ]; 00188 00189 vecdScores.resize( pDat->GetGenes( ) ); 00190 adCur = new float[ pDat->GetGenes( ) ]; 00191 for( i = 0; i < pDat->GetGenes( ); ++i ) { 00192 for( j = 0; j < pDat->GetGenes( ); ++j ) 00193 adCur[ j ] = pDat->Get( i, j ); 00194 vecdScores[ i ] = (float)MeasurePearson.Measure( adCentroid, pDat->GetGenes( ), adCur, 00195 pDat->GetGenes( ), IMeasure::EMapNone, NULL, NULL ); } 00196 delete[] adCur; 00197 delete[] adCentroid; 00198 for( i = 0; i < vecdScores.size( ); ++i ) 00199 cout << pDat->GetGene( i ) << '\t' << vecdScores[ i ] << endl; } 00200 else { 00201 if( vecdColors.empty( ) ) { 00202 vecdColors.resize( pDat->GetGenes( ) ); 00203 fill( vecdColors.begin( ), vecdColors.end( ), 0.5f ); 00204 for( i = 0; i < GenesQr.GetGenes( ); ++i ) 00205 if( ( j = pDat->GetGene( GenesQr.GetGene( i ).GetName( ) ) ) != -1 ) 00206 vecdColors[ j ] = 1; } 00207 pDat->FilterGenes( GenesQr, sArgs.hefalmp_flag ? CDat::EFilterHefalmp : CDat::EFilterPixie, 00208 sArgs.neighbors_arg, (float)sArgs.edges_arg, !!sArgs.absolute_flag, 00209 vecdWeights.empty( ) ? NULL : &vecdWeights ); } } 00210 if( sArgs.knowns_arg ) { 00211 CDat DatKnowns; 00212 vector<size_t> veciKnowns; 00213 size_t iOne, iTwo; 00214 00215 if( !DatKnowns.Open( sArgs.knowns_arg, !!sArgs.memmap_flag ) ) { 00216 cerr << "Could not open: " << sArgs.knowns_arg << endl; 00217 return 1; } 00218 veciKnowns.resize( pDat->GetGenes( ) ); 00219 for( i = 0; i < veciKnowns.size( ); ++i ) 00220 veciKnowns[ i ] = DatKnowns.GetGene( pDat->GetGene( i ) ); 00221 for( i = 0; i < pDat->GetGenes( ); ++i ) 00222 if( ( iOne = veciKnowns[ i ] ) != -1 ) 00223 for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j ) 00224 if( ( ( iTwo = veciKnowns[ j ] ) != -1 ) && 00225 !CMeta::IsNaN( d = DatKnowns.Get( iOne, iTwo ) ) && ( d > 0 ) ) 00226 pDat->Set( i, j, CMeta::GetNaN( ) ); } 00227 00228 if( !strcmp( sArgs.format_arg, "dot" ) ) 00229 pDat->SaveDOT( cout, dCutoff, &Genome, false, true, vecdColors.empty( ) ? NULL : &vecdColors, 00230 vecdBorders.empty( ) ? NULL : &vecdBorders ); 00231 else if( !strcmp( sArgs.format_arg, "gdf" ) ) 00232 pDat->SaveGDF( cout, dCutoff ); 00233 else if( !strcmp( sArgs.format_arg, "net" ) ) 00234 pDat->SaveNET( cout, dCutoff ); 00235 else if( !strcmp( sArgs.format_arg, "matisse" ) ) 00236 pDat->SaveMATISSE( cout, dCutoff, &Genome ); 00237 else if( !strcmp( sArgs.format_arg, "list" ) ) { 00238 map<size_t, float> mapGenes; 00239 map<size_t, float>::iterator iterGene; 00240 float dCur; 00241 00242 for( i = 0; i < pDat->GetGenes( ); ++i ) 00243 for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j ) 00244 if( !CMeta::IsNaN( d = pDat->Get( i, j ) ) && 00245 ( CMeta::IsNaN( dCutoff ) || ( d > dCutoff ) ) ) { 00246 if( ( k = veciQuery[ i ] ) != -1 ) { 00247 dCur = d * ( vecdWeights.empty( ) ? 1 : vecdWeights[ k ] ); 00248 if( ( iterGene = mapGenes.find( j ) ) == mapGenes.end( ) ) 00249 mapGenes[ j ] = dCur; 00250 else 00251 iterGene->second += dCur; } 00252 if( ( k = veciQuery[ j ] ) != -1 ) { 00253 dCur = d * ( vecdWeights.empty( ) ? 1 : vecdWeights[ k ] ); 00254 if( ( iterGene = mapGenes.find( i ) ) == mapGenes.end( ) ) 00255 mapGenes[ i ] = dCur; 00256 else 00257 iterGene->second += dCur; } } 00258 for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene ) 00259 cout << pDat->GetGene( iterGene->first ) << '\t' << iterGene->second << endl; } 00260 else if( !strcmp( sArgs.format_arg, "dat" ) ) { 00261 for( i = 0; i < pDat->GetGenes( ); ++i ) 00262 for( j = ( i + 1 ); j < pDat->GetGenes( ); ++j ) 00263 if( ( d = pDat->Get( i, j ) ) < dCutoff ) 00264 pDat->Set( i, j, CMeta::GetNaN( ) ); 00265 pDat->Save( cout, CDat::EFormatText ); } 00266 00267 return 0; }