Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 enum ETFPN { 00026 ETFPN_TP = 0, 00027 ETFPN_FP = ETFPN_TP + 1, 00028 ETFPN_TN = ETFPN_FP + 1, 00029 ETFPN_FN = ETFPN_TN + 1 00030 }; 00031 00032 struct SDatum { 00033 float m_dValue; 00034 size_t m_iOne; 00035 size_t m_iTwo; 00036 float m_dAnswer; 00037 00038 SDatum( float dValue, size_t iOne, size_t iTwo, float dAnswer ) : m_dValue(dValue), m_iOne(iOne), m_iTwo(iTwo), 00039 m_dAnswer(dAnswer) { } 00040 }; 00041 00042 struct SSorter { 00043 bool m_fInvert; 00044 00045 SSorter( bool fInvert ) : m_fInvert(fInvert) { } 00046 00047 bool operator()( const SDatum& sOne, const SDatum& sTwo ) const { 00048 00049 return ( m_fInvert ? ( sOne.m_dValue > sTwo.m_dValue ) : ( sOne.m_dValue < sTwo.m_dValue ) ); 00050 } 00051 }; 00052 00053 double AUCMod( const CDat&, const CDat&, const vector<bool>&, const vector<bool>&, const gengetopt_args_info&, bool, float ); 00054 00055 int main( int iArgs, char** aszArgs ) { 00056 CDat Answers, Data, wDat; 00057 gengetopt_args_info sArgs; 00058 size_t i, j, k, m, iOne, iTwo, iGenes, iPositives, iNegatives, iBins, iRand; 00059 vector<size_t> veciGenes, veciRec, veciRecTerm; 00060 CFullMatrix<bool> MatGenes; 00061 CFullMatrix<size_t> MatResults; 00062 ETFPN eTFPN; 00063 int iMax; 00064 float dAnswer, dValue; 00065 vector<bool> vecfHere, vecfUbik; 00066 vector<float> vecdScores, vecdSSE, vecdBinValue,vecGeneWeights; 00067 vector<size_t> veciPositives, veciNegatives, veciGenesTerm; 00068 CGenome Genome; 00069 CGenes wGenes(Genome); 00070 ofstream ofsm; 00071 ostream* postm; 00072 map<float,size_t> mapValues; 00073 bool fMapAnswers,isDatWeighted=false; 00074 CGenes GenesTm( Genome ), GenesUbik( Genome ); 00075 00076 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00077 cmdline_parser_print_help( ); 00078 return 1; 00079 } 00080 CMeta Meta( sArgs.verbosity_arg ); 00081 00082 fMapAnswers = !!sArgs.memmap_flag && !( sArgs.genes_arg || sArgs.genet_arg || sArgs.genex_arg || sArgs.genee_arg ); 00083 if( !Answers.Open( sArgs.answers_arg, fMapAnswers ) ) { 00084 cerr << "Couldn't open: " << sArgs.answers_arg << endl; 00085 return 1; 00086 } 00087 if( sArgs.genes_arg && !Answers.FilterGenes( sArgs.genes_arg, CDat::EFilterInclude ) ) { 00088 cerr << "Couldn't open: " << sArgs.genes_arg << endl; 00089 return 1; 00090 } 00091 if( sArgs.genee_arg && !Answers.FilterGenes( sArgs.genee_arg, CDat::EFilterEdge ) ) { 00092 cerr << "Couldn't open: " << sArgs.genee_arg << endl; 00093 return 1; 00094 } 00095 if( sArgs.genet_arg ) { 00096 if( !( Answers.FilterGenes( sArgs.genet_arg, CDat::EFilterTerm ) && 00097 GenesTm.Open( sArgs.genet_arg ) ) ) { 00098 cerr << "Couldn't open: " << sArgs.genet_arg << endl; 00099 return 1; 00100 } 00101 veciGenesTerm.reserve( GenesTm.GetGenes( ) ); 00102 for( i = 0; i < GenesTm.GetGenes( ); ++i ) 00103 if( ( j = Answers.GetGene( GenesTm.GetGene( i ).GetName( ) ) ) != -1 ) 00104 veciGenesTerm.push_back( j ); 00105 } 00106 if( sArgs.genex_arg && !Answers.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) { 00107 cerr << "Couldn't open: " << sArgs.genex_arg << endl; 00108 return 1; 00109 } 00110 if( !Data.Open( sArgs.input_arg, !!sArgs.memmap_flag ) ) { 00111 cerr << "Couldn't open: " << sArgs.input_arg << endl; 00112 return 1; 00113 } 00114 if( sArgs.normalize_flag ) 00115 Data.Normalize( CDat::ENormalizeMinMax ); 00116 00117 00118 if(sArgs.weights_arg){ 00119 ifstream ifsm; 00120 ifsm.open( sArgs.weights_arg ); 00121 if( !wGenes.OpenWeighted( ifsm ) ) { 00122 if(!wDat.Open(sArgs.weights_arg, !!sArgs.memmap_flag )){ 00123 cerr << "Couldn't open: " << sArgs.inputs[ i ] << endl; 00124 return 1; } 00125 else{ 00126 isDatWeighted = true; 00127 } 00128 }else{ 00129 vecGeneWeights.resize(Answers.GetGenes( )); 00130 for( i = 0; i < vecGeneWeights.size( ); ++i ){ 00131 vecGeneWeights[ i ] = wGenes.GetGeneWeight(wGenes.GetGene( Answers.GetGene( i ) ));} 00132 } 00133 } 00134 00135 if( sArgs.abs_arg ){ 00136 float d; 00137 for( i = 0; i < Data.GetGenes( ); ++i ){ 00138 for( j = ( i + 1 ); j < Data.GetGenes( ); ++j ){ 00139 if( !CMeta::IsNaN( (d = Data.Get( i, j )) ) ){ 00140 Data.Set(i, j, fabs(d - sArgs.abs_arg)); 00141 } 00142 } 00143 } 00144 } 00145 00146 veciGenes.resize( Answers.GetGenes( ) ); 00147 for( i = 0; i < Answers.GetGenes( ); ++i ) 00148 veciGenes[ i ] = Data.GetGene( Answers.GetGene( i ) ); 00149 for( iRand = 0; iRand <= (size_t)sArgs.randomize_arg; ++iRand ) { 00150 if( iRand ) 00151 Data.Randomize( ); 00152 if( sArgs.finite_flag ) { 00153 vector<float> vecdValues; 00154 { 00155 set<float> setdValues; 00156 00157 for( i = 0; i < Answers.GetGenes( ); ++i ) { 00158 if( ( iOne = veciGenes[ i ] ) == -1 ) 00159 continue; 00160 for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) { 00161 if( ( ( iTwo = veciGenes[ j ] ) == -1 ) || 00162 CMeta::IsNaN( dValue = Data.Get( iOne, iTwo ) ) || 00163 CMeta::IsNaN( Answers.Get( i, j ) ) ) 00164 continue; 00165 if( sArgs.invert_flag ) 00166 dValue = 1 - dValue; 00167 setdValues.insert( dValue ); 00168 } 00169 } 00170 vecdValues.resize( setdValues.size( ) ); 00171 copy( setdValues.begin( ), setdValues.end( ), vecdValues.begin( ) ); 00172 } 00173 sort( vecdValues.begin( ), vecdValues.end( ) ); 00174 for( i = 0; i < vecdValues.size( ); ++i ) 00175 mapValues[ vecdValues[ i ] ] = i; 00176 iBins = mapValues.size( ); 00177 } 00178 else 00179 iBins = sArgs.bins_arg; 00180 MatResults.Initialize( iBins ? ( iBins + 1 ) : 00181 (size_t)( ( sArgs.max_arg - sArgs.min_arg ) / sArgs.delta_arg ) + 1, 4 ); 00182 MatGenes.Initialize( veciGenes.size( ), MatResults.GetRows( ) ); 00183 00184 for( iGenes = 0; !sArgs.inputs_num || ( iGenes < sArgs.inputs_num ); ++iGenes ) { 00185 MatResults.Clear( ); 00186 MatGenes.Clear( ); 00187 00188 if( sArgs.genep_arg ) { 00189 CGenes Genes( Genome ); 00190 if( !Genes.Open( sArgs.genep_arg ) ) { 00191 cerr << "Couldn't open: " << sArgs.genep_arg << endl; 00192 return 1; 00193 } 00194 vecfHere.resize( Answers.GetGenes( ) ); 00195 for( i = 0; i < vecfHere.size( ); ++i ) { 00196 vecfHere[ i ] = Genes.IsGene( Answers.GetGene( i ) ); 00197 } 00198 } 00199 if( sArgs.ubiqg_arg ) { 00200 if( !GenesUbik.Open( sArgs.ubiqg_arg ) ) { 00201 cerr << "Could not open: " << sArgs.ubiqg_arg << endl; 00202 return 1; 00203 } 00204 vecfUbik.resize( Answers.GetGenes( ) ); 00205 for( i = 0; i < vecfUbik.size( ); ++i ) { 00206 vecfUbik[ i ] = GenesUbik.IsGene( Answers.GetGene( i ) ); 00207 } 00208 } 00209 00210 if( mapValues.size( ) ) { 00211 for( i = 0; i < Answers.GetGenes( ); ++i ) { 00212 if( ( iOne = veciGenes[ i ] ) == -1 ) 00213 continue; 00214 for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) { 00215 if( ( ( iTwo = veciGenes[ j ] ) == -1 ) || 00216 CMeta::IsNaN( dValue = Data.Get( iOne, iTwo ) ) || 00217 CMeta::IsNaN( dAnswer = Answers.Get( i, j ) ) ) 00218 continue; 00219 if ( CMeta::SkipEdge( !!dAnswer, i, j, vecfHere, vecfUbik, sArgs.ctxtpos_flag, sArgs.ctxtneg_flag, sArgs.bridgepos_flag, sArgs.bridgeneg_flag, sArgs.outpos_flag, sArgs.outneg_flag ) ) { 00220 continue; 00221 } 00222 if( sArgs.invert_flag ) 00223 dValue = 1 - dValue; 00224 for( k = 0; k <= mapValues[ dValue ]; ++k ) { 00225 MatGenes.Set( i, k, true ); 00226 MatGenes.Set( j, k, true ); 00227 MatResults.Get( k, dAnswer ? ETFPN_TP : ETFPN_FP )++; 00228 } 00229 for( ; k < MatResults.GetRows( ); ++k ) 00230 MatResults.Get( k, dAnswer ? ETFPN_FN : ETFPN_TN )++; 00231 } 00232 } 00233 } 00234 else if( iBins ) { 00235 vector<SDatum> vecsData; 00236 size_t iChunk; 00237 00238 for( iPositives = iNegatives = i = 0; i < Answers.GetGenes( ); ++i ) { 00239 if( ( iOne = veciGenes[ i ] ) == -1 ) 00240 continue; 00241 for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) { 00242 if( ( ( iTwo = veciGenes[ j ] ) == -1 ) || 00243 CMeta::IsNaN( dAnswer = Answers.Get( i, j ) ) || 00244 CMeta::IsNaN( dValue = Data.Get( iOne, iTwo ) ) ) 00245 continue; 00246 if ( CMeta::SkipEdge( !!dAnswer, i, j, vecfHere, vecfUbik, sArgs.ctxtpos_flag, sArgs.ctxtneg_flag, sArgs.bridgepos_flag, sArgs.bridgeneg_flag, sArgs.outpos_flag, sArgs.outneg_flag ) ) { 00247 continue; 00248 } 00249 00250 MatGenes.Set( i, 0, true ); 00251 MatGenes.Set( j, 0, true ); 00252 if( dAnswer ) 00253 iPositives++; 00254 else 00255 iNegatives++; 00256 vecsData.push_back( SDatum( dValue, i, j, dAnswer ) ); 00257 } 00258 } 00259 sort( vecsData.begin( ), vecsData.end( ), SSorter( !!sArgs.invert_flag ) ); 00260 //instead of putting all of the uneveness in one bin, spread it out into each bin. 00261 //N.B. only the part without the sse_flag is fixed in this regard 00262 size_t perChunk = (size_t)(vecsData.size()/MatResults.GetRows()); 00263 size_t chnkRem = (size_t)(vecsData.size()%MatResults.GetRows()); 00264 iChunk = (size_t)( 0.5 + ( (float)vecsData.size( ) / ( MatResults.GetRows( ) ) ) ); 00265 if( sArgs.sse_flag ) { 00266 vecdSSE.resize( MatResults.GetRows( ) ); 00267 veciPositives.resize( vecdSSE.size( ) ); 00268 for( i = 1,j = 0; i < vecdSSE.size( ); ++i,j += iChunk ) { 00269 veciPositives[ veciPositives.size( ) - i - 1 ] = veciPositives[ veciPositives.size( ) - i ]; 00270 vecdSSE[ vecdSSE.size( ) - i - 1 ] = vecdSSE[ vecdSSE.size( ) - i ]; 00271 for( k = 0; k < iChunk; ++k ) { 00272 if( ( j + k ) >= vecsData.size( ) ) 00273 break; 00274 const SDatum& sDatum = vecsData[ vecsData.size( ) - ( j + k ) - 1 ]; 00275 00276 for( m = 0; m < ( vecdSSE.size( ) - i ); ++m ) { 00277 MatGenes.Set( sDatum.m_iOne, m, true ); 00278 MatGenes.Set( sDatum.m_iTwo, m, true ); 00279 } 00280 dValue = sDatum.m_dValue - sDatum.m_dAnswer; 00281 veciPositives[ veciPositives.size( ) - i - 1 ]++; 00282 vecdSSE[ vecdSSE.size( ) - i - 1 ] += dValue * dValue; 00283 } 00284 } 00285 } 00286 else { 00287 veciPositives.resize( MatResults.GetRows( ) - 1 ); 00288 veciNegatives.resize( veciPositives.size( ) ); 00289 vecdBinValue.resize( veciPositives.size( )+1 ); 00290 for( i = 0; i < veciNegatives.size( ); ++i ) 00291 veciNegatives[ i ] = veciPositives[ i ] = 0; 00292 for( i = j = 0; i < veciPositives.size( )+1; ++i,j += k ) { 00293 size_t thisChunk = (i < chnkRem) ? (perChunk + 1) : (perChunk); 00294 for( k = 0; k < thisChunk; ++k ) { 00295 //if( ( j + k ) >= vecsData.size( ) ) 00296 // break; 00297 const SDatum& sDatum = vecsData[ j + k ]; 00298 vecdBinValue[ i ] = sDatum.m_dValue; 00299 00300 for( m = i; m > 0; --m ) { 00301 MatGenes.Set( sDatum.m_iOne, m, true ); 00302 MatGenes.Set( sDatum.m_iTwo, m, true ); 00303 } 00304 if( i >= veciPositives.size() ) 00305 continue; 00306 if( Answers.Get( sDatum.m_iOne, sDatum.m_iTwo ) ) 00307 veciPositives[ i ]++; 00308 else 00309 veciNegatives[ i ]++; 00310 } 00311 } 00312 MatResults.Set( 0, ETFPN_TP, iPositives ); 00313 MatResults.Set( 0, ETFPN_FP, iNegatives ); 00314 MatResults.Set( 0, ETFPN_TN, 0 ); 00315 MatResults.Set( 0, ETFPN_FN, 0 ); 00316 for( i = 1; i < MatResults.GetRows( ); ++i ) { 00317 MatResults.Set( i, ETFPN_TP, MatResults.Get( i - 1, ETFPN_TP ) - veciPositives[ i - 1 ] ); 00318 MatResults.Set( i, ETFPN_FP, MatResults.Get( i - 1, ETFPN_FP ) - veciNegatives[ i - 1 ] ); 00319 MatResults.Set( i, ETFPN_TN, MatResults.Get( i - 1, ETFPN_TN ) + veciNegatives[ i - 1 ] ); 00320 MatResults.Set( i, ETFPN_FN, MatResults.Get( i - 1, ETFPN_FN ) + 00321 veciPositives[ i - 1 ] ); 00322 } 00323 } 00324 } 00325 else 00326 for( i = 0; i < Answers.GetGenes( ); ++i ) { 00327 if( !( i % 1000 ) ) 00328 cerr << "Processing gene " << i << '/' << Answers.GetGenes( ) << endl; 00329 if( ( iOne = veciGenes[ i ] ) == -1 ) 00330 continue; 00331 for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) { 00332 if( ( ( iTwo = veciGenes[ j ] ) == -1 ) || 00333 CMeta::IsNaN( dAnswer = Answers.Get( i, j ) ) || 00334 CMeta::IsNaN( dValue = Data.Get( iOne, iTwo ) ) ) 00335 continue; 00336 if ( CMeta::SkipEdge( !!dAnswer, i, j, vecfHere, vecfUbik, sArgs.ctxtpos_flag, sArgs.ctxtneg_flag, sArgs.bridgepos_flag, sArgs.bridgeneg_flag, sArgs.outpos_flag, sArgs.outneg_flag ) ) { 00337 continue; 00338 } 00339 00340 if( sArgs.invert_flag ) 00341 dValue = 1 - dValue; 00342 00343 iMax = (int)ceil( ( dValue - sArgs.min_arg ) / sArgs.delta_arg ); 00344 if( iMax > (int)MatResults.GetRows( ) ) 00345 iMax = (int)MatResults.GetRows( ); 00346 eTFPN = (ETFPN)!dAnswer; 00347 for( k = 0; (int)k < iMax; ++k ) { 00348 MatResults.Get( k, eTFPN )++; 00349 MatGenes.Set( i, k, true ); 00350 MatGenes.Set( j, k, true ); 00351 } 00352 eTFPN = (ETFPN)( 2 + !eTFPN ); 00353 for( ; k < (int)MatResults.GetRows( ); ++k ) 00354 MatResults.Get( k, eTFPN )++; 00355 } 00356 } 00357 for( iPositives = iNegatives = i = 0; i < Answers.GetGenes( ); ++i ) 00358 for( j = ( i + 1 ); j < Answers.GetGenes( ); ++j ) { 00359 if( CMeta::IsNaN( dAnswer = Answers.Get( i, j ) ) ) { 00360 continue; 00361 } 00362 00363 if ( CMeta::SkipEdge( !!dAnswer, i, j, vecfHere, vecfUbik, sArgs.ctxtpos_flag, sArgs.ctxtneg_flag, sArgs.bridgepos_flag, sArgs.bridgeneg_flag, sArgs.outpos_flag, sArgs.outneg_flag ) ) { 00364 continue; 00365 } 00366 00367 if( dAnswer ) 00368 iPositives++; 00369 else { 00370 iNegatives++; 00371 } 00372 } 00373 00374 veciRec.resize( MatResults.GetRows( ) ); 00375 veciRecTerm.resize( MatResults.GetRows( ) ); 00376 for( i = 0; i < veciRec.size( ); ++i ) { 00377 veciRec[ i ] = veciRecTerm[ i ] = 0; 00378 for( j = 0; j < MatGenes.GetRows( ); ++j ) 00379 if( MatGenes.Get( j, i ) ) { 00380 veciRec[ i ]++; 00381 if( vecfHere.size( ) && vecfHere[ j ] ) 00382 veciRecTerm[ i ]++; 00383 } 00384 for( j = 0; j < veciGenesTerm.size( ); ++j ) 00385 if( MatGenes.Get( veciGenesTerm[ j ], i ) && 00386 ( vecfHere.empty( ) || !vecfHere[ veciGenesTerm[ j ] ] ) ) 00387 veciRecTerm[ i ]++; 00388 } 00389 00390 if( sArgs.inputs_num ) { 00391 ofsm.open( ( (string)sArgs.directory_arg + '/' + 00392 CMeta::Basename( sArgs.inputs[ iGenes ] ) + ".bins" ).c_str( ) ); 00393 postm = &ofsm; 00394 } 00395 else 00396 postm = &cout; 00397 00398 if( !sArgs.sse_flag ) { //Weighted context is currently only supported for AUC calculation 00399 *postm << "# P " << iPositives << endl; 00400 *postm << "# N " << iNegatives << endl; 00401 } 00402 00403 if(!sArgs.weights_arg){ 00404 *postm << "Cut Genes " << ( sArgs.sse_flag ? "Pairs SSE" : "TP FP TN FN RC PR VALUE" ) << endl; 00405 for( i = 0; i < MatResults.GetRows( ); ++i ) { 00406 *postm << ( iBins ? i : ( sArgs.min_arg + ( i * sArgs.delta_arg ) ) ) << '\t' << 00407 veciRec[ i ]; 00408 if( sArgs.sse_flag ) 00409 *postm << '\t' << veciPositives[ i ] << '\t' << vecdSSE[ i ]; 00410 else 00411 for( j = 0; j < MatResults.GetColumns( ); ++j ) 00412 *postm << '\t' << MatResults.Get( i, j ); 00413 if( veciGenesTerm.size( ) || vecfHere.size( ) ) 00414 *postm << '\t' << veciRecTerm[ i ]; 00415 00416 // print precision/recall 00417 *postm << '\t' << (float)MatResults.Get(i,0)/(MatResults.Get(0,0)); 00418 00419 if( (MatResults.Get(i,1)+MatResults.Get(i,0)) != 0) 00420 *postm << '\t' << (float)MatResults.Get(i,0)/(MatResults.Get(i,1)+MatResults.Get(i,0)); 00421 else 00422 *postm << '\t' << 0.0; 00423 *postm << '\t' << vecdBinValue[ i ]; 00424 00425 *postm << endl; 00426 }} 00427 else{ 00428 *postm << "AUC is calculated using weighted context."<<endl; 00429 if(sArgs.flipneg_flag) 00430 *postm << "Flipneg ON"<<endl; 00431 else 00432 *postm << "Flipneg OFF"<<endl; 00433 } 00434 00435 if( !sArgs.sse_flag ) 00436 { 00437 if(sArgs.weights_arg){ 00438 if(isDatWeighted) 00439 *postm << "# AUC " << CStatistics::WilcoxonRankSum( Data, Answers, wDat,sArgs.flipneg_flag ) << endl; 00440 else 00441 *postm << "# AUC " << CStatistics::WilcoxonRankSum( Data, Answers, vecGeneWeights,sArgs.flipneg_flag ) << endl; 00442 } 00443 else{ 00444 if( sArgs.auc_arg) 00445 *postm << "# AUC " << AUCMod( Data, Answers, vecfHere, vecfUbik, sArgs, !!sArgs.invert_flag, sArgs.auc_arg ) << endl; 00446 else{ 00447 *postm << "# AUC " << CStatistics::WilcoxonRankSum( Data, Answers, vecfHere, vecfUbik, !!sArgs.ctxtpos_flag, !!sArgs.ctxtneg_flag, !!sArgs.bridgepos_flag, !!sArgs.bridgeneg_flag, !!sArgs.outpos_flag, !!sArgs.outneg_flag, !!sArgs.invert_flag ) << endl; 00448 00449 } 00450 } 00451 } 00452 00453 if( sArgs.inputs_num ) 00454 ofsm.close( ); 00455 else 00456 cout.flush( ); 00457 00458 if( !sArgs.inputs_num ) 00459 break; 00460 } 00461 } 00462 00463 return 0; 00464 } 00465 00466 struct SSorterMod { 00467 const vector<float>& m_vecdValues; 00468 00469 SSorterMod( const vector<float>& vecdValues ) : m_vecdValues(vecdValues) { } 00470 00471 bool operator()( size_t iOne, size_t iTwo ) { 00472 00473 return ( m_vecdValues[iTwo] < m_vecdValues[iOne] ); 00474 } 00475 }; 00476 00477 double AUCMod( const CDat& DatData, const CDat& DatAnswers, const vector<bool>& vecfHere, const vector<bool>& vecfUbik, const gengetopt_args_info& sArgs, bool fInvert, 00478 float dAUC ) { 00479 size_t i, j, iPos, iNeg, iPosCur, iNegCur, iOne, iTwo, iIndex, iAUC; 00480 vector<float> vecdValues, vecdAnswers; 00481 vector<size_t> veciGenes, veciIndices; 00482 bool fAnswer; 00483 double dRet; 00484 float d, dAnswer, dSens, dSpec, dSensPrev, dSpecPrev; 00485 00486 veciGenes.resize( DatAnswers.GetGenes( ) ); 00487 for( i = 0; i < veciGenes.size( ); ++i ) 00488 veciGenes[ i ] = DatData.GetGene( DatAnswers.GetGene( i ) ); 00489 00490 for( iPos = iNeg = i = 0; i < DatAnswers.GetGenes( ); ++i ) { 00491 if( ( iOne = veciGenes[ i ] ) == -1 ) 00492 continue; 00493 for( j = ( i + 1 ); j < DatAnswers.GetGenes( ); ++j ) { 00494 if( ( ( iTwo = veciGenes[ j ] ) == -1 ) || 00495 CMeta::IsNaN( dAnswer = DatAnswers.Get( i, j ) ) || 00496 CMeta::IsNaN( d = DatData.Get( iOne, iTwo ) ) ) 00497 continue; 00498 fAnswer = dAnswer > 0; 00499 00500 if ( CMeta::SkipEdge( fAnswer, i, j, vecfHere, vecfUbik, sArgs.ctxtpos_flag, sArgs.ctxtneg_flag, sArgs.bridgepos_flag, sArgs.bridgeneg_flag, sArgs.outpos_flag, sArgs.outneg_flag ) ) { 00501 continue; 00502 } 00503 00504 if( fAnswer ) 00505 iPos++; 00506 else 00507 iNeg++; 00508 if( fInvert ) 00509 d = 1 - d; 00510 vecdAnswers.push_back( dAnswer ); 00511 vecdValues.push_back( d ); 00512 } 00513 } 00514 00515 veciIndices.resize( vecdValues.size( ) ); 00516 for( i = 0; i < veciIndices.size( ); ++i ) 00517 veciIndices[i] = i; 00518 sort( veciIndices.begin( ), veciIndices.end( ), SSorterMod( vecdValues ) ); 00519 00520 iAUC = (size_t)( ( dAUC < 1 ) ? ( dAUC * iNeg ) : dAUC ); 00521 dRet = dSensPrev = dSpecPrev = 0; 00522 for( iPosCur = iNegCur = i = 0; ( i < veciIndices.size( ) ) && ( iNegCur < iAUC ); ++i ) { 00523 iIndex = veciIndices[i]; 00524 if( vecdAnswers[iIndex] > 0 ) 00525 iPosCur++; 00526 else 00527 iNegCur++; 00528 dSens = (float)iPosCur / iPos; 00529 dSpec = 1 - (float)( iNeg - iNegCur ) / iNeg; 00530 if( dSpec > dSpecPrev ) { 00531 dRet += ( dSpec - dSpecPrev ) * dSens; 00532 dSensPrev = dSens; 00533 dSpecPrev = dSpec; 00534 } 00535 } 00536 dRet *= max( 1.0f, (float)iNeg / iAUC ); 00537 00538 return dRet; 00539 }