Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "dat.h" 00024 #include "genome.h" 00025 #include "statistics.h" 00026 #include "annotation.h" 00027 #include "color.h" 00028 #include "meta.h" 00029 00030 namespace Sleipnir { 00031 00032 const char CDatImpl::c_acComment[] = "#"; 00033 const CColor& CDatImpl::c_ColorMax = CColor::c_Red; 00034 const CColor& CDatImpl::c_ColorMin = CColor::c_Green; 00035 const CColor& CDatImpl::c_ColorMid = CColor::c_Black; 00036 00037 static const struct { 00038 const char* m_szExtension; 00039 const CDat::EFormat m_eFormat; 00040 } c_asFormats[] = { 00041 {"dat", CDat::EFormatText}, 00042 {"das", CDat::EFormatSparse}, 00043 {"pcl", CDat::EFormatPCL}, 00044 {"qdab",CDat::EFormatQdab}, 00045 {NULL, CDat::EFormatBinary} 00046 }; 00047 00048 size_t CDatImpl::MapGene( TMapStrI& mapGenes, TVecStr& vecGenes, const std::string& strToken ) { 00049 TMapStrI::iterator iterGenes; 00050 size_t iRet; 00051 00052 if( ( iterGenes = mapGenes.find( strToken ) ) == mapGenes.end( ) ) { 00053 iRet = mapGenes.size( ); 00054 mapGenes[ strToken ] = iRet; 00055 vecGenes.push_back( strToken ); } 00056 else 00057 iRet = iterGenes->second; 00058 00059 return iRet; } 00060 00061 void CDatImpl::ResizeNaN( TAF& vecf, size_t iLast ) { 00062 size_t i; 00063 00064 if( ( i = vecf.size( ) ) > iLast ) 00065 return; 00066 00067 vecf.resize( iLast + 1 ); 00068 for( ; i < vecf.size( ); ++i ) 00069 vecf[ i ] = CMeta::GetNaN( ); } 00070 00071 void CDatImpl::DabGene( std::istream& istm, char* acBuffer ) { 00072 size_t i; 00073 00074 i = 0; 00075 do 00076 istm.seekg( 1, ios_base::cur ); 00077 while( acBuffer[ i++ ] = istm.get( ) ); } 00078 00079 CDatImpl::~CDatImpl( ) { 00080 00081 Reset( ); } 00082 00083 void CDatImpl::Reset( ) { 00084 00085 m_Data.Reset( ); 00086 m_vecstrGenes.clear( ); 00087 00088 if( m_pPCL && m_fPCLMemory ) 00089 delete m_pPCL; 00090 m_pPCL = NULL; 00091 if( m_pMeasure && m_fMeasureMemory ) 00092 delete m_pMeasure; 00093 m_pMeasure = NULL; 00094 00095 CMeta::Unmap( m_abData, m_hndlData, m_iData ); 00096 m_abData = NULL; 00097 m_hndlData = 0; 00098 m_iData = 0; 00099 if( m_aadData ) 00100 delete[] m_aadData; 00101 m_aadData = NULL; } 00102 00103 void CDatImpl::SlimCache( const CSlim& Slim, vector<vector<size_t> >& vecveciGenes ) const { 00104 size_t iS, iG; 00105 00106 vecveciGenes.resize( Slim.GetSlims( ) ); 00107 for( iS = 0; iS < vecveciGenes.size( ); ++iS ) { 00108 vecveciGenes[ iS ].resize( Slim.GetGenes( iS ) ); 00109 for( iG = 0; iG < vecveciGenes[ iS ].size( ); ++iG ) 00110 vecveciGenes[ iS ][ iG ] = GetGene( Slim.GetGene( iS, iG ).GetName( ) ); } } 00111 00129 bool CDat::Open( const CSlim& Slim ) { 00130 vector<string> vecstrGenes; 00131 size_t iS1, iS2, iG1, iG2, iGene1, iGene2; 00132 vector<vector<size_t> > vecveciGenes; 00133 00134 Reset( ); 00135 Slim.GetGeneNames( vecstrGenes ); 00136 if( !Open( vecstrGenes ) ) 00137 return false; 00138 00139 SlimCache( Slim, vecveciGenes ); 00140 for( iS1 = 0; iS1 < Slim.GetSlims( ); ++iS1 ) { 00141 g_CatSleipnir( ).info( "CDat::Open( ) processing slim: %s", 00142 Slim.GetSlim( iS1 ).c_str( ) ); 00143 for( iG1 = 0; iG1 < Slim.GetGenes( iS1 ); ++iG1 ) { 00144 iGene1 = vecveciGenes[ iS1 ][ iG1 ]; 00145 for( iG2 = ( iG1 + 1 ); iG2 < Slim.GetGenes( iS1 ); ++iG2 ) 00146 Set( iGene1, vecveciGenes[ iS1 ][ iG2 ], 1 ); 00147 for( iS2 = ( iS1 + 1 ); iS2 < Slim.GetSlims( ); ++iS2 ) 00148 for( iG2 = 0; iG2 < Slim.GetGenes( iS2 ); ++iG2 ) { 00149 iGene2 = vecveciGenes[ iS2 ][ iG2 ]; 00150 if( CMeta::IsNaN( Get( iGene1, iGene2 ) ) ) 00151 Set( iGene1, iGene2, 0 ); } } } 00152 00153 return true; } 00154 00176 bool CDat::Open( const CSlim& SlimPositives, const CSlim& SlimNonnegatives ) { 00177 set<string> setstrGenes; 00178 vector<string> vecstrGenes; 00179 size_t iS1, iS2, iG1, iG2, iGene1, iGene2; 00180 vector<vector<size_t> > vecveciGenes; 00181 00182 Reset( ); 00183 SlimPositives.GetGeneNames( vecstrGenes ); 00184 setstrGenes.insert( vecstrGenes.begin( ), vecstrGenes.end( ) ); 00185 vecstrGenes.clear( ); 00186 SlimNonnegatives.GetGeneNames( vecstrGenes ); 00187 setstrGenes.insert( vecstrGenes.begin( ), vecstrGenes.end( ) ); 00188 vecstrGenes.clear( ); 00189 vecstrGenes.resize( setstrGenes.size( ) ); 00190 copy( setstrGenes.begin( ), setstrGenes.end( ), vecstrGenes.begin( ) ); 00191 if( !Open( vecstrGenes ) ) 00192 return false; 00193 00194 SlimCache( SlimPositives, vecveciGenes ); 00195 for( iS1 = 0; iS1 < SlimPositives.GetSlims( ); ++iS1 ) { 00196 g_CatSleipnir( ).info( "CDat::Open( ) processing slim: %s", 00197 SlimPositives.GetSlim( iS1 ).c_str( ) ); 00198 for( iG1 = 0; iG1 < SlimPositives.GetGenes( iS1 ); ++iG1 ) { 00199 iGene1 = vecveciGenes[ iS1 ][ iG1 ]; 00200 for( iG2 = ( iG1 + 1 ); iG2 < SlimPositives.GetGenes( iS1 ); ++iG2 ) 00201 Set( iGene1, vecveciGenes[ iS1 ][ iG2 ], 1 ); } } 00202 vecveciGenes.clear( ); 00203 SlimCache( SlimNonnegatives, vecveciGenes ); 00204 for( iS1 = 0; iS1 < SlimNonnegatives.GetSlims( ); ++iS1 ) { 00205 g_CatSleipnir( ).info( "CDat::Open( ) processing slim: %s", 00206 SlimNonnegatives.GetSlim( iS1 ).c_str( ) ); 00207 for( iG1 = 0; iG1 < SlimNonnegatives.GetGenes( iS1 ); ++iG1 ) { 00208 iGene1 = vecveciGenes[ iS1 ][ iG1 ]; 00209 for( iS2 = ( iS1 + 1 ); iS2 < SlimNonnegatives.GetSlims( ); ++iS2 ) 00210 for( iG2 = 0; iG2 < SlimNonnegatives.GetGenes( iS2 ); ++iG2 ) { 00211 iGene2 = vecveciGenes[ iS2 ][ iG2 ]; 00212 if( CMeta::IsNaN( Get( iGene1, iGene2 ) ) ) 00213 Set( iGene1, iGene2, 0 ); } } } 00214 00215 return true; } 00216 00253 bool CDat::Open( const CDat& DatKnown, const vector<CGenes*>& vecpOther, const CGenome& Genome, 00254 bool fKnownNegatives, bool fIncident) { 00255 size_t i, j, iOne, iTwo; 00256 vector<size_t> veciGenes; 00257 float d; 00258 00259 Reset( ); 00260 Open( Genome.GetGeneNames( ) ); 00261 for( i = 0; i < vecpOther.size( ); ++i ) 00262 OpenHelper( vecpOther[ i ], 1 ); 00263 if( !fKnownNegatives ) 00264 for( i = 0; i < GetGenes( ); ++i ) 00265 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 00266 Set( i, j, CMeta::IsNaN( Get( i, j ) ) ? 0 : CMeta::GetNaN( ) ); 00267 00268 veciGenes.resize( DatKnown.GetGenes( ) ); 00269 for( i = 0; i < veciGenes.size( ); ++i ) 00270 veciGenes[ i ] = GetGene( DatKnown.GetGene( i ) ); 00271 for( i = 0; i < DatKnown.GetGenes( ); ++i ) { 00272 iOne = veciGenes[ i ]; 00273 for( j = ( i + 1 ); j < DatKnown.GetGenes( ); ++j ) { 00274 iTwo = veciGenes[ j ]; 00275 if( CMeta::IsNaN( d = DatKnown.Get( i, j ) ) ) { 00276 if( fKnownNegatives && CMeta::IsNaN( Get( iOne, iTwo ) ) ) 00277 Set( iOne, iTwo, 0 ); } 00278 else if( fKnownNegatives == !d ) 00279 Set( iOne, iTwo, d ); } } 00280 00281 00282 if( fIncident ) { 00283 vector<float> vecfNegatives; 00284 vecfNegatives.resize( GetGenes( ) ); 00285 fill( vecfNegatives.begin( ), vecfNegatives.end( ), false ); 00286 00287 for( i = 0; i < vecfNegatives.size( ); ++i ) 00288 for( j = 0; j < vecpOther.size( ); ++j ) 00289 if( vecpOther[j]->IsGene( GetGene( i ) ) ) { 00290 vecfNegatives[i] = true; 00291 break; 00292 } 00293 00294 for( i = 0; i < vecfNegatives.size( ); ++i ) 00295 if( DatKnown.GetGene( GetGene( i ) ) != -1 ) 00296 vecfNegatives[i] = true; 00297 00298 for( i = 0; i < GetGenes( ); ++i ) { 00299 if( vecfNegatives[i] ) 00300 continue; 00301 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 00302 if( !( vecfNegatives[j] || Get( i, j ) ) ){ 00303 Set( i, j, CMeta::GetNaN( ) ); 00304 //cerr << "setting to NaN" << endl; 00305 } 00306 } 00307 } 00308 00309 return true; } 00310 00321 bool CDat::Open( const CDat& Dat ) { 00322 size_t i; 00323 00324 if( !Open( Dat.GetGeneNames( ) ) ) 00325 return false; 00326 00327 for( i = 0; i < GetGenes( ); ++i ) 00328 memcpy( Get( i ), Dat.Get( i ), ( GetGenes( ) - i - 1 ) * sizeof(*Get( i )) ); 00329 00330 return true; } 00331 00367 bool CDat::Open( const vector<CGenes*>& vecpPositives, const vector<CGenes*>& vecpNonnegatives, 00368 float dPValue, const CGenome& Genome, bool fIncident ) { 00369 size_t i, j, k, iOne, iTwo, iOverlap; 00370 float d; 00371 const CGenes* pBig; 00372 const CGenes* pSmall; 00373 00374 Reset( ); 00375 Open( Genome.GetGeneNames( ) ); 00376 for( i = 0; i < vecpPositives.size( ); ++i ) 00377 OpenHelper( vecpPositives[ i ], 1 ); 00378 if( vecpNonnegatives.size( ) ) { 00379 for( i = 0; i < vecpNonnegatives.size( ); ++i ) 00380 OpenHelper( vecpNonnegatives[ i ], 0 ); 00381 if( dPValue > 0 ) 00382 for( i = 0; i < vecpPositives.size( ); ++i ) { 00383 iOne = vecpPositives[ i ]->GetGenes( ); 00384 for( j = ( i + 1 ); j < vecpPositives.size( ); ++j ) { 00385 iTwo = vecpPositives[ j ]->GetGenes( ); 00386 if( iOne < iTwo ) { 00387 pSmall = vecpPositives[ i ]; 00388 pBig = vecpPositives[ j ]; } 00389 else { 00390 pSmall = vecpPositives[ j ]; 00391 pBig = vecpPositives[ i ]; } 00392 for( iOverlap = k = 0; k < pSmall->GetGenes( ); ++k ) 00393 if( pBig->IsGene( pSmall->GetGene( k ).GetName( ) ) ) 00394 iOverlap++; 00395 if( CStatistics::HypergeometricCDF( iOverlap, iOne, iTwo, GetGenes( ) ) < dPValue ) 00396 OpenHelper( pBig, pSmall, 0 ); } } 00397 for( i = 0; i < GetGenes( ); ++i ) 00398 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 00399 if( CMeta::IsNaN( d = Get( i, j ) ) ) 00400 Set( i, j, 0 ); 00401 else if( !d ) 00402 Set( i, j, CMeta::GetNaN( ) ); 00403 if( fIncident ) { 00404 vector<float> vecfNegatives; 00405 00406 vecfNegatives.resize( GetGenes( ) ); 00407 fill( vecfNegatives.begin( ), vecfNegatives.end( ), false ); 00408 for( i = 0; i < vecfNegatives.size( ); ++i ) 00409 for( j = 0; j < vecpNonnegatives.size( ); ++j ) 00410 if( vecpNonnegatives[j]->IsGene( GetGene( i ) ) ) { 00411 vecfNegatives[i] = true; 00412 break; } 00413 for( i = 0; i < GetGenes( ); ++i ) { 00414 if( vecfNegatives[i] ) 00415 continue; 00416 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 00417 if( !( vecfNegatives[j] || Get( i, j ) ) ) 00418 Set( i, j, CMeta::GetNaN( ) ); } } } 00419 00420 return true; } 00421 00422 void CDatImpl::OpenHelper( const CGenes* pGenes, float dValue ) { 00423 vector<size_t> veciGenes; 00424 size_t i, j, iOne, iTwo; 00425 00426 veciGenes.resize( pGenes->GetGenes( ) ); 00427 for( i = 0; i < veciGenes.size( ); ++i ) 00428 veciGenes[ i ] = GetGene( pGenes->GetGene( i ).GetName( ) ); 00429 for( i = 0; i < veciGenes.size( ); ++i ) { 00430 iOne = veciGenes[ i ]; 00431 for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) { 00432 iTwo = veciGenes[ j ]; 00433 if( CMeta::IsNaN( Get( iOne, iTwo ) ) ) 00434 Set( iOne, iTwo, dValue ); } } } 00435 00436 void CDatImpl::OpenHelper( const CGenes* pOne, const CGenes* pTwo, float dValue ) { 00437 vector<size_t> veciOne, veciTwo; 00438 size_t i, j, iOne, iTwo; 00439 00440 veciOne.resize( pOne->GetGenes( ) ); 00441 for( i = 0; i < veciOne.size( ); ++i ) 00442 veciOne[ i ] = GetGene( pOne->GetGene( i ).GetName( ) ); 00443 veciTwo.resize( pTwo->GetGenes( ) ); 00444 for( i = 0; i < veciTwo.size( ); ++i ) 00445 veciTwo[ i ] = GetGene( pTwo->GetGene( i ).GetName( ) ); 00446 for( i = 0; i < veciOne.size( ); ++i ) { 00447 iOne = veciOne[ i ]; 00448 for( j = 0; j < veciTwo.size( ); ++j ) { 00449 iTwo = veciTwo[ j ]; 00450 if( CMeta::IsNaN( Get( iOne, iTwo ) ) ) 00451 Set( iOne, iTwo, dValue ); } } } 00452 00492 bool CDat::Open( std::istream& istm, EFormat eFormat, float dDefault, bool fDuplicates, size_t iSkip, 00493 bool fZScore, bool fSeek ) { 00494 00495 switch( eFormat ) { 00496 case EFormatText: 00497 return OpenText( istm, dDefault, fDuplicates ); 00498 00499 case EFormatPCL: 00500 return OpenPCL( istm, iSkip, fZScore ); 00501 00502 case EFormatSparse: 00503 return OpenSparse( istm ); 00504 00505 case EFormatQdab: 00506 return OpenQdab( istm ); 00507 00508 } 00509 00510 if(fSeek){ 00511 m_fSeek = true; 00512 } 00513 00514 return OpenBinary( istm, fSeek ); 00515 } 00516 00517 bool CDatImpl::OpenPCL( std::istream& istm, size_t iSkip, bool fZScore ) { 00518 00519 Reset( ); 00520 m_pPCL = new CPCL( ); 00521 if( !m_pPCL->Open( istm, iSkip ) ) 00522 return false; 00523 00524 m_pMeasure = new CMeasurePearNorm( ); 00525 m_fMeasureMemory = true; 00526 if( fZScore ) { 00527 size_t iN; 00528 double dAve, dStd; 00529 00530 AveStd( dAve, dStd, iN ); 00531 delete m_pMeasure; 00532 m_pMeasure = new CMeasurePearNorm( dAve, dStd ); } 00533 00534 return true; } 00535 00562 bool CDat::Open( const CPCL& PCL, const IMeasure* pMeasure, bool fMeasureMemory ) { 00563 00564 Reset( ); 00565 m_pPCL = (CPCL*)&PCL; 00566 m_fPCLMemory = false; 00567 m_pMeasure = pMeasure; 00568 m_fMeasureMemory = fMeasureMemory; 00569 00570 return true; } 00571 00572 bool CDatImpl::OpenText( std::istream& istm, float dDefault, bool fDuplicates ) { 00573 const char* pc; 00574 char* pcTail; 00575 char* acBuf; 00576 string strToken, strCache, strValue; 00577 TMapStrI mapGenes; 00578 size_t iOne, iTwo, i; 00579 float dScore; 00580 TAAF vecvecfScores; 00581 00582 Reset( ); 00583 acBuf = new char[ c_iBufferSize ]; 00584 while( istm.peek( ) != EOF ) { 00585 istm.getline( acBuf, c_iBufferSize - 1 ); 00586 strToken = OpenToken( acBuf, &pc ); 00587 if( !strToken.length( ) ) 00588 break; 00589 if( strToken == c_acComment ) 00590 continue; 00591 if( strToken != strCache ) { 00592 strCache = strToken; 00593 iOne = MapGene( mapGenes, m_vecstrGenes, strToken ); } 00594 00595 strToken = OpenToken( pc, &pc ); 00596 if( !strToken.length( ) ) { 00597 Reset( ); 00598 delete[] acBuf; 00599 return false; } 00600 iTwo = MapGene( mapGenes, m_vecstrGenes, strToken ); 00601 strValue = OpenToken( pc ); 00602 if( !strValue.length( ) ) { 00603 if( CMeta::IsNaN( dScore = dDefault ) ) { 00604 Reset( ); 00605 delete[] acBuf; 00606 return false; } } 00607 else if( !( dScore = (float)strtod( strValue.c_str( ), &pcTail ) ) && 00608 ( pcTail != ( strValue.c_str( ) + strValue.length( ) ) ) ) { 00609 Reset( ); 00610 delete[] acBuf; 00611 return false; } 00612 00613 i = ( ( iOne > iTwo ) ? iOne : iTwo ); 00614 if( vecvecfScores.size( ) <= i ) 00615 vecvecfScores.resize( i + 1 ); 00616 ResizeNaN( vecvecfScores[ iOne ], i ); 00617 ResizeNaN( vecvecfScores[ iTwo ], i ); 00618 if( !CMeta::IsNaN( vecvecfScores[ iOne ][ iTwo ] ) && ( vecvecfScores[ iOne ][ iTwo ] != dScore ) ) { 00619 g_CatSleipnir( ).error( "CDatImpl::OpenText( ) duplicate genes %s, %s (%g:%g)", 00620 strCache.c_str( ), strToken.c_str( ), vecvecfScores[ iOne ][ iTwo ], 00621 dScore ); 00622 if( !fDuplicates ) { 00623 Reset( ); 00624 delete[] acBuf; 00625 return false; } } 00626 vecvecfScores[ iOne ][ iTwo ] = vecvecfScores[ iTwo ][ iOne ] = dScore; } 00627 delete[] acBuf; 00628 00629 m_Data.Initialize( GetGenes( ) ); 00630 for( iOne = 0; iOne < GetGenes( ); ++iOne ) 00631 for( iTwo = ( iOne + 1 ); iTwo < GetGenes( ); ++iTwo ) 00632 Set( iOne, iTwo, ( ( iTwo < vecvecfScores[ iOne ].size( ) ) ? 00633 vecvecfScores[ iOne ][ iTwo ] : CMeta::GetNaN( ) ) ); 00634 00635 return true; } 00636 00637 bool CDatImpl::OpenBinary( std::istream& istm, bool fSeek ) { 00638 size_t i; 00639 float* adScores; 00640 00641 if(fSeek){ 00642 if(!OpenHeader(istm)){ 00643 cerr << "Error opening header" << endl; 00644 return false; 00645 } 00646 return true; 00647 } 00648 00649 if( !OpenGenes( istm, true, false ) ) 00650 return false; 00651 00652 fprintf(stderr, "Reading genes\n"); 00653 m_Data.Initialize( GetGenes( ) ); 00654 adScores = new float[ GetGenes( ) - 1 ]; 00655 for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) { 00656 istm.read( (char*)adScores, sizeof(*adScores) * ( GetGenes( ) - i - 1 ) ); 00657 Set( i, adScores ); } 00658 delete[] adScores; 00659 00660 return true; } 00661 00662 /* still to be tested 00663 * used for BINARY mode, and DAB file only, ie Float matrix 00664 */ 00665 bool CDatImpl::OpenHeader(std::istream& istm){ 00666 if(!m_fSeek){ 00667 cerr << "Don't know how you got here" << endl; 00668 } 00669 00670 if(!OpenGenes(istm, true, false)){ 00671 return false; 00672 } 00673 EstimateSeekPositions(istm); 00674 return true; 00675 } 00676 00677 /* still to be tested 00678 * used for BINARY mode, and DAB file only, ie Float matrix 00679 */ 00680 float* CDatImpl::GetRowSeek(std::istream& istm, const size_t &ind) const { 00681 if(!m_fSeek){ 00682 cerr << "Don't know how you got here" << endl; 00683 } 00684 00685 size_t iRow, iColumn; 00686 size_t i, iNumGenes; 00687 iNumGenes = GetGenes(); 00688 float* adScores = (float*)malloc(iNumGenes * sizeof(float)); 00689 00690 size_t j; 00691 for(i=0; i<ind; i++){ 00692 iRow = i; 00693 iColumn = ind - 1; 00694 size_t offset1 = m_iHeader + m_veciSeekPos[iRow] + iColumn * sizeof(float); 00695 istm.seekg(offset1, ios_base::beg); 00696 float v; 00697 char *p = (char*) &v; 00698 istm.read(p, sizeof(float)); 00699 adScores[i] = v; 00700 } 00701 00702 adScores[ind] = CMeta::GetNaN(); 00703 00704 int iSize = iNumGenes - (ind+1); 00705 if(iSize==0){ 00706 return adScores; 00707 } 00708 00709 float *v = (float*)malloc(iSize*sizeof(float)); 00710 char *p = (char*) v; 00711 istm.seekg(m_iHeader+m_veciSeekPos[ind], ios_base::beg); 00712 istm.read(p, iSize*sizeof(float)); 00713 for(i=0; i<iSize; i++){ 00714 adScores[i+ind+1] = v[i]; 00715 } 00716 free(v); 00717 00718 return adScores; 00719 } 00720 00721 /* still to be tested 00722 * used for BINARY mode, and DAB file only, ie Float matrix 00723 */ 00724 float* CDatImpl::GetRowSeek(std::istream& istm, const std::string &strGene) const{ 00725 if(!m_fSeek){ 00726 cerr << "Don't know how you got here" << endl; 00727 } 00728 size_t i; 00729 size_t iNumGenes = GetGenes(); 00730 size_t ind; 00731 00732 if( (ind = GetGeneIndex(strGene) ) ==-1){ 00733 return NULL; //missing gene 00734 } 00735 00736 return GetRowSeek(istm, ind); 00737 } 00738 00739 bool CDatImpl::OpenQdab( std::istream& istm ) { 00740 size_t iTotal, i, j, num_bins, num_bits, iPos; 00741 float* adScores; 00742 unsigned char tmp; 00743 float* bounds; 00744 unsigned char btmpf; 00745 unsigned char btmpb; 00746 00747 unsigned char bufferA; 00748 unsigned char bufferB; 00749 00750 float nan_val; 00751 00752 if( !OpenGenes( istm, true, false ) ) 00753 return false; 00754 m_Data.Initialize( GetGenes( ) ); 00755 00756 // read the number of bins 00757 istm.read((char*)&tmp, sizeof(char)); 00758 num_bins = (size_t)tmp; 00759 00760 //read the bin boundaries 00761 bounds = new float[num_bins]; 00762 istm.read((char*)bounds, sizeof(float) * num_bins); 00763 00764 // number of bits required for each bin representation 00765 num_bits = (size_t)ceil(log( num_bins ) / log ( 2.0 )); 00766 00767 // add one more bit for NaN case 00768 if( pow(2, num_bits) == num_bins ) 00769 ++num_bits; 00770 00771 // set nan value 00772 nan_val = pow(2, num_bits) -1; 00773 00774 // read the data 00775 adScores = new float[ GetGenes( ) - 1 ]; 00776 00777 istm.read( (char*)&bufferA, sizeof(bufferA)); 00778 istm.read( (char*)&bufferB, sizeof(bufferB)); 00779 00780 for( iTotal = i = 0; ( i + 1 ) < GetGenes( ); ++i ) { 00781 for(j = 0; j < ( GetGenes( ) - i - 1 ); ++j){ 00782 iPos = (iTotal * num_bits) % 8; 00783 00784 // check bit data overflow?? 00785 if( iPos + num_bits > 8){ 00786 btmpb = (bufferA << iPos); 00787 btmpf = (bufferB >> (16 - num_bits - iPos)) << (8-num_bits); 00788 adScores[j] = ((btmpb | btmpf) >> (8 - num_bits)); 00789 ++iTotal; 00790 bufferA = bufferB; 00791 istm.read( (char*)&bufferB, sizeof(bufferB)); 00792 } 00793 else{ 00794 adScores[j] = (((bufferA << iPos) & 0xFF) >> (8 - num_bits)); 00795 ++iTotal; 00796 if( iPos + num_bits == 8 ) { 00797 bufferA = bufferB; 00798 istm.read( (char*)&bufferB, sizeof(bufferB)); 00799 } 00800 } 00801 00802 // check if value added was promised 2^#bits -1 (NaN value) 00803 if(adScores[j] == nan_val) 00804 adScores[j] = CMeta::GetNaN( ); 00805 } 00806 00807 Set( i, adScores ); 00808 } 00809 00810 delete[] adScores; 00811 delete[] bounds; 00812 return true; } 00813 00814 00815 bool CDatImpl::OpenSparse( std::istream& istm ) { 00816 size_t i; 00817 uint32_t j; 00818 float d; 00819 00820 if( !OpenGenes( istm, true, false ) ) 00821 return false; 00822 m_Data.Initialize( GetGenes( ) ); 00823 for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) { 00824 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 00825 Set( i, j, CMeta::GetNaN( ) ); 00826 while( true ) { 00827 istm.read( (char*)&j, sizeof(j) ); 00828 if( j == -1 ) 00829 break; 00830 istm.read( (char*)&d, sizeof(d) ); 00831 Set( i, j, d ); } } 00832 00833 return true; } 00834 00857 bool CDat::OpenGenes( const char* szFile, size_t iSkip ) { 00858 ifstream ifsm; 00859 bool fBinary; 00860 size_t i; 00861 EFormat eFormat; 00862 00863 for( i = 0; c_asFormats[ i ].m_szExtension; ++i ) 00864 if( !strcmp( szFile + strlen( szFile ) - strlen( c_asFormats[ i ].m_szExtension ), 00865 c_asFormats[ i ].m_szExtension ) ) 00866 break; 00867 eFormat = c_asFormats[ i ].m_eFormat; 00868 fBinary = ( eFormat != EFormatText ) && ( eFormat != EFormatPCL ); 00869 ifsm.open( szFile, fBinary ? ios_base::binary : ios_base::in ); 00870 00871 return OpenGenes( ifsm, fBinary, ( eFormat == EFormatPCL ) ); } 00872 00899 bool CDat::OpenGenes( std::istream& istm, bool fBinary, bool fPCL ) { 00900 00901 return CDatImpl::OpenGenes( istm, fBinary, fPCL ); } 00902 00903 bool CDatImpl::OpenGenes( std::istream& istm, bool fBinary, bool fPCL ) { 00904 size_t i, iToken; 00905 uint32_t iCount; 00906 string strToken, strCache; 00907 float d; 00908 const char* pc; 00909 char* acBuf; 00910 00911 Reset( ); 00912 if( fPCL ) { 00913 m_pPCL = new CPCL( ); 00914 if( m_pPCL->Open( istm ) ) { 00915 m_pMeasure = (IMeasure*)1; 00916 m_fMeasureMemory = false; 00917 return true; } 00918 return false; } 00919 acBuf = new char[ c_iBufferSize ]; 00920 if( fBinary ) { 00921 istm.read( (char*)&iCount, sizeof(iCount) ); 00922 if( iCount > c_iGeneLimit ) { 00923 delete[] acBuf; 00924 return false; } 00925 m_vecstrGenes.resize( iCount ); 00926 for( i = 0; i < iCount; ++i ) { 00927 DabGene( istm, acBuf ); 00928 m_vecstrGenes[ i ] = acBuf; 00929 m_mapstrGenes[ acBuf ] = i; } } 00930 else { 00931 set<string> setstrGenes; 00932 set<string>::const_iterator iterGenes; 00933 00934 while( istm.peek( ) != EOF ) { 00935 istm.getline( acBuf, c_iBufferSize - 1 ); 00936 for( iToken = 0; iToken < 3; ++iToken ) { 00937 strToken = OpenToken( acBuf, &pc ); 00938 if( !strToken.length( ) ) 00939 break; 00940 if( strToken != strCache ) { 00941 strCache = strToken; 00942 setstrGenes.insert( strToken ); } 00943 00944 strToken = OpenToken( pc, &pc ); 00945 setstrGenes.insert( strToken ); 00946 d = (float)strtod( ( strToken = OpenToken( pc ) ).c_str( ), (char**)&pc ); 00947 if( !d && ( ( pc - strToken.c_str( ) ) != strToken.length( ) ) ) { 00948 delete[] acBuf; 00949 return false; } } } 00950 m_vecstrGenes.reserve( setstrGenes.size( ) ); 00951 for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes ) 00952 m_vecstrGenes.push_back( *iterGenes ); } 00953 delete[] acBuf; 00954 00955 return true; } 00956 00974 void CDat::Save( const char* szFile ) const { 00975 size_t i; 00976 EFormat eFormat; 00977 ofstream ofsm; 00978 00979 if( !szFile ) { 00980 Save( cout, EFormatText ); 00981 cout.flush( ); 00982 return; } 00983 00984 for( i = 0; c_asFormats[ i ].m_szExtension; ++i ) 00985 if( !strcmp( szFile + strlen( szFile ) - strlen( c_asFormats[ i ].m_szExtension ), 00986 c_asFormats[ i ].m_szExtension ) ) 00987 break; 00988 eFormat = c_asFormats[ i ].m_eFormat; 00989 ofsm.open( szFile, ( ( eFormat == EFormatText ) || ( eFormat == EFormatPCL ) ) ? ios_base::out : 00990 ios_base::binary ); 00991 Save( ofsm, eFormat ); } 00992 01010 void CDat::Save( std::ostream& ostm, EFormat eFormat ) const { 01011 01012 switch( eFormat ) { 01013 case EFormatText: 01014 SaveText( ostm ); 01015 return; 01016 01017 case EFormatSparse: 01018 SaveSparse( ostm ); 01019 return; } 01020 01021 SaveBinary( ostm ); } 01022 01023 void CDatImpl::SaveText( std::ostream& ostm ) const { 01024 size_t i, j; 01025 float d; 01026 01027 for( i = 0; i < GetGenes( ); ++i ) 01028 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01029 if( !CMeta::IsNaN( d = Get( i, j ) ) ) 01030 ostm << GetGene( i ) << '\t' << GetGene( j ) << '\t' << d << endl; } 01031 01032 void CDatImpl::SaveBinary( std::ostream& ostm ) const { 01033 size_t i, j; 01034 const float* pd; 01035 float d; 01036 01037 SaveGenes( ostm ); 01038 if( m_pMeasure ) { 01039 for( i = 0; i < GetGenes( ); ++i ) 01040 for( j = ( i + 1 ); j < GetGenes( ); ++j ) { 01041 d = Get( i, j ); 01042 ostm.write( (char*)&d, sizeof(d) ); } } 01043 else 01044 for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) { 01045 pd = m_Data.Get( i ); 01046 ostm.write( (char*)pd, sizeof(*pd) * ( GetGenes( ) - i - 1 ) ); } } 01047 01048 void CDatImpl::SaveSparse( std::ostream& ostm ) const { 01049 uint32_t i, j; 01050 float d; 01051 01052 SaveGenes( ostm ); 01053 for( i = 0; i < GetGenes( ); ++i ) { 01054 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01055 if( !CMeta::IsNaN( d = Get( i, j ) ) ) { 01056 ostm.write( (char*)&j, sizeof(j) ); 01057 ostm.write( (char*)&d, sizeof(d) ); } 01058 j = -1; 01059 ostm.write( (char*)&j, sizeof(j) ); } } 01060 01061 void CDatImpl::SaveGenes( std::ostream& ostm ) const { 01062 size_t i, j; 01063 uint32_t iSize; 01064 string strGene; 01065 01066 iSize = GetGenes( ); 01067 ostm.write( (char*)&iSize, sizeof(iSize) ); 01068 for( i = 0; i < iSize; ++i ) { 01069 strGene = GetGene( i ); 01070 for( j = 0; j < strGene.length( ); ++j ) { 01071 ostm.put( 0 ); 01072 ostm.put( strGene[ j ] ); } 01073 ostm.put( 0 ); 01074 ostm.put( 0 ); } } 01075 01095 bool CDat::Open( const std::vector<std::string>& vecstrGenes, bool fClear, const char* szFile ) { 01096 size_t i, j, iSize; 01097 unsigned char* pb; 01098 01099 Reset( ); 01100 m_vecstrGenes.resize( vecstrGenes.size( ) ); 01101 copy( vecstrGenes.begin( ), vecstrGenes.end( ), m_vecstrGenes.begin( ) ); 01102 01103 m_mapstrGenes.clear(); 01104 for( i = 0; i < vecstrGenes.size(); ++i ) 01105 m_mapstrGenes[vecstrGenes[i]] = i; 01106 01107 if( szFile ) { 01108 iSize = sizeof(uint32_t); 01109 for( i = 0; i < GetGenes( ); ++i ) 01110 iSize += 2 * ( GetGene( i ).length( ) + 1 ); 01111 iSize += CDistanceMatrix::GetSpace( GetGenes( ) ); 01112 if( !CMeta::MapWrite( m_abData, m_hndlData, iSize, szFile ) ) 01113 return false; 01114 *(uint32_t*)( pb = m_abData ) = GetGenes( ); 01115 pb += sizeof(uint32_t); 01116 for( i = 0; i < GetGenes( ); ++i ) { 01117 const string& strGene = GetGene( i ); 01118 01119 for( j = 0; j < strGene.length( ); ++j ) { 01120 *pb++ = 0; 01121 *pb++ = strGene[ j ]; } 01122 *pb++ = 0; 01123 *pb++ = 0; } 01124 01125 if( !OpenMemmap( pb ) ) 01126 return false; } 01127 else 01128 m_Data.Initialize( GetGenes( ) ); 01129 01130 if( fClear ) 01131 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 01132 for( j = ( i + 1 ); j < m_vecstrGenes.size( ); ++j ) 01133 Set( i, j, CMeta::GetNaN( ) ); 01134 01135 return true; } 01136 01157 bool CDat::Open( const std::vector<std::string>& vecstrGenes, const CDistanceMatrix& MatScores ) { 01158 size_t i; 01159 01160 if( vecstrGenes.size( ) != MatScores.GetSize( ) ) 01161 return false; 01162 01163 Reset( ); 01164 m_vecstrGenes.reserve( vecstrGenes.size( ) ); 01165 for( i = 0; i < vecstrGenes.size( ); ++i ) 01166 m_vecstrGenes.push_back( vecstrGenes[ i ] ); 01167 01168 for( i = 0; i < vecstrGenes.size(); ++i ) 01169 m_mapstrGenes[vecstrGenes[i]] = i; 01170 01171 m_Data.Initialize( MatScores ); 01172 01173 return true; } 01174 01175 size_t CDatImpl::GetGene( const std::string& strGene ) const { 01176 size_t i; 01177 01178 if( m_pMeasure ) 01179 return m_pPCL->GetGene( strGene ); 01180 01181 for( i = 0; i < GetGenes( ); ++i ) 01182 if( m_vecstrGenes[ i ] == strGene ) 01183 return i; 01184 01185 return -1; } 01186 01220 bool CDat::Open( const char* szFile, bool fMemmap, size_t iSkip, bool fZScore, bool fDuplicates, bool fSeek ) { 01221 EFormat eFormat; 01222 size_t i; 01223 01224 if( !szFile ) 01225 return Open( cin, EFormatText, (float)HUGE_VAL, fDuplicates ); 01226 01227 for( i = 0; c_asFormats[ i ].m_szExtension; ++i ) 01228 if( !strcmp( szFile + strlen( szFile ) - strlen( c_asFormats[ i ].m_szExtension ), 01229 c_asFormats[ i ].m_szExtension ) ) 01230 break; 01231 eFormat = c_asFormats[ i ].m_eFormat; 01232 01233 if( fMemmap && ( eFormat == EFormatBinary ) ) { 01234 Reset( ); 01235 if( !CMeta::MapRead( m_abData, m_hndlData, m_iData, szFile ) ) { 01236 g_CatSleipnir( ).error( "CDat::Open( %s, %d ) failed memory mapping", szFile, fMemmap ); 01237 return false; } 01238 return OpenHelper( ); } 01239 01240 if(m_ifsm.is_open()){ 01241 m_ifsm.close(); 01242 } 01243 01244 m_ifsm.open( szFile, ( ( eFormat == EFormatText ) || ( eFormat == EFormatPCL ) ) ? ios_base::in : 01245 ios_base::binary ); 01246 if( !m_ifsm.is_open( ) ) 01247 return false; 01248 01249 if(fSeek){ 01250 m_fSeek = true; 01251 } 01252 01253 return Open( m_ifsm, eFormat, (float)HUGE_VAL, fDuplicates, iSkip, fZScore, fSeek ); } 01254 01255 bool CDatImpl::OpenHelper( ) { 01256 unsigned char* pb; 01257 size_t i; 01258 01259 m_vecstrGenes.resize( *(uint32_t*)( pb = m_abData ) ); 01260 pb += sizeof(uint32_t); 01261 for( i = 0; i < GetGenes( ); ++i ) { 01262 string& strGene = m_vecstrGenes[ i ]; 01263 01264 while( *++pb ) 01265 strGene += *pb++; 01266 pb++; } 01267 01268 return OpenMemmap( pb ); } 01269 01270 bool CDatImpl::OpenMemmap( const unsigned char* pb ) { 01271 size_t i; 01272 01273 m_aadData = new float*[ GetGenes( ) - 1 ]; 01274 m_aadData[ 0 ] = (float*)pb; 01275 for( i = 1; ( i + 1 ) < m_vecstrGenes.size( ); ++i ) 01276 m_aadData[ i ] = m_aadData[ i - 1 ] + GetGenes( ) - i; 01277 m_Data.Initialize( GetGenes( ), (float**)m_aadData ); 01278 01279 return true; } 01280 01281 void CDatImpl::AveStd( double& dAverage, double& dStdev, size_t& iN, size_t iApproximate ) const { 01282 size_t i, j; 01283 float d; 01284 01285 dAverage = dStdev = 0; 01286 if( iApproximate == -1 ) { 01287 for( iN = i = 0; i < GetGenes( ); ++i ) 01288 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01289 if( !CMeta::IsNaN( d = Get( i, j ) ) ) { 01290 iN++; 01291 dAverage += d; 01292 dStdev += d * d; } } 01293 else { 01294 size_t iOne, iTwo; 01295 01296 for( i = 0; i < iApproximate; ++i ) { 01297 iOne = rand( ) % GetGenes( ); 01298 if( ( ( iTwo = rand( ) % GetGenes( ) ) == iOne ) || 01299 CMeta::IsNaN( d = Get( iOne, iTwo ) ) ) { 01300 i--; 01301 continue; } 01302 dAverage += d; 01303 dStdev += d * d; } 01304 iN = i; } 01305 if( iN ) { 01306 dAverage /= iN; 01307 dStdev = sqrt( ( dStdev / ( iN - ( ( iN > 1 ) ? 1 : 0 ) ) ) - ( dAverage * dAverage ) ); 01308 } 01309 01310 } 01311 01312 void CDatImpl::NormalizeStdev( ) { 01313 double d, dAve, dDev; 01314 size_t i, j, iN; 01315 01316 AveStd( dAve, dDev, iN ); 01317 if( !( iN && dDev ) ) 01318 return; 01319 for( i = 0; i < GetGenes( ); ++i ) 01320 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01321 if( !CMeta::IsNaN( (float)( d = Get( i, j ) ) ) ) 01322 Set( i, j, (float)( ( d - dAve ) / dDev ) ); } 01323 01324 void CDatImpl::NormalizeSigmoid( ) { 01325 size_t i, j; 01326 float d; 01327 01328 for( i = 0; i < GetGenes( ); ++i ) 01329 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01330 if( !CMeta::IsNaN( d = Get( i, j ) ) ) 01331 Set( i, j, 1.0f / ( 1 + exp( -d ) ) ); } 01332 01333 void CDatImpl::NormalizeNormCDF( ) { 01334 size_t i, j, iN; 01335 float d; 01336 double dAve, dStd; 01337 01338 AveStd( dAve, dStd, iN ); 01339 for( i = 0; i < GetGenes( ); ++i ) 01340 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01341 if( !CMeta::IsNaN( d = Get( i, j ) ) ) 01342 Set( i, j, (float)CStatistics::NormalCDF( d, dAve, dStd ) ); } 01343 01344 void CDatImpl::NormalizeMinmax( ) { 01345 float d, dMin, dMax; 01346 size_t i, j; 01347 01348 dMax = -( dMin = FLT_MAX ); 01349 for( i = 0; i < GetGenes( ); ++i ) 01350 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01351 if( !CMeta::IsNaN( d = Get( i, j ) ) ) { 01352 if( d < dMin ) 01353 dMin = d; 01354 if( d > dMax ) 01355 dMax = d; } 01356 if( dMax -= dMin ) 01357 for( i = 0; i < GetGenes( ); ++i ) 01358 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01359 Set( i, j, ( Get( i, j ) - dMin ) / dMax ); } 01360 01361 void CDatImpl::NormalizeMinmaxNPone( ) { 01362 float d, dMin, dMax; 01363 size_t i, j; 01364 01365 dMax = -( dMin = FLT_MAX ); 01366 for( i = 0; i < GetGenes( ); ++i ) 01367 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01368 if( !CMeta::IsNaN( d = Get( i, j ) ) ) { 01369 if( d < dMin ) 01370 dMin = d; 01371 if( d > dMax ) 01372 dMax = d; } 01373 if( dMax -= dMin ) 01374 for( i = 0; i < GetGenes( ); ++i ) 01375 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01376 Set( i, j, ( ( Get( i, j ) - dMin ) / dMax ) * 2.0 + -1.0 ); } 01377 01378 void CDatImpl::NormalizePCC( ) { 01379 size_t i, j; 01380 vector<float> vecdAves, vecdStds; 01381 vector<size_t> veciCounts; 01382 float d, dOne, dTwo; 01383 01384 vecdAves.resize( GetGenes( ) ); 01385 vecdStds.resize( GetGenes( ) ); 01386 veciCounts.resize( GetGenes( ) ); 01387 for( i = 0; i < GetGenes( ); ++i ) 01388 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01389 if( !CMeta::IsNaN( d = Get( i, j ) ) ) { 01390 veciCounts[i]++; 01391 veciCounts[j]++; 01392 vecdAves[i] += d; 01393 vecdAves[j] += d; 01394 d *= d; 01395 vecdStds[i] += d; 01396 vecdStds[j] += d; } 01397 for( i = 0; i < GetGenes( ); ++i ) { 01398 if( veciCounts[i] ) { 01399 vecdAves[i] /= veciCounts[i]; 01400 vecdStds[i] = sqrt( ( vecdStds[i] / ( max( veciCounts[i], 2 ) - 1 ) ) - 01401 ( vecdAves[i] * vecdAves[i] ) ); } 01402 if( !vecdStds[i] ) 01403 vecdStds[i] = 1; } 01404 for( i = 0; i < GetGenes( ); ++i ) { 01405 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01406 if( !CMeta::IsNaN( d = Get( i, j ) ) ) { 01407 dOne = ( d - vecdAves[i] ) / vecdStds[i]; 01408 dTwo = ( d - vecdAves[j] ) / vecdStds[j]; 01409 Set( i, j, sqrt( ( dOne * dOne ) + ( dTwo * dTwo ) ) ); } } } 01410 01418 void CDat::Invert( ) { 01419 size_t i, j; 01420 float d; 01421 01422 for( i = 0; i < GetGenes( ); ++i ) 01423 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01424 if( !CMeta::IsNaN( d = Get( i, j ) ) ) 01425 Set( i, j, 1 - d ); } 01426 01450 bool CDat::FilterGenes( const char* szGenes, EFilter eFilter, size_t iLimit ) { 01451 CGenome Genome; 01452 CGenes Genes( Genome ); 01453 ifstream ifsm; 01454 if( !szGenes ) 01455 return false; 01456 01457 ifsm.open( szGenes ); 01458 if( !Genes.Open( ifsm ) ) 01459 return false; 01460 FilterGenes( Genes, eFilter, iLimit ); 01461 return true; } 01462 01487 void CDat::FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit, float dEdgeAggressiveness, 01488 bool fAbsolute, const vector<float>* pvecdWeights ) { 01489 size_t i, j; 01490 vector<bool> vecfGenes; 01491 vecfGenes.resize( GetGenes( ) ); 01492 for( i = 0; i < Genes.GetGenes( ); ++i ) 01493 if( ( j = GetGene( Genes.GetGene( i ).GetName( ) ) ) != -1 ) 01494 vecfGenes[ j ] = true; 01495 01496 switch( eFilter ) { 01497 case EFilterPixie: 01498 case EFilterHefalmp: 01499 FilterGenesGraph( Genes, vecfGenes, iLimit, dEdgeAggressiveness, eFilter == EFilterHefalmp, fAbsolute, pvecdWeights ); 01500 return; } 01501 01502 for( i = 0; i < GetGenes( ); ++i ) { 01503 if( ( ( eFilter == EFilterExclude ) && vecfGenes[ i ] ) || 01504 ( ( eFilter == EFilterInclude ) && !vecfGenes[ i ] ) || 01505 ( ( eFilter == EFilterIncludePos ) && !vecfGenes[ i ] ) ) { 01506 for( j = ( i + 1 ); j < GetGenes( ); ++j ) { 01507 if ( Get( i, j ) || eFilter != EFilterIncludePos ) 01508 Set( i, j, CMeta::GetNaN( ) ); 01509 } 01510 continue; } 01511 if( ( eFilter == EFilterEdge ) && vecfGenes[ i ] ) 01512 continue; 01513 if( ( eFilter == EFilterExEdge ) && !vecfGenes[ i ] ) 01514 continue; 01515 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01516 switch( eFilter ) { 01517 case EFilterInclude: 01518 case EFilterEdge: 01519 if( !vecfGenes[ j ] ) 01520 Set( i, j, CMeta::GetNaN( ) ); 01521 break; 01522 case EFilterIncludePos: 01523 if( !vecfGenes[ j ] && Get( i, j ) ) 01524 Set( i, j, CMeta::GetNaN( ) ); 01525 break; 01526 case EFilterTerm: 01527 if( !( vecfGenes[ i ] && vecfGenes[ j ] ) && 01528 ( !( vecfGenes[ i ] || vecfGenes[ j ] ) || Get( i, j ) ) ) 01529 Set( i, j, CMeta::GetNaN( ) ); 01530 break; 01531 case EFilterExEdge: 01532 if( vecfGenes[ j ] ) 01533 Set( i, j, CMeta::GetNaN( ) ); 01534 break; 01535 case EFilterExclude: 01536 if( vecfGenes[ j ] ) 01537 Set( i, j, CMeta::GetNaN( ) ); 01538 break; } } } 01539 01540 struct SPixie { 01541 size_t m_iNode; 01542 float m_dScore; 01543 01544 SPixie( size_t iNode, float dScore ) : m_iNode(iNode), m_dScore(dScore) { } 01545 01546 bool operator<( const SPixie& sPixie ) const { 01547 01548 return ( m_dScore < sPixie.m_dScore ); } 01549 }; 01550 01551 void CDatImpl::FilterGenesGraph( const CGenes& Genes, vector<bool>& vecfGenes, size_t iLimit, 01552 float dEdgeAggressiveness, bool fHefalmp, bool fAbsolute, const vector<float>* pvecdWeights ) { 01553 vector<float> vecdNeighbors, vecdWeights; 01554 size_t i, j, iOne, iTwo, iMinOne, iMinTwo, iN; 01555 vector<size_t> veciGenes, veciFinal, veciDegree; 01556 set<size_t> setiN; 01557 set<size_t>::const_iterator iterN; 01558 float d, dMin, dCutoff; 01559 priority_queue<SPixie> pqueNeighbors; 01560 bool fDone; 01561 double dAve, dDev; 01562 01563 if( iLimit == -1 ) 01564 iLimit = c_iNeighborhood; 01565 veciGenes.resize( Genes.GetGenes( ) ); 01566 for( i = 0; i < veciGenes.size( ); ++i ) 01567 veciGenes[ i ] = GetGene( Genes.GetGene( i ).GetName( ) ); 01568 if( !pvecdWeights || ( pvecdWeights->size( ) < veciGenes.size( ) ) ) { 01569 vecdWeights.resize( veciGenes.size( ) ); 01570 fill( vecdWeights.begin( ), vecdWeights.end( ), 1.0f ); 01571 pvecdWeights = &vecdWeights; } 01572 01573 vecdNeighbors.resize( GetGenes( ) ); 01574 fill( vecdNeighbors.begin( ), vecdNeighbors.end( ), 0.0f ); 01575 if( fHefalmp ) 01576 for( i = 0; i < GetGenes( ); ++i ) { 01577 size_t iIn, iOut; 01578 float dIn, dOut; 01579 01580 if( vecfGenes[ i ] ) 01581 continue; 01582 dIn = dOut = 0; 01583 for( iIn = j = 0; j < veciGenes.size( ); ++j ) { 01584 if( ( iOne = veciGenes[ j ] ) == -1 ) 01585 continue; 01586 if( !CMeta::IsNaN( d = Get( i, iOne ) ) ) { 01587 if( fAbsolute ) 01588 d = fabs( d ); 01589 iIn++; 01590 dIn += d * (*pvecdWeights)[ j ]; } } 01591 for( iOut = j = 0; j < GetGenes( ); ++j ) 01592 if( !CMeta::IsNaN( d = Get( i, j ) ) ) { 01593 if( fAbsolute ) 01594 d = fabs( d ); 01595 iOut++; 01596 dOut += d; } 01597 vecdNeighbors[ i ] = ( iIn && dOut ) ? ( dIn * iOut / iIn / dOut ) : 0; } 01598 else 01599 for( i = 0; i < veciGenes.size( ); ++i ) { 01600 if( ( iOne = veciGenes[ i ] ) == -1 ) 01601 continue; 01602 for( j = 0; j < GetGenes( ); ++j ) { 01603 if( vecfGenes[ j ] ) 01604 continue; 01605 if( !CMeta::IsNaN( d = Get( iOne, j ) ) ) { 01606 if( fAbsolute ) 01607 d = fabs( d ); 01608 vecdNeighbors[ j ] += d * (*pvecdWeights)[ i ]; } } } 01609 for( i = 0; i < vecdNeighbors.size( ); ++i ) 01610 if( ( d = vecdNeighbors[ i ] ) > 0 ) 01611 pqueNeighbors.push( SPixie( i, d ) ); 01612 01613 for( i = 0; i < veciGenes.size( ); ++i ) 01614 if( ( iOne = veciGenes[ i ] ) != -1 ) 01615 veciFinal.push_back( iOne ); 01616 while( !pqueNeighbors.empty( ) && ( setiN.size( ) < iLimit ) ) { 01617 veciFinal.push_back( pqueNeighbors.top( ).m_iNode ); 01618 setiN.insert( pqueNeighbors.top( ).m_iNode ); 01619 pqueNeighbors.pop( ); } 01620 01621 for( iterN = setiN.begin( ); iterN != setiN.end( ); ++iterN ) 01622 vecfGenes[ *iterN ] = true; 01623 for( i = 0; i < GetGenes( ); ++i ) { 01624 if( !vecfGenes[ i ] ) { 01625 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01626 Set( i, j, CMeta::GetNaN( ) ); 01627 continue; } 01628 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01629 if( !vecfGenes[ j ] ) 01630 Set( i, j, CMeta::GetNaN( ) ); } 01631 AveStd( dAve, dDev, iN ); 01632 dCutoff = (float)( dAve + ( dEdgeAggressiveness * dDev ) ); 01633 01634 veciDegree.resize( veciFinal.size( ) ); 01635 for( i = 0; i < veciDegree.size( ); ++i ) { 01636 iOne = veciFinal[ i ]; 01637 for( j = ( i + 1 ); j < veciDegree.size( ); ++j ) { 01638 iTwo = veciFinal[ j ]; 01639 if( !CMeta::IsNaN( Get( iOne, iTwo ) ) ) { 01640 veciDegree[ i ]++; 01641 veciDegree[ j ]++; } } } 01642 for( fDone = CMeta::IsNaN( dEdgeAggressiveness ); !fDone; ) { 01643 fDone = true; 01644 dMin = FLT_MAX; 01645 for( i = 0; i < veciFinal.size( ); ++i ) { 01646 iOne = veciFinal[ i ]; 01647 for( j = ( i + 1 ); j < veciFinal.size( ); ++j ) { 01648 iTwo = veciFinal[ j ]; 01649 if( !CMeta::IsNaN( d = Get( iOne, iTwo ) ) && ( d < dCutoff ) && ( d < dMin ) && 01650 ( veciDegree[ i ] > c_iDegree ) && ( veciDegree[ j ] > c_iDegree ) ) { 01651 fDone = false; 01652 dMin = d; 01653 iMinOne = i; 01654 iMinTwo = j; } } } 01655 if( !fDone ) { 01656 veciDegree[ iMinOne ]--; 01657 veciDegree[ iMinTwo ]--; 01658 Set( veciFinal[ iMinOne ], veciFinal[ iMinTwo ], CMeta::GetNaN( ) ); } } } 01659 01700 void CDat::SaveDOT( std::ostream& ostm, float dCutoff, const CGenome* pGenome, bool fUnlabeled, bool fHashes, 01701 const std::vector<float>* pvecdColors, const std::vector<float>* pvecdBorders ) const { 01702 size_t i, j, iCount; 01703 float d, dAve, dStd; 01704 bool fAll, fLabel; 01705 vector<string> vecstrNames; 01706 vector<bool> vecfGenes; 01707 01708 fAll = CMeta::IsNaN( dCutoff ); 01709 ostm << "graph G {" << endl; 01710 ostm << " pack = \"true\";" << endl; 01711 ostm << " overlap = \"scale\";" << endl; 01712 ostm << " splines = \"true\";" << endl; 01713 01714 if( pvecdColors || pvecdBorders || !( fUnlabeled || fAll ) ) { 01715 vecfGenes.resize( GetGenes( ) ); 01716 for( i = 0; i < vecfGenes.size( ); ++i ) 01717 for( j = ( i + 1 ); j < vecfGenes.size( ); ++j ) 01718 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) 01719 vecfGenes[ i ] = vecfGenes[ j ] = true; } 01720 01721 vecstrNames.resize( GetGenes( ) ); 01722 for( i = 0; i < vecstrNames.size( ); ++i ) { 01723 string strName; 01724 01725 fLabel = !fUnlabeled && ( fAll || vecfGenes[ i ] ); 01726 vecstrNames[ i ] = CMeta::Filename( strName = GetGene( i ) ); 01727 if( !isalpha( vecstrNames[ i ][ 0 ] ) ) 01728 vecstrNames[ i ] = "_" + vecstrNames[ i ]; 01729 if( pGenome && ( ( j = pGenome->GetGene( GetGene( i ) ) ) != -1 ) ) { 01730 const CGene& Gene = pGenome->GetGene( j ); 01731 01732 strName = Gene.GetSynonyms( ) ? Gene.GetSynonym( 0 ) : Gene.GetName( ); 01733 if( fUnlabeled ) { 01734 if( strName != vecstrNames[ i ] ) { 01735 vecstrNames[ i ] += '_'; 01736 vecstrNames[ i ] += Gene.GetSynonym( 0 ); } } } 01737 if( ( pvecdColors || pvecdBorders || fLabel ) && ( fAll || vecfGenes[ i ] ) ) { 01738 ostm << vecstrNames[ i ] << " [shape = \"ellipse\", style = \"filled"; 01739 if( pvecdBorders ) 01740 ostm << ", setlinewidth(" << (*pvecdBorders)[ i ] << ")"; 01741 ostm << "\""; 01742 ostm << ", fillcolor = \"" << ( fHashes ? "#" : "" ) << ( pvecdColors ? CColor::Interpolate( 01743 (*pvecdColors)[ i ], 01744 CColor::c_Cyan, CColor::c_White, CColor::c_Yellow 01745 //CColor::c_White, CColor::c_Cyan, CColor::c_Yellow 01746 ).ToRGB( ) : 01747 "FFFFFF" ) << "\""; 01748 if( fLabel ) 01749 ostm << ", label=\"" << strName << "\""; 01750 ostm << "];" << endl; } } 01751 01752 ostm << endl; 01753 dAve = dStd = 0; 01754 for( iCount = i = 0; i < GetGenes( ); ++i ) 01755 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01756 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) { 01757 iCount++; 01758 dAve += d; 01759 dStd += d * d; } 01760 if( iCount ) { 01761 dAve /= iCount; 01762 dStd = sqrt( max( 0.0f, ( dStd / ( max( iCount, 2 ) - 1 ) ) - ( dAve * dAve ) ) ); } 01763 for( i = 0; i < GetGenes( ); ++i ) 01764 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01765 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) { 01766 d = 1.0f / ( 1 + exp( ( dAve - d ) / dStd ) ); 01767 ostm << vecstrNames[ i ] << " -- " << vecstrNames[ j ] << " [weight = " << d << 01768 ", color = \"" << ( fHashes ? "#" : "" ) << CColor::Interpolate( d, 01769 //CColor::c_Green, CColor::c_Black, CColor::c_Red ).ToRGB( ) 01770 CColor::c_Orange, CColor::c_DarkGreen, CColor::c_Blue ).ToRGB( ) 01771 << "\"];" << endl; } 01772 01773 ostm << "}" << endl; } 01774 01797 void CDat::SaveGDF( std::ostream& ostm, float dCutoff ) const { 01798 size_t i, j; 01799 float d; 01800 bool fAll; 01801 vector<string> vecstrNames; 01802 01803 fAll = CMeta::IsNaN( dCutoff ); 01804 ostm << "nodedef> name" << endl; 01805 01806 vecstrNames.resize( GetGenes( ) ); 01807 for( i = 0; i < vecstrNames.size( ); ++i ) 01808 vecstrNames[ i ] = CMeta::Filename( GetGene( i ) ); 01809 for( i = 0; i < GetGenes( ); ++i ) 01810 ostm << vecstrNames[ i ] << endl; 01811 01812 ostm << endl << "edgedef> node1,node2,weight" << endl; 01813 for( i = 0; i < GetGenes( ); ++i ) 01814 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01815 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) 01816 ostm << vecstrNames[ i ] << "," << vecstrNames[ j ] << "," << ( 10 * Get( i, j ) ) << endl; } 01817 01836 void CDat::SaveNET( std::ostream& ostm, float dCutoff ) const { 01837 size_t i, j; 01838 float d; 01839 bool fAll; 01840 01841 fAll = CMeta::IsNaN( dCutoff ); 01842 ostm << "*Vertices " << GetGenes( ) << endl; 01843 for( i = 0; i < GetGenes( ); ++i ) 01844 ostm << ( i + 1 ) << " \"" << GetGene( i ) << '"' << endl; 01845 01846 ostm << "*Edges" << endl; 01847 for( i = 0; i < GetGenes( ); ++i ) 01848 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01849 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) 01850 ostm << ( i + 1 ) << ' ' << ( j + 1 ) << ' ' << d << endl; } 01851 01873 void CDat::SaveMATISSE( std::ostream& ostm, float dCutoff, const CGenome* pGenome ) const { 01874 size_t i, j, k; 01875 float d; 01876 bool fAll; 01877 01878 fAll = CMeta::IsNaN( dCutoff ); 01879 for( i = 0; i < GetGenes( ); ++i ) { 01880 j = pGenome ? pGenome->GetGene( GetGene( i ) ) : -1; 01881 ostm << i << '\t' << GetGene( i ) << '\t' << ( ( ( j == -1 ) || 01882 !pGenome->GetGene( j ).GetSynonyms( ) ) ? GetGene( i ) : 01883 pGenome->GetGene( j ).GetSynonym( 0 ) ) << '\t' << ( ( j == -1 ) ? "-" : 01884 pGenome->GetGene( j ).GetGloss( ) ) << endl; } 01885 01886 for( k = i = 0; i < GetGenes( ); ++i ) 01887 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01888 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) 01889 ostm << k++ << '\t' << i << '\t' << j << " false " << d << endl; } 01890 01891 struct SSorterRank { 01892 const CDat& m_Dat; 01893 01894 SSorterRank( const CDat& Dat ) : m_Dat(Dat) { } 01895 01896 bool operator()( const pair<size_t, size_t>& prOne, const pair<size_t, size_t>& prTwo ) const { 01897 01898 return ( m_Dat.Get( prOne.first, prOne.second ) < m_Dat.Get( prTwo.first, prTwo.second ) ); } 01899 01900 }; 01901 01909 void CDat::Rank( ) { 01910 vector<pair<size_t, size_t> > vecprData; 01911 size_t i, j, iRank; 01912 float d, dPrev; 01913 01914 for( i = 0; i < GetGenes( ); ++i ) 01915 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01916 if( !CMeta::IsNaN( Get( i, j ) ) ) 01917 vecprData.push_back( pair<size_t, size_t>( i, j ) ); 01918 sort( vecprData.begin( ), vecprData.end( ), SSorterRank( *this ) ); 01919 for( iRank = i = 0; i < vecprData.size( ); ++i ) { 01920 d = Get( vecprData[ i ].first, vecprData[ i ].second ); 01921 if( i && ( d != dPrev ) ) 01922 iRank = i; 01923 dPrev = d; 01924 Set( vecprData[ i ].first, vecprData[ i ].second, (float)iRank ); } } 01925 01926 void CDat::NormalizeQuantiles( size_t iQuantiles ) { 01927 static const size_t c_iTest = 100; 01928 float d, dTest; 01929 size_t i, j, k, iValues, iPrev, iNext; 01930 set<float> setiTest; 01931 bool fDiscrete; 01932 vector<float> vecdValues; 01933 01934 for( iValues = i = 0; i < GetGenes( ); ++i ) 01935 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01936 if( !CMeta::IsNaN( d = Get( i, j ) ) ) 01937 iValues++; 01938 01939 // Heuristic to test for discrete valued dats 01940 dTest = (float)c_iTest / iValues; 01941 for( i = 0; i < GetGenes( ); ++i ) 01942 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01943 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( ( (float)rand( ) / RAND_MAX ) < dTest ) ) 01944 setiTest.insert( d ); 01945 fDiscrete = ( setiTest.size( ) * 2 ) < c_iTest; 01946 01947 if( fDiscrete ) { 01948 map<float, size_t> mapdiValues; 01949 map<float, size_t>::iterator iterValue; 01950 map<float, float> mapddQuantiles; 01951 01952 for( i = 0; i < GetGenes( ); ++i ) 01953 for( j = ( i + 1 ); j < GetGenes( ); ++j ) { 01954 if( CMeta::IsNaN( d = Get( i, j ) ) ) 01955 continue; 01956 if( ( iterValue = mapdiValues.find( d ) ) == mapdiValues.end( ) ) 01957 mapdiValues[d] = 1; 01958 else 01959 iterValue->second++; } 01960 vecdValues.resize( mapdiValues.size( ) ); 01961 for( iterValue = mapdiValues.begin( ),i = 0; iterValue != mapdiValues.end( ); ++iterValue,++i ) 01962 vecdValues[i] = iterValue->first; 01963 sort( vecdValues.begin( ), vecdValues.end( ) ); 01964 01965 for( i = iPrev = 0; i < vecdValues.size( ); ++i,iPrev = iNext ) { 01966 iNext = iPrev + mapdiValues[vecdValues[i]]; 01967 mapddQuantiles[vecdValues[i]] = iQuantiles * ( iPrev + iNext - 1 ) / 2.0f / iValues; } 01968 for( i = 0; i < GetGenes( ); ++i ) 01969 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01970 if( !CMeta::IsNaN( d = Get( i, j ) ) ) 01971 Set( i, j, mapddQuantiles[d] ); } 01972 else { 01973 vector<float> vecdTops; 01974 01975 vecdValues.reserve( min( iValues, iQuantiles * c_iTest ) ); 01976 dTest = (float)vecdValues.capacity( ) / iValues; 01977 for( i = 0; i < GetGenes( ); ++i ) 01978 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 01979 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( ( (float)rand( ) / RAND_MAX ) < dTest ) ) 01980 vecdValues.push_back( d ); 01981 sort( vecdValues.begin( ), vecdValues.end( ) ); 01982 01983 vecdTops.resize( iQuantiles - 1 ); 01984 for( i = 0; i < vecdTops.size( ); ++i ) { 01985 d = (float)( i + 1 ) * ( vecdValues.size( ) - 1 ) / iQuantiles; 01986 j = (size_t)d; 01987 vecdTops[i] = ( ( d - j ) && ( ( j + 1 ) < vecdValues.size( ) ) ) ? ( ( vecdValues[j] + vecdValues[j + 1] ) / 2 ) : 01988 vecdValues[j]; } 01989 01990 for( i = 0; i < GetGenes( ); ++i ) 01991 for( j = ( i + 1 ); j < GetGenes( ); ++j ) { 01992 if( CMeta::IsNaN( d = Get( i, j ) ) ) 01993 continue; 01994 for( k = 0; k < vecdTops.size( ); ++k ) 01995 if( d < vecdTops[k] ) 01996 break; 01997 Set( i, j, (float)k ); } } } 01998 01999 }