Sleipnir
src/dat.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "dat.h"
00024 #include "genome.h"
00025 #include "statistics.h"
00026 #include "annotation.h"
00027 #include "color.h"
00028 #include "meta.h"
00029 
00030 namespace Sleipnir {
00031 
00032 const char      CDatImpl::c_acComment[]     = "#";
00033 const CColor&   CDatImpl::c_ColorMax        = CColor::c_Red;
00034 const CColor&   CDatImpl::c_ColorMin        = CColor::c_Green;
00035 const CColor&   CDatImpl::c_ColorMid        = CColor::c_Black;
00036 
00037 static const struct {
00038     const char*         m_szExtension;
00039     const CDat::EFormat m_eFormat;
00040 } c_asFormats[] = {
00041     {"dat", CDat::EFormatText},
00042     {"das", CDat::EFormatSparse},
00043     {"pcl", CDat::EFormatPCL},
00044     {"qdab",CDat::EFormatQdab},
00045     {NULL,  CDat::EFormatBinary}
00046 };
00047 
00048 size_t CDatImpl::MapGene( TMapStrI& mapGenes, TVecStr& vecGenes, const std::string& strToken ) {
00049     TMapStrI::iterator  iterGenes;
00050     size_t              iRet;
00051 
00052     if( ( iterGenes = mapGenes.find( strToken ) ) == mapGenes.end( ) ) {
00053         iRet = mapGenes.size( );
00054         mapGenes[ strToken ] = iRet;
00055         vecGenes.push_back( strToken ); }
00056     else
00057         iRet = iterGenes->second;
00058 
00059     return iRet; }
00060 
00061 void CDatImpl::ResizeNaN( TAF& vecf, size_t iLast ) {
00062     size_t  i;
00063 
00064     if( ( i = vecf.size( ) ) > iLast )
00065         return;
00066 
00067     vecf.resize( iLast + 1 );
00068     for( ; i < vecf.size( ); ++i )
00069         vecf[ i ] = CMeta::GetNaN( ); }
00070 
00071 void CDatImpl::DabGene( std::istream& istm, char* acBuffer ) {
00072     size_t  i;
00073 
00074     i = 0;
00075     do
00076         istm.seekg( 1, ios_base::cur );
00077     while( acBuffer[ i++ ] = istm.get( ) ); }
00078 
00079 CDatImpl::~CDatImpl( ) {
00080 
00081     Reset( ); }
00082 
00083 void CDatImpl::Reset( ) {
00084 
00085     m_Data.Reset( );
00086     m_vecstrGenes.clear( );
00087 
00088     if( m_pPCL && m_fPCLMemory )
00089         delete m_pPCL;
00090     m_pPCL = NULL;
00091     if( m_pMeasure && m_fMeasureMemory )
00092         delete m_pMeasure;
00093     m_pMeasure = NULL;
00094 
00095     CMeta::Unmap( m_abData, m_hndlData, m_iData );
00096     m_abData = NULL;
00097     m_hndlData = 0;
00098     m_iData = 0;
00099     if( m_aadData )
00100         delete[] m_aadData;
00101     m_aadData = NULL; }
00102 
00103 void CDatImpl::SlimCache( const CSlim& Slim, vector<vector<size_t> >& vecveciGenes ) const {
00104     size_t  iS, iG;
00105 
00106     vecveciGenes.resize( Slim.GetSlims( ) );
00107     for( iS = 0; iS < vecveciGenes.size( ); ++iS ) {
00108         vecveciGenes[ iS ].resize( Slim.GetGenes( iS ) );
00109         for( iG = 0; iG < vecveciGenes[ iS ].size( ); ++iG )
00110             vecveciGenes[ iS ][ iG ] =  GetGene( Slim.GetGene( iS, iG ).GetName( ) ); } }
00111 
00129 bool CDat::Open( const CSlim& Slim ) {
00130     vector<string>          vecstrGenes;
00131     size_t                  iS1, iS2, iG1, iG2, iGene1, iGene2;
00132     vector<vector<size_t> > vecveciGenes;
00133 
00134     Reset( );
00135     Slim.GetGeneNames( vecstrGenes );
00136     if( !Open( vecstrGenes ) )
00137         return false;
00138 
00139     SlimCache( Slim, vecveciGenes );
00140     for( iS1 = 0; iS1 < Slim.GetSlims( ); ++iS1 ) {
00141         g_CatSleipnir( ).info( "CDat::Open( ) processing slim: %s",
00142             Slim.GetSlim( iS1 ).c_str( ) );
00143         for( iG1 = 0; iG1 < Slim.GetGenes( iS1 ); ++iG1 ) {
00144             iGene1 = vecveciGenes[ iS1 ][ iG1 ];
00145             for( iG2 = ( iG1 + 1 ); iG2 < Slim.GetGenes( iS1 ); ++iG2 )
00146                 Set( iGene1, vecveciGenes[ iS1 ][ iG2 ], 1 );
00147             for( iS2 = ( iS1 + 1 ); iS2 < Slim.GetSlims( ); ++iS2 )
00148                 for( iG2 = 0; iG2 < Slim.GetGenes( iS2 ); ++iG2 ) {
00149                     iGene2 = vecveciGenes[ iS2 ][ iG2 ];
00150                     if( CMeta::IsNaN( Get( iGene1, iGene2 ) ) )
00151                         Set( iGene1, iGene2, 0 ); } } }
00152 
00153     return true; }
00154 
00176 bool CDat::Open( const CSlim& SlimPositives, const CSlim& SlimNonnegatives ) {
00177     set<string>             setstrGenes;
00178     vector<string>          vecstrGenes;
00179     size_t                  iS1, iS2, iG1, iG2, iGene1, iGene2;
00180     vector<vector<size_t> > vecveciGenes;
00181 
00182     Reset( );
00183     SlimPositives.GetGeneNames( vecstrGenes );
00184     setstrGenes.insert( vecstrGenes.begin( ), vecstrGenes.end( ) );
00185     vecstrGenes.clear( );
00186     SlimNonnegatives.GetGeneNames( vecstrGenes );
00187     setstrGenes.insert( vecstrGenes.begin( ), vecstrGenes.end( ) );
00188     vecstrGenes.clear( );
00189     vecstrGenes.resize( setstrGenes.size( ) );
00190     copy( setstrGenes.begin( ), setstrGenes.end( ), vecstrGenes.begin( ) );
00191     if( !Open( vecstrGenes ) )
00192         return false;
00193 
00194     SlimCache( SlimPositives, vecveciGenes );
00195     for( iS1 = 0; iS1 < SlimPositives.GetSlims( ); ++iS1 ) {
00196         g_CatSleipnir( ).info( "CDat::Open( ) processing slim: %s",
00197             SlimPositives.GetSlim( iS1 ).c_str( ) );
00198         for( iG1 = 0; iG1 < SlimPositives.GetGenes( iS1 ); ++iG1 ) {
00199             iGene1 = vecveciGenes[ iS1 ][ iG1 ];
00200             for( iG2 = ( iG1 + 1 ); iG2 < SlimPositives.GetGenes( iS1 ); ++iG2 )
00201                 Set( iGene1, vecveciGenes[ iS1 ][ iG2 ], 1 ); } }
00202     vecveciGenes.clear( );
00203     SlimCache( SlimNonnegatives, vecveciGenes );
00204     for( iS1 = 0; iS1 < SlimNonnegatives.GetSlims( ); ++iS1 ) {
00205         g_CatSleipnir( ).info( "CDat::Open( ) processing slim: %s",
00206             SlimNonnegatives.GetSlim( iS1 ).c_str( ) );
00207         for( iG1 = 0; iG1 < SlimNonnegatives.GetGenes( iS1 ); ++iG1 ) {
00208             iGene1 = vecveciGenes[ iS1 ][ iG1 ];
00209             for( iS2 = ( iS1 + 1 ); iS2 < SlimNonnegatives.GetSlims( ); ++iS2 )
00210                 for( iG2 = 0; iG2 < SlimNonnegatives.GetGenes( iS2 ); ++iG2 ) {
00211                     iGene2 = vecveciGenes[ iS2 ][ iG2 ];
00212                     if( CMeta::IsNaN( Get( iGene1, iGene2 ) ) )
00213                         Set( iGene1, iGene2, 0 ); } } }
00214 
00215     return true; }
00216 
00253 bool CDat::Open( const CDat& DatKnown, const vector<CGenes*>& vecpOther, const CGenome& Genome,
00254     bool fKnownNegatives, bool fIncident) {
00255     size_t          i, j, iOne, iTwo;
00256     vector<size_t>  veciGenes;
00257     float           d;
00258 
00259     Reset( );
00260     Open( Genome.GetGeneNames( ) );
00261     for( i = 0; i < vecpOther.size( ); ++i )
00262         OpenHelper( vecpOther[ i ], 1 );
00263     if( !fKnownNegatives )
00264         for( i = 0; i < GetGenes( ); ++i )
00265             for( j = ( i + 1 ); j < GetGenes( ); ++j )
00266                 Set( i, j, CMeta::IsNaN( Get( i, j ) ) ? 0 : CMeta::GetNaN( ) );
00267     
00268     veciGenes.resize( DatKnown.GetGenes( ) );
00269     for( i = 0; i < veciGenes.size( ); ++i )
00270         veciGenes[ i ] = GetGene( DatKnown.GetGene( i ) );
00271     for( i = 0; i < DatKnown.GetGenes( ); ++i ) {
00272         iOne = veciGenes[ i ];
00273         for( j = ( i + 1 ); j < DatKnown.GetGenes( ); ++j ) {
00274             iTwo = veciGenes[ j ];
00275             if( CMeta::IsNaN( d = DatKnown.Get( i, j ) ) ) {
00276                 if( fKnownNegatives && CMeta::IsNaN( Get( iOne, iTwo ) ) )
00277                     Set( iOne, iTwo, 0 ); }
00278             else if( fKnownNegatives == !d )
00279                 Set( iOne, iTwo, d ); } }
00280     
00281 
00282     if( fIncident ) {
00283       vector<float> vecfNegatives;
00284       vecfNegatives.resize( GetGenes( ) );
00285       fill( vecfNegatives.begin( ), vecfNegatives.end( ), false );
00286       
00287       for( i = 0; i < vecfNegatives.size( ); ++i )      
00288         for( j = 0; j < vecpOther.size( ); ++j )          
00289           if( vecpOther[j]->IsGene( GetGene( i ) ) ) {
00290         vecfNegatives[i] = true;        
00291         break; 
00292           }
00293       
00294       for( i = 0; i < vecfNegatives.size( ); ++i )
00295         if( DatKnown.GetGene( GetGene( i ) ) != -1 )
00296           vecfNegatives[i] = true;
00297       
00298       for( i = 0; i < GetGenes( ); ++i ) {
00299         if(  vecfNegatives[i] )
00300           continue;
00301         for( j = ( i + 1 ); j < GetGenes( ); ++j )
00302           if( !( vecfNegatives[j] || Get( i, j ) ) ){
00303         Set( i, j, CMeta::GetNaN( ) ); 
00304         //cerr << "setting to NaN" << endl;
00305           }
00306       } 
00307     }
00308     
00309     return true; }
00310 
00321 bool CDat::Open( const CDat& Dat ) {
00322     size_t  i;
00323 
00324     if( !Open( Dat.GetGeneNames( ) ) )
00325         return false;
00326 
00327     for( i = 0; i < GetGenes( ); ++i )
00328         memcpy( Get( i ), Dat.Get( i ), ( GetGenes( ) - i - 1 ) * sizeof(*Get( i )) );
00329 
00330     return true; }
00331 
00367 bool CDat::Open( const vector<CGenes*>& vecpPositives, const vector<CGenes*>& vecpNonnegatives,
00368     float dPValue, const CGenome& Genome, bool fIncident ) {
00369     size_t          i, j, k, iOne, iTwo, iOverlap;
00370     float           d;
00371     const CGenes*   pBig;
00372     const CGenes*   pSmall;
00373 
00374     Reset( );
00375     Open( Genome.GetGeneNames( ) );
00376     for( i = 0; i < vecpPositives.size( ); ++i )
00377         OpenHelper( vecpPositives[ i ], 1 );
00378     if( vecpNonnegatives.size( ) ) {
00379         for( i = 0; i < vecpNonnegatives.size( ); ++i )
00380             OpenHelper( vecpNonnegatives[ i ], 0 );
00381         if( dPValue > 0 )
00382             for( i = 0; i < vecpPositives.size( ); ++i ) {
00383                 iOne = vecpPositives[ i ]->GetGenes( );
00384                 for( j = ( i + 1 ); j < vecpPositives.size( ); ++j ) {
00385                     iTwo = vecpPositives[ j ]->GetGenes( );
00386                     if( iOne < iTwo ) {
00387                         pSmall = vecpPositives[ i ];
00388                         pBig = vecpPositives[ j ]; }
00389                     else {
00390                         pSmall = vecpPositives[ j ];
00391                         pBig = vecpPositives[ i ]; }
00392                     for( iOverlap = k = 0; k < pSmall->GetGenes( ); ++k )
00393                         if( pBig->IsGene( pSmall->GetGene( k ).GetName( ) ) )
00394                             iOverlap++;
00395                     if( CStatistics::HypergeometricCDF( iOverlap, iOne, iTwo, GetGenes( ) ) < dPValue )
00396                       OpenHelper( pBig, pSmall, 0 ); } }
00397         for( i = 0; i < GetGenes( ); ++i )
00398             for( j = ( i + 1 ); j < GetGenes( ); ++j )
00399                 if( CMeta::IsNaN( d = Get( i, j ) ) )
00400                     Set( i, j, 0 );
00401                 else if( !d )
00402                     Set( i, j, CMeta::GetNaN( ) );
00403         if( fIncident ) {
00404             vector<float>   vecfNegatives;
00405 
00406             vecfNegatives.resize( GetGenes( ) );
00407             fill( vecfNegatives.begin( ), vecfNegatives.end( ), false );
00408             for( i = 0; i < vecfNegatives.size( ); ++i )
00409                 for( j = 0; j < vecpNonnegatives.size( ); ++j )
00410                     if( vecpNonnegatives[j]->IsGene( GetGene( i ) ) ) {
00411                         vecfNegatives[i] = true;
00412                         break; }
00413             for( i = 0; i < GetGenes( ); ++i ) {
00414                 if( vecfNegatives[i] )
00415                     continue;
00416                 for( j = ( i + 1 ); j < GetGenes( ); ++j )
00417                     if( !( vecfNegatives[j] || Get( i, j ) ) )
00418                         Set( i, j, CMeta::GetNaN( ) ); } } }
00419 
00420     return true; }
00421 
00422 void CDatImpl::OpenHelper( const CGenes* pGenes, float dValue ) {
00423     vector<size_t>  veciGenes;
00424     size_t          i, j, iOne, iTwo;
00425 
00426     veciGenes.resize( pGenes->GetGenes( ) );
00427     for( i = 0; i < veciGenes.size( ); ++i )
00428         veciGenes[ i ] = GetGene( pGenes->GetGene( i ).GetName( ) );
00429     for( i = 0; i < veciGenes.size( ); ++i ) {
00430         iOne = veciGenes[ i ];
00431         for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) {
00432             iTwo = veciGenes[ j ];
00433             if( CMeta::IsNaN( Get( iOne, iTwo ) ) )
00434                 Set( iOne, iTwo, dValue ); } } }
00435 
00436 void CDatImpl::OpenHelper( const CGenes* pOne, const CGenes* pTwo, float dValue ) {
00437     vector<size_t>  veciOne, veciTwo;
00438     size_t          i, j, iOne, iTwo;
00439 
00440     veciOne.resize( pOne->GetGenes( ) );
00441     for( i = 0; i < veciOne.size( ); ++i )
00442         veciOne[ i ] = GetGene( pOne->GetGene( i ).GetName( ) );
00443     veciTwo.resize( pTwo->GetGenes( ) );
00444     for( i = 0; i < veciTwo.size( ); ++i )
00445         veciTwo[ i ] = GetGene( pTwo->GetGene( i ).GetName( ) );
00446     for( i = 0; i < veciOne.size( ); ++i ) {
00447         iOne = veciOne[ i ];
00448         for( j = 0; j < veciTwo.size( ); ++j ) {
00449             iTwo = veciTwo[ j ];
00450             if( CMeta::IsNaN( Get( iOne, iTwo ) ) )
00451                 Set( iOne, iTwo, dValue ); } } }
00452 
00492 bool CDat::Open( std::istream& istm, EFormat eFormat, float dDefault, bool fDuplicates, size_t iSkip,
00493     bool fZScore, bool fSeek ) {
00494 
00495     switch( eFormat ) {
00496         case EFormatText:
00497             return OpenText( istm, dDefault, fDuplicates );
00498 
00499         case EFormatPCL:
00500             return OpenPCL( istm, iSkip, fZScore );
00501 
00502         case EFormatSparse:
00503             return OpenSparse( istm ); 
00504     
00505         case EFormatQdab:
00506             return OpenQdab( istm );               
00507 
00508     }
00509 
00510     if(fSeek){
00511         m_fSeek = true;
00512     }
00513 
00514     return OpenBinary( istm, fSeek );
00515 }
00516 
00517 bool CDatImpl::OpenPCL( std::istream& istm, size_t iSkip, bool fZScore ) {
00518 
00519     Reset( );
00520     m_pPCL = new CPCL( );
00521     if( !m_pPCL->Open( istm, iSkip ) )
00522         return false;
00523 
00524     m_pMeasure = new CMeasurePearNorm( );
00525     m_fMeasureMemory = true;
00526     if( fZScore ) {
00527         size_t  iN;
00528         double  dAve, dStd;
00529 
00530         AveStd( dAve, dStd, iN );
00531         delete m_pMeasure;
00532         m_pMeasure = new CMeasurePearNorm( dAve, dStd ); }
00533 
00534     return true; }
00535 
00562 bool CDat::Open( const CPCL& PCL, const IMeasure* pMeasure, bool fMeasureMemory ) {
00563 
00564     Reset( );
00565     m_pPCL = (CPCL*)&PCL;
00566     m_fPCLMemory = false;
00567     m_pMeasure = pMeasure;
00568     m_fMeasureMemory = fMeasureMemory;
00569 
00570     return true; }
00571 
00572 bool CDatImpl::OpenText( std::istream& istm, float dDefault, bool fDuplicates ) {
00573     const char* pc;
00574     char*       pcTail;
00575     char*       acBuf;
00576     string      strToken, strCache, strValue;
00577     TMapStrI    mapGenes;
00578     size_t      iOne, iTwo, i;
00579     float       dScore;
00580     TAAF        vecvecfScores;
00581 
00582     Reset( );
00583     acBuf = new char[ c_iBufferSize ];
00584     while( istm.peek( ) != EOF ) {
00585         istm.getline( acBuf, c_iBufferSize - 1 );
00586         strToken = OpenToken( acBuf, &pc );
00587         if( !strToken.length( ) )
00588             break;
00589         if( strToken == c_acComment )
00590             continue;
00591         if( strToken != strCache ) {
00592             strCache = strToken;
00593             iOne = MapGene( mapGenes, m_vecstrGenes, strToken ); }
00594 
00595         strToken = OpenToken( pc, &pc );
00596         if( !strToken.length( ) ) {
00597             Reset( );
00598             delete[] acBuf;
00599             return false; }
00600         iTwo = MapGene( mapGenes, m_vecstrGenes, strToken );
00601         strValue = OpenToken( pc );
00602         if( !strValue.length( ) ) {
00603             if( CMeta::IsNaN( dScore = dDefault ) ) {
00604                 Reset( );
00605                 delete[] acBuf;
00606                 return false; } }
00607         else if( !( dScore = (float)strtod( strValue.c_str( ), &pcTail ) ) &&
00608             ( pcTail != ( strValue.c_str( ) + strValue.length( ) ) ) ) {
00609             Reset( );
00610             delete[] acBuf;
00611             return false; }
00612 
00613         i = ( ( iOne > iTwo ) ? iOne : iTwo );
00614         if( vecvecfScores.size( ) <= i )
00615             vecvecfScores.resize( i + 1 );
00616         ResizeNaN( vecvecfScores[ iOne ], i );
00617         ResizeNaN( vecvecfScores[ iTwo ], i );
00618         if( !CMeta::IsNaN( vecvecfScores[ iOne ][ iTwo ] ) && ( vecvecfScores[ iOne ][ iTwo ] != dScore ) ) {
00619             g_CatSleipnir( ).error( "CDatImpl::OpenText( ) duplicate genes %s, %s (%g:%g)",
00620                 strCache.c_str( ), strToken.c_str( ), vecvecfScores[ iOne ][ iTwo ],
00621                 dScore );
00622             if( !fDuplicates ) {
00623                 Reset( );
00624                 delete[] acBuf;
00625                 return false; } }
00626         vecvecfScores[ iOne ][ iTwo ] = vecvecfScores[ iTwo ][ iOne ] = dScore; }
00627     delete[] acBuf;
00628 
00629     m_Data.Initialize( GetGenes( ) );
00630     for( iOne = 0; iOne < GetGenes( ); ++iOne )
00631         for( iTwo = ( iOne + 1 ); iTwo < GetGenes( ); ++iTwo )
00632             Set( iOne, iTwo, ( ( iTwo < vecvecfScores[ iOne ].size( ) ) ?
00633                 vecvecfScores[ iOne ][ iTwo ] : CMeta::GetNaN( ) ) );
00634 
00635     return true; }
00636 
00637 bool CDatImpl::OpenBinary( std::istream& istm, bool fSeek ) {
00638     size_t  i;
00639     float*  adScores;
00640 
00641     if(fSeek){
00642         if(!OpenHeader(istm)){
00643             cerr << "Error opening header" << endl;
00644             return false;
00645         }
00646         return true;
00647     }
00648 
00649     if( !OpenGenes( istm, true, false ) )
00650         return false;
00651 
00652     fprintf(stderr, "Reading genes\n");
00653     m_Data.Initialize( GetGenes( ) );
00654     adScores = new float[ GetGenes( ) - 1 ];
00655     for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) {
00656         istm.read( (char*)adScores, sizeof(*adScores) * ( GetGenes( ) - i - 1 ) );
00657         Set( i, adScores ); }
00658     delete[] adScores;
00659 
00660     return true; }
00661 
00662 /* still to be tested
00663  * used for BINARY mode, and DAB file only, ie Float matrix
00664  */
00665 bool CDatImpl::OpenHeader(std::istream& istm){
00666     if(!m_fSeek){
00667         cerr << "Don't know how you got here" << endl;
00668     }
00669 
00670     if(!OpenGenes(istm, true, false)){
00671         return false;
00672     }
00673     EstimateSeekPositions(istm);
00674     return true;
00675 }
00676 
00677 /* still to be tested
00678  * used for BINARY mode, and DAB file only, ie Float matrix
00679  */
00680 float* CDatImpl::GetRowSeek(std::istream& istm, const size_t &ind) const {
00681     if(!m_fSeek){
00682         cerr << "Don't know how you got here" << endl;
00683     }
00684 
00685     size_t iRow, iColumn;
00686     size_t i, iNumGenes;
00687     iNumGenes = GetGenes();
00688     float* adScores = (float*)malloc(iNumGenes * sizeof(float));
00689 
00690     size_t j;
00691     for(i=0; i<ind; i++){
00692         iRow = i;
00693         iColumn = ind - 1;
00694         size_t offset1 = m_iHeader + m_veciSeekPos[iRow] + iColumn * sizeof(float);
00695         istm.seekg(offset1, ios_base::beg);
00696         float v;
00697         char *p = (char*) &v;
00698         istm.read(p, sizeof(float));
00699         adScores[i] = v;
00700     }
00701 
00702     adScores[ind] = CMeta::GetNaN();
00703 
00704     int iSize = iNumGenes - (ind+1);
00705     if(iSize==0){
00706         return adScores;
00707     }
00708 
00709     float *v = (float*)malloc(iSize*sizeof(float));
00710     char *p = (char*) v;
00711     istm.seekg(m_iHeader+m_veciSeekPos[ind], ios_base::beg);
00712     istm.read(p, iSize*sizeof(float));
00713     for(i=0; i<iSize; i++){
00714         adScores[i+ind+1] = v[i];
00715     }
00716     free(v);
00717 
00718     return adScores;
00719 }
00720 
00721 /* still to be tested
00722  * used for BINARY mode, and DAB file only, ie Float matrix
00723  */
00724 float* CDatImpl::GetRowSeek(std::istream& istm, const std::string &strGene) const{
00725     if(!m_fSeek){
00726         cerr << "Don't know how you got here" << endl;
00727     }
00728     size_t i;
00729     size_t iNumGenes = GetGenes();
00730     size_t ind;
00731 
00732     if( (ind = GetGeneIndex(strGene) ) ==-1){
00733         return NULL; //missing gene
00734     }
00735 
00736     return GetRowSeek(istm, ind);
00737 }
00738 
00739 bool CDatImpl::OpenQdab( std::istream& istm ) {
00740   size_t    iTotal, i, j, num_bins, num_bits, iPos;
00741     float*  adScores;
00742     unsigned char tmp;
00743     float* bounds;
00744     unsigned char btmpf;
00745     unsigned char btmpb;
00746     
00747     unsigned char bufferA;
00748     unsigned char bufferB;
00749     
00750     float nan_val;
00751     
00752     if( !OpenGenes( istm, true, false ) )
00753         return false;
00754     m_Data.Initialize( GetGenes( ) );
00755     
00756     // read the number of bins 
00757     istm.read((char*)&tmp, sizeof(char));       
00758     num_bins = (size_t)tmp;
00759 
00760     //read the bin boundaries
00761     bounds = new float[num_bins];
00762     istm.read((char*)bounds, sizeof(float) * num_bins);
00763     
00764     // number of bits required for each bin representation
00765     num_bits = (size_t)ceil(log( num_bins ) / log ( 2.0 )); 
00766     
00767     // add one more bit for NaN case
00768     if( pow(2, num_bits) == num_bins )
00769       ++num_bits;
00770     
00771     // set nan value
00772     nan_val = pow(2, num_bits) -1;
00773     
00774     // read the data    
00775     adScores = new float[ GetGenes( ) - 1 ];
00776     
00777     istm.read( (char*)&bufferA, sizeof(bufferA));
00778     istm.read( (char*)&bufferB, sizeof(bufferB));
00779     
00780     for( iTotal = i = 0; ( i + 1 ) < GetGenes( ); ++i ) {
00781         for(j = 0; j < ( GetGenes( ) - i - 1 ); ++j){
00782           iPos = (iTotal * num_bits) % 8;
00783           
00784           // check bit data overflow??
00785           if( iPos + num_bits > 8){
00786             btmpb = (bufferA << iPos);
00787             btmpf = (bufferB >> (16 - num_bits - iPos)) << (8-num_bits);            
00788             adScores[j] = ((btmpb | btmpf) >> (8 - num_bits));          
00789             ++iTotal;
00790             bufferA = bufferB;
00791             istm.read( (char*)&bufferB, sizeof(bufferB));
00792           }
00793           else{
00794             adScores[j] = (((bufferA << iPos) & 0xFF) >> (8 - num_bits));
00795             ++iTotal;
00796             if( iPos + num_bits == 8 ) {
00797                 bufferA = bufferB;
00798                             istm.read( (char*)&bufferB, sizeof(bufferB));
00799             }
00800           }
00801 
00802           // check if value added was promised 2^#bits -1 (NaN value)
00803           if(adScores[j] == nan_val)
00804             adScores[j] =  CMeta::GetNaN( );
00805         }
00806         
00807         Set( i, adScores ); 
00808     }
00809     
00810     delete[] adScores;
00811     delete[] bounds;
00812     return true; }
00813 
00814 
00815 bool CDatImpl::OpenSparse( std::istream& istm ) {
00816     size_t      i;
00817     uint32_t    j;
00818     float       d;
00819 
00820     if( !OpenGenes( istm, true, false ) )
00821         return false;
00822     m_Data.Initialize( GetGenes( ) );
00823     for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) {
00824         for( j = ( i + 1 ); j < GetGenes( ); ++j )
00825             Set( i, j, CMeta::GetNaN( ) );
00826         while( true ) {
00827             istm.read( (char*)&j, sizeof(j) );
00828             if( j == -1 )
00829                 break;
00830             istm.read( (char*)&d, sizeof(d) );
00831             Set( i, j, d ); } }
00832 
00833     return true; }
00834 
00857 bool CDat::OpenGenes( const char* szFile, size_t iSkip ) {
00858     ifstream    ifsm;
00859     bool        fBinary;
00860     size_t      i;
00861     EFormat     eFormat;
00862 
00863     for( i = 0; c_asFormats[ i ].m_szExtension; ++i )
00864         if( !strcmp( szFile + strlen( szFile ) - strlen( c_asFormats[ i ].m_szExtension ),
00865             c_asFormats[ i ].m_szExtension ) )
00866             break;
00867     eFormat = c_asFormats[ i ].m_eFormat;
00868     fBinary = ( eFormat != EFormatText ) && ( eFormat != EFormatPCL );
00869     ifsm.open( szFile, fBinary ? ios_base::binary : ios_base::in );
00870 
00871     return OpenGenes( ifsm, fBinary, ( eFormat == EFormatPCL ) ); }
00872 
00899 bool CDat::OpenGenes( std::istream& istm, bool fBinary, bool fPCL ) {
00900 
00901     return CDatImpl::OpenGenes( istm, fBinary, fPCL ); }
00902 
00903 bool CDatImpl::OpenGenes( std::istream& istm, bool fBinary, bool fPCL ) {
00904     size_t      i, iToken;
00905     uint32_t    iCount;
00906     string      strToken, strCache;
00907     float       d;
00908     const char* pc;
00909     char*       acBuf;
00910 
00911     Reset( );
00912     if( fPCL ) {
00913         m_pPCL = new CPCL( );
00914         if( m_pPCL->Open( istm ) ) {
00915             m_pMeasure = (IMeasure*)1;
00916             m_fMeasureMemory = false;
00917             return true; }
00918         return false; }
00919     acBuf = new char[ c_iBufferSize ];
00920     if( fBinary ) {
00921         istm.read( (char*)&iCount, sizeof(iCount) );
00922         if( iCount > c_iGeneLimit ) {
00923             delete[] acBuf;
00924             return false; }
00925         m_vecstrGenes.resize( iCount );
00926         for( i = 0; i < iCount; ++i ) {
00927             DabGene( istm, acBuf );
00928             m_vecstrGenes[ i ] = acBuf;
00929             m_mapstrGenes[ acBuf ] = i; } }
00930     else {
00931         set<string>                 setstrGenes;
00932         set<string>::const_iterator iterGenes;
00933 
00934         while( istm.peek( ) != EOF ) {
00935             istm.getline( acBuf, c_iBufferSize - 1 );
00936             for( iToken = 0; iToken < 3; ++iToken ) {
00937                 strToken = OpenToken( acBuf, &pc );
00938                 if( !strToken.length( ) )
00939                     break;
00940                 if( strToken != strCache ) {
00941                     strCache = strToken;
00942                     setstrGenes.insert( strToken ); }
00943 
00944                 strToken = OpenToken( pc, &pc );
00945                 setstrGenes.insert( strToken );
00946                 d = (float)strtod( ( strToken = OpenToken( pc ) ).c_str( ), (char**)&pc );
00947                 if( !d && ( ( pc - strToken.c_str( ) ) != strToken.length( ) ) ) {
00948                     delete[] acBuf;
00949                     return false; } } }
00950         m_vecstrGenes.reserve( setstrGenes.size( ) );
00951         for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes )
00952             m_vecstrGenes.push_back( *iterGenes ); }
00953     delete[] acBuf;
00954 
00955     return true; }
00956 
00974 void CDat::Save( const char* szFile ) const {
00975     size_t      i;
00976     EFormat     eFormat;
00977     ofstream    ofsm;
00978 
00979     if( !szFile ) {
00980         Save( cout, EFormatText );
00981         cout.flush( );
00982         return; }
00983 
00984     for( i = 0; c_asFormats[ i ].m_szExtension; ++i )
00985         if( !strcmp( szFile + strlen( szFile ) - strlen( c_asFormats[ i ].m_szExtension ),
00986             c_asFormats[ i ].m_szExtension ) )
00987             break;
00988     eFormat = c_asFormats[ i ].m_eFormat;
00989     ofsm.open( szFile, ( ( eFormat == EFormatText ) || ( eFormat == EFormatPCL ) ) ? ios_base::out :
00990         ios_base::binary );
00991     Save( ofsm, eFormat ); }
00992 
01010 void CDat::Save( std::ostream& ostm, EFormat eFormat ) const {
01011 
01012     switch( eFormat ) {
01013         case EFormatText:
01014             SaveText( ostm );
01015             return;
01016 
01017         case EFormatSparse:
01018             SaveSparse( ostm );
01019             return; }
01020 
01021     SaveBinary( ostm ); }
01022 
01023 void CDatImpl::SaveText( std::ostream& ostm ) const {
01024     size_t  i, j;
01025     float   d;
01026 
01027     for( i = 0; i < GetGenes( ); ++i )
01028         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01029             if( !CMeta::IsNaN( d = Get( i, j ) ) )
01030                 ostm << GetGene( i ) << '\t' << GetGene( j ) << '\t' << d << endl; }
01031 
01032 void CDatImpl::SaveBinary( std::ostream& ostm ) const {
01033     size_t          i, j;
01034     const float*    pd;
01035     float           d;
01036 
01037     SaveGenes( ostm );
01038     if( m_pMeasure ) {
01039         for( i = 0; i < GetGenes( ); ++i )
01040             for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
01041                 d = Get( i, j );
01042                 ostm.write( (char*)&d, sizeof(d) ); } }
01043     else
01044         for( i = 0; ( i + 1 ) < GetGenes( ); ++i ) {
01045             pd = m_Data.Get( i );
01046             ostm.write( (char*)pd, sizeof(*pd) * ( GetGenes( ) - i - 1 ) ); } }
01047 
01048 void CDatImpl::SaveSparse( std::ostream& ostm ) const {
01049     uint32_t    i, j;
01050     float       d;
01051 
01052     SaveGenes( ostm );
01053     for( i = 0; i < GetGenes( ); ++i ) {
01054         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01055             if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
01056                 ostm.write( (char*)&j, sizeof(j) );
01057                 ostm.write( (char*)&d, sizeof(d) ); }
01058         j = -1;
01059         ostm.write( (char*)&j, sizeof(j) ); } }
01060 
01061 void CDatImpl::SaveGenes( std::ostream& ostm ) const {
01062     size_t      i, j;
01063     uint32_t    iSize;
01064     string      strGene;
01065 
01066     iSize = GetGenes( );
01067     ostm.write( (char*)&iSize, sizeof(iSize) );
01068     for( i = 0; i < iSize; ++i ) {
01069         strGene = GetGene( i );
01070         for( j = 0; j < strGene.length( ); ++j ) {
01071             ostm.put( 0 );
01072             ostm.put( strGene[ j ] ); }
01073         ostm.put( 0 );
01074         ostm.put( 0 ); } }
01075 
01095 bool CDat::Open( const std::vector<std::string>& vecstrGenes, bool fClear, const char* szFile ) {
01096     size_t          i, j, iSize;
01097     unsigned char*  pb;
01098 
01099     Reset( );
01100     m_vecstrGenes.resize( vecstrGenes.size( ) );
01101     copy( vecstrGenes.begin( ), vecstrGenes.end( ), m_vecstrGenes.begin( ) );
01102 
01103     m_mapstrGenes.clear();
01104     for( i = 0; i < vecstrGenes.size(); ++i )
01105         m_mapstrGenes[vecstrGenes[i]] = i; 
01106 
01107     if( szFile ) {
01108         iSize = sizeof(uint32_t);
01109         for( i = 0; i < GetGenes( ); ++i )
01110             iSize += 2 * ( GetGene( i ).length( ) + 1 );
01111         iSize += CDistanceMatrix::GetSpace( GetGenes( ) );
01112         if( !CMeta::MapWrite( m_abData, m_hndlData, iSize, szFile ) )
01113             return false;
01114         *(uint32_t*)( pb = m_abData ) = GetGenes( );
01115         pb += sizeof(uint32_t);
01116         for( i = 0; i < GetGenes( ); ++i ) {
01117             const string&   strGene = GetGene( i );
01118 
01119             for( j = 0; j < strGene.length( ); ++j ) {
01120                 *pb++ = 0;
01121                 *pb++ = strGene[ j ]; }
01122             *pb++ = 0;
01123             *pb++ = 0; }
01124 
01125         if( !OpenMemmap( pb ) )
01126             return false; }
01127     else
01128         m_Data.Initialize( GetGenes( ) );
01129 
01130     if( fClear )
01131         for( i = 0; i < m_vecstrGenes.size( ); ++i )
01132             for( j = ( i + 1 ); j < m_vecstrGenes.size( ); ++j )
01133                 Set( i, j, CMeta::GetNaN( ) );
01134 
01135     return true; }
01136 
01157 bool CDat::Open( const std::vector<std::string>& vecstrGenes, const CDistanceMatrix& MatScores ) {
01158     size_t  i;
01159 
01160     if( vecstrGenes.size( ) != MatScores.GetSize( ) )
01161         return false;
01162 
01163     Reset( );
01164     m_vecstrGenes.reserve( vecstrGenes.size( ) );
01165     for( i = 0; i < vecstrGenes.size( ); ++i )
01166         m_vecstrGenes.push_back( vecstrGenes[ i ] );
01167 
01168     for( i = 0; i < vecstrGenes.size(); ++i )
01169         m_mapstrGenes[vecstrGenes[i]] = i; 
01170 
01171     m_Data.Initialize( MatScores );
01172 
01173     return true; }
01174 
01175 size_t CDatImpl::GetGene( const std::string& strGene ) const {
01176     size_t  i;
01177 
01178     if( m_pMeasure )
01179         return m_pPCL->GetGene( strGene );
01180 
01181     for( i = 0; i < GetGenes( ); ++i )
01182         if( m_vecstrGenes[ i ] == strGene )
01183             return i;
01184 
01185     return -1; }
01186 
01220 bool CDat::Open( const char* szFile, bool fMemmap, size_t iSkip, bool fZScore, bool fDuplicates, bool fSeek ) {
01221     EFormat     eFormat;
01222     size_t      i;
01223 
01224     if( !szFile )
01225         return Open( cin, EFormatText, (float)HUGE_VAL, fDuplicates );
01226 
01227     for( i = 0; c_asFormats[ i ].m_szExtension; ++i )
01228         if( !strcmp( szFile + strlen( szFile ) - strlen( c_asFormats[ i ].m_szExtension ),
01229             c_asFormats[ i ].m_szExtension ) )
01230             break;
01231     eFormat = c_asFormats[ i ].m_eFormat;
01232 
01233     if( fMemmap && ( eFormat == EFormatBinary ) ) {
01234         Reset( );
01235         if( !CMeta::MapRead( m_abData, m_hndlData, m_iData, szFile ) ) {
01236             g_CatSleipnir( ).error( "CDat::Open( %s, %d ) failed memory mapping", szFile, fMemmap );
01237             return false; }
01238         return OpenHelper( ); }
01239 
01240     if(m_ifsm.is_open()){
01241         m_ifsm.close();
01242     }
01243 
01244     m_ifsm.open( szFile, ( ( eFormat == EFormatText ) || ( eFormat == EFormatPCL ) ) ? ios_base::in :
01245         ios_base::binary );
01246     if( !m_ifsm.is_open( ) )
01247         return false;
01248 
01249     if(fSeek){
01250         m_fSeek = true;
01251     }
01252 
01253     return Open( m_ifsm, eFormat, (float)HUGE_VAL, fDuplicates, iSkip, fZScore, fSeek ); }
01254 
01255 bool CDatImpl::OpenHelper( ) {
01256     unsigned char*  pb;
01257     size_t          i;
01258 
01259     m_vecstrGenes.resize( *(uint32_t*)( pb = m_abData ) );
01260     pb += sizeof(uint32_t);
01261     for( i = 0; i < GetGenes( ); ++i ) {
01262         string& strGene = m_vecstrGenes[ i ];
01263 
01264         while( *++pb )
01265             strGene += *pb++;
01266         pb++; }
01267 
01268     return OpenMemmap( pb ); }
01269 
01270 bool CDatImpl::OpenMemmap( const unsigned char* pb ) {
01271     size_t  i;
01272 
01273     m_aadData = new float*[ GetGenes( ) - 1 ];
01274     m_aadData[ 0 ] = (float*)pb;
01275     for( i = 1; ( i + 1 ) < m_vecstrGenes.size( ); ++i )
01276         m_aadData[ i ] = m_aadData[ i - 1 ] + GetGenes( ) - i;
01277     m_Data.Initialize( GetGenes( ), (float**)m_aadData );
01278 
01279     return true; }
01280 
01281 void CDatImpl::AveStd( double& dAverage, double& dStdev, size_t& iN, size_t iApproximate ) const {
01282     size_t  i, j;
01283     float   d;
01284 
01285     dAverage = dStdev = 0;
01286     if( iApproximate == -1 ) {
01287         for( iN = i = 0; i < GetGenes( ); ++i )
01288             for( j = ( i + 1 ); j < GetGenes( ); ++j )
01289                 if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
01290                     iN++;
01291                     dAverage += d;
01292                     dStdev += d * d; } }
01293     else {
01294         size_t  iOne, iTwo;
01295 
01296         for( i = 0; i < iApproximate; ++i ) {
01297             iOne = rand( ) % GetGenes( );
01298             if( ( ( iTwo = rand( ) % GetGenes( ) ) == iOne ) ||
01299                 CMeta::IsNaN( d = Get( iOne, iTwo ) ) ) {
01300                 i--;
01301                 continue; }
01302             dAverage += d;
01303             dStdev += d * d; }
01304         iN = i; }
01305     if( iN ) {
01306         dAverage /= iN;
01307         dStdev = sqrt( ( dStdev / ( iN - ( ( iN > 1 ) ? 1 : 0 ) ) ) - ( dAverage * dAverage ) ); 
01308     } 
01309 
01310 }
01311 
01312 void CDatImpl::NormalizeStdev( ) {
01313     double  d, dAve, dDev;
01314     size_t  i, j, iN;
01315 
01316     AveStd( dAve, dDev, iN );
01317     if( !( iN && dDev ) )
01318         return;
01319     for( i = 0; i < GetGenes( ); ++i )
01320         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01321             if( !CMeta::IsNaN( (float)( d = Get( i, j ) ) ) )
01322                 Set( i, j, (float)( ( d - dAve ) / dDev ) ); }
01323 
01324 void CDatImpl::NormalizeSigmoid( ) {
01325     size_t  i, j;
01326     float   d;
01327 
01328     for( i = 0; i < GetGenes( ); ++i )
01329         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01330             if( !CMeta::IsNaN( d = Get( i, j ) ) )
01331                 Set( i, j, 1.0f / ( 1 + exp( -d ) ) ); }
01332 
01333 void CDatImpl::NormalizeNormCDF( ) {
01334     size_t  i, j, iN;
01335     float   d;
01336     double  dAve, dStd;
01337 
01338     AveStd( dAve, dStd, iN );
01339     for( i = 0; i < GetGenes( ); ++i )
01340         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01341             if( !CMeta::IsNaN( d = Get( i, j ) ) )
01342                 Set( i, j, (float)CStatistics::NormalCDF( d, dAve, dStd ) ); }
01343 
01344 void CDatImpl::NormalizeMinmax( ) {
01345     float   d, dMin, dMax;
01346     size_t  i, j;
01347 
01348     dMax = -( dMin = FLT_MAX );
01349     for( i = 0; i < GetGenes( ); ++i )
01350         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01351             if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
01352                 if( d < dMin )
01353                     dMin = d;
01354                 if( d > dMax )
01355                     dMax = d; }
01356     if( dMax -= dMin )
01357         for( i = 0; i < GetGenes( ); ++i )
01358             for( j = ( i + 1 ); j < GetGenes( ); ++j )
01359                 Set( i, j, ( Get( i, j ) - dMin ) / dMax ); }
01360 
01361 void CDatImpl::NormalizeMinmaxNPone( ) {
01362     float   d, dMin, dMax;
01363     size_t  i, j;
01364 
01365     dMax = -( dMin = FLT_MAX );
01366     for( i = 0; i < GetGenes( ); ++i )
01367         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01368             if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
01369                 if( d < dMin )
01370                     dMin = d;
01371                 if( d > dMax )
01372                     dMax = d; }
01373     if( dMax -= dMin )
01374         for( i = 0; i < GetGenes( ); ++i )
01375             for( j = ( i + 1 ); j < GetGenes( ); ++j )
01376               Set( i, j, (  ( Get( i, j ) - dMin ) / dMax ) * 2.0 + -1.0 ); }
01377 
01378 void CDatImpl::NormalizePCC( ) {
01379     size_t          i, j;
01380     vector<float>   vecdAves, vecdStds;
01381     vector<size_t>  veciCounts;
01382     float           d, dOne, dTwo;
01383 
01384     vecdAves.resize( GetGenes( ) );
01385     vecdStds.resize( GetGenes( ) );
01386     veciCounts.resize( GetGenes( ) );
01387     for( i = 0; i < GetGenes( ); ++i )
01388         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01389             if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
01390                 veciCounts[i]++;
01391                 veciCounts[j]++;
01392                 vecdAves[i] += d;
01393                 vecdAves[j] += d;
01394                 d *= d;
01395                 vecdStds[i] += d;
01396                 vecdStds[j] += d; }
01397     for( i = 0; i < GetGenes( ); ++i ) {
01398         if( veciCounts[i] ) {
01399             vecdAves[i] /= veciCounts[i];
01400             vecdStds[i] = sqrt( ( vecdStds[i] / ( max( veciCounts[i], 2 ) - 1 ) ) -
01401                 ( vecdAves[i] * vecdAves[i] ) ); }
01402         if( !vecdStds[i] )
01403             vecdStds[i] = 1; }
01404     for( i = 0; i < GetGenes( ); ++i ) {
01405         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01406             if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
01407                 dOne = ( d - vecdAves[i] ) / vecdStds[i];
01408                 dTwo = ( d - vecdAves[j] ) / vecdStds[j];
01409                 Set( i, j, sqrt( ( dOne * dOne ) + ( dTwo * dTwo ) ) ); } } }
01410 
01418 void CDat::Invert( ) {
01419     size_t  i, j;
01420     float   d;
01421 
01422     for( i = 0; i < GetGenes( ); ++i )
01423         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01424             if( !CMeta::IsNaN( d = Get( i, j ) ) )
01425                 Set( i, j, 1 - d ); }
01426 
01450 bool CDat::FilterGenes( const char* szGenes, EFilter eFilter, size_t iLimit ) {
01451     CGenome     Genome;
01452     CGenes      Genes( Genome );
01453     ifstream    ifsm;
01454     if( !szGenes )
01455         return false;
01456 
01457     ifsm.open( szGenes );
01458     if( !Genes.Open( ifsm ) )
01459         return false;
01460     FilterGenes( Genes, eFilter, iLimit );
01461     return true; }
01462 
01487 void CDat::FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit, float dEdgeAggressiveness,
01488     bool fAbsolute, const vector<float>* pvecdWeights ) {
01489     size_t          i, j;
01490     vector<bool>    vecfGenes;
01491     vecfGenes.resize( GetGenes( ) );
01492     for( i = 0; i < Genes.GetGenes( ); ++i )
01493         if( ( j = GetGene( Genes.GetGene( i ).GetName( ) ) ) != -1 )
01494             vecfGenes[ j ] = true;
01495 
01496     switch( eFilter ) {
01497         case EFilterPixie:
01498         case EFilterHefalmp:
01499             FilterGenesGraph( Genes, vecfGenes, iLimit, dEdgeAggressiveness, eFilter == EFilterHefalmp, fAbsolute, pvecdWeights );
01500             return; }
01501 
01502     for( i = 0; i < GetGenes( ); ++i ) {
01503         if( ( ( eFilter == EFilterExclude ) && vecfGenes[ i ] ) ||
01504             ( ( eFilter == EFilterInclude ) && !vecfGenes[ i ] ) ||
01505             ( ( eFilter == EFilterIncludePos ) && !vecfGenes[ i ] ) ) {
01506             for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
01507                 if ( Get( i, j ) || eFilter != EFilterIncludePos )  
01508                     Set( i, j, CMeta::GetNaN( ) );
01509             }
01510             continue; }
01511         if( ( eFilter == EFilterEdge ) && vecfGenes[ i ] )
01512             continue;
01513         if( ( eFilter == EFilterExEdge ) && !vecfGenes[ i ] )
01514             continue;
01515         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01516             switch( eFilter ) {
01517                 case EFilterInclude:
01518                 case EFilterEdge:
01519                     if( !vecfGenes[ j ] )
01520                         Set( i, j, CMeta::GetNaN( ) );
01521                     break;
01522                 case EFilterIncludePos:
01523                     if( !vecfGenes[ j ] && Get( i, j ) )
01524                         Set( i, j, CMeta::GetNaN( ) );
01525                     break;
01526                 case EFilterTerm:
01527                     if( !( vecfGenes[ i ] && vecfGenes[ j ] ) &&
01528                         ( !( vecfGenes[ i ] || vecfGenes[ j ] ) || Get( i, j ) ) )
01529                         Set( i, j, CMeta::GetNaN( ) );
01530                     break;                  
01531                     case EFilterExEdge:
01532                         if( vecfGenes[ j ] )
01533                       Set( i, j, CMeta::GetNaN( ) );
01534                         break;
01535                 case EFilterExclude:
01536                     if( vecfGenes[ j ] )
01537                         Set( i, j, CMeta::GetNaN( ) );
01538                     break; } } }
01539 
01540 struct SPixie {
01541     size_t  m_iNode;
01542     float   m_dScore;
01543 
01544     SPixie( size_t iNode, float dScore ) : m_iNode(iNode), m_dScore(dScore) { }
01545 
01546     bool operator<( const SPixie& sPixie ) const {
01547 
01548         return ( m_dScore < sPixie.m_dScore ); }
01549 };
01550 
01551 void CDatImpl::FilterGenesGraph( const CGenes& Genes, vector<bool>& vecfGenes, size_t iLimit,
01552     float dEdgeAggressiveness, bool fHefalmp, bool fAbsolute, const vector<float>* pvecdWeights ) {
01553     vector<float>               vecdNeighbors, vecdWeights;
01554     size_t                      i, j, iOne, iTwo, iMinOne, iMinTwo, iN;
01555     vector<size_t>              veciGenes, veciFinal, veciDegree;
01556     set<size_t>                 setiN;
01557     set<size_t>::const_iterator iterN;
01558     float                       d, dMin, dCutoff;
01559     priority_queue<SPixie>      pqueNeighbors;
01560     bool                        fDone;
01561     double                      dAve, dDev;
01562 
01563     if( iLimit == -1 )
01564         iLimit = c_iNeighborhood;
01565     veciGenes.resize( Genes.GetGenes( ) );
01566     for( i = 0; i < veciGenes.size( ); ++i )
01567         veciGenes[ i ] = GetGene( Genes.GetGene( i ).GetName( ) );
01568     if( !pvecdWeights || ( pvecdWeights->size( ) < veciGenes.size( ) ) ) {
01569         vecdWeights.resize( veciGenes.size( ) );
01570         fill( vecdWeights.begin( ), vecdWeights.end( ), 1.0f );
01571         pvecdWeights = &vecdWeights; }
01572 
01573     vecdNeighbors.resize( GetGenes( ) );
01574     fill( vecdNeighbors.begin( ), vecdNeighbors.end( ), 0.0f );
01575     if( fHefalmp )
01576         for( i = 0; i < GetGenes( ); ++i ) {
01577             size_t  iIn, iOut;
01578             float   dIn, dOut;
01579 
01580             if( vecfGenes[ i ] )
01581                 continue;
01582             dIn = dOut = 0;
01583             for( iIn = j = 0; j < veciGenes.size( ); ++j ) {
01584                 if( ( iOne = veciGenes[ j ] ) == -1 )
01585                     continue;
01586                 if( !CMeta::IsNaN( d = Get( i, iOne ) ) ) {
01587                     if( fAbsolute )
01588                         d = fabs( d );
01589                     iIn++;
01590                     dIn += d * (*pvecdWeights)[ j ]; } }
01591             for( iOut = j = 0; j < GetGenes( ); ++j )
01592                 if( !CMeta::IsNaN( d = Get( i, j ) ) ) {
01593                     if( fAbsolute )
01594                         d = fabs( d );
01595                     iOut++;
01596                     dOut += d; }
01597             vecdNeighbors[ i ] = ( iIn && dOut ) ? ( dIn * iOut / iIn / dOut ) : 0; }
01598     else
01599         for( i = 0; i < veciGenes.size( ); ++i ) {
01600             if( ( iOne = veciGenes[ i ] ) == -1 )
01601                 continue;
01602             for( j = 0; j < GetGenes( ); ++j ) {
01603                 if( vecfGenes[ j ] )
01604                     continue;
01605                 if( !CMeta::IsNaN( d = Get( iOne, j ) ) ) {
01606                     if( fAbsolute )
01607                         d = fabs( d );
01608                     vecdNeighbors[ j ] += d * (*pvecdWeights)[ i ]; } } }
01609     for( i = 0; i < vecdNeighbors.size( ); ++i )
01610         if( ( d = vecdNeighbors[ i ] ) > 0 )
01611             pqueNeighbors.push( SPixie( i, d ) );
01612 
01613     for( i = 0; i < veciGenes.size( ); ++i )
01614         if( ( iOne = veciGenes[ i ] ) != -1 )
01615             veciFinal.push_back( iOne );
01616     while( !pqueNeighbors.empty( ) && ( setiN.size( ) < iLimit ) ) {
01617         veciFinal.push_back( pqueNeighbors.top( ).m_iNode );
01618         setiN.insert( pqueNeighbors.top( ).m_iNode );
01619         pqueNeighbors.pop( ); }
01620 
01621     for( iterN = setiN.begin( ); iterN != setiN.end( ); ++iterN )
01622         vecfGenes[ *iterN ] = true;
01623     for( i = 0; i < GetGenes( ); ++i ) {
01624         if( !vecfGenes[ i ] ) {
01625             for( j = ( i + 1 ); j < GetGenes( ); ++j )
01626                 Set( i, j, CMeta::GetNaN( ) );
01627             continue; }
01628         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01629             if( !vecfGenes[ j ] )
01630                 Set( i, j, CMeta::GetNaN( ) ); }
01631     AveStd( dAve, dDev, iN );
01632     dCutoff = (float)( dAve + ( dEdgeAggressiveness * dDev ) );
01633 
01634     veciDegree.resize( veciFinal.size( ) );
01635     for( i = 0; i < veciDegree.size( ); ++i ) {
01636         iOne = veciFinal[ i ];
01637         for( j = ( i + 1 ); j < veciDegree.size( ); ++j ) {
01638             iTwo = veciFinal[ j ];
01639             if( !CMeta::IsNaN( Get( iOne, iTwo ) ) ) {
01640                 veciDegree[ i ]++;
01641                 veciDegree[ j ]++; } } }
01642     for( fDone = CMeta::IsNaN( dEdgeAggressiveness ); !fDone; ) {
01643         fDone = true;
01644         dMin = FLT_MAX;
01645         for( i = 0; i < veciFinal.size( ); ++i ) {
01646             iOne = veciFinal[ i ];
01647             for( j = ( i + 1 ); j < veciFinal.size( ); ++j ) {
01648                 iTwo = veciFinal[ j ];
01649                 if( !CMeta::IsNaN( d = Get( iOne, iTwo ) ) && ( d < dCutoff ) && ( d < dMin ) &&
01650                     ( veciDegree[ i ] > c_iDegree ) && ( veciDegree[ j ] > c_iDegree ) ) {
01651                     fDone = false;
01652                     dMin = d;
01653                     iMinOne = i;
01654                     iMinTwo = j; } } }
01655         if( !fDone ) {
01656             veciDegree[ iMinOne ]--;
01657             veciDegree[ iMinTwo ]--;
01658             Set( veciFinal[ iMinOne ], veciFinal[ iMinTwo ], CMeta::GetNaN( ) ); } } }
01659 
01700 void CDat::SaveDOT( std::ostream& ostm, float dCutoff, const CGenome* pGenome, bool fUnlabeled, bool fHashes,
01701     const std::vector<float>* pvecdColors, const std::vector<float>* pvecdBorders ) const {
01702     size_t          i, j, iCount;
01703     float           d, dAve, dStd;
01704     bool            fAll, fLabel;
01705     vector<string>  vecstrNames;
01706     vector<bool>    vecfGenes;
01707 
01708     fAll = CMeta::IsNaN( dCutoff );
01709     ostm << "graph G {" << endl;
01710     ostm << "   pack = \"true\";" << endl;
01711     ostm << "   overlap = \"scale\";" << endl;
01712     ostm << "   splines = \"true\";" << endl;
01713 
01714     if( pvecdColors || pvecdBorders || !( fUnlabeled || fAll ) ) {
01715         vecfGenes.resize( GetGenes( ) );
01716         for( i = 0; i < vecfGenes.size( ); ++i )
01717             for( j = ( i + 1 ); j < vecfGenes.size( ); ++j )
01718                 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) )
01719                     vecfGenes[ i ] = vecfGenes[ j ] = true; }
01720 
01721     vecstrNames.resize( GetGenes( ) );
01722     for( i = 0; i < vecstrNames.size( ); ++i ) {
01723         string  strName;
01724 
01725         fLabel = !fUnlabeled && ( fAll || vecfGenes[ i ] );
01726         vecstrNames[ i ] = CMeta::Filename( strName = GetGene( i ) );
01727         if( !isalpha( vecstrNames[ i ][ 0 ] ) )
01728             vecstrNames[ i ] = "_" + vecstrNames[ i ];
01729         if( pGenome && ( ( j = pGenome->GetGene( GetGene( i ) ) ) != -1 ) ) {
01730             const CGene&    Gene    = pGenome->GetGene( j );
01731 
01732             strName = Gene.GetSynonyms( ) ? Gene.GetSynonym( 0 ) : Gene.GetName( );
01733             if( fUnlabeled ) {
01734                 if( strName != vecstrNames[ i ] ) {
01735                     vecstrNames[ i ] += '_';
01736                     vecstrNames[ i ] += Gene.GetSynonym( 0 ); } } }
01737         if( ( pvecdColors || pvecdBorders || fLabel ) && ( fAll || vecfGenes[ i ] ) ) {
01738             ostm << vecstrNames[ i ] << " [shape = \"ellipse\", style = \"filled";
01739             if( pvecdBorders )
01740                 ostm << ", setlinewidth(" << (*pvecdBorders)[ i ] << ")";
01741             ostm << "\"";
01742             ostm << ", fillcolor = \"" << ( fHashes ? "#" : "" ) << ( pvecdColors ? CColor::Interpolate(
01743                 (*pvecdColors)[ i ], 
01744                 CColor::c_Cyan, CColor::c_White, CColor::c_Yellow 
01745                 //CColor::c_White, CColor::c_Cyan, CColor::c_Yellow 
01746                 ).ToRGB( ) :
01747                 "FFFFFF" ) << "\"";
01748             if( fLabel )
01749                 ostm << ", label=\"" << strName << "\"";
01750             ostm << "];" << endl; } }
01751 
01752     ostm << endl;
01753     dAve = dStd = 0;
01754     for( iCount = i = 0; i < GetGenes( ); ++i )
01755         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01756             if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) {
01757                 iCount++;
01758                 dAve += d;
01759                 dStd += d * d; }
01760     if( iCount ) {
01761         dAve /= iCount;
01762         dStd = sqrt( max( 0.0f, ( dStd / ( max( iCount, 2 ) - 1 ) ) - ( dAve * dAve ) ) ); }
01763     for( i = 0; i < GetGenes( ); ++i )
01764         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01765             if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) ) {
01766                 d = 1.0f / ( 1 + exp( ( dAve - d ) / dStd ) );
01767                 ostm << vecstrNames[ i ] << " -- " << vecstrNames[ j ] << " [weight = " << d <<
01768                     ", color = \"" << ( fHashes ? "#" : "" ) << CColor::Interpolate( d,
01769                     //CColor::c_Green, CColor::c_Black, CColor::c_Red ).ToRGB( ) 
01770                     CColor::c_Orange, CColor::c_DarkGreen, CColor::c_Blue ).ToRGB( ) 
01771                     << "\"];" << endl; }
01772 
01773     ostm << "}" << endl; }
01774 
01797 void CDat::SaveGDF( std::ostream& ostm, float dCutoff ) const {
01798     size_t          i, j;
01799     float           d;
01800     bool            fAll;
01801     vector<string>  vecstrNames;
01802 
01803     fAll = CMeta::IsNaN( dCutoff );
01804     ostm << "nodedef> name" << endl;
01805 
01806     vecstrNames.resize( GetGenes( ) );
01807     for( i = 0; i < vecstrNames.size( ); ++i )
01808         vecstrNames[ i ] = CMeta::Filename( GetGene( i ) );
01809     for( i = 0; i < GetGenes( ); ++i )
01810         ostm << vecstrNames[ i ] << endl;
01811 
01812     ostm << endl << "edgedef> node1,node2,weight" << endl;
01813     for( i = 0; i < GetGenes( ); ++i )
01814         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01815             if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) )
01816                 ostm << vecstrNames[ i ] << "," << vecstrNames[ j ] << "," << ( 10 * Get( i, j ) ) << endl; }
01817 
01836 void CDat::SaveNET( std::ostream& ostm, float dCutoff ) const {
01837     size_t  i, j;
01838     float   d;
01839     bool    fAll;
01840 
01841     fAll = CMeta::IsNaN( dCutoff );
01842     ostm << "*Vertices " << GetGenes( ) << endl;
01843     for( i = 0; i < GetGenes( ); ++i )
01844         ostm << ( i + 1 ) << " \"" << GetGene( i ) << '"' << endl;
01845 
01846     ostm << "*Edges" << endl;
01847     for( i = 0; i < GetGenes( ); ++i )
01848         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01849             if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) )
01850                 ostm << ( i + 1 ) << ' ' << ( j + 1 ) << ' ' << d << endl; }
01851 
01873 void CDat::SaveMATISSE( std::ostream& ostm, float dCutoff, const CGenome* pGenome ) const {
01874     size_t  i, j, k;
01875     float   d;
01876     bool    fAll;
01877 
01878     fAll = CMeta::IsNaN( dCutoff );
01879     for( i = 0; i < GetGenes( ); ++i ) {
01880         j = pGenome ? pGenome->GetGene( GetGene( i ) ) : -1;
01881         ostm << i << '\t' << GetGene( i ) << '\t' << ( ( ( j == -1 ) ||
01882             !pGenome->GetGene( j ).GetSynonyms( ) ) ? GetGene( i ) :
01883             pGenome->GetGene( j ).GetSynonym( 0 ) ) << '\t' << ( ( j == -1 ) ? "-" :
01884             pGenome->GetGene( j ).GetGloss( ) ) << endl; }
01885 
01886     for( k = i = 0; i < GetGenes( ); ++i )
01887         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01888             if( !CMeta::IsNaN( d = Get( i, j ) ) && ( fAll || ( d >= dCutoff ) ) )
01889                 ostm << k++ << '\t' << i << '\t' << j << "  false   " << d << endl; }
01890 
01891 struct SSorterRank {
01892     const CDat& m_Dat;
01893 
01894     SSorterRank( const CDat& Dat ) : m_Dat(Dat) { }
01895 
01896     bool operator()( const pair<size_t, size_t>& prOne, const pair<size_t, size_t>& prTwo ) const {
01897 
01898         return ( m_Dat.Get( prOne.first, prOne.second ) < m_Dat.Get( prTwo.first, prTwo.second ) ); }
01899 
01900 };
01901 
01909 void CDat::Rank( ) {
01910     vector<pair<size_t, size_t> >   vecprData;
01911     size_t                          i, j, iRank;
01912     float                           d, dPrev;
01913 
01914     for( i = 0; i < GetGenes( ); ++i )
01915         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01916             if( !CMeta::IsNaN( Get( i, j ) ) )
01917                 vecprData.push_back( pair<size_t, size_t>( i, j ) );
01918     sort( vecprData.begin( ), vecprData.end( ), SSorterRank( *this ) );
01919     for( iRank = i = 0; i < vecprData.size( ); ++i ) {
01920         d = Get( vecprData[ i ].first, vecprData[ i ].second );
01921         if( i && ( d != dPrev ) )
01922             iRank = i;
01923         dPrev = d;
01924         Set( vecprData[ i ].first, vecprData[ i ].second, (float)iRank ); } }
01925 
01926 void CDat::NormalizeQuantiles( size_t iQuantiles ) {
01927     static const size_t c_iTest = 100;
01928     float               d, dTest;
01929     size_t              i, j, k, iValues, iPrev, iNext;
01930     set<float>          setiTest;
01931     bool                fDiscrete;
01932     vector<float>       vecdValues;
01933 
01934     for( iValues = i = 0; i < GetGenes( ); ++i )
01935         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01936             if( !CMeta::IsNaN( d = Get( i, j ) ) )
01937                 iValues++;
01938 
01939 // Heuristic to test for discrete valued dats
01940     dTest = (float)c_iTest / iValues;
01941     for( i = 0; i < GetGenes( ); ++i )
01942         for( j = ( i + 1 ); j < GetGenes( ); ++j )
01943             if( !CMeta::IsNaN( d = Get( i, j ) ) && ( ( (float)rand( ) / RAND_MAX ) < dTest ) )
01944                 setiTest.insert( d );
01945     fDiscrete = ( setiTest.size( ) * 2 ) < c_iTest;
01946 
01947     if( fDiscrete ) {
01948         map<float, size_t>              mapdiValues;
01949         map<float, size_t>::iterator    iterValue;
01950         map<float, float>               mapddQuantiles;
01951 
01952         for( i = 0; i < GetGenes( ); ++i )
01953             for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
01954                 if( CMeta::IsNaN( d = Get( i, j ) ) )
01955                     continue;
01956                 if( ( iterValue = mapdiValues.find( d ) ) == mapdiValues.end( ) )
01957                     mapdiValues[d] = 1;
01958                 else
01959                     iterValue->second++; }
01960         vecdValues.resize( mapdiValues.size( ) );
01961         for( iterValue = mapdiValues.begin( ),i = 0; iterValue != mapdiValues.end( ); ++iterValue,++i )
01962             vecdValues[i] = iterValue->first;
01963         sort( vecdValues.begin( ), vecdValues.end( ) );
01964 
01965         for( i = iPrev = 0; i < vecdValues.size( ); ++i,iPrev = iNext ) {
01966             iNext = iPrev + mapdiValues[vecdValues[i]];
01967             mapddQuantiles[vecdValues[i]] = iQuantiles * ( iPrev + iNext - 1 ) / 2.0f / iValues; }
01968         for( i = 0; i < GetGenes( ); ++i )
01969             for( j = ( i + 1 ); j < GetGenes( ); ++j )
01970                 if( !CMeta::IsNaN( d = Get( i, j ) ) )
01971                     Set( i, j, mapddQuantiles[d] ); }
01972     else {
01973         vector<float>   vecdTops;
01974 
01975         vecdValues.reserve( min( iValues, iQuantiles * c_iTest ) );
01976         dTest = (float)vecdValues.capacity( ) / iValues;
01977         for( i = 0; i < GetGenes( ); ++i )
01978             for( j = ( i + 1 ); j < GetGenes( ); ++j )
01979                 if( !CMeta::IsNaN( d = Get( i, j ) ) && ( ( (float)rand( ) / RAND_MAX ) < dTest ) )
01980                     vecdValues.push_back( d );
01981         sort( vecdValues.begin( ), vecdValues.end( ) );
01982 
01983         vecdTops.resize( iQuantiles - 1 );
01984         for( i = 0; i < vecdTops.size( ); ++i ) {
01985             d = (float)( i + 1 ) * ( vecdValues.size( ) - 1 ) / iQuantiles;
01986             j = (size_t)d;
01987             vecdTops[i] = ( ( d - j ) && ( ( j + 1 ) < vecdValues.size( ) ) ) ? ( ( vecdValues[j] + vecdValues[j + 1] ) / 2 ) :
01988                 vecdValues[j]; }
01989 
01990         for( i = 0; i < GetGenes( ); ++i )
01991             for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
01992                 if( CMeta::IsNaN( d = Get( i, j ) ) )
01993                     continue;
01994                 for( k = 0; k < vecdTops.size( ); ++k )
01995                     if( d < vecdTops[k] )
01996                         break;
01997                 Set( i, j, (float)k ); } } }
01998 
01999 }