Sleipnir
src/dataset.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "dataset.h"
00024 #include "bayesnetint.h"
00025 #include "genome.h"
00026 #include "compactmatrix.h"
00027 
00028 namespace Sleipnir {
00029 
00030 const char  CDataImpl::c_szDat[]    = ".dat";
00031 const char  CDataImpl::c_szDab[]    = ".dab";
00032 
00033 void CDataImpl::FilterGenes( IDataset* pData, const CGenes& Genes, CDat::EFilter eFilt ) {
00034     vector<bool>    vecfGenes;
00035     size_t          i, j;
00036 
00037     if( !Genes.GetGenes( ) )
00038         return;
00039 
00040     vecfGenes.resize( pData->GetGenes( ) );
00041     for( i = 0; i < vecfGenes.size( ); ++i )
00042         vecfGenes[ i ] = Genes.IsGene( pData->GetGene( i ) );
00043 
00044     for( i = 0; i < vecfGenes.size( ); ++i ) {
00045         if( ( ( eFilt == CDat::EFilterInclude ) && !vecfGenes[ i ] ) ||
00046             ( ( eFilt == CDat::EFilterExclude ) && vecfGenes[ i ] ) ) {
00047             for( j = ( i + 1 ); j < vecfGenes.size( ); ++j )
00048                 pData->Remove( i, j );
00049             continue; }
00050         if( ( eFilt == CDat::EFilterEdge ) && vecfGenes[ i ] )
00051             continue;
00052         for( j = ( i + 1 ); j < vecfGenes.size( ); ++j )
00053             switch( eFilt ) {
00054                 case CDat::EFilterInclude:
00055                 case CDat::EFilterEdge:
00056                     if( !vecfGenes[ j ] )
00057                         pData->Remove( i, j );
00058                     break;
00059 
00060                 case CDat::EFilterTerm:
00061                     if( !( vecfGenes[ i ] && vecfGenes[ j ] ) &&
00062                         ( !( vecfGenes[ i ] || vecfGenes[ j ] ) || pData->GetDiscrete( i, j, 0 ) ) )
00063                             pData->Remove( i, j );
00064                     break;
00065 
00066                 case CDat::EFilterExclude:
00067                     if( vecfGenes[ j ] )
00068                         pData->Remove( i, j );
00069                     break; } } }
00070 
00071 size_t CDataImpl::OpenMax( const char* szDataDir, const std::vector<std::string>& vecstrNodes,
00072     bool fAnswers, std::vector<std::string>& vecstrData, std::set<std::string>* psetstrGenes ) {
00073     size_t      i, iLength, iMap, iRet;
00074     string      strFile;
00075     ifstream    ifsm;
00076     CPCL        PCL;
00077 
00078     strFile = szDataDir;
00079     strFile += c_cSeparator;
00080     iLength = strFile.size( );
00081 
00082     iRet = 0;
00083     m_veciMapping.resize( vecstrNodes.size( ) );
00084     m_veciMapping[ 0 ] = fAnswers ? 0 : -1;
00085     iMap = fAnswers ? 1 : 0;
00086     for( i = 1; i < vecstrNodes.size( ); ++i ) {
00087         m_veciMapping[ i ] = -1;
00088         strFile.resize( iLength );
00089         strFile += vecstrNodes[ i ];
00090         strFile += c_szDab;
00091         ifsm.clear( );
00092         ifsm.open( strFile.c_str( ), ios_base::binary );
00093         if( ifsm.is_open( ) ) {
00094             iRet++;
00095             m_veciMapping[ i ] = iMap++;
00096             vecstrData.push_back( strFile );
00097             if( psetstrGenes )
00098                 OpenGenes( ifsm, true, false, *psetstrGenes ); }
00099         else {
00100             strFile.resize( strFile.length( ) - strlen( c_szDab ) );
00101             strFile += c_szDat;
00102             ifsm.clear( );
00103             ifsm.open( strFile.c_str( ) );
00104             if( ifsm.is_open( ) ) {
00105                 iRet++;
00106                 m_veciMapping[ i ] = iMap++;
00107                 vecstrData.push_back( strFile );
00108                 if( psetstrGenes )
00109                     OpenGenes( ifsm, false, false, *psetstrGenes ); }
00110             else {
00111                 strFile.resize( strFile.length( ) - strlen( c_szDat ) );
00112                 strFile += CPCL::GetExtension( );
00113                 ifsm.clear( );
00114                 ifsm.open( strFile.c_str( ) );
00115                 if( ifsm.is_open( ) ) {
00116                     iRet++;
00117                     m_veciMapping[ i ] = iMap++;
00118                     vecstrData.push_back( strFile );
00119                     if( psetstrGenes )
00120                         OpenGenes( ifsm, false, true, *psetstrGenes ); }
00121                 else {
00122                     g_CatSleipnir( ).info( "CDataImpl::OpenMax( %s ) assuming %s is hidden",
00123                         szDataDir, vecstrNodes[ i ].c_str( ) );
00124                     continue; } } }
00125         ifsm.close( ); }
00126 
00127     return iRet; }
00128 
00129 bool CDataImpl::OpenGenes( std::istream& istm, bool fBinary, bool fPCL,
00130     std::set<std::string>& setstrGenes ) const {
00131     CDat    Dat;
00132     size_t  i;
00133 
00134     if( !Dat.OpenGenes( istm, fBinary, fPCL ) )
00135         return false;
00136     for( i = 0; i < Dat.GetGenes( ); ++i )
00137         setstrGenes.insert( Dat.GetGene( i ) );
00138     return true; }
00139 
00140 bool CDataImpl::OpenGenes( const std::vector<std::string>& vecstrData ) {
00141     size_t                      i;
00142     ifstream                    ifsm;
00143     set<string>                 setstrGenes;
00144     set<string>::const_iterator iterGenes;
00145 
00146     m_veciMapping.resize( vecstrData.size( ) );
00147     m_veccQuants.resize( vecstrData.size( ) );
00148     for( i = 0; i < vecstrData.size( ); ++i ) {
00149         m_veciMapping[ i ] = i;
00150         ifsm.clear( );
00151         ifsm.open( vecstrData[ i ].c_str( ), ios_base::binary );
00152         if( !( ifsm.is_open( ) && OpenGenes( ifsm, true, false, setstrGenes ) ) ) {
00153             ifsm.close( );
00154             ifsm.clear( );
00155             ifsm.open( vecstrData[ i ].c_str( ) );
00156             if( !( ifsm.is_open( ) && OpenGenes( ifsm, false, false, setstrGenes ) ) ) {
00157                 ifsm.close( );
00158                 ifsm.clear( );
00159                 ifsm.open( vecstrData[ i ].c_str( ) );
00160                 if( !( ifsm.is_open( ) && OpenGenes( ifsm, false, true, setstrGenes ) ) ) {
00161                     g_CatSleipnir( ).error( "CDataImpl::OpenGenes( ) failed to open: %s", vecstrData[ i ].c_str( ) );
00162                     return false; } } }
00163         ifsm.close( ); }
00164 
00165     m_vecstrGenes.resize( setstrGenes.size( ) );
00166     i = 0;
00167     for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes )
00168         m_vecstrGenes[ i++ ] = *iterGenes;
00169 
00170     return true; }
00171 
00172 size_t CDataImpl::GetGene( const std::string& strGene ) const {
00173     size_t  i;
00174 
00175     for( i = 0; i < m_vecstrGenes.size( ); ++i )
00176         if( m_vecstrGenes[ i ] == strGene )
00177             return i;
00178 
00179     return -1; }
00180 
00181 const unsigned char* CDataImpl::OpenBinary( const unsigned char* pbData ) {
00182     uint32_t    iVal;
00183     size_t      i;
00184 
00185     m_fContinuous = !!*(uint32_t*)pbData;
00186     pbData += sizeof(uint32_t);
00187 
00188     iVal = *(uint32_t*)pbData;
00189     pbData += sizeof(iVal);
00190     m_veciMapping.resize( iVal );
00191     for( i = 0; i < m_veciMapping.size( ); ++i ) {
00192         iVal = *(uint32_t*)pbData;
00193         m_veciMapping[ i ] = ( iVal == -1 ) ? (size_t)-1 : iVal;
00194         pbData += sizeof(iVal); }
00195 
00196     iVal = *(uint32_t*)pbData;
00197     pbData += 2 * sizeof(iVal);
00198     m_vecstrGenes.resize( iVal );
00199     for( i = 0; i < m_vecstrGenes.size( ); ++i ) {
00200         m_vecstrGenes[ i ] = (char*)pbData;
00201         pbData += m_vecstrGenes[ i ].length( ) + 1; }
00202 
00203     iVal = *(uint32_t*)pbData;
00204     pbData += sizeof(iVal);
00205     m_veccQuants.resize( iVal );
00206     for( i = 0; i < m_veccQuants.size( ); ++i )
00207         m_veccQuants[ i ] = *pbData++;
00208 
00209     return pbData; }
00210 
00211 bool CDataImpl::OpenBinary( std::istream& istm ) {
00212     uint32_t    iVal;
00213     size_t      i, j;
00214     char*       ac;
00215 
00216     istm.read( (char*)&iVal, sizeof(iVal) );
00217     m_fContinuous = !!iVal;
00218 
00219     istm.read( (char*)&iVal, sizeof(iVal) );
00220     m_veciMapping.resize( iVal );
00221     for( i = 0; i < m_veciMapping.size( ); ++i ) {
00222         istm.read( (char*)&iVal, sizeof(iVal) );
00223         m_veciMapping[ i ] = ( iVal == -1 ) ? (size_t)-1 : iVal; }
00224 
00225     istm.read( (char*)&iVal, sizeof(iVal) );
00226     m_vecstrGenes.resize( iVal );
00227     istm.read( (char*)&iVal, sizeof(iVal) );
00228     ac = new char[ iVal ];
00229     istm.read( ac, iVal );
00230     for( i = j = 0; i < m_vecstrGenes.size( ); ++i ) {
00231         m_vecstrGenes[ i ] = ac + j;
00232         j += m_vecstrGenes[ i ].length( ) + 1; }
00233     delete[] ac;
00234 
00235     istm.read( (char*)&iVal, sizeof(iVal) );
00236     ac = new char[ iVal ];
00237     m_veccQuants.resize( iVal );
00238     istm.read( ac, iVal );
00239     copy( ac, ac + iVal, m_veccQuants.begin( ) );
00240     delete[] ac;
00241 
00242     return true; }
00243 
00244 void CDataImpl::SaveBinary( std::ostream& ostm ) const {
00245     size_t      i;
00246     uint32_t    iVal;
00247     char*       ac;
00248     char        c;
00249 
00250     iVal = m_fContinuous;
00251     ostm.write( (char*)&iVal, sizeof(iVal) );
00252 
00253     iVal = (uint32_t)m_veciMapping.size( );
00254     ostm.write( (char*)&iVal, sizeof(iVal) );
00255     for( i = 0; i < m_veciMapping.size( ); ++i ) {
00256         iVal = ( m_veciMapping[ i ] == -1 ) ? -1 :
00257             (uint32_t)m_veciMapping[ i ];
00258         ostm.write( (char*)&iVal, sizeof(iVal) ); }
00259 
00260     iVal = (uint32_t)m_vecstrGenes.size( );
00261     ostm.write( (char*)&iVal, sizeof(iVal) );
00262     for( i = iVal = 0; i < m_vecstrGenes.size( ); ++i )
00263         iVal += (uint32_t)m_vecstrGenes[ i ].length( ) + 1;
00264     ostm.write( (char*)&iVal, sizeof(iVal) );
00265     for( i = c = 0; i < m_vecstrGenes.size( ); ++i ) {
00266         ostm.write( m_vecstrGenes[ i ].c_str( ), (streamsize)m_vecstrGenes[ i ].length( ) );
00267         ostm.write( &c, sizeof(c) ); }
00268 
00269     ac = new char[ iVal = (uint32_t)m_veccQuants.size( ) ];
00270     for( i = 0; i < iVal; ++i )
00271         ac[ i ] = m_veccQuants[ i ];
00272     ostm.write( (char*)&iVal, sizeof(iVal) );
00273     ostm.write( ac, iVal );
00274     delete[] ac; }
00275 
00276 CDatasetImpl::CDatasetImpl( ) : m_apData(NULL) { }
00277 
00278 CDatasetImpl::~CDatasetImpl( ) {
00279 
00280     Reset( ); }
00281 
00282 void CDatasetImpl::Reset( ) {
00283     size_t  i;
00284 
00285     if( m_apData ) {
00286         for( i = 0; i < m_veccQuants.size( ); ++i )
00287             if( m_veccQuants[ i ] == (unsigned char)-1 )
00288                 delete (CDistanceMatrix*)m_apData[ i ];
00289             else
00290                 delete (CCompactMatrix*)m_apData[ i ];
00291         delete[] m_apData; } }
00292 
00293 void CDatasetImpl::SaveBinary( std::ostream& ostm ) const {
00294     size_t      i;
00295     uint32_t    iData;
00296 
00297     CDataImpl::SaveBinary( ostm );
00298     for( i = iData = 0; i < m_veciMapping.size( ); ++i )
00299         if( m_veciMapping[ i ] != -1 )
00300             iData++;
00301     ostm.write( (char*)&iData, sizeof(iData) );
00302     for( i = 0; i < iData; ++i )
00303         if( m_veccQuants[ i ] == (unsigned char)-1 )
00304             ((CDistanceMatrix*)m_apData[ i ])->Save( ostm, true );
00305         else
00306             ((CCompactMatrix*)m_apData[ i ])->Save( ostm ); }
00307 
00308 void CDatasetImpl::SaveText( std::ostream& ostm ) const {
00309     size_t          i, j, k;
00310     vector<float>   vecdValues;
00311     bool            fHit;
00312 
00313     vecdValues.resize( GetExperiments( ) );
00314     for( i = 0; i < GetGenes( ); ++i )
00315         for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
00316             fHit = false;
00317             for( k = 0; k < vecdValues.size( ); ++k )
00318                 if( !CMeta::IsNaN( vecdValues[ k ] = GetContinuous( i, j, k ) ) )
00319                     fHit = true;
00320             if( !fHit )
00321                 continue;
00322             ostm << GetGene( i ) << '\t' << GetGene( j );
00323             for( k = 0; k < vecdValues.size( ); ++k ) {
00324                 ostm << '\t';
00325                 if( !CMeta::IsNaN( vecdValues[ k ] ) )
00326                     ostm << vecdValues[ k ]; }
00327             ostm << endl; } }
00328 
00356 bool CDataset::Open( const char* szAnswerFile, const char* szDataDirectory, const IBayesNet* pBayesNet ) {
00357     CDataPair   Answers;
00358 
00359     return ( Answers.Open( szAnswerFile, pBayesNet->IsContinuous( 0 ) ) &&
00360         Open( Answers, szDataDirectory, pBayesNet ) ); }
00361 
00362 bool CDatasetImpl::Open( const CDataPair* pAnswers, const char* szDataDir, const IBayesNet* pBayesNet ) {
00363     size_t                      i;
00364     vector<string>              vecstrData, vecstrNodes;
00365     set<string>                 setstrGenes;
00366     set<string>::const_iterator iterGene;
00367 
00368     Reset( );
00369     m_fContinuous = pBayesNet->IsContinuous( );
00370     pBayesNet->GetNodes( vecstrNodes );
00371     m_veccQuants.resize( ( pAnswers ? 1 : 0 ) + OpenMax( szDataDir, vecstrNodes, !!pAnswers,
00372         vecstrData, &setstrGenes ) );
00373     if( pAnswers ) {
00374         m_vecstrGenes.resize( pAnswers->GetGenes( ) );
00375         for( i = 0; i < m_vecstrGenes.size( ); ++i )
00376             m_vecstrGenes[ i ] = pAnswers->GetGene( i ); }
00377     else {
00378         m_vecstrGenes.resize( setstrGenes.size( ) );
00379         for( i = 0,iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++i,++iterGene )
00380             m_vecstrGenes[ i ] = *iterGene; }
00381     m_apData = new void*[ m_veccQuants.size( ) ];
00382     if( pAnswers && !CDatasetImpl::Open( *pAnswers, 0 ) )
00383         return false;
00384 
00385     for( i = 0; i < vecstrData.size( ); ++i ) {
00386         CDataPair   Datum;
00387 
00388         if( !( Datum.Open( vecstrData[ i ].c_str( ), pBayesNet->IsContinuous( i + 1 ) ) &&
00389             CDatasetImpl::Open( Datum, i + ( pAnswers ? 1 : 0 ) ) ) )
00390             return false; }
00391 
00392     return true; }
00393 
00394 bool CDatasetImpl::Open( const CDataPair& Datum, size_t iExp ) {
00395     vector<size_t>  veciGenes;
00396     size_t          i, j, iOne, iTwo;
00397     float           d;
00398 
00399     m_veccQuants[ iExp ] = Datum.IsContinuous( ) ? -1 : Datum.GetValues( );
00400     veciGenes.resize( m_vecstrGenes.size( ) );
00401     for( i = 0; i < veciGenes.size( ); ++i )
00402         veciGenes[ i ] = Datum.GetGene( m_vecstrGenes[ i ] );
00403 
00404     if( Datum.IsContinuous( ) ) {
00405         CDistanceMatrix*    pDatum;
00406 
00407         pDatum = new CDistanceMatrix( );
00408         pDatum->Initialize( m_vecstrGenes.size( ) );
00409         for( i = 0; i < pDatum->GetSize( ); ++i )
00410             for( j = ( i + 1 ); j < pDatum->GetSize( ); ++j )
00411                 pDatum->Set( i, j, CMeta::GetNaN( ) );
00412         for( i = 0; i < veciGenes.size( ); ++i ) {
00413             if( ( iOne = veciGenes[ i ] ) == -1 )
00414                 continue;
00415             for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
00416                 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00417                     !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) )
00418                     pDatum->Set( i, j, d ); }
00419         m_apData[ iExp ] = pDatum; }
00420     else {
00421         CCompactMatrix* pDatum;
00422 
00423         pDatum = new CCompactMatrix( );
00424         pDatum->Initialize( m_vecstrGenes.size( ), m_veccQuants[ iExp ] + 1, true );
00425         for( i = 0; i < veciGenes.size( ); ++i )
00426             if( ( iOne = veciGenes[ i ] ) != -1 )
00427                 for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
00428                     if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00429                         !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) )
00430                         pDatum->Set( i, j, (unsigned char)( Datum.Quantize( d ) + 1 ) );
00431         m_apData[ iExp ] = pDatum; }
00432 
00433     return true; }
00434 
00450 bool CDataset::Open( const char* szAnswerFile, const std::vector<std::string>& vecstrDataFiles ) {
00451     size_t  i;
00452 
00453     Reset( );
00454     m_veccQuants.resize( 1 + vecstrDataFiles.size( ) );
00455     {
00456         CDataPair   Answers;
00457 
00458         if( !Answers.Open( szAnswerFile, true ) )
00459             return false;
00460 
00461         m_vecstrGenes.resize( Answers.GetGenes( ) );
00462         for( i = 0; i < m_vecstrGenes.size( ); ++i )
00463             m_vecstrGenes[ i ] = Answers.GetGene( i );
00464         m_apData = new void*[ m_veccQuants.size( ) ];
00465         if( !CDatasetImpl::Open( Answers, 0 ) )
00466             return false;
00467     }
00468 
00469     m_veciMapping.resize( m_veccQuants.size( ) );
00470     m_veciMapping[ 0 ] = 0;
00471     for( i = 1; i <= vecstrDataFiles.size( ); ++i ) {
00472         CDataPair   Datum;
00473 
00474         if( !Datum.Open( vecstrDataFiles[ i - 1 ].c_str( ), true ) ||
00475             !CDatasetImpl::Open( Datum, i ) )
00476             return false;
00477         m_veciMapping[ i ] = i; }
00478 
00479     return true; }
00480 
00494 bool CDataset::Open( const std::vector<std::string>& vecstrDataFiles ) {
00495     size_t  i;
00496 
00497     if( !OpenGenes( vecstrDataFiles ) )
00498         return false;
00499     Reset( );
00500     m_apData = new void*[ m_veccQuants.size( ) ];
00501 
00502     for( i = 0; i < vecstrDataFiles.size( ); ++i ) {
00503         CDataPair   Datum;
00504 
00505         if( !( Datum.Open( vecstrDataFiles[ i ].c_str( ), true ) &&
00506             CDatasetImpl::Open( Datum, i ) ) )
00507             return false; }
00508 
00509     return true; }
00510 
00511 float CDatasetImpl::GetContinuous( size_t iY, size_t iX, size_t iNode ) const {
00512     size_t  iMap;
00513 
00514     if( ( iMap = m_veciMapping[ iNode ] ) == -1 )
00515         return CMeta::GetNaN( );
00516 
00517     return ( ( m_veccQuants[ iMap ] == (unsigned char)-1 ) ?
00518         ((CDistanceMatrix*)m_apData[ iMap ])->Get( iY, iX ) :
00519         ((CCompactMatrix*)m_apData[ iMap ])->Get( iY, iX ) ); }
00520 
00521 size_t CDataset::GetDiscrete( size_t iY, size_t iX, size_t iNode ) const {
00522     size_t  iMap;
00523 
00524     if( ( iMap = m_veciMapping[ iNode ] ) == -1 )
00525         return -1;
00526 
00527     return ( ( m_veccQuants[ iMap ] == (unsigned char)-1 ) ? -1 :
00528         ( ((CCompactMatrix*)m_apData[ iMap ])->Get( iY, iX ) - 1 ) ); }
00529 
00530 bool CDataset::IsExample( size_t iY, size_t iX ) const {
00531     size_t  i;
00532 
00533     for( i = 0; i < m_veccQuants.size( ); ++i )
00534         if( m_veccQuants[ i ] == (unsigned char)-1 ) {
00535             if( !CMeta::IsNaN( ((CDistanceMatrix*)m_apData[ i ])->Get( iY, iX ) ) )
00536                 return true; }
00537         else if( ((CCompactMatrix*)m_apData[ i ])->Get( iY, iX ) )
00538             return true;
00539 
00540     return false; }
00541 
00542 void CDataset::Remove( size_t iY, size_t iX ) {
00543     size_t  i;
00544 
00545     for( i = 0; i < m_veccQuants.size( ); ++i )
00546         if( m_veccQuants[ i ] == (unsigned char)-1 )
00547             ((CDistanceMatrix*)m_apData[ i ])->Set( iY, iX, CMeta::GetNaN( ) );
00548         else
00549             ((CCompactMatrix*)m_apData[ i ])->Set( iY, iX, 0 ); }
00550 
00551 // CDataOverlayImpl
00552 
00553 const vector<string>& CDataOverlayImpl::GetGeneNames( ) const {
00554 
00555     return m_pDataset->GetGeneNames( ); }
00556 
00557 size_t CDataOverlayImpl::GetExperiments( ) const {
00558 
00559     return m_pDataset->GetExperiments( ); }
00560 
00561 size_t CDataOverlayImpl::GetGene( const std::string& strGene ) const {
00562 
00563     return m_pDataset->GetGene( strGene ); }
00564 
00565 size_t CDataOverlayImpl::GetBins( size_t iExp ) const {
00566 
00567     return m_pDataset->GetBins( iExp ); }
00568 
00569 size_t CDataOverlayImpl::GetGenes( ) const {
00570 
00571     return m_pDataset->GetGenes( ); }
00572 
00573 bool CDataOverlayImpl::IsHidden( size_t iNode ) const {
00574 
00575     return m_pDataset->IsHidden( iNode ); }
00576 
00577 size_t CDataOverlayImpl::GetDiscrete( size_t iX, size_t iY, size_t iNode ) const {
00578 
00579     return m_pDataset->GetDiscrete( iX, iY, iNode ); }
00580 
00581 float CDataOverlayImpl::GetContinuous( size_t iX, size_t iY, size_t iNode ) const {
00582 
00583     return m_pDataset->GetContinuous( iX, iY, iNode ); }
00584 
00585 const string& CDataOverlayImpl::GetGene( size_t iGene ) const {
00586 
00587     return m_pDataset->GetGene( iGene ); }
00588 
00589 void CDataOverlayImpl::Save( std::ostream& ostm, bool fBinary ) const {
00590 
00591     m_pDataset->Save( ostm, fBinary ); }
00592 
00593 // CDataFilter
00594 
00617 void CDataFilter::Attach( const IDataset* pDataset, const CGenes& Genes, CDat::EFilter eFilter,
00618     const CDat* pAnswers ) {
00619     size_t  i;
00620 
00621     m_pDataset = pDataset;
00622     m_pGenes = &Genes;
00623     m_eFilter = eFilter;
00624     m_pAnswers = pAnswers;
00625 
00626     m_vecfGenes.resize( GetGenes( ) );
00627     for( i = 0; i < m_vecfGenes.size( ); ++i )
00628         m_vecfGenes[ i ] = m_pGenes->IsGene( GetGene( i ) );
00629     if( m_pAnswers ) {
00630         m_veciAnswers.resize( GetGenes( ) );
00631         for( i = 0; i < m_veciAnswers.size( ); ++i )
00632             m_veciAnswers[ i ] = m_pAnswers->GetGene( GetGene( i ) ); } }
00633 
00634 bool CDataFilter::IsExample( size_t iY, size_t iX ) const {
00635 
00636     if( !m_pDataset )
00637         return false;
00638     if( !( m_pGenes && m_pGenes->GetGenes( ) ) )
00639         return m_pDataset->IsExample( iY, iX );
00640 
00641     switch( m_eFilter ) {
00642         case CDat::EFilterInclude:
00643             if( !( m_vecfGenes[ iY ] && m_vecfGenes[ iX ] ) )
00644                 return false;
00645             break;
00646 
00647         case CDat::EFilterExclude:
00648             if( m_vecfGenes[ iY ] || m_vecfGenes[ iX ] )
00649                 return false;
00650             break;
00651 
00652         case CDat::EFilterEdge:
00653             if( !( m_vecfGenes[ iY ] || m_vecfGenes[ iX ] ) )
00654                 return false;
00655             break;
00656 
00657         case CDat::EFilterTerm:
00658             if( ( m_pAnswers && ( ( m_veciAnswers[ iY ] == -1 ) || ( m_veciAnswers[ iX ] == -1 ) ) ) ||
00659                 ( !( m_vecfGenes[ iY ] && m_vecfGenes[ iX ] ) &&
00660                 ( !( m_vecfGenes[ iY ] || m_vecfGenes[ iX ] ) || ( m_pAnswers &&
00661                 ( m_pAnswers->Get( m_veciAnswers[ iY ], m_veciAnswers[ iX ] ) > 0 ) ) ) ) )
00662                 return false;
00663             break; }
00664 
00665     return m_pDataset->IsExample( iY, iX ); }
00666 
00667 // CDataMask
00668 
00679 void CDataMask::AttachRandom( const IDataset* pDataset, float dFraction ) {
00680     size_t  i, j;
00681 
00682     Attach( pDataset );
00683     for( i = 0; i < m_Mask.GetSize( ); ++i )
00684         for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j )
00685             m_Mask.Set( i, j, ( m_Mask.Get( i, j ) && ( ( (float)rand( ) / RAND_MAX ) < dFraction ) ) ); }
00686 
00698 void CDataMask::AttachComplement( const CDataMask& DataMask ) {
00699     size_t  i, j;
00700 
00701     Attach( DataMask.m_pDataset );
00702     for( i = 0; i < m_Mask.GetSize( ); ++i )
00703         for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j )
00704             m_Mask.Set( i, j, ( m_Mask.Get( i, j ) && !DataMask.m_Mask.Get( i, j ) ) ); }
00705 
00713 void CDataMask::Attach( const IDataset* pDataset ) {
00714     size_t  i, j;
00715 
00716     m_pDataset = pDataset;
00717     m_Mask.Initialize( m_pDataset->GetGenes( ) );
00718     for( i = 0; i < m_Mask.GetSize( ); ++i )
00719         for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j )
00720             m_Mask.Set( i, j, m_pDataset->IsExample( i, j ) ); }
00721 
00722 // CDataSubset
00723 
00749 bool CDataSubset::Initialize( const char* szDataDirectory, const IBayesNet* pBayesNet, size_t iGeneSize ) {
00750     size_t  i;
00751 
00752     m_iSize = iGeneSize;
00753     m_vecstrData.clear( );
00754     m_fContinuous = pBayesNet->IsContinuous( );
00755     {
00756         set<string>                 setstrGenes;
00757         set<string>::const_iterator iterGenes;
00758         vector<string>              vecstrNodes;
00759 
00760         pBayesNet->GetNodes( vecstrNodes );
00761         OpenMax( szDataDirectory, vecstrNodes, false, m_vecstrData, &setstrGenes );
00762         m_vecstrGenes.resize( setstrGenes.size( ) );
00763         i = 0;
00764         for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes )
00765             m_vecstrGenes[ i++ ] = *iterGenes;
00766     }
00767     m_Examples.Initialize( m_iSize, m_vecstrGenes.size( ) );
00768 
00769     return true; }
00770 
00791 bool CDataSubset::Initialize( const std::vector<std::string>& vecstrDataFiles, size_t iGeneSize ) {
00792     size_t  i;
00793 
00794     m_iSize = iGeneSize;
00795     m_vecstrData.resize( vecstrDataFiles.size( ) );
00796     m_veccQuants.resize( vecstrDataFiles.size( ) );
00797     for( i = 0; i < vecstrDataFiles.size( ); ++i )
00798         m_vecstrData[ i ] = vecstrDataFiles[ i ];
00799     m_fContinuous = true;
00800 
00801     if( !OpenGenes( vecstrDataFiles ) )
00802         return false;
00803     m_Examples.Initialize( m_iSize, m_vecstrGenes.size( ) );
00804 
00805     return true; }
00806 
00823 bool CDataSubset::Open( size_t iGeneOffset ) {
00824     size_t  i, j;
00825 
00826     m_iOffset = iGeneOffset;
00827     for( i = 0; i < m_Examples.GetRows( ); ++i )
00828         for( j = 0; j < m_Examples.GetColumns( ); ++j )
00829             m_Examples.Get( i, j ).Reset( );
00830 
00831     m_iSize = ( ( m_iOffset + m_Examples.GetRows( ) ) > m_vecstrGenes.size( ) ) ?
00832         ( m_vecstrGenes.size( ) - m_iOffset ) : m_Examples.GetRows( );
00833     for( i = 0; i < m_vecstrData.size( ); ++i ) {
00834         CDataPair   Datum;
00835 
00836         if( !( Datum.Open( m_vecstrData[ i ].c_str( ), m_fContinuous ) &&
00837             CDataSubsetImpl::Open( Datum, i ) ) )
00838             return false; }
00839 
00840     return true; }
00841 
00842 bool CDataSubsetImpl::Open( const CDataPair& Datum, size_t iExp ) {
00843     vector<size_t>  veciGenes;
00844     size_t          i, j, iOne, iTwo;
00845     float           d;
00846 
00847     m_veccQuants[ iExp ] = Datum.IsContinuous( ) ? -1 : Datum.GetValues( );
00848     veciGenes.resize( m_vecstrGenes.size( ) );
00849     for( i = 0; i < veciGenes.size( ); ++i )
00850         veciGenes[ i ] = Datum.GetGene( m_vecstrGenes[ i ] );
00851 
00852     for( i = 0; i < m_iSize; ++i ) {
00853         if( ( iOne = veciGenes[ i + m_iOffset ] ) == -1 )
00854             continue;
00855         for( j = 0; j < veciGenes.size( ); ++j )
00856             if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00857                 !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) )
00858                 m_Examples.Get( i, j ).Set( iExp, d, Datum, m_vecstrData.size( ) ); }
00859 
00860     return true; }
00861 
00862 }