Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "dataset.h" 00024 #include "bayesnetint.h" 00025 #include "genome.h" 00026 #include "compactmatrix.h" 00027 00028 namespace Sleipnir { 00029 00030 const char CDataImpl::c_szDat[] = ".dat"; 00031 const char CDataImpl::c_szDab[] = ".dab"; 00032 00033 void CDataImpl::FilterGenes( IDataset* pData, const CGenes& Genes, CDat::EFilter eFilt ) { 00034 vector<bool> vecfGenes; 00035 size_t i, j; 00036 00037 if( !Genes.GetGenes( ) ) 00038 return; 00039 00040 vecfGenes.resize( pData->GetGenes( ) ); 00041 for( i = 0; i < vecfGenes.size( ); ++i ) 00042 vecfGenes[ i ] = Genes.IsGene( pData->GetGene( i ) ); 00043 00044 for( i = 0; i < vecfGenes.size( ); ++i ) { 00045 if( ( ( eFilt == CDat::EFilterInclude ) && !vecfGenes[ i ] ) || 00046 ( ( eFilt == CDat::EFilterExclude ) && vecfGenes[ i ] ) ) { 00047 for( j = ( i + 1 ); j < vecfGenes.size( ); ++j ) 00048 pData->Remove( i, j ); 00049 continue; } 00050 if( ( eFilt == CDat::EFilterEdge ) && vecfGenes[ i ] ) 00051 continue; 00052 for( j = ( i + 1 ); j < vecfGenes.size( ); ++j ) 00053 switch( eFilt ) { 00054 case CDat::EFilterInclude: 00055 case CDat::EFilterEdge: 00056 if( !vecfGenes[ j ] ) 00057 pData->Remove( i, j ); 00058 break; 00059 00060 case CDat::EFilterTerm: 00061 if( !( vecfGenes[ i ] && vecfGenes[ j ] ) && 00062 ( !( vecfGenes[ i ] || vecfGenes[ j ] ) || pData->GetDiscrete( i, j, 0 ) ) ) 00063 pData->Remove( i, j ); 00064 break; 00065 00066 case CDat::EFilterExclude: 00067 if( vecfGenes[ j ] ) 00068 pData->Remove( i, j ); 00069 break; } } } 00070 00071 size_t CDataImpl::OpenMax( const char* szDataDir, const std::vector<std::string>& vecstrNodes, 00072 bool fAnswers, std::vector<std::string>& vecstrData, std::set<std::string>* psetstrGenes ) { 00073 size_t i, iLength, iMap, iRet; 00074 string strFile; 00075 ifstream ifsm; 00076 CPCL PCL; 00077 00078 strFile = szDataDir; 00079 strFile += c_cSeparator; 00080 iLength = strFile.size( ); 00081 00082 iRet = 0; 00083 m_veciMapping.resize( vecstrNodes.size( ) ); 00084 m_veciMapping[ 0 ] = fAnswers ? 0 : -1; 00085 iMap = fAnswers ? 1 : 0; 00086 for( i = 1; i < vecstrNodes.size( ); ++i ) { 00087 m_veciMapping[ i ] = -1; 00088 strFile.resize( iLength ); 00089 strFile += vecstrNodes[ i ]; 00090 strFile += c_szDab; 00091 ifsm.clear( ); 00092 ifsm.open( strFile.c_str( ), ios_base::binary ); 00093 if( ifsm.is_open( ) ) { 00094 iRet++; 00095 m_veciMapping[ i ] = iMap++; 00096 vecstrData.push_back( strFile ); 00097 if( psetstrGenes ) 00098 OpenGenes( ifsm, true, false, *psetstrGenes ); } 00099 else { 00100 strFile.resize( strFile.length( ) - strlen( c_szDab ) ); 00101 strFile += c_szDat; 00102 ifsm.clear( ); 00103 ifsm.open( strFile.c_str( ) ); 00104 if( ifsm.is_open( ) ) { 00105 iRet++; 00106 m_veciMapping[ i ] = iMap++; 00107 vecstrData.push_back( strFile ); 00108 if( psetstrGenes ) 00109 OpenGenes( ifsm, false, false, *psetstrGenes ); } 00110 else { 00111 strFile.resize( strFile.length( ) - strlen( c_szDat ) ); 00112 strFile += CPCL::GetExtension( ); 00113 ifsm.clear( ); 00114 ifsm.open( strFile.c_str( ) ); 00115 if( ifsm.is_open( ) ) { 00116 iRet++; 00117 m_veciMapping[ i ] = iMap++; 00118 vecstrData.push_back( strFile ); 00119 if( psetstrGenes ) 00120 OpenGenes( ifsm, false, true, *psetstrGenes ); } 00121 else { 00122 g_CatSleipnir( ).info( "CDataImpl::OpenMax( %s ) assuming %s is hidden", 00123 szDataDir, vecstrNodes[ i ].c_str( ) ); 00124 continue; } } } 00125 ifsm.close( ); } 00126 00127 return iRet; } 00128 00129 bool CDataImpl::OpenGenes( std::istream& istm, bool fBinary, bool fPCL, 00130 std::set<std::string>& setstrGenes ) const { 00131 CDat Dat; 00132 size_t i; 00133 00134 if( !Dat.OpenGenes( istm, fBinary, fPCL ) ) 00135 return false; 00136 for( i = 0; i < Dat.GetGenes( ); ++i ) 00137 setstrGenes.insert( Dat.GetGene( i ) ); 00138 return true; } 00139 00140 bool CDataImpl::OpenGenes( const std::vector<std::string>& vecstrData ) { 00141 size_t i; 00142 ifstream ifsm; 00143 set<string> setstrGenes; 00144 set<string>::const_iterator iterGenes; 00145 00146 m_veciMapping.resize( vecstrData.size( ) ); 00147 m_veccQuants.resize( vecstrData.size( ) ); 00148 for( i = 0; i < vecstrData.size( ); ++i ) { 00149 m_veciMapping[ i ] = i; 00150 ifsm.clear( ); 00151 ifsm.open( vecstrData[ i ].c_str( ), ios_base::binary ); 00152 if( !( ifsm.is_open( ) && OpenGenes( ifsm, true, false, setstrGenes ) ) ) { 00153 ifsm.close( ); 00154 ifsm.clear( ); 00155 ifsm.open( vecstrData[ i ].c_str( ) ); 00156 if( !( ifsm.is_open( ) && OpenGenes( ifsm, false, false, setstrGenes ) ) ) { 00157 ifsm.close( ); 00158 ifsm.clear( ); 00159 ifsm.open( vecstrData[ i ].c_str( ) ); 00160 if( !( ifsm.is_open( ) && OpenGenes( ifsm, false, true, setstrGenes ) ) ) { 00161 g_CatSleipnir( ).error( "CDataImpl::OpenGenes( ) failed to open: %s", vecstrData[ i ].c_str( ) ); 00162 return false; } } } 00163 ifsm.close( ); } 00164 00165 m_vecstrGenes.resize( setstrGenes.size( ) ); 00166 i = 0; 00167 for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes ) 00168 m_vecstrGenes[ i++ ] = *iterGenes; 00169 00170 return true; } 00171 00172 size_t CDataImpl::GetGene( const std::string& strGene ) const { 00173 size_t i; 00174 00175 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 00176 if( m_vecstrGenes[ i ] == strGene ) 00177 return i; 00178 00179 return -1; } 00180 00181 const unsigned char* CDataImpl::OpenBinary( const unsigned char* pbData ) { 00182 uint32_t iVal; 00183 size_t i; 00184 00185 m_fContinuous = !!*(uint32_t*)pbData; 00186 pbData += sizeof(uint32_t); 00187 00188 iVal = *(uint32_t*)pbData; 00189 pbData += sizeof(iVal); 00190 m_veciMapping.resize( iVal ); 00191 for( i = 0; i < m_veciMapping.size( ); ++i ) { 00192 iVal = *(uint32_t*)pbData; 00193 m_veciMapping[ i ] = ( iVal == -1 ) ? (size_t)-1 : iVal; 00194 pbData += sizeof(iVal); } 00195 00196 iVal = *(uint32_t*)pbData; 00197 pbData += 2 * sizeof(iVal); 00198 m_vecstrGenes.resize( iVal ); 00199 for( i = 0; i < m_vecstrGenes.size( ); ++i ) { 00200 m_vecstrGenes[ i ] = (char*)pbData; 00201 pbData += m_vecstrGenes[ i ].length( ) + 1; } 00202 00203 iVal = *(uint32_t*)pbData; 00204 pbData += sizeof(iVal); 00205 m_veccQuants.resize( iVal ); 00206 for( i = 0; i < m_veccQuants.size( ); ++i ) 00207 m_veccQuants[ i ] = *pbData++; 00208 00209 return pbData; } 00210 00211 bool CDataImpl::OpenBinary( std::istream& istm ) { 00212 uint32_t iVal; 00213 size_t i, j; 00214 char* ac; 00215 00216 istm.read( (char*)&iVal, sizeof(iVal) ); 00217 m_fContinuous = !!iVal; 00218 00219 istm.read( (char*)&iVal, sizeof(iVal) ); 00220 m_veciMapping.resize( iVal ); 00221 for( i = 0; i < m_veciMapping.size( ); ++i ) { 00222 istm.read( (char*)&iVal, sizeof(iVal) ); 00223 m_veciMapping[ i ] = ( iVal == -1 ) ? (size_t)-1 : iVal; } 00224 00225 istm.read( (char*)&iVal, sizeof(iVal) ); 00226 m_vecstrGenes.resize( iVal ); 00227 istm.read( (char*)&iVal, sizeof(iVal) ); 00228 ac = new char[ iVal ]; 00229 istm.read( ac, iVal ); 00230 for( i = j = 0; i < m_vecstrGenes.size( ); ++i ) { 00231 m_vecstrGenes[ i ] = ac + j; 00232 j += m_vecstrGenes[ i ].length( ) + 1; } 00233 delete[] ac; 00234 00235 istm.read( (char*)&iVal, sizeof(iVal) ); 00236 ac = new char[ iVal ]; 00237 m_veccQuants.resize( iVal ); 00238 istm.read( ac, iVal ); 00239 copy( ac, ac + iVal, m_veccQuants.begin( ) ); 00240 delete[] ac; 00241 00242 return true; } 00243 00244 void CDataImpl::SaveBinary( std::ostream& ostm ) const { 00245 size_t i; 00246 uint32_t iVal; 00247 char* ac; 00248 char c; 00249 00250 iVal = m_fContinuous; 00251 ostm.write( (char*)&iVal, sizeof(iVal) ); 00252 00253 iVal = (uint32_t)m_veciMapping.size( ); 00254 ostm.write( (char*)&iVal, sizeof(iVal) ); 00255 for( i = 0; i < m_veciMapping.size( ); ++i ) { 00256 iVal = ( m_veciMapping[ i ] == -1 ) ? -1 : 00257 (uint32_t)m_veciMapping[ i ]; 00258 ostm.write( (char*)&iVal, sizeof(iVal) ); } 00259 00260 iVal = (uint32_t)m_vecstrGenes.size( ); 00261 ostm.write( (char*)&iVal, sizeof(iVal) ); 00262 for( i = iVal = 0; i < m_vecstrGenes.size( ); ++i ) 00263 iVal += (uint32_t)m_vecstrGenes[ i ].length( ) + 1; 00264 ostm.write( (char*)&iVal, sizeof(iVal) ); 00265 for( i = c = 0; i < m_vecstrGenes.size( ); ++i ) { 00266 ostm.write( m_vecstrGenes[ i ].c_str( ), (streamsize)m_vecstrGenes[ i ].length( ) ); 00267 ostm.write( &c, sizeof(c) ); } 00268 00269 ac = new char[ iVal = (uint32_t)m_veccQuants.size( ) ]; 00270 for( i = 0; i < iVal; ++i ) 00271 ac[ i ] = m_veccQuants[ i ]; 00272 ostm.write( (char*)&iVal, sizeof(iVal) ); 00273 ostm.write( ac, iVal ); 00274 delete[] ac; } 00275 00276 CDatasetImpl::CDatasetImpl( ) : m_apData(NULL) { } 00277 00278 CDatasetImpl::~CDatasetImpl( ) { 00279 00280 Reset( ); } 00281 00282 void CDatasetImpl::Reset( ) { 00283 size_t i; 00284 00285 if( m_apData ) { 00286 for( i = 0; i < m_veccQuants.size( ); ++i ) 00287 if( m_veccQuants[ i ] == (unsigned char)-1 ) 00288 delete (CDistanceMatrix*)m_apData[ i ]; 00289 else 00290 delete (CCompactMatrix*)m_apData[ i ]; 00291 delete[] m_apData; } } 00292 00293 void CDatasetImpl::SaveBinary( std::ostream& ostm ) const { 00294 size_t i; 00295 uint32_t iData; 00296 00297 CDataImpl::SaveBinary( ostm ); 00298 for( i = iData = 0; i < m_veciMapping.size( ); ++i ) 00299 if( m_veciMapping[ i ] != -1 ) 00300 iData++; 00301 ostm.write( (char*)&iData, sizeof(iData) ); 00302 for( i = 0; i < iData; ++i ) 00303 if( m_veccQuants[ i ] == (unsigned char)-1 ) 00304 ((CDistanceMatrix*)m_apData[ i ])->Save( ostm, true ); 00305 else 00306 ((CCompactMatrix*)m_apData[ i ])->Save( ostm ); } 00307 00308 void CDatasetImpl::SaveText( std::ostream& ostm ) const { 00309 size_t i, j, k; 00310 vector<float> vecdValues; 00311 bool fHit; 00312 00313 vecdValues.resize( GetExperiments( ) ); 00314 for( i = 0; i < GetGenes( ); ++i ) 00315 for( j = ( i + 1 ); j < GetGenes( ); ++j ) { 00316 fHit = false; 00317 for( k = 0; k < vecdValues.size( ); ++k ) 00318 if( !CMeta::IsNaN( vecdValues[ k ] = GetContinuous( i, j, k ) ) ) 00319 fHit = true; 00320 if( !fHit ) 00321 continue; 00322 ostm << GetGene( i ) << '\t' << GetGene( j ); 00323 for( k = 0; k < vecdValues.size( ); ++k ) { 00324 ostm << '\t'; 00325 if( !CMeta::IsNaN( vecdValues[ k ] ) ) 00326 ostm << vecdValues[ k ]; } 00327 ostm << endl; } } 00328 00356 bool CDataset::Open( const char* szAnswerFile, const char* szDataDirectory, const IBayesNet* pBayesNet ) { 00357 CDataPair Answers; 00358 00359 return ( Answers.Open( szAnswerFile, pBayesNet->IsContinuous( 0 ) ) && 00360 Open( Answers, szDataDirectory, pBayesNet ) ); } 00361 00362 bool CDatasetImpl::Open( const CDataPair* pAnswers, const char* szDataDir, const IBayesNet* pBayesNet ) { 00363 size_t i; 00364 vector<string> vecstrData, vecstrNodes; 00365 set<string> setstrGenes; 00366 set<string>::const_iterator iterGene; 00367 00368 Reset( ); 00369 m_fContinuous = pBayesNet->IsContinuous( ); 00370 pBayesNet->GetNodes( vecstrNodes ); 00371 m_veccQuants.resize( ( pAnswers ? 1 : 0 ) + OpenMax( szDataDir, vecstrNodes, !!pAnswers, 00372 vecstrData, &setstrGenes ) ); 00373 if( pAnswers ) { 00374 m_vecstrGenes.resize( pAnswers->GetGenes( ) ); 00375 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 00376 m_vecstrGenes[ i ] = pAnswers->GetGene( i ); } 00377 else { 00378 m_vecstrGenes.resize( setstrGenes.size( ) ); 00379 for( i = 0,iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++i,++iterGene ) 00380 m_vecstrGenes[ i ] = *iterGene; } 00381 m_apData = new void*[ m_veccQuants.size( ) ]; 00382 if( pAnswers && !CDatasetImpl::Open( *pAnswers, 0 ) ) 00383 return false; 00384 00385 for( i = 0; i < vecstrData.size( ); ++i ) { 00386 CDataPair Datum; 00387 00388 if( !( Datum.Open( vecstrData[ i ].c_str( ), pBayesNet->IsContinuous( i + 1 ) ) && 00389 CDatasetImpl::Open( Datum, i + ( pAnswers ? 1 : 0 ) ) ) ) 00390 return false; } 00391 00392 return true; } 00393 00394 bool CDatasetImpl::Open( const CDataPair& Datum, size_t iExp ) { 00395 vector<size_t> veciGenes; 00396 size_t i, j, iOne, iTwo; 00397 float d; 00398 00399 m_veccQuants[ iExp ] = Datum.IsContinuous( ) ? -1 : Datum.GetValues( ); 00400 veciGenes.resize( m_vecstrGenes.size( ) ); 00401 for( i = 0; i < veciGenes.size( ); ++i ) 00402 veciGenes[ i ] = Datum.GetGene( m_vecstrGenes[ i ] ); 00403 00404 if( Datum.IsContinuous( ) ) { 00405 CDistanceMatrix* pDatum; 00406 00407 pDatum = new CDistanceMatrix( ); 00408 pDatum->Initialize( m_vecstrGenes.size( ) ); 00409 for( i = 0; i < pDatum->GetSize( ); ++i ) 00410 for( j = ( i + 1 ); j < pDatum->GetSize( ); ++j ) 00411 pDatum->Set( i, j, CMeta::GetNaN( ) ); 00412 for( i = 0; i < veciGenes.size( ); ++i ) { 00413 if( ( iOne = veciGenes[ i ] ) == -1 ) 00414 continue; 00415 for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) 00416 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) && 00417 !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) ) 00418 pDatum->Set( i, j, d ); } 00419 m_apData[ iExp ] = pDatum; } 00420 else { 00421 CCompactMatrix* pDatum; 00422 00423 pDatum = new CCompactMatrix( ); 00424 pDatum->Initialize( m_vecstrGenes.size( ), m_veccQuants[ iExp ] + 1, true ); 00425 for( i = 0; i < veciGenes.size( ); ++i ) 00426 if( ( iOne = veciGenes[ i ] ) != -1 ) 00427 for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) 00428 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) && 00429 !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) ) 00430 pDatum->Set( i, j, (unsigned char)( Datum.Quantize( d ) + 1 ) ); 00431 m_apData[ iExp ] = pDatum; } 00432 00433 return true; } 00434 00450 bool CDataset::Open( const char* szAnswerFile, const std::vector<std::string>& vecstrDataFiles ) { 00451 size_t i; 00452 00453 Reset( ); 00454 m_veccQuants.resize( 1 + vecstrDataFiles.size( ) ); 00455 { 00456 CDataPair Answers; 00457 00458 if( !Answers.Open( szAnswerFile, true ) ) 00459 return false; 00460 00461 m_vecstrGenes.resize( Answers.GetGenes( ) ); 00462 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 00463 m_vecstrGenes[ i ] = Answers.GetGene( i ); 00464 m_apData = new void*[ m_veccQuants.size( ) ]; 00465 if( !CDatasetImpl::Open( Answers, 0 ) ) 00466 return false; 00467 } 00468 00469 m_veciMapping.resize( m_veccQuants.size( ) ); 00470 m_veciMapping[ 0 ] = 0; 00471 for( i = 1; i <= vecstrDataFiles.size( ); ++i ) { 00472 CDataPair Datum; 00473 00474 if( !Datum.Open( vecstrDataFiles[ i - 1 ].c_str( ), true ) || 00475 !CDatasetImpl::Open( Datum, i ) ) 00476 return false; 00477 m_veciMapping[ i ] = i; } 00478 00479 return true; } 00480 00494 bool CDataset::Open( const std::vector<std::string>& vecstrDataFiles ) { 00495 size_t i; 00496 00497 if( !OpenGenes( vecstrDataFiles ) ) 00498 return false; 00499 Reset( ); 00500 m_apData = new void*[ m_veccQuants.size( ) ]; 00501 00502 for( i = 0; i < vecstrDataFiles.size( ); ++i ) { 00503 CDataPair Datum; 00504 00505 if( !( Datum.Open( vecstrDataFiles[ i ].c_str( ), true ) && 00506 CDatasetImpl::Open( Datum, i ) ) ) 00507 return false; } 00508 00509 return true; } 00510 00511 float CDatasetImpl::GetContinuous( size_t iY, size_t iX, size_t iNode ) const { 00512 size_t iMap; 00513 00514 if( ( iMap = m_veciMapping[ iNode ] ) == -1 ) 00515 return CMeta::GetNaN( ); 00516 00517 return ( ( m_veccQuants[ iMap ] == (unsigned char)-1 ) ? 00518 ((CDistanceMatrix*)m_apData[ iMap ])->Get( iY, iX ) : 00519 ((CCompactMatrix*)m_apData[ iMap ])->Get( iY, iX ) ); } 00520 00521 size_t CDataset::GetDiscrete( size_t iY, size_t iX, size_t iNode ) const { 00522 size_t iMap; 00523 00524 if( ( iMap = m_veciMapping[ iNode ] ) == -1 ) 00525 return -1; 00526 00527 return ( ( m_veccQuants[ iMap ] == (unsigned char)-1 ) ? -1 : 00528 ( ((CCompactMatrix*)m_apData[ iMap ])->Get( iY, iX ) - 1 ) ); } 00529 00530 bool CDataset::IsExample( size_t iY, size_t iX ) const { 00531 size_t i; 00532 00533 for( i = 0; i < m_veccQuants.size( ); ++i ) 00534 if( m_veccQuants[ i ] == (unsigned char)-1 ) { 00535 if( !CMeta::IsNaN( ((CDistanceMatrix*)m_apData[ i ])->Get( iY, iX ) ) ) 00536 return true; } 00537 else if( ((CCompactMatrix*)m_apData[ i ])->Get( iY, iX ) ) 00538 return true; 00539 00540 return false; } 00541 00542 void CDataset::Remove( size_t iY, size_t iX ) { 00543 size_t i; 00544 00545 for( i = 0; i < m_veccQuants.size( ); ++i ) 00546 if( m_veccQuants[ i ] == (unsigned char)-1 ) 00547 ((CDistanceMatrix*)m_apData[ i ])->Set( iY, iX, CMeta::GetNaN( ) ); 00548 else 00549 ((CCompactMatrix*)m_apData[ i ])->Set( iY, iX, 0 ); } 00550 00551 // CDataOverlayImpl 00552 00553 const vector<string>& CDataOverlayImpl::GetGeneNames( ) const { 00554 00555 return m_pDataset->GetGeneNames( ); } 00556 00557 size_t CDataOverlayImpl::GetExperiments( ) const { 00558 00559 return m_pDataset->GetExperiments( ); } 00560 00561 size_t CDataOverlayImpl::GetGene( const std::string& strGene ) const { 00562 00563 return m_pDataset->GetGene( strGene ); } 00564 00565 size_t CDataOverlayImpl::GetBins( size_t iExp ) const { 00566 00567 return m_pDataset->GetBins( iExp ); } 00568 00569 size_t CDataOverlayImpl::GetGenes( ) const { 00570 00571 return m_pDataset->GetGenes( ); } 00572 00573 bool CDataOverlayImpl::IsHidden( size_t iNode ) const { 00574 00575 return m_pDataset->IsHidden( iNode ); } 00576 00577 size_t CDataOverlayImpl::GetDiscrete( size_t iX, size_t iY, size_t iNode ) const { 00578 00579 return m_pDataset->GetDiscrete( iX, iY, iNode ); } 00580 00581 float CDataOverlayImpl::GetContinuous( size_t iX, size_t iY, size_t iNode ) const { 00582 00583 return m_pDataset->GetContinuous( iX, iY, iNode ); } 00584 00585 const string& CDataOverlayImpl::GetGene( size_t iGene ) const { 00586 00587 return m_pDataset->GetGene( iGene ); } 00588 00589 void CDataOverlayImpl::Save( std::ostream& ostm, bool fBinary ) const { 00590 00591 m_pDataset->Save( ostm, fBinary ); } 00592 00593 // CDataFilter 00594 00617 void CDataFilter::Attach( const IDataset* pDataset, const CGenes& Genes, CDat::EFilter eFilter, 00618 const CDat* pAnswers ) { 00619 size_t i; 00620 00621 m_pDataset = pDataset; 00622 m_pGenes = &Genes; 00623 m_eFilter = eFilter; 00624 m_pAnswers = pAnswers; 00625 00626 m_vecfGenes.resize( GetGenes( ) ); 00627 for( i = 0; i < m_vecfGenes.size( ); ++i ) 00628 m_vecfGenes[ i ] = m_pGenes->IsGene( GetGene( i ) ); 00629 if( m_pAnswers ) { 00630 m_veciAnswers.resize( GetGenes( ) ); 00631 for( i = 0; i < m_veciAnswers.size( ); ++i ) 00632 m_veciAnswers[ i ] = m_pAnswers->GetGene( GetGene( i ) ); } } 00633 00634 bool CDataFilter::IsExample( size_t iY, size_t iX ) const { 00635 00636 if( !m_pDataset ) 00637 return false; 00638 if( !( m_pGenes && m_pGenes->GetGenes( ) ) ) 00639 return m_pDataset->IsExample( iY, iX ); 00640 00641 switch( m_eFilter ) { 00642 case CDat::EFilterInclude: 00643 if( !( m_vecfGenes[ iY ] && m_vecfGenes[ iX ] ) ) 00644 return false; 00645 break; 00646 00647 case CDat::EFilterExclude: 00648 if( m_vecfGenes[ iY ] || m_vecfGenes[ iX ] ) 00649 return false; 00650 break; 00651 00652 case CDat::EFilterEdge: 00653 if( !( m_vecfGenes[ iY ] || m_vecfGenes[ iX ] ) ) 00654 return false; 00655 break; 00656 00657 case CDat::EFilterTerm: 00658 if( ( m_pAnswers && ( ( m_veciAnswers[ iY ] == -1 ) || ( m_veciAnswers[ iX ] == -1 ) ) ) || 00659 ( !( m_vecfGenes[ iY ] && m_vecfGenes[ iX ] ) && 00660 ( !( m_vecfGenes[ iY ] || m_vecfGenes[ iX ] ) || ( m_pAnswers && 00661 ( m_pAnswers->Get( m_veciAnswers[ iY ], m_veciAnswers[ iX ] ) > 0 ) ) ) ) ) 00662 return false; 00663 break; } 00664 00665 return m_pDataset->IsExample( iY, iX ); } 00666 00667 // CDataMask 00668 00679 void CDataMask::AttachRandom( const IDataset* pDataset, float dFraction ) { 00680 size_t i, j; 00681 00682 Attach( pDataset ); 00683 for( i = 0; i < m_Mask.GetSize( ); ++i ) 00684 for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j ) 00685 m_Mask.Set( i, j, ( m_Mask.Get( i, j ) && ( ( (float)rand( ) / RAND_MAX ) < dFraction ) ) ); } 00686 00698 void CDataMask::AttachComplement( const CDataMask& DataMask ) { 00699 size_t i, j; 00700 00701 Attach( DataMask.m_pDataset ); 00702 for( i = 0; i < m_Mask.GetSize( ); ++i ) 00703 for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j ) 00704 m_Mask.Set( i, j, ( m_Mask.Get( i, j ) && !DataMask.m_Mask.Get( i, j ) ) ); } 00705 00713 void CDataMask::Attach( const IDataset* pDataset ) { 00714 size_t i, j; 00715 00716 m_pDataset = pDataset; 00717 m_Mask.Initialize( m_pDataset->GetGenes( ) ); 00718 for( i = 0; i < m_Mask.GetSize( ); ++i ) 00719 for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j ) 00720 m_Mask.Set( i, j, m_pDataset->IsExample( i, j ) ); } 00721 00722 // CDataSubset 00723 00749 bool CDataSubset::Initialize( const char* szDataDirectory, const IBayesNet* pBayesNet, size_t iGeneSize ) { 00750 size_t i; 00751 00752 m_iSize = iGeneSize; 00753 m_vecstrData.clear( ); 00754 m_fContinuous = pBayesNet->IsContinuous( ); 00755 { 00756 set<string> setstrGenes; 00757 set<string>::const_iterator iterGenes; 00758 vector<string> vecstrNodes; 00759 00760 pBayesNet->GetNodes( vecstrNodes ); 00761 OpenMax( szDataDirectory, vecstrNodes, false, m_vecstrData, &setstrGenes ); 00762 m_vecstrGenes.resize( setstrGenes.size( ) ); 00763 i = 0; 00764 for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes ) 00765 m_vecstrGenes[ i++ ] = *iterGenes; 00766 } 00767 m_Examples.Initialize( m_iSize, m_vecstrGenes.size( ) ); 00768 00769 return true; } 00770 00791 bool CDataSubset::Initialize( const std::vector<std::string>& vecstrDataFiles, size_t iGeneSize ) { 00792 size_t i; 00793 00794 m_iSize = iGeneSize; 00795 m_vecstrData.resize( vecstrDataFiles.size( ) ); 00796 m_veccQuants.resize( vecstrDataFiles.size( ) ); 00797 for( i = 0; i < vecstrDataFiles.size( ); ++i ) 00798 m_vecstrData[ i ] = vecstrDataFiles[ i ]; 00799 m_fContinuous = true; 00800 00801 if( !OpenGenes( vecstrDataFiles ) ) 00802 return false; 00803 m_Examples.Initialize( m_iSize, m_vecstrGenes.size( ) ); 00804 00805 return true; } 00806 00823 bool CDataSubset::Open( size_t iGeneOffset ) { 00824 size_t i, j; 00825 00826 m_iOffset = iGeneOffset; 00827 for( i = 0; i < m_Examples.GetRows( ); ++i ) 00828 for( j = 0; j < m_Examples.GetColumns( ); ++j ) 00829 m_Examples.Get( i, j ).Reset( ); 00830 00831 m_iSize = ( ( m_iOffset + m_Examples.GetRows( ) ) > m_vecstrGenes.size( ) ) ? 00832 ( m_vecstrGenes.size( ) - m_iOffset ) : m_Examples.GetRows( ); 00833 for( i = 0; i < m_vecstrData.size( ); ++i ) { 00834 CDataPair Datum; 00835 00836 if( !( Datum.Open( m_vecstrData[ i ].c_str( ), m_fContinuous ) && 00837 CDataSubsetImpl::Open( Datum, i ) ) ) 00838 return false; } 00839 00840 return true; } 00841 00842 bool CDataSubsetImpl::Open( const CDataPair& Datum, size_t iExp ) { 00843 vector<size_t> veciGenes; 00844 size_t i, j, iOne, iTwo; 00845 float d; 00846 00847 m_veccQuants[ iExp ] = Datum.IsContinuous( ) ? -1 : Datum.GetValues( ); 00848 veciGenes.resize( m_vecstrGenes.size( ) ); 00849 for( i = 0; i < veciGenes.size( ); ++i ) 00850 veciGenes[ i ] = Datum.GetGene( m_vecstrGenes[ i ] ); 00851 00852 for( i = 0; i < m_iSize; ++i ) { 00853 if( ( iOne = veciGenes[ i + m_iOffset ] ) == -1 ) 00854 continue; 00855 for( j = 0; j < veciGenes.size( ); ++j ) 00856 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) && 00857 !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) ) 00858 m_Examples.Get( i, j ).Set( iExp, d, Datum, m_vecstrData.size( ) ); } 00859 00860 return true; } 00861 00862 }