Sleipnir
src/datasetcompact.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "dataset.h"
00024 #include "bayesnetint.h"
00025 #include "genome.h"
00026 #include "compactmatrix.h"
00027 
00028 namespace Sleipnir {
00029 
00030 CDatasetCompactImpl::CDatasetCompactImpl( ) : m_iData(0), m_aData(NULL) {
00031 
00032     m_fContinuous = false; }
00033 
00034 CDatasetCompactImpl::~CDatasetCompactImpl( ) {
00035 
00036     if( m_aData )
00037         delete[] m_aData; }
00038 
00054 bool CDatasetCompact::Open( const std::vector<std::string>& vecstrDataFiles, bool fMemmap ) {
00055     size_t  i;
00056 
00057     if( !OpenGenes( vecstrDataFiles ) )
00058         return false;
00059     if( m_aData )
00060         delete[] m_aData;
00061     m_aData = new CCompactMatrix[ m_iData = (uint32_t)vecstrDataFiles.size( ) ];
00062 
00063     for( i = 0; i < vecstrDataFiles.size( ); ++i ) {
00064         CDataPair   Datum;
00065 
00066         if( !( Datum.Open( vecstrDataFiles[ i ].c_str( ), false, fMemmap ) &&
00067             CDatasetCompactImpl::Open( Datum, i ) ) )
00068             return false; }
00069 
00070     return true; }
00071 
00072 struct SIsGene {
00073     const CGenes&   m_Genes;
00074     bool            m_fIn;
00075 
00076     SIsGene( const CGenes& Genes, bool fIn ) : m_Genes(Genes), m_fIn(fIn) { }
00077 
00078     bool operator()( const string& strGene ) {
00079 
00080         return ( m_fIn == m_Genes.IsGene( strGene ) ); }
00081 };
00082 
00112 bool CDatasetCompact::Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet,
00113     bool fEverything ) {
00114     CGenome Genome;
00115     CGenes  GenesIn( Genome ), GenesEx( Genome );
00116 
00117     return Open( Answers, szDataDirectory, pBayesNet, GenesIn, GenesEx, fEverything ); }
00118 
00154 bool CDatasetCompact::Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet,
00155     const CGenes& GenesInclude, const CGenes& GenesExclude, bool fEverything ) {
00156     size_t          i;
00157     vector<string>  vecstrData, vecstrNodes;
00158     set<string>     setstrGenes;
00159 
00160     if( pBayesNet->IsContinuous( ) )
00161         return false;
00162 
00163     pBayesNet->GetNodes( vecstrNodes );
00164     m_iData = 1 + (uint32_t)OpenMax( szDataDirectory, vecstrNodes, true, vecstrData, fEverything ?
00165         &setstrGenes : NULL );
00166     m_veccQuants.resize( m_iData );
00167     if( m_aData )
00168         delete[] m_aData;
00169     m_aData = new CCompactMatrix[ m_iData ];
00170 
00171     if( fEverything ) {
00172         m_vecstrGenes.resize( setstrGenes.size( ) );
00173         copy( setstrGenes.begin( ), setstrGenes.end( ), m_vecstrGenes.begin( ) ); }
00174     else {
00175         m_vecstrGenes.resize( Answers.GetGenes( ) );
00176         for( i = 0; i < m_vecstrGenes.size( ); ++i )
00177             m_vecstrGenes[ i ] = Answers.GetGene( i ); }
00178     if( GenesInclude.GetGenes( ) )
00179         remove_if( m_vecstrGenes.begin( ), m_vecstrGenes.end( ), SIsGene( GenesInclude, false ) );
00180     if( GenesExclude.GetGenes( ) )
00181         remove_if( m_vecstrGenes.begin( ), m_vecstrGenes.end( ), SIsGene( GenesExclude, true ) );
00182 
00183     if( !CDatasetCompactImpl::Open( Answers, 0 ) )
00184         return false;
00185     for( i = 0; i < vecstrData.size( ); ++i ) {
00186         CDataPair   Datum;
00187 
00188         if( !( Datum.Open( vecstrData[ i ].c_str( ), false ) &&
00189             CDatasetCompactImpl::Open( Datum, i + 1 ) ) )
00190             return false; }
00191 
00192 /*
00193     for( i = 0; i < m_vecstrGenes.size( ); ++i )
00194         for( j = ( i + 1 ); j < m_vecstrGenes.size( ); ++j ) {
00195             for( k = 1; k < m_iData; ++k )
00196                 if( m_aData[ k ].Get( i, j ) )
00197                     break;
00198             if( k >= m_iData )
00199                 m_aData[ 0 ].Set( i, j, 0 ); }
00200 */
00201 
00202     return true; }
00203 
00235 bool CDatasetCompact::Open( const CDataPair& Answers, const std::vector<std::string>& vecstrDataFiles,
00236     bool fEverything, bool fMemmap, size_t iSkip, bool fZScore ) {
00237     size_t  i, j, k;
00238 
00239     if( Answers.GetGenes( ) && Answers.IsContinuous( ) )
00240         return false;
00241 
00242     m_veciMapping.resize( m_iData = 1 + vecstrDataFiles.size( ) );
00243     for( i = 0; i < m_veciMapping.size( ); ++i )
00244         m_veciMapping[ i ] = i;
00245     m_veccQuants.resize( m_iData );
00246     if( m_aData )
00247         delete[] m_aData;
00248     m_aData = new CCompactMatrix[ m_iData ];
00249 
00250     if( fEverything ) {
00251         set<string> setstrGenes;
00252 
00253         for( i = 0; i < Answers.GetGenes( ); ++i )
00254             setstrGenes.insert( Answers.GetGene( i ) );
00255         for( i = 0; i < vecstrDataFiles.size( ); ++i ) {
00256             CDat    Dat;
00257 
00258             if( !Dat.OpenGenes( vecstrDataFiles[ i ].c_str( ), iSkip ) )
00259                     return false;
00260             for( j = 0; j < Dat.GetGenes( ); ++j )
00261                 setstrGenes.insert( Dat.GetGene( j ) ); }
00262         m_vecstrGenes.resize( setstrGenes.size( ) );
00263         copy( setstrGenes.begin( ), setstrGenes.end( ), m_vecstrGenes.begin( ) ); }
00264     else {
00265         m_vecstrGenes.resize( Answers.GetGenes( ) );
00266         for( i = 0; i < m_vecstrGenes.size( ); ++i )
00267             m_vecstrGenes[ i ] = Answers.GetGene( i ); }
00268 
00269     if( !CDatasetCompactImpl::Open( Answers, 0 ) )
00270         return false;
00271     for( i = 0; i < vecstrDataFiles.size( ); ++i ) {
00272         CDataPair   Datum;
00273 
00274         if( !( Datum.Open( vecstrDataFiles[ i ].c_str( ), false, fMemmap, iSkip, fZScore ) &&
00275             CDatasetCompactImpl::Open( Datum, i + 1 ) ) )
00276             return false; }
00277 
00278     if( !fEverything && ( m_iData > 1 ) )
00279         for( i = 0; i < m_vecstrGenes.size( ); ++i )
00280             for( j = ( i + 1 ); j < m_vecstrGenes.size( ); ++j ) {
00281                 for( k = 1; k < m_iData; ++k )
00282                     if( m_aData[ k ].Get( i, j ) )
00283                         break;
00284                 if( k >= m_iData )
00285                     m_aData[ 0 ].Set( i, j, 0 ); }
00286 
00287     return true; }
00288 
00289 bool CDatasetCompactImpl::Open( const CDataPair& Datum, size_t iExp ) {
00290     vector<size_t>  veciGenes;
00291     size_t          i, j, iOne, iTwo;
00292     float           d;
00293     CCompactMatrix& Target  = m_aData[ iExp ];
00294 
00295     m_veccQuants[ iExp ] = Datum.IsContinuous( ) ? -1 : Datum.GetValues( );
00296     Target.Initialize( m_vecstrGenes.size( ), (unsigned char)( Datum.GetValues( ) + 1 ),
00297         true );
00298     veciGenes.resize( m_vecstrGenes.size( ) );
00299     for( i = 0; i < veciGenes.size( ); ++i )
00300         veciGenes[ i ] = Datum.GetGene( m_vecstrGenes[ i ] );
00301 
00302     for( i = 0; i < veciGenes.size( ); ++i )
00303         if( ( iOne = veciGenes[ i ] ) != -1 )
00304             for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
00305                 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) &&
00306                     !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) )
00307                     Target.Set( i, j, (unsigned char)( Datum.Quantize( d ) + 1 ) );
00308 
00309     return true; }
00310 
00311 bool CDatasetCompactImpl::Open( const char* szDataDir, const IBayesNet* pBayesNet,
00312     const CGenes* pGenesIn, const CGenes* pGenesEx ) {
00313     size_t                      i;
00314     vector<string>              vecstrData, vecstrNodes;
00315     set<string>                 setstrGenes;
00316     set<string>::const_iterator iterGenes;
00317 
00318     if( pBayesNet->IsContinuous( ) )
00319         return false;
00320 
00321     pBayesNet->GetNodes( vecstrNodes );
00322     m_iData = (uint32_t)OpenMax( szDataDir, vecstrNodes, false, vecstrData, &setstrGenes );
00323     m_veccQuants.resize( m_iData );
00324     if( pGenesIn )
00325         for( i = 0; i < pGenesIn->GetGenes( ); ++i )
00326             setstrGenes.insert( pGenesIn->GetGene( i ).GetName( ) );
00327     if( pGenesEx )
00328         for( i = 0; i < pGenesEx->GetGenes( ); ++i )
00329             setstrGenes.erase( pGenesEx->GetGene( i ).GetName( ) );
00330     m_vecstrGenes.resize( setstrGenes.size( ) );
00331     for( i = 0,iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( );
00332         ++iterGenes )
00333         m_vecstrGenes[ i++ ] = *iterGenes;
00334 
00335     if( m_aData )
00336         delete[] m_aData;
00337     m_aData = new CCompactMatrix[ m_iData ];
00338 
00339     for( i = 0; i < vecstrData.size( ); ++i ) {
00340         CDataPair   Datum;
00341 
00342         if( !( Datum.Open( vecstrData[ i ].c_str( ), false ) &&
00343             CDatasetCompactImpl::Open( Datum, i ) ) )
00344             return false; }
00345 
00346     return true; }
00347 
00367 bool CDatasetCompact::FilterGenes( const char* szGenes, CDat::EFilter eFilter ) {
00368     ifstream    ifsm;
00369     CGenome     Genome;
00370     CGenes      Genes( Genome );
00371 
00372     ifsm.open( szGenes );
00373     if( !( ifsm.is_open( ) && Genes.Open( ifsm ) ) )
00374         return false;
00375     FilterGenes( Genes, eFilter );
00376 
00377     return true; }
00378 
00386 void CDatasetCompact::FilterAnswers( ) {
00387     size_t  i, j;
00388 
00389     for( i = 0; i < GetGenes( ); ++i )
00390         for( j = ( i + 1 ); j < GetGenes( ); ++j )
00391             if( IsExample( i, j ) && ( GetDiscrete( i, j, 0 ) == -1 ) )
00392                 Remove( i, j ); }
00393 
00394 size_t CDatasetCompactImpl::GetDiscrete( size_t iX, size_t iY, size_t iNode ) const {
00395     size_t  iMap;
00396 
00397     if( ( iMap = m_veciMapping[ iNode ] ) == -1 )
00398         return -1;
00399 
00400     return ( m_aData[ iMap ].Get( iX, iY ) - 1 ); }
00401 
00402 bool CDatasetCompactImpl::IsExample( size_t iX, size_t iY ) const {
00403     size_t  i;
00404 
00405     for( i = 0; i < m_iData; ++i )
00406         if( m_aData[ i ].Get( iX, iY ) )
00407             return true;
00408 
00409     return false; }
00410 
00411 void CDatasetCompactImpl::Remove( size_t iX, size_t iY ) {
00412     size_t  i;
00413 
00414     for( i = 0; i < m_iData; ++i )
00415         m_aData[ i ].Set( iX, iY, 0 ); }
00416 
00417 bool CDatasetCompactImpl::Open( const unsigned char* pbData ) {
00418     size_t  i;
00419 
00420     if( m_aData )
00421         delete[] m_aData;
00422 
00423     if( !( pbData = CDataImpl::OpenBinary( pbData ) ) )
00424         return false;
00425     m_iData = *(uint32_t*)pbData;
00426     pbData += sizeof(m_iData);
00427     m_aData = new CCompactMatrix[ m_iData ];
00428     for( i = 0; i < m_iData; ++i )
00429         if( !( pbData = m_aData[ i ].Open( pbData ) ) )
00430             return false;
00431 
00432     return true; }
00433 
00447 bool CDatasetCompact::Open( std::istream& istm ) {
00448     size_t  i;
00449 
00450     if( m_aData )
00451         delete[] m_aData;
00452 
00453     if( !CDataImpl::OpenBinary( istm ) )
00454         return false;
00455     istm.read( (char*)&m_iData, sizeof(m_iData) );
00456     m_aData = new CCompactMatrix[ m_iData ];
00457     for( i = 0; i < m_iData; ++i )
00458         if( !m_aData[ i ].Open( istm ) )
00459             return false;
00460 
00461     return true; }
00462 
00463 void CDatasetCompactImpl::SaveBinary( std::ostream& ostm ) const {
00464     size_t  i;
00465 
00466     CDataImpl::SaveBinary( ostm );
00467     ostm.write( (char*)&m_iData, sizeof(m_iData) );
00468     for( i = 0; i < m_iData; ++i )
00469         m_aData[ i ].Save( ostm ); }
00470 
00471 void CDatasetCompactImpl::SaveText( std::ostream& ostm ) const {
00472     size_t  i, j, k, iVal;
00473 
00474     for( i = 0; i < GetGenes( ); ++i )
00475         for( j = ( i + 1 ); j < GetGenes( ); ++j )
00476             if( IsExample( i, j ) ) {
00477                 ostm << GetGene( i ) << '\t' << GetGene( j );
00478                 for( k = 0; k < GetExperiments( ); ++k ) {
00479                     ostm << '\t';
00480                     if( ( iVal = GetDiscrete( i, j, k ) ) == -1 )
00481                         ostm << "-1";
00482                     else
00483                         ostm << iVal; }
00484                 ostm << endl; } }
00485 
00525 bool CDatasetCompact::Open( const CGenes& GenesInclude, const CGenes& GenesExclude, const CDataPair& Answers,
00526     const std::vector<std::string>& vecstrPCLs, size_t iSkip, const IMeasure* pMeasure,
00527     const std::vector<float>& vecdBinEdges ) {
00528     size_t                  i, j, iPCL;
00529     set<string>             setstrGenes;
00530     set<string>::iterator   iterGene;
00531 
00532     g_CatSleipnir( ).notice( "CDatasetCompact::Open( %d ) opening PCL files",
00533         iSkip );
00534 
00535     m_veciMapping.resize( m_iData = 1 + (uint32_t)vecstrPCLs.size( ) );
00536     for( i = 0; i < m_veciMapping.size( ); ++i )
00537         m_veciMapping[ i ] = i;
00538     m_veccQuants.resize( m_iData );
00539     m_veccQuants[ 0 ] = Answers.GetValues( );
00540     for( i = 1; i < m_veccQuants.size( ); ++i )
00541         m_veccQuants[ i ] = (unsigned char)vecdBinEdges.size( );
00542 
00543     for( i = 0; i < Answers.GetGenes( ); ++i )
00544         setstrGenes.insert( Answers.GetGene( i ) );
00545     for( iPCL = 0; iPCL < vecstrPCLs.size( ); ++iPCL ) {
00546         ifstream    ifsm;
00547 
00548         ifsm.open( vecstrPCLs[ iPCL ].c_str( ) );
00549         if( !CDataImpl::OpenGenes( ifsm, false, true, setstrGenes ) ) {
00550             g_CatSleipnir( ).error( "CDatasetCompact::Open( %d ) could not open: %s", iSkip,
00551                 vecstrPCLs[ iPCL ].c_str( ) );
00552             return false; } }
00553     if( GenesInclude.GetGenes( ) ) {
00554         for( iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++iterGene )
00555             if( !GenesInclude.IsGene( *iterGene ) )
00556                 setstrGenes.erase( iterGene );
00557         for( i = 0; i < GenesInclude.GetGenes( ); ++i )
00558             setstrGenes.insert( GenesInclude.GetGene( i ).GetName( ) ); }
00559     if( GenesExclude.GetGenes( ) )
00560         for( i = 0; i < GenesExclude.GetGenes( ); ++i )
00561             setstrGenes.erase( GenesExclude.GetGene( i ).GetName( ) );
00562     m_vecstrGenes.resize( setstrGenes.size( ) );
00563     copy( setstrGenes.begin( ), setstrGenes.end( ), m_vecstrGenes.begin( ) );
00564 
00565     if( m_aData )
00566         delete[] m_aData;
00567     m_aData = new CCompactMatrix[ m_iData ];
00568     if( !CDatasetCompactImpl::Open( Answers, 0 ) )
00569         return false;
00570 
00571     for( iPCL = 0; iPCL < vecstrPCLs.size( ); ++iPCL ) {
00572         CPCL            PCL;
00573         ifstream        ifsm;
00574         CDistanceMatrix Dist;
00575         CDataPair       Datum;
00576         vector<size_t>  veciGenes;
00577         vector<string>  vecstrGenes;
00578         size_t          iGenes, iOne, iTwo;
00579         const float*    adOne;
00580 
00581         g_CatSleipnir( ).notice( "CDatasetCompact::Open( %d ) opening: %s", iSkip, vecstrPCLs[ iPCL ].c_str( ) );
00582         ifsm.open( vecstrPCLs[ iPCL ].c_str( ) );
00583         if( !PCL.Open( ifsm, iSkip ) ) {
00584             g_CatSleipnir( ).error( "CDatasetCompact::Open( %d ) could not open: %s", iSkip, vecstrPCLs[ iPCL ].c_str( ) );
00585             return 1; }
00586         if( pMeasure->IsRank( ) )
00587             PCL.RankTransform( );
00588 
00589         veciGenes.resize( PCL.GetGenes( ) );
00590         if( GenesInclude.GetGenes( ) || GenesExclude.GetGenes( ) )
00591             for( i = 0; i < PCL.GetGenes( ); ++i ) {
00592                 const string&   strGene = PCL.GetGene( i );
00593 
00594                 if( GenesExclude.GetGenes( ) && GenesExclude.IsGene( strGene ) )
00595                     veciGenes[ i ] = -1;
00596                 else if( GenesInclude.GetGenes( ) )
00597                     veciGenes[ i ] = (unsigned int)( GenesInclude.IsGene( strGene ) ? iGenes++ : -1 );
00598                 else
00599                     veciGenes[ i ] = (unsigned int)iGenes++;
00600                 if( veciGenes[ i ] != -1 )
00601                     vecstrGenes.push_back( strGene ); }
00602         else {
00603             vecstrGenes.resize( PCL.GetGenes( ) );
00604             copy( PCL.GetGeneNames( ).begin( ), PCL.GetGeneNames( ).end( ), vecstrGenes.begin( ) );
00605             for( i = 0; i < veciGenes.size( ); ++i )
00606                 veciGenes[ i ] = i; }
00607         Dist.Initialize( vecstrGenes.size( ) );
00608         for( i = 0; i < Dist.GetSize( ); ++i )
00609             for( j = ( i + 1 ); j < Dist.GetSize( ); ++j )
00610                 Dist.Set( i, j, CMeta::GetNaN( ) );
00611         for( i = 0; i < PCL.GetGenes( ); ++i ) {
00612             if( ( iOne = veciGenes[ i ] ) == -1 )
00613                 continue;
00614             adOne = PCL.Get( i );
00615             for( j = ( i + 1 ); j < PCL.GetGenes( ); ++j )
00616                 if( ( iTwo = veciGenes[ j ] ) != -1 )
00617                     Dist.Set( iOne, iTwo, (float)pMeasure->Measure( adOne, PCL.GetExperiments( ), PCL.Get( j ),
00618                         PCL.GetExperiments( ) ) ); }
00619 
00620         Datum.Open( vecstrGenes, Dist );
00621         Datum.Normalize( CDat::ENormalizeZScore );
00622         Datum.SetQuants( vecdBinEdges );
00623         if( !CDatasetCompactImpl::Open( Datum, iPCL + 1 ) )
00624             return false; }
00625 
00626     return true; }
00627 
00638 void CDatasetCompact::Randomize( ) {
00639     size_t  i;
00640 
00641     if( !m_aData )
00642         return;
00643 
00644     for( i = 1; i < m_iData; ++i )
00645         m_aData[ i ].Randomize( ); }
00646 
00647 CDatasetCompactMap::CDatasetCompactMap( ) : m_pbData(NULL), m_hndlMap(0) { }
00648 
00649 CDatasetCompactMap::~CDatasetCompactMap( ) {
00650 
00651     CMeta::Unmap( m_pbData, m_hndlMap, m_iData ); }
00652 
00663 bool CDatasetCompactMap::Open( const char* szFile ) {
00664     size_t  i, j;
00665 
00666     CMeta::MapRead( m_pbData, m_hndlMap, m_iData, szFile );
00667     if( !CDatasetCompactImpl::Open( m_pbData ) ) {
00668         CMeta::Unmap( m_pbData, m_hndlMap, m_iData );
00669         return false; }
00670 
00671     m_Mask.Initialize( GetGenes( ) );
00672     for( i = 0; i < m_Mask.GetSize( ); ++i )
00673         for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j )
00674             m_Mask.Set( i, j, CDatasetCompact::IsExample( i, j ) );
00675     return true; }
00676 
00677 }