Sleipnir
src/datapair.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "datapair.h"
00024 #include "meta.h"
00025 #include "genome.h"
00026 #include "math.h"
00027 
00028 namespace Sleipnir {
00029 
00030 const char  CPairImpl::c_szQuantExt[]   = ".quant";
00031 const char   CDataPairImpl::c_acQdab[]   = ".qdab";
00032 
00033 bool CPairImpl::Open( const char* szDatafile, std::ifstream& ifsm ) {
00034     string      strToken;
00035     const char* pc;
00036 
00037     strToken = szDatafile;
00038     strToken += c_szQuantExt;
00039     ifsm.open( strToken.c_str( ) );
00040     if( !ifsm.is_open( ) ) {
00041         if( !( pc = strrchr( szDatafile, '.' ) ) ) {
00042             g_CatSleipnir( ).error( "CPairImpl::Open( %s ) could not replace extension for quant file",
00043                 szDatafile );
00044             return false; }
00045         strToken = szDatafile;
00046         strToken.resize( pc - szDatafile );
00047         strToken += c_szQuantExt;
00048         ifsm.clear( );
00049         ifsm.open( strToken.c_str( ) );
00050         if( !ifsm.is_open( ) ) {
00051             g_CatSleipnir( ).error( "CPairImpl::Open( %s ) could not open quant file: %s", szDatafile,
00052                 strToken.c_str( ) );
00053             return false; } }
00054 
00055     return true; }
00056 
00057 bool CPairImpl::Open( const char* szLine, vector<float>& vecdQuant ) {
00058     vector<string>  vecstrQuant;
00059     size_t          i;
00060 
00061     CMeta::Tokenize( szLine, vecstrQuant, CMeta::c_szWS, true );
00062     vecdQuant.resize( vecstrQuant.size( ) );
00063     for( i = 0; i < vecdQuant.size( ); ++i )
00064         vecdQuant[ i ] = (float)atof( vecstrQuant[ i ].c_str( ) );
00065 
00066     return true; }
00067 
00084 bool CDataPair::Open( const CSlim& Slim ) {
00085     m_fQuantized = false;
00086     Reset( false );
00087     return CDat::Open( Slim ); }
00088 
00114 bool CDataPair::Open( const char* szDatafile, bool fContinuous, bool fMemmap, size_t iSkip,
00115     bool fZScore, bool fSeek ) {
00116 
00117 
00118     g_CatSleipnir( ).notice( "CDataPair::Open( %s, %d )", szDatafile, fContinuous );
00119 
00120     Reset( fContinuous );
00121     m_fQuantized = false;
00122 
00123     const char* file_ext = NULL;
00124 
00125     if((file_ext = strstr(szDatafile, c_acQdab)) != NULL){
00126 
00127       return OpenQdab( szDatafile );
00128     }
00129     else{
00130         if( m_fContinuous ? true : OpenQuants( szDatafile ) ) {
00131             if( CDat::Open( szDatafile, fMemmap, iSkip, fZScore, false, fSeek ) ) {
00132                 return true;
00133             }
00134         }
00135         return false;
00136     }
00137 }
00138 
00139 bool CDataPairImpl::OpenQdab( const char* szDatafile ){
00140   size_t    iTotal, i, j, num_bins, num_bits, iPos;
00141   float*    adScores;
00142   unsigned char tmp;
00143   float* bounds;
00144   unsigned char btmpf;
00145   unsigned char btmpb;
00146   
00147   unsigned char bufferA;
00148   unsigned char bufferB;
00149   ifstream        istm;
00150   
00151   float nan_val;
00152 
00153   g_CatSleipnir( ).notice( "CDataPair::OpenQdab( %s )", szDatafile );
00154 
00155   istm.open( szDatafile, ios_base::binary );
00156   
00157   if( !CDat::OpenGenes( istm, true, false ) )
00158     return false;
00159   m_Data.Initialize( GetGenes( ) );
00160   
00161   // read the number of bins 
00162   istm.read((char*)&tmp, sizeof(char));
00163   num_bins = (size_t)tmp;
00164   
00165   //read the bin boundaries
00166   bounds = new float[num_bins];
00167   istm.read((char*)bounds, sizeof(float) * num_bins);
00168   
00169   // set quant values
00170   SetQuants( bounds, num_bins );
00171   
00172   // number of bits required for each bin representation
00173   num_bits = (size_t)ceil(log( num_bins ) / log ( 2.0 ));   
00174 
00175   // add one more bit for NaN case
00176   if( pow(2, num_bits) == num_bins )
00177     ++num_bits;
00178   
00179   // set nan value
00180   nan_val = pow(2, num_bits) -1;
00181   
00182   // read the data  
00183   adScores = new float[ GetGenes( ) - 1 ];
00184   
00185   istm.read( (char*)&bufferA, sizeof(bufferA));
00186   istm.read( (char*)&bufferB, sizeof(bufferB));
00187   
00188   for( iTotal = i = 0; ( i + 1 ) < GetGenes( ); ++i ) {
00189     for(j = 0; j < ( GetGenes( ) - i - 1 ); ++j){
00190       iPos = (iTotal * num_bits) % 8;
00191       
00192       // check bit data overflow??
00193       if( iPos + num_bits > 8){
00194     btmpb = (bufferA << iPos);
00195     btmpf = (bufferB >> (16 - num_bits - iPos)) << (8-num_bits);
00196     adScores[j] = (float)((btmpb | btmpf) >> (8 - num_bits));
00197     ++iTotal;
00198     bufferA = bufferB;
00199     istm.read( (char*)&bufferB, sizeof(bufferB));
00200       }
00201       else{
00202     adScores[j] = (((bufferA << iPos) & 0xFF) >> (8 - num_bits));
00203     ++iTotal;
00204     if( iPos + num_bits == 8 ) {
00205         bufferA = bufferB;
00206         istm.read( (char*)&bufferB, sizeof(bufferB));
00207         }
00208 
00209       }
00210       
00211       // check if value added was promised 2^#bits -1 (NaN value)
00212       if(adScores[j] == nan_val)
00213     adScores[j] =  CMeta::GetNaN( );
00214       
00215     }    
00216     
00217     CDat::Set( i, adScores ); 
00218   }
00219   
00220   istm.close();
00221   
00222   delete[] adScores;
00223   delete[] bounds;
00224 
00225   m_fQuantized = true;
00226 
00227   return true; 
00228 }
00229   
00230   
00231 bool CDataPair::Open( const CDat& dat ) {
00232     m_fContinuous = true;   
00233     Reset( true );
00234     if( !CDat::Open( dat ) ) return false;
00235 }
00236 
00251 bool CDataPair::OpenQuants( const char* szDatafile ) {
00252     static const size_t c_iBuf  = 8192;
00253     char        szBuf[ c_iBuf ];
00254     ifstream    ifsm;
00255 
00256     if( !CPairImpl::Open( szDatafile, ifsm ) ){
00257         fprintf(stderr, "Quant file does not exist or error opening it.\n");
00258         return false;
00259     }
00260     ifsm.getline( szBuf, c_iBuf - 1 );
00261     ifsm.close( );
00262     return CPairImpl::Open( szBuf, m_vecdQuant ); }
00263 
00282 size_t CDataPair::Quantize( float dValue ) const {
00283     if ( m_fQuantized ) 
00284         return (size_t)dValue;
00285     else
00286         return CMeta::Quantize( dValue, m_vecdQuant ); }
00287 
00288 
00289 void CDataPair::Quantize() {
00290     if ( m_fQuantized )
00291         return;
00292     for( size_t i = 0; i < GetGenes( ); ++i ) {
00293                 for( size_t j = ( i + 1 ); j < GetGenes( ); ++j ) {
00294             float d = Get( i, j );
00295             if( CMeta::IsNaN( d ) ) continue;
00296 
00297             Set( i, j, Quantize( d ) );
00298         }
00299     }
00300     m_fQuantized = true;
00301 }
00302 
00303 
00326 size_t CDataPair::Quantize( size_t iY, size_t iX, size_t iZero ) const {
00327     float d;
00328     if( iY == -1 || iX == -1 ) {
00329     return -1;
00330     }
00331     else if( CMeta::IsNaN( (d = Get( iY, iX )) ) ) {
00332     return iZero;
00333     }
00334     else {
00335     return Quantize(d); 
00336     }
00337 }
00338 
00339 
00340 
00341 
00342 void CDataPairImpl::Reset( bool fContinuous ) {
00343 
00344     m_vecdQuant.clear( );
00345     m_fContinuous = fContinuous; }
00346 
00360 void CDataPairImpl::SetQuants( const float* adBinEdges, size_t iBins ) {
00361 
00362     Reset( false );
00363     m_vecdQuant.resize( iBins );
00364     copy( adBinEdges, adBinEdges + iBins, m_vecdQuant.begin( ) ); }
00365 
00366 
00377 void CDataPair::SetQuants( const vector<float>& vecdBinEdges ) {
00378 
00379     Reset( false );
00380     m_vecdQuant.resize( vecdBinEdges.size( ) );
00381     copy( vecdBinEdges.begin( ), vecdBinEdges.end( ), m_vecdQuant.begin( ) ); }
00382 
00383 void CDataPair::Save( const char* szFile ) const {
00384 
00385     if( !strcmp( szFile + strlen( szFile ) - strlen( "qdab" ), "qdab" ) ) {
00386         g_CatSleipnir().info( "CDataPair::Save( ) qdab file: %s", szFile ); 
00387         ofstream ofsm;
00388         ofsm.open( szFile, ios_base::binary );      
00389         CDat::SaveGenes( ofsm );
00390         
00391         unsigned char bins = (unsigned char)m_vecdQuant.size();     
00392         size_t bit_len = (size_t)ceil( log((size_t)bins)/log(2) );
00393 
00394         //reserve largest value for NaN     
00395         if( (size_t)pow(2, bit_len) == (size_t)bins )
00396             bit_len++;
00397 
00398         //NaN = 2^N - 1
00399         size_t nan_val = (size_t)pow(2, bit_len) -1;
00400 
00401         //write out total bins, bin boundaries
00402         ofsm.write( (char*)&bins, sizeof(bins) );
00403         for( size_t i =0; i < m_vecdQuant.size(); i++ ) {
00404             float b = m_vecdQuant[i];
00405             ofsm.write( (char*)&b, sizeof(b) );
00406         }
00407     
00408         size_t offset = 0, byte_count = 0;
00409         unsigned char cur_byte = 0;
00410 
00411                 for( size_t i = 0; i < GetGenes( ); ++i ) {
00412                         for( size_t j = ( i + 1 ); j < GetGenes( ); ++j ) {
00413                                 unsigned char d = (unsigned char)Get( i, j );
00414                 if( CMeta::IsNaN( Get(i,j) ) ) {
00415                     d = nan_val;
00416                 }
00417                 //check if fits in one byte
00418                 if( offset + bit_len <= 8 ) { 
00419                     cur_byte = cur_byte | (d << (8-bit_len-offset));
00420                     offset = offset + bit_len;
00421                     if( offset == 8 ) {
00422                         ofsm.write((char*)&cur_byte, 1);
00423                         cur_byte = offset = 0;
00424                         byte_count++;
00425                     }
00426                 }
00427                 else {
00428                     //spans byte boundary; offset+bit_len > 8
00429                     cur_byte = cur_byte | (d >> (offset+bit_len-8));
00430                     ofsm.write((char*)&cur_byte, 1);
00431                     cur_byte = d << (16-offset-bit_len);
00432                     offset = offset+bit_len-8;
00433                     byte_count++;
00434                 }   
00435             }
00436         }
00437         size_t bytes_req = (size_t)ceil((GetGenes()*(GetGenes()-1)/2.0) * bit_len / 8.0);
00438         if( byte_count < bytes_req ) {
00439             ofsm.write((char*)&cur_byte, 1);
00440         }   
00441         //g_CatSleipnir().info( "CDataPair::Save( ) byte count: %s", byte_count );
00442     }
00443     else {
00444             CDat::Save( szFile );
00445     }
00446 }
00447 
00464 bool CPCLPair::Open( const char* szDatafile, size_t iSkip ) {
00465     static const size_t c_iBuf  = 8192;
00466     char        szBuf[ c_iBuf ];
00467     ifstream    ifsm;
00468     size_t      i;
00469 
00470     g_CatSleipnir( ).notice( "CPCLPair::Open( %s )", szDatafile );
00471     
00472     if( !CPCL::Open( szDatafile, iSkip ) )
00473         return false;
00474     
00475     if( !CPairImpl::Open( szDatafile, ifsm ) )
00476         return false;
00477 
00478     m_vecvecdQuants.resize( GetExperiments( ) );
00479     for( i = 0; i < m_vecvecdQuants.size( ); ++i ) {
00480         if( ifsm.eof( ) ) {
00481             g_CatSleipnir( ).error( "CPCLPair::Open( %s, %d ) invalid quant file", szDatafile, iSkip );
00482             return false; }
00483         ifsm.getline( szBuf, c_iBuf - 1 );
00484         if( !CPairImpl::Open( szBuf, m_vecvecdQuants[ i ] ) )
00485             return false; }
00486 
00487     return true; }
00488 
00505 size_t CPCLPair::Quantize( float dValue, size_t iExperiment ) const {
00506 
00507     return CMeta::Quantize( dValue, m_vecvecdQuants[ iExperiment ] ); }
00508 
00509 void CPCLPair::Quantize() {
00510   for( size_t i = 0; i < GetGenes( ); ++i ) {
00511     for( size_t j = 0; j < GetExperiments( ); ++j ) {
00512       float d = Get( i, j );
00513       if( CMeta::IsNaN( d ) ) continue;      
00514       Set( i, j, Quantize( d, j ) );
00515     }
00516   }
00517 }
00518 
00519 bool CDatFilterImpl::Attach( const CDataPair* pDat, const CDatFilter* pFilter, const CGenes* pGenes,
00520     CDat::EFilter eFilter, const CDat* pAnswers ) {
00521     size_t  i;
00522 
00523     if( !( ( eFilter == CDat::EFilterInclude ) || ( eFilter == CDat::EFilterExclude ) ||
00524         ( eFilter == CDat::EFilterTerm ) || ( eFilter == CDat::EFilterEdge ) ) )
00525         return false;
00526 
00527     m_pFilter = pFilter;
00528     m_pDat = pDat;
00529     m_pAnswers = pAnswers;
00530     m_eFilter = eFilter;
00531     if( m_pAnswers ) {
00532         m_veciAnswers.resize( GetGenes( ) );
00533         for( i = 0; i < m_veciAnswers.size( ); ++i )
00534             m_veciAnswers[ i ] = m_pAnswers->GetGene( GetGene( i ) ); }
00535     else {
00536         m_veciAnswers.clear( );
00537         if( m_eFilter == CDat::EFilterTerm )
00538             m_eFilter = CDat::EFilterEdge; }
00539 
00540     if( pGenes ) {
00541         m_vecfGenes.resize( GetGenes( ) );
00542         for( i = 0; i < m_vecfGenes.size( ); ++i )
00543             m_vecfGenes[ i ] = pGenes->IsGene( GetGene( i ) ); }
00544     else
00545         m_vecfGenes.clear( );
00546 
00547     return true; }
00548 
00549 size_t CDatFilterImpl::GetGenes( ) const {
00550 
00551     return ( m_pFilter ? m_pFilter->GetGenes( ) : ( m_pDat ? m_pDat->GetGenes( ) : -1 ) ); }
00552 
00553 string CDatFilterImpl::GetGene( size_t iGene ) const {
00554 
00555     return ( m_pFilter ? m_pFilter->GetGene( iGene ) : ( m_pDat ? m_pDat->GetGene( iGene ) : "" ) ); }
00556 
00582 bool CDatFilter::Attach( const CDataPair& Dat, const CGenes& Genes, CDat::EFilter eFilter,
00583     const CDat* pAnswers ) {
00584 
00585     return CDatFilterImpl::Attach( &Dat, NULL, &Genes, eFilter, pAnswers ); }
00586 
00611 bool CDatFilter::Attach( const CDatFilter& Dat, const CGenes& Genes, CDat::EFilter eFilter,
00612     const CDat* pAnswers ) {
00613 
00614     return CDatFilterImpl::Attach( NULL, &Dat, &Genes, eFilter, pAnswers ); }
00615 
00616 }