Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "datapair.h" 00024 #include "meta.h" 00025 #include "genome.h" 00026 #include "math.h" 00027 00028 namespace Sleipnir { 00029 00030 const char CPairImpl::c_szQuantExt[] = ".quant"; 00031 const char CDataPairImpl::c_acQdab[] = ".qdab"; 00032 00033 bool CPairImpl::Open( const char* szDatafile, std::ifstream& ifsm ) { 00034 string strToken; 00035 const char* pc; 00036 00037 strToken = szDatafile; 00038 strToken += c_szQuantExt; 00039 ifsm.open( strToken.c_str( ) ); 00040 if( !ifsm.is_open( ) ) { 00041 if( !( pc = strrchr( szDatafile, '.' ) ) ) { 00042 g_CatSleipnir( ).error( "CPairImpl::Open( %s ) could not replace extension for quant file", 00043 szDatafile ); 00044 return false; } 00045 strToken = szDatafile; 00046 strToken.resize( pc - szDatafile ); 00047 strToken += c_szQuantExt; 00048 ifsm.clear( ); 00049 ifsm.open( strToken.c_str( ) ); 00050 if( !ifsm.is_open( ) ) { 00051 g_CatSleipnir( ).error( "CPairImpl::Open( %s ) could not open quant file: %s", szDatafile, 00052 strToken.c_str( ) ); 00053 return false; } } 00054 00055 return true; } 00056 00057 bool CPairImpl::Open( const char* szLine, vector<float>& vecdQuant ) { 00058 vector<string> vecstrQuant; 00059 size_t i; 00060 00061 CMeta::Tokenize( szLine, vecstrQuant, CMeta::c_szWS, true ); 00062 vecdQuant.resize( vecstrQuant.size( ) ); 00063 for( i = 0; i < vecdQuant.size( ); ++i ) 00064 vecdQuant[ i ] = (float)atof( vecstrQuant[ i ].c_str( ) ); 00065 00066 return true; } 00067 00084 bool CDataPair::Open( const CSlim& Slim ) { 00085 m_fQuantized = false; 00086 Reset( false ); 00087 return CDat::Open( Slim ); } 00088 00114 bool CDataPair::Open( const char* szDatafile, bool fContinuous, bool fMemmap, size_t iSkip, 00115 bool fZScore, bool fSeek ) { 00116 00117 00118 g_CatSleipnir( ).notice( "CDataPair::Open( %s, %d )", szDatafile, fContinuous ); 00119 00120 Reset( fContinuous ); 00121 m_fQuantized = false; 00122 00123 const char* file_ext = NULL; 00124 00125 if((file_ext = strstr(szDatafile, c_acQdab)) != NULL){ 00126 00127 return OpenQdab( szDatafile ); 00128 } 00129 else{ 00130 if( m_fContinuous ? true : OpenQuants( szDatafile ) ) { 00131 if( CDat::Open( szDatafile, fMemmap, iSkip, fZScore, false, fSeek ) ) { 00132 return true; 00133 } 00134 } 00135 return false; 00136 } 00137 } 00138 00139 bool CDataPairImpl::OpenQdab( const char* szDatafile ){ 00140 size_t iTotal, i, j, num_bins, num_bits, iPos; 00141 float* adScores; 00142 unsigned char tmp; 00143 float* bounds; 00144 unsigned char btmpf; 00145 unsigned char btmpb; 00146 00147 unsigned char bufferA; 00148 unsigned char bufferB; 00149 ifstream istm; 00150 00151 float nan_val; 00152 00153 g_CatSleipnir( ).notice( "CDataPair::OpenQdab( %s )", szDatafile ); 00154 00155 istm.open( szDatafile, ios_base::binary ); 00156 00157 if( !CDat::OpenGenes( istm, true, false ) ) 00158 return false; 00159 m_Data.Initialize( GetGenes( ) ); 00160 00161 // read the number of bins 00162 istm.read((char*)&tmp, sizeof(char)); 00163 num_bins = (size_t)tmp; 00164 00165 //read the bin boundaries 00166 bounds = new float[num_bins]; 00167 istm.read((char*)bounds, sizeof(float) * num_bins); 00168 00169 // set quant values 00170 SetQuants( bounds, num_bins ); 00171 00172 // number of bits required for each bin representation 00173 num_bits = (size_t)ceil(log( num_bins ) / log ( 2.0 )); 00174 00175 // add one more bit for NaN case 00176 if( pow(2, num_bits) == num_bins ) 00177 ++num_bits; 00178 00179 // set nan value 00180 nan_val = pow(2, num_bits) -1; 00181 00182 // read the data 00183 adScores = new float[ GetGenes( ) - 1 ]; 00184 00185 istm.read( (char*)&bufferA, sizeof(bufferA)); 00186 istm.read( (char*)&bufferB, sizeof(bufferB)); 00187 00188 for( iTotal = i = 0; ( i + 1 ) < GetGenes( ); ++i ) { 00189 for(j = 0; j < ( GetGenes( ) - i - 1 ); ++j){ 00190 iPos = (iTotal * num_bits) % 8; 00191 00192 // check bit data overflow?? 00193 if( iPos + num_bits > 8){ 00194 btmpb = (bufferA << iPos); 00195 btmpf = (bufferB >> (16 - num_bits - iPos)) << (8-num_bits); 00196 adScores[j] = (float)((btmpb | btmpf) >> (8 - num_bits)); 00197 ++iTotal; 00198 bufferA = bufferB; 00199 istm.read( (char*)&bufferB, sizeof(bufferB)); 00200 } 00201 else{ 00202 adScores[j] = (((bufferA << iPos) & 0xFF) >> (8 - num_bits)); 00203 ++iTotal; 00204 if( iPos + num_bits == 8 ) { 00205 bufferA = bufferB; 00206 istm.read( (char*)&bufferB, sizeof(bufferB)); 00207 } 00208 00209 } 00210 00211 // check if value added was promised 2^#bits -1 (NaN value) 00212 if(adScores[j] == nan_val) 00213 adScores[j] = CMeta::GetNaN( ); 00214 00215 } 00216 00217 CDat::Set( i, adScores ); 00218 } 00219 00220 istm.close(); 00221 00222 delete[] adScores; 00223 delete[] bounds; 00224 00225 m_fQuantized = true; 00226 00227 return true; 00228 } 00229 00230 00231 bool CDataPair::Open( const CDat& dat ) { 00232 m_fContinuous = true; 00233 Reset( true ); 00234 if( !CDat::Open( dat ) ) return false; 00235 } 00236 00251 bool CDataPair::OpenQuants( const char* szDatafile ) { 00252 static const size_t c_iBuf = 8192; 00253 char szBuf[ c_iBuf ]; 00254 ifstream ifsm; 00255 00256 if( !CPairImpl::Open( szDatafile, ifsm ) ){ 00257 fprintf(stderr, "Quant file does not exist or error opening it.\n"); 00258 return false; 00259 } 00260 ifsm.getline( szBuf, c_iBuf - 1 ); 00261 ifsm.close( ); 00262 return CPairImpl::Open( szBuf, m_vecdQuant ); } 00263 00282 size_t CDataPair::Quantize( float dValue ) const { 00283 if ( m_fQuantized ) 00284 return (size_t)dValue; 00285 else 00286 return CMeta::Quantize( dValue, m_vecdQuant ); } 00287 00288 00289 void CDataPair::Quantize() { 00290 if ( m_fQuantized ) 00291 return; 00292 for( size_t i = 0; i < GetGenes( ); ++i ) { 00293 for( size_t j = ( i + 1 ); j < GetGenes( ); ++j ) { 00294 float d = Get( i, j ); 00295 if( CMeta::IsNaN( d ) ) continue; 00296 00297 Set( i, j, Quantize( d ) ); 00298 } 00299 } 00300 m_fQuantized = true; 00301 } 00302 00303 00326 size_t CDataPair::Quantize( size_t iY, size_t iX, size_t iZero ) const { 00327 float d; 00328 if( iY == -1 || iX == -1 ) { 00329 return -1; 00330 } 00331 else if( CMeta::IsNaN( (d = Get( iY, iX )) ) ) { 00332 return iZero; 00333 } 00334 else { 00335 return Quantize(d); 00336 } 00337 } 00338 00339 00340 00341 00342 void CDataPairImpl::Reset( bool fContinuous ) { 00343 00344 m_vecdQuant.clear( ); 00345 m_fContinuous = fContinuous; } 00346 00360 void CDataPairImpl::SetQuants( const float* adBinEdges, size_t iBins ) { 00361 00362 Reset( false ); 00363 m_vecdQuant.resize( iBins ); 00364 copy( adBinEdges, adBinEdges + iBins, m_vecdQuant.begin( ) ); } 00365 00366 00377 void CDataPair::SetQuants( const vector<float>& vecdBinEdges ) { 00378 00379 Reset( false ); 00380 m_vecdQuant.resize( vecdBinEdges.size( ) ); 00381 copy( vecdBinEdges.begin( ), vecdBinEdges.end( ), m_vecdQuant.begin( ) ); } 00382 00383 void CDataPair::Save( const char* szFile ) const { 00384 00385 if( !strcmp( szFile + strlen( szFile ) - strlen( "qdab" ), "qdab" ) ) { 00386 g_CatSleipnir().info( "CDataPair::Save( ) qdab file: %s", szFile ); 00387 ofstream ofsm; 00388 ofsm.open( szFile, ios_base::binary ); 00389 CDat::SaveGenes( ofsm ); 00390 00391 unsigned char bins = (unsigned char)m_vecdQuant.size(); 00392 size_t bit_len = (size_t)ceil( log((size_t)bins)/log(2) ); 00393 00394 //reserve largest value for NaN 00395 if( (size_t)pow(2, bit_len) == (size_t)bins ) 00396 bit_len++; 00397 00398 //NaN = 2^N - 1 00399 size_t nan_val = (size_t)pow(2, bit_len) -1; 00400 00401 //write out total bins, bin boundaries 00402 ofsm.write( (char*)&bins, sizeof(bins) ); 00403 for( size_t i =0; i < m_vecdQuant.size(); i++ ) { 00404 float b = m_vecdQuant[i]; 00405 ofsm.write( (char*)&b, sizeof(b) ); 00406 } 00407 00408 size_t offset = 0, byte_count = 0; 00409 unsigned char cur_byte = 0; 00410 00411 for( size_t i = 0; i < GetGenes( ); ++i ) { 00412 for( size_t j = ( i + 1 ); j < GetGenes( ); ++j ) { 00413 unsigned char d = (unsigned char)Get( i, j ); 00414 if( CMeta::IsNaN( Get(i,j) ) ) { 00415 d = nan_val; 00416 } 00417 //check if fits in one byte 00418 if( offset + bit_len <= 8 ) { 00419 cur_byte = cur_byte | (d << (8-bit_len-offset)); 00420 offset = offset + bit_len; 00421 if( offset == 8 ) { 00422 ofsm.write((char*)&cur_byte, 1); 00423 cur_byte = offset = 0; 00424 byte_count++; 00425 } 00426 } 00427 else { 00428 //spans byte boundary; offset+bit_len > 8 00429 cur_byte = cur_byte | (d >> (offset+bit_len-8)); 00430 ofsm.write((char*)&cur_byte, 1); 00431 cur_byte = d << (16-offset-bit_len); 00432 offset = offset+bit_len-8; 00433 byte_count++; 00434 } 00435 } 00436 } 00437 size_t bytes_req = (size_t)ceil((GetGenes()*(GetGenes()-1)/2.0) * bit_len / 8.0); 00438 if( byte_count < bytes_req ) { 00439 ofsm.write((char*)&cur_byte, 1); 00440 } 00441 //g_CatSleipnir().info( "CDataPair::Save( ) byte count: %s", byte_count ); 00442 } 00443 else { 00444 CDat::Save( szFile ); 00445 } 00446 } 00447 00464 bool CPCLPair::Open( const char* szDatafile, size_t iSkip ) { 00465 static const size_t c_iBuf = 8192; 00466 char szBuf[ c_iBuf ]; 00467 ifstream ifsm; 00468 size_t i; 00469 00470 g_CatSleipnir( ).notice( "CPCLPair::Open( %s )", szDatafile ); 00471 00472 if( !CPCL::Open( szDatafile, iSkip ) ) 00473 return false; 00474 00475 if( !CPairImpl::Open( szDatafile, ifsm ) ) 00476 return false; 00477 00478 m_vecvecdQuants.resize( GetExperiments( ) ); 00479 for( i = 0; i < m_vecvecdQuants.size( ); ++i ) { 00480 if( ifsm.eof( ) ) { 00481 g_CatSleipnir( ).error( "CPCLPair::Open( %s, %d ) invalid quant file", szDatafile, iSkip ); 00482 return false; } 00483 ifsm.getline( szBuf, c_iBuf - 1 ); 00484 if( !CPairImpl::Open( szBuf, m_vecvecdQuants[ i ] ) ) 00485 return false; } 00486 00487 return true; } 00488 00505 size_t CPCLPair::Quantize( float dValue, size_t iExperiment ) const { 00506 00507 return CMeta::Quantize( dValue, m_vecvecdQuants[ iExperiment ] ); } 00508 00509 void CPCLPair::Quantize() { 00510 for( size_t i = 0; i < GetGenes( ); ++i ) { 00511 for( size_t j = 0; j < GetExperiments( ); ++j ) { 00512 float d = Get( i, j ); 00513 if( CMeta::IsNaN( d ) ) continue; 00514 Set( i, j, Quantize( d, j ) ); 00515 } 00516 } 00517 } 00518 00519 bool CDatFilterImpl::Attach( const CDataPair* pDat, const CDatFilter* pFilter, const CGenes* pGenes, 00520 CDat::EFilter eFilter, const CDat* pAnswers ) { 00521 size_t i; 00522 00523 if( !( ( eFilter == CDat::EFilterInclude ) || ( eFilter == CDat::EFilterExclude ) || 00524 ( eFilter == CDat::EFilterTerm ) || ( eFilter == CDat::EFilterEdge ) ) ) 00525 return false; 00526 00527 m_pFilter = pFilter; 00528 m_pDat = pDat; 00529 m_pAnswers = pAnswers; 00530 m_eFilter = eFilter; 00531 if( m_pAnswers ) { 00532 m_veciAnswers.resize( GetGenes( ) ); 00533 for( i = 0; i < m_veciAnswers.size( ); ++i ) 00534 m_veciAnswers[ i ] = m_pAnswers->GetGene( GetGene( i ) ); } 00535 else { 00536 m_veciAnswers.clear( ); 00537 if( m_eFilter == CDat::EFilterTerm ) 00538 m_eFilter = CDat::EFilterEdge; } 00539 00540 if( pGenes ) { 00541 m_vecfGenes.resize( GetGenes( ) ); 00542 for( i = 0; i < m_vecfGenes.size( ); ++i ) 00543 m_vecfGenes[ i ] = pGenes->IsGene( GetGene( i ) ); } 00544 else 00545 m_vecfGenes.clear( ); 00546 00547 return true; } 00548 00549 size_t CDatFilterImpl::GetGenes( ) const { 00550 00551 return ( m_pFilter ? m_pFilter->GetGenes( ) : ( m_pDat ? m_pDat->GetGenes( ) : -1 ) ); } 00552 00553 string CDatFilterImpl::GetGene( size_t iGene ) const { 00554 00555 return ( m_pFilter ? m_pFilter->GetGene( iGene ) : ( m_pDat ? m_pDat->GetGene( iGene ) : "" ) ); } 00556 00582 bool CDatFilter::Attach( const CDataPair& Dat, const CGenes& Genes, CDat::EFilter eFilter, 00583 const CDat* pAnswers ) { 00584 00585 return CDatFilterImpl::Attach( &Dat, NULL, &Genes, eFilter, pAnswers ); } 00586 00611 bool CDatFilter::Attach( const CDatFilter& Dat, const CGenes& Genes, CDat::EFilter eFilter, 00612 const CDat* pAnswers ) { 00613 00614 return CDatFilterImpl::Attach( NULL, &Dat, &Genes, eFilter, pAnswers ); } 00615 00616 }