Sleipnir
src/fasta.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "fasta.h"
00024 #include "meta.h"
00025 
00026 namespace Sleipnir {
00027 
00028 const char CFASTAImpl::c_acComment[]    = "#";
00029 const char CFASTAImpl::c_acHeader[]     = ">";
00030 
00031 CFASTAImpl::CFASTAImpl( ) {
00032 
00033     m_szBuffer = new char[ c_iBufferSize ];
00034     pthread_mutex_init( &m_mutx, NULL ); }
00035 
00036 CFASTAImpl::~CFASTAImpl( ) {
00037 
00038     pthread_mutex_destroy( &m_mutx );
00039     delete[] m_szBuffer; }
00040 
00062 bool CFASTA::Open( const char* szFile, const std::set<std::string>& setstrTypes ) {
00063     char*               pc;
00064     vector<string>      vecstrLine;
00065     TMapStrI::iterator  iterGene;
00066     size_t              i, iGene;
00067 
00068     m_ifsm.close( );
00069     m_ifsm.clear( );
00070     m_mapstriGenes.clear( );
00071     m_vecstrGenes.clear( );
00072     m_vecmapstriSequences.clear( );
00073     m_vecmapstrstrHeaders.clear( );
00074 
00075     m_ifsm.open( szFile, ios_base::binary );
00076     if( !m_ifsm.is_open( ) )
00077         return false;
00078 
00079     while( !m_ifsm.eof( ) ) {
00080         m_ifsm.getline( m_szBuffer, c_iBufferSize );
00081         m_szBuffer[ c_iBufferSize - 1 ] = 0;
00082         if( !m_szBuffer[ 0 ] || !strchr( c_acHeader, m_szBuffer[ 0 ] ) )
00083             continue;
00084         for( pc = m_szBuffer + strlen( m_szBuffer ) - 1; ( pc >= m_szBuffer ) && strchr( "\n\r", *pc ); --pc )
00085             *pc = 0;
00086         for( pc = m_szBuffer + 1; *pc && isspace( *pc ); ++pc );
00087         vecstrLine.clear( );
00088         CMeta::Tokenize( pc, vecstrLine );
00089         if( vecstrLine.empty( ) ) {
00090             g_CatSleipnir( ).warn( "CFASTA::Open( %s ) invalid header line: %s", szFile, m_szBuffer );
00091             continue; }
00092         if( vecstrLine.size( ) < 2 )
00093             vecstrLine.push_back( "" );
00094         if( !setstrTypes.empty( ) && ( setstrTypes.find( vecstrLine[ 1 ] ) == setstrTypes.end( ) ) )
00095             continue;
00096         m_setstrTypes.insert( vecstrLine[ 1 ] );
00097         if( ( iterGene = m_mapstriGenes.find( vecstrLine[ 0 ] ) ) == m_mapstriGenes.end( ) ) {
00098             m_mapstriGenes[ vecstrLine[ 0 ] ] = iGene = m_vecstrGenes.size( );
00099             m_vecstrGenes.push_back( vecstrLine[ 0 ] );
00100             while( m_vecmapstriSequences.size( ) <= iGene )
00101                 m_vecmapstriSequences.push_back( TMapStrI( ) ); }
00102         else
00103             iGene = iterGene->second;
00104         m_vecmapstriSequences[ iGene ][ vecstrLine[ 1 ] ] = m_ifsm.tellg( );
00105         if( vecstrLine.size( ) > 2 ) {
00106             string  strHeader;
00107 
00108             while( m_vecmapstrstrHeaders.size( ) <= iGene )
00109                 m_vecmapstrstrHeaders.push_back( map<string, string>( ) );
00110             strHeader = vecstrLine[ 2 ];
00111             for( i = 3; i < vecstrLine.size( ); ++i )
00112                 strHeader += '\t' + vecstrLine[ i ];
00113             m_vecmapstrstrHeaders[ iGene ][ vecstrLine[ 1 ] ] = strHeader; } }
00114     while( m_vecmapstrstrHeaders.size( ) <= iGene )
00115         m_vecmapstrstrHeaders.push_back( map<string, string>( ) );
00116 
00117     return true; }
00118 
00135 void CFASTA::Save( std::ostream& ostm, size_t iWrap ) const {
00136     size_t  i, j, iGene;
00137 
00138     for( iGene = 0; iGene < GetGenes( ); ++iGene ) {
00139         vector<SFASTASequence>  vecsSequences;
00140 
00141         Get( iGene, vecsSequences );
00142         for( i = 0; i < vecsSequences.size( ); ++i ) {
00143             const SFASTASequence&   sSequence   = vecsSequences[ i ];
00144             string                  strSequence;
00145             bool                    fIntron;
00146 
00147             ostm << c_acHeader[ 0 ] << ' ' << GetGene( iGene );
00148             if( !sSequence.m_strType.empty( ) )
00149                 ostm << '\t' << sSequence.m_strType;
00150             if( ( strSequence = GetHeader( iGene, sSequence.m_strType ) ).length( ) ) {
00151                 if( sSequence.m_strType.empty( ) )
00152                     ostm << '\t';
00153                 ostm << '\t' << strSequence; }
00154             ostm << endl;
00155 
00156             for( strSequence = "",fIntron = sSequence.m_fIntronFirst,j = 0;
00157                 j < sSequence.m_vecstrSequences.size( ); ++j,fIntron = !fIntron ) {
00158                 string  strCur;
00159 
00160                 strCur = sSequence.m_vecstrSequences[ j ];
00161                 transform( strCur.begin( ), strCur.end( ), strCur.begin( ), fIntron ? ::tolower : ::toupper );
00162                 strSequence += strCur; }
00163             for( j = 0; j < strSequence.length( ); j += iWrap )
00164                 ostm << strSequence.substr( j, iWrap ) << endl; } } }
00165 
00166 bool CFASTAImpl::Get( size_t iGene, vector<SFASTASequence>* pvecsSequences,
00167     vector<SFASTAWiggle>* pvecsValues ) const {
00168 
00169     if( iGene == -1 )
00170         return false;
00171 
00172     const TMapStrI&             mapstriSequences    = m_vecmapstriSequences[ iGene ];
00173     TMapStrI::const_iterator    iterGene;
00174     char*                       pc;
00175     bool                        fOpen;
00176 
00177     pthread_mutex_lock( &m_mutx );
00178     fOpen = m_ifsm.is_open( );
00179     pthread_mutex_unlock( &m_mutx );
00180     if( !fOpen )
00181         return false;
00182     for( iterGene = mapstriSequences.begin( ); iterGene != mapstriSequences.end( ); ++iterGene ) {
00183         SFASTASequence  sSequence;
00184         SFASTAWiggle    sValues;
00185         string          strSequence;
00186 
00187         sSequence.m_strType = sValues.m_strType = iterGene->first;
00188         pthread_mutex_lock( &m_mutx );
00189         m_ifsm.clear( );
00190         m_ifsm.seekg( iterGene->second );
00191         if( (size_t)m_ifsm.tellg( ) != iterGene->second ) {
00192             g_CatSleipnir( ).error( "CFASTA::Get( %d ) error parsing: %s %s at %d (%d)", iGene,
00193                 GetGene( iGene ).c_str( ), iterGene->first.c_str( ), iterGene->second,
00194                 (size_t)m_ifsm.tellg( ) );
00195             pthread_mutex_unlock( &m_mutx );
00196             return false; }
00197         while( !m_ifsm.eof( ) ) {
00198             m_ifsm.getline( m_szBuffer, c_iBufferSize );
00199             m_szBuffer[ c_iBufferSize - 1 ] = 0;
00200             if( !m_szBuffer[ 0 ] || strchr( c_acComment, m_szBuffer[ 0 ] ) )
00201                 continue;
00202             if( strchr( c_acHeader, m_szBuffer[ 0 ] ) )
00203                 break;
00204             if( pvecsValues )
00205                 sValues.m_vecdValues.push_back( (float)atof( m_szBuffer ) );
00206             if( pvecsSequences ) {
00207                 for( pc = m_szBuffer + strlen( m_szBuffer ) - 1;
00208                     ( pc >= m_szBuffer ) && strchr( "\n\r", *pc ); --pc )
00209                     *pc = 0;
00210                 strSequence += m_szBuffer; } }
00211         pthread_mutex_unlock( &m_mutx );
00212 
00213         if( pvecsValues && !Get( iGene, *pvecsValues, iterGene->second, sValues ) )
00214             return false;
00215         if( pvecsSequences && !Get( iGene, *pvecsSequences, iterGene->second, strSequence, sSequence ) )
00216             return false; }
00217 
00218     return true; }
00219 
00220 bool CFASTAImpl::Get( size_t iGene, std::vector<SFASTASequence>& vecsSequences, size_t iOffset,
00221     const std::string& strSequence, SFASTASequence& sSequence ) const {
00222     size_t  iBegin, iEnd;
00223 
00224     if( strSequence.empty( ) ) {
00225         g_CatSleipnir( ).debug( "CFASTA::Get( %d ) no sequence found: %s %s at %d", iGene,
00226             GetGene( iGene ).c_str( ), sSequence.m_strType.c_str( ), iOffset );
00227         return true; }
00228     for( iBegin = 0; iBegin < strSequence.size( ); iBegin = iEnd ) {
00229         bool    fBegin;
00230         string  strCur;
00231 
00232         fBegin = !!isupper( strSequence[ iBegin ] );
00233         if( !iBegin )
00234             sSequence.m_fIntronFirst = !fBegin;
00235         for( iEnd = iBegin + 1; ( iEnd < strSequence.size( ) ) &&
00236             ( fBegin == !!isupper( strSequence[ iEnd ] ) ); ++iEnd );
00237         strCur = strSequence.substr( iBegin, iEnd - iBegin );
00238         transform( strCur.begin( ), strCur.end( ), strCur.begin( ), ::toupper );
00239         sSequence.m_vecstrSequences.push_back( strCur ); }
00240     vecsSequences.push_back( sSequence );
00241 
00242     return true; }
00243 
00244 bool CFASTAImpl::Get( size_t iGene, vector<SFASTAWiggle>& vecsValues, size_t iOffset,
00245     SFASTAWiggle& sValues ) const {
00246 
00247     if( sValues.m_vecdValues.empty( ) ) {
00248         g_CatSleipnir( ).debug( "CFASTA::Get( %d ) no values found: %s %s at %d", iGene,
00249             GetGene( iGene ).c_str( ), sValues.m_strType.c_str( ), iOffset );
00250         return true; }
00251     vecsValues.push_back( sValues );
00252 
00253     return true; }
00254 
00255 }