Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "fasta.h" 00024 #include "meta.h" 00025 00026 namespace Sleipnir { 00027 00028 const char CFASTAImpl::c_acComment[] = "#"; 00029 const char CFASTAImpl::c_acHeader[] = ">"; 00030 00031 CFASTAImpl::CFASTAImpl( ) { 00032 00033 m_szBuffer = new char[ c_iBufferSize ]; 00034 pthread_mutex_init( &m_mutx, NULL ); } 00035 00036 CFASTAImpl::~CFASTAImpl( ) { 00037 00038 pthread_mutex_destroy( &m_mutx ); 00039 delete[] m_szBuffer; } 00040 00062 bool CFASTA::Open( const char* szFile, const std::set<std::string>& setstrTypes ) { 00063 char* pc; 00064 vector<string> vecstrLine; 00065 TMapStrI::iterator iterGene; 00066 size_t i, iGene; 00067 00068 m_ifsm.close( ); 00069 m_ifsm.clear( ); 00070 m_mapstriGenes.clear( ); 00071 m_vecstrGenes.clear( ); 00072 m_vecmapstriSequences.clear( ); 00073 m_vecmapstrstrHeaders.clear( ); 00074 00075 m_ifsm.open( szFile, ios_base::binary ); 00076 if( !m_ifsm.is_open( ) ) 00077 return false; 00078 00079 while( !m_ifsm.eof( ) ) { 00080 m_ifsm.getline( m_szBuffer, c_iBufferSize ); 00081 m_szBuffer[ c_iBufferSize - 1 ] = 0; 00082 if( !m_szBuffer[ 0 ] || !strchr( c_acHeader, m_szBuffer[ 0 ] ) ) 00083 continue; 00084 for( pc = m_szBuffer + strlen( m_szBuffer ) - 1; ( pc >= m_szBuffer ) && strchr( "\n\r", *pc ); --pc ) 00085 *pc = 0; 00086 for( pc = m_szBuffer + 1; *pc && isspace( *pc ); ++pc ); 00087 vecstrLine.clear( ); 00088 CMeta::Tokenize( pc, vecstrLine ); 00089 if( vecstrLine.empty( ) ) { 00090 g_CatSleipnir( ).warn( "CFASTA::Open( %s ) invalid header line: %s", szFile, m_szBuffer ); 00091 continue; } 00092 if( vecstrLine.size( ) < 2 ) 00093 vecstrLine.push_back( "" ); 00094 if( !setstrTypes.empty( ) && ( setstrTypes.find( vecstrLine[ 1 ] ) == setstrTypes.end( ) ) ) 00095 continue; 00096 m_setstrTypes.insert( vecstrLine[ 1 ] ); 00097 if( ( iterGene = m_mapstriGenes.find( vecstrLine[ 0 ] ) ) == m_mapstriGenes.end( ) ) { 00098 m_mapstriGenes[ vecstrLine[ 0 ] ] = iGene = m_vecstrGenes.size( ); 00099 m_vecstrGenes.push_back( vecstrLine[ 0 ] ); 00100 while( m_vecmapstriSequences.size( ) <= iGene ) 00101 m_vecmapstriSequences.push_back( TMapStrI( ) ); } 00102 else 00103 iGene = iterGene->second; 00104 m_vecmapstriSequences[ iGene ][ vecstrLine[ 1 ] ] = m_ifsm.tellg( ); 00105 if( vecstrLine.size( ) > 2 ) { 00106 string strHeader; 00107 00108 while( m_vecmapstrstrHeaders.size( ) <= iGene ) 00109 m_vecmapstrstrHeaders.push_back( map<string, string>( ) ); 00110 strHeader = vecstrLine[ 2 ]; 00111 for( i = 3; i < vecstrLine.size( ); ++i ) 00112 strHeader += '\t' + vecstrLine[ i ]; 00113 m_vecmapstrstrHeaders[ iGene ][ vecstrLine[ 1 ] ] = strHeader; } } 00114 while( m_vecmapstrstrHeaders.size( ) <= iGene ) 00115 m_vecmapstrstrHeaders.push_back( map<string, string>( ) ); 00116 00117 return true; } 00118 00135 void CFASTA::Save( std::ostream& ostm, size_t iWrap ) const { 00136 size_t i, j, iGene; 00137 00138 for( iGene = 0; iGene < GetGenes( ); ++iGene ) { 00139 vector<SFASTASequence> vecsSequences; 00140 00141 Get( iGene, vecsSequences ); 00142 for( i = 0; i < vecsSequences.size( ); ++i ) { 00143 const SFASTASequence& sSequence = vecsSequences[ i ]; 00144 string strSequence; 00145 bool fIntron; 00146 00147 ostm << c_acHeader[ 0 ] << ' ' << GetGene( iGene ); 00148 if( !sSequence.m_strType.empty( ) ) 00149 ostm << '\t' << sSequence.m_strType; 00150 if( ( strSequence = GetHeader( iGene, sSequence.m_strType ) ).length( ) ) { 00151 if( sSequence.m_strType.empty( ) ) 00152 ostm << '\t'; 00153 ostm << '\t' << strSequence; } 00154 ostm << endl; 00155 00156 for( strSequence = "",fIntron = sSequence.m_fIntronFirst,j = 0; 00157 j < sSequence.m_vecstrSequences.size( ); ++j,fIntron = !fIntron ) { 00158 string strCur; 00159 00160 strCur = sSequence.m_vecstrSequences[ j ]; 00161 transform( strCur.begin( ), strCur.end( ), strCur.begin( ), fIntron ? ::tolower : ::toupper ); 00162 strSequence += strCur; } 00163 for( j = 0; j < strSequence.length( ); j += iWrap ) 00164 ostm << strSequence.substr( j, iWrap ) << endl; } } } 00165 00166 bool CFASTAImpl::Get( size_t iGene, vector<SFASTASequence>* pvecsSequences, 00167 vector<SFASTAWiggle>* pvecsValues ) const { 00168 00169 if( iGene == -1 ) 00170 return false; 00171 00172 const TMapStrI& mapstriSequences = m_vecmapstriSequences[ iGene ]; 00173 TMapStrI::const_iterator iterGene; 00174 char* pc; 00175 bool fOpen; 00176 00177 pthread_mutex_lock( &m_mutx ); 00178 fOpen = m_ifsm.is_open( ); 00179 pthread_mutex_unlock( &m_mutx ); 00180 if( !fOpen ) 00181 return false; 00182 for( iterGene = mapstriSequences.begin( ); iterGene != mapstriSequences.end( ); ++iterGene ) { 00183 SFASTASequence sSequence; 00184 SFASTAWiggle sValues; 00185 string strSequence; 00186 00187 sSequence.m_strType = sValues.m_strType = iterGene->first; 00188 pthread_mutex_lock( &m_mutx ); 00189 m_ifsm.clear( ); 00190 m_ifsm.seekg( iterGene->second ); 00191 if( (size_t)m_ifsm.tellg( ) != iterGene->second ) { 00192 g_CatSleipnir( ).error( "CFASTA::Get( %d ) error parsing: %s %s at %d (%d)", iGene, 00193 GetGene( iGene ).c_str( ), iterGene->first.c_str( ), iterGene->second, 00194 (size_t)m_ifsm.tellg( ) ); 00195 pthread_mutex_unlock( &m_mutx ); 00196 return false; } 00197 while( !m_ifsm.eof( ) ) { 00198 m_ifsm.getline( m_szBuffer, c_iBufferSize ); 00199 m_szBuffer[ c_iBufferSize - 1 ] = 0; 00200 if( !m_szBuffer[ 0 ] || strchr( c_acComment, m_szBuffer[ 0 ] ) ) 00201 continue; 00202 if( strchr( c_acHeader, m_szBuffer[ 0 ] ) ) 00203 break; 00204 if( pvecsValues ) 00205 sValues.m_vecdValues.push_back( (float)atof( m_szBuffer ) ); 00206 if( pvecsSequences ) { 00207 for( pc = m_szBuffer + strlen( m_szBuffer ) - 1; 00208 ( pc >= m_szBuffer ) && strchr( "\n\r", *pc ); --pc ) 00209 *pc = 0; 00210 strSequence += m_szBuffer; } } 00211 pthread_mutex_unlock( &m_mutx ); 00212 00213 if( pvecsValues && !Get( iGene, *pvecsValues, iterGene->second, sValues ) ) 00214 return false; 00215 if( pvecsSequences && !Get( iGene, *pvecsSequences, iterGene->second, strSequence, sSequence ) ) 00216 return false; } 00217 00218 return true; } 00219 00220 bool CFASTAImpl::Get( size_t iGene, std::vector<SFASTASequence>& vecsSequences, size_t iOffset, 00221 const std::string& strSequence, SFASTASequence& sSequence ) const { 00222 size_t iBegin, iEnd; 00223 00224 if( strSequence.empty( ) ) { 00225 g_CatSleipnir( ).debug( "CFASTA::Get( %d ) no sequence found: %s %s at %d", iGene, 00226 GetGene( iGene ).c_str( ), sSequence.m_strType.c_str( ), iOffset ); 00227 return true; } 00228 for( iBegin = 0; iBegin < strSequence.size( ); iBegin = iEnd ) { 00229 bool fBegin; 00230 string strCur; 00231 00232 fBegin = !!isupper( strSequence[ iBegin ] ); 00233 if( !iBegin ) 00234 sSequence.m_fIntronFirst = !fBegin; 00235 for( iEnd = iBegin + 1; ( iEnd < strSequence.size( ) ) && 00236 ( fBegin == !!isupper( strSequence[ iEnd ] ) ); ++iEnd ); 00237 strCur = strSequence.substr( iBegin, iEnd - iBegin ); 00238 transform( strCur.begin( ), strCur.end( ), strCur.begin( ), ::toupper ); 00239 sSequence.m_vecstrSequences.push_back( strCur ); } 00240 vecsSequences.push_back( sSequence ); 00241 00242 return true; } 00243 00244 bool CFASTAImpl::Get( size_t iGene, vector<SFASTAWiggle>& vecsValues, size_t iOffset, 00245 SFASTAWiggle& sValues ) const { 00246 00247 if( sValues.m_vecdValues.empty( ) ) { 00248 g_CatSleipnir( ).debug( "CFASTA::Get( %d ) no values found: %s %s at %d", iGene, 00249 GetGene( iGene ).c_str( ), sValues.m_strType.c_str( ), iOffset ); 00250 return true; } 00251 vecsValues.push_back( sValues ); 00252 00253 return true; } 00254 00255 }