Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_acNucleotides[] = "ACGT"; 00026 00027 int main( int iArgs, char** aszArgs ) { 00028 gengetopt_args_info sArgs; 00029 CPCL PCL, PCLBackground; 00030 vector<float> vecdSumsIn, vecdSumSqsIn, vecdSumsOut, vecdSumSqsOut; 00031 vector<size_t> veciCountsIn, veciCountsOut; 00032 size_t i, j, k; 00033 float d; 00034 set<size_t> setiGenes; 00035 00036 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00037 cmdline_parser_print_help( ); 00038 return 1; } 00039 CMeta Meta( sArgs.verbosity_arg ); 00040 00041 if( !PCL.Open( sArgs.input_arg, sArgs.skip_arg ) ) { 00042 cerr << "Could not open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) << endl; 00043 return 1; } 00044 if( sArgs.background_arg && !PCLBackground.Open( sArgs.background_arg, sArgs.skip_arg ) ) { 00045 cerr << "Could not open: " << sArgs.background_arg << endl; 00046 return 1; } 00047 00048 if( sArgs.fasta_arg ) { 00049 CFASTA FASTA; 00050 map<string, size_t> mapstriTypes; 00051 map<string, size_t>::const_iterator iterType; 00052 size_t iRow, iColumn, iLength, iCount; 00053 set<size_t> setiForeground; 00054 CFullMatrix<size_t> MatSums, MatSumSqs, MatCounts; 00055 CFullMatrix<CHMM> MatHMMs; 00056 float dAve, dStd; 00057 00058 if( !FASTA.Open( sArgs.fasta_arg ) ) { 00059 cerr << "Could not open: " << sArgs.fasta_arg << endl; 00060 return 1; } 00061 00062 if( sArgs.motifs_arg ) { 00063 ifstream ifsm; 00064 vector<SMotifMatch> vecsMotifs; 00065 CCoalesceMotifLibrary Motifs( sArgs.k_arg ); 00066 SCoalesceModifiers sModifiers; 00067 SCoalesceModifierCache sCache( sModifiers ); 00068 CPCL PCLScores; 00069 vector<string> vecstrMotifs; 00070 00071 ifsm.open( sArgs.motifs_arg ); 00072 if( !CCoalesceMotifLibrary::Open( ifsm, vecsMotifs, &Motifs ) ) { 00073 cerr << "Could not open: " << sArgs.motifs_arg << endl; 00074 return 1; } 00075 ifsm.close( ); 00076 00077 vecstrMotifs.resize( vecsMotifs.size( ) ); 00078 for( i = 0; i < vecstrMotifs.size( ); ++i ) 00079 vecstrMotifs[ i ] = Motifs.GetMotif( vecsMotifs[ i ].m_iMotif ); 00080 PCLScores.Open( PCL.GetGeneNames( ), vecstrMotifs, vector<string>( ) ); 00081 PCLScores.Clear( ); 00082 00083 for( iRow = 0; iRow < PCL.GetGenes( ); ++iRow ) { 00084 vector<SFASTASequence> vecsSequences; 00085 00086 if( !( iRow % 100 ) ) 00087 cerr << "Processing " << iRow << '/' << PCL.GetGenes( ) << endl; 00088 if( ( i = FASTA.GetGene( PCL.GetGene( iRow ) ) ) == -1 ) { 00089 for( i = 0; i < PCLScores.GetExperiments( ); ++i ) 00090 PCLScores.Set( iRow, i, CMeta::GetNaN( ) ); 00091 continue; } 00092 if( !FASTA.Get( i, vecsSequences ) ) 00093 return 1; 00094 for( iColumn = 0; iColumn < vecsMotifs.size( ); ++iColumn ) { 00095 const SMotifMatch& sMotif = vecsMotifs[ iColumn ]; 00096 00097 for( iLength = i = 0; i < vecsSequences.size( ); ++i ) { 00098 const SFASTASequence& sSequence = vecsSequences[ i ]; 00099 00100 if( sMotif.m_strType != sSequence.m_strType ) 00101 continue; 00102 for( j = 0; j < sSequence.m_vecstrSequences.size( ); ++j ) { 00103 const string& strSequence = sSequence.m_vecstrSequences[ j ]; 00104 00105 if( ( sMotif.m_eSubsequence != CCoalesceSequencerBase::ESubsequenceTotal ) && 00106 ( sMotif.m_eSubsequence != ( ( sSequence.m_fIntronFirst == !( j % 2 ) ) ? 00107 CCoalesceSequencerBase::ESubsequenceIntrons : 00108 CCoalesceSequencerBase::ESubsequenceExons ) ) ) 00109 continue; 00110 iLength += strSequence.size( ); 00111 PCLScores.Get( iRow, iColumn ) += Motifs.GetMatch( strSequence, 00112 sMotif.m_iMotif, 0, sCache ); } } 00113 if( iLength ) 00114 PCLScores.Get( iRow, iColumn ) /= iLength; } } 00115 00116 PCLScores.Save( cout ); 00117 return 0; } 00118 00119 for( i = 0; i < FASTA.GetGenes( ); ++i ) 00120 if( PCL.GetGene( FASTA.GetGene( i ) ) != -1 ) { 00121 setiGenes.insert( i ); 00122 setiForeground.insert( i ); } 00123 else if( PCLBackground.GetGenes( ) ) { 00124 if( PCLBackground.GetGene( FASTA.GetGene( i ) ) != -1 ) 00125 setiGenes.insert( i ); } 00126 else 00127 setiGenes.insert( i ); 00128 MatSums.Initialize( FASTA.GetTypes( ).size( ), 2 ); 00129 MatSums.Clear( ); 00130 MatSumSqs.Initialize( FASTA.GetTypes( ).size( ), 2 ); 00131 MatSumSqs.Clear( ); 00132 MatCounts.Initialize( FASTA.GetTypes( ).size( ), 2 ); 00133 MatCounts.Clear( ); 00134 MatHMMs.Initialize( FASTA.GetTypes( ).size( ), 2 ); 00135 for( i = 0; i < MatHMMs.GetRows( ); ++i ) 00136 for( j = 0; j < MatHMMs.GetColumns( ); ++j ) 00137 MatHMMs.Get( i, j ).Open( sArgs.degree_arg, c_acNucleotides ); 00138 for( i = 0; i < FASTA.GetGenes( ); ++i ) { 00139 vector<SFASTASequence> vecsSequences; 00140 00141 if( setiGenes.find( i ) == setiGenes.end( ) ) 00142 continue; 00143 if( !FASTA.Get( i, vecsSequences ) ) 00144 return 1; 00145 iColumn = ( setiForeground.find( i ) == setiForeground.end( ) ) ? 1 : 0; 00146 for( j = 0; j < vecsSequences.size( ); ++j ) { 00147 const SFASTASequence& sSequence = vecsSequences[ j ]; 00148 00149 if( ( iterType = mapstriTypes.find( sSequence.m_strType ) ) == mapstriTypes.end( ) ) { 00150 iRow = mapstriTypes.size( ); 00151 mapstriTypes[ sSequence.m_strType ] = iRow; } 00152 else 00153 iRow = iterType->second; 00154 MatCounts.Get( iRow, iColumn )++; 00155 for( iLength = k = 0; k < sSequence.m_vecstrSequences.size( ); ++k ) { 00156 const string& strSequence = sSequence.m_vecstrSequences[ k ]; 00157 00158 iLength += strSequence.length( ); 00159 MatHMMs.Get( iRow, iColumn ).Add( strSequence ); } 00160 MatSums.Get( iRow, iColumn ) += iLength; 00161 MatSumSqs.Get( iRow, iColumn ) += iLength * iLength; } } 00162 00163 for( iterType = mapstriTypes.begin( ); iterType != mapstriTypes.end( ); ++iterType ) { 00164 cout << iterType->first; 00165 for( i = 0; i < MatCounts.GetColumns( ); ++i ) { 00166 iCount = max( (size_t)1, MatCounts.Get( iterType->second, i ) ); 00167 dAve = (float)MatSums.Get( iterType->second, i ) / iCount; 00168 dStd = sqrt( ( (float)MatSumSqs.Get( iterType->second, i ) / iCount ) - ( dAve * dAve ) ); 00169 cout << '\t' << dAve << '\t' << dStd; } 00170 cout << endl; 00171 00172 for( i = 0; i < MatHMMs.GetColumns( ); ++i ) 00173 MatHMMs.Get( iterType->second, i ).Save( cout ); } 00174 return 0; } 00175 00176 if( sArgs.genes_arg ) { 00177 CGenome Genome; 00178 CGenes Genes( Genome ); 00179 00180 if( !Genes.Open( sArgs.genes_arg ) ) { 00181 cerr << "Could not open: " << sArgs.genes_arg << endl; 00182 return 1; } 00183 for( i = 0; i < Genes.GetGenes( ); ++i ) 00184 if( ( j = PCL.GetGene( Genes.GetGene( i ).GetName( ) ) ) != -1 ) 00185 setiGenes.insert( j ); } 00186 00187 vecdSumsIn.resize( PCL.GetExperiments( ) ); 00188 vecdSumsOut.resize( PCL.GetExperiments( ) ); 00189 vecdSumSqsIn.resize( PCL.GetExperiments( ) ); 00190 vecdSumSqsOut.resize( PCL.GetExperiments( ) ); 00191 veciCountsIn.resize( PCL.GetExperiments( ) ); 00192 veciCountsOut.resize( PCL.GetExperiments( ) ); 00193 for( i = 0; i < PCL.GetGenes( ); ++i ) { 00194 bool fIn = sArgs.background_arg || 00195 ( sArgs.genes_arg && ( setiGenes.find( i ) != setiGenes.end( ) ) ) || 00196 ( PCL.GetFeatures( ) && ( PCL.GetFeature( i, 1 )[ 0 ] == '*' ) ); 00197 vector<float>& vecdSums = fIn ? vecdSumsIn : vecdSumsOut; 00198 vector<float>& vecdSumSqs = fIn ? vecdSumSqsIn : vecdSumSqsOut; 00199 vector<size_t>& veciCounts = fIn ? veciCountsIn : veciCountsOut; 00200 00201 for( j = 0; j < PCL.GetExperiments( ); ++j ) 00202 if( !CMeta::IsNaN( d = PCL.Get( i, j ) ) ) { 00203 veciCounts[ j ]++; 00204 vecdSums[ j ] += d; 00205 vecdSumSqs[ j ] += d * d; } } 00206 00207 if( sArgs.background_arg ) { 00208 vector<size_t> veciExperiments; 00209 map<string, size_t> mapstriExperiments; 00210 map<string, size_t>::const_iterator iterExperiment; 00211 00212 if( PCLBackground.GetExperiments( ) != PCL.GetExperiments( ) ) { 00213 cerr << "Inconsistent experiments: " << PCL.GetExperiments( ) << " vs. " << 00214 PCLBackground.GetExperiments( ) << endl; 00215 return 1; } 00216 00217 setiGenes.clear( ); 00218 for( i = 0; i < PCL.GetGenes( ); ++i ) 00219 if( ( j = PCLBackground.GetGene( PCL.GetGene( i ) ) ) != -1 ) 00220 setiGenes.insert( j ); 00221 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00222 mapstriExperiments[ PCLBackground.GetExperiment( i ) ] = i; 00223 veciExperiments.resize( PCL.GetExperiments( ) ); 00224 for( i = 0; i < veciExperiments.size( ); ++i ) { 00225 const string& strExperiment = PCL.GetExperiment( i ); 00226 00227 if( ( iterExperiment = mapstriExperiments.find( ( strExperiment[ 0 ] == '*' ) ? 00228 strExperiment.substr( 1 ) : strExperiment ) ) == mapstriExperiments.end( ) ) { 00229 cerr << "Could not match experiment " << i << ": " << strExperiment << endl; 00230 return 1; } 00231 veciExperiments[ i ] = iterExperiment->second; } 00232 00233 for( i = 0; i < PCLBackground.GetGenes( ); ++i ) { 00234 if( setiGenes.find( i ) != setiGenes.end( ) ) 00235 continue; 00236 for( j = 0; j < veciExperiments.size( ); ++j ) 00237 if( !CMeta::IsNaN( d = PCLBackground.Get( i, veciExperiments[ j ] ) ) ) { 00238 veciCountsOut[ j ]++; 00239 vecdSumsOut[ j ] += d; 00240 vecdSumSqsOut[ j ] += d * d; } } } 00241 00242 for( i = 0; i < PCL.GetExperiments( ); ++i ) { 00243 if( j = veciCountsIn[ i ] ) { 00244 d = ( vecdSumsIn[ i ] /= j ); 00245 vecdSumSqsIn[ i ] = sqrt( ( vecdSumSqsIn[ i ] / j ) - ( d * d ) ); } 00246 if( j = veciCountsOut[ i ] ) { 00247 d = ( vecdSumsOut[ i ] /= j ); 00248 vecdSumSqsOut[ i ] = sqrt( ( vecdSumSqsOut[ i ] / j ) - ( d * d ) ); } } 00249 00250 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00251 cout << ( i ? "\t" : "" ) << PCL.GetExperiment( i ); 00252 cout << endl; 00253 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00254 cout << ( i ? "\t" : "" ) << vecdSumsIn[ i ]; 00255 cout << endl; 00256 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00257 cout << ( i ? "\t" : "" ) << vecdSumSqsIn[ i ]; 00258 cout << endl; 00259 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00260 cout << ( i ? "\t" : "" ) << vecdSumsOut[ i ]; 00261 cout << endl; 00262 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00263 cout << ( i ? "\t" : "" ) << vecdSumSqsOut[ i ]; 00264 cout << endl; 00265 00266 00267 return 0; }