Sleipnir
tools/PCLPlotter/PCLPlotter.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_acNucleotides[]   = "ACGT";
00026 
00027 int main( int iArgs, char** aszArgs ) {
00028     gengetopt_args_info sArgs;
00029     CPCL                PCL, PCLBackground;
00030     vector<float>       vecdSumsIn, vecdSumSqsIn, vecdSumsOut, vecdSumSqsOut;
00031     vector<size_t>      veciCountsIn, veciCountsOut;
00032     size_t              i, j, k;
00033     float               d;
00034     set<size_t>         setiGenes;
00035 
00036     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00037         cmdline_parser_print_help( );
00038         return 1; }
00039     CMeta Meta( sArgs.verbosity_arg );
00040 
00041     if( !PCL.Open( sArgs.input_arg, sArgs.skip_arg ) ) {
00042         cerr << "Could not open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) << endl;
00043         return 1; }
00044     if( sArgs.background_arg && !PCLBackground.Open( sArgs.background_arg, sArgs.skip_arg ) ) {
00045         cerr << "Could not open: " << sArgs.background_arg << endl;
00046         return 1; }
00047 
00048     if( sArgs.fasta_arg ) {
00049         CFASTA                              FASTA;
00050         map<string, size_t>                 mapstriTypes;
00051         map<string, size_t>::const_iterator iterType;
00052         size_t                              iRow, iColumn, iLength, iCount;
00053         set<size_t>                         setiForeground;
00054         CFullMatrix<size_t>                 MatSums, MatSumSqs, MatCounts;
00055         CFullMatrix<CHMM>                   MatHMMs;
00056         float                               dAve, dStd;
00057 
00058         if( !FASTA.Open( sArgs.fasta_arg ) ) {
00059             cerr << "Could not open: " << sArgs.fasta_arg << endl;
00060             return 1; }
00061 
00062         if( sArgs.motifs_arg ) {
00063             ifstream                ifsm;
00064             vector<SMotifMatch>     vecsMotifs;
00065             CCoalesceMotifLibrary   Motifs( sArgs.k_arg );
00066             SCoalesceModifiers      sModifiers;
00067             SCoalesceModifierCache  sCache( sModifiers );
00068             CPCL                    PCLScores;
00069             vector<string>          vecstrMotifs;
00070 
00071             ifsm.open( sArgs.motifs_arg );
00072             if( !CCoalesceMotifLibrary::Open( ifsm, vecsMotifs, &Motifs ) ) {
00073                 cerr << "Could not open: " << sArgs.motifs_arg << endl;
00074                 return 1; }
00075             ifsm.close( );
00076 
00077             vecstrMotifs.resize( vecsMotifs.size( ) );
00078             for( i = 0; i < vecstrMotifs.size( ); ++i )
00079                 vecstrMotifs[ i ] = Motifs.GetMotif( vecsMotifs[ i ].m_iMotif );
00080             PCLScores.Open( PCL.GetGeneNames( ), vecstrMotifs, vector<string>( ) );
00081             PCLScores.Clear( );
00082 
00083             for( iRow = 0; iRow < PCL.GetGenes( ); ++iRow ) {
00084                 vector<SFASTASequence>  vecsSequences;
00085 
00086                 if( !( iRow % 100 ) )
00087                     cerr << "Processing " << iRow << '/' << PCL.GetGenes( ) << endl;
00088                 if( ( i = FASTA.GetGene( PCL.GetGene( iRow ) ) ) == -1 ) {
00089                     for( i = 0; i < PCLScores.GetExperiments( ); ++i )
00090                         PCLScores.Set( iRow, i, CMeta::GetNaN( ) );
00091                     continue; }
00092                 if( !FASTA.Get( i, vecsSequences ) )
00093                     return 1;
00094                 for( iColumn = 0; iColumn < vecsMotifs.size( ); ++iColumn ) {
00095                     const SMotifMatch&  sMotif  = vecsMotifs[ iColumn ];
00096 
00097                     for( iLength = i = 0; i < vecsSequences.size( ); ++i ) {
00098                         const SFASTASequence&   sSequence   = vecsSequences[ i ];
00099 
00100                         if( sMotif.m_strType != sSequence.m_strType )
00101                             continue;
00102                         for( j = 0; j < sSequence.m_vecstrSequences.size( ); ++j ) {
00103                             const string&   strSequence = sSequence.m_vecstrSequences[ j ];
00104 
00105                             if( ( sMotif.m_eSubsequence != CCoalesceSequencerBase::ESubsequenceTotal ) &&
00106                                 ( sMotif.m_eSubsequence != ( ( sSequence.m_fIntronFirst == !( j % 2 ) ) ?
00107                                 CCoalesceSequencerBase::ESubsequenceIntrons :
00108                                 CCoalesceSequencerBase::ESubsequenceExons ) ) )
00109                                 continue;
00110                             iLength += strSequence.size( );
00111                             PCLScores.Get( iRow, iColumn ) += Motifs.GetMatch( strSequence,
00112                                 sMotif.m_iMotif, 0, sCache ); } }
00113                     if( iLength )
00114                         PCLScores.Get( iRow, iColumn ) /= iLength; } }
00115 
00116             PCLScores.Save( cout );
00117             return 0; }
00118 
00119         for( i = 0; i < FASTA.GetGenes( ); ++i )
00120             if( PCL.GetGene( FASTA.GetGene( i ) ) != -1 ) {
00121                 setiGenes.insert( i );
00122                 setiForeground.insert( i ); }
00123             else if( PCLBackground.GetGenes( ) ) {
00124                 if( PCLBackground.GetGene( FASTA.GetGene( i ) ) != -1 )
00125                     setiGenes.insert( i ); }
00126             else
00127                 setiGenes.insert( i );
00128         MatSums.Initialize( FASTA.GetTypes( ).size( ), 2 );
00129         MatSums.Clear( );
00130         MatSumSqs.Initialize( FASTA.GetTypes( ).size( ), 2 );
00131         MatSumSqs.Clear( );
00132         MatCounts.Initialize( FASTA.GetTypes( ).size( ), 2 );
00133         MatCounts.Clear( );
00134         MatHMMs.Initialize( FASTA.GetTypes( ).size( ), 2 );
00135         for( i = 0; i < MatHMMs.GetRows( ); ++i )
00136             for( j = 0; j < MatHMMs.GetColumns( ); ++j )
00137                 MatHMMs.Get( i, j ).Open( sArgs.degree_arg, c_acNucleotides );
00138         for( i = 0; i < FASTA.GetGenes( ); ++i ) {
00139             vector<SFASTASequence>  vecsSequences;
00140 
00141             if( setiGenes.find( i ) == setiGenes.end( ) )
00142                 continue;
00143             if( !FASTA.Get( i, vecsSequences ) )
00144                 return 1;
00145             iColumn = ( setiForeground.find( i ) == setiForeground.end( ) ) ? 1 : 0;
00146             for( j = 0; j < vecsSequences.size( ); ++j ) {
00147                 const SFASTASequence&   sSequence   = vecsSequences[ j ];
00148 
00149                 if( ( iterType = mapstriTypes.find( sSequence.m_strType ) ) == mapstriTypes.end( ) ) {
00150                     iRow = mapstriTypes.size( );
00151                     mapstriTypes[ sSequence.m_strType ] = iRow; }
00152                 else
00153                     iRow = iterType->second;
00154                 MatCounts.Get( iRow, iColumn )++;
00155                 for( iLength = k = 0; k < sSequence.m_vecstrSequences.size( ); ++k ) {
00156                     const string&   strSequence = sSequence.m_vecstrSequences[ k ];
00157 
00158                     iLength += strSequence.length( );
00159                     MatHMMs.Get( iRow, iColumn ).Add( strSequence ); }
00160                 MatSums.Get( iRow, iColumn ) += iLength;
00161                 MatSumSqs.Get( iRow, iColumn ) += iLength * iLength; } }
00162 
00163         for( iterType = mapstriTypes.begin( ); iterType != mapstriTypes.end( ); ++iterType ) {
00164             cout << iterType->first;
00165             for( i = 0; i < MatCounts.GetColumns( ); ++i ) {
00166                 iCount = max( (size_t)1, MatCounts.Get( iterType->second, i ) );
00167                 dAve = (float)MatSums.Get( iterType->second, i ) / iCount;
00168                 dStd = sqrt( ( (float)MatSumSqs.Get( iterType->second, i ) / iCount ) - ( dAve * dAve ) );
00169                 cout << '\t' << dAve << '\t' << dStd; }
00170             cout << endl;
00171 
00172             for( i = 0; i < MatHMMs.GetColumns( ); ++i )
00173                 MatHMMs.Get( iterType->second, i ).Save( cout ); }
00174         return 0; }
00175 
00176     if( sArgs.genes_arg ) {
00177         CGenome Genome;
00178         CGenes  Genes( Genome );
00179 
00180         if( !Genes.Open( sArgs.genes_arg ) ) {
00181             cerr << "Could not open: " << sArgs.genes_arg << endl;
00182             return 1; }
00183         for( i = 0; i < Genes.GetGenes( ); ++i )
00184             if( ( j = PCL.GetGene( Genes.GetGene( i ).GetName( ) ) ) != -1 )
00185                 setiGenes.insert( j ); }
00186 
00187     vecdSumsIn.resize( PCL.GetExperiments( ) );
00188     vecdSumsOut.resize( PCL.GetExperiments( ) );
00189     vecdSumSqsIn.resize( PCL.GetExperiments( ) );
00190     vecdSumSqsOut.resize( PCL.GetExperiments( ) );
00191     veciCountsIn.resize( PCL.GetExperiments( ) );
00192     veciCountsOut.resize( PCL.GetExperiments( ) );
00193     for( i = 0; i < PCL.GetGenes( ); ++i ) {
00194         bool            fIn         = sArgs.background_arg ||
00195                                         ( sArgs.genes_arg && ( setiGenes.find( i ) != setiGenes.end( ) ) ) ||
00196                                         ( PCL.GetFeatures( ) && ( PCL.GetFeature( i, 1 )[ 0 ] == '*' ) );
00197         vector<float>&  vecdSums    = fIn ? vecdSumsIn : vecdSumsOut;
00198         vector<float>&  vecdSumSqs  = fIn ? vecdSumSqsIn : vecdSumSqsOut;
00199         vector<size_t>& veciCounts  = fIn ? veciCountsIn : veciCountsOut;
00200 
00201         for( j = 0; j < PCL.GetExperiments( ); ++j )
00202             if( !CMeta::IsNaN( d = PCL.Get( i, j ) ) ) {
00203                 veciCounts[ j ]++;
00204                 vecdSums[ j ] += d;
00205                 vecdSumSqs[ j ] += d * d; } }
00206 
00207     if( sArgs.background_arg ) {
00208         vector<size_t>                      veciExperiments;
00209         map<string, size_t>                 mapstriExperiments;
00210         map<string, size_t>::const_iterator iterExperiment;
00211 
00212         if( PCLBackground.GetExperiments( ) != PCL.GetExperiments( ) ) {
00213             cerr << "Inconsistent experiments: " << PCL.GetExperiments( ) << " vs. " <<
00214                 PCLBackground.GetExperiments( ) << endl;
00215             return 1; }
00216 
00217         setiGenes.clear( );
00218         for( i = 0; i < PCL.GetGenes( ); ++i )
00219             if( ( j = PCLBackground.GetGene( PCL.GetGene( i ) ) ) != -1 )
00220                 setiGenes.insert( j );
00221         for( i = 0; i < PCL.GetExperiments( ); ++i )
00222             mapstriExperiments[ PCLBackground.GetExperiment( i ) ] = i;
00223         veciExperiments.resize( PCL.GetExperiments( ) );
00224         for( i = 0; i < veciExperiments.size( ); ++i ) {
00225             const string&   strExperiment   = PCL.GetExperiment( i );
00226 
00227             if( ( iterExperiment = mapstriExperiments.find( ( strExperiment[ 0 ] == '*' ) ?
00228                 strExperiment.substr( 1 ) : strExperiment ) ) == mapstriExperiments.end( ) ) {
00229                 cerr << "Could not match experiment " << i << ": " << strExperiment << endl;
00230                 return 1; }
00231             veciExperiments[ i ] = iterExperiment->second; }
00232 
00233         for( i = 0; i < PCLBackground.GetGenes( ); ++i ) {
00234             if( setiGenes.find( i ) != setiGenes.end( ) )
00235                 continue;
00236             for( j = 0; j < veciExperiments.size( ); ++j )
00237                 if( !CMeta::IsNaN( d = PCLBackground.Get( i, veciExperiments[ j ] ) ) ) {
00238                     veciCountsOut[ j ]++;
00239                     vecdSumsOut[ j ] += d;
00240                     vecdSumSqsOut[ j ] += d * d; } } }
00241 
00242     for( i = 0; i < PCL.GetExperiments( ); ++i ) {
00243         if( j = veciCountsIn[ i ] ) {
00244             d = ( vecdSumsIn[ i ] /= j );
00245             vecdSumSqsIn[ i ] = sqrt( ( vecdSumSqsIn[ i ] / j ) - ( d * d ) ); }
00246         if( j = veciCountsOut[ i ] ) {
00247             d = ( vecdSumsOut[ i ] /= j );
00248             vecdSumSqsOut[ i ] = sqrt( ( vecdSumSqsOut[ i ] / j ) - ( d * d ) ); } }
00249 
00250     for( i = 0; i < PCL.GetExperiments( ); ++i )
00251         cout << ( i ? "\t" : "" ) << PCL.GetExperiment( i );
00252     cout << endl;
00253     for( i = 0; i < PCL.GetExperiments( ); ++i )
00254         cout << ( i ? "\t" : "" ) << vecdSumsIn[ i ];
00255     cout << endl;
00256     for( i = 0; i < PCL.GetExperiments( ); ++i )
00257         cout << ( i ? "\t" : "" ) << vecdSumSqsIn[ i ];
00258     cout << endl;
00259     for( i = 0; i < PCL.GetExperiments( ); ++i )
00260         cout << ( i ? "\t" : "" ) << vecdSumsOut[ i ];
00261     cout << endl;
00262     for( i = 0; i < PCL.GetExperiments( ); ++i )
00263         cout << ( i ? "\t" : "" ) << vecdSumSqsOut[ i ];
00264     cout << endl;
00265 
00266 
00267     return 0; }