Sleipnir
tools/Synthesizer/Synthesizer.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_acAlphabetExons[] = "ACGT";
00026 static const char   c_acAlphabetTotal[] = "ACGTacgt";
00027 
00028 struct STF {
00029     void Initialize( double dAverage, double dStdev, size_t iGenes, double dPGene, size_t iConditions,
00030         double dPCondition, size_t iMin, size_t iMax ) {
00031         size_t  i, iLength;
00032 
00033         m_strBS = "";
00034         iLength = ( rand( ) % ( iMax - iMin ) ) + iMin;
00035         for( i = 0; i < iLength; ++i )
00036             m_strBS += c_acAlphabetExons[ rand( ) % ( ARRAYSIZE(c_acAlphabetExons) - 1 ) ];
00037         m_dActivity = (float)( ( CStatistics::SampleNormalStandard( ) * dStdev ) + dAverage );
00038         if( ( (float)rand( ) / RAND_MAX ) < 0.5 )
00039             m_dActivity *= -1;
00040 
00041         m_veciGenes.reserve( (size_t)( iGenes * dPGene + 1 ) );
00042         for( i = 0; i < iGenes; ++i )
00043             if( ( (float)rand( ) / RAND_MAX ) < dPGene )
00044                 m_veciGenes.push_back( i );
00045         m_veciConditions.reserve( (size_t)( iConditions * dPCondition + 1 ) );
00046         for( i = 0; i < iConditions; ++i )
00047             if( ( (float)rand( ) / RAND_MAX ) < dPCondition )
00048                 m_veciConditions.push_back( i ); }
00049 
00050     void Save( ostream& ostm ) const {
00051         size_t  i;
00052 
00053         ostm << m_strBS << endl;
00054         ostm << m_dActivity << endl;
00055         for( i = 0; i < m_veciGenes.size( ); ++i )
00056             ostm << ( i ? "\t" : "" ) << m_veciGenes[ i ];
00057         ostm << endl;
00058         for( i = 0; i < m_veciConditions.size( ); ++i )
00059             ostm << ( i ? "\t" : "" ) << m_veciConditions[ i ];
00060         ostm << endl; }
00061 
00062     string          m_strBS;
00063     float           m_dActivity;
00064     vector<size_t>  m_veciGenes;
00065     vector<size_t>  m_veciConditions;
00066 };
00067 
00068 int main( int iArgs, char** aszArgs ) {
00069     static const size_t                 c_iBuffer   = 16;
00070     gengetopt_args_info                 sArgs;
00071     CPCL                                PCL;
00072     vector<string>                      vecstrGenes, vecstrExperiments, vecstrFeatures;
00073     size_t                              i, j, k, iCopies;
00074     char                                acBuffer[ c_iBuffer ];
00075     vector<STF>                         vecsTFs;
00076     vector<CHMM>                        vecHMMs;
00077     CFASTA                              FASTA;
00078     ofstream                            ofsm;
00079     map<string, size_t>                 mapstriTypes;
00080     map<string, size_t>::const_iterator iterType;
00081     string                              strSequence;
00082 
00083     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00084         cmdline_parser_print_help( );
00085         return 1; }
00086 #ifdef WIN32
00087     pthread_win32_process_attach_np( );
00088 #endif // WIN32
00089     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00090 
00091     cout << "#  genes " << sArgs.genes_arg << " conditions " << sArgs.conditions_arg << endl;
00092     cout << "#  tfs " << sArgs.tfs_arg << " tf_gene " << sArgs.tf_gene_arg << " tf_condition " <<
00093         sArgs.tf_condition_arg << " tf_min " << sArgs.tf_min_arg << "   tf_max " << sArgs.tf_max_arg << endl;
00094     cout << "#  mean " << sArgs.mean_arg << "   stdev " << sArgs.stdev_arg << " tf_mean " << sArgs.tf_mean_arg <<
00095         "   tf_stdev " << sArgs.tf_stdev_arg << endl;
00096     cout << "#  fasta " << ( sArgs.fasta_arg ? sArgs.fasta_arg : "none" ) << "  degree " << sArgs.degree_arg <<
00097         "   seq_min " << sArgs.seq_min_arg << " seq_max " << sArgs.seq_max_arg << " tf_copm " <<
00098         sArgs.tf_copm_arg << "  tf_copx " << sArgs.tf_copx_arg << " tf_types " << ( sArgs.tf_types_arg ?
00099         sArgs.tf_types_arg : "all" ) << endl;
00100 
00101     vecsTFs.resize( sArgs.tfs_arg );
00102     for( i = 0; i < vecsTFs.size( ); ++i ) {
00103         vecsTFs[ i ].Initialize( sArgs.tf_mean_arg, sArgs.tf_stdev_arg, sArgs.genes_arg, sArgs.tf_gene_arg,
00104             sArgs.conditions_arg, sArgs.tf_condition_arg, sArgs.tf_min_arg, sArgs.tf_max_arg );
00105         vecsTFs[ i ].Save( cout ); }
00106 
00107     vecstrGenes.resize( sArgs.genes_arg );
00108     for( i = 0; i < vecstrGenes.size( ); ++i ) {
00109         sprintf_s( acBuffer, "G%07d", i );
00110         vecstrGenes[ i ] = acBuffer; }
00111     vecstrExperiments.resize( sArgs.conditions_arg );
00112     for( i = 0; i < vecstrExperiments.size( ); ++i ) {
00113         sprintf_s( acBuffer, "E%07d", i );
00114         vecstrExperiments[ i ] = acBuffer; }
00115     vecstrFeatures.push_back( "GID" );
00116     vecstrFeatures.push_back( "TFS" );
00117     vecstrFeatures.push_back( "GWEIGHT" );
00118     PCL.Open( vecstrGenes, vecstrExperiments, vecstrFeatures );
00119 
00120     for( i = 0; i < PCL.GetGenes( ); ++i ) {
00121         PCL.SetFeature( i, 2, "1" );
00122         for( j = 0; j < PCL.GetExperiments( ); ++j )
00123             PCL.Set( i, j, (float)( ( CStatistics::SampleNormalStandard( ) * sArgs.stdev_arg ) +
00124                 sArgs.mean_arg ) ); }
00125     for( i = 0; i < vecsTFs.size( ); ++i )
00126         for( j = 0; j < vecsTFs[ i ].m_veciGenes.size( ); ++j ) {
00127             size_t          iGene   = vecsTFs[ i ].m_veciGenes[ j ];
00128             ostringstream   ossm;
00129 
00130             ossm << PCL.GetFeature( iGene, 1 );
00131             if( !ossm.str( ).empty( ) )
00132                 ossm << ", ";
00133             ossm << i << '@' << vecsTFs[ i ].m_dActivity;
00134             PCL.SetFeature( iGene, 1, ossm.str( ) );
00135             for( k = 0; k < vecsTFs[ i ].m_veciConditions.size( ); ++k )
00136                 PCL.Get( iGene, vecsTFs[ i ].m_veciConditions[ k ] ) +=
00137                     (float)( ( CStatistics::SampleNormalStandard( ) * sArgs.stdev_arg ) +
00138                     vecsTFs[ i ].m_dActivity ); }
00139     if( sArgs.output_pcl_arg ) {
00140         ofsm.open( sArgs.output_pcl_arg );
00141         if( !ofsm.is_open( ) ) {
00142             cerr << "Could not open: " << sArgs.output_pcl_arg << endl;
00143             return 1; }
00144         PCL.Save( ofsm );
00145         ofsm.close( ); }
00146 
00147     if( sArgs.fasta_arg ) {
00148         set<string>::const_iterator iterType;
00149 
00150         if( !FASTA.Open( sArgs.fasta_arg ) ) {
00151             cerr << "Could not open: " << sArgs.fasta_arg << endl;
00152             return 1; }
00153         vecHMMs.resize( FASTA.GetTypes( ).size( ) );
00154         for( i = 0; i < vecHMMs.size( ); ++i )
00155             vecHMMs[ i ].Open( sArgs.degree_arg, c_acAlphabetTotal );
00156         for( iterType = FASTA.GetTypes( ).begin( ); iterType != FASTA.GetTypes( ).end( ); ++iterType ) {
00157 // This is insane - somehow the compiler optimzation horks this if you do it on one line
00158             i = mapstriTypes.size( );
00159             mapstriTypes[ *iterType ] = i; }
00160         for( i = 0; i < FASTA.GetGenes( ); ++i ) {
00161             vector<SFASTASequence>  vecsSequences;
00162 
00163             if( FASTA.Get( i, vecsSequences ) )
00164                 for( j = 0; j < vecsSequences.size( ); ++j ) {
00165                     const SFASTASequence&   sSequence   = vecsSequences[ j ];
00166                     string                  strCur, strSequence;
00167 
00168                     for( k = 0; k < sSequence.m_vecstrSequences.size( ); ++k ) {
00169                         strCur = sSequence.m_vecstrSequences[ k ];
00170                         if( sSequence.m_fIntronFirst == !( k % 2 ) )
00171                             transform( strCur.begin( ), strCur.end( ), strCur.begin( ), ::tolower );
00172                         strSequence += strCur; }
00173                     if( !vecHMMs[ mapstriTypes[ sSequence.m_strType ] ].Add( strSequence ) ) {
00174                         cerr << "Could not add sequence: " << strSequence << endl;
00175                         return 1; } } } }
00176     else {
00177         vecHMMs.resize( 1 );
00178         vecHMMs[ 0 ].Open( sArgs.degree_arg, c_acAlphabetExons );
00179         vecHMMs[ 0 ].SetUniform( );
00180         mapstriTypes[ "" ] = 0; }
00181 
00182     if( sArgs.output_fasta_arg ) {
00183         vector<string>          vecstrTypes;
00184         vector<vector<size_t> > vecveciTFs;
00185 
00186         if( sArgs.tf_types_arg )
00187             CMeta::Tokenize( sArgs.tf_types_arg, vecstrTypes, "," );
00188         else {
00189             vecstrTypes.resize( mapstriTypes.size( ) );
00190             for( i = 0,iterType = mapstriTypes.begin( ); iterType != mapstriTypes.end( ); ++i,++iterType )
00191                 vecstrTypes[ i ] = iterType->first; }
00192         vecveciTFs.resize( PCL.GetGenes( ) );
00193         for( i = 0; i < vecsTFs.size( ); ++i )
00194             for( j = 0; j < vecsTFs[ i ].m_veciGenes.size( ); ++j )
00195                 vecveciTFs[ vecsTFs[ i ].m_veciGenes[ j ] ].push_back( i );
00196         ofsm.clear( );
00197         ofsm.open( sArgs.output_fasta_arg );
00198         if( !ofsm.is_open( ) ) {
00199             cerr << "Could not open: " << sArgs.output_fasta_arg << endl;
00200             return 1; }
00201         for( iterType = mapstriTypes.begin( ); iterType != mapstriTypes.end( ); ++iterType ) {
00202             const CHMM& HMM         = vecHMMs[ iterType->second ];
00203             string      strHeader;
00204             bool        fTFs;
00205 
00206             fTFs = ( find( vecstrTypes.begin( ), vecstrTypes.end( ), iterType->first ) !=
00207                 vecstrTypes.end( ) );
00208             cerr << "Generating " << iterType->first << " sequence with" << ( fTFs ? "" : "out" ) <<
00209                 " TF sequences..." << endl;
00210             if( !iterType->first.empty( ) )
00211                 strHeader = strHeader + '\t' + iterType->first;
00212             for( i = 0; i < PCL.GetGenes( ); ++i ) {
00213                 ofsm << "> " << PCL.GetGene( i ) << strHeader << endl;
00214                 j = ( sArgs.seq_max_arg <= sArgs.seq_min_arg ) ? sArgs.seq_min_arg :
00215                     ( ( rand( ) % ( sArgs.seq_max_arg - sArgs.seq_min_arg ) ) + sArgs.seq_min_arg );
00216                 strSequence = HMM.Get( j );
00217                 if( fTFs )
00218                     for( j = 0; j < vecveciTFs[ i ].size( ); ++j ) {
00219                         const STF&  sTF = vecsTFs[ vecveciTFs[ i ][ j ] ];
00220 
00221                         iCopies = ( rand( ) % ( sArgs.tf_copx_arg - sArgs.tf_copm_arg ) ) + sArgs.tf_copm_arg;
00222                         for( k = 0; k < iCopies; ++k ) {
00223                             string  strBS   = sTF.m_strBS;
00224                             size_t  iTarget = rand( ) % ( strSequence.length( ) - strBS.length( ) );
00225 
00226                             if( islower( strSequence[ iTarget ] ) )
00227                                 transform( strBS.begin( ), strBS.end( ), strBS.begin( ), ::tolower );
00228                             strSequence.replace( iTarget, strBS.length( ), strBS ); } }
00229                 for( j = 0; j < strSequence.length( ); j += sArgs.wrap_arg )
00230                     ofsm << strSequence.substr( j, sArgs.wrap_arg ) << endl; } }
00231         ofsm.close( ); }
00232 
00233 #ifdef WIN32
00234     pthread_win32_process_detach_np( );
00235 #endif // WIN32
00236     return 0; }