Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_acAlphabetExons[] = "ACGT"; 00026 static const char c_acAlphabetTotal[] = "ACGTacgt"; 00027 00028 struct STF { 00029 void Initialize( double dAverage, double dStdev, size_t iGenes, double dPGene, size_t iConditions, 00030 double dPCondition, size_t iMin, size_t iMax ) { 00031 size_t i, iLength; 00032 00033 m_strBS = ""; 00034 iLength = ( rand( ) % ( iMax - iMin ) ) + iMin; 00035 for( i = 0; i < iLength; ++i ) 00036 m_strBS += c_acAlphabetExons[ rand( ) % ( ARRAYSIZE(c_acAlphabetExons) - 1 ) ]; 00037 m_dActivity = (float)( ( CStatistics::SampleNormalStandard( ) * dStdev ) + dAverage ); 00038 if( ( (float)rand( ) / RAND_MAX ) < 0.5 ) 00039 m_dActivity *= -1; 00040 00041 m_veciGenes.reserve( (size_t)( iGenes * dPGene + 1 ) ); 00042 for( i = 0; i < iGenes; ++i ) 00043 if( ( (float)rand( ) / RAND_MAX ) < dPGene ) 00044 m_veciGenes.push_back( i ); 00045 m_veciConditions.reserve( (size_t)( iConditions * dPCondition + 1 ) ); 00046 for( i = 0; i < iConditions; ++i ) 00047 if( ( (float)rand( ) / RAND_MAX ) < dPCondition ) 00048 m_veciConditions.push_back( i ); } 00049 00050 void Save( ostream& ostm ) const { 00051 size_t i; 00052 00053 ostm << m_strBS << endl; 00054 ostm << m_dActivity << endl; 00055 for( i = 0; i < m_veciGenes.size( ); ++i ) 00056 ostm << ( i ? "\t" : "" ) << m_veciGenes[ i ]; 00057 ostm << endl; 00058 for( i = 0; i < m_veciConditions.size( ); ++i ) 00059 ostm << ( i ? "\t" : "" ) << m_veciConditions[ i ]; 00060 ostm << endl; } 00061 00062 string m_strBS; 00063 float m_dActivity; 00064 vector<size_t> m_veciGenes; 00065 vector<size_t> m_veciConditions; 00066 }; 00067 00068 int main( int iArgs, char** aszArgs ) { 00069 static const size_t c_iBuffer = 16; 00070 gengetopt_args_info sArgs; 00071 CPCL PCL; 00072 vector<string> vecstrGenes, vecstrExperiments, vecstrFeatures; 00073 size_t i, j, k, iCopies; 00074 char acBuffer[ c_iBuffer ]; 00075 vector<STF> vecsTFs; 00076 vector<CHMM> vecHMMs; 00077 CFASTA FASTA; 00078 ofstream ofsm; 00079 map<string, size_t> mapstriTypes; 00080 map<string, size_t>::const_iterator iterType; 00081 string strSequence; 00082 00083 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00084 cmdline_parser_print_help( ); 00085 return 1; } 00086 #ifdef WIN32 00087 pthread_win32_process_attach_np( ); 00088 #endif // WIN32 00089 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00090 00091 cout << "# genes " << sArgs.genes_arg << " conditions " << sArgs.conditions_arg << endl; 00092 cout << "# tfs " << sArgs.tfs_arg << " tf_gene " << sArgs.tf_gene_arg << " tf_condition " << 00093 sArgs.tf_condition_arg << " tf_min " << sArgs.tf_min_arg << " tf_max " << sArgs.tf_max_arg << endl; 00094 cout << "# mean " << sArgs.mean_arg << " stdev " << sArgs.stdev_arg << " tf_mean " << sArgs.tf_mean_arg << 00095 " tf_stdev " << sArgs.tf_stdev_arg << endl; 00096 cout << "# fasta " << ( sArgs.fasta_arg ? sArgs.fasta_arg : "none" ) << " degree " << sArgs.degree_arg << 00097 " seq_min " << sArgs.seq_min_arg << " seq_max " << sArgs.seq_max_arg << " tf_copm " << 00098 sArgs.tf_copm_arg << " tf_copx " << sArgs.tf_copx_arg << " tf_types " << ( sArgs.tf_types_arg ? 00099 sArgs.tf_types_arg : "all" ) << endl; 00100 00101 vecsTFs.resize( sArgs.tfs_arg ); 00102 for( i = 0; i < vecsTFs.size( ); ++i ) { 00103 vecsTFs[ i ].Initialize( sArgs.tf_mean_arg, sArgs.tf_stdev_arg, sArgs.genes_arg, sArgs.tf_gene_arg, 00104 sArgs.conditions_arg, sArgs.tf_condition_arg, sArgs.tf_min_arg, sArgs.tf_max_arg ); 00105 vecsTFs[ i ].Save( cout ); } 00106 00107 vecstrGenes.resize( sArgs.genes_arg ); 00108 for( i = 0; i < vecstrGenes.size( ); ++i ) { 00109 sprintf_s( acBuffer, "G%07d", i ); 00110 vecstrGenes[ i ] = acBuffer; } 00111 vecstrExperiments.resize( sArgs.conditions_arg ); 00112 for( i = 0; i < vecstrExperiments.size( ); ++i ) { 00113 sprintf_s( acBuffer, "E%07d", i ); 00114 vecstrExperiments[ i ] = acBuffer; } 00115 vecstrFeatures.push_back( "GID" ); 00116 vecstrFeatures.push_back( "TFS" ); 00117 vecstrFeatures.push_back( "GWEIGHT" ); 00118 PCL.Open( vecstrGenes, vecstrExperiments, vecstrFeatures ); 00119 00120 for( i = 0; i < PCL.GetGenes( ); ++i ) { 00121 PCL.SetFeature( i, 2, "1" ); 00122 for( j = 0; j < PCL.GetExperiments( ); ++j ) 00123 PCL.Set( i, j, (float)( ( CStatistics::SampleNormalStandard( ) * sArgs.stdev_arg ) + 00124 sArgs.mean_arg ) ); } 00125 for( i = 0; i < vecsTFs.size( ); ++i ) 00126 for( j = 0; j < vecsTFs[ i ].m_veciGenes.size( ); ++j ) { 00127 size_t iGene = vecsTFs[ i ].m_veciGenes[ j ]; 00128 ostringstream ossm; 00129 00130 ossm << PCL.GetFeature( iGene, 1 ); 00131 if( !ossm.str( ).empty( ) ) 00132 ossm << ", "; 00133 ossm << i << '@' << vecsTFs[ i ].m_dActivity; 00134 PCL.SetFeature( iGene, 1, ossm.str( ) ); 00135 for( k = 0; k < vecsTFs[ i ].m_veciConditions.size( ); ++k ) 00136 PCL.Get( iGene, vecsTFs[ i ].m_veciConditions[ k ] ) += 00137 (float)( ( CStatistics::SampleNormalStandard( ) * sArgs.stdev_arg ) + 00138 vecsTFs[ i ].m_dActivity ); } 00139 if( sArgs.output_pcl_arg ) { 00140 ofsm.open( sArgs.output_pcl_arg ); 00141 if( !ofsm.is_open( ) ) { 00142 cerr << "Could not open: " << sArgs.output_pcl_arg << endl; 00143 return 1; } 00144 PCL.Save( ofsm ); 00145 ofsm.close( ); } 00146 00147 if( sArgs.fasta_arg ) { 00148 set<string>::const_iterator iterType; 00149 00150 if( !FASTA.Open( sArgs.fasta_arg ) ) { 00151 cerr << "Could not open: " << sArgs.fasta_arg << endl; 00152 return 1; } 00153 vecHMMs.resize( FASTA.GetTypes( ).size( ) ); 00154 for( i = 0; i < vecHMMs.size( ); ++i ) 00155 vecHMMs[ i ].Open( sArgs.degree_arg, c_acAlphabetTotal ); 00156 for( iterType = FASTA.GetTypes( ).begin( ); iterType != FASTA.GetTypes( ).end( ); ++iterType ) { 00157 // This is insane - somehow the compiler optimzation horks this if you do it on one line 00158 i = mapstriTypes.size( ); 00159 mapstriTypes[ *iterType ] = i; } 00160 for( i = 0; i < FASTA.GetGenes( ); ++i ) { 00161 vector<SFASTASequence> vecsSequences; 00162 00163 if( FASTA.Get( i, vecsSequences ) ) 00164 for( j = 0; j < vecsSequences.size( ); ++j ) { 00165 const SFASTASequence& sSequence = vecsSequences[ j ]; 00166 string strCur, strSequence; 00167 00168 for( k = 0; k < sSequence.m_vecstrSequences.size( ); ++k ) { 00169 strCur = sSequence.m_vecstrSequences[ k ]; 00170 if( sSequence.m_fIntronFirst == !( k % 2 ) ) 00171 transform( strCur.begin( ), strCur.end( ), strCur.begin( ), ::tolower ); 00172 strSequence += strCur; } 00173 if( !vecHMMs[ mapstriTypes[ sSequence.m_strType ] ].Add( strSequence ) ) { 00174 cerr << "Could not add sequence: " << strSequence << endl; 00175 return 1; } } } } 00176 else { 00177 vecHMMs.resize( 1 ); 00178 vecHMMs[ 0 ].Open( sArgs.degree_arg, c_acAlphabetExons ); 00179 vecHMMs[ 0 ].SetUniform( ); 00180 mapstriTypes[ "" ] = 0; } 00181 00182 if( sArgs.output_fasta_arg ) { 00183 vector<string> vecstrTypes; 00184 vector<vector<size_t> > vecveciTFs; 00185 00186 if( sArgs.tf_types_arg ) 00187 CMeta::Tokenize( sArgs.tf_types_arg, vecstrTypes, "," ); 00188 else { 00189 vecstrTypes.resize( mapstriTypes.size( ) ); 00190 for( i = 0,iterType = mapstriTypes.begin( ); iterType != mapstriTypes.end( ); ++i,++iterType ) 00191 vecstrTypes[ i ] = iterType->first; } 00192 vecveciTFs.resize( PCL.GetGenes( ) ); 00193 for( i = 0; i < vecsTFs.size( ); ++i ) 00194 for( j = 0; j < vecsTFs[ i ].m_veciGenes.size( ); ++j ) 00195 vecveciTFs[ vecsTFs[ i ].m_veciGenes[ j ] ].push_back( i ); 00196 ofsm.clear( ); 00197 ofsm.open( sArgs.output_fasta_arg ); 00198 if( !ofsm.is_open( ) ) { 00199 cerr << "Could not open: " << sArgs.output_fasta_arg << endl; 00200 return 1; } 00201 for( iterType = mapstriTypes.begin( ); iterType != mapstriTypes.end( ); ++iterType ) { 00202 const CHMM& HMM = vecHMMs[ iterType->second ]; 00203 string strHeader; 00204 bool fTFs; 00205 00206 fTFs = ( find( vecstrTypes.begin( ), vecstrTypes.end( ), iterType->first ) != 00207 vecstrTypes.end( ) ); 00208 cerr << "Generating " << iterType->first << " sequence with" << ( fTFs ? "" : "out" ) << 00209 " TF sequences..." << endl; 00210 if( !iterType->first.empty( ) ) 00211 strHeader = strHeader + '\t' + iterType->first; 00212 for( i = 0; i < PCL.GetGenes( ); ++i ) { 00213 ofsm << "> " << PCL.GetGene( i ) << strHeader << endl; 00214 j = ( sArgs.seq_max_arg <= sArgs.seq_min_arg ) ? sArgs.seq_min_arg : 00215 ( ( rand( ) % ( sArgs.seq_max_arg - sArgs.seq_min_arg ) ) + sArgs.seq_min_arg ); 00216 strSequence = HMM.Get( j ); 00217 if( fTFs ) 00218 for( j = 0; j < vecveciTFs[ i ].size( ); ++j ) { 00219 const STF& sTF = vecsTFs[ vecveciTFs[ i ][ j ] ]; 00220 00221 iCopies = ( rand( ) % ( sArgs.tf_copx_arg - sArgs.tf_copm_arg ) ) + sArgs.tf_copm_arg; 00222 for( k = 0; k < iCopies; ++k ) { 00223 string strBS = sTF.m_strBS; 00224 size_t iTarget = rand( ) % ( strSequence.length( ) - strBS.length( ) ); 00225 00226 if( islower( strSequence[ iTarget ] ) ) 00227 transform( strBS.begin( ), strBS.end( ), strBS.begin( ), ::tolower ); 00228 strSequence.replace( iTarget, strBS.length( ), strBS ); } } 00229 for( j = 0; j < strSequence.length( ); j += sArgs.wrap_arg ) 00230 ofsm << strSequence.substr( j, sArgs.wrap_arg ) << endl; } } 00231 ofsm.close( ); } 00232 00233 #ifdef WIN32 00234 pthread_win32_process_detach_np( ); 00235 #endif // WIN32 00236 return 0; }