Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 enum EFile { 00026 EFilePCL, 00027 EFileWIG, 00028 EFileError 00029 }; 00030 00031 const char* c_aszMatchTypes[] = {"pvalue", "rmse", "js", NULL}; 00032 const SMotifMatch::EType c_aeMatchTypes[] = 00033 {SMotifMatch::ETypePValue, SMotifMatch::ETypeRMSE, SMotifMatch::ETypeJensenShannon, SMotifMatch::ETypePValue}; 00034 00035 int main_postprocess( const gengetopt_args_info&, CCoalesceMotifLibrary& ); 00036 bool recluster( const gengetopt_args_info&, size_t, CCoalesceMotifLibrary&, const CHierarchy&, 00037 const vector<CCoalesceCluster>&, const vector<string>&, const CPCL&, size_t& ); 00038 bool trim( const gengetopt_args_info&, const CPCL&, vector<CCoalesceCluster>& ); 00039 00040 EFile open_pclwig( const char* szFile, size_t iSkip, CFASTA& FASTA, CPCL& PCL ) { 00041 00042 if( FASTA.Open( szFile ) ) 00043 return EFilePCL; 00044 if( PCL.Open( szFile, iSkip ) ) 00045 return EFileWIG; 00046 00047 cerr << "Could not open: " << szFile << endl; 00048 return EFileError; } 00049 00050 int main( int iArgs, char** aszArgs ) { 00051 gengetopt_args_info sArgs; 00052 CFASTA FASTA; 00053 CPCL PCL, PCLSeed; 00054 CCoalesce Coalesce; 00055 vector<CCoalesceCluster> vecClusters; 00056 size_t i, j; 00057 set<string> setstrTypes; 00058 vector<CFASTA*> vecpFASTAs; 00059 00060 #ifdef WIN32 00061 pthread_win32_process_attach_np( ); 00062 #endif // WIN32 00063 00064 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00065 cmdline_parser_print_help( ); 00066 return 1; } 00067 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00068 00069 CCoalesceMotifLibrary Motifs( sArgs.k_arg ); 00070 Motifs.SetPenaltyGap( (float)sArgs.penalty_gap_arg ); 00071 Motifs.SetPenaltyMismatch( (float)sArgs.penalty_mismatch_arg ); 00072 00073 if( sArgs.postprocess_arg ) 00074 return main_postprocess( sArgs, Motifs ); 00075 00076 if( sArgs.sequences_arg ) { 00077 vector<string> vecstrTypes; 00078 00079 CMeta::Tokenize( sArgs.sequences_arg, vecstrTypes, "," ); 00080 for( i = 0; i < vecstrTypes.size( ); ++i ) 00081 setstrTypes.insert( vecstrTypes[ i ] ); } 00082 00083 if( !PCL.Open( sArgs.input_arg, sArgs.skip_arg ) ) { 00084 cerr << "Could not open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) << endl; 00085 return 1; } 00086 if( sArgs.fasta_arg && !FASTA.Open( sArgs.fasta_arg, setstrTypes ) ) { 00087 cerr << "Could not open: " << sArgs.fasta_arg << endl; 00088 return 1; } 00089 00090 if( sArgs.seed_arg && !PCLSeed.Open( sArgs.seed_arg, sArgs.skip_arg ) ) { 00091 cerr << "Could not open: " << sArgs.seed_arg << endl; 00092 return 1; } 00093 if( sArgs.datasets_arg ) { 00094 static const size_t c_iBuffer = 131072; 00095 ifstream ifsm; 00096 char acBuffer[ c_iBuffer ]; 00097 00098 ifsm.open( sArgs.datasets_arg ); 00099 if( !ifsm.is_open( ) ) { 00100 cerr << "Could not open: " << sArgs.datasets_arg << endl; 00101 return 1; } 00102 while( !ifsm.eof( ) ) { 00103 vector<string> vecstrLine; 00104 set<size_t> setiConditions; 00105 00106 ifsm.getline( acBuffer, c_iBuffer - 1 ); 00107 acBuffer[ c_iBuffer - 1 ] = 0; 00108 CMeta::Tokenize( CMeta::Trim( acBuffer ).c_str( ), vecstrLine ); 00109 for( i = 0; i < vecstrLine.size( ); ++i ) 00110 for( j = 0; j < PCL.GetExperiments( ); ++j ) 00111 if( vecstrLine[ i ] == PCL.GetExperiment( j ) ) { 00112 setiConditions.insert( j ); 00113 break; } 00114 Coalesce.AddDataset( setiConditions ); } } 00115 00116 Coalesce.SetMotifs( Motifs ); 00117 Coalesce.SetProbabilityGene( (float)sArgs.prob_gene_arg ); 00118 Coalesce.SetPValueCondition( (float)sArgs.pvalue_cond_arg ); 00119 Coalesce.SetZScoreCondition( (float)sArgs.zscore_cond_arg ); 00120 Coalesce.SetPValueMotif( (float)sArgs.pvalue_motif_arg ); 00121 Coalesce.SetZScoreMotif( (float)sArgs.zscore_motif_arg ); 00122 Coalesce.SetPValueCorrelation( (float)sArgs.pvalue_correl_arg ); 00123 Coalesce.SetNumberCorrelation( sArgs.number_correl_arg ); 00124 Coalesce.SetPValueMerge( (float)sArgs.pvalue_merge_arg ); 00125 Coalesce.SetCutoffMerge( (float)sArgs.cutoff_merge_arg ); 00126 Coalesce.SetBasesPerMatch( sArgs.bases_arg ); 00127 Coalesce.SetSizeMinimum( sArgs.size_minimum_arg ); 00128 Coalesce.SetSizeMerge( sArgs.size_merge_arg ); 00129 Coalesce.SetSizeMaximum( sArgs.size_maximum_arg ); 00130 Coalesce.SetNormalize( !!sArgs.normalize_flag ); 00131 Coalesce.SetThreads( sArgs.threads_arg ); 00132 Coalesce.SetSeed( PCLSeed ); 00133 if( sArgs.output_arg ) 00134 Coalesce.SetDirectoryIntermediate( sArgs.output_arg ); 00135 00136 vecpFASTAs.resize( sArgs.inputs_num ); 00137 for( i = 0; i < vecpFASTAs.size( ); ++i ) { 00138 vecpFASTAs[ i ] = new CFASTA( ); 00139 if( !vecpFASTAs[ i ]->Open( sArgs.inputs[ i ] ) ) { 00140 cerr << "Could not open: " << sArgs.inputs[ i ] << endl; 00141 return 1; } 00142 Coalesce.AddWiggle( *vecpFASTAs[ i ] ); } 00143 00144 if( sArgs.progressive_flag ) 00145 Coalesce.AddOutputIntermediate( cout ); 00146 if( !Coalesce.Cluster( PCL, FASTA, vecClusters ) ) { 00147 cerr << "Clustering failed" << endl; 00148 return 1; } 00149 00150 for( i = 0; i < vecpFASTAs.size( ); ++i ) 00151 delete vecpFASTAs[ i ]; 00152 if( !sArgs.progressive_flag ) 00153 for( i = 0; i < vecClusters.size( ); ++i ) 00154 vecClusters[ i ].Save( cout, i, PCL, &Motifs ); 00155 00156 #ifdef WIN32 00157 pthread_win32_process_detach_np( ); 00158 #endif // WIN32 00159 return 0; } 00160 00161 int main_postprocess( const gengetopt_args_info& sArgs, CCoalesceMotifLibrary& Motifs ) { 00162 string strDir, strFile, strBase; 00163 vector<CCoalesceCluster> vecClustersFrom, vecClustersTo; 00164 bool fFailed; 00165 CDistanceMatrix MatSim; 00166 size_t i, j, iDatasets; 00167 CHierarchy* pHier; 00168 vector<string> vecstrClusters; 00169 CPCL PCL; 00170 ifstream ifsm; 00171 00172 if( sArgs.input_arg ) { 00173 if( !PCL.Open( sArgs.input_arg, sArgs.skip_arg ) ) { 00174 cerr << "Could not open: " << sArgs.input_arg << endl; 00175 return 1; } } 00176 else if( !PCL.Open( cin, sArgs.skip_arg ) ) { 00177 cerr << "Could not open: stdin" << endl; 00178 return 1; } 00179 if( sArgs.known_motifs_arg ) { 00180 ifsm.open( sArgs.known_motifs_arg ); 00181 if( !Motifs.OpenKnown( ifsm ) ) { 00182 cerr << "Could not open: " << sArgs.known_motifs_arg << endl; 00183 return 1; } 00184 ifsm.close( ); } 00185 fFailed = false; 00186 strDir = sArgs.postprocess_arg; 00187 FOR_EACH_DIRECTORY_FILE(strDir, strFile) 00188 if( !CMeta::IsExtension( strFile, CPCL::GetExtension( ) ) ) 00189 continue; 00190 strBase = CMeta::Deextension( strFile ); 00191 strFile = strDir + '/' + strFile; 00192 00193 if( !fFailed ) 00194 vecClustersFrom.push_back( CCoalesceCluster( ) ); 00195 if( fFailed = ( ( iDatasets = vecClustersFrom.back( ).Open( strFile, sArgs.skip_arg, PCL, 00196 &Motifs ) ) == -1 ) ) { 00197 cerr << "Could not open: " << strFile << endl; 00198 continue; } 00199 if( !( vecstrClusters.size( ) % 50 ) ) 00200 cerr << "Opened cluster " << vecstrClusters.size( ) << "..." << endl; 00201 vecstrClusters.push_back( strBase ); } 00202 if( fFailed ) 00203 vecClustersFrom.pop_back( ); 00204 if( vecClustersFrom.empty( ) ) { 00205 char szBuffer[ 16 ]; 00206 00207 fFailed = false; 00208 ifsm.clear( ); 00209 ifsm.open( sArgs.postprocess_arg ); 00210 while( !ifsm.eof( ) ) { 00211 if( !fFailed ) 00212 vecClustersFrom.push_back( CCoalesceCluster( ) ); 00213 if( fFailed = ( ( iDatasets = vecClustersFrom.back( ).Open( ifsm, PCL, &Motifs ) ) == -1 ) ) 00214 continue; 00215 if( !( vecstrClusters.size( ) % 50 ) ) 00216 cerr << "Opened cluster " << vecstrClusters.size( ) << "..." << endl; 00217 sprintf_s( szBuffer, "c%06d", vecstrClusters.size( ) ); 00218 vecstrClusters.push_back( szBuffer ); } 00219 if( fFailed ) 00220 vecClustersFrom.pop_back( ); } 00221 00222 cerr << "Trimming clusters..." << endl; 00223 if( !trim( sArgs, PCL, vecClustersFrom ) ) 00224 return 1; 00225 00226 cerr << "Calculating cluster similarities..." << endl; 00227 MatSim.Initialize( vecClustersFrom.size( ) ); 00228 for( i = 0; i < MatSim.GetSize( ); ++i ) 00229 for( j = ( i + 1 ); j < MatSim.GetSize( ); ++j ) { 00230 if( sArgs.verbosity_arg >= 7 ) 00231 cerr << "Comparing clusters: " << i << '\t' << j << endl; 00232 MatSim.Set( i, j, vecClustersFrom[ i ].GetSimilarity( vecClustersFrom[ j ], PCL.GetGenes( ), 00233 iDatasets ) ); } 00234 00235 i = 0; 00236 return ( ( ( pHier = CClustHierarchical::Cluster( MatSim ) ) && 00237 recluster( sArgs, MatSim.GetSize( ) * ( MatSim.GetSize( ) - 1 ) / 2, Motifs, *pHier, vecClustersFrom, 00238 vecstrClusters, PCL, i ) ) ? 0 : 1 ); } 00239 00240 bool recluster( const gengetopt_args_info& sArgs, size_t iPairs, CCoalesceMotifLibrary& Motifs, 00241 const CHierarchy& Hier, const vector<CCoalesceCluster>& vecClustersFrom, 00242 const vector<string>& vecstrClustersFrom, const CPCL& PCL, size_t& iID ) { 00243 SMotifMatch::EType eMatchType; 00244 size_t i; 00245 00246 for( i = 0; c_aszMatchTypes[ i ]; ++i ) 00247 if( !strcmp( c_aszMatchTypes[ i ], sArgs.known_type_arg ) ) 00248 break; 00249 eMatchType = c_aeMatchTypes[ i ]; 00250 00251 if( Hier.IsGene( ) || ( Hier.GetSimilarity( ) >= sArgs.cutoff_postprocess_arg ) ) { 00252 CCoalesceCluster Cluster; 00253 00254 cerr << "Creating output cluster " << iID << endl; 00255 if( !Cluster.Open( Hier, vecClustersFrom, vecstrClustersFrom, 00256 (float)sArgs.fraction_postprocess_arg, (float)sArgs.cutoff_merge_arg, sArgs.max_motifs_arg, 00257 &Motifs ) ) 00258 return false; 00259 if( Cluster.GetGenes( ).size( ) < (size_t)sArgs.size_minimum_arg ) { 00260 cerr << "Cluster too small: " << Cluster.GetGenes( ).size( ) << endl; 00261 return true; } 00262 00263 if( !Cluster.LabelMotifs( Motifs, eMatchType, (float)sArgs.penalty_gap_arg, 00264 (float)sArgs.penalty_mismatch_arg, (float)sArgs.known_cutoff_arg ) ) 00265 return false; 00266 if( sArgs.output_arg ) 00267 Cluster.Save( sArgs.output_arg, iID, PCL, &Motifs ); 00268 Cluster.Save( cout, iID, PCL, &Motifs, (float)sArgs.min_info_arg, (float)sArgs.penalty_gap_arg, 00269 (float)sArgs.penalty_mismatch_arg, !!sArgs.remove_rcs_flag ); 00270 iID++; 00271 return true; } 00272 00273 return ( recluster( sArgs, iPairs, Motifs, Hier.Get( false ), vecClustersFrom, vecstrClustersFrom, PCL, 00274 iID ) && recluster( sArgs, iPairs, Motifs, Hier.Get( true ), vecClustersFrom, vecstrClustersFrom, 00275 PCL, iID ) ); } 00276 00277 bool trim( const gengetopt_args_info& sArgs, const CPCL& PCL, vector<CCoalesceCluster>& vecClusters ) { 00278 vector<size_t> veciCounts, veciGenes, veciRemove; 00279 CHalfMatrix<size_t> MatCounts; 00280 CDistanceMatrix MatScores; 00281 size_t i, j, iOne, iCluster; 00282 float dAve, dStd, d, dCutoff; 00283 vector<float> vecdScores; 00284 00285 veciCounts.resize( PCL.GetGenes( ) ); 00286 MatCounts.Initialize( PCL.GetGenes( ) ); 00287 MatCounts.Clear( ); 00288 for( iCluster = 0; iCluster < vecClusters.size( ); ++iCluster ) { 00289 const CCoalesceCluster& Cluster = vecClusters[ iCluster ]; 00290 00291 veciGenes.resize( Cluster.GetGenes( ).size( ) ); 00292 copy( Cluster.GetGenes( ).begin( ), Cluster.GetGenes( ).end( ), veciGenes.begin( ) ); 00293 for( i = 0; i < veciGenes.size( ); ++i ) { 00294 veciCounts[ iOne = veciGenes[ i ] ]++; 00295 for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) 00296 MatCounts.Get( iOne, veciGenes[ j ] )++; } } 00297 MatScores.Initialize( MatCounts.GetSize( ) ); 00298 for( i = 0; i < MatScores.GetSize( ); ++i ) 00299 for( j = ( i + 1 ); j < MatScores.GetSize( ); ++j ) { 00300 iOne = MatCounts.Get( i, j ); 00301 iCluster = veciCounts[ i ] + veciCounts[ j ] - iOne; 00302 MatScores.Set( i, j, iCluster ? ( (float)iOne / iCluster ) : CMeta::GetNaN( ) ); } 00303 00304 dAve = dStd = 0; 00305 for( iOne = i = 0; i < MatScores.GetSize( ); ++i ) 00306 for( j = ( i + 1 ); j < MatScores.GetSize( ); ++j ) 00307 if( !CMeta::IsNaN( d = MatScores.Get( i, j ) ) ) { 00308 iOne++; 00309 dAve += d; 00310 dStd += d * d; } 00311 dAve /= iOne; 00312 dStd = sqrt( ( dStd / iOne ) - ( dAve * dAve ) ); 00313 dCutoff = dAve + ( (float)sArgs.cutoff_trim_arg * dStd ); 00314 if( sArgs.verbosity_arg >= 6 ) 00315 cerr << "Cutoff " << dCutoff << " (" << dAve << ',' << dStd << ')' << endl; 00316 00317 for( iCluster = 0; iCluster < vecClusters.size( ); ++iCluster ) { 00318 CCoalesceCluster& Cluster = vecClusters[ iCluster ]; 00319 00320 if( sArgs.verbosity_arg >= 6 ) 00321 cerr << "Trimming cluster " << iCluster << endl; 00322 veciGenes.resize( Cluster.GetGenes( ).size( ) ); 00323 copy( Cluster.GetGenes( ).begin( ), Cluster.GetGenes( ).end( ), veciGenes.begin( ) ); 00324 vecdScores.resize( veciGenes.size( ) ); 00325 for( i = 0; i < vecdScores.size( ); ++i ) { 00326 iOne = veciGenes[ i ]; 00327 for( j = ( i + 1 ); j < vecdScores.size( ); ++j ) { 00328 d = MatScores.Get( iOne, veciGenes[ j ] ); 00329 vecdScores[ i ] += d; 00330 vecdScores[ j ] += d; } } 00331 for( i = 0; i < vecdScores.size( ); ++i ) 00332 vecdScores[ i ] /= veciGenes.size( ) - 1; 00333 if( sArgs.verbosity_arg >= 7 ) 00334 for( i = 0; i < vecdScores.size( ); ++i ) 00335 cerr << "Gene " << i << " (" << PCL.GetGene( veciGenes[ i ] ) << ") " << vecdScores[ i ] << 00336 endl; 00337 00338 veciRemove.clear( ); 00339 for( i = 0; i < vecdScores.size( ); ++i ) 00340 if( vecdScores[ i ] < dCutoff ) { 00341 if( sArgs.verbosity_arg >= 6 ) 00342 cerr << "Removing " << PCL.GetGene( veciGenes[ i ] ) << " at " << vecdScores[ i ] << endl; 00343 veciRemove.push_back( veciGenes[ i ] ); } 00344 Cluster.RemoveGenes( veciRemove ); } 00345 00346 return true; }