Sleipnir
tools/COALESCE/COALESCE.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 enum EFile {
00026     EFilePCL,
00027     EFileWIG,
00028     EFileError
00029 };
00030 
00031 const char*                 c_aszMatchTypes[]   = {"pvalue", "rmse", "js", NULL};
00032 const SMotifMatch::EType    c_aeMatchTypes[]    =
00033     {SMotifMatch::ETypePValue, SMotifMatch::ETypeRMSE, SMotifMatch::ETypeJensenShannon, SMotifMatch::ETypePValue};
00034 
00035 int main_postprocess( const gengetopt_args_info&, CCoalesceMotifLibrary& );
00036 bool recluster( const gengetopt_args_info&, size_t, CCoalesceMotifLibrary&, const CHierarchy&,
00037     const vector<CCoalesceCluster>&, const vector<string>&, const CPCL&, size_t& );
00038 bool trim( const gengetopt_args_info&, const CPCL&, vector<CCoalesceCluster>& );
00039 
00040 EFile open_pclwig( const char* szFile, size_t iSkip, CFASTA& FASTA, CPCL& PCL ) {
00041 
00042     if( FASTA.Open( szFile ) )
00043         return EFilePCL;
00044     if( PCL.Open( szFile, iSkip ) )
00045         return EFileWIG;
00046 
00047     cerr << "Could not open: " << szFile << endl;
00048     return EFileError; }
00049 
00050 int main( int iArgs, char** aszArgs ) {
00051     gengetopt_args_info         sArgs;
00052     CFASTA                      FASTA;
00053     CPCL                        PCL, PCLSeed;
00054     CCoalesce                   Coalesce;
00055     vector<CCoalesceCluster>    vecClusters;
00056     size_t                      i, j;
00057     set<string>                 setstrTypes;
00058     vector<CFASTA*>             vecpFASTAs;
00059 
00060 #ifdef WIN32
00061     pthread_win32_process_attach_np( );
00062 #endif // WIN32
00063 
00064     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00065         cmdline_parser_print_help( );
00066         return 1; }
00067     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00068 
00069     CCoalesceMotifLibrary       Motifs( sArgs.k_arg );
00070     Motifs.SetPenaltyGap( (float)sArgs.penalty_gap_arg );
00071     Motifs.SetPenaltyMismatch( (float)sArgs.penalty_mismatch_arg );
00072 
00073     if( sArgs.postprocess_arg )
00074         return main_postprocess( sArgs, Motifs );
00075 
00076     if( sArgs.sequences_arg ) {
00077         vector<string>  vecstrTypes;
00078 
00079         CMeta::Tokenize( sArgs.sequences_arg, vecstrTypes, "," );
00080         for( i = 0; i < vecstrTypes.size( ); ++i )
00081             setstrTypes.insert( vecstrTypes[ i ] ); }
00082 
00083     if( !PCL.Open( sArgs.input_arg, sArgs.skip_arg ) ) {
00084         cerr << "Could not open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) << endl;
00085         return 1; }
00086     if( sArgs.fasta_arg && !FASTA.Open( sArgs.fasta_arg, setstrTypes ) ) {
00087         cerr << "Could not open: " << sArgs.fasta_arg << endl;
00088         return 1; }
00089 
00090     if( sArgs.seed_arg && !PCLSeed.Open( sArgs.seed_arg, sArgs.skip_arg ) ) {
00091         cerr << "Could not open: " << sArgs.seed_arg << endl;
00092         return 1; }
00093     if( sArgs.datasets_arg ) {
00094         static const size_t c_iBuffer   = 131072;
00095         ifstream            ifsm;
00096         char                acBuffer[ c_iBuffer ];
00097 
00098         ifsm.open( sArgs.datasets_arg );
00099         if( !ifsm.is_open( ) ) {
00100             cerr << "Could not open: " << sArgs.datasets_arg << endl;
00101             return 1; }
00102         while( !ifsm.eof( ) ) {
00103             vector<string>  vecstrLine;
00104             set<size_t>     setiConditions;
00105 
00106             ifsm.getline( acBuffer, c_iBuffer - 1 );
00107             acBuffer[ c_iBuffer - 1 ] = 0;
00108             CMeta::Tokenize( CMeta::Trim( acBuffer ).c_str( ), vecstrLine );
00109             for( i = 0; i < vecstrLine.size( ); ++i )
00110                 for( j = 0; j < PCL.GetExperiments( ); ++j )
00111                     if( vecstrLine[ i ] == PCL.GetExperiment( j ) ) {
00112                         setiConditions.insert( j );
00113                         break; }
00114             Coalesce.AddDataset( setiConditions ); } }
00115 
00116     Coalesce.SetMotifs( Motifs );
00117     Coalesce.SetProbabilityGene( (float)sArgs.prob_gene_arg );
00118     Coalesce.SetPValueCondition( (float)sArgs.pvalue_cond_arg );
00119     Coalesce.SetZScoreCondition( (float)sArgs.zscore_cond_arg );
00120     Coalesce.SetPValueMotif( (float)sArgs.pvalue_motif_arg );
00121     Coalesce.SetZScoreMotif( (float)sArgs.zscore_motif_arg );
00122     Coalesce.SetPValueCorrelation( (float)sArgs.pvalue_correl_arg );
00123     Coalesce.SetNumberCorrelation( sArgs.number_correl_arg );
00124     Coalesce.SetPValueMerge( (float)sArgs.pvalue_merge_arg );
00125     Coalesce.SetCutoffMerge( (float)sArgs.cutoff_merge_arg );
00126     Coalesce.SetBasesPerMatch( sArgs.bases_arg );
00127     Coalesce.SetSizeMinimum( sArgs.size_minimum_arg );
00128     Coalesce.SetSizeMerge( sArgs.size_merge_arg );
00129     Coalesce.SetSizeMaximum( sArgs.size_maximum_arg );
00130     Coalesce.SetNormalize( !!sArgs.normalize_flag );
00131     Coalesce.SetThreads( sArgs.threads_arg );
00132     Coalesce.SetSeed( PCLSeed );
00133     if( sArgs.output_arg )
00134         Coalesce.SetDirectoryIntermediate( sArgs.output_arg );
00135 
00136     vecpFASTAs.resize( sArgs.inputs_num );
00137     for( i = 0; i < vecpFASTAs.size( ); ++i ) {
00138         vecpFASTAs[ i ] = new CFASTA( );
00139         if( !vecpFASTAs[ i ]->Open( sArgs.inputs[ i ] ) ) {
00140             cerr << "Could not open: " << sArgs.inputs[ i ] << endl;
00141             return 1; }
00142         Coalesce.AddWiggle( *vecpFASTAs[ i ] ); }
00143 
00144     if( sArgs.progressive_flag )
00145         Coalesce.AddOutputIntermediate( cout );
00146     if( !Coalesce.Cluster( PCL, FASTA, vecClusters ) ) {
00147         cerr << "Clustering failed" << endl;
00148         return 1; }
00149 
00150     for( i = 0; i < vecpFASTAs.size( ); ++i )
00151         delete vecpFASTAs[ i ];
00152     if( !sArgs.progressive_flag )
00153         for( i = 0; i < vecClusters.size( ); ++i )
00154             vecClusters[ i ].Save( cout, i, PCL, &Motifs );
00155 
00156 #ifdef WIN32
00157     pthread_win32_process_detach_np( );
00158 #endif // WIN32
00159     return 0; }
00160 
00161 int main_postprocess( const gengetopt_args_info& sArgs, CCoalesceMotifLibrary& Motifs ) {
00162     string                      strDir, strFile, strBase;
00163     vector<CCoalesceCluster>    vecClustersFrom, vecClustersTo;
00164     bool                        fFailed;
00165     CDistanceMatrix             MatSim;
00166     size_t                      i, j, iDatasets;
00167     CHierarchy*                 pHier;
00168     vector<string>              vecstrClusters;
00169     CPCL                        PCL;
00170     ifstream                    ifsm;
00171 
00172     if( sArgs.input_arg ) {
00173         if( !PCL.Open( sArgs.input_arg, sArgs.skip_arg ) ) {
00174             cerr << "Could not open: " << sArgs.input_arg << endl;
00175             return 1; } }
00176     else if( !PCL.Open( cin, sArgs.skip_arg ) ) {
00177         cerr << "Could not open: stdin" << endl;
00178         return 1; }
00179     if( sArgs.known_motifs_arg ) {
00180         ifsm.open( sArgs.known_motifs_arg );
00181         if( !Motifs.OpenKnown( ifsm ) ) {
00182             cerr << "Could not open: " << sArgs.known_motifs_arg << endl;
00183             return 1; }
00184         ifsm.close( ); }
00185     fFailed = false;
00186     strDir = sArgs.postprocess_arg;
00187     FOR_EACH_DIRECTORY_FILE(strDir, strFile)
00188         if( !CMeta::IsExtension( strFile, CPCL::GetExtension( ) ) )
00189             continue;
00190         strBase = CMeta::Deextension( strFile );
00191         strFile = strDir + '/' + strFile;
00192 
00193         if( !fFailed )
00194             vecClustersFrom.push_back( CCoalesceCluster( ) );
00195         if( fFailed = ( ( iDatasets = vecClustersFrom.back( ).Open( strFile, sArgs.skip_arg, PCL,
00196             &Motifs ) ) == -1 ) ) {
00197             cerr << "Could not open: " << strFile << endl;
00198             continue; }
00199         if( !( vecstrClusters.size( ) % 50 ) )
00200             cerr << "Opened cluster " << vecstrClusters.size( ) << "..." << endl;
00201         vecstrClusters.push_back( strBase ); }
00202     if( fFailed )
00203         vecClustersFrom.pop_back( );
00204     if( vecClustersFrom.empty( ) ) {
00205         char    szBuffer[ 16 ];
00206 
00207         fFailed = false;
00208         ifsm.clear( );
00209         ifsm.open( sArgs.postprocess_arg );
00210         while( !ifsm.eof( ) ) {
00211             if( !fFailed )
00212                 vecClustersFrom.push_back( CCoalesceCluster( ) );
00213             if( fFailed = ( ( iDatasets = vecClustersFrom.back( ).Open( ifsm, PCL, &Motifs ) ) == -1 ) )
00214                 continue;
00215             if( !( vecstrClusters.size( ) % 50 ) )
00216                 cerr << "Opened cluster " << vecstrClusters.size( ) << "..." << endl;
00217             sprintf_s( szBuffer, "c%06d", vecstrClusters.size( ) );
00218             vecstrClusters.push_back( szBuffer ); }
00219         if( fFailed )
00220             vecClustersFrom.pop_back( ); }
00221 
00222     cerr << "Trimming clusters..." << endl;
00223     if( !trim( sArgs, PCL, vecClustersFrom ) )
00224         return 1;
00225 
00226     cerr << "Calculating cluster similarities..." << endl;
00227     MatSim.Initialize( vecClustersFrom.size( ) );
00228     for( i = 0; i < MatSim.GetSize( ); ++i )
00229         for( j = ( i + 1 ); j < MatSim.GetSize( ); ++j ) {
00230             if( sArgs.verbosity_arg >= 7 )
00231                 cerr << "Comparing clusters:    " << i << '\t' << j << endl;
00232             MatSim.Set( i, j, vecClustersFrom[ i ].GetSimilarity( vecClustersFrom[ j ], PCL.GetGenes( ),
00233                 iDatasets ) ); }
00234 
00235     i = 0;
00236     return ( ( ( pHier = CClustHierarchical::Cluster( MatSim ) ) &&
00237         recluster( sArgs, MatSim.GetSize( ) * ( MatSim.GetSize( ) - 1 ) / 2, Motifs, *pHier, vecClustersFrom,
00238         vecstrClusters, PCL, i ) ) ? 0 : 1 ); }
00239 
00240 bool recluster( const gengetopt_args_info& sArgs, size_t iPairs, CCoalesceMotifLibrary& Motifs,
00241     const CHierarchy& Hier, const vector<CCoalesceCluster>& vecClustersFrom,
00242     const vector<string>& vecstrClustersFrom, const CPCL& PCL, size_t& iID ) {
00243     SMotifMatch::EType  eMatchType;
00244     size_t              i;
00245 
00246     for( i = 0; c_aszMatchTypes[ i ]; ++i )
00247         if( !strcmp( c_aszMatchTypes[ i ], sArgs.known_type_arg ) )
00248             break;
00249     eMatchType = c_aeMatchTypes[ i ];
00250 
00251     if( Hier.IsGene( ) || ( Hier.GetSimilarity( ) >= sArgs.cutoff_postprocess_arg ) ) {
00252         CCoalesceCluster    Cluster;
00253 
00254         cerr << "Creating output cluster " << iID << endl;
00255         if( !Cluster.Open( Hier, vecClustersFrom, vecstrClustersFrom,
00256             (float)sArgs.fraction_postprocess_arg, (float)sArgs.cutoff_merge_arg, sArgs.max_motifs_arg,
00257             &Motifs ) )
00258             return false;
00259         if( Cluster.GetGenes( ).size( ) < (size_t)sArgs.size_minimum_arg ) {
00260             cerr << "Cluster too small: " << Cluster.GetGenes( ).size( ) << endl;
00261             return true; }
00262 
00263         if( !Cluster.LabelMotifs( Motifs, eMatchType, (float)sArgs.penalty_gap_arg,
00264             (float)sArgs.penalty_mismatch_arg, (float)sArgs.known_cutoff_arg ) )
00265             return false;
00266         if( sArgs.output_arg )
00267             Cluster.Save( sArgs.output_arg, iID, PCL, &Motifs );
00268         Cluster.Save( cout, iID, PCL, &Motifs, (float)sArgs.min_info_arg, (float)sArgs.penalty_gap_arg,
00269             (float)sArgs.penalty_mismatch_arg, !!sArgs.remove_rcs_flag );
00270         iID++;
00271         return true; }
00272 
00273     return ( recluster( sArgs, iPairs, Motifs, Hier.Get( false ), vecClustersFrom, vecstrClustersFrom, PCL,
00274         iID ) && recluster( sArgs, iPairs, Motifs, Hier.Get( true ), vecClustersFrom, vecstrClustersFrom,
00275         PCL, iID ) ); }
00276 
00277 bool trim( const gengetopt_args_info& sArgs, const CPCL& PCL, vector<CCoalesceCluster>& vecClusters ) {
00278     vector<size_t>              veciCounts, veciGenes, veciRemove;
00279     CHalfMatrix<size_t>         MatCounts;
00280     CDistanceMatrix             MatScores;
00281     size_t                      i, j, iOne, iCluster;
00282     float                       dAve, dStd, d, dCutoff;
00283     vector<float>               vecdScores;
00284 
00285     veciCounts.resize( PCL.GetGenes( ) );
00286     MatCounts.Initialize( PCL.GetGenes( ) );
00287     MatCounts.Clear( );
00288     for( iCluster = 0; iCluster < vecClusters.size( ); ++iCluster ) {
00289         const CCoalesceCluster& Cluster = vecClusters[ iCluster ];
00290 
00291         veciGenes.resize( Cluster.GetGenes( ).size( ) );
00292         copy( Cluster.GetGenes( ).begin( ), Cluster.GetGenes( ).end( ), veciGenes.begin( ) );
00293         for( i = 0; i < veciGenes.size( ); ++i ) {
00294             veciCounts[ iOne = veciGenes[ i ] ]++;
00295             for( j = ( i + 1 ); j < veciGenes.size( ); ++j )
00296                 MatCounts.Get( iOne, veciGenes[ j ] )++; } }
00297     MatScores.Initialize( MatCounts.GetSize( ) );
00298     for( i = 0; i < MatScores.GetSize( ); ++i )
00299         for( j = ( i + 1 ); j < MatScores.GetSize( ); ++j ) {
00300             iOne = MatCounts.Get( i, j );
00301             iCluster = veciCounts[ i ] + veciCounts[ j ] - iOne;
00302             MatScores.Set( i, j, iCluster ? ( (float)iOne / iCluster ) : CMeta::GetNaN( ) ); }
00303 
00304     dAve = dStd = 0;
00305     for( iOne = i = 0; i < MatScores.GetSize( ); ++i )
00306         for( j = ( i + 1 ); j < MatScores.GetSize( ); ++j )
00307             if( !CMeta::IsNaN( d = MatScores.Get( i, j ) ) ) {
00308                 iOne++;
00309                 dAve += d;
00310                 dStd += d * d; }
00311     dAve /= iOne;
00312     dStd = sqrt( ( dStd / iOne ) - ( dAve * dAve ) );
00313     dCutoff = dAve + ( (float)sArgs.cutoff_trim_arg * dStd );
00314     if( sArgs.verbosity_arg >= 6 )
00315         cerr << "Cutoff " << dCutoff << " (" << dAve << ',' << dStd << ')' << endl;
00316 
00317     for( iCluster = 0; iCluster < vecClusters.size( ); ++iCluster ) {
00318         CCoalesceCluster&   Cluster = vecClusters[ iCluster ];
00319 
00320         if( sArgs.verbosity_arg >= 6 )
00321             cerr << "Trimming cluster " << iCluster << endl;
00322         veciGenes.resize( Cluster.GetGenes( ).size( ) );
00323         copy( Cluster.GetGenes( ).begin( ), Cluster.GetGenes( ).end( ), veciGenes.begin( ) );
00324         vecdScores.resize( veciGenes.size( ) );
00325         for( i = 0; i < vecdScores.size( ); ++i ) {
00326             iOne = veciGenes[ i ];
00327             for( j = ( i + 1 ); j < vecdScores.size( ); ++j ) {
00328                 d = MatScores.Get( iOne, veciGenes[ j ] );
00329                 vecdScores[ i ] += d;
00330                 vecdScores[ j ] += d; } }
00331         for( i = 0; i < vecdScores.size( ); ++i )
00332             vecdScores[ i ] /= veciGenes.size( ) - 1;
00333         if( sArgs.verbosity_arg >= 7 )
00334             for( i = 0; i < vecdScores.size( ); ++i )
00335                 cerr << "Gene " << i << " (" << PCL.GetGene( veciGenes[ i ] ) << ") " << vecdScores[ i ] <<
00336                     endl;
00337 
00338         veciRemove.clear( );
00339         for( i = 0; i < vecdScores.size( ); ++i )
00340             if( vecdScores[ i ] < dCutoff ) {
00341                 if( sArgs.verbosity_arg >= 6 )
00342                     cerr << "Removing " << PCL.GetGene( veciGenes[ i ] ) << " at " << vecdScores[ i ] << endl;
00343                 veciRemove.push_back( veciGenes[ i ] ); }
00344         Cluster.RemoveGenes( veciRemove ); }
00345 
00346     return true; }