Sleipnir
tools/Clusters2Dab/Clusters2Dab.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const char   c_szBick[]  = "[Bick]";
00026 static const char   c_szBicd[]  = "[Bicd]";
00027 static const char   c_szSamba[] = "samba";
00028 static const char   c_szFuzzy[] = "fuzzy";
00029 static const char   c_szList[]  = "list";
00030 static const char   c_szParam[] = "param";
00031 
00032 int open_list( istream&, CGenome&, vector<float>&, vector<CGenes*>& );
00033 int open_samba( istream&, CGenome&, vector<float>&, vector<CGenes*>& );
00034 int open_param( istream&, CGenome&, vector<float>&, vector<CGenes*>& );
00035 int open_fuzzy( istream&, size_t, CDat& );
00036 
00037 int main( int iArgs, char** aszArgs ) {
00038     CDat                                Dat;
00039     ifstream                            ifsm;
00040     istream*                            pistm;
00041     gengetopt_args_info                 sArgs;
00042     CGenome                             Genome;
00043     vector<float>                       vecdWeights;
00044     vector<CGenes*>                     vecpClusters;
00045     vector<string>                      vecstrGenes;
00046     set<string>                         setstrGenes;
00047     set<string>::const_iterator         iterGene;
00048     vector<vector<size_t> >             vecveciGenes;
00049     int                                 iRet;
00050     size_t                              i, j, iCluster, iGeneOne, iGeneTwo, iOne, iTwo;
00051     CGenes*                             pClusterOne;
00052     float                               d, dOne;
00053     map<string, size_t>                 mapstriGenes;
00054     map<string, size_t>::const_iterator iterIndex;
00055 
00056     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00057         cmdline_parser_print_help( );
00058         return 1; }
00059     CMeta Meta( sArgs.verbosity_arg );
00060 
00061     if( sArgs.input_arg ) {
00062         ifsm.open( sArgs.input_arg );
00063         pistm = &ifsm; }
00064     else
00065         pistm = &cin;
00066     iRet = 1;
00067     if( !strcmp( c_szSamba, sArgs.type_arg ) )
00068         iRet = open_samba( *pistm, Genome, vecdWeights, vecpClusters );
00069     else if( !strcmp( c_szList, sArgs.type_arg ) )
00070         iRet = open_list( *pistm, Genome, vecdWeights, vecpClusters );
00071     else if( !strcmp( c_szParam, sArgs.type_arg ) )
00072         iRet = open_param( *pistm, Genome, vecdWeights, vecpClusters );
00073     else if( !strcmp( c_szFuzzy, sArgs.type_arg ) )
00074         iRet = open_fuzzy( *pistm, sArgs.skip_arg, Dat );
00075     else
00076         cmdline_parser_print_help( );
00077     if( iRet )
00078         return iRet;
00079 
00080     if( !vecpClusters.size( ) ) {
00081         cerr << "No clusters found!" << endl;
00082         return 1; }
00083 
00084     for( i = 0; i < vecpClusters.size( ); ++i )
00085         for( j = 0; j < vecpClusters[ i ]->GetGenes( ); ++j )
00086             setstrGenes.insert( vecpClusters[ i ]->GetGene( j ).GetName( ) );
00087     vecstrGenes.resize( setstrGenes.size( ) );
00088     for( i = 0,iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++i,++iterGene )
00089         vecstrGenes[ i ] = *iterGene;
00090 
00091     Dat.Open( vecstrGenes );
00092     for( i = 0; i < Dat.GetGenes( ); ++i )
00093         mapstriGenes[ Dat.GetGene( i ) ] = i;
00094     vecveciGenes.resize( vecpClusters.size( ) );
00095     for( i = 0; i < vecveciGenes.size( ); ++i ) {
00096         vecveciGenes[ i ].resize( vecpClusters[ i ]->GetGenes( ) );
00097         for( j = 0; j < vecveciGenes[ i ].size( ); ++j )
00098             vecveciGenes[ i ][ j ] = ( ( iterIndex = mapstriGenes.find(
00099                 vecpClusters[ i ]->GetGene( j ).GetName( ) ) ) == mapstriGenes.end( ) ) ? -1 :
00100                 iterIndex->second; }
00101 
00102     if( sArgs.size_flag )
00103         for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster )
00104             vecdWeights[ iCluster ] = 1.0f / vecpClusters[ iCluster ]->GetGenes( );
00105     if( sArgs.counts_flag )
00106         for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster ) {
00107             const vector<size_t>&   veciOne = vecveciGenes[ iCluster ];
00108 
00109             cerr << "Processing cluster " << iCluster << "/" << vecpClusters.size( ) << endl;
00110             pClusterOne = vecpClusters[ iCluster ];
00111             dOne = vecdWeights[ iCluster ];
00112             for( iGeneOne = 0; iGeneOne < veciOne.size( ); ++iGeneOne ) {
00113                 iOne = veciOne[ iGeneOne ];
00114                 for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < veciOne.size( ); ++iGeneTwo ) {
00115                     float   d;
00116 
00117                     iTwo = veciOne[ iGeneTwo ];
00118                     d = Dat.Get( iOne, iTwo );
00119                     Dat.Set( iOne, iTwo, ( CMeta::IsNaN( d ) ? 0 : d ) + dOne ); } } }
00120     else {
00121         vector<bool>    vecfClustered;
00122 
00123         vecfClustered.resize( Dat.GetGenes( ) );
00124         for( iGeneOne = 0; iGeneOne < vecfClustered.size( ); ++iGeneOne ) {
00125             const string&   strGene = Dat.GetGene( iGeneOne );
00126 
00127             for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster )
00128                 if( vecpClusters[ iCluster ]->IsGene( strGene ) ) {
00129                     vecfClustered[ iGeneOne ] = true;
00130                     break; } }
00131         for( iGeneOne = 0; iGeneOne < Dat.GetGenes( ); ++iGeneOne ) {
00132             if( !vecfClustered[ iGeneOne ] )
00133                 continue;
00134             for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < Dat.GetGenes( ); ++iGeneTwo )
00135                 if( vecfClustered[ iGeneTwo ] )
00136                     Dat.Set( iGeneOne, iGeneTwo, 0 ); }
00137         for( iCluster = 0; iCluster < vecpClusters.size( ); ++iCluster ) {
00138             const vector<size_t>&   veciOne = vecveciGenes[ iCluster ];
00139 
00140             cerr << "Processing cluster " << iCluster << "/" << vecpClusters.size( ) << endl;
00141             pClusterOne = vecpClusters[ iCluster ];
00142             dOne = vecdWeights[ iCluster ];
00143             for( iGeneOne = 0; iGeneOne < veciOne.size( ); ++iGeneOne ) {
00144                 iOne = veciOne[ iGeneOne ];
00145                 for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < veciOne.size( ); ++iGeneTwo ) {
00146                     iTwo = veciOne[ iGeneTwo ];
00147                     if( CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) || ( dOne > d ) )
00148                         Dat.Set( iOne, iTwo, dOne ); } } } }
00149 
00150     Dat.Save( sArgs.output_arg );
00151     for( i = 0; i < vecpClusters.size( ); ++i )
00152         delete vecpClusters[ i ];
00153 
00154     return 0; }
00155 
00156 struct SSorter {
00157     const vector<float>&    m_vecdWeights;
00158 
00159     SSorter( const vector<float>& vecdWeights ) : m_vecdWeights(vecdWeights) { }
00160 
00161     bool operator()( size_t iOne, size_t iTwo ) {
00162 
00163         return ( m_vecdWeights[ iOne ] < m_vecdWeights[ iTwo ] ); }
00164 };
00165 
00166 int open_samba( istream& istm, CGenome& Genome, vector<float>& vecdWeights, vector<CGenes*>& vecpClusters ) {
00167     static const size_t c_iLine = 1024;
00168     char                szLine[ c_iLine ];
00169     vector<string>      vecstrLine, vecstrGenes;
00170     CGenes*             pCluster;
00171     size_t              iCluster, iCur, i;
00172     vector<size_t>      veciIndices;
00173     vector<float>       vecdCopy;
00174     vector<CGenes*>     vecpCopy;
00175 
00176     while( !istm.eof( ) ) {
00177         istm.getline( szLine, c_iLine - 1 );
00178         if( !strcmp( szLine, c_szBick ) )
00179             continue;
00180         if( !strcmp( szLine, c_szBicd ) )
00181             break;
00182         vecstrLine.clear( );
00183         CMeta::Tokenize( szLine, vecstrLine );
00184         if( vecstrLine.size( ) != 2 ) {
00185             cerr << "Illegal line: " << szLine;
00186             return 1; }
00187         vecdCopy.push_back( (float)atof( vecstrLine[ 1 ].c_str( ) ) ); }
00188 
00189     iCluster = -1;
00190     while( !istm.eof( ) ) {
00191         istm.getline( szLine, c_iLine - 1 );
00192         if( !szLine[ 0 ] )
00193             break;
00194         vecstrLine.clear( );
00195         CMeta::Tokenize( szLine, vecstrLine );
00196         if( vecstrLine.size( ) != 3 ) {
00197             cerr << "Illegal line: " << szLine;
00198             return 1; }
00199         if( atoi( vecstrLine[ 1 ].c_str( ) ) != 1 )
00200             continue;
00201         if( ( iCur = atoi( vecstrLine[ 0 ].c_str( ) ) ) != iCluster ) {
00202             if( vecstrGenes.size( ) ) {
00203                 vecpCopy.push_back( pCluster = new CGenes( Genome ) );
00204                 pCluster->Open( vecstrGenes ); }
00205             vecstrGenes.clear( );
00206             iCluster = iCur; }
00207         vecstrGenes.push_back( vecstrLine[ 2 ] ); }
00208     if( vecstrGenes.size( ) ) {
00209         vecpCopy.push_back( pCluster = new CGenes( Genome ) );
00210         pCluster->Open( vecstrGenes ); }
00211 
00212     veciIndices.resize( vecdCopy.size( ) );
00213     for( i = 0; i < veciIndices.size( ); ++i )
00214         veciIndices[ i ] = i;
00215     sort( veciIndices.begin( ), veciIndices.end( ), SSorter( vecdCopy ) );
00216     for( i = 0; i < veciIndices.size( ); ++i ) {
00217         vecdWeights.push_back( vecdCopy[ veciIndices[ i ] ] );
00218         vecpClusters.push_back( vecpCopy[ veciIndices[ i ] ] ); }
00219 
00220     return 0; }
00221 
00222 int open_list( istream& istm, CGenome& Genome, vector<float>& vecdWeights, vector<CGenes*>& vecpClusters ) {
00223     static const size_t c_iLine = 1024;
00224     char            szLine[ c_iLine ];
00225     vector<string>  vecstrLine, vecstrGenes;
00226     CGenes*         pCluster;
00227     size_t          iCluster, iCur;
00228 
00229     iCluster = -1;
00230     while( !istm.eof( ) ) {
00231         istm.getline( szLine, c_iLine - 1 );
00232         if( !szLine[ 0 ] )
00233             break;
00234         vecstrLine.clear( );
00235         CMeta::Tokenize( szLine, vecstrLine );
00236         if( vecstrLine.size( ) != 2 ) {
00237             cerr << "Illegal line: " << szLine;
00238             return 1; }
00239         if( ( iCur = atoi( vecstrLine[ 1 ].c_str( ) ) ) != iCluster ) {
00240             if( vecstrGenes.size( ) ) {
00241                 vecdWeights.push_back( 1 );
00242                 vecpClusters.push_back( pCluster = new CGenes( Genome ) );
00243                 pCluster->Open( vecstrGenes ); }
00244             vecstrGenes.clear( );
00245             iCluster = iCur; }
00246         vecstrGenes.push_back( vecstrLine[ 0 ] ); }
00247     if( vecstrGenes.size( ) ) {
00248         vecdWeights.push_back( 1 );
00249         vecpClusters.push_back( pCluster = new CGenes( Genome ) );
00250         pCluster->Open( vecstrGenes ); }
00251 
00252     return 0; }
00253 
00254 int open_param( istream& istm, CGenome& Genome, vector<float>& vecdWeights, vector<CGenes*>& vecpClusters ) {
00255     static const size_t c_iLine = 1024;
00256     char            szLine[ c_iLine ];
00257     vector<string>  vecstrLine, vecstrGenes;
00258     CGenes*         pCluster;
00259     size_t          iCluster, iCur;
00260     float           dParam;
00261 
00262     while( !istm.eof( ) ) {
00263         istm.getline( szLine, c_iLine - 1 );
00264         if( !szLine[ 0 ] )
00265             break;
00266         vecstrLine.clear( );
00267         CMeta::Tokenize( szLine, vecstrLine );
00268         if( vecstrLine.size( ) != 2 ) {
00269             cerr << "Illegal line: " << szLine;
00270             return 1; }
00271         if( vecstrLine[ 0 ] == c_szParam ) {
00272             if( vecstrGenes.size( ) ) {
00273                 vecdWeights.push_back( dParam );
00274                 vecpClusters.push_back( pCluster = new CGenes( Genome ) );
00275                 pCluster->Open( vecstrGenes ); }
00276             iCluster = -1;
00277             vecstrGenes.clear( );
00278             dParam = (float)atof( vecstrLine[ 1 ].c_str( ) );
00279             continue; }
00280         if( ( iCur = atoi( vecstrLine[ 1 ].c_str( ) ) ) != iCluster ) {
00281             if( vecstrGenes.size( ) ) {
00282                 vecdWeights.push_back( dParam );
00283                 vecpClusters.push_back( pCluster = new CGenes( Genome ) );
00284                 pCluster->Open( vecstrGenes ); }
00285             iCluster = iCur;
00286             vecstrGenes.clear( ); }
00287         vecstrGenes.push_back( vecstrLine[ 0 ] ); }
00288     if( vecstrGenes.size( ) ) {
00289         vecdWeights.push_back( dParam );
00290         vecpClusters.push_back( pCluster = new CGenes( Genome ) );
00291         pCluster->Open( vecstrGenes ); }
00292 
00293     return 0; }
00294 
00295 int open_fuzzy( istream& istm, size_t iSkip, CDat& Dat ) {
00296     CPCL    PCL;
00297     size_t  i, j, k;
00298     float   dCur, dMax;
00299 
00300     if( !PCL.Open( istm, iSkip ) ) {
00301         cerr << "Could not open fuzzy PCL" << endl;
00302         return 1; }
00303 
00304     Dat.Open( PCL.GetGeneNames( ) );
00305     for( i = 0; i < Dat.GetGenes( ); ++i )
00306         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) {
00307             dMax = 0;
00308             for( k = 0; k < PCL.GetExperiments( ); ++k )
00309                 if( ( dCur = PCL.Get( i, k ) * PCL.Get( j, k ) ) > dMax )
00310                     dMax = dCur;
00311             Dat.Set( i, j, dMax ); }
00312 
00313     return 0; }