Sleipnir
tools/Orthologer/Orthologer.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 static const size_t c_iConfirmation     = 512;
00026 static const char   c_szOrthologized[]  = "_ortho.dab";
00027 
00028 int main( int iArgs, char** aszArgs ) {
00029     gengetopt_args_info     sArgs;
00030     COrthology              Orthology;
00031     ifstream                ifsm;
00032     size_t                  i, j, iOrganism, iCluster, iOne, iTwo, iOrgOne, iIndexOne, iIndexTwo;
00033     size_t                  iGeneOne, iGeneTwo;
00034     vector<size_t>          veciHits, veciOrgsIn2Orth, veciOrgsOrth2In;
00035     vector<CDatasetCompact> vecData;
00036     vector<string>          vecstrData;
00037     CBinaryMatrix           MatRelated;
00038 
00039     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00040         cmdline_parser_print_help( );
00041         return 1; }
00042     CMeta Meta( sArgs.verbosity_arg );
00043 
00044     if( sArgs.input_arg )
00045         ifsm.open( sArgs.input_arg );
00046     if( !Orthology.Open( sArgs.input_arg ? ifsm : cin ) ) {
00047         cerr << "Could not open: " << ( sArgs.input_arg || "standard input" ) << endl;
00048         return 1; }
00049     if( sArgs.input_arg )
00050         ifsm.close( );
00051 
00052     veciHits.resize( Orthology.GetGenomes( ) );
00053     veciOrgsIn2Orth.resize( sArgs.inputs_num );
00054     for( iOrganism = 0; iOrganism < sArgs.inputs_num; ++iOrganism ) {
00055         CDat    Answers;
00056 
00057         ifsm.clear( );
00058         ifsm.open( sArgs.inputs[ iOrganism ] );
00059         if( !Answers.OpenGenes( ifsm, true ) ) {
00060             cerr << "Could not open: " << sArgs.input_arg << endl;
00061             return 1; }
00062         ifsm.close( );
00063 
00064         for( i = 0; i < Orthology.GetGenomes( ); ++i ) {
00065             const CGenome*  pGenome = Orthology.GetGenome( i );
00066 
00067             veciHits[ i ] = 0;
00068             for( j = 0; j < min( Answers.GetGenes( ), c_iConfirmation ); ++j )
00069                 if( pGenome->FindGene( Answers.GetGene( j ) ) != -1 )
00070                     veciHits[ i ]++; }
00071 
00072         veciOrgsIn2Orth[ iOrganism ] = 0;
00073         for( i = 1; i < veciHits.size( ); ++i )
00074             if( veciHits[ i ] > veciHits[ veciOrgsIn2Orth[ iOrganism ] ] )
00075                 veciOrgsIn2Orth[ iOrganism ] = i; }
00076     veciOrgsOrth2In.resize( Orthology.GetGenomes( ) );
00077     for( i = 0; i < veciOrgsIn2Orth.size( ); ++i )
00078         veciOrgsOrth2In[ veciOrgsIn2Orth[ i ] ] = i;
00079 
00080     for( i = 0; i < sArgs.inputs_num; ++i )
00081         cerr << sArgs.inputs[ i ] << ": " << Orthology.GetOrganism( veciOrgsIn2Orth[ i ] ) << endl;
00082 
00083     vecData.resize( sArgs.inputs_num );
00084     vecstrData.resize( 1 );
00085     for( i = 0; i < vecData.size( ); ++i ) {
00086         vecstrData[ 0 ] = sArgs.inputs[ i ];
00087         if( !vecData[ i ].Open( vecstrData ) ) {
00088             cerr << "Could not open inputs" << endl;
00089             return 1; } }
00090 
00091     vector<vector<pair<size_t, size_t> > >  vecvecpriiOrthology;
00092     vecvecpriiOrthology.resize( Orthology.GetClusters( ) );
00093     for( i = 0; i < vecvecpriiOrthology.size( ); ++i ) {
00094         vecvecpriiOrthology[ i ].resize( Orthology.GetGenes( i ) );
00095         for( j = 0; j < vecvecpriiOrthology[ i ].size( ); ++j ) {
00096             const CGene&            GeneOne = Orthology.GetGene( i, j );
00097             const CDatasetCompact&  Dataset = vecData[ iOrgOne = veciOrgsOrth2In[
00098                 Orthology.GetOrganism( GeneOne ) ] ];
00099 
00100             vecvecpriiOrthology[ i ][ j ].first = Dataset.GetGene( GeneOne.GetName( ) );
00101             vecvecpriiOrthology[ i ][ j ].second = iOrgOne; } }
00102 
00103     MatRelated.Initialize( vecvecpriiOrthology.size( ) );
00104     for( iOne = 0; iOne < MatRelated.GetSize( ); ++iOne ) {
00105         if( !( iOne % 1000 ) )
00106             cerr << "Linking group " << iOne << '/' << MatRelated.GetSize( ) << endl;
00107         for( iTwo = ( iOne + 1 ); iTwo < MatRelated.GetSize( ); ++iTwo )
00108             for( iGeneOne = 0; iGeneOne < vecvecpriiOrthology[ iOne ].size( ); ++iGeneOne ) {
00109                 const pair<size_t, size_t>& priiOne = vecvecpriiOrthology[ iOne ][ iGeneOne ];
00110                 const CDatasetCompact&      Dataset = vecData[ priiOne.second ];
00111 
00112                 if( priiOne.first == -1 )
00113                     continue;
00114                 for( iGeneTwo = 0; iGeneTwo < Orthology.GetGenes( iTwo ); ++iGeneTwo ) {
00115                     const pair<size_t, size_t>& priiTwo = vecvecpriiOrthology[ iTwo ][ iGeneTwo ];
00116 
00117                     if( ( priiOne.second == priiTwo.second ) && ( priiTwo.first != -1 ) &&
00118                         ( ( i = Dataset.GetDiscrete( priiOne.first, priiTwo.first, 0 ) ) != -1 ) && i ) {
00119                         if( sArgs.verbosity_arg > 6 )
00120                             cerr << "Linked " << iOne << ':' << iTwo << " by " <<
00121                                 sArgs.inputs[ priiOne.second ] << ' ' << Dataset.GetGene(
00122                                 priiOne.first ) << ':' << Dataset.GetGene( priiTwo.first ) << endl;
00123                         MatRelated.Set( iOne, iTwo, true );
00124                         break; } }
00125                 if( iGeneTwo < Orthology.GetGenes( iTwo ) )
00126                     break; } }
00127 
00128     for( iOrganism = 0; iOrganism < sArgs.inputs_num; ++iOrganism ) {
00129         CDat                        Dat;
00130         vector<string>              vecstrGenes;
00131         map<const CGene*,size_t>    mapGenes;
00132         size_t                      iPositives, iNegatives;
00133 
00134         cerr << "Processing " << sArgs.inputs[ iOrganism ] << endl;
00135         vecstrGenes.resize( vecData[ iOrganism ].GetGenes( ) );
00136         for( i = 0; i < vecstrGenes.size( ); ++i )
00137             vecstrGenes[ i ] = vecData[ iOrganism ].GetGene( i );
00138         for( iCluster = 0; iCluster < Orthology.GetClusters( ); ++iCluster )
00139             for( iOne = 0; iOne < Orthology.GetGenes( iCluster ); ++iOne ) {
00140                 const CGene&    Gene    = Orthology.GetGene( iCluster, iOne );
00141 
00142                 if( ( Orthology.GetOrganism( Gene ) == veciOrgsIn2Orth[ iOrganism ] ) &&
00143                     ( vecData[ iOrganism ].GetGene( Gene.GetName( ) ) == -1 ) ) {
00144                     mapGenes[ &Gene ] = vecstrGenes.size( );
00145                     vecstrGenes.push_back( Gene.GetName( ) ); } }
00146 
00147         Dat.Open( vecstrGenes, true );
00148         cerr << "Integrating orthology clusters" << endl;
00149         for( iPositives = iOne = 0; iOne < vecData[ iOrganism ].GetGenes( ); ++iOne )
00150             for( iTwo = ( iOne + 1 ); iTwo < vecData[ iOrganism ].GetGenes( ); ++iTwo )
00151                 if( ( ( i = vecData[ iOrganism ].GetDiscrete( iOne, iTwo, 0 ) ) != -1 ) && i ) {
00152                     iPositives++;
00153                     Dat.Set( iOne, iTwo, 1 ); }
00154         for( iOne = 0; iOne < vecvecpriiOrthology.size( ); ++iOne )
00155             for( iGeneOne = 0; iGeneOne < vecvecpriiOrthology[ iOne ].size( ); ++iGeneOne ) {
00156                 const pair<size_t, size_t>& priiOne = vecvecpriiOrthology[ iOne ][ iGeneOne ];
00157 
00158                 if( priiOne.second != iOrganism )
00159                     continue;
00160                 iIndexOne = ( priiOne.first == -1 ) ? mapGenes[ &Orthology.GetGene( iOne,
00161                     iGeneOne ) ] : priiOne.first;
00162                 for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < vecvecpriiOrthology[ iOne ].size( );
00163                     ++iGeneTwo ) {
00164                     const pair<size_t, size_t>& priiTwo = vecvecpriiOrthology[ iOne ][ iGeneTwo ];
00165 
00166                     if( priiOne.second == priiTwo.second ) {
00167                         iIndexTwo = ( priiTwo.first == -1 ) ? mapGenes[ &Orthology.GetGene( iOne,
00168                             iGeneTwo ) ] : priiTwo.first;
00169                         if( CMeta::IsNaN( Dat.Get( iIndexOne, iIndexTwo ) ) ) {
00170                             iPositives++;
00171                             Dat.Set( iIndexOne, iIndexTwo, (float)sArgs.weight1_arg ); } } } }
00172 
00173         cerr << "Integrating orthology links" << endl;
00174         for( iOne = 0; iOne < MatRelated.GetSize( ); ++iOne ) {
00175             if( !( iOne % 1000 ) )
00176                 cerr << "Linking group " << iOne << '/' << MatRelated.GetSize( ) << endl;
00177             for( iTwo = ( iOne + 1 ); iTwo < MatRelated.GetSize( ); ++iTwo )
00178                 if( MatRelated.Get( iOne, iTwo ) )
00179                     for( iGeneOne = 0; iGeneOne < vecvecpriiOrthology[ iOne ].size( ); ++iGeneOne ) {
00180                         const pair<size_t, size_t>& priiOne = vecvecpriiOrthology[ iOne ][ iGeneOne ];
00181 
00182                         if( priiOne.second != iOrganism )
00183                             continue;
00184                         iIndexOne = ( priiOne.first == -1 ) ? mapGenes[ &Orthology.GetGene( iOne,
00185                             iGeneOne ) ] : priiOne.first;
00186                         for( iGeneTwo = 0; iGeneTwo < vecvecpriiOrthology[ iTwo ].size( ); ++iGeneTwo ) {
00187                             const pair<size_t, size_t>& priiTwo = vecvecpriiOrthology[ iTwo ][ iGeneTwo ];
00188 
00189                             if( priiOne.second == priiTwo.second ) {
00190                                 iIndexTwo = ( priiTwo.first == -1 ) ? mapGenes[ &Orthology.GetGene(
00191                                     iTwo, iGeneTwo ) ] : priiTwo.first;
00192                                 if( CMeta::IsNaN( Dat.Get( iIndexOne, iIndexTwo ) ) ) {
00193                                     iPositives++;
00194                                     Dat.Set( iIndexOne, iIndexTwo, (float)sArgs.weight2_arg ); } } } } }
00195 
00196         i = Dat.GetGenes( ) * ( Dat.GetGenes( ) - 1 ) / 2;
00197         iNegatives = min( (size_t)( iPositives * ( ( 1 / sArgs.positives_arg ) - 1 ) ), i - iPositives );
00198         if( iNegatives ) {
00199             float   dNegatives;
00200 
00201             dNegatives = (float)iNegatives / i;
00202             cerr << "Generating random negatives (" << ( dNegatives * 100 ) << "%)" << endl;
00203             for( i = 0; i < Dat.GetGenes( ); ++i )
00204                 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00205                     if( CMeta::IsNaN( Dat.Get( i, j ) ) && ( ( (float)rand( ) / RAND_MAX ) < dNegatives ) )
00206                         Dat.Set( i, j, 0 ); }
00207 
00208         Dat.Save( ( CMeta::Deextension( sArgs.inputs[ iOrganism ] ) + c_szOrthologized ).c_str( ) ); }
00209 
00210     return 0; }