Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const size_t c_iConfirmation = 512; 00026 static const char c_szOrthologized[] = "_ortho.dab"; 00027 00028 int main( int iArgs, char** aszArgs ) { 00029 gengetopt_args_info sArgs; 00030 COrthology Orthology; 00031 ifstream ifsm; 00032 size_t i, j, iOrganism, iCluster, iOne, iTwo, iOrgOne, iIndexOne, iIndexTwo; 00033 size_t iGeneOne, iGeneTwo; 00034 vector<size_t> veciHits, veciOrgsIn2Orth, veciOrgsOrth2In; 00035 vector<CDatasetCompact> vecData; 00036 vector<string> vecstrData; 00037 CBinaryMatrix MatRelated; 00038 00039 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00040 cmdline_parser_print_help( ); 00041 return 1; } 00042 CMeta Meta( sArgs.verbosity_arg ); 00043 00044 if( sArgs.input_arg ) 00045 ifsm.open( sArgs.input_arg ); 00046 if( !Orthology.Open( sArgs.input_arg ? ifsm : cin ) ) { 00047 cerr << "Could not open: " << ( sArgs.input_arg || "standard input" ) << endl; 00048 return 1; } 00049 if( sArgs.input_arg ) 00050 ifsm.close( ); 00051 00052 veciHits.resize( Orthology.GetGenomes( ) ); 00053 veciOrgsIn2Orth.resize( sArgs.inputs_num ); 00054 for( iOrganism = 0; iOrganism < sArgs.inputs_num; ++iOrganism ) { 00055 CDat Answers; 00056 00057 ifsm.clear( ); 00058 ifsm.open( sArgs.inputs[ iOrganism ] ); 00059 if( !Answers.OpenGenes( ifsm, true ) ) { 00060 cerr << "Could not open: " << sArgs.input_arg << endl; 00061 return 1; } 00062 ifsm.close( ); 00063 00064 for( i = 0; i < Orthology.GetGenomes( ); ++i ) { 00065 const CGenome* pGenome = Orthology.GetGenome( i ); 00066 00067 veciHits[ i ] = 0; 00068 for( j = 0; j < min( Answers.GetGenes( ), c_iConfirmation ); ++j ) 00069 if( pGenome->FindGene( Answers.GetGene( j ) ) != -1 ) 00070 veciHits[ i ]++; } 00071 00072 veciOrgsIn2Orth[ iOrganism ] = 0; 00073 for( i = 1; i < veciHits.size( ); ++i ) 00074 if( veciHits[ i ] > veciHits[ veciOrgsIn2Orth[ iOrganism ] ] ) 00075 veciOrgsIn2Orth[ iOrganism ] = i; } 00076 veciOrgsOrth2In.resize( Orthology.GetGenomes( ) ); 00077 for( i = 0; i < veciOrgsIn2Orth.size( ); ++i ) 00078 veciOrgsOrth2In[ veciOrgsIn2Orth[ i ] ] = i; 00079 00080 for( i = 0; i < sArgs.inputs_num; ++i ) 00081 cerr << sArgs.inputs[ i ] << ": " << Orthology.GetOrganism( veciOrgsIn2Orth[ i ] ) << endl; 00082 00083 vecData.resize( sArgs.inputs_num ); 00084 vecstrData.resize( 1 ); 00085 for( i = 0; i < vecData.size( ); ++i ) { 00086 vecstrData[ 0 ] = sArgs.inputs[ i ]; 00087 if( !vecData[ i ].Open( vecstrData ) ) { 00088 cerr << "Could not open inputs" << endl; 00089 return 1; } } 00090 00091 vector<vector<pair<size_t, size_t> > > vecvecpriiOrthology; 00092 vecvecpriiOrthology.resize( Orthology.GetClusters( ) ); 00093 for( i = 0; i < vecvecpriiOrthology.size( ); ++i ) { 00094 vecvecpriiOrthology[ i ].resize( Orthology.GetGenes( i ) ); 00095 for( j = 0; j < vecvecpriiOrthology[ i ].size( ); ++j ) { 00096 const CGene& GeneOne = Orthology.GetGene( i, j ); 00097 const CDatasetCompact& Dataset = vecData[ iOrgOne = veciOrgsOrth2In[ 00098 Orthology.GetOrganism( GeneOne ) ] ]; 00099 00100 vecvecpriiOrthology[ i ][ j ].first = Dataset.GetGene( GeneOne.GetName( ) ); 00101 vecvecpriiOrthology[ i ][ j ].second = iOrgOne; } } 00102 00103 MatRelated.Initialize( vecvecpriiOrthology.size( ) ); 00104 for( iOne = 0; iOne < MatRelated.GetSize( ); ++iOne ) { 00105 if( !( iOne % 1000 ) ) 00106 cerr << "Linking group " << iOne << '/' << MatRelated.GetSize( ) << endl; 00107 for( iTwo = ( iOne + 1 ); iTwo < MatRelated.GetSize( ); ++iTwo ) 00108 for( iGeneOne = 0; iGeneOne < vecvecpriiOrthology[ iOne ].size( ); ++iGeneOne ) { 00109 const pair<size_t, size_t>& priiOne = vecvecpriiOrthology[ iOne ][ iGeneOne ]; 00110 const CDatasetCompact& Dataset = vecData[ priiOne.second ]; 00111 00112 if( priiOne.first == -1 ) 00113 continue; 00114 for( iGeneTwo = 0; iGeneTwo < Orthology.GetGenes( iTwo ); ++iGeneTwo ) { 00115 const pair<size_t, size_t>& priiTwo = vecvecpriiOrthology[ iTwo ][ iGeneTwo ]; 00116 00117 if( ( priiOne.second == priiTwo.second ) && ( priiTwo.first != -1 ) && 00118 ( ( i = Dataset.GetDiscrete( priiOne.first, priiTwo.first, 0 ) ) != -1 ) && i ) { 00119 if( sArgs.verbosity_arg > 6 ) 00120 cerr << "Linked " << iOne << ':' << iTwo << " by " << 00121 sArgs.inputs[ priiOne.second ] << ' ' << Dataset.GetGene( 00122 priiOne.first ) << ':' << Dataset.GetGene( priiTwo.first ) << endl; 00123 MatRelated.Set( iOne, iTwo, true ); 00124 break; } } 00125 if( iGeneTwo < Orthology.GetGenes( iTwo ) ) 00126 break; } } 00127 00128 for( iOrganism = 0; iOrganism < sArgs.inputs_num; ++iOrganism ) { 00129 CDat Dat; 00130 vector<string> vecstrGenes; 00131 map<const CGene*,size_t> mapGenes; 00132 size_t iPositives, iNegatives; 00133 00134 cerr << "Processing " << sArgs.inputs[ iOrganism ] << endl; 00135 vecstrGenes.resize( vecData[ iOrganism ].GetGenes( ) ); 00136 for( i = 0; i < vecstrGenes.size( ); ++i ) 00137 vecstrGenes[ i ] = vecData[ iOrganism ].GetGene( i ); 00138 for( iCluster = 0; iCluster < Orthology.GetClusters( ); ++iCluster ) 00139 for( iOne = 0; iOne < Orthology.GetGenes( iCluster ); ++iOne ) { 00140 const CGene& Gene = Orthology.GetGene( iCluster, iOne ); 00141 00142 if( ( Orthology.GetOrganism( Gene ) == veciOrgsIn2Orth[ iOrganism ] ) && 00143 ( vecData[ iOrganism ].GetGene( Gene.GetName( ) ) == -1 ) ) { 00144 mapGenes[ &Gene ] = vecstrGenes.size( ); 00145 vecstrGenes.push_back( Gene.GetName( ) ); } } 00146 00147 Dat.Open( vecstrGenes, true ); 00148 cerr << "Integrating orthology clusters" << endl; 00149 for( iPositives = iOne = 0; iOne < vecData[ iOrganism ].GetGenes( ); ++iOne ) 00150 for( iTwo = ( iOne + 1 ); iTwo < vecData[ iOrganism ].GetGenes( ); ++iTwo ) 00151 if( ( ( i = vecData[ iOrganism ].GetDiscrete( iOne, iTwo, 0 ) ) != -1 ) && i ) { 00152 iPositives++; 00153 Dat.Set( iOne, iTwo, 1 ); } 00154 for( iOne = 0; iOne < vecvecpriiOrthology.size( ); ++iOne ) 00155 for( iGeneOne = 0; iGeneOne < vecvecpriiOrthology[ iOne ].size( ); ++iGeneOne ) { 00156 const pair<size_t, size_t>& priiOne = vecvecpriiOrthology[ iOne ][ iGeneOne ]; 00157 00158 if( priiOne.second != iOrganism ) 00159 continue; 00160 iIndexOne = ( priiOne.first == -1 ) ? mapGenes[ &Orthology.GetGene( iOne, 00161 iGeneOne ) ] : priiOne.first; 00162 for( iGeneTwo = ( iGeneOne + 1 ); iGeneTwo < vecvecpriiOrthology[ iOne ].size( ); 00163 ++iGeneTwo ) { 00164 const pair<size_t, size_t>& priiTwo = vecvecpriiOrthology[ iOne ][ iGeneTwo ]; 00165 00166 if( priiOne.second == priiTwo.second ) { 00167 iIndexTwo = ( priiTwo.first == -1 ) ? mapGenes[ &Orthology.GetGene( iOne, 00168 iGeneTwo ) ] : priiTwo.first; 00169 if( CMeta::IsNaN( Dat.Get( iIndexOne, iIndexTwo ) ) ) { 00170 iPositives++; 00171 Dat.Set( iIndexOne, iIndexTwo, (float)sArgs.weight1_arg ); } } } } 00172 00173 cerr << "Integrating orthology links" << endl; 00174 for( iOne = 0; iOne < MatRelated.GetSize( ); ++iOne ) { 00175 if( !( iOne % 1000 ) ) 00176 cerr << "Linking group " << iOne << '/' << MatRelated.GetSize( ) << endl; 00177 for( iTwo = ( iOne + 1 ); iTwo < MatRelated.GetSize( ); ++iTwo ) 00178 if( MatRelated.Get( iOne, iTwo ) ) 00179 for( iGeneOne = 0; iGeneOne < vecvecpriiOrthology[ iOne ].size( ); ++iGeneOne ) { 00180 const pair<size_t, size_t>& priiOne = vecvecpriiOrthology[ iOne ][ iGeneOne ]; 00181 00182 if( priiOne.second != iOrganism ) 00183 continue; 00184 iIndexOne = ( priiOne.first == -1 ) ? mapGenes[ &Orthology.GetGene( iOne, 00185 iGeneOne ) ] : priiOne.first; 00186 for( iGeneTwo = 0; iGeneTwo < vecvecpriiOrthology[ iTwo ].size( ); ++iGeneTwo ) { 00187 const pair<size_t, size_t>& priiTwo = vecvecpriiOrthology[ iTwo ][ iGeneTwo ]; 00188 00189 if( priiOne.second == priiTwo.second ) { 00190 iIndexTwo = ( priiTwo.first == -1 ) ? mapGenes[ &Orthology.GetGene( 00191 iTwo, iGeneTwo ) ] : priiTwo.first; 00192 if( CMeta::IsNaN( Dat.Get( iIndexOne, iIndexTwo ) ) ) { 00193 iPositives++; 00194 Dat.Set( iIndexOne, iIndexTwo, (float)sArgs.weight2_arg ); } } } } } 00195 00196 i = Dat.GetGenes( ) * ( Dat.GetGenes( ) - 1 ) / 2; 00197 iNegatives = min( (size_t)( iPositives * ( ( 1 / sArgs.positives_arg ) - 1 ) ), i - iPositives ); 00198 if( iNegatives ) { 00199 float dNegatives; 00200 00201 dNegatives = (float)iNegatives / i; 00202 cerr << "Generating random negatives (" << ( dNegatives * 100 ) << "%)" << endl; 00203 for( i = 0; i < Dat.GetGenes( ); ++i ) 00204 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00205 if( CMeta::IsNaN( Dat.Get( i, j ) ) && ( ( (float)rand( ) / RAND_MAX ) < dNegatives ) ) 00206 Dat.Set( i, j, 0 ); } 00207 00208 Dat.Save( ( CMeta::Deextension( sArgs.inputs[ iOrganism ] ) + c_szOrthologized ).c_str( ) ); } 00209 00210 return 0; }