Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_szExclude[] = "exclude"; 00026 static const char c_szInclude[] = "include"; 00027 static const char c_szOnly[] = "only"; 00028 static const char c_szUnknown[] = "GO:0008150"; 00029 00030 struct SDatum { 00031 float m_dDiff; 00032 float m_dData; 00033 float m_dAnswer; 00034 size_t m_iOne; 00035 size_t m_iTwo; 00036 00037 SDatum( float dAnswer, float dData, size_t iOne, size_t iTwo ) : m_dData(dData), m_dAnswer(dAnswer), m_iOne(iOne), m_iTwo(iTwo) { 00038 00039 m_dDiff = fabs( dData - dAnswer ); } 00040 }; 00041 00042 struct SSorter { 00043 enum EMode { 00044 EModeDiff, 00045 EModeData, 00046 EModeAnswer 00047 }; 00048 00049 EMode m_eMode; 00050 bool m_fReverse; 00051 00052 SSorter( EMode eMode, bool fReverse ) : m_eMode(eMode), m_fReverse(fReverse) { } 00053 00054 bool operator()( const SDatum& sOne, const SDatum& sTwo ) const { 00055 bool fRet; 00056 float dOne, dTwo; 00057 00058 switch( m_eMode ) { 00059 case EModeData: 00060 dOne = sOne.m_dData; 00061 dTwo = sTwo.m_dData; 00062 break; 00063 00064 case EModeAnswer: 00065 dOne = sOne.m_dAnswer; 00066 dTwo = sTwo.m_dAnswer; 00067 00068 default: 00069 dOne = sOne.m_dDiff; 00070 dTwo = sTwo.m_dDiff; 00071 break; } 00072 00073 return ( m_fReverse ? ( dOne < dTwo ) : ( dTwo < dOne ) ); } 00074 }; 00075 00076 int main( int iArgs, char** aszArgs ) { 00077 CDat Answers, Data; 00078 gengetopt_args_info sArgs; 00079 vector<size_t> veciGenes; 00080 size_t i, j, k, iOne, iTwo, iNumber; 00081 float dValue, dAnswer; 00082 vector<SDatum> vecsData; 00083 ifstream ifsm; 00084 COntologyOBO GOBP; 00085 CGenome Genome; 00086 CGene* pOne; 00087 CGene* pTwo; 00088 bool fOne, fTwo; 00089 string strOne, strTwo; 00090 int iRet; 00091 SSorter::EMode eMode; 00092 00093 iRet = cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 ); 00094 if( sArgs.config_arg ) 00095 iRet = cmdline_parser_configfile( sArgs.config_arg, &sArgs, 0, 0, 1 ) && iRet; 00096 if( iRet ) { 00097 cmdline_parser_print_help( ); 00098 return iRet; } 00099 CMeta Meta( sArgs.verbosity_arg ); 00100 00101 // if( !strcmp( c_szInclude, sArgs.unknowns_arg ) || !strcmp( c_szOnly, sArgs.unknowns_arg ) ) 00102 // sArgs.everything_flag = true; 00103 00104 if( !Answers.Open( sArgs.answers_arg, sArgs.memmap_flag && !sArgs.genes_arg && !sArgs.genet_arg && !sArgs.genex_arg ) ) { 00105 cerr << "Couldn't open: " << sArgs.answers_arg << endl; 00106 return 1; } 00107 if( sArgs.genes_arg && !Answers.FilterGenes( sArgs.genes_arg, CDat::EFilterInclude ) ) { 00108 cerr << "Couldn't open: " << sArgs.genes_arg << endl; 00109 return 1; } 00110 if( sArgs.genet_arg && !Answers.FilterGenes( sArgs.genet_arg, CDat::EFilterTerm ) ) { 00111 cerr << "Couldn't open: " << sArgs.genet_arg << endl; 00112 return 1; } 00113 if( sArgs.genee_arg && !Answers.FilterGenes( sArgs.genee_arg, CDat::EFilterEdge ) ) { 00114 cerr << "Couldn't open: " << sArgs.genee_arg << endl; 00115 return 1; } 00116 if( sArgs.genex_arg && !Answers.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) { 00117 cerr << "Couldn't open: " << sArgs.genex_arg << endl; 00118 return 1; } 00119 if( !Data.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag && !sArgs.invert_flag ) ) { 00120 cerr << "Couldn't open: " << sArgs.input_arg << endl; 00121 return 1; } 00122 if( sArgs.normalize_flag ) 00123 Data.Normalize( CDat::ENormalizeMinMax ); 00124 if( sArgs.invert_flag ) 00125 Data.Invert( ); 00126 00127 if( sArgs.features_arg ) { 00128 ifsm.open( sArgs.features_arg ); 00129 if( !Genome.Open( ifsm ) ) { 00130 cerr << "Could not open: " << sArgs.features_arg << endl; 00131 return 1; } 00132 ifsm.close( ); } 00133 00134 if( sArgs.go_onto_arg ) { 00135 ifstream ifsmGenes; 00136 00137 ifsm.clear( ); 00138 ifsm.open( sArgs.go_onto_arg ); 00139 if( sArgs.go_anno_arg ) 00140 ifsmGenes.open( sArgs.go_anno_arg ); 00141 if( !GOBP.Open( ifsm, ifsmGenes, Genome, COntologyOBO::c_szBiologicalProcess ) ) { 00142 cerr << "Could not open: " << sArgs.go_onto_arg << ", " << sArgs.go_anno_arg << endl; 00143 return 1; } 00144 ifsm.close( ); } 00145 00146 if( !strcmp( sArgs.mode_arg, "data" ) ) 00147 eMode = SSorter::EModeData; 00148 else if( !strcmp( sArgs.mode_arg, "ans" ) ) 00149 eMode = SSorter::EModeAnswer; 00150 else 00151 eMode = SSorter::EModeDiff; 00152 00153 veciGenes.resize( Data.GetGenes( ) ); 00154 for( i = 0; i < Data.GetGenes( ); ++i ) 00155 veciGenes[ i ] = Answers.GetGene( Data.GetGene( i ) ); 00156 for( i = 0; i < Data.GetGenes( ); ++i ) { 00157 if( !( i % 1000 ) ) 00158 cerr << "Gene " << i << '/' << Data.GetGenes( ) << endl; 00159 if( !sArgs.everything_flag && ( ( iOne = veciGenes[ i ] ) == -1 ) ) 00160 continue; 00161 for( j = ( i + 1 ); j < Data.GetGenes( ); ++j ) { 00162 if( CMeta::IsNaN( dValue = Data.Get( i, j ) ) ) 00163 continue; 00164 dAnswer = CMeta::GetNaN( ); 00165 if( !sArgs.everything_flag && ( ( ( iTwo = veciGenes[ j ] ) == -1 ) || 00166 CMeta::IsNaN( dAnswer = Answers.Get( iOne, iTwo ) ) || 00167 ( sArgs.positives_flag && ( dAnswer <= 0 ) ) || 00168 ( sArgs.negatives_flag && ( dAnswer > 0 ) ) ) ) 00169 continue; 00170 if( ( (float)rand( ) / RAND_MAX ) > sArgs.fraction_arg ) 00171 continue; 00172 if( sArgs.everything_flag && CMeta::IsNaN( dAnswer ) ) 00173 dAnswer = dValue ? ( dValue - ( 1 / dValue ) ) : -FLT_MAX; 00174 vecsData.push_back( SDatum( dAnswer, dValue, i, j ) ); } } 00175 sort( vecsData.begin( ), vecsData.end( ), SSorter( eMode, !!sArgs.reverse_flag ) ); 00176 00177 if( ( ( iNumber = sArgs.count_arg ) < 0 ) || ( iNumber >= vecsData.size( ) ) ) 00178 iNumber = vecsData.size( ); 00179 for( i = 0; i < iNumber; ++i ) { 00180 const SDatum& Datum = vecsData[ i ]; 00181 00182 pOne = pTwo = NULL; 00183 strOne = Data.GetGene( Datum.m_iOne ); 00184 strTwo = Data.GetGene( Datum.m_iTwo ); 00185 if( Genome.GetGenes( ) ) { 00186 iOne = Genome.GetGene( strOne ); 00187 pOne = ( iOne == -1 ) ? NULL : &Genome.GetGene( iOne ); 00188 iTwo = Genome.GetGene( strTwo ); 00189 pTwo = ( iTwo == -1 ) ? NULL : &Genome.GetGene( iTwo ); 00190 fOne = fTwo = true; 00191 if( pOne ) 00192 for( j = 0; j < pOne->GetOntologies( ); ++j ) 00193 if( pOne->GetAnnotations( j ) && ( ( pOne->GetAnnotations( j ) > 1 ) || 00194 ( pOne->GetOntology( j )->GetID( pOne->GetAnnotation( j, 0 ) ) != c_szUnknown ) ) ) { 00195 fOne = false; 00196 break; } 00197 if( pTwo ) 00198 for( j = 0; j < pTwo->GetOntologies( ); ++j ) 00199 if( pTwo->GetAnnotations( j ) && ( ( pTwo->GetAnnotations( j ) > 1 ) || 00200 ( pTwo->GetOntology( j )->GetID( pTwo->GetAnnotation( j, 0 ) ) != c_szUnknown ) ) ) { 00201 fTwo = false; 00202 break; } 00203 if( fOne || fTwo ) { 00204 if( !strcmp( c_szExclude, sArgs.unknowns_arg ) ) 00205 continue; } 00206 else if( !strcmp( c_szOnly, sArgs.unknowns_arg ) ) 00207 continue; } 00208 00209 cout << strOne << '\t' << strTwo << '\t' << Data.Get( Datum.m_iOne, Datum.m_iTwo ) << '\t'; 00210 dAnswer = ( ( ( j = veciGenes[ Datum.m_iOne ] ) == -1 ) || ( ( k = veciGenes[ Datum.m_iTwo ] ) == -1 ) ) ? 00211 CMeta::GetNaN( ) : Answers.Get( j, k ); 00212 if( CMeta::IsNaN( dAnswer ) ) 00213 cout << '-'; 00214 else 00215 cout << dAnswer; 00216 cout << endl; 00217 if( pOne ) { 00218 cout << '\t'; 00219 if( pOne->GetSynonyms( ) ) 00220 cout << pOne->GetSynonym( 0 ); 00221 cout << '\t' << pOne->GetGloss( ); } 00222 if( Genome.GetGenes( ) ) 00223 cout << endl; 00224 if( pTwo ) { 00225 cout << '\t'; 00226 if( pTwo->GetSynonyms( ) ) 00227 cout << pTwo->GetSynonym( 0 ); 00228 cout << '\t' << pTwo->GetGloss( ); } 00229 if( Genome.GetGenes( ) ) 00230 cout << endl; } 00231 00232 return 0; }