Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 const char c_szBP[] = "bp"; 00026 const char c_szCC[] = "cc"; 00027 const char c_szMF[] = "mf"; 00028 00029 double overlap( const CGenome&, const CSlim&, size_t, size_t ); 00030 00031 int main( int iArgs, char** aszArgs ) { 00032 CGenome Genome; 00033 gengetopt_args_info sArgs; 00034 ifstream ifsmAnno, ifsmOnto, ifsmTerm; 00035 COntologyOBO OBO; 00036 COntologyKEGG KEGG; 00037 COntologyMIPS MIPS; 00038 const IOntology* pOnto; 00039 CDat Dat; 00040 CSlim Slim, SlimNeg; 00041 ofstream ofsm; 00042 size_t i, j, k; 00043 map<const CGene*,bool> mapGenes; 00044 map<const CGene*,bool>::iterator iterGene; 00045 int iRet; 00046 const char* szNamespace; 00047 00048 iRet = cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 ); 00049 if( sArgs.config_arg ) 00050 iRet = cmdline_parser_configfile( sArgs.config_arg, &sArgs, 0, 0, 1 ) && iRet; 00051 if( iRet ) { 00052 cmdline_parser_print_help( ); 00053 return 1; } 00054 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00055 00056 if( sArgs.onto_arg ) { 00057 ifsmOnto.open( sArgs.onto_arg ); 00058 if( !strcmp( sArgs.namespace_arg, c_szBP ) ) 00059 szNamespace = COntologyOBO::c_szBiologicalProcess; 00060 else if( !strcmp( sArgs.namespace_arg, c_szCC ) ) 00061 szNamespace = COntologyOBO::c_szCellularComponent; 00062 else if( !strcmp( sArgs.namespace_arg, c_szMF ) ) 00063 szNamespace = COntologyOBO::c_szMolecularFunction; 00064 else 00065 szNamespace = sArgs.namespace_arg; 00066 00067 if( sArgs.obo_anno_arg ) { 00068 ifsmAnno.open( sArgs.obo_anno_arg ); 00069 00070 if( !OBO.Open( ifsmOnto, ifsmAnno, Genome, szNamespace, !!sArgs.dbids_flag ) ) { 00071 cerr << "Couldn't open: "; 00072 if( sArgs.obo_anno_arg ) 00073 cerr << sArgs.obo_anno_arg << ", "; 00074 cerr << sArgs.onto_arg << endl; 00075 return 1; 00076 } 00077 00078 ifsmAnno.close( ); 00079 pOnto = &OBO; 00080 } 00081 ifsmOnto.close( ); 00082 } 00083 00084 else if( sArgs.mips_onto_arg ) { 00085 ifsmOnto.open( sArgs.mips_onto_arg ); 00086 if( sArgs.mips_anno_arg ) 00087 ifsmAnno.open( sArgs.mips_anno_arg ); 00088 if( !MIPS.Open( ifsmOnto, ifsmAnno, Genome ) ) { 00089 cerr << "Couldn't open: " << sArgs.mips_anno_arg << ", " << 00090 sArgs.mips_onto_arg << endl; 00091 return 1; } 00092 ifsmOnto.close( ); 00093 if( sArgs.mips_anno_arg ) 00094 ifsmAnno.close( ); 00095 pOnto = &MIPS; } 00096 else if( sArgs.kegg_arg ) { 00097 ifsmOnto.open( sArgs.kegg_arg ); 00098 if( !KEGG.Open( ifsmOnto, Genome, sArgs.kegg_org_arg ) ) { 00099 cerr << "Couldn't open: " << sArgs.kegg_arg << endl; 00100 return 1; } 00101 ifsmOnto.close( ); 00102 pOnto = &KEGG; } 00103 else { 00104 cerr << "No ontology found." << endl; 00105 return 1; } 00106 00107 ifsmOnto.clear( ); 00108 ifsmOnto.open( sArgs.input_arg ); 00109 if( !Slim.Open( ifsmOnto, pOnto ) ) { 00110 cerr << "Couldn't open: " << sArgs.input_arg << endl; 00111 return 1; } 00112 ifsmOnto.close( ); 00113 00114 if( sArgs.sql_arg ) { 00115 ofsm.open( ( (string)sArgs.sql_arg + "/functions.txt" ).c_str( ) ); 00116 for( i = 0; i < Slim.GetSlims( ); ++i ) 00117 ofsm << ( i + 1 ) << '\t' << Slim.GetSlim( i ) << endl; 00118 ofsm.close( ); 00119 00120 ofsm.open( ( (string)sArgs.sql_arg + "/genes.txt" ).c_str( ) ); 00121 for( i = 0; i < Genome.GetGenes( ); ++i ) 00122 ofsm << ( i + 1 ) << '\t' << Genome.GetGene( i ).GetName( ) << endl; 00123 ofsm.close( ); 00124 00125 ofsm.open( ( (string)sArgs.sql_arg + "/gene_names.txt" ).c_str( ) ); 00126 for( k = i = 0; i < Genome.GetGenes( ); ++i ) { 00127 const CGene& Gene = Genome.GetGene( i ); 00128 00129 for( j = 0; j < Gene.GetSynonyms( ); ++j ) 00130 ofsm << ( k++ + 1 ) << '\t' << ( i + 1 ) << '\t' << Gene.GetSynonym( j ) << endl; } 00131 ofsm.close( ); 00132 00133 ofsm.open( ( (string)sArgs.sql_arg + "/functions_genes.txt" ).c_str( ) ); 00134 for( i = 0; i < Slim.GetSlims( ); ++i ) 00135 for( j = 0; j < Slim.GetGenes( i ); ++j ) 00136 ofsm << ( i + 1 ) << '\t' << ( Genome.GetGene( Slim.GetGene( i, j ).GetName( ) ) + 1 ) << endl; 00137 ofsm.close( ); 00138 00139 return 0; } 00140 00141 if( sArgs.nsets_flag ) { 00142 for( i = 0; i < Slim.GetSlims( ); ++i ) { 00143 map<const CGene*,bool> mapGenes; 00144 map<const CGene*,bool>::iterator iterGene; 00145 00146 for( j = 0; j < Slim.GetSlims( ); ++j ) { 00147 if( i == j ) 00148 continue; 00149 if( overlap( Genome, Slim, i, j ) < sArgs.nsetlap_arg ) 00150 for( k = 0; k < Slim.GetGenes( j ); ++k ) 00151 mapGenes[ &Slim.GetGene( j, k ) ] = false; 00152 else 00153 for( k = 0; k < Slim.GetGenes( j ); ++k ) { 00154 const CGene* pGene = &Slim.GetGene( j, k ); 00155 00156 if( mapGenes.find( pGene ) == mapGenes.end( ) ) 00157 mapGenes[ pGene ] = true; } } 00158 00159 ofsm.open( ( (string)sArgs.directory_arg + '/' + CMeta::Filename( Slim.GetSlim( i ) ) ).c_str( ) ); 00160 for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene ) 00161 if( iterGene->second ) 00162 ofsm << iterGene->first->GetName( ) << endl; 00163 ofsm.close( ); } 00164 return 0; } 00165 00166 if( sArgs.output_arg ) { 00167 if( sArgs.negatives_arg ) { 00168 ifsmOnto.clear( ); 00169 ifsmOnto.open( sArgs.negatives_arg ); 00170 if( !( SlimNeg.Open( ifsmOnto, pOnto ) && Dat.Open( Slim, SlimNeg ) ) ) { 00171 cerr << "Couldn't open: " << sArgs.negatives_arg << endl; 00172 return 1; } 00173 ifsmOnto.close( ); } 00174 else if( !Dat.Open( Slim ) ) { 00175 cerr << "Couldn't open: " << sArgs.input_arg << endl; 00176 return 1; } 00177 00178 Dat.Save( sArgs.output_arg ); } 00179 00180 if( sArgs.test_arg ) { 00181 for( i = 0; i < Slim.GetSlims( ); ++i ) 00182 for( j = 0; j < Slim.GetGenes( i ); ++j ) 00183 mapGenes[ &Slim.GetGene( i, j ) ] = false; 00184 for( i = 0; i < Slim.GetSlims( ); ++i ) 00185 mapGenes[ &Slim.GetGene( i, rand( ) % Slim.GetGenes( i ) ) ] = true; 00186 for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene ) 00187 if( ( (float)rand( ) / RAND_MAX ) < sArgs.test_arg ) 00188 iterGene->second = true; } 00189 00190 if( sArgs.annotations_arg ) { 00191 ofsm.clear( ); 00192 ofsm.open( sArgs.annotations_arg ); 00193 for( i = 0; i < Slim.GetSlims( ); ++i ) { 00194 00195 for( j = 0; j < Slim.GetGenes( i ); ++j ) { 00196 const CGene& Gene = Slim.GetGene( i, j ); 00197 const string& strName = 00198 (sArgs.synonyms_flag && Gene.GetSynonyms( )) ? Gene.GetSynonym( 0 ) 00199 : Gene.GetName( ); 00200 00201 ofsm << Slim.GetSlim( i ) << "\t" << strName; 00202 ofsm << endl; 00203 } 00204 } 00205 ofsm.close( ); 00206 } 00207 00208 if( sArgs.directory_arg ) 00209 for( i = 0; i < Slim.GetSlims( ); ++i ) { 00210 ofsm.clear( ); 00211 ofsm.open( ( (string)sArgs.directory_arg + '/' + 00212 CMeta::Filename( Slim.GetSlim( i ) ) ).c_str( ) ); 00213 for( j = 0; j < Slim.GetGenes( i ); ++j ) { 00214 const CGene& Gene = Slim.GetGene( i, j ); 00215 const string& strName = ( sArgs.synonyms_flag && Gene.GetSynonyms( ) ) ? 00216 Gene.GetSynonym( 0 ) : Gene.GetName( ); 00217 00218 ofsm << strName; 00219 if( sArgs.allids_flag ) 00220 for( k = 0; k < Gene.GetSynonyms( ); ++k ) 00221 ofsm << '\t' << ( ( !k && sArgs.synonyms_flag ) ? Gene.GetName( ) : 00222 Gene.GetSynonym( k ) ); 00223 ofsm << endl; } 00224 ofsm.close( ); } 00225 if( sArgs.test_arg ) 00226 for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene ) 00227 if( iterGene->second ) { 00228 const string& strName = ( sArgs.synonyms_flag && iterGene->first->GetSynonyms( ) ) ? 00229 iterGene->first->GetSynonym( 0 ) : iterGene->first->GetName( ); 00230 00231 cout << strName << endl; } 00232 00233 return 0; } 00234 00235 double overlap( const CGenome& Genome, const CSlim& Slim, size_t iOne, size_t iTwo ) { 00236 size_t i, iOverlap; 00237 set<const CGene*> setOne; 00238 00239 for( i = 0; i < Slim.GetGenes( iOne ); ++i ) 00240 setOne.insert( &Slim.GetGene( iOne, i ) ); 00241 for( iOverlap = i = 0; i < Slim.GetGenes( iTwo ); ++i ) 00242 if( setOne.find( &Slim.GetGene( iTwo, i ) ) != setOne.end( ) ) 00243 iOverlap++; 00244 00245 return CStatistics::HypergeometricCDF( iOverlap, Slim.GetGenes( iOne ), Slim.GetGenes( iTwo ), 00246 Genome.GetGenes( ) ); }