Sleipnir
tools/BNFunc/BNFunc.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 const char  c_szBP[]    = "bp";
00026 const char  c_szCC[]    = "cc";
00027 const char  c_szMF[]    = "mf";
00028 
00029 double overlap( const CGenome&, const CSlim&, size_t, size_t );
00030 
00031 int main( int iArgs, char** aszArgs ) {
00032     CGenome                             Genome;
00033     gengetopt_args_info                 sArgs;
00034     ifstream                            ifsmAnno, ifsmOnto, ifsmTerm;
00035     COntologyOBO OBO;
00036     COntologyKEGG                       KEGG;
00037     COntologyMIPS                       MIPS;
00038     const IOntology*                    pOnto;
00039     CDat                                Dat;
00040     CSlim                               Slim, SlimNeg;
00041     ofstream                            ofsm;
00042     size_t                              i, j, k;
00043     map<const CGene*,bool>              mapGenes;
00044     map<const CGene*,bool>::iterator    iterGene;
00045     int                                 iRet;
00046     const char*                         szNamespace;
00047 
00048     iRet = cmdline_parser2( iArgs, aszArgs, &sArgs, 0, 1, 0 );
00049     if( sArgs.config_arg )
00050         iRet = cmdline_parser_configfile( sArgs.config_arg, &sArgs, 0, 0, 1 ) && iRet;
00051     if( iRet ) {
00052         cmdline_parser_print_help( );
00053         return 1; }
00054     CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00055         
00056     if( sArgs.onto_arg ) {
00057         ifsmOnto.open( sArgs.onto_arg );
00058         if( !strcmp( sArgs.namespace_arg, c_szBP ) )
00059             szNamespace = COntologyOBO::c_szBiologicalProcess;
00060         else if( !strcmp( sArgs.namespace_arg, c_szCC ) )
00061             szNamespace = COntologyOBO::c_szCellularComponent;
00062         else if( !strcmp( sArgs.namespace_arg, c_szMF ) )
00063             szNamespace = COntologyOBO::c_szMolecularFunction;
00064         else
00065             szNamespace = sArgs.namespace_arg;
00066 
00067         if( sArgs.obo_anno_arg ) {
00068             ifsmAnno.open( sArgs.obo_anno_arg );
00069 
00070             if( !OBO.Open( ifsmOnto, ifsmAnno, Genome, szNamespace, !!sArgs.dbids_flag ) ) {
00071                 cerr << "Couldn't open: ";
00072                 if( sArgs.obo_anno_arg )
00073                     cerr << sArgs.obo_anno_arg << ", ";
00074                 cerr << sArgs.onto_arg << endl;
00075                 return 1;
00076             }
00077 
00078             ifsmAnno.close( );
00079             pOnto = &OBO;
00080         }
00081         ifsmOnto.close( );
00082     }
00083 
00084     else if( sArgs.mips_onto_arg ) {
00085         ifsmOnto.open( sArgs.mips_onto_arg );
00086         if( sArgs.mips_anno_arg )
00087             ifsmAnno.open( sArgs.mips_anno_arg );
00088         if( !MIPS.Open( ifsmOnto, ifsmAnno, Genome ) ) {
00089             cerr << "Couldn't open: " << sArgs.mips_anno_arg << ", " <<
00090                 sArgs.mips_onto_arg << endl;
00091             return 1; }
00092         ifsmOnto.close( );
00093         if( sArgs.mips_anno_arg )
00094             ifsmAnno.close( );
00095         pOnto = &MIPS; }
00096     else if( sArgs.kegg_arg ) {
00097         ifsmOnto.open( sArgs.kegg_arg );
00098         if( !KEGG.Open( ifsmOnto, Genome, sArgs.kegg_org_arg ) ) {
00099             cerr << "Couldn't open: " << sArgs.kegg_arg << endl;
00100             return 1; }
00101         ifsmOnto.close( );
00102         pOnto = &KEGG; }
00103     else {
00104         cerr << "No ontology found." << endl;
00105         return 1; }
00106 
00107     ifsmOnto.clear( );
00108     ifsmOnto.open( sArgs.input_arg );
00109     if( !Slim.Open( ifsmOnto, pOnto ) ) {
00110         cerr << "Couldn't open: " << sArgs.input_arg << endl;
00111         return 1; }
00112     ifsmOnto.close( );
00113 
00114     if( sArgs.sql_arg ) {
00115         ofsm.open( ( (string)sArgs.sql_arg + "/functions.txt" ).c_str( ) );
00116         for( i = 0; i < Slim.GetSlims( ); ++i )
00117             ofsm << ( i + 1 ) << '\t' << Slim.GetSlim( i ) << endl;
00118         ofsm.close( );
00119 
00120         ofsm.open( ( (string)sArgs.sql_arg + "/genes.txt" ).c_str( ) );
00121         for( i = 0; i < Genome.GetGenes( ); ++i )
00122             ofsm << ( i + 1 ) << '\t' << Genome.GetGene( i ).GetName( ) << endl;
00123         ofsm.close( );
00124 
00125         ofsm.open( ( (string)sArgs.sql_arg + "/gene_names.txt" ).c_str( ) );
00126         for( k = i = 0; i < Genome.GetGenes( ); ++i ) {
00127             const CGene&    Gene    = Genome.GetGene( i );
00128 
00129             for( j = 0; j < Gene.GetSynonyms( ); ++j )
00130                 ofsm << ( k++ + 1 ) << '\t' << ( i + 1 ) << '\t' << Gene.GetSynonym( j ) << endl; }
00131         ofsm.close( );
00132 
00133         ofsm.open( ( (string)sArgs.sql_arg + "/functions_genes.txt" ).c_str( ) );
00134         for( i = 0; i < Slim.GetSlims( ); ++i )
00135             for( j = 0; j < Slim.GetGenes( i ); ++j )
00136                 ofsm << ( i + 1 ) << '\t' << ( Genome.GetGene( Slim.GetGene( i, j ).GetName( ) ) + 1 ) << endl;
00137         ofsm.close( );
00138 
00139         return 0; }
00140 
00141     if( sArgs.nsets_flag ) {
00142         for( i = 0; i < Slim.GetSlims( ); ++i ) {
00143             map<const CGene*,bool>              mapGenes;
00144             map<const CGene*,bool>::iterator    iterGene;
00145 
00146             for( j = 0; j < Slim.GetSlims( ); ++j ) {
00147                 if( i == j )
00148                     continue;
00149                 if( overlap( Genome, Slim, i, j ) < sArgs.nsetlap_arg )
00150                     for( k = 0; k < Slim.GetGenes( j ); ++k )
00151                         mapGenes[ &Slim.GetGene( j, k ) ] = false;
00152                 else
00153                     for( k = 0; k < Slim.GetGenes( j ); ++k ) {
00154                         const CGene*    pGene   = &Slim.GetGene( j, k );
00155 
00156                         if( mapGenes.find( pGene ) == mapGenes.end( ) )
00157                             mapGenes[ pGene ] = true; } }
00158 
00159             ofsm.open( ( (string)sArgs.directory_arg + '/' + CMeta::Filename( Slim.GetSlim( i ) ) ).c_str( ) );
00160             for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene )
00161                 if( iterGene->second )
00162                     ofsm << iterGene->first->GetName( ) << endl;
00163             ofsm.close( ); }
00164         return 0; }
00165 
00166     if( sArgs.output_arg ) {
00167         if( sArgs.negatives_arg ) {
00168             ifsmOnto.clear( );
00169             ifsmOnto.open( sArgs.negatives_arg );
00170             if( !( SlimNeg.Open( ifsmOnto, pOnto ) && Dat.Open( Slim, SlimNeg ) ) ) {
00171                 cerr << "Couldn't open: " << sArgs.negatives_arg << endl;
00172                 return 1; }
00173             ifsmOnto.close( ); }
00174         else if( !Dat.Open( Slim ) ) {
00175             cerr << "Couldn't open: " << sArgs.input_arg << endl;
00176             return 1; }
00177 
00178         Dat.Save( sArgs.output_arg ); }
00179 
00180     if( sArgs.test_arg ) {
00181         for( i = 0; i < Slim.GetSlims( ); ++i )
00182             for( j = 0; j < Slim.GetGenes( i ); ++j )
00183                 mapGenes[ &Slim.GetGene( i, j ) ] = false;
00184         for( i = 0; i < Slim.GetSlims( ); ++i )
00185             mapGenes[ &Slim.GetGene( i, rand( ) % Slim.GetGenes( i ) ) ] = true;
00186         for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene )
00187             if( ( (float)rand( ) / RAND_MAX ) < sArgs.test_arg )
00188                 iterGene->second = true; }
00189 
00190     if( sArgs.annotations_arg ) {
00191         ofsm.clear( );
00192         ofsm.open( sArgs.annotations_arg );
00193         for( i = 0; i < Slim.GetSlims( ); ++i ) {
00194         
00195             for( j = 0; j < Slim.GetGenes( i ); ++j ) {
00196                 const CGene& Gene = Slim.GetGene( i, j );
00197                 const string& strName =
00198                         (sArgs.synonyms_flag && Gene.GetSynonyms( )) ? Gene.GetSynonym( 0 )
00199                                 : Gene.GetName( );
00200 
00201                 ofsm << Slim.GetSlim( i ) << "\t" << strName;
00202                 ofsm << endl;
00203             }
00204         }
00205         ofsm.close( );
00206     }
00207 
00208     if( sArgs.directory_arg )
00209         for( i = 0; i < Slim.GetSlims( ); ++i ) {
00210             ofsm.clear( );
00211             ofsm.open( ( (string)sArgs.directory_arg + '/' +
00212                 CMeta::Filename( Slim.GetSlim( i ) ) ).c_str( ) );
00213             for( j = 0; j < Slim.GetGenes( i ); ++j ) {
00214                 const CGene&    Gene    = Slim.GetGene( i, j );
00215                 const string&   strName = ( sArgs.synonyms_flag && Gene.GetSynonyms( ) ) ?
00216                     Gene.GetSynonym( 0 ) : Gene.GetName( );
00217             
00218                 ofsm << strName;
00219                 if( sArgs.allids_flag )
00220                     for( k = 0; k < Gene.GetSynonyms( ); ++k )
00221                         ofsm << '\t' << ( ( !k && sArgs.synonyms_flag ) ? Gene.GetName( ) :
00222                             Gene.GetSynonym( k ) );
00223                 ofsm << endl; }
00224             ofsm.close( ); }
00225     if( sArgs.test_arg )
00226         for( iterGene = mapGenes.begin( ); iterGene != mapGenes.end( ); ++iterGene )
00227             if( iterGene->second ) {
00228                 const string&   strName = ( sArgs.synonyms_flag && iterGene->first->GetSynonyms( ) ) ?
00229                     iterGene->first->GetSynonym( 0 ) : iterGene->first->GetName( );
00230 
00231                 cout << strName << endl; }
00232 
00233     return 0; }
00234 
00235 double overlap( const CGenome& Genome, const CSlim& Slim, size_t iOne, size_t iTwo ) {
00236     size_t              i, iOverlap;
00237     set<const CGene*>   setOne;
00238 
00239     for( i = 0; i < Slim.GetGenes( iOne ); ++i )
00240         setOne.insert( &Slim.GetGene( iOne, i ) );
00241     for( iOverlap = i = 0; i < Slim.GetGenes( iTwo ); ++i )
00242         if( setOne.find( &Slim.GetGene( iTwo, i ) ) != setOne.end( ) )
00243             iOverlap++;
00244 
00245     return CStatistics::HypergeometricCDF( iOverlap, Slim.GetGenes( iOne ), Slim.GetGenes( iTwo ),
00246         Genome.GetGenes( ) ); }