Sleipnir
tools/Answerer/Answerer.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 int read_genes( const char*, CGenome&, vector<CGenes*>& );
00026 
00027 int main( int iArgs, char** aszArgs ) {
00028         gengetopt_args_info sArgs;
00029         CGenome             Genome;
00030         CDat                Dat;
00031         vector<CGenes*>     vecpPositives, vecpNegatives;
00032         size_t              i, j, iPositives, iNegatives, cNegatives;
00033         int                 iRet;
00034         float               d, dProb;
00035 
00036         if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00037                 cmdline_parser_print_help( );
00038                 return 1; }
00039         CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg );
00040         if( sArgs.genome_arg ) {
00041           ifstream  ifsm;
00042           CGenome   Genome_tmp;
00043           CGenes    Genes( Genome_tmp );
00044 
00045           ifsm.open( sArgs.genome_arg );
00046           if( !Genes.Open( ifsm ) ){
00047             cerr << "Couldn't open: " << sArgs.genome_arg << endl;
00048             return 1;
00049           }
00050           for( i = 0; i < Genes.GetGenes( ); ++i )
00051             Genome.AddGene( Genes.GetGene( i ).GetName( ) );
00052 
00053           cerr << "Genome added " << Genome.GetGenes() << " genes from " << sArgs.genome_arg << endl;
00054         }
00055 
00056         if( ( sArgs.positives_arg && ( iRet = read_genes( sArgs.positives_arg, Genome, vecpPositives ) ) ) ||
00057                 ( sArgs.negatives_arg && ( iRet = read_genes( sArgs.negatives_arg, Genome, vecpNegatives ) ) ) )
00058                 return iRet;
00059 
00060         if( sArgs.input_arg ) {
00061                 CDat    DatPositives;
00062 
00063                 if( !DatPositives.Open( sArgs.input_arg, true ) ) {
00064                         cerr << "Could not open: " << sArgs.input_arg << endl;
00065                         return 1; }
00066                 for( i = 0; i < DatPositives.GetGenes( ); ++i )
00067                         Genome.AddGene( DatPositives.GetGene( i ) );
00068 
00069                 // ok when given a gene list and target prior, first only select negatives that have at least one annotated pair
00070                 // then later add random negatives that will bring it up to the target prior
00071                 if( sArgs.genome_arg && (sArgs.interactions_given || sArgs.prior_given || !!sArgs.incident_flag) ){
00072                   if( !Dat.Open( DatPositives, vecpNegatives, Genome, false, true ) ) {
00073                     cerr << "Could not open " << sArgs.input_arg << " with negatives" << endl;
00074                     return 1;
00075                   }
00076                 }
00077                 else if( !Dat.Open( DatPositives, vecpNegatives, Genome, false ) ) {
00078                   cerr << "Could not open " << sArgs.input_arg << " with negatives" << endl;
00079                   return 1; } }
00080         else
00081                 Dat.Open( vecpPositives, vecpNegatives, (float)sArgs.overlap_arg, Genome, !!sArgs.incident_flag );
00082         if( sArgs.interactions_given || sArgs.prior_given ) {
00083                 for( iPositives = cNegatives = i = 0; i < Dat.GetGenes( ); ++i )
00084                   for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00085                     if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ){
00086                       if( d == 1)
00087                         iPositives++;
00088                       else
00089                         cNegatives++;
00090                     }
00091 
00092                 if( sArgs.interactions_given )
00093                   iNegatives = (size_t)( ( (float)( iPositives * ( Dat.GetGenes( ) - 1 ) ) /
00094                                            sArgs.interactions_arg ) + 0.5f );
00095                 else
00096                   iNegatives = (size_t)( iPositives * (1 - sArgs.prior_arg) / sArgs.prior_arg + 0.5f );
00097 
00098                 // subtract negatives that are already assigned
00099                 // if existing negatives exceed the target number keep the current count.
00100                 if( iNegatives < cNegatives)
00101                   iNegatives = 0;
00102                 else
00103                   iNegatives -= cNegatives;
00104 
00105                 i = ( Dat.GetGenes( ) * ( Dat.GetGenes( ) - 1 ) / 2 ) - iPositives - cNegatives;
00106 
00107                 if( iNegatives > i ) {
00108                   cerr << "WARNING: Impossibly large negative interaction requirement: " << (iNegatives+cNegatives) << endl;
00109                   cerr << "WARNING: Failed to reach target prior, currently set at: " << ((float)iPositives / (iPositives+cNegatives)) << endl;
00110                   // set sampling count to 0, keep the lower prior
00111                   iNegatives = 0;
00112                 }
00113 
00114                 dProb = (float)iNegatives / i;
00115                 for( i = 0; i < Dat.GetGenes( ); ++i )
00116                   for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00117                     if( ( CMeta::IsNaN( d = Dat.Get( i, j ) ) ) &&
00118                         ( ( rand( ) / RAND_MAX ) < dProb ) )
00119                       Dat.Set( i, j, 0 );
00120         }
00121         if( sArgs.test_arg ) {
00122                 set<string>                 setstrGenes;
00123                 set<string>::const_iterator iterGenes;
00124                 ostream&                    ostm    = sArgs.output_arg ? cout : cerr;
00125 
00126                 for( i = 0; i < vecpPositives.size( ); ++i ) {
00127                         size_t  iGenes;
00128 
00129                         if( iGenes = vecpPositives[ i ]->GetGenes( ) )
00130                                 setstrGenes.insert( vecpPositives[ i ]->GetGene( rand( ) % iGenes ).GetName( ) ); }
00131                 while( setstrGenes.size( ) < ( sArgs.test_arg * Dat.GetGenes( ) ) )
00132                         setstrGenes.insert( Dat.GetGene( rand( ) % Dat.GetGenes( ) ) );
00133                 for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes )
00134                         ostm << *iterGenes << endl; }
00135         if( sArgs.exclude_arg ) {
00136                 CDat            DatExclude;
00137                 vector<size_t>  veciGenes;
00138                 size_t          iOne, iTwo;
00139 
00140                 if( !DatExclude.Open( sArgs.exclude_arg ) ) {
00141                         cerr << "Could not open: " << sArgs.exclude_arg << endl;
00142                         return 1; }
00143                 veciGenes.resize( DatExclude.GetGenes( ) );
00144                 for( i = 0; i < veciGenes.size( ); ++i )
00145                         veciGenes[i] = Dat.GetGene( DatExclude.GetGene( i ) );
00146                 for( i = 0; i < DatExclude.GetGenes( ); ++i ) {
00147                         if( ( iOne = veciGenes[i] ) == -1 )
00148                                 continue;
00149                         for( j = ( i + 1 ); j < DatExclude.GetGenes( ); ++j ) {
00150                                 if( ( iTwo = veciGenes[j] ) == -1 )
00151                                         continue;
00152                                 if( !CMeta::IsNaN( d = DatExclude.Get( i, j ) ) && d )
00153                                         Dat.Set( iOne, iTwo, CMeta::GetNaN( ) ); } } }
00154         if( sArgs.scramble_arg > 0 ) {
00155                 float   dPositives;
00156                 size_t  iPositives, iTotal;
00157 
00158                 for( i = iPositives = iTotal = 0; i < Dat.GetGenes( ); ++i )
00159                         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00160                                 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) {
00161                                         iTotal++;
00162                                         if( d )
00163                                                 iPositives++; }
00164                 dPositives = (float)iPositives / iTotal;
00165                 for( i = 0; i < Dat.GetGenes( ); ++i )
00166                         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00167                                 if( ( (float)rand( ) / RAND_MAX ) < sArgs.scramble_arg )
00168                                         Dat.Set( i, j, ( ( (float)rand( ) / RAND_MAX ) < dPositives ) ? 1.0f : 0 ); }
00169 
00170         Dat.Save( sArgs.output_arg );
00171 
00172         for( i = 0; i < vecpPositives.size( ); ++i )
00173                 delete vecpPositives[ i ];
00174         for( i = 0; i < vecpNegatives.size( ); ++i )
00175                 delete vecpNegatives[ i ];
00176         return 0; }
00177 
00178 int read_genes( const char* szDir, CGenome& Genome, vector<CGenes*>& vecpGenes ) {
00179         CGenes*     pGenes;
00180         string      strDir, strFile;
00181 
00182         strDir = szDir;
00183         FOR_EACH_DIRECTORY_FILE(strDir, strFile)
00184                 ifstream    ifsm;
00185 
00186                 strFile = strDir + '/' + strFile;
00187                 ifsm.open( strFile.c_str( ) );
00188                 pGenes = new CGenes( Genome );
00189                 if( !pGenes->Open( ifsm ) ) {
00190                         cerr << "Couldn't open: " << strFile << endl;
00191                         return 1; }
00192                 vecpGenes.push_back( pGenes ); }
00193 
00194         return 0; }