Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 int read_genes( const char*, CGenome&, vector<CGenes*>& ); 00026 00027 int main( int iArgs, char** aszArgs ) { 00028 gengetopt_args_info sArgs; 00029 CGenome Genome; 00030 CDat Dat; 00031 vector<CGenes*> vecpPositives, vecpNegatives; 00032 size_t i, j, iPositives, iNegatives, cNegatives; 00033 int iRet; 00034 float d, dProb; 00035 00036 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00037 cmdline_parser_print_help( ); 00038 return 1; } 00039 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00040 if( sArgs.genome_arg ) { 00041 ifstream ifsm; 00042 CGenome Genome_tmp; 00043 CGenes Genes( Genome_tmp ); 00044 00045 ifsm.open( sArgs.genome_arg ); 00046 if( !Genes.Open( ifsm ) ){ 00047 cerr << "Couldn't open: " << sArgs.genome_arg << endl; 00048 return 1; 00049 } 00050 for( i = 0; i < Genes.GetGenes( ); ++i ) 00051 Genome.AddGene( Genes.GetGene( i ).GetName( ) ); 00052 00053 cerr << "Genome added " << Genome.GetGenes() << " genes from " << sArgs.genome_arg << endl; 00054 } 00055 00056 if( ( sArgs.positives_arg && ( iRet = read_genes( sArgs.positives_arg, Genome, vecpPositives ) ) ) || 00057 ( sArgs.negatives_arg && ( iRet = read_genes( sArgs.negatives_arg, Genome, vecpNegatives ) ) ) ) 00058 return iRet; 00059 00060 if( sArgs.input_arg ) { 00061 CDat DatPositives; 00062 00063 if( !DatPositives.Open( sArgs.input_arg, true ) ) { 00064 cerr << "Could not open: " << sArgs.input_arg << endl; 00065 return 1; } 00066 for( i = 0; i < DatPositives.GetGenes( ); ++i ) 00067 Genome.AddGene( DatPositives.GetGene( i ) ); 00068 00069 // ok when given a gene list and target prior, first only select negatives that have at least one annotated pair 00070 // then later add random negatives that will bring it up to the target prior 00071 if( sArgs.genome_arg && (sArgs.interactions_given || sArgs.prior_given || !!sArgs.incident_flag) ){ 00072 if( !Dat.Open( DatPositives, vecpNegatives, Genome, false, true ) ) { 00073 cerr << "Could not open " << sArgs.input_arg << " with negatives" << endl; 00074 return 1; 00075 } 00076 } 00077 else if( !Dat.Open( DatPositives, vecpNegatives, Genome, false ) ) { 00078 cerr << "Could not open " << sArgs.input_arg << " with negatives" << endl; 00079 return 1; } } 00080 else 00081 Dat.Open( vecpPositives, vecpNegatives, (float)sArgs.overlap_arg, Genome, !!sArgs.incident_flag ); 00082 if( sArgs.interactions_given || sArgs.prior_given ) { 00083 for( iPositives = cNegatives = i = 0; i < Dat.GetGenes( ); ++i ) 00084 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00085 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ){ 00086 if( d == 1) 00087 iPositives++; 00088 else 00089 cNegatives++; 00090 } 00091 00092 if( sArgs.interactions_given ) 00093 iNegatives = (size_t)( ( (float)( iPositives * ( Dat.GetGenes( ) - 1 ) ) / 00094 sArgs.interactions_arg ) + 0.5f ); 00095 else 00096 iNegatives = (size_t)( iPositives * (1 - sArgs.prior_arg) / sArgs.prior_arg + 0.5f ); 00097 00098 // subtract negatives that are already assigned 00099 // if existing negatives exceed the target number keep the current count. 00100 if( iNegatives < cNegatives) 00101 iNegatives = 0; 00102 else 00103 iNegatives -= cNegatives; 00104 00105 i = ( Dat.GetGenes( ) * ( Dat.GetGenes( ) - 1 ) / 2 ) - iPositives - cNegatives; 00106 00107 if( iNegatives > i ) { 00108 cerr << "WARNING: Impossibly large negative interaction requirement: " << (iNegatives+cNegatives) << endl; 00109 cerr << "WARNING: Failed to reach target prior, currently set at: " << ((float)iPositives / (iPositives+cNegatives)) << endl; 00110 // set sampling count to 0, keep the lower prior 00111 iNegatives = 0; 00112 } 00113 00114 dProb = (float)iNegatives / i; 00115 for( i = 0; i < Dat.GetGenes( ); ++i ) 00116 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00117 if( ( CMeta::IsNaN( d = Dat.Get( i, j ) ) ) && 00118 ( ( rand( ) / RAND_MAX ) < dProb ) ) 00119 Dat.Set( i, j, 0 ); 00120 } 00121 if( sArgs.test_arg ) { 00122 set<string> setstrGenes; 00123 set<string>::const_iterator iterGenes; 00124 ostream& ostm = sArgs.output_arg ? cout : cerr; 00125 00126 for( i = 0; i < vecpPositives.size( ); ++i ) { 00127 size_t iGenes; 00128 00129 if( iGenes = vecpPositives[ i ]->GetGenes( ) ) 00130 setstrGenes.insert( vecpPositives[ i ]->GetGene( rand( ) % iGenes ).GetName( ) ); } 00131 while( setstrGenes.size( ) < ( sArgs.test_arg * Dat.GetGenes( ) ) ) 00132 setstrGenes.insert( Dat.GetGene( rand( ) % Dat.GetGenes( ) ) ); 00133 for( iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); ++iterGenes ) 00134 ostm << *iterGenes << endl; } 00135 if( sArgs.exclude_arg ) { 00136 CDat DatExclude; 00137 vector<size_t> veciGenes; 00138 size_t iOne, iTwo; 00139 00140 if( !DatExclude.Open( sArgs.exclude_arg ) ) { 00141 cerr << "Could not open: " << sArgs.exclude_arg << endl; 00142 return 1; } 00143 veciGenes.resize( DatExclude.GetGenes( ) ); 00144 for( i = 0; i < veciGenes.size( ); ++i ) 00145 veciGenes[i] = Dat.GetGene( DatExclude.GetGene( i ) ); 00146 for( i = 0; i < DatExclude.GetGenes( ); ++i ) { 00147 if( ( iOne = veciGenes[i] ) == -1 ) 00148 continue; 00149 for( j = ( i + 1 ); j < DatExclude.GetGenes( ); ++j ) { 00150 if( ( iTwo = veciGenes[j] ) == -1 ) 00151 continue; 00152 if( !CMeta::IsNaN( d = DatExclude.Get( i, j ) ) && d ) 00153 Dat.Set( iOne, iTwo, CMeta::GetNaN( ) ); } } } 00154 if( sArgs.scramble_arg > 0 ) { 00155 float dPositives; 00156 size_t iPositives, iTotal; 00157 00158 for( i = iPositives = iTotal = 0; i < Dat.GetGenes( ); ++i ) 00159 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00160 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) { 00161 iTotal++; 00162 if( d ) 00163 iPositives++; } 00164 dPositives = (float)iPositives / iTotal; 00165 for( i = 0; i < Dat.GetGenes( ); ++i ) 00166 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00167 if( ( (float)rand( ) / RAND_MAX ) < sArgs.scramble_arg ) 00168 Dat.Set( i, j, ( ( (float)rand( ) / RAND_MAX ) < dPositives ) ? 1.0f : 0 ); } 00169 00170 Dat.Save( sArgs.output_arg ); 00171 00172 for( i = 0; i < vecpPositives.size( ); ++i ) 00173 delete vecpPositives[ i ]; 00174 for( i = 0; i < vecpNegatives.size( ); ++i ) 00175 delete vecpNegatives[ i ]; 00176 return 0; } 00177 00178 int read_genes( const char* szDir, CGenome& Genome, vector<CGenes*>& vecpGenes ) { 00179 CGenes* pGenes; 00180 string strDir, strFile; 00181 00182 strDir = szDir; 00183 FOR_EACH_DIRECTORY_FILE(strDir, strFile) 00184 ifstream ifsm; 00185 00186 strFile = strDir + '/' + strFile; 00187 ifsm.open( strFile.c_str( ) ); 00188 pGenes = new CGenes( Genome ); 00189 if( !pGenes->Open( ifsm ) ) { 00190 cerr << "Couldn't open: " << strFile << endl; 00191 return 1; } 00192 vecpGenes.push_back( pGenes ); } 00193 00194 return 0; }