Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 #include <vector> 00025 00026 static const char c_acDab[] = ".dab"; 00027 static const char c_acQdab[] = ".qdab"; 00028 static const char c_acXdsl[] = ".xdsl"; 00029 00030 int main( int iArgs, char** aszArgs ) { 00031 gengetopt_args_info sArgs; 00032 ifstream ifsm; 00033 CBayesNetSmile BNSmile; 00034 CDat Dat, DatLookup; 00035 istream* pistm; 00036 vector<size_t> veciGenes; 00037 CPCL PCLLookup; 00038 vector<string> vecstrNodes, vecstrGenes, vecstrDummy; 00039 size_t i, j, k, l, iOne, iTwo, iGene, nStart, nEnd, num_to_skip; 00040 float dPrior; 00041 CDataMatrix MatCPT; 00042 unsigned char b; 00043 00044 vector<CBayesNetSmile> BNSmileList; 00045 vector<float> dPriorList; 00046 float SumPosterior; 00047 DIR* dp; 00048 struct dirent* ep; 00049 00050 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00051 cmdline_parser_print_help( ); 00052 return 1; } 00053 CMeta Meta( sArgs.verbosity_arg ); 00054 00055 // single network xdsl file 00056 if( sArgs.network_given ){ 00057 if( !BNSmile.Open( sArgs.network_arg ) ) { 00058 cerr << "Could not open: " << sArgs.network_arg << endl; 00059 return 1; } 00060 00061 BNSmileList.push_back( BNSmile ); 00062 } 00063 // directory directory network xdsl files: used for context average Edge2Post 00064 else if( sArgs.netdir_given ){ 00065 i = 0; 00066 dp = opendir (sArgs.netdir_arg); 00067 if (dp != NULL){ 00068 while (ep = readdir (dp)){ 00069 if( strstr(ep->d_name, c_acXdsl) != NULL ){ 00070 BNSmileList.push_back( CBayesNetSmile() ); 00071 if( !BNSmileList[i].Open(((string)sArgs.netdir_arg + "/" + ep->d_name).c_str() ) ){ 00072 cerr << "Could not open: " << (string)sArgs.netdir_arg + "/" + ep->d_name << endl; 00073 return 1; } 00074 i++; 00075 } 00076 } 00077 (void) closedir (dp); 00078 } 00079 00080 if ( BNSmileList.size() == 0 ){ 00081 cerr << "No xdsl files in given directory:" << sArgs.netdir_arg << endl; 00082 return 1; 00083 } 00084 } 00085 else{ 00086 cerr << "Need Network file or directory" << endl; 00087 return 1; 00088 } 00089 00090 if( !Dat.Open( sArgs.input_arg, !!sArgs.memmap_flag ) ) { 00091 cerr << "Could not open: " << sArgs.input_arg << endl; 00092 return 1; } 00093 00094 if( sArgs.cutoff_given ) { 00095 DatLookup.Open(Dat.GetGeneNames()); 00096 for( i = 0; i < Dat.GetGenes( ); ++i ) 00097 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00098 if( Dat.Get( i, j ) < sArgs.cutoff_arg ) 00099 Dat.Set( i, j, CMeta::GetNaN( ) ); 00100 else 00101 DatLookup.Set( i, j, 1); 00102 } 00103 else if( sArgs.lookup_arg ) { 00104 ifsm.open( sArgs.lookup_arg ); 00105 pistm = &ifsm; } 00106 else 00107 pistm = &cin; 00108 00109 if( sArgs.lookup_given && !DatLookup.Open( *pistm, CDat::EFormatText, 1 ) ) { 00110 cerr << "Could not open: " << ( sArgs.lookup_arg ? sArgs.lookup_arg : "stdin" ) << endl; 00111 return 1; } 00112 ifsm.close( ); 00113 00114 for( i = 0; i < DatLookup.GetGenes( ); ++i ) 00115 for( j = ( i + 1 ); j < DatLookup.GetGenes( ); ++j ) 00116 if( DatLookup.Get( i, j ) == 1 ) 00117 vecstrGenes.push_back( DatLookup.GetGene( i ) + " - " + DatLookup.GetGene( j ) ); 00118 cerr << vecstrGenes.size() << endl; 00119 00120 // Assumes all network xdsl has same datasets!! 00121 BNSmileList[ 0 ].GetNodes( vecstrNodes ); 00122 vecstrNodes[ 0 ] = sArgs.input_arg; 00123 00125 nStart = 0; 00126 nEnd = 0; 00127 num_to_skip = 0; 00128 if( sArgs.start_arg > -1){ 00129 nStart = sArgs.start_arg + 1; 00130 num_to_skip = sArgs.start_arg; 00131 00132 if( nStart == 2 ) 00133 vecstrNodes.erase( vecstrNodes.begin() + 1 ); 00134 else if( nStart <= vecstrNodes.size() ) 00135 vecstrNodes.erase( vecstrNodes.begin() + 1, vecstrNodes.begin() + nStart ); 00136 } 00137 if( sArgs.end_arg > -1 ){ 00138 nEnd = (sArgs.end_arg+1) - nStart + 1; 00139 00140 if( (nEnd+1) < vecstrNodes.size() ) 00141 vecstrNodes.erase( (vecstrNodes.begin() + nEnd + 1), vecstrNodes.end() ); 00142 } 00143 00144 PCLLookup.Open( vecstrGenes, vecstrNodes, vecstrDummy ); 00145 00146 veciGenes.resize( DatLookup.GetGenes( ) ); 00147 for( i = 0; i < veciGenes.size( ); ++i ) 00148 veciGenes[ i ] = Dat.GetGene( DatLookup.GetGene( i ) ); 00149 for( iGene = i = 0; i < DatLookup.GetGenes( ); ++i ) { 00150 iOne = veciGenes[ i ]; 00151 for( j = ( i + 1 ); j < DatLookup.GetGenes( ); ++j ) { 00152 iTwo = veciGenes[ j ]; 00153 if( DatLookup.Get( i, j ) != 1 ) 00154 continue; 00155 if( ( iOne != -1 ) && ( iTwo != -1 ) ) 00156 PCLLookup.Set( iGene, 0, Dat.Get( iOne, iTwo ) ); 00157 iGene++; } } 00158 00159 // store all priors from each network 00160 dPriorList.resize( BNSmileList.size( ) ); 00161 for( i = 0; i < BNSmileList.size( ); ++i ) { 00162 BNSmileList[i].GetCPT( 0, MatCPT ); 00163 dPriorList[i] = 1 - MatCPT.Get( 0, 0 ); 00164 } 00165 00166 for( i = 1; i < vecstrNodes.size( ); ++i ) { 00167 CDataPair DatCur; 00168 string strIn; 00169 size_t iValue; 00170 00171 cerr << vecstrNodes[ i ] << endl; 00172 strIn = (string)sArgs.directory_arg + "/" + vecstrNodes[ i ] + c_acDab; 00173 if( !DatCur.Open( strIn.c_str( ), false, !!sArgs.memmap_flag ) ) { 00174 strIn = (string)sArgs.directory_arg + "/" + vecstrNodes[ i ] + c_acQdab; 00175 if( !DatCur.Open( strIn.c_str( ), false, !!sArgs.memmap_flag ) ) { 00176 cerr << "Could not open: " << strIn << endl; 00177 return 1; }} 00178 00179 for( j = 0; j < veciGenes.size( ); ++j ) 00180 veciGenes[ j ] = DatCur.GetGene( DatLookup.GetGene( j ) ); 00181 00182 // Cache each context's bin posterior prob 00183 // num bins * num contexts 00184 // DatCur.GetValues( ) * BNSmileList.size( ); 00185 float CacheBinPost[BNSmileList.size( )][DatCur.GetValues( )]; 00186 for(l = 0; l < BNSmileList.size( ); ++l){ 00187 for( j = 0; j < DatCur.GetValues( ); ++j ){ 00188 CacheBinPost[l][j] = (1 - BNSmileList[l].Evaluate( i + num_to_skip, (unsigned char)j ) - dPriorList[l]); 00189 } 00190 } 00191 00192 for( iGene = j = 0; j < DatLookup.GetGenes( ); ++j ) { 00193 iOne = veciGenes[ j ]; 00194 for( k = ( j + 1 ); k < DatLookup.GetGenes( ); ++k ) { 00195 if( DatLookup.Get( j, k ) != 1 ) 00196 continue; 00197 iTwo = veciGenes[ k ]; 00198 iValue = -1; 00199 00200 // Use zeros flag if edge is missing and both genes exist in dataset 00201 iValue = DatCur.Quantize( iOne, iTwo, BNSmileList[0].GetDefault( i + num_to_skip ) ); 00202 00203 if( iValue != -1 ){ 00204 // iterate through network xdsl files and average the posteriors 00205 SumPosterior = 0; 00206 for( l = 0; l < BNSmileList.size( ); ++l ) { 00207 SumPosterior += CacheBinPost[l][iValue]; 00208 } 00209 PCLLookup.Set( iGene, i, SumPosterior / BNSmileList.size() ); 00210 } 00211 00212 iGene++; } } } 00213 PCLLookup.Save( cout ); 00214 00215 return 0; }