Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_szXDSL[] = ".xdsl"; 00026 static const char c_szDSL[] = ".dsl"; 00027 00028 int main_database( const gengetopt_args_info& ); 00029 int main_datfile( const gengetopt_args_info& ); 00030 size_t count_contexts( const char* ); 00031 bool open_contexts( const char*, size_t, size_t, CCompactFullMatrix& ); 00032 00033 int main( int iArgs, char** aszArgs ) { 00034 gengetopt_args_info sArgs; 00035 int iRet; 00036 00037 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00038 cmdline_parser_print_help( ); 00039 return 1; } 00040 CMeta Meta( sArgs.verbosity_arg ); 00041 00042 iRet = sArgs.dat_flag ? main_datfile( sArgs ) : main_database( sArgs ); 00043 00044 return iRet; } 00045 00046 size_t count_contexts( const char* szContexts ) { 00047 static const size_t c_iBuffer = 1024; 00048 char acBuffer[ c_iBuffer ]; 00049 ifstream ifsm; 00050 vector<string> vecstrLine; 00051 size_t iRet, iCur; 00052 00053 iRet = -1; 00054 ifsm.open( szContexts ); 00055 if( !ifsm.is_open( ) ) { 00056 cerr << "Could not open: " << szContexts << endl; 00057 return -1; } 00058 while( !ifsm.eof( ) ) { 00059 ifsm.getline( acBuffer, c_iBuffer - 1 ); 00060 acBuffer[ c_iBuffer - 1 ] = 0; 00061 if( !acBuffer[ 0 ] ) 00062 continue; 00063 vecstrLine.clear( ); 00064 CMeta::Tokenize( acBuffer, vecstrLine ); 00065 if( vecstrLine.empty( ) ) { 00066 cerr << "Invalid line in " << szContexts << ": " << acBuffer << endl; 00067 return -1; } 00068 iCur = atoi( vecstrLine[ 0 ].c_str( ) ); 00069 if( ( iRet == -1 ) || ( iCur > iRet ) ) 00070 iRet = iCur; } 00071 ifsm.close( ); 00072 00073 return iRet; } 00074 00075 bool open_contexts( const char* szContexts, size_t iContexts, size_t iGenes, 00076 CCompactFullMatrix& MatContexts ) { 00077 static const size_t c_iBuffer = 1024; 00078 char acBuffer[ c_iBuffer ]; 00079 ifstream ifsm; 00080 vector<string> vecstrLine; 00081 00082 if( ( iContexts == -1 ) && ( ( iContexts = count_contexts( szContexts ) ) == -1 ) ) 00083 return false; 00084 00085 ifsm.open( szContexts ); 00086 if( !ifsm.is_open( ) ) { 00087 cerr << "Could not open: " << szContexts << endl; 00088 return false; } 00089 MatContexts.Initialize( iContexts, iGenes, 2, true ); 00090 while( !ifsm.eof( ) ) { 00091 size_t iContext, iGene; 00092 00093 ifsm.getline( acBuffer, c_iBuffer - 1 ); 00094 acBuffer[ c_iBuffer - 1 ] = 0; 00095 if( !acBuffer[ 0 ] ) 00096 continue; 00097 vecstrLine.clear( ); 00098 CMeta::Tokenize( acBuffer, vecstrLine ); 00099 if( vecstrLine.size( ) != 2 ) { 00100 cerr << "Invalid line in " << szContexts << ": " << acBuffer << endl; 00101 return false; } 00102 iContext = atoi( vecstrLine[ 0 ].c_str( ) ) - 1; 00103 iGene = atoi( vecstrLine[ 1 ].c_str( ) ) - 1; 00104 if( iContext >= MatContexts.GetRows( ) ) { 00105 cerr << "Invalid context on line: " << acBuffer << endl; 00106 return false; } 00107 if( iGene >= MatContexts.GetColumns( ) ) { 00108 cerr << "Invalid gene on line: " << acBuffer << endl; 00109 return false; } 00110 MatContexts.Set( iContext, iGene, 1 ); } 00111 ifsm.close( ); 00112 00113 return true; } 00114 00115 int main_database( const gengetopt_args_info& sArgs ) { 00116 static const size_t c_iBuffer = 1024; 00117 char acBuffer[ c_iBuffer ]; 00118 CBayesNetSmile BNSmile; 00119 CBayesNetMinimal BNDefault; 00120 vector<CBayesNetMinimal> vecBNs; 00121 ifstream ifsm; 00122 istream* pistm; 00123 map<size_t, string> mapistrBNs; 00124 map<size_t, string>::const_iterator iterBN; 00125 size_t i, iMax, iGene; 00126 //CDatabase Database; 00127 uint32_t iSize; 00128 float* adGenes; 00129 CDat Dat; 00130 CCompactFullMatrix MatContexts; 00131 vector<string> vecstrLine; 00132 00133 bool isNibble = true; 00134 if(sArgs.is_nibble_arg==0){ 00135 isNibble = false; 00136 } 00137 CDatabase Database(isNibble); 00138 00139 if( !Database.Open( sArgs.database_arg ) ) { 00140 cerr << "Could not open: " << sArgs.database_arg << endl; 00141 return 1; } 00142 if( sArgs.minimal_in_flag ) { 00143 ifsm.open( sArgs.networks_arg, ios_base::binary ); 00144 if( !BNDefault.Open( ifsm ) ) { 00145 cerr << "Could not read: " << sArgs.networks_arg << endl; 00146 return 1; } 00147 ifsm.read( (char*)&iSize, sizeof(iSize) ); 00148 vecBNs.resize( iSize ); 00149 for( i = 0; i < vecBNs.size( ); ++i ) 00150 if( !vecBNs[ i ].Open( ifsm ) ) { 00151 cerr << "Could not read: " << sArgs.networks_arg << " (" << i << ")" << endl; 00152 return 1; } 00153 ifsm.close( ); } 00154 else { 00155 if( sArgs.default_arg && !( BNSmile.Open( sArgs.default_arg ) && BNDefault.Open( BNSmile ) ) ) { 00156 cerr << "Could not open: " << sArgs.default_arg << endl; 00157 return 1; } 00158 BNDefault.SetID( sArgs.default_arg ); 00159 if( sArgs.input_arg ) { 00160 ifsm.open( sArgs.input_arg ); 00161 pistm = &ifsm; } 00162 else 00163 pistm = &cin; 00164 iMax = 0; 00165 while( !pistm->eof( ) ) { 00166 pistm->getline( acBuffer, c_iBuffer - 1 ); 00167 acBuffer[ c_iBuffer - 1 ] = 0; 00168 if( !acBuffer[ 0 ] ) 00169 continue; 00170 vecstrLine.clear( ); 00171 CMeta::Tokenize( acBuffer, vecstrLine ); 00172 if( vecstrLine.size( ) < 2 ) { 00173 cerr << "Ignoring line: " << acBuffer << endl; 00174 continue; } 00175 if( ( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) > iMax ) 00176 iMax = i; 00177 mapistrBNs[ i ] = vecstrLine[ 1 ]; } 00178 if( sArgs.input_arg ) 00179 ifsm.close( ); 00180 vecBNs.resize( iMax ); 00181 for( iterBN = mapistrBNs.begin( ); iterBN != mapistrBNs.end( ); ++iterBN ) { 00182 if( !( BNSmile.Open( ( (string)sArgs.networks_arg + '/' + CMeta::Filename( iterBN->second ) + 00183 ( sArgs.xdsl_flag ? c_szXDSL : c_szDSL ) ).c_str( ) ) && 00184 vecBNs[ iterBN->first - 1 ].Open( BNSmile ) ) ) { 00185 cerr << "Could not open: " << iterBN->second << endl; 00186 return 1; } 00187 vecBNs[ iterBN->first - 1 ].SetID( iterBN->second ); } } 00188 00189 if( sArgs.minimal_out_arg ) { 00190 ofstream ofsm; 00191 00192 ofsm.open( sArgs.minimal_out_arg, ios_base::binary ); 00193 BNDefault.Save( ofsm ); 00194 iSize = (uint32_t)vecBNs.size( ); 00195 ofsm.write( (const char*)&iSize, sizeof(iSize) ); 00196 for( i = 0; i < vecBNs.size( ); ++i ) 00197 vecBNs[ i ].Save( ofsm ); } 00198 00199 if( !open_contexts( sArgs.contexts_arg, vecBNs.size( ), Database.GetGenes( ), MatContexts ) ) 00200 return 1; 00201 00202 vecstrLine.resize( Database.GetGenes( ) ); 00203 for( i = 0; i < vecstrLine.size( ); ++i ) 00204 vecstrLine[ i ] = Database.GetGene( i ); 00205 Dat.Open( vecstrLine ); 00206 adGenes = new float[ Database.GetGenes( ) ]; 00207 for( iGene = 0; ( iGene + 1 ) < Database.GetGenes( ); ++iGene ) { 00208 vector<unsigned char> vecbData; 00209 00210 if( !Database.Get( iGene, vecbData ) ) { 00211 cerr << "Could not retrieve gene: " << iGene << " (" << Database.GetGene( iGene ) << ")" << endl; 00212 return 1; } 00213 if( !vecBNs[ sArgs.context_arg - 1 ].Evaluate( vecbData, adGenes, Database.GetGenes( ), iGene + 1 ) ) { 00214 cerr << "Inference failed on gene: " << iGene << " (" << Database.GetGene( iGene ) << ")" << endl; 00215 return 1; } 00216 00217 for( i = ( iGene + 1 ); i < Database.GetGenes( ); ++i ) 00218 Dat.Set( iGene, i, adGenes[ i ] ); } 00219 delete[] adGenes; 00220 00221 return 0; } 00222 00223 int main_datfile( const gengetopt_args_info& sArgs ) { 00224 static const size_t c_iBuffer = 1024; 00225 char acBuffer[ c_iBuffer ]; 00226 CDat Dat; 00227 size_t i, j, iGene, iIn, iOut; 00228 float d, dIn, dOut; 00229 CCompactFullMatrix MatContexts; 00230 vector<size_t> veciGenes; 00231 map<string, size_t> mapstriGenes; 00232 map<string, size_t>::const_iterator iterGene; 00233 ifstream ifsm; 00234 vector<string> vecstrLine, vecstrGenes; 00235 00236 if( sArgs.input_arg ) { 00237 if( !Dat.Open( sArgs.input_arg, !!sArgs.memmap_flag ) ) { 00238 cerr << "Could not open: " << sArgs.input_arg << endl; 00239 return 1; } } 00240 else { 00241 if( !Dat.Open( cin, CDat::EFormatText ) ) { 00242 cerr << "Could not open DAT" << endl; 00243 return 1; } } 00244 00245 ifsm.open( sArgs.genes_arg ); 00246 if( !ifsm.is_open( ) ) { 00247 cerr << "Could not open: " << sArgs.genes_arg << endl; 00248 return -1; } 00249 while( !ifsm.eof( ) ) { 00250 ifsm.getline( acBuffer, c_iBuffer - 1 ); 00251 acBuffer[ c_iBuffer - 1 ] = 0; 00252 if( !acBuffer[ 0 ] ) 00253 continue; 00254 vecstrLine.clear( ); 00255 CMeta::Tokenize( acBuffer, vecstrLine ); 00256 if( vecstrLine.size( ) != 2 ) { 00257 cerr << "Invalid line in " << sArgs.genes_arg << ": " << acBuffer << endl; 00258 return -1; } 00259 vecstrGenes.push_back( vecstrLine[ 1 ] ); 00260 mapstriGenes[ vecstrLine[ 1 ] ] = atoi( vecstrLine[ 0 ].c_str( ) ); } 00261 ifsm.close( ); 00262 00263 if( !open_contexts( sArgs.contexts_arg, -1, Dat.GetGenes( ), MatContexts ) ) 00264 return false; 00265 00266 if( sArgs.lookup_arg ) { 00267 veciGenes.resize( Dat.GetGenes( ) ); 00268 for( i = 0; i < Dat.GetGenes( ); ++i ) 00269 veciGenes[ i ] = ( ( iterGene = mapstriGenes.find( Dat.GetGene( i ) ) ) == mapstriGenes.end( ) ) ? 00270 -1 : ( iterGene->second - 1 ); 00271 if( ( iGene = Dat.GetGene( sArgs.lookup_arg ) ) == -1 ) { 00272 cerr << "Unknown gene: " << sArgs.lookup_arg << endl; 00273 return 1; } 00274 dIn = dOut = 0; 00275 for( iIn = iOut = i = 0; i < Dat.GetGenes( ); ++i ) { 00276 if( ( i == iGene ) || CMeta::IsNaN( d = Dat.Get( iGene, i ) ) || ( veciGenes[ i ] == -1 ) ) 00277 continue; 00278 if( MatContexts.Get( sArgs.context_arg - 1, veciGenes[ i ] ) ) { 00279 iIn++; 00280 dIn += d; } 00281 iOut++; 00282 dOut += d; } 00283 cout << Dat.GetGene( iGene ) << '\t' << ( dIn / iIn ) << '\t' << ( dOut / iOut ) << endl; } 00284 else { 00285 veciGenes.resize( vecstrGenes.size( ) ); 00286 for( i = 0; i < vecstrGenes.size( ); ++i ) 00287 veciGenes[ i ] = Dat.GetGene( vecstrGenes[ i ] ); 00288 for( i = 0; i < vecstrGenes.size( ); ++i ) { 00289 if( !( i % 100 ) ) 00290 cerr << "Gene " << i << '/' << vecstrGenes.size( ) << endl; 00291 if( ( iGene = veciGenes[ i ] ) == -1 ) { 00292 cout << vecstrGenes[ i ] << '\t' << 0 << '\t' << 0 << endl; 00293 continue; } 00294 dIn = dOut = 0; 00295 for( iIn = iOut = j = 0; j < vecstrGenes.size( ); ++j ) { 00296 if( ( i == j ) || ( veciGenes[ j ] == -1 ) || 00297 CMeta::IsNaN( d = Dat.Get( iGene, veciGenes[ j ] ) ) ) 00298 continue; 00299 if( MatContexts.Get( sArgs.context_arg - 1, j ) ) { 00300 iIn++; 00301 dIn += d; } 00302 iOut++; 00303 dOut += d; } 00304 cout << vecstrGenes[ i ] << '\t' << ( dIn / iIn ) << '\t' << ( dOut / iOut ) << endl; } } 00305 00306 return 0; }