Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 static const char c_szDab[] = ".dab"; 00026 00027 struct SBackground { 00028 size_t m_iCountOne; 00029 size_t m_iCountTwo; 00030 size_t m_iSizeOne; 00031 size_t m_iSizeTwo; 00032 const vector<float>* m_pvecdOut; 00033 float m_dPrior; 00034 const CDat* m_pDat; 00035 const CDat* m_pDatWithin; 00036 vector<float> m_vecdValues; 00037 const vector<size_t>* m_pveciGenes; 00038 }; 00039 00040 static int MainSet( const gengetopt_args_info& ); 00041 static int MainBackground( const gengetopt_args_info& ); 00042 static void* BackgroundThread( void* ); 00043 static void* BackgroundThreadSingle( void* ); 00044 00045 int main( int iArgs, char** aszArgs ) { 00046 #ifdef WIN32 00047 pthread_win32_process_attach_np( ); 00048 #endif // WIN32 00049 gengetopt_args_info sArgs; 00050 int iRet; 00051 00052 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00053 cmdline_parser_print_help( ); 00054 return 1; } 00055 CMeta Meta( sArgs.verbosity_arg, sArgs.random_arg ); 00056 00057 iRet = ( sArgs.sizes_arg ? MainBackground : MainSet )( sArgs ); 00058 #ifdef WIN32 00059 pthread_win32_process_detach_np( ); 00060 #endif // WIN32 00061 return iRet; } 00062 00063 int MainSet( const gengetopt_args_info& sArgs ) { 00064 CGenome Genome; 00065 CGenes GenesQuery( Genome ); 00066 ifstream ifsm; 00067 size_t iFunction, i, j; 00068 vector<size_t> veciQuery; 00069 float d; 00070 00071 if( sArgs.genes_arg ) 00072 ifsm.open( sArgs.genes_arg ); 00073 if( !GenesQuery.Open( sArgs.genes_arg ? ifsm : cin ) ) { 00074 cerr << "Could not open: " << ( sArgs.genes_arg ? sArgs.genes_arg : "genes" ) << endl; 00075 return 1; } 00076 if( sArgs.genes_arg ) 00077 ifsm.close( ); 00078 00079 veciQuery.resize( GenesQuery.GetGenes( ) ); 00080 cout << "Function Score QueryIn QueryOut FuncIn FuncOut" << endl; 00081 for( iFunction = 0; iFunction < sArgs.inputs_num; ++iFunction ) { 00082 CGenes GenesFunction( Genome ); 00083 CDat Dat; 00084 string strDab; 00085 float dFuncIn, dFuncOut, dQueryIn, dQueryOut; 00086 size_t iFuncIn, iFuncOut, iQueryIn, iQueryOut, iOne, iTwo; 00087 vector<size_t> veciFunction; 00088 00089 if( !GenesFunction.Open( sArgs.inputs[ iFunction ] ) ) { 00090 cerr << "Could not open: " << sArgs.inputs[ iFunction ] << endl; 00091 return 1; } 00092 strDab = (string)sArgs.directory_arg + '/' + CMeta::Deextension( CMeta::Basename( 00093 sArgs.inputs[ iFunction ] ) ) + c_szDab; 00094 if( !Dat.Open( strDab.c_str( ), sArgs.memmap_flag && !sArgs.normalize_flag ) ) { 00095 cerr << "Could not open: " << strDab << endl; 00096 return 1; } 00097 if( sArgs.normalize_flag ) 00098 Dat.Normalize( CDat::ENormalizeSigmoid ); 00099 00100 for( i = 0; i < veciQuery.size( ); ++i ) 00101 veciQuery[ i ] = Dat.GetGene( GenesQuery.GetGene( i ).GetName( ) ); 00102 veciFunction.resize( GenesFunction.GetGenes( ) ); 00103 for( i = 0; i < veciFunction.size( ); ++i ) 00104 veciFunction[ i ] = Dat.GetGene( GenesFunction.GetGene( i ).GetName( ) ); 00105 00106 dFuncIn = dFuncOut = dQueryIn = dQueryOut = 0; 00107 for( iQueryIn = iFuncIn = iFuncOut = i = 0; i < veciFunction.size( ); ++i ) { 00108 if( ( iOne = veciFunction[ i ] ) == -1 ) 00109 continue; 00110 for( j = ( i + 1 ); j < veciFunction.size( ); ++j ) 00111 if( ( ( iTwo = veciFunction[ j ] ) != -1 ) && 00112 !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) ) { 00113 iFuncIn++; 00114 dFuncIn += d; } 00115 for( j = 0; j < Dat.GetGenes( ); ++j ) 00116 if( !CMeta::IsNaN( d = Dat.Get( iOne, j ) ) ) { 00117 iFuncOut++; 00118 dFuncOut += d; } 00119 for( j = 0; j < veciQuery.size( ); ++j ) 00120 if( ( ( iTwo = veciQuery[ j ] ) != -1 ) && 00121 !CMeta::IsNaN( d = Dat.Get( iOne, iTwo ) ) ) { 00122 iQueryIn++; 00123 dQueryIn += d; } } 00124 for( iQueryOut = i = 0; i < veciQuery.size( ); ++i ) { 00125 if( ( iOne = veciQuery[ i ] ) == -1 ) 00126 continue; 00127 for( j = 0; j < Dat.GetGenes( ); ++j ) 00128 if( !CMeta::IsNaN( d = Dat.Get( iOne, j ) ) ) { 00129 iQueryOut++; 00130 dQueryOut += d; } } 00131 dFuncIn /= iFuncIn; 00132 dFuncOut /= iFuncOut; 00133 dQueryIn /= iQueryIn; 00134 dQueryOut /= iQueryOut; 00135 00136 cout << CMeta::Basename( sArgs.inputs[ iFunction ] ) << '\t' << 00137 ( dQueryIn / dQueryOut / dFuncIn * dFuncOut ) << '\t' << dQueryIn << '\t' << dQueryOut << '\t' << 00138 dFuncIn << '\t' << dFuncOut << endl; } 00139 00140 return 0; } 00141 00142 int MainBackground( const gengetopt_args_info& sArgs ) { 00143 CDat Dat, DatWithin; 00144 const CDat* pDatWithin; 00145 CDataMatrix MatAves, MatStds, MatPValues; 00146 vector<size_t> veciSizes, veciGenes; 00147 size_t i, j, iIndexOne, iIndexTwo, iValue, iChunk; 00148 float d, dPrior; 00149 vector<float> vecdOut, vecdValues; 00150 vector<pthread_t> vecpthdThreads; 00151 vector<SBackground> vecsBackground; 00152 long double dTmp; 00153 00154 if( !Dat.Open( sArgs.input_arg, sArgs.memmap_flag && !sArgs.normalize_flag ) ) { 00155 cerr << "Could not open: " << ( sArgs.input_arg ? sArgs.input_arg : "stdin" ) << endl; 00156 return 1; } 00157 if( sArgs.normalize_flag ) 00158 Dat.Normalize( CDat::ENormalizeSigmoid ); 00159 00160 { 00161 CPCL PCLSizes( false ); 00162 00163 if( !PCLSizes.Open( sArgs.sizes_arg, 0 ) ) { 00164 cerr << "Could not open: " << sArgs.sizes_arg << endl; 00165 return 1; } 00166 veciSizes.resize( PCLSizes.GetGenes( ) ); 00167 for( i = 0; i < veciSizes.size( ); ++i ) 00168 veciSizes[ i ] = atoi( PCLSizes.GetGene( i ).c_str( ) ); 00169 } 00170 00171 if( sArgs.genes_arg ) { 00172 CGenome Genome; 00173 CGenes Genes( Genome ); 00174 00175 if( !Genes.Open( sArgs.genes_arg ) ) { 00176 cerr << "Could not open: " << sArgs.genes_arg << endl; 00177 return 1; } 00178 for( i = 0; i < Genes.GetGenes( ); ++i ) 00179 if( ( j = Dat.GetGene( Genes.GetGene( i ).GetName( ) ) ) != -1 ) 00180 veciGenes.push_back( j ); } 00181 else { 00182 veciGenes.resize( Dat.GetGenes( ) ); 00183 for( i = 0; i < veciGenes.size( ); ++i ) 00184 veciGenes[ i ] = i; } 00185 00186 { 00187 vector<long double> vecdTmp; 00188 vector<size_t> veciTmp; 00189 00190 vecdTmp.resize( Dat.GetGenes( ) ); 00191 fill( vecdTmp.begin( ), vecdTmp.end( ), 0 ); 00192 veciTmp.resize( Dat.GetGenes( ) ); 00193 fill( veciTmp.begin( ), veciTmp.end( ), 0 ); 00194 for( i = 0; i < Dat.GetGenes( ); ++i ) 00195 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00196 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) { 00197 veciTmp[ i ]++; 00198 veciTmp[ j ]++; 00199 vecdTmp[ i ] += d; 00200 vecdTmp[ j ] += d; } 00201 vecdOut.resize( vecdTmp.size( ) ); 00202 for( i = 0; i < vecdOut.size( ); ++i ) 00203 vecdOut[ i ] = (float)( vecdTmp[ i ] / ( veciTmp[ i ] ? veciTmp[ i ] : 1 ) ); 00204 } 00205 00206 if( sArgs.input_within_arg ) { 00207 if( !DatWithin.Open( sArgs.input_within_arg ) ) { 00208 cerr << "Could not open: " << sArgs.input_within_arg << endl; 00209 return 1; } 00210 pDatWithin = &DatWithin; } 00211 else 00212 pDatWithin = &Dat; 00213 00214 for( dTmp = 0,iChunk = i = 0; i < pDatWithin->GetGenes( ); ++i ) 00215 for( j = ( i + 1 ); j < pDatWithin->GetGenes( ); ++j ) 00216 if( !CMeta::IsNaN( d = pDatWithin->Get( i, j ) ) ) { 00217 iChunk++; 00218 dTmp += d; } 00219 dPrior = (float)( dTmp / iChunk ); 00220 00221 vecpthdThreads.resize( min( sArgs.threads_arg, sArgs.count_arg ) ); 00222 vecsBackground.resize( vecpthdThreads.size( ) ); 00223 iChunk = 1 + ( ( sArgs.count_arg - 1 ) / vecpthdThreads.size( ) ); 00224 vecdValues.resize( ( sArgs.singles_flag ? 1 : sArgs.count_arg ) * iChunk * vecpthdThreads.size( ) ); 00225 MatAves.Initialize( veciSizes.size( ), veciSizes.size( ) ); 00226 MatAves.Clear( ); 00227 MatStds.Initialize( veciSizes.size( ), veciSizes.size( ) ); 00228 MatStds.Clear( ); 00229 MatPValues.Initialize( veciSizes.size( ), veciSizes.size( ) ); 00230 for( iIndexOne = 0; iIndexOne < veciSizes.size( ); ++iIndexOne ) 00231 for( iIndexTwo = iIndexOne; iIndexTwo < ( sArgs.singles_flag ? ( iIndexOne + 1 ) : veciSizes.size( ) ); 00232 ++iIndexTwo ) { 00233 cerr << veciSizes[ iIndexOne ] << ':' << veciSizes[ iIndexTwo ] << endl; 00234 for( i = 0; i < vecpthdThreads.size( ); ++i ) { 00235 vecsBackground[ i ].m_dPrior = dPrior; 00236 vecsBackground[ i ].m_iCountOne = iChunk; 00237 vecsBackground[ i ].m_iCountTwo = sArgs.count_arg; 00238 vecsBackground[ i ].m_iSizeOne = veciSizes[ iIndexOne ]; 00239 vecsBackground[ i ].m_iSizeTwo = veciSizes[ iIndexTwo ]; 00240 vecsBackground[ i ].m_pDat = &Dat; 00241 vecsBackground[ i ].m_pDatWithin = pDatWithin; 00242 vecsBackground[ i ].m_pvecdOut = &vecdOut; 00243 vecsBackground[ i ].m_pveciGenes = &veciGenes; 00244 if( pthread_create( &vecpthdThreads[ i ], NULL, sArgs.singles_flag ? BackgroundThreadSingle: 00245 BackgroundThread, &vecsBackground[ i ] ) ) { 00246 cerr << "Couldn't create thread for group: " << i << endl; 00247 return 1; } } 00248 for( iValue = i = 0; i < vecpthdThreads.size( ); ++i ) { 00249 pthread_join( vecpthdThreads[ i ], NULL ); 00250 for( j = 0; j < vecsBackground[ i ].m_vecdValues.size( ); ++j ) 00251 vecdValues[ iValue++ ] = vecsBackground[ i ].m_vecdValues[ j ]; } 00252 MatAves.Set( iIndexOne, iIndexTwo, (float)CStatistics::Average( vecdValues ) ); 00253 if( sArgs.invgauss_flag ) { 00254 for( i = 0; i < vecdValues.size( ); ++i ) 00255 MatStds.Get( iIndexOne, iIndexTwo ) += 1 / vecdValues[ i ]; 00256 MatStds.Set( iIndexOne, iIndexTwo, vecdValues.size( ) / ( MatStds.Get( iIndexOne, iIndexTwo ) - 00257 ( vecdValues.size( ) / MatAves.Get( iIndexOne, iIndexTwo ) ) ) ); } 00258 else 00259 MatStds.Set( iIndexOne, iIndexTwo, (float)sqrt( CStatistics::Variance( vecdValues, 00260 MatAves.Get( iIndexOne, iIndexTwo ) ) ) ); 00261 MatPValues.Set( iIndexOne, iIndexTwo, (float)CStatistics::Percentile( vecdValues.begin( ), 00262 vecdValues.end( ), sArgs.percentile_arg ) ); } 00263 00264 if( sArgs.singles_flag ) 00265 for( i = 0; i < veciSizes.size( ); ++i ) 00266 cout << veciSizes[ i ] << '\t' << MatAves.Get( i, i ) << '|' << MatStds.Get( i, i ) << '|' << 00267 MatPValues.Get( i, i ) << endl; 00268 else { 00269 for( i = 0; i < veciSizes.size( ); ++i ) 00270 cout << '\t' << veciSizes[ i ]; 00271 cout << endl; 00272 for( i = 0; i < MatAves.GetRows( ); ++i ) { 00273 cout << veciSizes[ i ]; 00274 for( j = 0; j < i; ++j ) 00275 cout << '\t'; 00276 for( ; j < MatAves.GetColumns( ); ++j ) { 00277 cout << '\t' << MatAves.Get( i, j ) << '|' << MatStds.Get( i, j ) << '|' << 00278 MatPValues.Get( i, j ); } 00279 cout << endl; } } 00280 00281 return 0; } 00282 00283 void* BackgroundThread( void* pData ) { 00284 size_t i, j, iCountOne, iCountTwo, iEdgesOne, iEdgesTwo, iValue; 00285 SBackground* psData; 00286 vector<size_t> veciOne, veciTwo; 00287 long double dBackgroundOne, dBackgroundTwo, dWithinOne, dWithinTwo, dWithin, dBackground, dBetween; 00288 vector<long double> vecdBetweenOne, vecdBetweenTwo; 00289 float d; 00290 00291 psData = (SBackground*)pData; 00292 psData->m_vecdValues.resize( psData->m_iCountOne * psData->m_iCountTwo ); 00293 veciOne.resize( psData->m_iSizeOne ); 00294 veciTwo.resize( psData->m_iSizeTwo ); 00295 iEdgesOne = veciOne.size( ) * ( veciOne.size( ) + 1 ) / 2; 00296 iEdgesTwo = veciTwo.size( ) * ( veciTwo.size( ) + 1 ) / 2; 00297 vecdBetweenOne.resize( veciOne.size( ) ); 00298 vecdBetweenTwo.resize( veciTwo.size( ) ); 00299 for( iValue = iCountOne = 0; iCountOne < psData->m_iCountOne; ++iCountOne ) { 00300 const vector<size_t>& veciGenes = *psData->m_pveciGenes; 00301 const vector<float>& vecdOut = *psData->m_pvecdOut; 00302 set<size_t> setiOne; 00303 00304 while( setiOne.size( ) < veciOne.size( ) ) 00305 setiOne.insert( veciGenes[ rand( ) % veciGenes.size( ) ] ); 00306 copy( setiOne.begin( ), setiOne.end( ), veciOne.begin( ) ); 00307 00308 dBackgroundOne = dWithinOne = 0; 00309 for( i = 0; i < veciOne.size( ); ++i ) { 00310 dBackgroundOne += vecdOut[ veciOne[ i ] ]; 00311 dWithinOne += psData->m_dPrior; 00312 for( j = ( i + 1 ); j < veciOne.size( ); ++j ) 00313 dWithinOne += psData->m_pDatWithin->Get( veciOne[ i ], veciOne[ j ] ); } 00314 00315 for( iCountTwo = 0; iCountTwo < psData->m_iCountTwo; ++iCountTwo ) { 00316 set<size_t> setiTwo; 00317 00318 while( setiTwo.size( ) < veciTwo.size( ) ) 00319 setiTwo.insert( veciGenes[ rand( ) % veciGenes.size( ) ] ); 00320 copy( setiTwo.begin( ), setiTwo.end( ), veciTwo.begin( ) ); 00321 00322 dBackgroundTwo = dWithinTwo = 0; 00323 for( i = 0; i < veciTwo.size( ); ++i ) { 00324 dBackgroundTwo += (*psData->m_pvecdOut)[ veciTwo[ i ] ]; 00325 dWithinTwo += psData->m_dPrior; 00326 for( j = ( i + 1 ); j < veciTwo.size( ); ++j ) 00327 dWithinTwo += psData->m_pDatWithin->Get( veciTwo[ i ], veciTwo[ j ] ); } 00328 00329 // Alternative: calculate an average over all edges (weighted) rather than between sets (unweighted) 00330 // dWithin = ( dWithinOne + dWithinTwo ) / ( iEdgesOne + iEdgesTwo ); 00331 dWithin = ( ( dWithinOne / iEdgesOne ) + ( dWithinTwo / iEdgesTwo ) ) / 2; 00332 dBackground = ( dBackgroundOne + dBackgroundTwo ) / ( veciOne.size( ) + veciTwo.size( ) ); 00333 // Alternative: calculate between scores by edge rather than node 00334 /* 00335 dBetween = 0; 00336 for( i = 0; i < veciOne.size( ); ++i ) 00337 for( j = 0; j < veciTwo.size( ); ++j ) 00338 dBetween += ( veciOne[ i ] == veciTwo[ j ] ) ? 1 : 00339 psData->m_pDat->Get( veciOne[ i ], veciTwo[ j ] ); 00340 dBetween /= veciOne.size( ) * veciTwo.size( ); 00341 */ 00342 fill( vecdBetweenOne.begin( ), vecdBetweenOne.end( ), 0 ); 00343 fill( vecdBetweenTwo.begin( ), vecdBetweenTwo.end( ), 0 ); 00344 for( i = 0; i < veciOne.size( ); ++i ) 00345 for( j = 0; j < veciTwo.size( ); ++j ) { 00346 d = ( veciOne[ i ] == veciTwo[ j ] ) ? 1 : 00347 psData->m_pDat->Get( veciOne[ i ], veciTwo[ j ] ); 00348 vecdBetweenOne[ i ] += d; 00349 vecdBetweenTwo[ j ] += d; } 00350 for( dBetween = 0,i = 0; i < vecdBetweenOne.size( ); ++i ) 00351 dBetween += vecdBetweenOne[ i ] / veciTwo.size( ); 00352 for( i = 0; i < vecdBetweenTwo.size( ); ++i ) 00353 dBetween += vecdBetweenTwo[ i ] / veciOne.size( ); 00354 dBetween /= veciOne.size( ) + veciTwo.size( ); 00355 00356 psData->m_vecdValues[ iValue++ ] = (float)( psData->m_dPrior * dBetween / dWithin / 00357 dBackground ); } } 00358 00359 return NULL; } 00360 00361 void* BackgroundThreadSingle( void* pData ) { 00362 size_t i, j, iCount, iEdges; 00363 SBackground* psData; 00364 vector<size_t> veciOne; 00365 long double dWithin, dBackground; 00366 float d; 00367 00368 psData = (SBackground*)pData; 00369 psData->m_vecdValues.resize( psData->m_iCountOne ); 00370 veciOne.resize( psData->m_iSizeOne ); 00371 iEdges = veciOne.size( ) * ( veciOne.size( ) + 1 ) / 2; 00372 for( iCount = 0; iCount < psData->m_iCountOne; ++iCount ) { 00373 const vector<size_t>& veciGenes = *psData->m_pveciGenes; 00374 const vector<float>& vecdOut = *psData->m_pvecdOut; 00375 set<size_t> setiOne; 00376 00377 while( setiOne.size( ) < veciOne.size( ) ) 00378 setiOne.insert( veciGenes[ rand( ) % veciGenes.size( ) ] ); 00379 copy( setiOne.begin( ), setiOne.end( ), veciOne.begin( ) ); 00380 00381 dBackground = dWithin = 0; 00382 for( i = 0; i < veciOne.size( ); ++i ) { 00383 dBackground += vecdOut[ veciOne[ i ] ]; 00384 dWithin += psData->m_dPrior; 00385 for( j = ( i + 1 ); j < veciOne.size( ); ++j ) 00386 if( !CMeta::IsNaN( d = psData->m_pDat->Get( veciOne[ i ], veciOne[ j ] ) ) ) 00387 dWithin += d; } 00388 dBackground /= veciOne.size( ); 00389 dWithin /= iEdges; 00390 00391 psData->m_vecdValues[ iCount ] = (float)( dWithin / dBackground ); } 00392 00393 return NULL; }