Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 struct SProcessor { 00026 const CDat& m_Dat; 00027 const vector<vector<size_t> >& m_vecveciSets1; 00028 const vector<vector<size_t> >& m_vecveciSets2; 00029 long double* m_adResults; 00030 size_t m_iThreads; 00031 00032 SProcessor( const CDat& Dat, const vector<vector<size_t> >& vecveciSets1, 00033 const vector<vector<size_t> >& vecveciSets2, long double* adResults, size_t iThreads ) : m_Dat(Dat), 00034 m_vecveciSets1(vecveciSets1), m_vecveciSets2(vecveciSets2), m_adResults(adResults), 00035 m_iThreads(iThreads) { } 00036 }; 00037 00038 typedef int (TFnProcessor)( const SProcessor& ); 00039 00040 static const char c_szDab[] = ".dab"; 00041 00042 struct SWithin { 00043 const CDat* m_pDat; 00044 const vector<vector<size_t> >* m_pvecveciSets; 00045 long double* m_adResults; 00046 long double m_dPrior; 00047 size_t m_iStart; 00048 size_t m_iLength; 00049 }; 00050 00051 struct SBetween { 00052 const CDat* m_pDat; 00053 const vector<vector<size_t> >* m_pvecveciSets1; 00054 const vector<vector<size_t> >* m_pvecveciSets2; 00055 long double* m_adResults; 00056 size_t m_iStart; 00057 size_t m_iLength; 00058 }; 00059 00060 struct SBackground { 00061 const CDat* m_pDat; 00062 const vector<vector<size_t> >* m_pvecveciSets; 00063 long double* m_adResults; 00064 size_t m_iStart; 00065 size_t m_iLength; 00066 }; 00067 00068 struct SSummary { 00069 size_t m_iCount; 00070 float m_dSum; 00071 float m_dSumSquares; 00072 00073 SSummary( ) { 00074 00075 Clear( ); } 00076 00077 void Add( float d ) { 00078 00079 m_iCount++; 00080 m_dSum += d; 00081 m_dSumSquares += d * d; } 00082 00083 void Add( const SSummary& sSummary ) { 00084 00085 m_iCount += sSummary.m_iCount; 00086 m_dSum += sSummary.m_dSum; 00087 m_dSumSquares += sSummary.m_dSumSquares; } 00088 00089 void Multiply( float d ) { 00090 00091 m_iCount = (size_t)( m_iCount * d ); 00092 m_dSum *= d; 00093 m_dSumSquares *= d; } 00094 00095 void Clear( ) { 00096 00097 m_iCount = 0; 00098 m_dSum = m_dSumSquares = 0; } 00099 00100 float GetAverage( ) const { 00101 00102 return ( m_iCount ? ( m_dSum / m_iCount ) : CMeta::GetNaN( ) ); } 00103 00104 float GetStdev( ) const { 00105 00106 return ( m_iCount ? sqrt( ( m_dSumSquares / ( max( (size_t)2, m_iCount ) - 1 ) ) - pow( GetAverage( ), 2 ) ) : CMeta::GetNaN( ) ); } 00107 }; 00108 00109 struct SDatum { 00110 SSummary m_sHubbiness; 00111 SSummary m_sCliquiness; 00112 vector<pair<size_t,float> > m_vecprSpecific; 00113 00114 void Clear( ) { 00115 00116 m_sHubbiness.Clear( ); 00117 m_sCliquiness.Clear( ); 00118 m_vecprSpecific.clear( ); } 00119 }; 00120 00121 struct STerm { 00122 string m_strFile; 00123 size_t m_iGenes; 00124 size_t m_iTotal; 00125 const CDat* m_pDat; 00126 const CPCL* m_pPCLWeights; 00127 string m_strClip; 00128 const vector<size_t>* m_pveciDat2PCL; 00129 const vector<SSummary>* m_pvecsHubs; 00130 pthread_mutex_t* m_pmutx; 00131 }; 00132 00133 static size_t hubs( const CDat&, vector<SSummary>&, const CPCL&, const vector<size_t>& ); 00134 static size_t cliques( const CDat&, size_t, const vector<SSummary>&, size_t, SDatum&, const CGenes*, const CPCL&, const vector<size_t>& ); 00135 static void enset( const CDat&, const vector<vector<string> >&, vector<vector<size_t> >& ); 00136 static int sets( const char*, const vector<string>&, vector<vector<string> >& ); 00137 static int process( const char*, bool, bool, const vector<vector<string> >&, const vector<vector<string> >&, 00138 long double*, TFnProcessor*, size_t ); 00139 static TFnProcessor background; 00140 static TFnProcessor within; 00141 static TFnProcessor between; 00142 static void* between_thread( void* ); 00143 static void* within_thread( void* ); 00144 static void* background_thread( void* ); 00145 static void* term_thread( void* ); 00146 00147 int main( int iArgs, char** aszArgs ) { 00148 #ifdef WIN32 00149 pthread_win32_process_attach_np( ); 00150 #endif // WIN32 00151 gengetopt_args_info sArgs; 00152 CDat Dat; 00153 size_t i, j, iGenes, iTotal, iThread; 00154 vector<SSummary> vecsHubs; 00155 SDatum sDatum; 00156 CPCL PCLWeights; 00157 vector<size_t> veciDat2PCL; 00158 vector<STerm> vecsTerms; 00159 vector<pthread_t> vecpthdThreads; 00160 pthread_mutex_t mutxTerms; 00161 00162 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00163 cmdline_parser_print_help( ); 00164 return 1; } 00165 CMeta Meta( sArgs.verbosity_arg ); 00166 00167 if( sArgs.output_arg ) { 00168 CDataMatrix MatBackgrounds; 00169 int iRet; 00170 vector<string> vecstrGenes, vecstrContexts; 00171 vector<vector<string> > vecvecstrSets1, vecvecstrSets2; 00172 TFnProcessor* pfnProcessor; 00173 long double* adResults; 00174 00175 if( !( sArgs.contexts_arg && sArgs.genelist_arg ) ) { 00176 cmdline_parser_print_help( ); 00177 return 1; } 00178 { 00179 static const size_t c_iContext = 2; 00180 CPCL PCLContexts( false ); 00181 00182 if( !PCLContexts.Open( sArgs.contexts_arg, c_iContext ) ) { 00183 cerr << "Could not open: " << sArgs.contexts_arg << endl; 00184 return 1; } 00185 vecstrContexts.resize( PCLContexts.GetGenes( ) ); 00186 for( i = 0; i < vecstrContexts.size( ); ++i ) 00187 vecstrContexts[ i ] = PCLContexts.GetFeature( i, c_iContext ); 00188 } 00189 { 00190 CPCL PCLGenes( false ); 00191 00192 if( !PCLGenes.Open( sArgs.genelist_arg, 1 ) ) { 00193 cerr << "Could not open: " << sArgs.genelist_arg << endl; 00194 return 1; } 00195 vecstrGenes.resize( PCLGenes.GetGenes( ) ); 00196 for( i = 0; i < vecstrGenes.size( ); ++i ) 00197 vecstrGenes[ i ] = PCLGenes.GetFeature( i, 1 ); 00198 } 00199 if( sArgs.genesets_arg ) { 00200 if( iRet = sets( sArgs.genesets_arg, vecstrGenes, vecvecstrSets1 ) ) 00201 return iRet; 00202 if( sArgs.between_arg ) { 00203 if( iRet = sets( sArgs.between_arg, vecstrGenes, vecvecstrSets2 ) ) 00204 return iRet; 00205 pfnProcessor = between; } 00206 else 00207 pfnProcessor = within; } 00208 else { 00209 vecvecstrSets1.resize( vecstrGenes.size( ) ); 00210 for( i = 0; i < vecvecstrSets1.size( ); ++i ) { 00211 vecvecstrSets1[ i ].resize( 1 ); 00212 vecvecstrSets1[ i ][ 0 ] = vecstrGenes[ i ]; } 00213 pfnProcessor = background; } 00214 00215 MatBackgrounds.Initialize( vecstrContexts.size( ) + 1, vecvecstrSets1.size( ) * max( (size_t)1, 00216 vecvecstrSets2.size( ) ) ); 00217 adResults = new long double[ MatBackgrounds.GetColumns( ) ]; 00218 if( iRet = process( sArgs.input_arg, !!sArgs.memmap_flag, !!sArgs.normalize_flag, vecvecstrSets1, 00219 vecvecstrSets2, adResults, pfnProcessor, sArgs.threads_arg ) ) 00220 return iRet; 00221 for( i = 0; i < MatBackgrounds.GetColumns( ); ++i ) 00222 MatBackgrounds.Set( 0, i, (float)adResults[ i ] ); 00223 for( i = 1; i < MatBackgrounds.GetRows( ); ++i ) { 00224 if( iRet = process( ( (string)sArgs.directory_arg + '/' + 00225 CMeta::Filename( vecstrContexts[ i - 1 ] ) + ".dab" ).c_str( ), !!sArgs.memmap_flag, 00226 !!sArgs.normalize_flag, vecvecstrSets1, vecvecstrSets2, adResults, pfnProcessor, 00227 sArgs.threads_arg ) ) 00228 return iRet; 00229 for( j = 0; j < MatBackgrounds.GetColumns( ); ++j ) 00230 MatBackgrounds.Set( i, j, (float)adResults[ j ] ); } 00231 00232 MatBackgrounds.Save( sArgs.output_arg, true ); 00233 return 0; } 00234 00235 if( sArgs.input_arg ) { 00236 if( !Dat.Open( sArgs.input_arg, !( sArgs.memmap_flag || sArgs.normalize_flag || sArgs.genin_arg || sArgs.genex_arg ) ) ) { 00237 cerr << "Could not open: " << sArgs.input_arg << endl; 00238 return 1; } } 00239 else if( !Dat.Open( cin, CDat::EFormatText ) ) { 00240 cerr << "Could not open input" << endl; 00241 return 1; } 00242 if( sArgs.genin_arg && !Dat.FilterGenes( sArgs.genin_arg, CDat::EFilterInclude ) ) { 00243 cerr << "Could not open: " << sArgs.genin_arg << endl; 00244 return 1; } 00245 if( sArgs.genex_arg && !Dat.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) { 00246 cerr << "Could not open: " << sArgs.genex_arg << endl; 00247 return 1; } 00248 if( sArgs.normalize_flag ) 00249 Dat.Normalize( CDat::ENormalizeSigmoid ); 00250 00251 if( sArgs.weights_arg ) { 00252 if( !PCLWeights.Open( sArgs.weights_arg, 0 ) ) { 00253 cerr << "Could not open weights: " << sArgs.weights_arg << endl; 00254 return 1; } 00255 veciDat2PCL.resize( Dat.GetGenes( ) ); 00256 for( i = 0; i < veciDat2PCL.size( ); ++i ) 00257 veciDat2PCL[i] = PCLWeights.GetGene( Dat.GetGene( i ) ); } 00258 00259 iTotal = hubs( Dat, vecsHubs, PCLWeights, veciDat2PCL ); 00260 if( sArgs.genes_arg == -1 ) { 00261 cout << "Function"; 00262 for( i = 0; i < Dat.GetGenes( ); ++i ) 00263 cout << '\t' << Dat.GetGene( i ); } 00264 else { 00265 cliques( Dat, iTotal, vecsHubs, 0, sDatum, NULL, PCLWeights, veciDat2PCL ); 00266 cout << "name size hubbiness hubbiness std. hubbiness n cliquiness cliquiness std. cliquiness n" << endl; 00267 cout << "total " << iTotal << '\t' << sDatum.m_sHubbiness.GetAverage( ) << '\t' << 00268 sDatum.m_sHubbiness.GetStdev( ) << '\t' << sDatum.m_sHubbiness.m_iCount << '\t' << sDatum.m_sCliquiness.GetAverage( ) << '\t' << 00269 sDatum.m_sCliquiness.GetStdev( ) << '\t' << sDatum.m_sCliquiness.m_iCount; } 00270 cout << endl; 00271 00272 pthread_mutex_init( &mutxTerms, NULL ); 00273 vecsTerms.resize( sArgs.threads_arg ); 00274 vecpthdThreads.resize( sArgs.threads_arg ); 00275 for( iGenes = 0; iGenes < sArgs.inputs_num; iGenes += sArgs.threads_arg ) { 00276 for( iThread = 0; ( iThread < (size_t)sArgs.threads_arg ) && ( ( iGenes + iThread ) < sArgs.inputs_num ); ++iThread ) { 00277 if( !( ( iGenes + iThread ) % 25 ) ) 00278 cerr << ( iGenes + iThread ) << '/' << sArgs.inputs_num << endl; 00279 vecsTerms[iThread].m_strFile = sArgs.inputs[iGenes + iThread]; 00280 vecsTerms[iThread].m_iGenes = sArgs.genes_arg; 00281 vecsTerms[iThread].m_pDat = &Dat; 00282 vecsTerms[iThread].m_iTotal = iTotal; 00283 vecsTerms[iThread].m_pPCLWeights = &PCLWeights; 00284 vecsTerms[iThread].m_strClip = sArgs.clip_arg ? sArgs.clip_arg : ""; 00285 vecsTerms[iThread].m_pveciDat2PCL = &veciDat2PCL; 00286 vecsTerms[iThread].m_pvecsHubs = &vecsHubs; 00287 vecsTerms[iThread].m_pmutx = &mutxTerms; 00288 if( pthread_create( &vecpthdThreads[iThread], NULL, term_thread, &vecsTerms[iThread] ) ) { 00289 cerr << "Couldn't create thread for term: " << sArgs.input_arg[iGenes + iThread] << endl; 00290 return 1; } } 00291 for( i = 0; ( i < vecpthdThreads.size( ) ) && ( ( i + iGenes ) < sArgs.inputs_num ); ++i ) 00292 pthread_join( vecpthdThreads[i], NULL ); } 00293 pthread_mutex_destroy( &mutxTerms ); 00294 00295 #ifdef WIN32 00296 pthread_win32_process_detach_np( ); 00297 #endif // WIN32 00298 return 0; } 00299 00300 static inline float get_weight( size_t iGene, const vector<size_t>& veciDat2PCL, const CPCL& PCLWeights ) { 00301 size_t iPCL; 00302 00303 if( iGene >= veciDat2PCL.size( ) ) 00304 return 1; 00305 00306 iPCL = veciDat2PCL[iGene]; 00307 return ( ( ( iPCL == -1 ) || ( iPCL >= PCLWeights.GetGenes( ) ) ) ? 1 : PCLWeights.Get( iPCL, 0 ) ); } 00308 00309 size_t hubs( const CDat& Dat, vector<SSummary>& vecsHubs, const CPCL& PCLWeights, const vector<size_t>& veciDat2PCL ) { 00310 size_t i, j, iRet; 00311 float d; 00312 00313 vecsHubs.resize( Dat.GetGenes( ) ); 00314 for( i = 0; i < Dat.GetGenes( ); ++i ) 00315 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) { 00316 if( CMeta::IsNaN( d = Dat.Get( i, j ) ) ) 00317 continue; 00318 d *= get_weight( i, veciDat2PCL, PCLWeights ) * get_weight( j, veciDat2PCL, PCLWeights ); 00319 vecsHubs[ i ].Add( d ); 00320 vecsHubs[ j ].Add( d ); } 00321 for( iRet = i = 0; i < vecsHubs.size( ); ++i ) 00322 if( vecsHubs[ i ].m_iCount ) 00323 iRet++; 00324 00325 return iRet; } 00326 00327 struct SSorter { 00328 00329 bool operator()( const pair<size_t,float>& prOne, const pair<size_t,float>& prTwo ) const { 00330 00331 return ( prOne.second > prTwo.second ); } 00332 }; 00333 00334 size_t cliques( const CDat& Dat, size_t iGenes, const vector<SSummary>& vecsHubs, size_t iSort, SDatum& sDatum, 00335 const CGenes* pGenes, const CPCL& PCLWeights, const vector<size_t>& veciDat2PCL ) { 00336 size_t i, j, iRet; 00337 float d; 00338 vector<SSummary> vecsClique; 00339 vector<bool> vecfOutside; 00340 00341 vecfOutside.resize( Dat.GetGenes( ) ); 00342 if( pGenes ) { 00343 for( i = 0; i < vecfOutside.size( ); ++i ) 00344 if( !pGenes->IsGene( Dat.GetGene( i ) ) ) 00345 vecfOutside[ i ] = true; } 00346 vecsClique.resize( Dat.GetGenes( ) ); 00347 sDatum.Clear( ); 00348 for( i = 0; i < Dat.GetGenes( ); ++i ) 00349 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) { 00350 if( CMeta::IsNaN( d = Dat.Get( i, j ) ) ) 00351 continue; 00352 d *= get_weight( i, veciDat2PCL, PCLWeights ) * get_weight( j, veciDat2PCL, PCLWeights ); 00353 // Added 9/4/13 to fix Arjun Issue #17, ignore precomputed hubs -- avoids double counting with summary output 00354 if (!vecfOutside[ i ] || !vecfOutside[ j ] ) {//if either is not outside, edge is incident to context 00355 sDatum.m_sHubbiness.Add( d ); 00356 } 00357 if( !vecfOutside[ i ] ) 00358 vecsClique[ j ].Add( d ); 00359 if( !vecfOutside[ j ] ) 00360 vecsClique[ i ].Add( d ); } 00361 for( iRet = i = 0; i < vecsClique.size( ); ++i ) 00362 if( !vecfOutside[ i ] ) { 00363 if( vecsClique[ i ].m_iCount ) 00364 iRet++; 00365 sDatum.m_sCliquiness.Add( vecsClique[ i ] ); } 00366 sDatum.m_sCliquiness.Multiply( 0.5 ); 00367 00368 /* Removed 9/4/13 to fix Arjun Issue #17 00369 for( i = 0; i < Dat.GetGenes( ); ++i ) 00370 if( !vecfOutside[ i ] ) 00371 sDatum.m_sHubbiness.Add( vecsHubs[ i ] ); 00372 */ 00373 00374 if( iSort ) { 00375 sDatum.m_vecprSpecific.resize( Dat.GetGenes( ) ); 00376 for( i = 0; i < sDatum.m_vecprSpecific.size( ); ++i ) { 00377 sDatum.m_vecprSpecific[ i ].first = i; 00378 //divide each genes cliquiness (within) by precomputed hubbiness (overall average) to calculate group membership 00379 sDatum.m_vecprSpecific[ i ].second = vecsClique[ i ].GetAverage( ) / vecsHubs[ i ].GetAverage( ); } 00380 if( iSort != -1 ) 00381 sort( sDatum.m_vecprSpecific.begin( ), sDatum.m_vecprSpecific.end( ), SSorter( ) ); } 00382 00383 return iRet; } 00384 00385 int process( const char* szFile, bool fMemmap, bool fNormalize, const vector<vector<string> >& vecvecstrSets1, 00386 const vector<vector<string> >& vecvecstrSets2, long double* adResults, TFnProcessor* pfnProcessor, 00387 size_t iThreads ) { 00388 CDat Dat; 00389 vector<vector<size_t> > vecveciSets1, vecveciSets2; 00390 00391 if( !Dat.Open( szFile, fMemmap && !fNormalize ) ) { 00392 cerr << "Could not open: " << szFile << endl; 00393 return 1; } 00394 cerr << "Calculating for: " << szFile << endl; 00395 if( fNormalize ) 00396 Dat.Normalize( CDat::ENormalizeSigmoid ); 00397 00398 enset( Dat, vecvecstrSets1, vecveciSets1 ); 00399 enset( Dat, vecvecstrSets2, vecveciSets2 ); 00400 return pfnProcessor( SProcessor( Dat, vecveciSets1, vecveciSets2, adResults, iThreads ) ); } 00401 00402 int background( const SProcessor& sProcessor ) { 00403 const CDat& Dat = sProcessor.m_Dat; 00404 const vector<vector<size_t> >& vecveciSets = sProcessor.m_vecveciSets1; 00405 long double* adResults = sProcessor.m_adResults; 00406 vector<pthread_t> vecpthdThreads; 00407 vector<SBackground> vecsBackground; 00408 size_t i, iChunk; 00409 00410 vecpthdThreads.resize( min( sProcessor.m_iThreads, vecveciSets.size( ) ) ); 00411 vecsBackground.resize( vecpthdThreads.size( ) ); 00412 iChunk = 1 + ( ( vecveciSets.size( ) - 1 ) / vecpthdThreads.size( ) ); 00413 for( i = 0; i < vecpthdThreads.size( ); ++i ) { 00414 vecsBackground[ i ].m_adResults = adResults; 00415 vecsBackground[ i ].m_iLength = iChunk; 00416 vecsBackground[ i ].m_iStart = i * iChunk; 00417 vecsBackground[ i ].m_pDat = &Dat; 00418 vecsBackground[ i ].m_pvecveciSets = &vecveciSets; 00419 if( pthread_create( &vecpthdThreads[ i ], NULL, background_thread, &vecsBackground[ i ] ) ) { 00420 cerr << "Couldn't create thread for set group: " << i << endl; 00421 return 1; } } 00422 for( i = 0; i < vecpthdThreads.size( ); ++i ) 00423 pthread_join( vecpthdThreads[ i ], NULL ); 00424 00425 return 0; } 00426 00427 void* background_thread( void* pData ) { 00428 SBackground* psData; 00429 vector<float> vecdBackgrounds, vecdBackground; 00430 size_t i, j, k; 00431 float d; 00432 00433 psData = (SBackground*)pData; 00434 for( i = 0; ( i < psData->m_iLength ) && ( ( psData->m_iStart + i ) < psData->m_pvecveciSets->size( ) ); 00435 ++i ) { 00436 size_t iSet = psData->m_iStart + i; 00437 const vector<size_t>& veciSet = psData->m_pvecveciSets->at( iSet ); 00438 00439 vecdBackgrounds.resize( veciSet.size( ) ); 00440 for( j = 0; j < veciSet.size( ); ++j ) { 00441 size_t iOne = veciSet[ j ]; 00442 00443 vecdBackground.clear( ); 00444 vecdBackground.reserve( psData->m_pDat->GetGenes( ) ); 00445 for( k = 0; k < psData->m_pDat->GetGenes( ); ++k ) 00446 if( !CMeta::IsNaN( d = psData->m_pDat->Get( iOne, k ) ) ) 00447 vecdBackground.push_back( d ); 00448 CStatistics::Winsorize( vecdBackground, ( vecdBackground.size( ) / 10 ) + 1 ); 00449 vecdBackgrounds[ j ] = (float)CStatistics::Average( vecdBackground ); } 00450 // vecdBackgrounds[ j ] = (float)CStatistics::Median( vecdBackground ); } 00451 // psData->m_adResults[ iSet ] = CStatistics::Median( vecdBackgrounds ); } 00452 CStatistics::Winsorize( vecdBackgrounds, ( vecdBackgrounds.size( ) / 10 ) + 1 ); 00453 psData->m_adResults[ iSet ] = CStatistics::Average( vecdBackgrounds ); } 00454 00455 return NULL; } 00456 00457 int within( const SProcessor& sProcessor ) { 00458 const CDat& Dat = sProcessor.m_Dat; 00459 const vector<vector<size_t> >& vecveciSets = sProcessor.m_vecveciSets1; 00460 long double* adResults = sProcessor.m_adResults; 00461 size_t i, j, iCount, iChunk; 00462 float d; 00463 long double dPrior; 00464 vector<pthread_t> vecpthdThreads; 00465 vector<SWithin> vecsWithin; 00466 00467 for( dPrior = 0,iCount = i = 0; i < Dat.GetGenes( ); ++i ) 00468 for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) 00469 if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) { 00470 iCount++; 00471 dPrior += d; } 00472 dPrior /= iCount ? iCount : 1; 00473 00474 vecpthdThreads.resize( min( sProcessor.m_iThreads, vecveciSets.size( ) ) ); 00475 vecsWithin.resize( vecpthdThreads.size( ) ); 00476 iChunk = 1 + ( ( vecveciSets.size( ) - 1 ) / vecpthdThreads.size( ) ); 00477 for( i = 0; i < vecpthdThreads.size( ); ++i ) { 00478 vecsWithin[ i ].m_dPrior = dPrior; 00479 vecsWithin[ i ].m_pDat = &Dat; 00480 vecsWithin[ i ].m_adResults = adResults; 00481 vecsWithin[ i ].m_pvecveciSets = &vecveciSets; 00482 vecsWithin[ i ].m_iStart = i * iChunk; 00483 vecsWithin[ i ].m_iLength = iChunk; 00484 if( pthread_create( &vecpthdThreads[ i ], NULL, within_thread, &vecsWithin[ i ] ) ) { 00485 cerr << "Couldn't create thread for set group: " << i << endl; 00486 return 1; } } 00487 for( i = 0; i < vecpthdThreads.size( ); ++i ) 00488 pthread_join( vecpthdThreads[ i ], NULL ); 00489 00490 return 0; } 00491 00492 void* within_thread( void* pData ) { 00493 SWithin* psData; 00494 size_t i, j, k; 00495 float d; 00496 vector<float> vecdWithin; 00497 00498 psData = (SWithin*)pData; 00499 for( i = 0; ( i < psData->m_iLength ) && ( ( psData->m_iStart + i ) < psData->m_pvecveciSets->size( ) ); 00500 ++i ) { 00501 size_t iSet = psData->m_iStart + i; 00502 const vector<size_t>& veciSet = psData->m_pvecveciSets->at( iSet ); 00503 00504 vecdWithin.clear( ); 00505 vecdWithin.reserve( veciSet.size( ) * ( veciSet.size( ) + 1 ) / 2 ); 00506 for( j = 0; j < veciSet.size( ); ++j ) { 00507 vecdWithin.push_back( (float)psData->m_dPrior ); 00508 for( k = ( j + 1 ); k < veciSet.size( ); ++k ) 00509 if( !CMeta::IsNaN( d = psData->m_pDat->Get( veciSet[ j ], veciSet[ k ] ) ) ) 00510 vecdWithin.push_back( d ); } 00511 CStatistics::Winsorize( vecdWithin, ( vecdWithin.size( ) / 10 ) + 1 ); 00512 psData->m_adResults[ iSet ] = CStatistics::Average( vecdWithin ); } 00513 // psData->m_adResults[ iSet ] = CStatistics::Median( vecdWithin ); } 00514 00515 return NULL; } 00516 00517 int between( const SProcessor& sProcessor ) { 00518 const CDat& Dat = sProcessor.m_Dat; 00519 const vector<vector<size_t> >& vecveciSets1 = sProcessor.m_vecveciSets1; 00520 const vector<vector<size_t> >& vecveciSets2 = sProcessor.m_vecveciSets2; 00521 long double* adResults = sProcessor.m_adResults; 00522 vector<pthread_t> vecpthdThreads; 00523 vector<SBetween> vecsBetween; 00524 size_t i, iChunk; 00525 00526 vecpthdThreads.resize( min( sProcessor.m_iThreads, vecveciSets1.size( ) ) ); 00527 vecsBetween.resize( vecpthdThreads.size( ) ); 00528 iChunk = 1 + ( ( vecveciSets1.size( ) - 1 ) / vecpthdThreads.size( ) ); 00529 for( i = 0; i < vecpthdThreads.size( ); ++i ) { 00530 vecsBetween[ i ].m_adResults = adResults; 00531 vecsBetween[ i ].m_iLength = iChunk; 00532 vecsBetween[ i ].m_iStart = i * iChunk; 00533 vecsBetween[ i ].m_pDat = &Dat; 00534 vecsBetween[ i ].m_pvecveciSets1 = &vecveciSets1; 00535 vecsBetween[ i ].m_pvecveciSets2 = &vecveciSets2; 00536 if( pthread_create( &vecpthdThreads[ i ], NULL, between_thread, &vecsBetween[ i ] ) ) { 00537 cerr << "Couldn't create thread for set group: " << i << endl; 00538 return 1; } } 00539 for( i = 0; i < vecpthdThreads.size( ); ++i ) 00540 pthread_join( vecpthdThreads[ i ], NULL ); 00541 00542 return 0; } 00543 00544 void* between_thread( void* pData ) { 00545 size_t i, iSetOne, iSetTwo, iGeneOne, iGeneTwo, iOne, iTwo; 00546 float d; 00547 SBetween* psData; 00548 vector<long double> vecdBetween; 00549 00550 psData = (SBetween*)pData; 00551 for( iSetOne = 0; ( iSetOne < psData->m_iLength ) && 00552 ( ( psData->m_iStart + iSetOne ) < psData->m_pvecveciSets1->size( ) ); ++iSetOne ) { 00553 const vector<size_t>& veciOne = (*psData->m_pvecveciSets1)[ psData->m_iStart + iSetOne ]; 00554 00555 cerr << iSetOne << '/' << psData->m_iLength << endl; 00556 for( iSetTwo = 0; iSetTwo < psData->m_pvecveciSets2->size( ); ++iSetTwo ) { 00557 const vector<size_t>& veciTwo = (*psData->m_pvecveciSets2)[ iSetTwo ]; 00558 00559 vecdBetween.resize( veciOne.size( ) + veciTwo.size( ) ); 00560 fill( vecdBetween.begin( ), vecdBetween.end( ), 0 ); 00561 for( iGeneOne = 0; iGeneOne < veciOne.size( ); ++iGeneOne ) { 00562 iOne = veciOne[ iGeneOne ]; 00563 for( iGeneTwo = 0; iGeneTwo < veciTwo.size( ); ++iGeneTwo ) { 00564 iTwo = veciTwo[ iGeneTwo ]; 00565 d = ( iOne == iTwo ) ? 1 : psData->m_pDat->Get( iOne, iTwo ); 00566 vecdBetween[ iGeneOne ] += d; 00567 vecdBetween[ veciOne.size( ) + iGeneTwo ] += d; } } 00568 for( i = 0; i < veciOne.size( ); ++i ) 00569 vecdBetween[ i ] /= max( (size_t)1, veciTwo.size( ) ); 00570 for( i = 0; i < veciTwo.size( ); ++i ) 00571 vecdBetween[ veciOne.size( ) + i ] /= max( (size_t)1, veciOne.size( ) ); 00572 CStatistics::Winsorize( vecdBetween, ( vecdBetween.size( ) / 10 ) + 1 ); 00573 psData->m_adResults[ ( ( psData->m_iStart + iSetOne ) * psData->m_pvecveciSets2->size( ) ) + 00574 iSetTwo ] = CStatistics::Average( vecdBetween ); } } 00575 // iSetTwo ] = CStatistics::Median( vecdBetween ); } } 00576 00577 return NULL; } 00578 00579 int sets( const char* szFile, const vector<string>& vecstrGenes, vector<vector<string> >& vecvecstrSets ) { 00580 CPCL PCLSets( false ); 00581 size_t i, iSet, iSets; 00582 00583 if( !PCLSets.Open( szFile, 1 ) ) { 00584 cerr << "Could not open: " << szFile << endl; 00585 return 1; } 00586 for( iSets = i = 0; i < PCLSets.GetGenes( ); ++i ) { 00587 if( !( iSet = atoi( PCLSets.GetGene( i ).c_str( ) ) ) ) { 00588 cerr << "Invalid set: " << PCLSets.GetGene( i ) << endl; 00589 return 1; } 00590 if( iSet > iSets ) 00591 iSets = iSet; } 00592 vecvecstrSets.resize( iSets ); 00593 for( i = 0; i < PCLSets.GetGenes( ); ++i ) { 00594 size_t iGene; 00595 00596 iSet = atoi( PCLSets.GetGene( i ).c_str( ) ) - 1; 00597 iGene = atoi( PCLSets.GetFeature( i, 1 ).c_str( ) ) - 1; 00598 if( iGene >= vecstrGenes.size( ) ) { 00599 cerr << "Unknown gene: " << iGene << " (" << PCLSets.GetFeature( i, 1 ) << ") in set " << 00600 PCLSets.GetGene( i ) << endl; 00601 return 1; } 00602 vecvecstrSets[ iSet ].push_back( vecstrGenes[ iGene ] ); } 00603 00604 return 0; } 00605 00606 void enset( const CDat& Dat, const vector<vector<string> >& vecvecstrSets, 00607 vector<vector<size_t> >& vecveciSets ) { 00608 size_t i, j, iGene; 00609 map<string,size_t> mapstriGenes; 00610 00611 for( i = 0; i < vecvecstrSets.size( ); ++i ) 00612 for( j = 0; j < vecvecstrSets[ i ].size( ); ++j ) { 00613 const string& strGene = vecvecstrSets[ i ][ j ]; 00614 00615 if( mapstriGenes.find( strGene ) == mapstriGenes.end( ) ) 00616 mapstriGenes[ strGene ] = Dat.GetGene( strGene ); } 00617 00618 vecveciSets.resize( vecvecstrSets.size( ) ); 00619 for( i = 0; i < vecveciSets.size( ); ++i ) { 00620 vector<size_t>& veciSet = vecveciSets[ i ]; 00621 const vector<string>& vecstrSet = vecvecstrSets[ i ]; 00622 00623 veciSet.clear( ); 00624 veciSet.reserve( vecstrSet.size( ) ); 00625 for( j = 0; j < vecstrSet.size( ); ++j ) 00626 if( ( iGene = mapstriGenes[ vecstrSet[ j ] ] ) != -1 ) 00627 veciSet.push_back( iGene ); } } 00628 00629 void* term_thread( void* pData ) { 00630 CGenome Genome; 00631 CGenes Genes( Genome ); 00632 ifstream ifsm; 00633 size_t i, iCur; 00634 STerm* psData; 00635 SDatum sDatum; 00636 00637 psData = (STerm*)pData; 00638 ifsm.open( psData->m_strFile.c_str( ) ); 00639 if( !Genes.Open( ifsm ) ) { 00640 cerr << "Could not open: " << psData->m_strFile << endl; 00641 return NULL; } 00642 ifsm.close( ); 00643 iCur = cliques( *psData->m_pDat, psData->m_iTotal, *psData->m_pvecsHubs, psData->m_iGenes, sDatum, &Genes, *psData->m_pPCLWeights, *psData->m_pveciDat2PCL ); 00644 pthread_mutex_lock( psData->m_pmutx ); 00645 cout << CMeta::Basename( psData->m_strFile.c_str( ) ); 00646 if( psData->m_iGenes == -1 ) 00647 for( i = 0; i < sDatum.m_vecprSpecific.size( ); ++i ) 00648 cout << '\t' << ( sDatum.m_vecprSpecific[ i ].second * 00649 ( Genes.IsGene( psData->m_pDat->GetGene( sDatum.m_vecprSpecific[ i ].first ) ) ? -1 : 1 ) ); 00650 else { 00651 cout << '\t' << iCur << '\t' << sDatum.m_sHubbiness.GetAverage( ) << '\t' << 00652 sDatum.m_sHubbiness.GetStdev( ) << '\t' << sDatum.m_sHubbiness.m_iCount << '\t' << sDatum.m_sCliquiness.GetAverage( ) << '\t' << 00653 sDatum.m_sCliquiness.GetStdev( ) << '\t' << sDatum.m_sCliquiness.m_iCount; 00654 for( i = 0; i < min( psData->m_iGenes, sDatum.m_vecprSpecific.size( ) ); ++i ) 00655 cout << '\t' << psData->m_pDat->GetGene( sDatum.m_vecprSpecific[ i ].first ) << '|' << 00656 sDatum.m_vecprSpecific[ i ].second << '|' << 00657 ( Genes.IsGene( psData->m_pDat->GetGene( sDatum.m_vecprSpecific[ i ].first ) ) ? 1 : 0 ); } 00658 cout << endl; 00659 pthread_mutex_unlock( psData->m_pmutx ); 00660 00661 if( psData->m_strClip.length( ) ) { 00662 CDat DatOut; 00663 size_t j; 00664 vector<bool> vecfGenes; 00665 00666 vecfGenes.resize( psData->m_pDat->GetGenes( ) ); 00667 for( i = 0; i < vecfGenes.size( ); ++i ) 00668 vecfGenes[ i ] = Genes.IsGene( psData->m_pDat->GetGene( i ) ); 00669 DatOut.Open( psData->m_pDat->GetGeneNames( ) ); 00670 for( i = 0; i < DatOut.GetGenes( ); ++i ) 00671 for( j = ( i + 1 ); j < DatOut.GetGenes( ); ++j ) 00672 if( vecfGenes[ i ] || vecfGenes[ j ] ) 00673 DatOut.Set( i, j, psData->m_pDat->Get( i, j ) ); 00674 DatOut.Save( ( psData->m_strClip + '/' + CMeta::Deextension( CMeta::Basename( 00675 psData->m_strFile.c_str( ) ) ) + c_szDab ).c_str( ) ); } 00676 00677 return NULL; }