Sleipnir
tools/Hubber/Hubber.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 struct SProcessor {
00026     const CDat&                     m_Dat;
00027     const vector<vector<size_t> >&  m_vecveciSets1;
00028     const vector<vector<size_t> >&  m_vecveciSets2;
00029     long double*                    m_adResults;
00030     size_t                          m_iThreads;
00031 
00032     SProcessor( const CDat& Dat, const vector<vector<size_t> >& vecveciSets1,
00033         const vector<vector<size_t> >& vecveciSets2, long double* adResults, size_t iThreads ) : m_Dat(Dat),
00034         m_vecveciSets1(vecveciSets1), m_vecveciSets2(vecveciSets2), m_adResults(adResults),
00035         m_iThreads(iThreads) { }
00036 };
00037 
00038 typedef int (TFnProcessor)( const SProcessor& );
00039 
00040 static const char   c_szDab[]   = ".dab";
00041 
00042 struct SWithin {
00043     const CDat*                     m_pDat;
00044     const vector<vector<size_t> >*  m_pvecveciSets;
00045     long double*                    m_adResults;
00046     long double                     m_dPrior;
00047     size_t                          m_iStart;
00048     size_t                          m_iLength;
00049 };
00050 
00051 struct SBetween {
00052     const CDat*                     m_pDat;
00053     const vector<vector<size_t> >*  m_pvecveciSets1;
00054     const vector<vector<size_t> >*  m_pvecveciSets2;
00055     long double*                    m_adResults;
00056     size_t                          m_iStart;
00057     size_t                          m_iLength;
00058 };
00059 
00060 struct SBackground {
00061     const CDat*                     m_pDat;
00062     const vector<vector<size_t> >*  m_pvecveciSets;
00063     long double*                    m_adResults;
00064     size_t                          m_iStart;
00065     size_t                          m_iLength;
00066 };
00067 
00068 struct SSummary {
00069     size_t  m_iCount;
00070     float   m_dSum;
00071     float   m_dSumSquares;
00072 
00073     SSummary( ) {
00074 
00075         Clear( ); }
00076 
00077     void Add( float d ) {
00078 
00079         m_iCount++;
00080         m_dSum += d;
00081         m_dSumSquares += d * d; }
00082 
00083     void Add( const SSummary& sSummary ) {
00084 
00085         m_iCount += sSummary.m_iCount;
00086         m_dSum += sSummary.m_dSum;
00087         m_dSumSquares += sSummary.m_dSumSquares; }
00088 
00089     void Multiply( float d ) {
00090 
00091         m_iCount = (size_t)( m_iCount * d );
00092         m_dSum *= d;
00093         m_dSumSquares *= d; }
00094 
00095     void Clear( ) {
00096 
00097         m_iCount = 0;
00098         m_dSum = m_dSumSquares = 0; }
00099 
00100     float GetAverage( ) const {
00101 
00102         return ( m_iCount ? ( m_dSum / m_iCount ) : CMeta::GetNaN( ) ); }
00103 
00104     float GetStdev( ) const {
00105 
00106         return ( m_iCount ? sqrt( ( m_dSumSquares / ( max( (size_t)2, m_iCount ) - 1 ) ) - pow( GetAverage( ), 2 ) ) : CMeta::GetNaN( ) ); }
00107 };
00108 
00109 struct SDatum {
00110     SSummary                    m_sHubbiness;
00111     SSummary                    m_sCliquiness;
00112     vector<pair<size_t,float> > m_vecprSpecific;
00113 
00114     void Clear( ) {
00115 
00116         m_sHubbiness.Clear( );
00117         m_sCliquiness.Clear( );
00118         m_vecprSpecific.clear( ); }
00119 };
00120 
00121 struct STerm {
00122     string                  m_strFile;
00123     size_t                  m_iGenes;
00124     size_t                  m_iTotal;
00125     const CDat*             m_pDat;
00126     const CPCL*             m_pPCLWeights;
00127     string                  m_strClip;
00128     const vector<size_t>*   m_pveciDat2PCL;
00129     const vector<SSummary>* m_pvecsHubs;
00130     pthread_mutex_t*        m_pmutx;
00131 };
00132 
00133 static size_t hubs( const CDat&, vector<SSummary>&, const CPCL&, const vector<size_t>& );
00134 static size_t cliques( const CDat&, size_t, const vector<SSummary>&, size_t, SDatum&, const CGenes*, const CPCL&, const vector<size_t>& );
00135 static void enset( const CDat&, const vector<vector<string> >&, vector<vector<size_t> >& );
00136 static int sets( const char*, const vector<string>&, vector<vector<string> >& );
00137 static int process( const char*, bool, bool, const vector<vector<string> >&, const vector<vector<string> >&,
00138     long double*, TFnProcessor*, size_t );
00139 static TFnProcessor background;
00140 static TFnProcessor within;
00141 static TFnProcessor between;
00142 static void* between_thread( void* );
00143 static void* within_thread( void* );
00144 static void* background_thread( void* );
00145 static void* term_thread( void* );
00146 
00147 int main( int iArgs, char** aszArgs ) {
00148 #ifdef WIN32
00149     pthread_win32_process_attach_np( );
00150 #endif // WIN32
00151     gengetopt_args_info sArgs;
00152     CDat                Dat;
00153     size_t              i, j, iGenes, iTotal, iThread;
00154     vector<SSummary>    vecsHubs;
00155     SDatum              sDatum;
00156     CPCL                PCLWeights;
00157     vector<size_t>      veciDat2PCL;
00158     vector<STerm>       vecsTerms;
00159     vector<pthread_t>   vecpthdThreads;
00160     pthread_mutex_t     mutxTerms;
00161 
00162     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00163         cmdline_parser_print_help( );
00164         return 1; }
00165     CMeta Meta( sArgs.verbosity_arg );
00166 
00167     if( sArgs.output_arg ) {
00168         CDataMatrix             MatBackgrounds;
00169         int                     iRet;
00170         vector<string>          vecstrGenes, vecstrContexts;
00171         vector<vector<string> > vecvecstrSets1, vecvecstrSets2;
00172         TFnProcessor*           pfnProcessor;
00173         long double*            adResults;
00174 
00175         if( !( sArgs.contexts_arg && sArgs.genelist_arg ) ) {
00176             cmdline_parser_print_help( );
00177             return 1; }
00178         {
00179             static const size_t c_iContext  = 2;
00180             CPCL    PCLContexts( false );
00181 
00182             if( !PCLContexts.Open( sArgs.contexts_arg, c_iContext ) ) {
00183                 cerr << "Could not open: " << sArgs.contexts_arg << endl;
00184                 return 1; }
00185             vecstrContexts.resize( PCLContexts.GetGenes( ) );
00186             for( i = 0; i < vecstrContexts.size( ); ++i )
00187                 vecstrContexts[ i ] = PCLContexts.GetFeature( i, c_iContext );
00188         }
00189             {
00190             CPCL    PCLGenes( false );
00191 
00192             if( !PCLGenes.Open( sArgs.genelist_arg, 1 ) ) {
00193                 cerr << "Could not open: " << sArgs.genelist_arg << endl;
00194                 return 1; }
00195             vecstrGenes.resize( PCLGenes.GetGenes( ) );
00196             for( i = 0; i < vecstrGenes.size( ); ++i )
00197                 vecstrGenes[ i ] = PCLGenes.GetFeature( i, 1 );
00198         }
00199         if( sArgs.genesets_arg ) {
00200             if( iRet = sets( sArgs.genesets_arg, vecstrGenes, vecvecstrSets1 ) )
00201                 return iRet;
00202             if( sArgs.between_arg ) {
00203                 if( iRet = sets( sArgs.between_arg, vecstrGenes, vecvecstrSets2 ) )
00204                     return iRet;
00205                 pfnProcessor = between; }
00206             else
00207                 pfnProcessor = within; }
00208         else {
00209             vecvecstrSets1.resize( vecstrGenes.size( ) );
00210             for( i = 0; i < vecvecstrSets1.size( ); ++i ) {
00211                 vecvecstrSets1[ i ].resize( 1 );
00212                 vecvecstrSets1[ i ][ 0 ] = vecstrGenes[ i ]; }
00213             pfnProcessor = background; }
00214 
00215         MatBackgrounds.Initialize( vecstrContexts.size( ) + 1, vecvecstrSets1.size( ) * max( (size_t)1,
00216             vecvecstrSets2.size( ) ) );
00217         adResults = new long double[ MatBackgrounds.GetColumns( ) ];
00218         if( iRet = process( sArgs.input_arg, !!sArgs.memmap_flag, !!sArgs.normalize_flag, vecvecstrSets1,
00219             vecvecstrSets2, adResults, pfnProcessor, sArgs.threads_arg ) )
00220             return iRet;
00221         for( i = 0; i < MatBackgrounds.GetColumns( ); ++i )
00222             MatBackgrounds.Set( 0, i, (float)adResults[ i ] );
00223         for( i = 1; i < MatBackgrounds.GetRows( ); ++i ) {
00224             if( iRet = process( ( (string)sArgs.directory_arg + '/' +
00225                 CMeta::Filename( vecstrContexts[ i - 1 ] ) + ".dab" ).c_str( ), !!sArgs.memmap_flag,
00226                 !!sArgs.normalize_flag, vecvecstrSets1, vecvecstrSets2, adResults, pfnProcessor,
00227                 sArgs.threads_arg ) )
00228                 return iRet;
00229             for( j = 0; j < MatBackgrounds.GetColumns( ); ++j )
00230                 MatBackgrounds.Set( i, j, (float)adResults[ j ] ); }
00231 
00232         MatBackgrounds.Save( sArgs.output_arg, true );
00233         return 0; }
00234 
00235     if( sArgs.input_arg ) {
00236         if( !Dat.Open( sArgs.input_arg, !( sArgs.memmap_flag || sArgs.normalize_flag || sArgs.genin_arg || sArgs.genex_arg ) ) ) {
00237             cerr << "Could not open: " << sArgs.input_arg << endl;
00238             return 1; } }
00239     else if( !Dat.Open( cin, CDat::EFormatText ) ) {
00240         cerr << "Could not open input" << endl;
00241         return 1; }
00242     if( sArgs.genin_arg && !Dat.FilterGenes( sArgs.genin_arg, CDat::EFilterInclude ) ) {
00243         cerr << "Could not open: " << sArgs.genin_arg << endl;
00244         return 1; }
00245     if( sArgs.genex_arg && !Dat.FilterGenes( sArgs.genex_arg, CDat::EFilterExclude ) ) {
00246         cerr << "Could not open: " << sArgs.genex_arg << endl;
00247         return 1; }
00248     if( sArgs.normalize_flag )
00249         Dat.Normalize( CDat::ENormalizeSigmoid );
00250 
00251     if( sArgs.weights_arg ) {
00252         if( !PCLWeights.Open( sArgs.weights_arg, 0 ) ) {
00253             cerr << "Could not open weights: " << sArgs.weights_arg << endl;
00254             return 1; }
00255         veciDat2PCL.resize( Dat.GetGenes( ) );
00256         for( i = 0; i < veciDat2PCL.size( ); ++i )
00257             veciDat2PCL[i] = PCLWeights.GetGene( Dat.GetGene( i ) ); }
00258 
00259     iTotal = hubs( Dat, vecsHubs, PCLWeights, veciDat2PCL );
00260     if( sArgs.genes_arg == -1 ) {
00261         cout << "Function";
00262         for( i = 0; i < Dat.GetGenes( ); ++i )
00263             cout << '\t' << Dat.GetGene( i ); }
00264     else {
00265         cliques( Dat, iTotal, vecsHubs, 0, sDatum, NULL, PCLWeights, veciDat2PCL );
00266         cout << "name   size    hubbiness   hubbiness std.  hubbiness n cliquiness  cliquiness std. cliquiness n" << endl;
00267         cout << "total  " << iTotal << '\t' << sDatum.m_sHubbiness.GetAverage( ) << '\t' <<
00268             sDatum.m_sHubbiness.GetStdev( ) << '\t' << sDatum.m_sHubbiness.m_iCount << '\t' << sDatum.m_sCliquiness.GetAverage( ) << '\t' <<
00269             sDatum.m_sCliquiness.GetStdev( ) << '\t' << sDatum.m_sCliquiness.m_iCount; }
00270     cout << endl;
00271 
00272     pthread_mutex_init( &mutxTerms, NULL );
00273     vecsTerms.resize( sArgs.threads_arg );
00274     vecpthdThreads.resize( sArgs.threads_arg );
00275     for( iGenes = 0; iGenes < sArgs.inputs_num; iGenes += sArgs.threads_arg ) {
00276         for( iThread = 0; ( iThread < (size_t)sArgs.threads_arg ) && ( ( iGenes + iThread ) < sArgs.inputs_num ); ++iThread ) {
00277             if( !( ( iGenes + iThread ) % 25 ) )
00278                 cerr << ( iGenes + iThread ) << '/' << sArgs.inputs_num << endl;
00279             vecsTerms[iThread].m_strFile = sArgs.inputs[iGenes + iThread];
00280             vecsTerms[iThread].m_iGenes = sArgs.genes_arg;
00281             vecsTerms[iThread].m_pDat = &Dat;
00282             vecsTerms[iThread].m_iTotal = iTotal;
00283             vecsTerms[iThread].m_pPCLWeights = &PCLWeights;
00284             vecsTerms[iThread].m_strClip = sArgs.clip_arg ? sArgs.clip_arg : "";
00285             vecsTerms[iThread].m_pveciDat2PCL = &veciDat2PCL;
00286             vecsTerms[iThread].m_pvecsHubs = &vecsHubs;
00287             vecsTerms[iThread].m_pmutx = &mutxTerms;
00288             if( pthread_create( &vecpthdThreads[iThread], NULL, term_thread, &vecsTerms[iThread] ) ) {
00289                 cerr << "Couldn't create thread for term: " << sArgs.input_arg[iGenes + iThread] << endl;
00290                 return 1; } }
00291         for( i = 0; ( i < vecpthdThreads.size( ) ) && ( ( i + iGenes ) < sArgs.inputs_num ); ++i )
00292             pthread_join( vecpthdThreads[i], NULL ); }
00293     pthread_mutex_destroy( &mutxTerms );
00294 
00295 #ifdef WIN32
00296     pthread_win32_process_detach_np( );
00297 #endif // WIN32
00298     return 0; }
00299 
00300 static inline float get_weight( size_t iGene, const vector<size_t>& veciDat2PCL, const CPCL& PCLWeights ) {
00301     size_t  iPCL;
00302 
00303     if( iGene >= veciDat2PCL.size( ) )
00304         return 1;
00305 
00306     iPCL = veciDat2PCL[iGene];
00307     return ( ( ( iPCL == -1 ) || ( iPCL >= PCLWeights.GetGenes( ) ) ) ? 1 : PCLWeights.Get( iPCL, 0 ) ); }
00308 
00309 size_t hubs( const CDat& Dat, vector<SSummary>& vecsHubs, const CPCL& PCLWeights, const vector<size_t>& veciDat2PCL ) {
00310     size_t  i, j, iRet;
00311     float   d;
00312 
00313     vecsHubs.resize( Dat.GetGenes( ) );
00314     for( i = 0; i < Dat.GetGenes( ); ++i )
00315         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) {
00316             if( CMeta::IsNaN( d = Dat.Get( i, j ) ) )
00317                 continue;
00318             d *= get_weight( i, veciDat2PCL, PCLWeights ) * get_weight( j, veciDat2PCL, PCLWeights );
00319             vecsHubs[ i ].Add( d );
00320             vecsHubs[ j ].Add( d ); }
00321     for( iRet = i = 0; i < vecsHubs.size( ); ++i )
00322         if( vecsHubs[ i ].m_iCount )
00323             iRet++;
00324 
00325     return iRet; }
00326 
00327 struct SSorter {
00328 
00329     bool operator()( const pair<size_t,float>& prOne, const pair<size_t,float>& prTwo ) const {
00330 
00331         return ( prOne.second > prTwo.second ); }
00332 };
00333 
00334 size_t cliques( const CDat& Dat, size_t iGenes, const vector<SSummary>& vecsHubs, size_t iSort, SDatum& sDatum,
00335     const CGenes* pGenes, const CPCL& PCLWeights, const vector<size_t>& veciDat2PCL ) {
00336     size_t              i, j, iRet;
00337     float               d;
00338     vector<SSummary>    vecsClique;
00339     vector<bool>        vecfOutside;
00340 
00341     vecfOutside.resize( Dat.GetGenes( ) );
00342     if( pGenes ) {
00343         for( i = 0; i < vecfOutside.size( ); ++i )
00344             if( !pGenes->IsGene( Dat.GetGene( i ) ) )
00345                 vecfOutside[ i ] = true; }
00346     vecsClique.resize( Dat.GetGenes( ) );
00347     sDatum.Clear( );
00348     for( i = 0; i < Dat.GetGenes( ); ++i )
00349         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j ) {
00350             if( CMeta::IsNaN( d = Dat.Get( i, j ) ) )
00351                 continue;
00352             d *= get_weight( i, veciDat2PCL, PCLWeights ) * get_weight( j, veciDat2PCL, PCLWeights );
00353             // Added 9/4/13 to fix Arjun Issue #17, ignore precomputed hubs -- avoids double counting with summary output
00354             if (!vecfOutside[ i ] || !vecfOutside[ j ] ) {//if either is not outside, edge is incident to context
00355                 sDatum.m_sHubbiness.Add( d );
00356             }
00357             if( !vecfOutside[ i ] )
00358                 vecsClique[ j ].Add( d );
00359             if( !vecfOutside[ j ] )
00360                 vecsClique[ i ].Add( d ); }
00361     for( iRet = i = 0; i < vecsClique.size( ); ++i )
00362         if( !vecfOutside[ i ] ) {
00363             if( vecsClique[ i ].m_iCount )
00364                 iRet++;
00365             sDatum.m_sCliquiness.Add( vecsClique[ i ] ); }
00366     sDatum.m_sCliquiness.Multiply( 0.5 );
00367 
00368     /* Removed 9/4/13 to fix Arjun Issue #17
00369     for( i = 0; i < Dat.GetGenes( ); ++i )
00370         if( !vecfOutside[ i ] )
00371             sDatum.m_sHubbiness.Add( vecsHubs[ i ] );
00372     */
00373 
00374     if( iSort ) {
00375         sDatum.m_vecprSpecific.resize( Dat.GetGenes( ) );
00376         for( i = 0; i < sDatum.m_vecprSpecific.size( ); ++i ) {
00377             sDatum.m_vecprSpecific[ i ].first = i;
00378             //divide each genes cliquiness (within) by precomputed hubbiness (overall average) to calculate group membership
00379             sDatum.m_vecprSpecific[ i ].second = vecsClique[ i ].GetAverage( ) / vecsHubs[ i ].GetAverage( ); }
00380         if( iSort != -1 )
00381             sort( sDatum.m_vecprSpecific.begin( ), sDatum.m_vecprSpecific.end( ), SSorter( ) ); }
00382 
00383     return iRet; }
00384 
00385 int process( const char* szFile, bool fMemmap, bool fNormalize, const vector<vector<string> >& vecvecstrSets1,
00386     const vector<vector<string> >& vecvecstrSets2, long double* adResults, TFnProcessor* pfnProcessor,
00387     size_t iThreads ) {
00388     CDat                    Dat;
00389     vector<vector<size_t> > vecveciSets1, vecveciSets2;
00390 
00391     if( !Dat.Open( szFile, fMemmap && !fNormalize ) ) {
00392         cerr << "Could not open: " << szFile << endl;
00393         return 1; }
00394     cerr << "Calculating for: " << szFile << endl;
00395     if( fNormalize )
00396         Dat.Normalize( CDat::ENormalizeSigmoid );
00397 
00398     enset( Dat, vecvecstrSets1, vecveciSets1 );
00399     enset( Dat, vecvecstrSets2, vecveciSets2 );
00400     return pfnProcessor( SProcessor( Dat, vecveciSets1, vecveciSets2, adResults, iThreads ) ); }
00401 
00402 int background( const SProcessor& sProcessor ) {
00403     const CDat&                     Dat         = sProcessor.m_Dat;
00404     const vector<vector<size_t> >&  vecveciSets = sProcessor.m_vecveciSets1;
00405     long double*                    adResults   = sProcessor.m_adResults;
00406     vector<pthread_t>   vecpthdThreads;
00407     vector<SBackground> vecsBackground;
00408     size_t              i, iChunk;
00409 
00410     vecpthdThreads.resize( min( sProcessor.m_iThreads, vecveciSets.size( ) ) );
00411     vecsBackground.resize( vecpthdThreads.size( ) );
00412     iChunk = 1 + ( ( vecveciSets.size( ) - 1 ) / vecpthdThreads.size( ) );
00413     for( i = 0; i < vecpthdThreads.size( ); ++i ) {
00414         vecsBackground[ i ].m_adResults = adResults;
00415         vecsBackground[ i ].m_iLength = iChunk;
00416         vecsBackground[ i ].m_iStart = i * iChunk;
00417         vecsBackground[ i ].m_pDat = &Dat;
00418         vecsBackground[ i ].m_pvecveciSets = &vecveciSets;
00419         if( pthread_create( &vecpthdThreads[ i ], NULL, background_thread, &vecsBackground[ i ] ) ) {
00420             cerr << "Couldn't create thread for set group: " << i << endl;
00421             return 1; } }
00422     for( i = 0; i < vecpthdThreads.size( ); ++i )
00423         pthread_join( vecpthdThreads[ i ], NULL );
00424 
00425     return 0; }
00426 
00427 void* background_thread( void* pData ) {
00428     SBackground*    psData;
00429     vector<float>   vecdBackgrounds, vecdBackground;
00430     size_t          i, j, k;
00431     float           d;
00432 
00433     psData = (SBackground*)pData;
00434     for( i = 0; ( i < psData->m_iLength ) && ( ( psData->m_iStart + i ) < psData->m_pvecveciSets->size( ) );
00435         ++i ) {
00436         size_t                  iSet    = psData->m_iStart + i;
00437         const vector<size_t>&   veciSet = psData->m_pvecveciSets->at( iSet );
00438 
00439         vecdBackgrounds.resize( veciSet.size( ) );
00440         for( j = 0; j < veciSet.size( ); ++j ) {
00441             size_t  iOne    = veciSet[ j ];
00442 
00443             vecdBackground.clear( );
00444             vecdBackground.reserve( psData->m_pDat->GetGenes( ) );
00445             for( k = 0; k < psData->m_pDat->GetGenes( ); ++k )
00446                 if( !CMeta::IsNaN( d = psData->m_pDat->Get( iOne, k ) ) )
00447                     vecdBackground.push_back( d );
00448             CStatistics::Winsorize( vecdBackground, ( vecdBackground.size( ) / 10 ) + 1 );
00449             vecdBackgrounds[ j ] = (float)CStatistics::Average( vecdBackground ); }
00450 //          vecdBackgrounds[ j ] = (float)CStatistics::Median( vecdBackground ); }
00451 //      psData->m_adResults[ iSet ] = CStatistics::Median( vecdBackgrounds ); }
00452         CStatistics::Winsorize( vecdBackgrounds, ( vecdBackgrounds.size( ) / 10 ) + 1 );
00453         psData->m_adResults[ iSet ] = CStatistics::Average( vecdBackgrounds ); }
00454 
00455     return NULL; }
00456 
00457 int within( const SProcessor& sProcessor ) {
00458     const CDat&                     Dat         = sProcessor.m_Dat;
00459     const vector<vector<size_t> >&  vecveciSets = sProcessor.m_vecveciSets1;
00460     long double*                    adResults   = sProcessor.m_adResults;
00461     size_t              i, j, iCount, iChunk;
00462     float               d;
00463     long double         dPrior;
00464     vector<pthread_t>   vecpthdThreads;
00465     vector<SWithin>     vecsWithin;
00466 
00467     for( dPrior = 0,iCount = i = 0; i < Dat.GetGenes( ); ++i )
00468         for( j = ( i + 1 ); j < Dat.GetGenes( ); ++j )
00469             if( !CMeta::IsNaN( d = Dat.Get( i, j ) ) ) {
00470                 iCount++;
00471                 dPrior += d; }
00472     dPrior /= iCount ? iCount : 1;
00473 
00474     vecpthdThreads.resize( min( sProcessor.m_iThreads, vecveciSets.size( ) ) );
00475     vecsWithin.resize( vecpthdThreads.size( ) );
00476     iChunk = 1 + ( ( vecveciSets.size( ) - 1 ) / vecpthdThreads.size( ) );
00477     for( i = 0; i < vecpthdThreads.size( ); ++i ) {
00478         vecsWithin[ i ].m_dPrior = dPrior;
00479         vecsWithin[ i ].m_pDat = &Dat;
00480         vecsWithin[ i ].m_adResults = adResults;
00481         vecsWithin[ i ].m_pvecveciSets = &vecveciSets;
00482         vecsWithin[ i ].m_iStart = i * iChunk;
00483         vecsWithin[ i ].m_iLength = iChunk;
00484         if( pthread_create( &vecpthdThreads[ i ], NULL, within_thread, &vecsWithin[ i ] ) ) {
00485             cerr << "Couldn't create thread for set group: " << i << endl;
00486             return 1; } }
00487     for( i = 0; i < vecpthdThreads.size( ); ++i )
00488         pthread_join( vecpthdThreads[ i ], NULL );
00489 
00490     return 0; }
00491 
00492 void* within_thread( void* pData ) {
00493     SWithin*        psData;
00494     size_t          i, j, k;
00495     float           d;
00496     vector<float>   vecdWithin;
00497 
00498     psData = (SWithin*)pData;
00499     for( i = 0; ( i < psData->m_iLength ) && ( ( psData->m_iStart + i ) < psData->m_pvecveciSets->size( ) );
00500         ++i ) {
00501         size_t                  iSet    = psData->m_iStart + i;
00502         const vector<size_t>&   veciSet = psData->m_pvecveciSets->at( iSet );
00503 
00504         vecdWithin.clear( );
00505         vecdWithin.reserve( veciSet.size( ) * ( veciSet.size( ) + 1 ) / 2 );
00506         for( j = 0; j < veciSet.size( ); ++j ) {
00507             vecdWithin.push_back( (float)psData->m_dPrior );
00508             for( k = ( j + 1 ); k < veciSet.size( ); ++k )
00509                 if( !CMeta::IsNaN( d = psData->m_pDat->Get( veciSet[ j ], veciSet[ k ] ) ) )
00510                     vecdWithin.push_back( d ); }
00511         CStatistics::Winsorize( vecdWithin, ( vecdWithin.size( ) / 10 ) + 1 );
00512         psData->m_adResults[ iSet ] = CStatistics::Average( vecdWithin ); }
00513 //      psData->m_adResults[ iSet ] = CStatistics::Median( vecdWithin ); }
00514 
00515     return NULL; }
00516 
00517 int between( const SProcessor& sProcessor ) {
00518     const CDat&                     Dat             = sProcessor.m_Dat;
00519     const vector<vector<size_t> >&  vecveciSets1    = sProcessor.m_vecveciSets1;
00520     const vector<vector<size_t> >&  vecveciSets2    = sProcessor.m_vecveciSets2;
00521     long double*                    adResults       = sProcessor.m_adResults;
00522     vector<pthread_t>   vecpthdThreads;
00523     vector<SBetween>    vecsBetween;
00524     size_t              i, iChunk;
00525 
00526     vecpthdThreads.resize( min( sProcessor.m_iThreads, vecveciSets1.size( ) ) );
00527     vecsBetween.resize( vecpthdThreads.size( ) );
00528     iChunk = 1 + ( ( vecveciSets1.size( ) - 1 ) / vecpthdThreads.size( ) );
00529     for( i = 0; i < vecpthdThreads.size( ); ++i ) {
00530         vecsBetween[ i ].m_adResults = adResults;
00531         vecsBetween[ i ].m_iLength = iChunk;
00532         vecsBetween[ i ].m_iStart = i * iChunk;
00533         vecsBetween[ i ].m_pDat = &Dat;
00534         vecsBetween[ i ].m_pvecveciSets1 = &vecveciSets1;
00535         vecsBetween[ i ].m_pvecveciSets2 = &vecveciSets2;
00536         if( pthread_create( &vecpthdThreads[ i ], NULL, between_thread, &vecsBetween[ i ] ) ) {
00537             cerr << "Couldn't create thread for set group: " << i << endl;
00538             return 1; } }
00539     for( i = 0; i < vecpthdThreads.size( ); ++i )
00540         pthread_join( vecpthdThreads[ i ], NULL );
00541 
00542     return 0; }
00543 
00544 void* between_thread( void* pData ) {
00545     size_t              i, iSetOne, iSetTwo, iGeneOne, iGeneTwo, iOne, iTwo;
00546     float               d;
00547     SBetween*           psData;
00548     vector<long double> vecdBetween;
00549 
00550     psData = (SBetween*)pData;
00551     for( iSetOne = 0; ( iSetOne < psData->m_iLength ) &&
00552         ( ( psData->m_iStart + iSetOne ) < psData->m_pvecveciSets1->size( ) ); ++iSetOne ) {
00553         const vector<size_t>&   veciOne = (*psData->m_pvecveciSets1)[ psData->m_iStart + iSetOne ];
00554 
00555         cerr << iSetOne << '/' << psData->m_iLength << endl;
00556         for( iSetTwo = 0; iSetTwo < psData->m_pvecveciSets2->size( ); ++iSetTwo ) {
00557             const vector<size_t>&   veciTwo = (*psData->m_pvecveciSets2)[ iSetTwo ];
00558 
00559             vecdBetween.resize( veciOne.size( ) + veciTwo.size( ) );
00560             fill( vecdBetween.begin( ), vecdBetween.end( ), 0 );
00561             for( iGeneOne = 0; iGeneOne < veciOne.size( ); ++iGeneOne ) {
00562                 iOne = veciOne[ iGeneOne ];
00563                 for( iGeneTwo = 0; iGeneTwo < veciTwo.size( ); ++iGeneTwo ) {
00564                     iTwo = veciTwo[ iGeneTwo ];
00565                     d = ( iOne == iTwo ) ? 1 : psData->m_pDat->Get( iOne, iTwo );
00566                     vecdBetween[ iGeneOne ] += d;
00567                     vecdBetween[ veciOne.size( ) + iGeneTwo ] += d; } }
00568             for( i = 0; i < veciOne.size( ); ++i )
00569                 vecdBetween[ i ] /= max( (size_t)1, veciTwo.size( ) );
00570             for( i = 0; i < veciTwo.size( ); ++i )
00571                 vecdBetween[ veciOne.size( ) + i ] /= max( (size_t)1, veciOne.size( ) );
00572             CStatistics::Winsorize( vecdBetween, ( vecdBetween.size( ) / 10 ) + 1 );
00573             psData->m_adResults[ ( ( psData->m_iStart + iSetOne ) * psData->m_pvecveciSets2->size( ) ) +
00574                 iSetTwo ] = CStatistics::Average( vecdBetween ); } }
00575 //              iSetTwo ] = CStatistics::Median( vecdBetween ); } }
00576 
00577     return NULL; }
00578 
00579 int sets( const char* szFile, const vector<string>& vecstrGenes, vector<vector<string> >& vecvecstrSets ) {
00580     CPCL    PCLSets( false );
00581     size_t  i, iSet, iSets;
00582 
00583     if( !PCLSets.Open( szFile, 1 ) ) {
00584         cerr << "Could not open: " << szFile << endl;
00585         return 1; }
00586     for( iSets = i = 0; i < PCLSets.GetGenes( ); ++i ) {
00587         if( !( iSet = atoi( PCLSets.GetGene( i ).c_str( ) ) ) ) {
00588             cerr << "Invalid set: " << PCLSets.GetGene( i ) << endl;
00589             return 1; }
00590         if( iSet > iSets )
00591             iSets = iSet; }
00592     vecvecstrSets.resize( iSets );
00593     for( i = 0; i < PCLSets.GetGenes( ); ++i ) {
00594         size_t  iGene;
00595 
00596         iSet = atoi( PCLSets.GetGene( i ).c_str( ) ) - 1;
00597         iGene = atoi( PCLSets.GetFeature( i, 1 ).c_str( ) ) - 1;
00598         if( iGene >= vecstrGenes.size( ) ) {
00599             cerr << "Unknown gene: " << iGene << " (" << PCLSets.GetFeature( i, 1 ) << ") in set " <<
00600                 PCLSets.GetGene( i ) << endl;
00601             return 1; }
00602         vecvecstrSets[ iSet ].push_back( vecstrGenes[ iGene ] ); }
00603 
00604     return 0; }
00605 
00606 void enset( const CDat& Dat, const vector<vector<string> >& vecvecstrSets,
00607     vector<vector<size_t> >& vecveciSets ) {
00608     size_t              i, j, iGene;
00609     map<string,size_t>  mapstriGenes;
00610 
00611     for( i = 0; i < vecvecstrSets.size( ); ++i )
00612         for( j = 0; j < vecvecstrSets[ i ].size( ); ++j ) {
00613             const string&   strGene = vecvecstrSets[ i ][ j ];
00614 
00615             if( mapstriGenes.find( strGene ) == mapstriGenes.end( ) )
00616                 mapstriGenes[ strGene ] = Dat.GetGene( strGene ); }
00617 
00618     vecveciSets.resize( vecvecstrSets.size( ) );
00619     for( i = 0; i < vecveciSets.size( ); ++i ) {
00620         vector<size_t>&         veciSet     = vecveciSets[ i ];
00621         const vector<string>&   vecstrSet   = vecvecstrSets[ i ];
00622 
00623         veciSet.clear( );
00624         veciSet.reserve( vecstrSet.size( ) );
00625         for( j = 0; j < vecstrSet.size( ); ++j )
00626             if( ( iGene = mapstriGenes[ vecstrSet[ j ] ] ) != -1 )
00627                 veciSet.push_back( iGene ); } }
00628 
00629 void* term_thread( void* pData ) {
00630     CGenome     Genome;
00631     CGenes      Genes( Genome );
00632     ifstream    ifsm;
00633     size_t      i, iCur;
00634     STerm*      psData;
00635     SDatum      sDatum;
00636 
00637     psData = (STerm*)pData;
00638     ifsm.open( psData->m_strFile.c_str( ) );
00639     if( !Genes.Open( ifsm ) ) {
00640         cerr << "Could not open: " << psData->m_strFile << endl;
00641         return NULL; }
00642     ifsm.close( );
00643     iCur = cliques( *psData->m_pDat, psData->m_iTotal, *psData->m_pvecsHubs, psData->m_iGenes, sDatum, &Genes, *psData->m_pPCLWeights, *psData->m_pveciDat2PCL );
00644     pthread_mutex_lock( psData->m_pmutx );
00645     cout << CMeta::Basename( psData->m_strFile.c_str( ) );
00646     if( psData->m_iGenes == -1 )
00647         for( i = 0; i < sDatum.m_vecprSpecific.size( ); ++i )
00648             cout << '\t' << ( sDatum.m_vecprSpecific[ i ].second *
00649                 ( Genes.IsGene( psData->m_pDat->GetGene( sDatum.m_vecprSpecific[ i ].first ) ) ? -1 : 1 ) );
00650     else {
00651         cout << '\t' << iCur << '\t' << sDatum.m_sHubbiness.GetAverage( ) << '\t' <<
00652             sDatum.m_sHubbiness.GetStdev( ) << '\t' << sDatum.m_sHubbiness.m_iCount << '\t' << sDatum.m_sCliquiness.GetAverage( ) << '\t' <<
00653             sDatum.m_sCliquiness.GetStdev( ) << '\t' << sDatum.m_sCliquiness.m_iCount;
00654         for( i = 0; i < min( psData->m_iGenes, sDatum.m_vecprSpecific.size( ) ); ++i )
00655             cout << '\t' << psData->m_pDat->GetGene( sDatum.m_vecprSpecific[ i ].first ) << '|' <<
00656                 sDatum.m_vecprSpecific[ i ].second << '|' <<
00657                 ( Genes.IsGene( psData->m_pDat->GetGene( sDatum.m_vecprSpecific[ i ].first ) ) ? 1 : 0 ); }
00658     cout << endl;
00659     pthread_mutex_unlock( psData->m_pmutx );
00660 
00661     if( psData->m_strClip.length( ) ) {
00662         CDat            DatOut;
00663         size_t          j;
00664         vector<bool>    vecfGenes;
00665 
00666         vecfGenes.resize( psData->m_pDat->GetGenes( ) );
00667         for( i = 0; i < vecfGenes.size( ); ++i )
00668             vecfGenes[ i ] = Genes.IsGene( psData->m_pDat->GetGene( i ) );
00669         DatOut.Open( psData->m_pDat->GetGeneNames( ) );
00670         for( i = 0; i < DatOut.GetGenes( ); ++i )
00671             for( j = ( i + 1 ); j < DatOut.GetGenes( ); ++j )
00672                 if( vecfGenes[ i ] || vecfGenes[ j ] )
00673                     DatOut.Set( i, j, psData->m_pDat->Get( i, j ) );
00674         DatOut.Save( ( psData->m_strClip + '/' + CMeta::Deextension( CMeta::Basename(
00675             psData->m_strFile.c_str( ) ) ) + c_szDab ).c_str( ) ); }
00676 
00677     return NULL; }