Sleipnir
src/coalesceclusteri.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef COALESCECLUSTERI_H
00023 #define COALESCECLUSTERI_H
00024 
00025 #include "coalescestructsi.h"
00026 
00027 namespace Sleipnir {
00028 
// Forward declarations of the Sleipnir types named by CCoalesceClusterImpl.
struct SMotifMatch;
class CCoalesceGroupHistograms;
class CCoalesceGeneScores;
class CCoalesceCluster;
00033 
00034 class CCoalesceClusterImpl {
00035 protected:
00036     typedef std::vector<std::map<std::string, std::set<SMotifMatch> > > TVecMapStrSetSMotifs;
00037 
00038     static const char   c_cStar     = '*';
00039     static const char   c_szMotifs[];
00040     static const char   c_szConditions[];
00041     static const char   c_szGenes[];
00042 
00043     struct SDataset {
00044         const SCoalesceDataset* m_psDataset;
00045         double                  m_dP;
00046         float                   m_dZ;
00047         std::vector<float>      m_vecdCentroid;
00048 
00049         size_t GetConditions( ) const {
00050 
00051             return m_psDataset->GetConditions( ); }
00052 
00053         size_t GetCondition( size_t iCondition ) const {
00054 
00055             return m_psDataset->GetCondition( iCondition ); }
00056     };
00057 
00058     struct SThreadCentroid {
00059         CCoalesceCluster*   m_pCluster;
00060         const CPCL&         m_PCL;
00061 
00062         SThreadCentroid( CCoalesceCluster* pCluster, const CPCL& PCL ) : m_pCluster(pCluster), m_PCL(PCL) { }
00063     };
00064 
00065     struct SThreadSignificantGene {
00066         size_t                                  m_iOffset;
00067         size_t                                  m_iStep;
00068         std::vector<bool>*                      m_pvecfSignificant;
00069         const CPCL*                             m_pPCL;
00070         const CCoalesceMotifLibrary*            m_pMotifs;
00071         const CCoalesceGeneScores*              m_pGeneScores;
00072         const CCoalesceGroupHistograms*         m_pHistsCluster;
00073         const CCoalesceGroupHistograms*         m_pHistsPot;
00074         const CCoalesceCluster*                 m_pCluster;
00075         const CCoalesceCluster*                 m_pPot;
00076         const std::vector<size_t>*              m_pveciDatasets;
00077         const std::vector<float>*               m_pvecdStdevs;
00078         float                                   m_dBeta;
00079         size_t                                  m_iMinimum;
00080         float                                   m_dProbability;
00081     };
00082 
00083     struct SThreadSelectMotif {
00084         uint32_t                        m_iOffset;
00085         size_t                          m_iStep;
00086         const CCoalesceMotifLibrary*    m_pMotifs;
00087         const CCoalesceGroupHistograms* m_pHistsCluster;
00088         const CCoalesceGroupHistograms* m_pHistsPot;
00089         float                           m_dPValue;
00090         float                           m_dZScore;
00091         const std::vector<uint32_t>*    m_pveciMotifs;
00092         std::vector<SMotifMatch>        m_vecsMotifs;
00093     };
00094 
00095     struct SThreadSeedPair {
00096         size_t                                      m_iOffset;
00097         size_t                                      m_iStep;
00098         const CPCL*                                 m_pPCL;
00099         float                                       m_dFraction;
00100         const std::set<std::pair<size_t, size_t> >* m_psetpriiSeeds;
00101         double                                      m_dMaxCorr;
00102         double                                      m_dMinP;
00103         size_t                                      m_iOne;
00104         size_t                                      m_iTwo;
00105     };
00106 
00107     struct SThreadSelectCondition {
00108         size_t                      m_iOffset;
00109         size_t                      m_iStep;
00110         const std::vector<size_t>*  m_pveciCluster;
00111         const std::vector<size_t>*  m_pveciPot;
00112         std::vector<SDataset>*      m_pvecsDatasets;
00113         const CPCL*                 m_pPCL;
00114     };
00115 
00116     static void* ThreadCentroid( void* );
00117     static void* ThreadSignificantGene( void* );
00118     static void* ThreadSelectMotif( void* );
00119     static void* ThreadSeedPair( void* );
00120     static void* ThreadSelectCondition( void* );
00121     static bool AddSignificant( const CCoalesceMotifLibrary&, uint32_t, const CCoalesceGroupHistograms&,
00122         const CCoalesceGroupHistograms&, float, float, std::vector<SMotifMatch>& );
00123     static size_t Open( const CHierarchy&, const std::vector<CCoalesceCluster>&,
00124         const std::vector<std::string>&, std::map<size_t, size_t>&, std::map<size_t, size_t>&,
00125         TVecMapStrSetSMotifs& );
00126     static bool OpenMotifs( CCoalesceMotifLibrary&, const CHierarchy&, const std::vector<SMotifMatch>&, float,
00127         std::set<SMotifMatch>& );
00128 
00129     template<class tType>
00130     static bool IsConverged( const std::set<tType>& setNew, std::vector<tType>& vecOld ) {
00131         size_t              i;
00132         std::vector<tType>  vecNew;
00133 
00134         if( setNew.size( ) != vecOld.size( ) )
00135             return false;
00136         Snapshot( setNew, vecNew );
00137         for( i = 0; i < vecNew.size( ); ++i )
00138             if( vecNew[ i ] != vecOld[ i ] )
00139                 return false;
00140 
00141         return true; }
00142 
00143     template<class tType>
00144     static void Snapshot( const std::set<tType>& setNew, std::vector<tType>& vecOld ) {
00145 
00146         vecOld.resize( setNew.size( ) );
00147         std::copy( setNew.begin( ), setNew.end( ), vecOld.begin( ) );
00148         std::sort( vecOld.begin( ), vecOld.end( ) ); }
00149 
00150     template<class tType>
00151     static size_t GetHash( const std::set<tType>& set ) {
00152         size_t                                      iRet;
00153         typename std::set<tType>::const_iterator    iter;
00154 
00155         for( iRet = 0,iter = set.begin( ); iter != set.end( ); ++iter )
00156             iRet ^= GetHash( *iter );
00157 
00158         return iRet; }
00159 
00160     static size_t GetHash( size_t iValue ) {
00161 
00162         return ( iValue * ( (size_t)-1 / 20000 ) ); }
00163 
00164     static size_t GetHash( const SMotifMatch& sMotif ) {
00165 
00166         return sMotif.GetHash( ); }
00167 
00168     void Add( size_t, CCoalesceCluster& );
00169     bool AddCorrelatedGenes( const CPCL&, CCoalesceCluster&, const std::vector<float>&, float );
00170     bool AddSeedPair( const CPCL&, CCoalesceCluster&, std::set<std::pair<size_t, size_t> >&, float, float,
00171         size_t );
00172     void CalculateCentroid( const CPCL& );
00173     bool IsSignificant( size_t, const CPCL&, const std::vector<float>&, const CCoalesceMotifLibrary*,
00174         const CCoalesceGeneScores&, const CCoalesceGroupHistograms&, const CCoalesceGroupHistograms&,
00175         const CCoalesceCluster&, const std::vector<size_t>&, float, size_t, float ) const;
00176     bool CalculateProbabilityExpression( size_t, const CPCL&, const std::vector<float>&,
00177         const CCoalesceCluster&, const std::vector<size_t>&, bool, float&, float& ) const;
00178     bool CalculateProbabilityMotifs( const CCoalesceGeneScores&, size_t, const CCoalesceGroupHistograms&,
00179         const CCoalesceGroupHistograms&, bool, size_t, float&, float& ) const;
00180     bool SaveCopy( const CPCL&, const std::set<size_t>&, size_t, CPCL&, size_t, bool ) const;
00181     bool OpenMotifs( const std::set<SMotifMatch>&, CCoalesceMotifLibrary&, float );
00182     bool OpenMotifsHeuristic( const std::set<SMotifMatch>&, CCoalesceMotifLibrary&, float, size_t );
00183 
00184     size_t GetConditions( size_t iDataset ) const {
00185 
00186         if( iDataset < m_vecsDatasets.size( ) ) {
00187             const SDataset& sDataset    = m_vecsDatasets[ iDataset ];
00188 
00189             if( sDataset.m_psDataset )
00190                 return sDataset.m_psDataset->GetConditions( ); }
00191 
00192         return 1; }
00193 
00194     size_t GetCondition( size_t iDataset, size_t iCondition ) const {
00195 
00196         if( iDataset < m_vecsDatasets.size( ) ) {
00197             const SDataset& sDataset    = m_vecsDatasets[ iDataset ];
00198 
00199             if( sDataset.m_psDataset )
00200                 return sDataset.m_psDataset->GetCondition( iCondition ); }
00201 
00202         return iDataset; }
00203 
00204     bool IsGene( size_t iGene ) const {
00205 
00206         return ( m_setiGenes.find( iGene ) != m_setiGenes.end( ) ); }
00207 
00208     size_t GetHash( ) const {
00209 
00210         return ( GetHash( m_setiDatasets ) ^ GetHash( m_setiGenes ) ^ GetHash( m_setsMotifs ) ); }
00211 
00212     void GetConditions( std::set<size_t>& setiConditions ) const {
00213         set<size_t>::const_iterator iterDataset;
00214         size_t                      i;
00215 
00216         for( iterDataset = m_setiDatasets.begin( ); iterDataset != m_setiDatasets.end( ); ++iterDataset )
00217             for( i = 0; i < GetConditions( *iterDataset ); ++i )
00218                 setiConditions.insert( GetCondition( *iterDataset, i ) ); }
00219 
00220     const std::set<size_t>& GetGenes( ) const {
00221 
00222         return m_setiGenes; }
00223 
00224     void Clear( ) {
00225 
00226         m_setiDatasets.clear( );
00227         m_setiGenes.clear( );
00228         m_setsMotifs.clear( );
00229         m_veciPrevDatasets.clear( );
00230         m_veciPrevGenes.clear( );
00231         m_vecsPrevMotifs.clear( );
00232         m_veciCounts.clear( );
00233         m_vecdCentroid.clear( );
00234         m_vecdStdevs.clear( );
00235         m_setiHistory.clear( );
00236         m_vecdPriors.clear( );
00237         m_vecsDatasets.clear( ); }
00238 
00239     std::set<size_t>            m_setiDatasets;
00240     std::set<size_t>            m_setiGenes;
00241     std::set<SMotifMatch>       m_setsMotifs;
00242     std::vector<size_t>         m_veciPrevDatasets;
00243     std::vector<size_t>         m_veciPrevGenes;
00244     std::vector<SMotifMatch>    m_vecsPrevMotifs;
00245     std::vector<size_t>         m_veciCounts;
00246     std::vector<float>          m_vecdCentroid;
00247     std::vector<float>          m_vecdStdevs;
00248     std::set<size_t>            m_setiHistory;
00249     std::vector<float>          m_vecdPriors;
00250     std::vector<SDataset>       m_vecsDatasets;
00251 };
00252 
00253 }
00254 
00255 #endif // COALESCECLUSTERI_H