Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef COALESCECLUSTERI_H 00023 #define COALESCECLUSTERI_H 00024 00025 #include "coalescestructsi.h" 00026 00027 namespace Sleipnir { 00028 00029 class CCoalesceCluster; 00030 class CCoalesceGeneScores; 00031 class CCoalesceGroupHistograms; 00032 struct SMotifMatch; 00033 00034 class CCoalesceClusterImpl { 00035 protected: 00036 typedef std::vector<std::map<std::string, std::set<SMotifMatch> > > TVecMapStrSetSMotifs; 00037 00038 static const char c_cStar = '*'; 00039 static const char c_szMotifs[]; 00040 static const char c_szConditions[]; 00041 static const char c_szGenes[]; 00042 00043 struct SDataset { 00044 const SCoalesceDataset* m_psDataset; 00045 double m_dP; 00046 float m_dZ; 00047 std::vector<float> m_vecdCentroid; 00048 00049 size_t GetConditions( ) const { 00050 00051 return m_psDataset->GetConditions( ); } 00052 00053 size_t GetCondition( size_t iCondition ) const { 00054 00055 return m_psDataset->GetCondition( iCondition ); } 00056 }; 00057 00058 struct SThreadCentroid { 00059 CCoalesceCluster* m_pCluster; 00060 const CPCL& m_PCL; 00061 00062 SThreadCentroid( CCoalesceCluster* pCluster, const CPCL& PCL ) : m_pCluster(pCluster), m_PCL(PCL) { } 00063 }; 00064 00065 struct SThreadSignificantGene { 00066 size_t m_iOffset; 00067 size_t m_iStep; 00068 std::vector<bool>* m_pvecfSignificant; 00069 const CPCL* m_pPCL; 00070 const CCoalesceMotifLibrary* m_pMotifs; 00071 const CCoalesceGeneScores* m_pGeneScores; 00072 const CCoalesceGroupHistograms* m_pHistsCluster; 00073 const CCoalesceGroupHistograms* m_pHistsPot; 00074 const CCoalesceCluster* m_pCluster; 00075 const CCoalesceCluster* m_pPot; 00076 const std::vector<size_t>* m_pveciDatasets; 00077 const std::vector<float>* m_pvecdStdevs; 00078 float m_dBeta; 00079 size_t m_iMinimum; 00080 float m_dProbability; 00081 }; 00082 00083 struct SThreadSelectMotif { 00084 uint32_t m_iOffset; 00085 size_t m_iStep; 00086 const CCoalesceMotifLibrary* m_pMotifs; 00087 const CCoalesceGroupHistograms* m_pHistsCluster; 00088 const CCoalesceGroupHistograms* m_pHistsPot; 00089 float m_dPValue; 00090 float m_dZScore; 00091 const std::vector<uint32_t>* m_pveciMotifs; 00092 std::vector<SMotifMatch> m_vecsMotifs; 00093 }; 00094 00095 struct SThreadSeedPair { 00096 size_t m_iOffset; 00097 size_t m_iStep; 00098 const CPCL* m_pPCL; 00099 float m_dFraction; 00100 const std::set<std::pair<size_t, size_t> >* m_psetpriiSeeds; 00101 double m_dMaxCorr; 00102 double m_dMinP; 00103 size_t m_iOne; 00104 size_t m_iTwo; 00105 }; 00106 00107 struct SThreadSelectCondition { 00108 size_t m_iOffset; 00109 size_t m_iStep; 00110 const std::vector<size_t>* m_pveciCluster; 00111 const std::vector<size_t>* m_pveciPot; 00112 std::vector<SDataset>* m_pvecsDatasets; 00113 const CPCL* m_pPCL; 00114 }; 00115 00116 static void* ThreadCentroid( void* ); 00117 static void* ThreadSignificantGene( void* ); 00118 static void* ThreadSelectMotif( void* ); 00119 static void* ThreadSeedPair( void* ); 00120 static void* ThreadSelectCondition( void* ); 00121 static bool AddSignificant( const CCoalesceMotifLibrary&, uint32_t, const CCoalesceGroupHistograms&, 00122 const CCoalesceGroupHistograms&, float, float, std::vector<SMotifMatch>& ); 00123 static size_t Open( const CHierarchy&, const std::vector<CCoalesceCluster>&, 00124 const std::vector<std::string>&, std::map<size_t, size_t>&, std::map<size_t, size_t>&, 00125 TVecMapStrSetSMotifs& ); 00126 static bool OpenMotifs( CCoalesceMotifLibrary&, const CHierarchy&, const std::vector<SMotifMatch>&, float, 00127 std::set<SMotifMatch>& ); 00128 00129 template<class tType> 00130 static bool IsConverged( const std::set<tType>& setNew, std::vector<tType>& vecOld ) { 00131 size_t i; 00132 std::vector<tType> vecNew; 00133 00134 if( setNew.size( ) != vecOld.size( ) ) 00135 return false; 00136 Snapshot( setNew, vecNew ); 00137 for( i = 0; i < vecNew.size( ); ++i ) 00138 if( vecNew[ i ] != vecOld[ i ] ) 00139 return false; 00140 00141 return true; } 00142 00143 template<class tType> 00144 static void Snapshot( const std::set<tType>& setNew, std::vector<tType>& vecOld ) { 00145 00146 vecOld.resize( setNew.size( ) ); 00147 std::copy( setNew.begin( ), setNew.end( ), vecOld.begin( ) ); 00148 std::sort( vecOld.begin( ), vecOld.end( ) ); } 00149 00150 template<class tType> 00151 static size_t GetHash( const std::set<tType>& set ) { 00152 size_t iRet; 00153 typename std::set<tType>::const_iterator iter; 00154 00155 for( iRet = 0,iter = set.begin( ); iter != set.end( ); ++iter ) 00156 iRet ^= GetHash( *iter ); 00157 00158 return iRet; } 00159 00160 static size_t GetHash( size_t iValue ) { 00161 00162 return ( iValue * ( (size_t)-1 / 20000 ) ); } 00163 00164 static size_t GetHash( const SMotifMatch& sMotif ) { 00165 00166 return sMotif.GetHash( ); } 00167 00168 void Add( size_t, CCoalesceCluster& ); 00169 bool AddCorrelatedGenes( const CPCL&, CCoalesceCluster&, const std::vector<float>&, float ); 00170 bool AddSeedPair( const CPCL&, CCoalesceCluster&, std::set<std::pair<size_t, size_t> >&, float, float, 00171 size_t ); 00172 void CalculateCentroid( const CPCL& ); 00173 bool IsSignificant( size_t, const CPCL&, const std::vector<float>&, const CCoalesceMotifLibrary*, 00174 const CCoalesceGeneScores&, const CCoalesceGroupHistograms&, const CCoalesceGroupHistograms&, 00175 const CCoalesceCluster&, const std::vector<size_t>&, float, size_t, float ) const; 00176 bool CalculateProbabilityExpression( size_t, const CPCL&, const std::vector<float>&, 00177 const CCoalesceCluster&, const std::vector<size_t>&, bool, float&, float& ) const; 00178 bool CalculateProbabilityMotifs( const CCoalesceGeneScores&, size_t, const CCoalesceGroupHistograms&, 00179 const CCoalesceGroupHistograms&, bool, size_t, float&, float& ) const; 00180 bool SaveCopy( const CPCL&, const std::set<size_t>&, size_t, CPCL&, size_t, bool ) const; 00181 bool OpenMotifs( const std::set<SMotifMatch>&, CCoalesceMotifLibrary&, float ); 00182 bool OpenMotifsHeuristic( const std::set<SMotifMatch>&, CCoalesceMotifLibrary&, float, size_t ); 00183 00184 size_t GetConditions( size_t iDataset ) const { 00185 00186 if( iDataset < m_vecsDatasets.size( ) ) { 00187 const SDataset& sDataset = m_vecsDatasets[ iDataset ]; 00188 00189 if( sDataset.m_psDataset ) 00190 return sDataset.m_psDataset->GetConditions( ); } 00191 00192 return 1; } 00193 00194 size_t GetCondition( size_t iDataset, size_t iCondition ) const { 00195 00196 if( iDataset < m_vecsDatasets.size( ) ) { 00197 const SDataset& sDataset = m_vecsDatasets[ iDataset ]; 00198 00199 if( sDataset.m_psDataset ) 00200 return sDataset.m_psDataset->GetCondition( iCondition ); } 00201 00202 return iDataset; } 00203 00204 bool IsGene( size_t iGene ) const { 00205 00206 return ( m_setiGenes.find( iGene ) != m_setiGenes.end( ) ); } 00207 00208 size_t GetHash( ) const { 00209 00210 return ( GetHash( m_setiDatasets ) ^ GetHash( m_setiGenes ) ^ GetHash( m_setsMotifs ) ); } 00211 00212 void GetConditions( std::set<size_t>& setiConditions ) const { 00213 set<size_t>::const_iterator iterDataset; 00214 size_t i; 00215 00216 for( iterDataset = m_setiDatasets.begin( ); iterDataset != m_setiDatasets.end( ); ++iterDataset ) 00217 for( i = 0; i < GetConditions( *iterDataset ); ++i ) 00218 setiConditions.insert( GetCondition( *iterDataset, i ) ); } 00219 00220 const std::set<size_t>& GetGenes( ) const { 00221 00222 return m_setiGenes; } 00223 00224 void Clear( ) { 00225 00226 m_setiDatasets.clear( ); 00227 m_setiGenes.clear( ); 00228 m_setsMotifs.clear( ); 00229 m_veciPrevDatasets.clear( ); 00230 m_veciPrevGenes.clear( ); 00231 m_vecsPrevMotifs.clear( ); 00232 m_veciCounts.clear( ); 00233 m_vecdCentroid.clear( ); 00234 m_vecdStdevs.clear( ); 00235 m_setiHistory.clear( ); 00236 m_vecdPriors.clear( ); 00237 m_vecsDatasets.clear( ); } 00238 00239 std::set<size_t> m_setiDatasets; 00240 std::set<size_t> m_setiGenes; 00241 std::set<SMotifMatch> m_setsMotifs; 00242 std::vector<size_t> m_veciPrevDatasets; 00243 std::vector<size_t> m_veciPrevGenes; 00244 std::vector<SMotifMatch> m_vecsPrevMotifs; 00245 std::vector<size_t> m_veciCounts; 00246 std::vector<float> m_vecdCentroid; 00247 std::vector<float> m_vecdStdevs; 00248 std::set<size_t> m_setiHistory; 00249 std::vector<float> m_vecdPriors; 00250 std::vector<SDataset> m_vecsDatasets; 00251 }; 00252 00253 } 00254 00255 #endif // COALESCECLUSTERI_H