Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef STATISTICS_H 00023 #define STATISTICS_H 00024 00025 #include <limits> 00026 00027 #include "statisticsi.h" 00028 00029 #include "fullmatrix.h" 00030 00031 namespace Sleipnir { 00032 00033 class CDat; 00034 class CPCL; 00035 00044 class CStatistics: CStatisticsImpl { 00045 public: 00046 // Simple stuff 00047 static double FScore(size_t iTruePositives, size_t iFalsePositives, 00048 size_t iTrueNegatives, size_t iFalseNegatives, double dBeta = 1); 00049 00075 static double Precision(size_t iTruePositives, size_t iFalsePositives, 00076 size_t iTrueNegatives, size_t iFalseNegatives) { 00077 UNUSED_PARAMETER(iTrueNegatives); 00078 UNUSED_PARAMETER(iFalseNegatives); 00079 00080 return ((double) iTruePositives / (iTruePositives + iFalsePositives)); 00081 } 00082 00108 static double Recall(size_t iTruePositives, size_t iFalsePositives, 00109 size_t iTrueNegatives, size_t iFalseNegatives) { 00110 UNUSED_PARAMETER(iFalsePositives); 00111 UNUSED_PARAMETER(iTrueNegatives); 00112 00113 return ((double) iTruePositives / (iTruePositives + iFalseNegatives)); 00114 } 00115 00135 template<class tType> 00136 static double Variance(const tType Begin, const tType End, double dMean) { 00137 double dRet; 00138 size_t iN; 00139 00140 Sums(Begin, End, NULL, &dRet, &iN); 00141 return (iN ? max((dRet / iN) - (dMean * dMean), 0.0) : 0); 00142 } 00143 00144 template<class tType> 00145 static double NormalizeMeanStd(const tType Begin, const tType End) { 00146 double dSum, dSqSum; 00147 size_t iN; 00148 Sums(Begin, End, &dSum, &dSqSum, &iN); 00149 float dMean = dSum /= iN; 00150 float dStd = sqrt(iN ? max((dSqSum / iN) - (dMean * dMean), 0.0) : 0); 00151 tType Cur; 00152 for (Cur = Begin; Cur != End; Cur++) { 00153 (*Cur) = (*Cur - dMean) / dStd; 00154 } 00155 } 00172 template<class tType> 00173 static double Variance(const tType Begin, const tType End) { 00174 double dSum, dRet; 00175 size_t iN; 00176 00177 Sums(Begin, End, &dSum, &dRet, &iN); 00178 if (!iN) 00179 return 0; 00180 dSum /= iN; 00181 return max((dRet / iN) - (dSum * dSum), 0.0); 00182 } 00183 00200 template<class tType> 00201 static double Variance(const std::vector<tType>& vecValues, double dMean) { 00202 00203 return Variance(vecValues.begin(), vecValues.end(), dMean); 00204 } 00205 00219 template<class tType> 00220 static double Variance(const std::vector<tType>& vecValues) { 00221 00222 return Variance(vecValues.begin(), vecValues.end()); 00223 } 00224 00238 template<class tType> 00239 static double Average(const std::vector<tType>& vecValues) { 00240 00241 return Average(vecValues.begin(), vecValues.end()); 00242 } 00243 00260 template<class tType> 00261 static double Average(const tType Begin, const tType End) { 00262 double dRet; 00263 size_t iN; 00264 00265 Sums(Begin, End, &dRet, NULL, &iN); 00266 return (iN ? (dRet / iN) : dRet); 00267 } 00268 00292 template<class tType> 00293 static double Percentile(tType pBegin, tType pEnd, double dPercentile) { 00294 size_t iOne, iTwo, iSize; 00295 double d, dFrac; 00296 00297 iSize = pEnd - pBegin; 00298 std::sort(pBegin, pEnd); 00299 while (iSize && CMeta::IsNaN(pBegin[iSize - 1])) 00300 --iSize; 00301 if (!iSize) 00302 return CMeta::GetNaN(); 00303 d = (iSize - 1) * dPercentile; 00304 dFrac = d - (size_t) d; 00305 iOne = (size_t) d; 00306 iTwo = (size_t) (d + 1); 00307 00308 return ((iTwo >= iSize) ? pBegin[iOne] : ((pBegin[iOne] * (1 00309 - dPercentile)) + (pBegin[iTwo] * dPercentile))); 00310 } 00311 00328 template<class tType> 00329 static double Median(std::vector<tType>& vecData) { 00330 00331 return Percentile(vecData.begin(), vecData.end(), 0.5); 00332 } 00333 00355 template<class tType> 00356 static bool Winsorize(std::vector<tType>& vecValues, size_t iCount = 1) { 00357 00358 return Winsorize(vecValues.begin(), vecValues.end(), iCount); 00359 } 00360 00385 template<class tType> 00386 static bool Winsorize(tType pBegin, tType pEnd, size_t iCount = 1) { 00387 size_t i, iLength; 00388 00389 iLength = pEnd - pBegin; 00390 if (iLength < ((2 * iCount) + 1)) 00391 return false; 00392 std::sort(pBegin, pEnd); 00393 for (i = 0; i < iCount; ++i) { 00394 pBegin[i] = pBegin[i + iCount]; 00395 pEnd[-1 - i] = pEnd[-1 - iCount]; 00396 } 00397 00398 return true; 00399 } 00400 00424 template<class tType> 00425 static double CohensD(const std::vector<tType>& vecOne, const std::vector< 00426 tType>& vecTwo, double* pdAverage = NULL) { 00427 00428 return CohensD(vecOne.begin(), vecOne.end(), vecTwo.begin(), 00429 vecTwo.end(), pdAverage); 00430 } 00431 00461 template<class tType> 00462 static double CohensD(const tType BeginOne, const tType EndOne, 00463 const tType BeginTwo, const tType EndTwo, double* pdAverage = NULL) { 00464 double dAveOne, dAveTwo, dVarOne, dVarTwo, dStd; 00465 size_t iOne, iTwo; 00466 00467 Sums(BeginOne, EndOne, &dAveOne, &dVarOne, &iOne); 00468 if (iOne) { 00469 dAveOne /= iOne; 00470 dVarOne = (dVarOne / iOne) - (dAveOne * dAveOne); 00471 } 00472 if (pdAverage) 00473 *pdAverage = dAveOne; 00474 Sums(BeginTwo, EndTwo, &dAveTwo, &dVarTwo, &iTwo); 00475 if (iTwo) { 00476 dAveTwo /= iTwo; 00477 dVarTwo = (dVarTwo / iTwo) - (dAveTwo * dAveTwo); 00478 } 00479 00480 dStd = sqrt((dVarOne + dVarTwo) / 2); 00481 return (dStd ? ((dAveOne - dAveTwo) / dStd) : ((dAveOne == dAveTwo) ? 0 00482 : DBL_MAX)); 00483 } 00484 00504 template<class tType> 00505 static double ZScore(const std::vector<tType>& vecOne, const std::vector< 00506 tType>& vecTwo) { 00507 00508 return ZScore(vecOne.begin(), vecOne.end(), vecTwo.begin(), 00509 vecTwo.end()); 00510 } 00511 00540 template<class tType> 00541 static double ZScore(const tType BeginOne, const tType EndOne, 00542 const tType BeginTwo, const tType EndTwo, double* pdAverage = NULL) { 00543 double dAveOne, dAveTwo, dVarOne, dVarTwo, dAve, dStd; 00544 size_t iOne, iTwo; 00545 00546 Sums(BeginOne, EndOne, &dAveOne, &dVarOne, &iOne); 00547 Sums(BeginTwo, EndTwo, &dAveTwo, &dVarTwo, &iTwo); 00548 dAve = (iOne || iTwo) ? ((dAveOne + dAveTwo) / (iOne + iTwo)) : 0; 00549 if (iOne) 00550 dAveOne /= iOne; 00551 if (pdAverage) 00552 *pdAverage = dAveOne; 00553 dStd = (iOne || iTwo) ? sqrt(((dVarOne + dVarTwo) / (iOne + iTwo)) 00554 - (dAve * dAve)) : 0; 00555 00556 return (dStd ? ((dAveOne - dAve) / dStd) : ((dAveOne == dAve) ? 0 00557 : DBL_MAX)); 00558 } 00559 00579 static double ZTest(double dZScore, size_t iN) { 00580 00581 return (1 - Normal01CDF(fabs(dZScore) * sqrt((double) iN))); 00582 } 00583 00584 template<class tType, class tIter> 00585 static double AndersonDarlingScore(tIter Begin, tIter End) { 00586 tIter Cur; 00587 double d, dA2, dAve, dStd; 00588 size_t i, iN; 00589 std::vector<tType> vecValues; 00590 00591 dAve = dStd = 0; 00592 for (iN = 0, Cur = Begin; Cur != End; ++iN, ++Cur) { 00593 dAve += *Cur; 00594 dStd += *Cur * *Cur; 00595 } 00596 if (iN < 2) 00597 return CMeta::GetNaN(); 00598 dAve /= iN; 00599 dStd = sqrt((dStd / (iN - 1)) - (dAve * dAve)); 00600 if (dStd <= 0) 00601 dStd = 1; 00602 00603 vecValues.resize(iN); 00604 std::copy(Begin, End, vecValues.begin()); 00605 std::sort(vecValues.begin(), vecValues.end()); 00606 00607 dA2 = 0; 00608 for (i = 0; i < vecValues.size(); ++i) { 00609 d = Normal01CDF((vecValues[i] - dAve) / dStd); 00610 if (d <= std::numeric_limits<double>::epsilon()) 00611 d = std::numeric_limits<double>::epsilon(); 00612 else if ((1 - d) <= std::numeric_limits<double>::epsilon()) 00613 d = 1 - std::numeric_limits<double>::epsilon(); 00614 dA2 += (((2 * (i + 1)) - 1) * log(d)) + (((2 * (iN - i)) - 1) 00615 * log(1 - d)); 00616 } 00617 dA2 = (-dA2 / iN) - iN; 00618 dA2 *= 1 + (0.75 / iN) + (2.25 / (iN * iN)); 00619 00620 return dA2; 00621 } 00622 00623 static double AndersonDarlingTest(double dA2) { 00624 double dRet; 00625 00626 if (dA2 < 0.2) 00627 dRet = 1 - exp(-13.436 + (101.14 * dA2) - (223.73 * dA2 * dA2)); 00628 else if (dA2 < 0.34) 00629 dRet = 1 - exp(-8.318 + (42.796 * dA2) - (59.938 * dA2 * dA2)); 00630 else if (dA2 < 0.6) 00631 dRet = exp(0.9177 - (4.279 * dA2) - (1.38 * dA2 * dA2)); 00632 else if (dA2 < 13) 00633 dRet = exp(1.2937 - (5.709 * dA2) + (0.0186 * dA2 * dA2)); 00634 else 00635 dRet = 0; 00636 00637 return dRet; 00638 } 00639 00666 template<class tType> 00667 static double RootMeanSquareError(tType BeginOne, tType EndOne, 00668 tType BeginTwo, tType EndTwo) { 00669 tType CurOne, CurTwo; 00670 double d, dRet; 00671 size_t iN; 00672 00673 for (dRet = 0, CurOne = BeginOne, CurTwo = BeginTwo; (CurOne != EndOne) 00674 && (CurTwo != EndTwo); ++CurOne, ++CurTwo) { 00675 d = *CurOne - *CurTwo; 00676 dRet += d * d; 00677 } 00678 iN = min(EndOne - BeginOne, EndTwo - BeginTwo); 00679 00680 return (iN ? sqrt(dRet / iN) : 0); 00681 } 00682 00712 template<class tType> 00713 static double JensenShannonDivergence(tType BeginOne, tType EndOne, 00714 tType BeginTwo, tType EndTwo) { 00715 00716 return ((KullbackLeiblerDivergence(BeginOne, EndOne, BeginTwo, EndTwo) 00717 + KullbackLeiblerDivergence(BeginTwo, EndTwo, BeginOne, EndOne)) 00718 / 2); 00719 } 00720 00750 template<class tType> 00751 static double KullbackLeiblerDivergence(tType BeginOne, tType EndOne, 00752 tType BeginTwo, tType EndTwo) { 00753 double dRet; 00754 tType CurOne, CurTwo; 00755 00756 if ((EndOne - BeginOne) != (EndTwo - BeginTwo)) 00757 return CMeta::GetNaN(); 00758 00759 for (dRet = 0, CurOne = BeginOne, CurTwo = BeginTwo; (CurOne != EndOne) 00760 && (CurTwo != EndTwo); ++CurOne, ++CurTwo) 00761 dRet += *CurOne * log(*CurOne / *CurTwo); 00762 00763 return (dRet / log(2.0)); 00764 } 00765 00766 // P-value tests 00767 static double LjungBox(const float* adX, size_t iN, size_t iH); 00768 00783 static double LjungBox(const float* adX, size_t iN) { 00784 00785 return LjungBox(adX, iN, iN - 1); 00786 } 00787 00805 static double PValueLognormal(double dX, double dMean, double dStdev) { 00806 double dCDF; 00807 00808 dCDF = LognormalCDF(dX, dMean, dStdev); 00809 if (dX > exp(dMean)) 00810 dCDF = 1 - dCDF; 00811 00812 return (2 * dCDF); 00813 } 00814 00831 static double PValuePearson(double dR, size_t iN) { 00832 static const double c_dEpsilon = 1e-10; 00833 double dT, dF; 00834 00835 if (iN < 2) 00836 return 1; 00837 if ((1 - dR) < c_dEpsilon) 00838 return 0; 00839 dF = iN - 2; 00840 dT = dR * sqrt(dF / (1 - (dR * dR))); 00841 return (1 - TCDF(dT, dF)); 00842 } 00843 00860 static double PValueSpearman(double dR, size_t iN) { 00861 double dT; 00862 00863 if (iN < 3) 00864 return 1; 00865 00866 // dZ = sqrt( ( iN - 3 ) / 1.06 ) * CStatistics::FisherTransform( dR ); 00867 dT = dR * sqrt((iN - 2) / (1 - (dR * dR))); 00868 return (1 - TCDF(dT, iN - 2)); 00869 } 00870 00871 static double FisherTransform(double dR) { 00872 static const double c_dBound = 0.9999f; 00873 if (fabs(dR) >= c_dBound) 00874 dR *= c_dBound; 00875 return (log((1 + dR) / (1 - dR)) / 2); 00876 } 00877 00897 static double PValueKolmogorovSmirnov(double dD, size_t iM, size_t iN) { 00898 static const float c_dEpsilon1 = 0.001f; 00899 static const float c_dEpsilon2 = 1e-8f; 00900 static const float c_dEpsilon3 = 0.475f; 00901 double d, dRet, dCur, dPrev; 00902 size_t i, iIterations; 00903 00904 if (!dD) 00905 return 1; 00906 00907 d = sqrt((double) (iM * iN) / (iM + iN)); 00908 iIterations = max((size_t) 250, (size_t) (1 / dD)); 00909 dD = -2 * pow(dD * d, 2); 00910 // This is from NR, but it disagrees with R's results 00911 // dD = -2 * pow( dD * ( d + 0.12 + ( 0.11 / d ) ), 2 ); 00912 for (dRet = dPrev = 0, i = 1; i < iIterations; ++i) { 00913 dCur = exp(i * i * dD); 00914 if (!(i % 2)) 00915 dCur *= -1; 00916 dRet += dCur; 00917 d = fabs(dCur); 00918 if ((((d / dRet) < c_dEpsilon1) && (dRet > c_dEpsilon3)) || ((d 00919 / dPrev) < c_dEpsilon1) || ((d / dRet) < c_dEpsilon2)) 00920 break; 00921 dPrev = d; 00922 } 00923 if (dRet < 1) 00924 dRet = min(2 * dRet, 1.0); 00925 00926 return dRet; 00927 } 00928 00955 static double TTestStudent(double dMeanOne, double dVarianceOne, 00956 size_t iNOne, double dMeanTwo, double dVarianceTwo, size_t iNTwo) { 00957 size_t iDegFree; 00958 double dPoolVar, dT; 00959 00960 iDegFree = iNOne + iNTwo - 2; 00961 dPoolVar 00962 = (((iNOne - 1) * dVarianceOne) + ((iNTwo - 1) * dVarianceTwo)) 00963 / iDegFree; 00964 dT = (dMeanOne - dMeanTwo) / sqrt(dPoolVar * ((1.0 / iNOne) + (1.0 00965 / iNTwo))); 00966 00967 return (1 - TCDF(dT, iDegFree)); 00968 } 00969 00989 static double TTest(double dMean, double dVariance, size_t iN) { 00990 size_t iDegFree; 00991 double dT; 00992 00993 iDegFree = iN - 1; 00994 dT = sqrt((float) iN) * dMean / sqrt(dVariance); 00995 00996 return (1 - TCDF(dT, iDegFree)); 00997 } 00998 01025 static double TTestWelch(double dMeanOne, double dVarianceOne, 01026 size_t iNOne, double dMeanTwo, double dVarianceTwo, size_t iNTwo) { 01027 double dDegFree, dT; 01028 01029 dDegFree = (dVarianceOne / iNOne) + (dVarianceTwo / iNTwo); 01030 dDegFree = (dDegFree * dDegFree) / (((dVarianceOne * dVarianceOne) 01031 / iNOne / iNOne / (iNOne - 1)) + ((dVarianceTwo * dVarianceTwo) 01032 / iNTwo / iNTwo / (iNTwo - 1))); 01033 dT = (dMeanOne - dMeanTwo) / sqrt((dVarianceOne / iNOne) 01034 + (dVarianceTwo / iNTwo)); 01035 01036 return (1 - TCDF(dT, dDegFree)); 01037 } 01038 01058 static double FTest(double dVarianceOne, size_t iNOne, double dVarianceTwo, 01059 size_t iNTwo) { 01060 double dRet, dF; 01061 size_t iDF1, iDF2; 01062 01063 if (dVarianceOne < dVarianceTwo) { 01064 std::swap(dVarianceOne, dVarianceTwo); 01065 std::swap(iNOne, iNTwo); 01066 } 01067 dF = dVarianceOne / dVarianceTwo; 01068 iDF1 = iNOne - 1; 01069 iDF2 = iNTwo - 1; 01070 01071 dRet = 2 * IncompleteBeta(0.5 * iDF2, 0.5 * iDF1, iDF2 / (iDF2 + (iDF1 01072 * dF))); 01073 if (dRet > 1) 01074 dRet = 2 - dRet; 01075 01076 return dRet; 01077 } 01078 01079 // Evaluation statistics 01080 static double WilcoxonRankSum(const CDat& DatData, const CDat& DatAnswers, 01081 const std::vector<bool>& vecfGenesOfInterest, bool fInvert = false); 01082 01083 static double WilcoxonRankSum(const CDat& DatData, const CDat& DatAnswers, const std::vector<bool>& vecfGenesOfInterest, const std::vector<bool>& vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert = false); 01084 static double WilcoxonRankSum(const CPCL& DatData, const CPCL& DatAnswers, const std::vector<bool>& vecfGenesOfInterest, const std::vector<bool>& vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert = false); 01085 static double WilcoxonRankSum( const CDat& DatData, const CDat& DatAnswers, const vector<float>& vecGeneWeights, bool flipneg); 01086 static double WilcoxonRankSum( const CDat& DatData, const CDat& DatAnswers, const CDat& wDat, bool flipneg); 01087 // Probability distributions 01088 static double HypergeometricCDF(size_t iBoth, size_t iNonZeroInOne, 01089 size_t iNonZeroInTwo, size_t iN); 01090 static double TwoSidedHypergeometricCDF(size_t iHitsOne, size_t iSizeOne, 01091 size_t iHitsTwo, size_t iSizeTwo); 01092 static double SampleGammaStandard(double dShape); 01093 static double SampleGammaLogStandard(double dXX); 01094 static double SampleNormalStandard(); 01095 static double SampleExponentialStandard(); 01096 01113 static double SkellamPDF(size_t iX, double dMu1, double dMu2) { 01114 01115 return (exp(-(dMu1 + dMu2)) * pow(dMu1 / dMu2, 0.5 * iX) 01116 * ModifiedBesselI(iX, 2 * sqrt(dMu1 * dMu2))); 01117 } 01118 01141 static double BinomialCDF(size_t iObservations, size_t iSample, 01142 double dProbability) { 01143 double d; 01144 01145 d = (iObservations - (iSample * dProbability)) / sqrt(iSample 01146 * dProbability * (1 - dProbability)); 01147 return (1 - NormalCDF(d, 0, 1)); 01148 } 01149 01173 static double HypergeometricPDF(size_t iNonZeroInCommon, 01174 size_t iNonZeroInOne, size_t iNonZeroInTwo, size_t iTotalNumValues) { 01175 01176 return exp(LogFact(iTotalNumValues - iNonZeroInTwo) + LogFact( 01177 iNonZeroInTwo) //right margin 01178 + LogFact(iNonZeroInOne) + LogFact(iTotalNumValues 01179 - iNonZeroInOne) // bottom margin 01180 - LogFact(iTotalNumValues) // total 01181 - LogFact(iNonZeroInCommon) //1,1 01182 - LogFact(iNonZeroInTwo - iNonZeroInCommon) //1,0 01183 - LogFact(iTotalNumValues - iNonZeroInTwo + iNonZeroInCommon 01184 - iNonZeroInOne) //0, 0 01185 - LogFact(iNonZeroInOne - iNonZeroInCommon) //0,1 01186 ); 01187 } 01188 01202 static double TCDF(double dT, double dDF) { 01203 01204 return (1 - IncompleteBeta(0.5 * dDF, 0.5, dDF / (dDF + (dT * dT)))); 01205 } 01206 01223 static double LognormalCDF(double dX, double dMean, double dStdev) { 01224 01225 return ((dX > 0) ? NormalCDF(log(dX), dMean, dStdev) : 0); 01226 } 01227 01246 static double InverseGaussianCDF(double dX, double dMean, double dLambda) { 01247 01248 return (Normal01CDF(sqrt(dLambda / dX) * ((dX / dMean) - 1)) + exp((2 01249 * dLambda / dMean) + log(Normal01CDF(-sqrt(dLambda / dX) * ((dX 01250 / dMean) + 1))))); 01251 } 01252 01269 static double NormalCDF(double dX, double dMean, double dStdev) { 01270 01271 return Normal01CDF((dX - dMean) / dStdev); 01272 } 01273 01284 static double Normal01CDF(double dX) { 01285 01286 return CStatisticsImpl::Normal01CDF(dX); 01287 } 01288 01299 static double InverseNormal01CDF(double dX); 01300 01335 static double MultivariateNormalCDF(const std::vector<float>& vecdX, 01336 const std::vector<float>& vecdMu, 01337 const CDataMatrix& MatSigmaCholesky, size_t iN = 1, 01338 float dMaxError = 0.01, float dMaxCI = 0.99, size_t iMaxIterations = 01339 300) { 01340 std::vector<double> vecdDiff, vecdY, vecdF; 01341 size_t i, j, iIterations; 01342 double d, dRet, dVar, dError, dAlpha, dQ, dN; 01343 01344 if (vecdX.empty() || (vecdX.size() != vecdMu.size()) || (vecdX.size() 01345 != MatSigmaCholesky.GetRows()) || (vecdX.size() 01346 != MatSigmaCholesky.GetColumns())) 01347 return CMeta::GetNaN(); 01348 01349 dN = sqrt((double) iN); 01350 vecdDiff.resize(vecdX.size()); 01351 for (i = 0; i < vecdDiff.size(); ++i) 01352 vecdDiff[i] = vecdX[i] - vecdMu[i]; 01353 dAlpha = InverseNormal01CDF(dMaxCI); 01354 vecdF.resize(vecdX.size()); 01355 vecdF[0] = Normal01CDF(dN * vecdDiff[0] / MatSigmaCholesky.Get(0, 0)); 01356 vecdY.resize(vecdX.size()); 01357 01358 dRet = dVar = 0; 01359 dError = 2 * dMaxError; 01360 for (iIterations = 1; (iIterations <= iMaxIterations) && (dError 01361 > dMaxError); ++iIterations) { 01362 for (i = 1; i < vecdY.size(); ++i) { 01363 vecdY[i - 1] = InverseNormal01CDF(vecdF[i - 1] * rand() 01364 / RAND_MAX); 01365 dQ = 0; 01366 for (j = 0; j < i; ++j) 01367 dQ += MatSigmaCholesky.Get(j, i) * vecdY[j] / dN; 01368 vecdF[i] = Normal01CDF(dN * (vecdDiff[i] - dQ) 01369 / MatSigmaCholesky.Get(i, i)) * vecdF[i - 1]; 01370 } 01371 d = (vecdF.back() - dRet) / iIterations; 01372 dRet += d; 01373 dVar = ((iIterations - 2) * dVar / iIterations) + (d * d); 01374 dError = dAlpha * sqrt(dVar); 01375 } 01376 01377 return dRet; 01378 } 01379 01403 static double MultivariateNormalPDF(const std::vector<float>& vecdX, 01404 const std::vector<float>& vecdMu, const CDataMatrix& MatSigma) { 01405 CDataMatrix MatLU, MatInv; 01406 vector<size_t> veciIndices; 01407 bool fEven; 01408 double dDet; 01409 01410 if (!MatSigma.GetRows() 01411 || (MatSigma.GetRows() != MatSigma.GetColumns())) 01412 return CMeta::GetNaN(); 01413 01414 MatLU.Initialize(MatSigma.GetRows(), MatSigma.GetColumns()); 01415 MatLU.Open(MatSigma); 01416 MatrixLUDecompose(MatLU, veciIndices, fEven); 01417 MatrixLUInvert(MatLU, veciIndices, MatInv); 01418 dDet = MatrixLUDeterminant(MatLU, fEven); 01419 01420 return MultivariateNormalPDF(vecdX, vecdMu, sqrt(dDet), MatInv); 01421 } 01422 01449 static double MultivariateNormalPDF(const std::vector<float>& vecdX, 01450 const std::vector<float>& vecdMu, double dSigmaDetSqrt, 01451 const CDataMatrix& MatSigmaInv) { 01452 size_t i; 01453 vector<float> vecdXmMu, vecdXmMutS; 01454 double d; 01455 01456 if (!MatSigmaInv.GetRows() || (MatSigmaInv.GetRows() 01457 != MatSigmaInv.GetColumns()) || vecdX.empty() || (vecdX.size() 01458 != vecdMu.size())) 01459 return CMeta::GetNaN(); 01460 01461 vecdXmMu.resize(vecdX.size()); 01462 for (i = 0; i < vecdXmMu.size(); ++i) 01463 if (CMeta::IsNaN(vecdXmMu[i] = vecdX[i] - vecdMu[i])) 01464 vecdXmMu[i] = 0; 01465 MatrixMultiply(vecdXmMu, MatSigmaInv, vecdXmMutS); 01466 d = MatrixMultiply(vecdXmMutS, vecdXmMu); 01467 01468 return ((1 / pow(2 * 3.1415926535898, vecdX.size() / 2.0) 01469 / dSigmaDetSqrt) * exp(-d / 2)); 01470 } 01471 01485 static double SampleChi2(size_t iDF) { 01486 01487 return (2 * SampleGamma(1, iDF / 2)); 01488 } 01489 01506 static double Chi2CDF(double dC2, size_t iDF) { 01507 01508 return CStatisticsImpl::Chi2CDF(sqrt(dC2), 0, 1, iDF); 01509 } 01510 01524 static double SampleGamma(double dLocation, double dShape) { 01525 01526 return (SampleGammaStandard(dShape) / dLocation); 01527 } 01528 01556 static double BetaPDF(double dX, double dMinimum, double dMaximum, 01557 double dAlpha, double dBeta) { 01558 double dFunc, dLocation, dScale; 01559 01560 dLocation = dMinimum; 01561 dScale = dMaximum - dMinimum; 01562 dX = (dX - dLocation) / dScale; 01563 dFunc = exp(SampleGammaLogStandard(dAlpha) + SampleGammaLogStandard( 01564 dBeta) - SampleGammaLogStandard(dAlpha + dBeta)); 01565 return (pow(dX, dAlpha - 1) * pow(1 - dX, dBeta - 1) / dFunc / dScale); 01566 } 01567 01584 static double NormalPDF(double dX, double dMu, double dSigma) { 01585 static const double c_dS2P = sqrt(2 * 3.1415926535898); 01586 double d; 01587 01588 d = dX - dMu; 01589 return (exp(-(d * d) / (2 * dSigma * dSigma)) / (dSigma * c_dS2P)); 01590 } 01591 01605 static double ExponentialPDF(double dX, double dLambda) { 01606 01607 return (dLambda * exp(-dLambda * dX)); 01608 } 01609 01623 static bool CholeskyDecomposition(CDataMatrix& Matrix) { 01624 std::vector<float> vecdP; 01625 size_t i, j, k; 01626 float dSum; 01627 01628 if (Matrix.GetRows() != Matrix.GetColumns()) 01629 return false; 01630 01631 vecdP.resize(Matrix.GetRows()); 01632 for (i = 0; i < vecdP.size(); ++i) 01633 for (j = i; j < vecdP.size(); ++j) { 01634 dSum = Matrix.Get(i, j); 01635 for (k = 0; k < i; ++k) 01636 dSum -= Matrix.Get(i, k) * Matrix.Get(j, k); 01637 if (i == j) 01638 vecdP[i] 01639 = (dSum <= 0) ? std::numeric_limits<float>::epsilon() 01640 : sqrt(dSum); 01641 else 01642 Matrix.Set(j, i, dSum / vecdP[i]); 01643 } 01644 01645 for (i = 0; i < vecdP.size(); ++i) 01646 for (j = i; j < vecdP.size(); ++j) 01647 Matrix.Set(i, j, (i == j) ? vecdP[i] : Matrix.Get(j, i)); 01648 return true; 01649 } 01650 01651 // Matrix operations 01652 static bool MatrixLUDecompose(CDataMatrix& Mat, 01653 std::vector<size_t>& veciIndices, bool& fEven); 01654 static bool MatrixLUInvert(CDataMatrix& MatLU, 01655 const std::vector<size_t>& veciIndices, CDataMatrix& MatInv); 01656 01677 template<class tType> 01678 static double MatrixLUDeterminant(const CFullMatrix<tType>& MatLU, 01679 bool fEven) { 01680 double dRet; 01681 size_t i; 01682 01683 if (MatLU.GetRows() != MatLU.GetColumns()) 01684 return CMeta::GetNaN(); 01685 01686 dRet = fEven ? 1 : -1; 01687 for (i = 0; i < MatLU.GetRows(); ++i) 01688 dRet *= MatLU.Get(i, i); 01689 01690 return dRet; 01691 } 01692 }; 01693 01694 } 01695 01696 #endif // STATISTICS_H