Sleipnir
|
Utility class containing static statistics functions. More...
#include <statistics.h>
Static Public Member Functions | |
static double | FScore (size_t iTruePositives, size_t iFalsePositives, size_t iTrueNegatives, size_t iFalseNegatives, double dBeta=1) |
Calculate a precision/recall f-score. | |
static double | Precision (size_t iTruePositives, size_t iFalsePositives, size_t iTrueNegatives, size_t iFalseNegatives) |
Calculate the precision of a predicted positive set. | |
static double | Recall (size_t iTruePositives, size_t iFalsePositives, size_t iTrueNegatives, size_t iFalseNegatives) |
Calculate the recall of predictions over some positive set. | |
template<class tType > | |
static double | Variance (const tType Begin, const tType End, double dMean) |
Calculate the sample variance of a given sequence. | |
template<class tType > | |
static double | NormalizeMeanStd (const tType Begin, const tType End) |
template<class tType > | |
static double | Variance (const tType Begin, const tType End) |
Calculate the sample variance of a given sequence. | |
template<class tType > | |
static double | Variance (const std::vector< tType > &vecValues, double dMean) |
Calculate the sample variance of a given vector. | |
template<class tType > | |
static double | Variance (const std::vector< tType > &vecValues) |
Calculate the sample variance of a given vector. | |
template<class tType > | |
static double | Average (const std::vector< tType > &vecValues) |
Calculate the mean of a given vector. | |
template<class tType > | |
static double | Average (const tType Begin, const tType End) |
Calculate the mean of a given sequence. | |
template<class tType > | |
static double | Percentile (tType pBegin, tType pEnd, double dPercentile) |
Returns the value at the requested percentile of the given sequence. | |
template<class tType > | |
static double | Median (std::vector< tType > &vecData) |
Returns the median value of the given vector. | |
template<class tType > | |
static bool | Winsorize (std::vector< tType > &vecValues, size_t iCount=1) |
Winsorizes the requested number of items at each end of the given vector. | |
template<class tType > | |
static bool | Winsorize (tType pBegin, tType pEnd, size_t iCount=1) |
Winsorizes the requested number of items at each end of the given sequence. | |
template<class tType > | |
static double | CohensD (const std::vector< tType > &vecOne, const std::vector< tType > &vecTwo, double *pdAverage=NULL) |
Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance. | |
template<class tType > | |
static double | CohensD (const tType BeginOne, const tType EndOne, const tType BeginTwo, const tType EndTwo, double *pdAverage=NULL) |
Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance. | |
template<class tType > | |
static double | ZScore (const std::vector< tType > &vecOne, const std::vector< tType > &vecTwo) |
Calculates the z-score effect size measure. | |
template<class tType > | |
static double | ZScore (const tType BeginOne, const tType EndOne, const tType BeginTwo, const tType EndTwo, double *pdAverage=NULL) |
Calculates the z-score effect size measure. | |
static double | ZTest (double dZScore, size_t iN) |
Returns the z-test p-value of the given z-score and element count. | |
template<class tType , class tIter > | |
static double | AndersonDarlingScore (tIter Begin, tIter End) |
static double | AndersonDarlingTest (double dA2) |
template<class tType > | |
static double | RootMeanSquareError (tType BeginOne, tType EndOne, tType BeginTwo, tType EndTwo) |
Returns the root-mean-square error distance between two input arrays. | |
template<class tType > | |
static double | JensenShannonDivergence (tType BeginOne, tType EndOne, tType BeginTwo, tType EndTwo) |
Returns the Jensen-Shannon divergence between two discrete probability distributions represented as arrays. | |
template<class tType > | |
static double | KullbackLeiblerDivergence (tType BeginOne, tType EndOne, tType BeginTwo, tType EndTwo) |
Returns the Kullback-Leibler divergence between two discrete probability distributions represented as arrays. | |
static double | LjungBox (const float *adX, size_t iN, size_t iH) |
Calculates the p-value of a Ljung-Box portmanteau test for autocorrelation randomness. | |
static double | LjungBox (const float *adX, size_t iN) |
Calculate the p-value of a Ljung-Box portmanteau test for autocorrelation randomness. | |
static double | PValueLognormal (double dX, double dMean, double dStdev) |
Return the two-tailed p-value of a lognormal distribution. | |
static double | PValuePearson (double dR, size_t iN) |
Return the two-tailed p-value of a Pearson correlation. | |
static double | PValueSpearman (double dR, size_t iN) |
Return the two-tailed p-value of a Spearman correlation. | |
static double | FisherTransform (double dR) |
static double | PValueKolmogorovSmirnov (double dD, size_t iM, size_t iN) |
Returns the p-value corresponding to a D-score obtained from a Kolmogorov-Smirnov test. | |
static double | TTestStudent (double dMeanOne, double dVarianceOne, size_t iNOne, double dMeanTwo, double dVarianceTwo, size_t iNTwo) |
Return the p-value of a t-test between the two given array statistics assuming equal variance. | |
static double | TTest (double dMean, double dVariance, size_t iN) |
Return the p-value of a t-test between the given array statistics and zero. | |
static double | TTestWelch (double dMeanOne, double dVarianceOne, size_t iNOne, double dMeanTwo, double dVarianceTwo, size_t iNTwo) |
Return the p-value of a t-test between the two given array statistics without assuming equal variance. | |
static double | FTest (double dVarianceOne, size_t iNOne, double dVarianceTwo, size_t iNTwo) |
Return the p-value of an f-test between the two given array statistics to determine equality of variance. | |
static double | WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const std::vector< bool > &vecfGenesOfInterest, bool fInvert=false) |
Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set. | |
static double | WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const std::vector< bool > &vecfGenesOfInterest, const std::vector< bool > &vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert=false) |
Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set. Allows for ubiquitous genes. | |
static double | WilcoxonRankSum (const CPCL &DatData, const CPCL &DatAnswers, const std::vector< bool > &vecfGenesOfInterest, const std::vector< bool > &vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert=false) |
static double | WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const vector< float > &vecGeneWeights, bool flipneg) |
static double | WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const CDat &wDat, bool flipneg) |
static double | HypergeometricCDF (size_t iBoth, size_t iNonZeroInOne, size_t iNonZeroInTwo, size_t iN) |
Calculates the hypergeometric p-value given the sizes and overlap of two sets. | |
static double | TwoSidedHypergeometricCDF (size_t iHitsOne, size_t iSizeOne, size_t iHitsTwo, size_t iSizeTwo) |
Calculates the two sided hypergeometric p-value given the sizes and overlap of two sets. | |
static double | SampleGammaStandard (double dShape) |
Return a random sample from a standard gamma function with the given shape parameter. | |
static double | SampleGammaLogStandard (double dXX) |
Return a random sample from a gamma log function with the given parameter. | |
static double | SampleNormalStandard () |
Return a random sample from a standard normal distribution. | |
static double | SampleExponentialStandard () |
Return a random sample from a standard exponential distribution. | |
static double | SkellamPDF (size_t iX, double dMu1, double dMu2) |
Calculate the Skellam probability distribution given a point and the two distribution parameters. | |
static double | BinomialCDF (size_t iObservations, size_t iSample, double dProbability) |
Return the binomial p-value of obtaining at least the given sample with the given number of observations and probability of success. | |
static double | HypergeometricPDF (size_t iNonZeroInCommon, size_t iNonZeroInOne, size_t iNonZeroInTwo, size_t iTotalNumValues) |
Calculate the hypergeometric probability distribution given the sizes and overlap of two sets. | |
static double | TCDF (double dT, double dDF) |
Calculate a p-value for the given T and degrees of freedom. | |
static double | LognormalCDF (double dX, double dMean, double dStdev) |
CDF of a lognormal distribution with the given parameters at the given point. | |
static double | InverseGaussianCDF (double dX, double dMean, double dLambda) |
CDF of an inverse Gaussian distribution with the given parameters at the given point. | |
static double | NormalCDF (double dX, double dMean, double dStdev) |
CDF of a normal distribution with the given parameters at the given point. | |
static double | Normal01CDF (double dX) |
CDF of a standard normal distribution. | |
static double | InverseNormal01CDF (double dX) |
Inverse CDF of a standard normal distribution. | |
static double | MultivariateNormalCDF (const std::vector< float > &vecdX, const std::vector< float > &vecdMu, const CDataMatrix &MatSigmaCholesky, size_t iN=1, float dMaxError=0.01, float dMaxCI=0.99, size_t iMaxIterations=300) |
CDF of a multivariate normal distribution with the given parameters at the given point. | |
static double | MultivariateNormalPDF (const std::vector< float > &vecdX, const std::vector< float > &vecdMu, const CDataMatrix &MatSigma) |
Calculates the value of a multivariate normal probability density function at the requested point. | |
static double | MultivariateNormalPDF (const std::vector< float > &vecdX, const std::vector< float > &vecdMu, double dSigmaDetSqrt, const CDataMatrix &MatSigmaInv) |
Calculates the value of a multivariate normal probability density function at the requested point. | |
static double | SampleChi2 (size_t iDF) |
Return a random sample from a chi-squared distribution with the given degrees of freedom. | |
static double | Chi2CDF (double dC2, size_t iDF) |
Return the value of a chi-squared cumulative density function at the requested point. | |
static double | SampleGamma (double dLocation, double dShape) |
Return a random sample from a gamma distribution with the given location and shape parameters. | |
static double | BetaPDF (double dX, double dMinimum, double dMaximum, double dAlpha, double dBeta) |
Calculate a beta probability density at the given point with the given minimum, maximu, alpha, and beta parameters. | |
static double | NormalPDF (double dX, double dMu, double dSigma) |
Calculate a normal probability density at the given point for the given mean and standard deviation. | |
static double | ExponentialPDF (double dX, double dLambda) |
Returns the value of an exponential probability density function at the requested point. | |
static bool | CholeskyDecomposition (CDataMatrix &Matrix) |
Calculates the Cholesky decomposition of the given matrix, overwriting it in the process. | |
static bool | MatrixLUDecompose (CDataMatrix &Mat, std::vector< size_t > &veciIndices, bool &fEven) |
Replaces the given matrix with its LU decomposition. | |
static bool | MatrixLUInvert (CDataMatrix &MatLU, const std::vector< size_t > &veciIndices, CDataMatrix &MatInv) |
Calculates the inverse of a matrix given its LU decomposition. | |
template<class tType > | |
static double | MatrixLUDeterminant (const CFullMatrix< tType > &MatLU, bool fEven) |
Calculate the determinant of a matrix given its LU decomposition. |
Utility class containing static statistics functions.
CStatistics contains a collection of static utility functions, mainly various statistical distributions. Credit for many of these implementations goes to Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.
Definition at line 44 of file statistics.h.
static double Sleipnir::CStatistics::Average | ( | const std::vector< tType > & | vecValues | ) | [inline, static] |
Calculate the mean of a given vector.
vecValues | Vector over which average is calculated. |
Definition at line 239 of file statistics.h.
Referenced by Sleipnir::CPCL::Normalize().
static double Sleipnir::CStatistics::Average | ( | const tType | Begin, |
const tType | End | ||
) | [inline, static] |
Calculate the mean of a given sequence.
Begin | Beginning of the sequence over which average is calculated. |
End | End of the sequence over which average is calculated. |
Definition at line 261 of file statistics.h.
static double Sleipnir::CStatistics::BetaPDF | ( | double | dX, |
double | dMinimum, | ||
double | dMaximum, | ||
double | dAlpha, | ||
double | dBeta | ||
) | [inline, static] |
Calculate a beta probability density at the given point with the given minimum, maximu, alpha, and beta parameters.
dX | Point at which to sample the beta probability distribution. |
dMinimum | Minimum parameter of the beta distribution. |
dMaximum | Maximum parameter of the beta distribution. |
dAlpha | Alpha parameter of the beta distribution. |
dBeta | Beta parameter of the beta distribution. |
Definition at line 1556 of file statistics.h.
References SampleGammaLogStandard().
static double Sleipnir::CStatistics::BinomialCDF | ( | size_t | iObservations, |
size_t | iSample, | ||
double | dProbability | ||
) | [inline, static] |
Return the binomial p-value of obtaining at least the given sample with the given number of observations and probability of success.
iObservations | Count parameter of the binomial. |
iSample | Point at which to sample the CDF. |
dProbability | Probability parameter of the binomial. |
Definition at line 1141 of file statistics.h.
References NormalCDF().
static double Sleipnir::CStatistics::Chi2CDF | ( | double | dC2, |
size_t | iDF | ||
) | [inline, static] |
Return the value of a chi-squared cumulative density function at the requested point.
dC2 | Point at which chi-squared CDF should be evaluated. |
iDF | Degrees of freedom of the chi-squared distribution. |
Definition at line 1506 of file statistics.h.
static bool Sleipnir::CStatistics::CholeskyDecomposition | ( | CDataMatrix & | Matrix | ) | [inline, static] |
Calculates the Cholesky decomposition of the given matrix, overwriting it in the process.
Matrix | Matrix to be decomposed. |
Definition at line 1623 of file statistics.h.
References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CFullMatrix< tType >::GetRows(), and Sleipnir::CFullMatrix< tType >::Set().
static double Sleipnir::CStatistics::CohensD | ( | const std::vector< tType > & | vecOne, |
const std::vector< tType > & | vecTwo, | ||
double * | pdAverage = NULL |
||
) | [inline, static] |
Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance.
vecOne | First array to be compared. |
vecTwo | Second array to be compared. |
pdAverage | If non-null, output average value of elements in both arrays. |
Calculate's Cohen's D, a z-score-like effect size measurement based on unpooled variance. For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2). Cohen's D uses the unpooled variance 2(m(A1)-m(A2))/(s(A1)+s(A2)).
Definition at line 425 of file statistics.h.
static double Sleipnir::CStatistics::CohensD | ( | const tType | BeginOne, |
const tType | EndOne, | ||
const tType | BeginTwo, | ||
const tType | EndTwo, | ||
double * | pdAverage = NULL |
||
) | [inline, static] |
Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance.
BeginOne | First element of the first array to be compared. |
EndOne | End of last element of the first array to be compared. |
BeginTwo | First element of the second array to be compared. |
EndTwo | End of last element of the second array to be compared. |
pdAverage | If non-null, output average value of elements in both arrays. |
Calculate's Cohen's D, a z-score-like effect size measurement based on unpooled variance. For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2). Cohen's D uses the unpooled variance 2(m(A1)-m(A2))/(s(A1)+s(A2)).
Definition at line 462 of file statistics.h.
static double Sleipnir::CStatistics::ExponentialPDF | ( | double | dX, |
double | dLambda | ||
) | [inline, static] |
Returns the value of an exponential probability density function at the requested point.
dX | Point at which the exponential PDF is evaluated. |
dLambda | Rate parameter of the exponential PDF to be evaluated. |
Definition at line 1605 of file statistics.h.
double Sleipnir::CStatistics::FScore | ( | size_t | iTruePositives, |
size_t | iFalsePositives, | ||
size_t | iTrueNegatives, | ||
size_t | iFalseNegatives, | ||
double | dBeta = 1 |
||
) | [static] |
Calculate a precision/recall f-score.
iTruePositives | Number of true positives. |
iFalsePositives | Number of false positives. |
iTrueNegatives | Number of true negatives. |
iFalseNegatives | Number of false negatives. |
dBeta | Relative weight given to precision; 0 ignores precision, 1 weights precision and recall equally, and arbitrary large values ignore recall. |
Definition at line 910 of file statistics.cpp.
References Precision(), and Recall().
static double Sleipnir::CStatistics::FTest | ( | double | dVarianceOne, |
size_t | iNOne, | ||
double | dVarianceTwo, | ||
size_t | iNTwo | ||
) | [inline, static] |
Return the p-value of an f-test between the two given array statistics to determine equality of variance.
dVarianceOne | Variance of the first sample. |
iNOne | Number of elements in the first sample. |
dVarianceTwo | Variance of the second sample. |
iNTwo | Number of elements in the second sample. |
Definition at line 1058 of file statistics.h.
double Sleipnir::CStatistics::HypergeometricCDF | ( | size_t | iBoth, |
size_t | iNonZeroInOne, | ||
size_t | iNonZeroInTwo, | ||
size_t | iN | ||
) | [static] |
Calculates the hypergeometric p-value given the sizes and overlap of two sets.
iBoth | Number of non-zero in both query sets together. |
iNonZeroInOne | Number of non-zero in query one |
iTwo | Number of non-zero in query two |
iN | Total number of sites in the query. |
Definition at line 824 of file statistics.cpp.
References HypergeometricPDF().
Referenced by Sleipnir::CMeasureHypergeometric::Measure(), and Sleipnir::CDat::Open().
static double Sleipnir::CStatistics::HypergeometricPDF | ( | size_t | iNonZeroInCommon, |
size_t | iNonZeroInOne, | ||
size_t | iNonZeroInTwo, | ||
size_t | iTotalNumValues | ||
) | [inline, static] |
Calculate the hypergeometric probability distribution given the sizes and overlap of two sets.
iNonZeroInCommon | Number of non zero values that both share. |
iNonZeroInOne | Size of the first (query) set. |
iNonZeroInTwo | Number of hits in the second (background) set. |
iTotalNumValues | Total number of values that were compared. |
Definition at line 1173 of file statistics.h.
References Sleipnir::CMath::LogFact().
Referenced by HypergeometricCDF(), and TwoSidedHypergeometricCDF().
static double Sleipnir::CStatistics::InverseGaussianCDF | ( | double | dX, |
double | dMean, | ||
double | dLambda | ||
) | [inline, static] |
CDF of an inverse Gaussian distribution with the given parameters at the given point.
dX | Sample point. |
dMean | Mean of inverse Gaussian. |
dLambda | Lambda parameter of inverse Gaussian. |
Definition at line 1246 of file statistics.h.
References Normal01CDF().
double Sleipnir::CStatistics::InverseNormal01CDF | ( | double | dX | ) | [static] |
Inverse CDF of a standard normal distribution.
dX | Sample point. |
Definition at line 1518 of file statistics.cpp.
References Sleipnir::CMeta::GetNaN(), and Normal01CDF().
Referenced by MultivariateNormalCDF().
static double Sleipnir::CStatistics::JensenShannonDivergence | ( | tType | BeginOne, |
tType | EndOne, | ||
tType | BeginTwo, | ||
tType | EndTwo | ||
) | [inline, static] |
Returns the Jensen-Shannon divergence between two discrete probability distributions represented as arrays.
BeginOne | First element of the first array to be compared. |
EndOne | End of last element of the first array to be compared. |
BeginTwo | First element of the second array to be compared. |
EndTwo | End of last element of the second array to be compared. |
The Jensen-Shannon or JS-divergence between two discrete probability distributions A1 and A2 is a symmetrized Kullback-Leibler divergence defined as ( KL(A1, A2) + KL(A2, A1 ) / 2.
Definition at line 713 of file statistics.h.
References KullbackLeiblerDivergence().
static double Sleipnir::CStatistics::KullbackLeiblerDivergence | ( | tType | BeginOne, |
tType | EndOne, | ||
tType | BeginTwo, | ||
tType | EndTwo | ||
) | [inline, static] |
Returns the Kullback-Leibler divergence between two discrete probability distributions represented as arrays.
BeginOne | First element of the first array to be compared. |
EndOne | End of last element of the first array to be compared. |
BeginTwo | First element of the second array to be compared. |
EndTwo | End of last element of the second array to be compared. |
The Kullback-Leibler or KL-divergence between two discrete probability distributions A1 and A2 is defined as sum( A1[i] * log( A1[i]/A2[i] ) ).
Definition at line 751 of file statistics.h.
References Sleipnir::CMeta::GetNaN().
Referenced by JensenShannonDivergence().
double Sleipnir::CStatistics::LjungBox | ( | const float * | adX, |
size_t | iN, | ||
size_t | iH | ||
) | [static] |
Calculates the p-value of a Ljung-Box portmanteau test for autocorrelation randomness.
adX | Array of values to test. |
iN | Number of values in array. |
iH | Number of lags to test, at most iN - 1. |
Definition at line 62 of file statistics.cpp.
Referenced by LjungBox().
static double Sleipnir::CStatistics::LjungBox | ( | const float * | adX, |
size_t | iN | ||
) | [inline, static] |
Calculate the p-value of a Ljung-Box portmanteau test for autocorrelation randomness.
adX | Array of values to test. |
iN | Number of values in array. |
Definition at line 783 of file statistics.h.
References LjungBox().
static double Sleipnir::CStatistics::LognormalCDF | ( | double | dX, |
double | dMean, | ||
double | dStdev | ||
) | [inline, static] |
CDF of a lognormal distribution with the given parameters at the given point.
dX | Sample point. |
dMean | Mean of lognormal. |
dStdev | Standard deviation of lognormal. |
Definition at line 1223 of file statistics.h.
References NormalCDF().
Referenced by PValueLognormal().
bool Sleipnir::CStatistics::MatrixLUDecompose | ( | CDataMatrix & | Mat, |
std::vector< size_t > & | veciIndices, | ||
bool & | fEven | ||
) | [static] |
Replaces the given matrix with its LU decomposition.
Mat | Matrix to be LU decomposed; overwritten during the calculation. |
veciIndices | Row permutations of the LU decomposition pivots. |
fEven | True if decomposition is of even rank; false otherwise. |
Definition at line 1595 of file statistics.cpp.
References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CFullMatrix< tType >::GetRows(), and Sleipnir::CFullMatrix< tType >::Set().
Referenced by MultivariateNormalPDF().
static double Sleipnir::CStatistics::MatrixLUDeterminant | ( | const CFullMatrix< tType > & | MatLU, |
bool | fEven | ||
) | [inline, static] |
Calculate the determinant of a matrix given its LU decomposition.
MatLU | LU decomposition of matrix of interest. |
fEven | True if LU decomposition of even rank; false otherwise. |
Definition at line 1678 of file statistics.h.
References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), and Sleipnir::CFullMatrix< tType >::GetRows().
Referenced by MultivariateNormalPDF().
bool Sleipnir::CStatistics::MatrixLUInvert | ( | CDataMatrix & | MatLU, |
const std::vector< size_t > & | veciIndices, | ||
CDataMatrix & | MatInv | ||
) | [static] |
Calculates the inverse of a matrix given its LU decomposition.
MatLU | LU decomposition of matrix of interest; destroyed during the inversion calculation. |
veciIndices | Row permutations of the LU decomposition pivots. |
MatInv | Resulting inverted matrix. |
Definition at line 1698 of file statistics.cpp.
References Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), and Sleipnir::CFullMatrix< tType >::Set().
Referenced by MultivariateNormalPDF().
static double Sleipnir::CStatistics::Median | ( | std::vector< tType > & | vecData | ) | [inline, static] |
Returns the median value of the given vector.
vecData | Vector from which median value is returned. |
Definition at line 329 of file statistics.h.
References Percentile().
static double Sleipnir::CStatistics::MultivariateNormalCDF | ( | const std::vector< float > & | vecdX, |
const std::vector< float > & | vecdMu, | ||
const CDataMatrix & | MatSigmaCholesky, | ||
size_t | iN = 1 , |
||
float | dMaxError = 0.01 , |
||
float | dMaxCI = 0.99 , |
||
size_t | iMaxIterations = 300 |
||
) | [inline, static] |
CDF of a multivariate normal distribution with the given parameters at the given point.
vecdX | Sample point. |
vecdMu | Mean of normal. |
MatSigmaCholesky | Cholesky decomposition of covariance matrix of normal. |
iN | Number of elements used to calculate sample point. |
dMaxError | Performance parameter; maximum error tolerance of return value. |
dMaxCI | Performance parameter; confidence interval of return value. |
iMaxIterations | Performance parameter; maximum iteratios to calculate return value. |
Definition at line 1335 of file statistics.h.
References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), InverseNormal01CDF(), and Normal01CDF().
static double Sleipnir::CStatistics::MultivariateNormalPDF | ( | const std::vector< float > & | vecdX, |
const std::vector< float > & | vecdMu, | ||
const CDataMatrix & | MatSigma | ||
) | [inline, static] |
Calculates the value of a multivariate normal probability density function at the requested point.
vecdX | Point at which multivariate normal distribution is evaluated. |
vecdMu | Mean of multivariate normal distribution. |
MatSigma | Covariance matrix of multivariate normal distribution. |
Definition at line 1403 of file statistics.h.
References Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), MatrixLUDecompose(), MatrixLUDeterminant(), MatrixLUInvert(), and Sleipnir::CFullMatrix< tType >::Open().
static double Sleipnir::CStatistics::MultivariateNormalPDF | ( | const std::vector< float > & | vecdX, |
const std::vector< float > & | vecdMu, | ||
double | dSigmaDetSqrt, | ||
const CDataMatrix & | MatSigmaInv | ||
) | [inline, static] |
Calculates the value of a multivariate normal probability density function at the requested point.
vecdX | Point at which multivariate normal distribution is evaluated. |
vecdMu | Mean of multivariate normal distribution. |
dSigmaDetSqrt | Square root of the determinant of the requested distribution's covariance matrix. |
MatSigmaInv | Inverse of the requested distribution's covariance matrix. |
Definition at line 1449 of file statistics.h.
References Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), and Sleipnir::CMeta::IsNaN().
static double Sleipnir::CStatistics::Normal01CDF | ( | double | dX | ) | [inline, static] |
CDF of a standard normal distribution.
dX | Sample point. |
Reimplemented from Sleipnir::CStatisticsImpl.
Definition at line 1284 of file statistics.h.
Referenced by InverseGaussianCDF(), InverseNormal01CDF(), MultivariateNormalCDF(), NormalCDF(), and ZTest().
static double Sleipnir::CStatistics::NormalCDF | ( | double | dX, |
double | dMean, | ||
double | dStdev | ||
) | [inline, static] |
CDF of a normal distribution with the given parameters at the given point.
dX | Sample point. |
dMean | Mean of normal. |
dStdev | Standard deviation of normal. |
Definition at line 1269 of file statistics.h.
References Normal01CDF().
Referenced by BinomialCDF(), and LognormalCDF().
static double Sleipnir::CStatistics::NormalPDF | ( | double | dX, |
double | dMu, | ||
double | dSigma | ||
) | [inline, static] |
Calculate a normal probability density at the given point for the given mean and standard deviation.
dX | Point at which to calculate the normal distribution. |
dMu | Mean of the normal distribution. |
dSigma | Standard deviation of the normal distribution. |
Definition at line 1584 of file statistics.h.
static double Sleipnir::CStatistics::Percentile | ( | tType | pBegin, |
tType | pEnd, | ||
double | dPercentile | ||
) | [inline, static] |
Returns the value at the requested percentile of the given sequence.
pBegin | Pointer/iterator to the beginning of the sequence. |
pEnd | Pointer/iterator to the end of the sequence. |
dPercentile | Value between 0 and 1 indicating the percentile of the value to be retrieved. |
Definition at line 293 of file statistics.h.
References Sleipnir::CMeta::GetNaN(), and Sleipnir::CMeta::IsNaN().
Referenced by Median().
static double Sleipnir::CStatistics::Precision | ( | size_t | iTruePositives, |
size_t | iFalsePositives, | ||
size_t | iTrueNegatives, | ||
size_t | iFalseNegatives | ||
) | [inline, static] |
Calculate the precision of a predicted positive set.
iTruePositives | Number of true positives. |
iFalsePositives | Number of false positives. |
iTrueNegatives | Number of true negatives. |
iFalseNegatives | Number of false negatives. |
Definition at line 75 of file statistics.h.
Referenced by FScore().
static double Sleipnir::CStatistics::PValueKolmogorovSmirnov | ( | double | dD, |
size_t | iM, | ||
size_t | iN | ||
) | [inline, static] |
Returns the p-value corresponding to a D-score obtained from a Kolmogorov-Smirnov test.
dD | D-value obtained from a Kolmogorov-Smirnov test. |
iM | Number of elements in first tested array. |
iN | Number of elements in second tested array. |
Definition at line 897 of file statistics.h.
Referenced by Sleipnir::CMeasureKolmogorovSmirnov::Measure().
static double Sleipnir::CStatistics::PValueLognormal | ( | double | dX, |
double | dMean, | ||
double | dStdev | ||
) | [inline, static] |
Return the two-tailed p-value of a lognormal distribution.
dX | Sample point. |
dMean | Mean of lognormal. |
dStdev | Standard deviation of lognormal. |
Definition at line 805 of file statistics.h.
References LognormalCDF().
static double Sleipnir::CStatistics::PValuePearson | ( | double | dR, |
size_t | iN | ||
) | [inline, static] |
Return the two-tailed p-value of a Pearson correlation.
dR | Pearson correlation. |
iN | Length of correlated vectors. |
Definition at line 831 of file statistics.h.
References TCDF().
static double Sleipnir::CStatistics::PValueSpearman | ( | double | dR, |
size_t | iN | ||
) | [inline, static] |
Return the two-tailed p-value of a Spearman correlation.
dR | Spearman correlation. |
iN | Length of correlated vectors. |
Definition at line 860 of file statistics.h.
References TCDF().
static double Sleipnir::CStatistics::Recall | ( | size_t | iTruePositives, |
size_t | iFalsePositives, | ||
size_t | iTrueNegatives, | ||
size_t | iFalseNegatives | ||
) | [inline, static] |
Calculate the recall of predictions over some positive set.
iTruePositives | Number of true positives. |
iFalsePositives | Number of false positives. |
iTrueNegatives | Number of true negatives. |
iFalseNegatives | Number of false negatives. |
Definition at line 108 of file statistics.h.
Referenced by FScore().
static double Sleipnir::CStatistics::RootMeanSquareError | ( | tType | BeginOne, |
tType | EndOne, | ||
tType | BeginTwo, | ||
tType | EndTwo | ||
) | [inline, static] |
Returns the root-mean-square error distance between two input arrays.
BeginOne | First element of the first array to be compared. |
EndOne | End of last element of the first array to be compared. |
BeginTwo | First element of the second array to be compared. |
EndTwo | End of last element of the second array to be compared. |
Calculates the root-mean-square error (RMSE) between two arrays, equal to sqrt( sum( (A1[i] - A2[i])^2 ) / min(|A1|, |A2|) ).
Definition at line 667 of file statistics.h.
static double Sleipnir::CStatistics::SampleChi2 | ( | size_t | iDF | ) | [inline, static] |
Return a random sample from a chi-squared distribution with the given degrees of freedom.
iDF | Degrees of freedom of the chi-squared distribution. |
Definition at line 1485 of file statistics.h.
References SampleGamma().
double Sleipnir::CStatistics::SampleExponentialStandard | ( | ) | [static] |
Return a random sample from a standard exponential distribution.
Definition at line 722 of file statistics.cpp.
Referenced by SampleGammaStandard().
static double Sleipnir::CStatistics::SampleGamma | ( | double | dLocation, |
double | dShape | ||
) | [inline, static] |
Return a random sample from a gamma distribution with the given location and shape parameters.
dLocation | Location parameter of gamma function to sample. |
dShape | Shape parameter of gamma function to sample. |
Definition at line 1524 of file statistics.h.
References SampleGammaStandard().
Referenced by SampleChi2().
double Sleipnir::CStatistics::SampleGammaLogStandard | ( | double | dXX | ) | [static] |
Return a random sample from a gamma log function with the given parameter.
dXX | Parameter of gamma log function to sample. |
Definition at line 403 of file statistics.cpp.
Referenced by BetaPDF().
double Sleipnir::CStatistics::SampleGammaStandard | ( | double | dShape | ) | [static] |
Return a random sample from a standard gamma function with the given shape parameter.
dShape | Shape parameter of gamma function to sample. |
Definition at line 432 of file statistics.cpp.
References SampleExponentialStandard(), and SampleNormalStandard().
Referenced by SampleGamma().
double Sleipnir::CStatistics::SampleNormalStandard | ( | ) | [static] |
Return a random sample from a standard normal distribution.
Definition at line 606 of file statistics.cpp.
Referenced by SampleGammaStandard().
static double Sleipnir::CStatistics::SkellamPDF | ( | size_t | iX, |
double | dMu1, | ||
double | dMu2 | ||
) | [inline, static] |
Calculate the Skellam probability distribution given a point and the two distribution parameters.
iX | Value at which PDF should be evaluated. |
dMu1 | First distribution parameter (first expected value). |
dMu2 | Second distribution parameter (second expected value). |
Definition at line 1113 of file statistics.h.
static double Sleipnir::CStatistics::TCDF | ( | double | dT, |
double | dDF | ||
) | [inline, static] |
Calculate a p-value for the given T and degrees of freedom.
dT | T value at which to sample the t-distribution. |
dDF | Degrees of freedom of the desired t-distribution. |
Definition at line 1202 of file statistics.h.
Referenced by Sleipnir::CMeasurePearsonSignificance::Measure(), PValuePearson(), PValueSpearman(), TTest(), TTestStudent(), and TTestWelch().
static double Sleipnir::CStatistics::TTest | ( | double | dMean, |
double | dVariance, | ||
size_t | iN | ||
) | [inline, static] |
Return the p-value of a t-test between the given array statistics and zero.
dMean | Sample mean. |
dVariance | Sample variance. |
iN | Sample size. |
Definition at line 989 of file statistics.h.
References TCDF().
static double Sleipnir::CStatistics::TTestStudent | ( | double | dMeanOne, |
double | dVarianceOne, | ||
size_t | iNOne, | ||
double | dMeanTwo, | ||
double | dVarianceTwo, | ||
size_t | iNTwo | ||
) | [inline, static] |
Return the p-value of a t-test between the two given array statistics assuming equal variance.
dMeanOne | Mean of the first sample. |
dVarianceOne | Variance of the first sample. |
iNOne | Number of elements in the first sample. |
dMeanTwo | Mean of the second sample. |
dVarianceTwo | Variance of the second sample. |
iNTwo | Number of elements in the second sample. |
Definition at line 955 of file statistics.h.
References TCDF().
static double Sleipnir::CStatistics::TTestWelch | ( | double | dMeanOne, |
double | dVarianceOne, | ||
size_t | iNOne, | ||
double | dMeanTwo, | ||
double | dVarianceTwo, | ||
size_t | iNTwo | ||
) | [inline, static] |
Return the p-value of a t-test between the two given array statistics without assuming equal variance.
dMeanOne | Mean of the first sample. |
dVarianceOne | Variance of the first sample. |
iNOne | Number of elements in the first sample. |
dMeanTwo | Mean of the second sample. |
dVarianceTwo | Variance of the second sample. |
iNTwo | Number of elements in the second sample. |
Definition at line 1025 of file statistics.h.
References TCDF().
double Sleipnir::CStatistics::TwoSidedHypergeometricCDF | ( | size_t | iNonZeroInCommon, |
size_t | iNonZeroInOne, | ||
size_t | iNonZeroInTwo, | ||
size_t | iTotalNumValues | ||
) | [static] |
Calculates the two sided hypergeometric p-value given the sizes and overlap of two sets.
iNonZeroInCommon | Number of non-zero in both query sets together. |
iNonZeroInOne | Number of non-zero in query one |
iNonZeroInTwo | Number of non-zero in query two |
iTotalNumValues | Total number of sites in the query. |
Definition at line 855 of file statistics.cpp.
References HypergeometricPDF().
static double Sleipnir::CStatistics::Variance | ( | const tType | Begin, |
const tType | End, | ||
double | dMean | ||
) | [inline, static] |
Calculate the sample variance of a given sequence.
Begin | Beginning of the sequence over which variance is estimated. |
End | End of the sequence over which variance is estimated. |
dMean | Mean of the sample or population. |
Definition at line 136 of file statistics.h.
Referenced by Sleipnir::CPCL::Normalize(), and Variance().
static double Sleipnir::CStatistics::Variance | ( | const tType | Begin, |
const tType | End | ||
) | [inline, static] |
Calculate the sample variance of a given sequence.
Begin | Beginning of the sequence over which variance is estimated. |
End | End of the sequence over which variance is estimated. |
Definition at line 173 of file statistics.h.
static double Sleipnir::CStatistics::Variance | ( | const std::vector< tType > & | vecValues, |
double | dMean | ||
) | [inline, static] |
Calculate the sample variance of a given vector.
vecValues | Vector over which variance is estimated. |
dMean | Mean of the sample or population. |
Definition at line 201 of file statistics.h.
References Variance().
static double Sleipnir::CStatistics::Variance | ( | const std::vector< tType > & | vecValues | ) | [inline, static] |
Calculate the sample variance of a given vector.
vecValues | Vector over which variance is estimated. |
Definition at line 220 of file statistics.h.
References Variance().
double Sleipnir::CStatistics::WilcoxonRankSum | ( | const CDat & | DatData, |
const CDat & | DatAnswers, | ||
const std::vector< bool > & | vecfGenesOfInterest, | ||
bool | fInvert = false |
||
) | [static] |
Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set.
DatData | Data or prediction set to evaluate. |
DatAnswers | Answer set against which data is evaluated (values greater than zero are positive). |
vecfGenesOfInterest | If nonempty, genes against which to perform process-specific evaluation. |
fInvert | If true, use one minus data values. |
Calculates the AUC (equivalent to the Wilxocon Rank Sum p-value) of the given data set (or prediction set) against the given answers. The the set of genes of interest is nonempty, only positive pairs in which both genes are in the set or negative pairs where one gene is in the set will be scored. In psuedocode:
For each gene pair i,j: If CMeta::IsNaN( dValue = DatData.Get( i, j ) ), continue If CMeta::IdNaN( dAnswer = DatAnswers.Get( i, j ) ), continue fAnswer = ( dAnswer > 0 ) If vecfGenesOfInterest is nonempty: If fAnswer and !( vecfGenesOfInterest[ i ] && vecfGenesOfInterest[ j ] ), continue If !fAnswer and !( vecfGenesOfInterest[ i ] || vecfGenesOfInterest[ j ] ), continue Add the pair (dValue, fAnswer) to the list to score Evaluate the AUC/Wilcoxon Rank Sum p-value of the resulting value/answer list
The AUC is evaluated by sorting the pair list by value, summing the ranks of positive pairs, counting the numbers of positive and negative pairs, and calculating (sum - (positive * (positive - 1)) / 2) / positive / negative.
Definition at line 975 of file statistics.cpp.
References Sleipnir::CDat::Get(), Sleipnir::CDat::GetGene(), Sleipnir::CDat::GetGenes(), and Sleipnir::CMeta::IsNaN().
double Sleipnir::CStatistics::WilcoxonRankSum | ( | const CDat & | DatData, |
const CDat & | DatAnswers, | ||
const std::vector< bool > & | vecfGenesOfInterest, | ||
const std::vector< bool > & | vecfUbik, | ||
bool | fPosIn, | ||
bool | fNegIn, | ||
bool | fPosBridge, | ||
bool | fNegBridge, | ||
bool | fPosOut, | ||
bool | fNegOut, | ||
bool | fInvert = false |
||
) | [static] |
Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set. Allows for ubiquitous genes.
DatData | Data or prediction set to evaluate. |
DatAnswers | Answer set against which data is evaluated (values greater than zero are positive). |
vecfGenesOfInterest | If nonempty, genes against which to perform process-specific evaluation. |
vecfUbik | If nonempty, ubiquitous genes. |
fPosIn | Should positives within the vecfGenesOfInterest be used? |
fNegIn | Should negatives within the vecfGenesOfInterest be used? |
fPosBridge | Should bridging (between ctxt and outside if empty vecfUbik or between ctxt and Ubik if non-empty) positives be used |
fNegBridge | Should bridging (between ctxt and outside if empty vecfUbik or between ctxt and Ubik if non-empty) negatives be used |
fPosOut | Should positive edges outside the context (non in/bridging) be used? |
fNegOut | Should negative edges outside the context (non in/bridging) be used) |
fInvert | If true, use one minus data values. |
Calculates the AUC (equivalent to the Wilxocon Rank Sum p-value) of the given data set (or prediction set) against the given answers. The the set of genes of interest is nonempty, only positive pairs in which both genes are in the set or negative pairs where one gene is in the set will be scored. In psuedocode:
For each gene pair i,j: If CMeta::IsNaN( dValue = DatData.Get( i, j ) ), continue If CMeta::IdNaN( dAnswer = DatAnswers.Get( i, j ) ), continue fAnswer = ( dAnswer > 0 ) If vecfGenesOfInterest is nonempty: If fAnswer and !( vecfGenesOfInterest[ i ] && vecfGenesOfInterest[ j ] ), continue If !fAnswer and !( vecfGenesOfInterest[ i ] || vecfGenesOfInterest[ j ] ), continue Add the pair (dValue, fAnswer) to the list to score Evaluate the AUC/Wilcoxon Rank Sum p-value of the resulting value/answer list
The AUC is evaluated by sorting the pair list by value, summing the ranks of positive pairs, counting the numbers of positive and negative pairs, and calculating (sum - (positive * (positive - 1)) / 2) / positive / negative.
Definition at line 1187 of file statistics.cpp.
References Sleipnir::CDat::Get(), Sleipnir::CDat::GetGene(), Sleipnir::CDat::GetGenes(), Sleipnir::CMeta::IsNaN(), and Sleipnir::CMeta::SkipEdge().
static bool Sleipnir::CStatistics::Winsorize | ( | std::vector< tType > & | vecValues, |
size_t | iCount = 1 |
||
) | [inline, static] |
Winsorizes the requested number of items at each end of the given vector.
vecValues | Vector to be sorted and Winsorized. |
iCount | Number of items at each end of the vector to be Winsorized. |
Winsorization is the process of replacing the n largest and smallest elements of a sequence with copies of the n-1st largest and n-1st smallest, respectively. For example, Winsorizing the sequence [1, 2, 3, 4, 5, 6, 7, 8] by two would return [3, 3, 3, 4, 5, 6, 6, 6]. This is a standard method for finding a robust average in the presence of outliers.
Definition at line 356 of file statistics.h.
static bool Sleipnir::CStatistics::Winsorize | ( | tType | pBegin, |
tType | pEnd, | ||
size_t | iCount = 1 |
||
) | [inline, static] |
Winsorizes the requested number of items at each end of the given sequence.
pBegin | Pointer/iterator to the beginning of the sequence. |
pEnd | Pointer/iterator to the end of the sequence. |
iCount | Number of items at each end of the sequence to be Winsorized. |
Winsorization is the process of replacing the n largest and smallest elements of a sequence with copies of the n-1st largest and n-1st smallest, respectively. For example, Winsorizing the sequence [1, 2, 3, 4, 5, 6, 7, 8] by two would return [3, 3, 3, 4, 5, 6, 6, 6]. This is a standard method for finding a robust average in the presence of outliers.
Definition at line 386 of file statistics.h.
static double Sleipnir::CStatistics::ZScore | ( | const std::vector< tType > & | vecOne, |
const std::vector< tType > & | vecTwo | ||
) | [inline, static] |
Calculates the z-score effect size measure.
vecOne | First array to be compared. |
vecTwo | Second array to be compared. |
For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2).
Definition at line 505 of file statistics.h.
static double Sleipnir::CStatistics::ZScore | ( | const tType | BeginOne, |
const tType | EndOne, | ||
const tType | BeginTwo, | ||
const tType | EndTwo, | ||
double * | pdAverage = NULL |
||
) | [inline, static] |
Calculates the z-score effect size measure.
BeginOne | First element of the first array to be compared. |
EndOne | End of last element of the first array to be compared. |
BeginTwo | First element of the second array to be compared. |
EndTwo | End of last element of the second array to be compared. |
pdAverage | If non-null, output average value of elements in both arrays. |
For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2).
Definition at line 541 of file statistics.h.
static double Sleipnir::CStatistics::ZTest | ( | double | dZScore, |
size_t | iN | ||
) | [inline, static] |
Returns the z-test p-value of the given z-score and element count.
dZScore | Z-score to be converted into a p-value. |
iN | Number of elements used to generate the given z-score. |
Definition at line 579 of file statistics.h.
References Normal01CDF().