Utility class containing static statistics functions. More...

#include <statistics.h>

Inheritance diagram for Sleipnir::CStatistics:

Static Public Member Functions
static double	FScore (size_t iTruePositives, size_t iFalsePositives, size_t iTrueNegatives, size_t iFalseNegatives, double dBeta=1)
	Calculate a precision/recall f-score.
static double	Precision (size_t iTruePositives, size_t iFalsePositives, size_t iTrueNegatives, size_t iFalseNegatives)
	Calculate the precision of a predicted positive set.
static double	Recall (size_t iTruePositives, size_t iFalsePositives, size_t iTrueNegatives, size_t iFalseNegatives)
	Calculate the recall of predictions over some positive set.
template<class tType >
static double	Variance (const tType Begin, const tType End, double dMean)
	Calculate the sample variance of a given sequence.
template<class tType >
static double	NormalizeMeanStd (const tType Begin, const tType End)
template<class tType >
static double	Variance (const tType Begin, const tType End)
	Calculate the sample variance of a given sequence.
template<class tType >
static double	Variance (const std::vector< tType > &vecValues, double dMean)
	Calculate the sample variance of a given vector.
template<class tType >
static double	Variance (const std::vector< tType > &vecValues)
	Calculate the sample variance of a given vector.
template<class tType >
static double	Average (const std::vector< tType > &vecValues)
	Calculate the mean of a given vector.
template<class tType >
static double	Average (const tType Begin, const tType End)
	Calculate the mean of a given sequence.
template<class tType >
static double	Percentile (tType pBegin, tType pEnd, double dPercentile)
	Returns the value at the requested percentile of the given sequence.
template<class tType >
static double	Median (std::vector< tType > &vecData)
	Returns the median value of the given vector.
template<class tType >
static bool	Winsorize (std::vector< tType > &vecValues, size_t iCount=1)
	Winsorizes the requested number of items at each end of the given vector.
template<class tType >
static bool	Winsorize (tType pBegin, tType pEnd, size_t iCount=1)
	Winsorizes the requested number of items at each end of the given sequence.
template<class tType >
static double	CohensD (const std::vector< tType > &vecOne, const std::vector< tType > &vecTwo, double *pdAverage=NULL)
	Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance.
template<class tType >
static double	CohensD (const tType BeginOne, const tType EndOne, const tType BeginTwo, const tType EndTwo, double *pdAverage=NULL)
	Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance.
template<class tType >
static double	ZScore (const std::vector< tType > &vecOne, const std::vector< tType > &vecTwo)
	Calculates the z-score effect size measure.
template<class tType >
static double	ZScore (const tType BeginOne, const tType EndOne, const tType BeginTwo, const tType EndTwo, double *pdAverage=NULL)
	Calculates the z-score effect size measure.
static double	ZTest (double dZScore, size_t iN)
	Returns the z-test p-value of the given z-score and element count.
template<class tType , class tIter >
static double	AndersonDarlingScore (tIter Begin, tIter End)
static double	AndersonDarlingTest (double dA2)
template<class tType >
static double	RootMeanSquareError (tType BeginOne, tType EndOne, tType BeginTwo, tType EndTwo)
	Returns the root-mean-square error distance between two input arrays.
template<class tType >
static double	JensenShannonDivergence (tType BeginOne, tType EndOne, tType BeginTwo, tType EndTwo)
	Returns the Jensen-Shannon divergence between two discrete probability distributions represented as arrays.
template<class tType >
static double	KullbackLeiblerDivergence (tType BeginOne, tType EndOne, tType BeginTwo, tType EndTwo)
	Returns the Kullback-Leibler divergence between two discrete probability distributions represented as arrays.
static double	LjungBox (const float *adX, size_t iN, size_t iH)
	Calculates the p-value of a Ljung-Box portmanteau test for autocorrelation randomness.
static double	LjungBox (const float *adX, size_t iN)
	Calculate the p-value of a Ljung-Box portmanteau test for autocorrelation randomness.
static double	PValueLognormal (double dX, double dMean, double dStdev)
	Return the two-tailed p-value of a lognormal distribution.
static double	PValuePearson (double dR, size_t iN)
	Return the two-tailed p-value of a Pearson correlation.
static double	PValueSpearman (double dR, size_t iN)
	Return the two-tailed p-value of a Spearman correlation.
static double	FisherTransform (double dR)
static double	PValueKolmogorovSmirnov (double dD, size_t iM, size_t iN)
	Returns the p-value corresponding to a D-score obtained from a Kolmogorov-Smirnov test.
static double	TTestStudent (double dMeanOne, double dVarianceOne, size_t iNOne, double dMeanTwo, double dVarianceTwo, size_t iNTwo)
	Return the p-value of a t-test between the two given array statistics assuming equal variance.
static double	TTest (double dMean, double dVariance, size_t iN)
	Return the p-value of a t-test between the given array statistics and zero.
static double	TTestWelch (double dMeanOne, double dVarianceOne, size_t iNOne, double dMeanTwo, double dVarianceTwo, size_t iNTwo)
	Return the p-value of a t-test between the two given array statistics without assuming equal variance.
static double	FTest (double dVarianceOne, size_t iNOne, double dVarianceTwo, size_t iNTwo)
	Return the p-value of an f-test between the two given array statistics to determine equality of variance.
static double	WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const std::vector< bool > &vecfGenesOfInterest, bool fInvert=false)
	Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set.
static double	WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const std::vector< bool > &vecfGenesOfInterest, const std::vector< bool > &vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert=false)
	Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set. Allows for ubiquitous genes.
static double	WilcoxonRankSum (const CPCL &DatData, const CPCL &DatAnswers, const std::vector< bool > &vecfGenesOfInterest, const std::vector< bool > &vecfUbik, bool fPosIn, bool fNegIn, bool fPosBridge, bool fNegBridge, bool fPosOut, bool fNegOut, bool fInvert=false)
static double	WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const vector< float > &vecGeneWeights, bool flipneg)
static double	WilcoxonRankSum (const CDat &DatData, const CDat &DatAnswers, const CDat &wDat, bool flipneg)
static double	HypergeometricCDF (size_t iBoth, size_t iNonZeroInOne, size_t iNonZeroInTwo, size_t iN)
	Calculates the hypergeometric p-value given the sizes and overlap of two sets.
static double	TwoSidedHypergeometricCDF (size_t iHitsOne, size_t iSizeOne, size_t iHitsTwo, size_t iSizeTwo)
	Calculates the two sided hypergeometric p-value given the sizes and overlap of two sets.
static double	SampleGammaStandard (double dShape)
	Return a random sample from a standard gamma function with the given shape parameter.
static double	SampleGammaLogStandard (double dXX)
	Return a random sample from a gamma log function with the given parameter.
static double	SampleNormalStandard ()
	Return a random sample from a standard normal distribution.
static double	SampleExponentialStandard ()
	Return a random sample from a standard exponential distribution.
static double	SkellamPDF (size_t iX, double dMu1, double dMu2)
	Calculate the Skellam probability distribution given a point and the two distribution parameters.
static double	BinomialCDF (size_t iObservations, size_t iSample, double dProbability)
	Return the binomial p-value of obtaining at least the given sample with the given number of observations and probability of success.
static double	HypergeometricPDF (size_t iNonZeroInCommon, size_t iNonZeroInOne, size_t iNonZeroInTwo, size_t iTotalNumValues)
	Calculate the hypergeometric probability distribution given the sizes and overlap of two sets.
static double	TCDF (double dT, double dDF)
	Calculate a p-value for the given T and degrees of freedom.
static double	LognormalCDF (double dX, double dMean, double dStdev)
	CDF of a lognormal distribution with the given parameters at the given point.
static double	InverseGaussianCDF (double dX, double dMean, double dLambda)
	CDF of an inverse Gaussian distribution with the given parameters at the given point.
static double	NormalCDF (double dX, double dMean, double dStdev)
	CDF of a normal distribution with the given parameters at the given point.
static double	Normal01CDF (double dX)
	CDF of a standard normal distribution.
static double	InverseNormal01CDF (double dX)
	Inverse CDF of a standard normal distribution.
static double	MultivariateNormalCDF (const std::vector< float > &vecdX, const std::vector< float > &vecdMu, const CDataMatrix &MatSigmaCholesky, size_t iN=1, float dMaxError=0.01, float dMaxCI=0.99, size_t iMaxIterations=300)
	CDF of a multivariate normal distribution with the given parameters at the given point.
static double	MultivariateNormalPDF (const std::vector< float > &vecdX, const std::vector< float > &vecdMu, const CDataMatrix &MatSigma)
	Calculates the value of a multivariate normal probability density function at the requested point.
static double	MultivariateNormalPDF (const std::vector< float > &vecdX, const std::vector< float > &vecdMu, double dSigmaDetSqrt, const CDataMatrix &MatSigmaInv)
	Calculates the value of a multivariate normal probability density function at the requested point.
static double	SampleChi2 (size_t iDF)
	Return a random sample from a chi-squared distribution with the given degrees of freedom.
static double	Chi2CDF (double dC2, size_t iDF)
	Return the value of a chi-squared cumulative density function at the requested point.
static double	SampleGamma (double dLocation, double dShape)
	Return a random sample from a gamma distribution with the given location and shape parameters.
static double	BetaPDF (double dX, double dMinimum, double dMaximum, double dAlpha, double dBeta)
	Calculate a beta probability density at the given point with the given minimum, maximu, alpha, and beta parameters.
static double	NormalPDF (double dX, double dMu, double dSigma)
	Calculate a normal probability density at the given point for the given mean and standard deviation.
static double	ExponentialPDF (double dX, double dLambda)
	Returns the value of an exponential probability density function at the requested point.
static bool	CholeskyDecomposition (CDataMatrix &Matrix)
	Calculates the Cholesky decomposition of the given matrix, overwriting it in the process.
static bool	MatrixLUDecompose (CDataMatrix &Mat, std::vector< size_t > &veciIndices, bool &fEven)
	Replaces the given matrix with its LU decomposition.
static bool	MatrixLUInvert (CDataMatrix &MatLU, const std::vector< size_t > &veciIndices, CDataMatrix &MatInv)
	Calculates the inverse of a matrix given its LU decomposition.
template<class tType >
static double	MatrixLUDeterminant (const CFullMatrix< tType > &MatLU, bool fEven)
	Calculate the determinant of a matrix given its LU decomposition.

Detailed Description

Utility class containing static statistics functions.

CStatistics contains a collection of static utility functions, mainly various statistical distributions. Credit for many of these implementations goes to Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

Definition at line 44 of file statistics.h.

Member Function Documentation

template<class tType >

static double Sleipnir::CStatistics::Average ( const std::vector< tType > & vecValues ) [inline, static]

Calculate the mean of a given vector.

Parameters:

vecValues Vector over which average is calculated.

Returns:: sum(x) / n

See also:: Variance

Definition at line 239 of file statistics.h.

Referenced by Sleipnir::CPCL::Normalize().

template<class tType >

static double Sleipnir::CStatistics::Average	(	const tType	Begin,
		const tType	End
	)		`[inline, static]`

Calculate the mean of a given sequence.

Parameters:

Begin	Beginning of the sequence over which average is calculated.
End	End of the sequence over which average is calculated.

Returns:: sum(x) / n

See also:: Variance

Definition at line 261 of file statistics.h.

static double Sleipnir::CStatistics::BetaPDF	(	double	dX,
		double	dMinimum,
		double	dMaximum,
		double	dAlpha,
		double	dBeta
	)		`[inline, static]`

Calculate a beta probability density at the given point with the given minimum, maximu, alpha, and beta parameters.

Parameters:

dX	Point at which to sample the beta probability distribution.
dMinimum	Minimum parameter of the beta distribution.
dMaximum	Maximum parameter of the beta distribution.
dAlpha	Alpha parameter of the beta distribution.
dBeta	Beta parameter of the beta distribution.

Returns:: Beta probability density at the given point for a distribution with the requested parameters.

Remarks:: Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

Definition at line 1556 of file statistics.h.

References SampleGammaLogStandard().

static double Sleipnir::CStatistics::BinomialCDF	(	size_t	iObservations,
		size_t	iSample,
		double	dProbability
	)		`[inline, static]`

Return the binomial p-value of obtaining at least the given sample with the given number of observations and probability of success.

Parameters:

iObservations	Count parameter of the binomial.
iSample	Point at which to sample the CDF.
dProbability	Probability parameter of the binomial.

Returns:: 1 - CStatistics::NormalCDF ((iObservations - (iSample * dProbability)) / sqrt(iSample * dProbability * (1 - dProbability), 0, 1)

Remarks:: Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

Definition at line 1141 of file statistics.h.

References NormalCDF().

static double Sleipnir::CStatistics::Chi2CDF	(	double	dC2,
		size_t	iDF
	)		`[inline, static]`

Return the value of a chi-squared cumulative density function at the requested point.

Parameters:

dC2	Point at which chi-squared CDF should be evaluated.
iDF	Degrees of freedom of the chi-squared distribution.

Returns:: Value of the chi-squared CDF at the given point with the requested degrees of freedom.

See also:: SampleChi2

Definition at line 1506 of file statistics.h.

static bool Sleipnir::CStatistics::CholeskyDecomposition ( CDataMatrix & Matrix ) [inline, static]

Calculates the Cholesky decomposition of the given matrix, overwriting it in the process.

Parameters:

Matrix Matrix to be decomposed.

Returns:: True if the decomposition was successful, false otherwise.

Remarks:: Matrix must be square.

Definition at line 1623 of file statistics.h.

References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CFullMatrix< tType >::GetRows(), and Sleipnir::CFullMatrix< tType >::Set().

template<class tType >

static double Sleipnir::CStatistics::CohensD	(	const std::vector< tType > &	vecOne,
		const std::vector< tType > &	vecTwo,
		double *	pdAverage = `NULL`
	)		`[inline, static]`

Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance.

Parameters:

vecOne	First array to be compared.
vecTwo	Second array to be compared.
pdAverage	If non-null, output average value of elements in both arrays.

Returns:: Cohen's D effect size measurement of the difference between input arrays.

Calculate's Cohen's D, a z-score-like effect size measurement based on unpooled variance. For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2). Cohen's D uses the unpooled variance 2(m(A1)-m(A2))/(s(A1)+s(A2)).

See also:: ZScore

Definition at line 425 of file statistics.h.

template<class tType >

static double Sleipnir::CStatistics::CohensD	(	const tType	BeginOne,
		const tType	EndOne,
		const tType	BeginTwo,
		const tType	EndTwo,
		double *	pdAverage = `NULL`
	)		`[inline, static]`

Calculates Cohen's D, a modified z-score effect size measure based on unpooled variance.

Parameters:

BeginOne	First element of the first array to be compared.
EndOne	End of last element of the first array to be compared.
BeginTwo	First element of the second array to be compared.
EndTwo	End of last element of the second array to be compared.
pdAverage	If non-null, output average value of elements in both arrays.

Returns:: Cohen's D effect size measurement of the difference between input arrays.

Calculate's Cohen's D, a z-score-like effect size measurement based on unpooled variance. For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2). Cohen's D uses the unpooled variance 2(m(A1)-m(A2))/(s(A1)+s(A2)).

See also:: ZScore

Definition at line 462 of file statistics.h.

static double Sleipnir::CStatistics::ExponentialPDF	(	double	dX,
		double	dLambda
	)		`[inline, static]`

Returns the value of an exponential probability density function at the requested point.

Parameters:

dX	Point at which the exponential PDF is evaluated.
dLambda	Rate parameter of the exponential PDF to be evaluated.

Returns:: Value of an exponeitial PDF at the requested point.

Definition at line 1605 of file statistics.h.

double Sleipnir::CStatistics::FScore	(	size_t	iTruePositives,
		size_t	iFalsePositives,
		size_t	iTrueNegatives,
		size_t	iFalseNegatives,
		double	dBeta = `1`
	)		`[static]`

Calculate a precision/recall f-score.

Parameters:

iTruePositives	Number of true positives.
iFalsePositives	Number of false positives.
iTrueNegatives	Number of true negatives.
iFalseNegatives	Number of false negatives.
dBeta	Relative weight given to precision; 0 ignores precision, 1 weights precision and recall equally, and arbitrary large values ignore recall.

Returns:: (1 + dBeta^2) * precision * recall / (dBeta^2 * precision + recall)

See also:: Precision | Recall

Definition at line 910 of file statistics.cpp.

References Precision(), and Recall().

static double Sleipnir::CStatistics::FTest	(	double	dVarianceOne,
		size_t	iNOne,
		double	dVarianceTwo,
		size_t	iNTwo
	)		`[inline, static]`

Return the p-value of an f-test between the two given array statistics to determine equality of variance.

Parameters:

dVarianceOne	Variance of the first sample.
iNOne	Number of elements in the first sample.
dVarianceTwo	Variance of the second sample.
iNTwo	Number of elements in the second sample.

Returns:: P-value of F = dVarianceOne / dVarianceTwo.

Definition at line 1058 of file statistics.h.

double Sleipnir::CStatistics::HypergeometricCDF	(	size_t	iBoth,
		size_t	iNonZeroInOne,
		size_t	iNonZeroInTwo,
		size_t	iN
	)		`[static]`

Calculates the hypergeometric p-value given the sizes and overlap of two sets.

Parameters:

iBoth	Number of non-zero in both query sets together.
iNonZeroInOne	Number of non-zero in query one
iTwo	Number of non-zero in query two
iN	Total number of sites in the query.

Returns:: sum(CStatistics::HypergeometricPDF (iBoth, iNonZeroInOne, iTwo, iN), i=iBoth..min(iNonZeroInTwo, iNonZeroInOne))

Definition at line 824 of file statistics.cpp.

References HypergeometricPDF().

Referenced by Sleipnir::CMeasureHypergeometric::Measure(), and Sleipnir::CDat::Open().

static double Sleipnir::CStatistics::HypergeometricPDF	(	size_t	iNonZeroInCommon,
		size_t	iNonZeroInOne,
		size_t	iNonZeroInTwo,
		size_t	iTotalNumValues
	)		`[inline, static]`

Calculate the hypergeometric probability distribution given the sizes and overlap of two sets.

Parameters:

iNonZeroInCommon	Number of non zero values that both share.
iNonZeroInOne	Size of the first (query) set.
iNonZeroInTwo	Number of hits in the second (background) set.
iTotalNumValues	Total number of values that were compared.

Returns:: choose(iNonZeroInTwo, iNonZeroInCommon) * choose(iTotalNumValues - iNonZeroInTwo, iNonZeroInOne - iNonZeroInCommon) / choose(iTotalNumValues, iNonZeroInOne)

Remarks:: Calculated using the exponential of CStatistics::LogFact results for increased speed and precision.

Definition at line 1173 of file statistics.h.

References Sleipnir::CMath::LogFact().

Referenced by HypergeometricCDF(), and TwoSidedHypergeometricCDF().

static double Sleipnir::CStatistics::InverseGaussianCDF	(	double	dX,
		double	dMean,
		double	dLambda
	)		`[inline, static]`

CDF of an inverse Gaussian distribution with the given parameters at the given point.

Parameters:

dX	Sample point.
dMean	Mean of inverse Gaussian.
dLambda	Lambda parameter of inverse Gaussian.

Returns:: CStatistics::Normal01CDF ( sqrt( dLambda / dX ) * ( ( dX / dMean ) - 1 ) ) + exp( ( 2 * dLambda / dMean ) + log ( Normal01CDF( -sqrt( dLambda / dX ) * ( ( dX / dMean ) + 1 ) ) )

Definition at line 1246 of file statistics.h.

References Normal01CDF().

double Sleipnir::CStatistics::InverseNormal01CDF ( double dX ) [static]

Inverse CDF of a standard normal distribution.

Parameters:

dX	Sample point.

Returns:: Value of dY, for dX = NCDF(dY) and NCDF a normal CDF with mean 0, variance 1.

Definition at line 1518 of file statistics.cpp.

References Sleipnir::CMeta::GetNaN(), and Normal01CDF().

Referenced by MultivariateNormalCDF().

template<class tType >

static double Sleipnir::CStatistics::JensenShannonDivergence	(	tType	BeginOne,
		tType	EndOne,
		tType	BeginTwo,
		tType	EndTwo
	)		`[inline, static]`

Returns the Jensen-Shannon divergence between two discrete probability distributions represented as arrays.

Parameters:

BeginOne	First element of the first array to be compared.
EndOne	End of last element of the first array to be compared.
BeginTwo	First element of the second array to be compared.
EndTwo	End of last element of the second array to be compared.

Returns:: Jensen-Shannon divergence between the given probability distributions in bits.

The Jensen-Shannon or JS-divergence between two discrete probability distributions A1 and A2 is a symmetrized Kullback-Leibler divergence defined as ( KL(A1, A2) + KL(A2, A1 ) / 2.

Remarks:: The elements of each array should sum to one, and the two arrays should be of the same length.

See also:: KullbackLeiblerDivergence

Definition at line 713 of file statistics.h.

References KullbackLeiblerDivergence().

template<class tType >

static double Sleipnir::CStatistics::KullbackLeiblerDivergence	(	tType	BeginOne,
		tType	EndOne,
		tType	BeginTwo,
		tType	EndTwo
	)		`[inline, static]`

Returns the Kullback-Leibler divergence between two discrete probability distributions represented as arrays.

Parameters:

BeginOne	First element of the first array to be compared.
EndOne	End of last element of the first array to be compared.
BeginTwo	First element of the second array to be compared.
EndTwo	End of last element of the second array to be compared.

Returns:: Kullback-Leibler divergence between the given probability distributions in bits.

The Kullback-Leibler or KL-divergence between two discrete probability distributions A1 and A2 is defined as sum( A1[i] * log( A1[i]/A2[i] ) ).

Remarks:: The elements of each array should sum to one, and the two arrays should be of the same length.

See also:: JensenShannonDivergence

Definition at line 751 of file statistics.h.

References Sleipnir::CMeta::GetNaN().

Referenced by JensenShannonDivergence().

double Sleipnir::CStatistics::LjungBox	(	const float *	adX,
		size_t	iN,
		size_t	iH
	)		`[static]`

Calculates the p-value of a Ljung-Box portmanteau test for autocorrelation randomness.

Parameters:

adX	Array of values to test.
iN	Number of values in array.
iH	Number of lags to test, at most iN - 1.

Returns:: Chi-squared of Q = iN * (iN + 2) * sum(autocorrelation(adX, i)^2 / (iN - i), i = 1..iH) with iH degrees of freedom.

Definition at line 62 of file statistics.cpp.

Referenced by LjungBox().

static double Sleipnir::CStatistics::LjungBox	(	const float *	adX,
		size_t	iN
	)		`[inline, static]`

Calculate the p-value of a Ljung-Box portmanteau test for autocorrelation randomness.

Parameters:

adX	Array of values to test.
iN	Number of values in array.

Returns:: Chi-squared of Q = iN * (iN + 2) * sum(autocorrelation(adX, i)^2 / (iN - i), i = 1..(iN - 1)) with iN - 1 degrees of freedom.

Definition at line 783 of file statistics.h.

References LjungBox().

static double Sleipnir::CStatistics::LognormalCDF	(	double	dX,
		double	dMean,
		double	dStdev
	)		`[inline, static]`

CDF of a lognormal distribution with the given parameters at the given point.

Parameters:

dX	Sample point.
dMean	Mean of lognormal.
dStdev	Standard deviation of lognormal.

Returns:: CStatistics::NormalCDF (log(dX), dMean, dVariance)

Definition at line 1223 of file statistics.h.

References NormalCDF().

Referenced by PValueLognormal().

bool Sleipnir::CStatistics::MatrixLUDecompose	(	CDataMatrix &	Mat,
		std::vector< size_t > &	veciIndices,
		bool &	fEven
	)		`[static]`

Replaces the given matrix with its LU decomposition.

Parameters:

Mat	Matrix to be LU decomposed; overwritten during the calculation.
veciIndices	Row permutations of the LU decomposition pivots.
fEven	True if decomposition is of even rank; false otherwise.

Returns:: True if decomposition was successful; false otherwise.

Remarks:: Note that MatLU, which must be square, is overwritten by the calculation. Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

See also:: MatrixLUInvert | MatrixLUDeterminant

Definition at line 1595 of file statistics.cpp.

References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CFullMatrix< tType >::GetRows(), and Sleipnir::CFullMatrix< tType >::Set().

Referenced by MultivariateNormalPDF().

template<class tType >

static double Sleipnir::CStatistics::MatrixLUDeterminant	(	const CFullMatrix< tType > &	MatLU,
		bool	fEven
	)		`[inline, static]`

Calculate the determinant of a matrix given its LU decomposition.

Parameters:

MatLU	LU decomposition of matrix of interest.
fEven	True if LU decomposition of even rank; false otherwise.

Returns:: Determinant of original matrix.

Remarks:: MatLU must be square. Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

See also:: MatrixLUDecompose

Definition at line 1678 of file statistics.h.

References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), and Sleipnir::CFullMatrix< tType >::GetRows().

Referenced by MultivariateNormalPDF().

bool Sleipnir::CStatistics::MatrixLUInvert	(	CDataMatrix &	MatLU,
		const std::vector< size_t > &	veciIndices,
		CDataMatrix &	MatInv
	)		`[static]`

Calculates the inverse of a matrix given its LU decomposition.

Parameters:

MatLU	LU decomposition of matrix of interest; destroyed during the inversion calculation.
veciIndices	Row permutations of the LU decomposition pivots.
MatInv	Resulting inverted matrix.

Returns:: True if inversion was successful, false otherwise.

Remarks:: Note that MatLU, which must be square, is destroyed by the calculation. MatInv will be initialized to dimensions identical to MatLU if it is not already. Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

See also:: MatrixLUDecompose

Definition at line 1698 of file statistics.cpp.

References Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), and Sleipnir::CFullMatrix< tType >::Set().

Referenced by MultivariateNormalPDF().

template<class tType >

static double Sleipnir::CStatistics::Median ( std::vector< tType > & vecData ) [inline, static]

Returns the median value of the given vector.

Parameters:

vecData Vector from which median value is returned.

Returns:: Median value of the given vector.

Remarks:: Note that the given vector is modified (sorted) by this function.

See also:: Percentile

Definition at line 329 of file statistics.h.

References Percentile().

static double Sleipnir::CStatistics::MultivariateNormalCDF	(	const std::vector< float > &	vecdX,
		const std::vector< float > &	vecdMu,
		const CDataMatrix &	MatSigmaCholesky,
		size_t	iN = `1`,
		float	dMaxError = `0.01`,
		float	dMaxCI = `0.99`,
		size_t	iMaxIterations = `300`
	)		`[inline, static]`

CDF of a multivariate normal distribution with the given parameters at the given point.

Parameters:

vecdX	Sample point.
vecdMu	Mean of normal.
MatSigmaCholesky	Cholesky decomposition of covariance matrix of normal.
iN	Number of elements used to calculate sample point.
dMaxError	Performance parameter; maximum error tolerance of return value.
dMaxCI	Performance parameter; confidence interval of return value.
iMaxIterations	Performance parameter; maximum iteratios to calculate return value.

Returns:: NCDF((vecdX - vecdMu) / MatSigma), for NCDF a normal CDF with mean 0, standard deviation 1.

Remarks:: Implementation courtesy of Numerical Recipes.

See also:: CholeskyDecomposition

Definition at line 1335 of file statistics.h.

References Sleipnir::CFullMatrix< tType >::Get(), Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), InverseNormal01CDF(), and Normal01CDF().

static double Sleipnir::CStatistics::MultivariateNormalPDF	(	const std::vector< float > &	vecdX,
		const std::vector< float > &	vecdMu,
		const CDataMatrix &	MatSigma
	)		`[inline, static]`

Calculates the value of a multivariate normal probability density function at the requested point.

Parameters:

vecdX	Point at which multivariate normal distribution is evaluated.
vecdMu	Mean of multivariate normal distribution.
MatSigma	Covariance matrix of multivariate normal distribution.

Returns:: Value of multivariate normal distribution at the requested point.

Remarks:: vecdX and vecdMu must be of the same nonzero length, and MatSigma must be a square matrix with dimensions equal to that length.

See also:: MultivariateNormalCDF

Definition at line 1403 of file statistics.h.

References Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), Sleipnir::CFullMatrix< tType >::Initialize(), MatrixLUDecompose(), MatrixLUDeterminant(), MatrixLUInvert(), and Sleipnir::CFullMatrix< tType >::Open().

static double Sleipnir::CStatistics::MultivariateNormalPDF	(	const std::vector< float > &	vecdX,
		const std::vector< float > &	vecdMu,
		double	dSigmaDetSqrt,
		const CDataMatrix &	MatSigmaInv
	)		`[inline, static]`

Calculates the value of a multivariate normal probability density function at the requested point.

Parameters:

vecdX	Point at which multivariate normal distribution is evaluated.
vecdMu	Mean of multivariate normal distribution.
dSigmaDetSqrt	Square root of the determinant of the requested distribution's covariance matrix.
MatSigmaInv	Inverse of the requested distribution's covariance matrix.

Returns:: Value of multivariate normal distribution at the requested point.

Remarks:: vecdX and vecdMu must be of the same nonzero length, and MatSigmaInv must be a square matrix with dimensions equal to that length.

See also:: MultivariateNormalCDF

Definition at line 1449 of file statistics.h.

References Sleipnir::CFullMatrix< tType >::GetColumns(), Sleipnir::CMeta::GetNaN(), Sleipnir::CFullMatrix< tType >::GetRows(), and Sleipnir::CMeta::IsNaN().

static double Sleipnir::CStatistics::Normal01CDF ( double dX ) [inline, static]

CDF of a standard normal distribution.

Parameters:

dX	Sample point.

Returns:: NCDF(dX), for NCDF a normal CDF with mean 0, variance 1.

Reimplemented from Sleipnir::CStatisticsImpl.

Definition at line 1284 of file statistics.h.

Referenced by InverseGaussianCDF(), InverseNormal01CDF(), MultivariateNormalCDF(), NormalCDF(), and ZTest().

static double Sleipnir::CStatistics::NormalCDF	(	double	dX,
		double	dMean,
		double	dStdev
	)		`[inline, static]`

CDF of a normal distribution with the given parameters at the given point.

Parameters:

dX	Sample point.
dMean	Mean of normal.
dStdev	Standard deviation of normal.

Returns:: NCDF((dX - dMean) / dStdev), for NCDF a normal CDF with mean 0, standard deviation 1.

Definition at line 1269 of file statistics.h.

References Normal01CDF().

Referenced by BinomialCDF(), and LognormalCDF().

static double Sleipnir::CStatistics::NormalPDF	(	double	dX,
		double	dMu,
		double	dSigma
	)		`[inline, static]`

Calculate a normal probability density at the given point for the given mean and standard deviation.

Parameters:

dX	Point at which to calculate the normal distribution.
dMu	Mean of the normal distribution.
dSigma	Standard deviation of the normal distribution.

Returns:: exp(-(dX - dMu)^2 / (2 * dSigma^2)) / (dSigma * sqrt(2*PI))

Definition at line 1584 of file statistics.h.

template<class tType >

static double Sleipnir::CStatistics::Percentile	(	tType	pBegin,
		tType	pEnd,
		double	dPercentile
	)		`[inline, static]`

Returns the value at the requested percentile of the given sequence.

Parameters:

pBegin	Pointer/iterator to the beginning of the sequence.
pEnd	Pointer/iterator to the end of the sequence.
dPercentile	Value between 0 and 1 indicating the percentile of the value to be retrieved.

Returns:: Value at the requested percentile.

Remarks:: The 0th percentile is the first (smallest) element, 1st percentile is the last (largest), and 0.5th is the median. Note that the input sequence is modified (sorted) by this function.

See also:: Median

Definition at line 293 of file statistics.h.

References Sleipnir::CMeta::GetNaN(), and Sleipnir::CMeta::IsNaN().

Referenced by Median().

static double Sleipnir::CStatistics::Precision	(	size_t	iTruePositives,
		size_t	iFalsePositives,
		size_t	iTrueNegatives,
		size_t	iFalseNegatives
	)		`[inline, static]`

Calculate the precision of a predicted positive set.

Parameters:

iTruePositives	Number of true positives.
iFalsePositives	Number of false positives.
iTrueNegatives	Number of true negatives.
iFalseNegatives	Number of false negatives.

Returns:: iTruePositives / (iTruePositives + iFalsePositives)

Remarks:: True and false negatives are ignored and included only for consistency.

See also:: Recall | FScore

Definition at line 75 of file statistics.h.

Referenced by FScore().

static double Sleipnir::CStatistics::PValueKolmogorovSmirnov	(	double	dD,
		size_t	iM,
		size_t	iN
	)		`[inline, static]`

Returns the p-value corresponding to a D-score obtained from a Kolmogorov-Smirnov test.

Parameters:

dD	D-value obtained from a Kolmogorov-Smirnov test.
iM	Number of elements in first tested array.
iN	Number of elements in second tested array.

Returns:: P-value equivalent to the given D-score and element count.

See also:: CMeasureKolmogorovSmirnov

Definition at line 897 of file statistics.h.

Referenced by Sleipnir::CMeasureKolmogorovSmirnov::Measure().

static double Sleipnir::CStatistics::PValueLognormal	(	double	dX,
		double	dMean,
		double	dStdev
	)		`[inline, static]`

Return the two-tailed p-value of a lognormal distribution.

Parameters:

dX	Sample point.
dMean	Mean of lognormal.
dStdev	Standard deviation of lognormal.

Returns:: For dCDF = CStatistics::LognormalCDF (dX, dMean, dVariance), 2 * ((dX > exp(dMean)) ? (1 - dCDF) : dCDF).

Definition at line 805 of file statistics.h.

References LognormalCDF().

static double Sleipnir::CStatistics::PValuePearson	(	double	dR,
		size_t	iN
	)		`[inline, static]`

Return the two-tailed p-value of a Pearson correlation.

Parameters:

dR	Pearson correlation.
iN	Length of correlated vectors.

Returns:: P-value corresponding to the given correlation and array size.

See also:: CMeasurePearson | PValueSpearman

Definition at line 831 of file statistics.h.

References TCDF().

static double Sleipnir::CStatistics::PValueSpearman	(	double	dR,
		size_t	iN
	)		`[inline, static]`

Return the two-tailed p-value of a Spearman correlation.

Parameters:

dR	Spearman correlation.
iN	Length of correlated vectors.

Returns:: P-value corresponding to the given correlation and array size.

See also:: CMeasureSpearman | PValuePearson

Definition at line 860 of file statistics.h.

References TCDF().

static double Sleipnir::CStatistics::Recall	(	size_t	iTruePositives,
		size_t	iFalsePositives,
		size_t	iTrueNegatives,
		size_t	iFalseNegatives
	)		`[inline, static]`

Calculate the recall of predictions over some positive set.

Parameters:

iTruePositives	Number of true positives.
iFalsePositives	Number of false positives.
iTrueNegatives	Number of true negatives.
iFalseNegatives	Number of false negatives.

Returns:: iTruePositives / (iTruePositives + iFalseNegatives)

Remarks:: False positives and true negatives are ignored and included only for consistency.

See also:: Precision | FScore

Definition at line 108 of file statistics.h.

Referenced by FScore().

template<class tType >

static double Sleipnir::CStatistics::RootMeanSquareError	(	tType	BeginOne,
		tType	EndOne,
		tType	BeginTwo,
		tType	EndTwo
	)		`[inline, static]`

Returns the root-mean-square error distance between two input arrays.

Parameters:

BeginOne	First element of the first array to be compared.
EndOne	End of last element of the first array to be compared.
BeginTwo	First element of the second array to be compared.
EndTwo	End of last element of the second array to be compared.

Returns:: Root-mean-square error between the two input arrays.

Calculates the root-mean-square error (RMSE) between two arrays, equal to sqrt( sum( (A1[i] - A2[i])^2 ) / min(|A1|, |A2|) ).

Remarks:: If the two input arrays are of different lengths m and n, only min(m, n) elements of either array are compared.

Definition at line 667 of file statistics.h.

static double Sleipnir::CStatistics::SampleChi2 ( size_t iDF ) [inline, static]

Return a random sample from a chi-squared distribution with the given degrees of freedom.

Parameters:

iDF	Degrees of freedom of the chi-squared distribution.

Returns:: Random sample from a chi-squared distribution with the requested degrees of freedom.

See also:: Chi2CDF

Definition at line 1485 of file statistics.h.

References SampleGamma().

double Sleipnir::CStatistics::SampleExponentialStandard ( ) [static]

Return a random sample from a standard exponential distribution.

Returns:: Random sample from a standard exponential distribution.

Remarks:: Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

Definition at line 722 of file statistics.cpp.

Referenced by SampleGammaStandard().

static double Sleipnir::CStatistics::SampleGamma	(	double	dLocation,
		double	dShape
	)		`[inline, static]`

Return a random sample from a gamma distribution with the given location and shape parameters.

Parameters:

dLocation	Location parameter of gamma function to sample.
dShape	Shape parameter of gamma function to sample.

Returns:: Random sample from a gamma function with the given shape and location parameters.

Definition at line 1524 of file statistics.h.

References SampleGammaStandard().

Referenced by SampleChi2().

double Sleipnir::CStatistics::SampleGammaLogStandard ( double dXX ) [static]

Return a random sample from a gamma log function with the given parameter.

Parameters:

dXX	Parameter of gamma log function to sample.

Returns:: Random sample from a gamma log function with the given parameter.

Remarks:: Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

Definition at line 403 of file statistics.cpp.

Referenced by BetaPDF().

double Sleipnir::CStatistics::SampleGammaStandard ( double dShape ) [static]

Return a random sample from a standard gamma function with the given shape parameter.

Parameters:

dShape Shape parameter of gamma function to sample.

Returns:: Random sample from a standard gamma function with the given shape parameter.

Remarks:: Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

Definition at line 432 of file statistics.cpp.

References SampleExponentialStandard(), and SampleNormalStandard().

Referenced by SampleGamma().

double Sleipnir::CStatistics::SampleNormalStandard ( ) [static]

Return a random sample from a standard normal distribution.

Returns:: Random sample from a standard normal distribution.

Remarks:: Implementation courtesy of Press WH, Teukolsky SA, Vetterling WT, Flannery BP. Numerical Recipes in C, 1992, Cambridge University Press.

Definition at line 606 of file statistics.cpp.

Referenced by SampleGammaStandard().

static double Sleipnir::CStatistics::SkellamPDF	(	size_t	iX,
		double	dMu1,
		double	dMu2
	)		`[inline, static]`

Calculate the Skellam probability distribution given a point and the two distribution parameters.

Parameters:

iX	Value at which PDF should be evaluated.
dMu1	First distribution parameter (first expected value).
dMu2	Second distribution parameter (second expected value).

Returns:: Value of a Skellam distribution with the requested parameters at the given point.

Definition at line 1113 of file statistics.h.

static double Sleipnir::CStatistics::TCDF	(	double	dT,
		double	dDF
	)		`[inline, static]`

Calculate a p-value for the given T and degrees of freedom.

Parameters:

dT	T value at which to sample the t-distribution.
dDF	Degrees of freedom of the desired t-distribution.

Returns:: p-value of the given T and degrees of freedom.

Definition at line 1202 of file statistics.h.

Referenced by Sleipnir::CMeasurePearsonSignificance::Measure(), PValuePearson(), PValueSpearman(), TTest(), TTestStudent(), and TTestWelch().

static double Sleipnir::CStatistics::TTest	(	double	dMean,
		double	dVariance,
		size_t	iN
	)		`[inline, static]`

Return the p-value of a t-test between the given array statistics and zero.

Parameters:

dMean	Sample mean.
dVariance	Sample variance.
iN	Sample size.

Returns:: P-value of T = sqrt( iN ) * dMean / sqrt( dVariance )

See also:: TTestStudent | TTestWelch

Definition at line 989 of file statistics.h.

References TCDF().

static double Sleipnir::CStatistics::TTestStudent	(	double	dMeanOne,
		double	dVarianceOne,
		size_t	iNOne,
		double	dMeanTwo,
		double	dVarianceTwo,
		size_t	iNTwo
	)		`[inline, static]`

Return the p-value of a t-test between the two given array statistics assuming equal variance.

Parameters:

dMeanOne	Mean of the first sample.
dVarianceOne	Variance of the first sample.
iNOne	Number of elements in the first sample.
dMeanTwo	Mean of the second sample.
dVarianceTwo	Variance of the second sample.
iNTwo	Number of elements in the second sample.

Returns:: P-value of T = (dMeanOne - dMeanTwo) / sqrt(((((iNOne - 1) * dVarianceOne) + ((iNTwo - 1) * dVarianceTwo)) / (iNOne + iNTwo - 2)) * ((1 / iNOne) + (1 / iNTwo)))

Definition at line 955 of file statistics.h.

References TCDF().

static double Sleipnir::CStatistics::TTestWelch	(	double	dMeanOne,
		double	dVarianceOne,
		size_t	iNOne,
		double	dMeanTwo,
		double	dVarianceTwo,
		size_t	iNTwo
	)		`[inline, static]`

Return the p-value of a t-test between the two given array statistics without assuming equal variance.

Parameters:

dMeanOne	Mean of the first sample.
dVarianceOne	Variance of the first sample.
iNOne	Number of elements in the first sample.
dMeanTwo	Mean of the second sample.
dVarianceTwo	Variance of the second sample.
iNTwo	Number of elements in the second sample.

Returns:: P-value of T = (dMeanOne - dMeanTwo) / sqrt(((((iNOne - 1) * dVarianceOne) + ((iNTwo - 1) * dVarianceTwo)) / (iNOne + iNTwo - 2)) * ((1 / iNOne) + (1 / iNTwo)))

Definition at line 1025 of file statistics.h.

References TCDF().

double Sleipnir::CStatistics::TwoSidedHypergeometricCDF	(	size_t	iNonZeroInCommon,
		size_t	iNonZeroInOne,
		size_t	iNonZeroInTwo,
		size_t	iTotalNumValues
	)		`[static]`

Calculates the two sided hypergeometric p-value given the sizes and overlap of two sets.

Parameters:

iNonZeroInCommon	Number of non-zero in both query sets together.
iNonZeroInOne	Number of non-zero in query one
iNonZeroInTwo	Number of non-zero in query two
iTotalNumValues	Total number of sites in the query.

Returns:: sum(CStatistics::HypergeometricPDF (iBoth, iNonZeroInOne, iNonZeroInTwo, iN), i=iBoth..min(iNonZeroInTwo, iNonZeroInOne))

Definition at line 855 of file statistics.cpp.

References HypergeometricPDF().

template<class tType >

static double Sleipnir::CStatistics::Variance	(	const tType	Begin,
		const tType	End,
		double	dMean
	)		`[inline, static]`

Calculate the sample variance of a given sequence.

Parameters:

Begin	Beginning of the sequence over which variance is estimated.
End	End of the sequence over which variance is estimated.
dMean	Mean of the sample or population.

Returns:: sum((x - dMean)^2) / n

See also:: Average

Definition at line 136 of file statistics.h.

Referenced by Sleipnir::CPCL::Normalize(), and Variance().

template<class tType >

static double Sleipnir::CStatistics::Variance	(	const tType	Begin,
		const tType	End
	)		`[inline, static]`

Calculate the sample variance of a given sequence.

Parameters:

Begin	Beginning of the sequence over which variance is estimated.
End	End of the sequence over which variance is estimated.

Returns:: sum((x - dMean)^2) / (n - 1)

See also:: Average

Definition at line 173 of file statistics.h.

template<class tType >

static double Sleipnir::CStatistics::Variance	(	const std::vector< tType > &	vecValues,
		double	dMean
	)		`[inline, static]`

Calculate the sample variance of a given vector.

Parameters:

vecValues	Vector over which variance is estimated.
dMean	Mean of the sample or population.

Returns:: sum((x - dMean)^2) / (n - 1)

See also:: Average

Definition at line 201 of file statistics.h.

References Variance().

template<class tType >

static double Sleipnir::CStatistics::Variance ( const std::vector< tType > & vecValues ) [inline, static]

Calculate the sample variance of a given vector.

Parameters:

vecValues Vector over which variance is estimated.

Returns:: sum((x - xbar)^2) / (n - 1)

See also:: Average

Definition at line 220 of file statistics.h.

References Variance().

double Sleipnir::CStatistics::WilcoxonRankSum	(	const CDat &	DatData,
		const CDat &	DatAnswers,
		const std::vector< bool > &	vecfGenesOfInterest,
		bool	fInvert = `false`
	)		`[static]`

Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set.

Parameters:

DatData	Data or prediction set to evaluate.
DatAnswers	Answer set against which data is evaluated (values greater than zero are positive).
vecfGenesOfInterest	If nonempty, genes against which to perform process-specific evaluation.
fInvert	If true, use one minus data values.

Returns:: Wilcoxon Rank Sum p-value (AUC, area under ROC curve) of the given data evaluated against the given answers.

Calculates the AUC (equivalent to the Wilxocon Rank Sum p-value) of the given data set (or prediction set) against the given answers. The the set of genes of interest is nonempty, only positive pairs in which both genes are in the set or negative pairs where one gene is in the set will be scored. In psuedocode:

 For each gene pair i,j:
   If CMeta::IsNaN( dValue = DatData.Get( i, j ) ), continue
   If CMeta::IdNaN( dAnswer = DatAnswers.Get( i, j ) ), continue
   fAnswer = ( dAnswer > 0 )
   If vecfGenesOfInterest is nonempty:
     If fAnswer and !( vecfGenesOfInterest[ i ] && vecfGenesOfInterest[ j ] ), continue
     If !fAnswer and !( vecfGenesOfInterest[ i ] || vecfGenesOfInterest[ j ] ), continue
   Add the pair (dValue, fAnswer) to the list to score
 Evaluate the AUC/Wilcoxon Rank Sum p-value of the resulting value/answer list

The AUC is evaluated by sorting the pair list by value, summing the ranks of positive pairs, counting the numbers of positive and negative pairs, and calculating (sum - (positive * (positive - 1)) / 2) / positive / negative.

Remarks:: DatData and DatAnswers must be of exactly the same size and have the same gene lists. If vecfGenesOfInterest is nonempty, it must also be of the same size and refer to the same gene list.

Definition at line 975 of file statistics.cpp.

References Sleipnir::CDat::Get(), Sleipnir::CDat::GetGene(), Sleipnir::CDat::GetGenes(), and Sleipnir::CMeta::IsNaN().

double Sleipnir::CStatistics::WilcoxonRankSum	(	const CDat &	DatData,
		const CDat &	DatAnswers,
		const std::vector< bool > &	vecfGenesOfInterest,
		const std::vector< bool > &	vecfUbik,
		bool	fPosIn,
		bool	fNegIn,
		bool	fPosBridge,
		bool	fNegBridge,
		bool	fPosOut,
		bool	fNegOut,
		bool	fInvert = `false`
	)		`[static]`

Calculate the Wilcoxon Rank Sum p-value (AUC) of a given data (or prediction) set relative to an answer set. Allows for ubiquitous genes.

Parameters:

DatData	Data or prediction set to evaluate.
DatAnswers	Answer set against which data is evaluated (values greater than zero are positive).
vecfGenesOfInterest	If nonempty, genes against which to perform process-specific evaluation.
vecfUbik	If nonempty, ubiquitous genes.
fPosIn	Should positives within the vecfGenesOfInterest be used?
fNegIn	Should negatives within the vecfGenesOfInterest be used?
fPosBridge	Should bridging (between ctxt and outside if empty vecfUbik or between ctxt and Ubik if non-empty) positives be used
fNegBridge	Should bridging (between ctxt and outside if empty vecfUbik or between ctxt and Ubik if non-empty) negatives be used
fPosOut	Should positive edges outside the context (non in/bridging) be used?
fNegOut	Should negative edges outside the context (non in/bridging) be used)
fInvert	If true, use one minus data values.

Returns:: Wilcoxon Rank Sum p-value (AUC, area under ROC curve) of the given data evaluated against the given answers.

Calculates the AUC (equivalent to the Wilxocon Rank Sum p-value) of the given data set (or prediction set) against the given answers. The the set of genes of interest is nonempty, only positive pairs in which both genes are in the set or negative pairs where one gene is in the set will be scored. In psuedocode:

 For each gene pair i,j:
   If CMeta::IsNaN( dValue = DatData.Get( i, j ) ), continue
   If CMeta::IdNaN( dAnswer = DatAnswers.Get( i, j ) ), continue
   fAnswer = ( dAnswer > 0 )
   If vecfGenesOfInterest is nonempty:
     If fAnswer and !( vecfGenesOfInterest[ i ] && vecfGenesOfInterest[ j ] ), continue
     If !fAnswer and !( vecfGenesOfInterest[ i ] || vecfGenesOfInterest[ j ] ), continue
   Add the pair (dValue, fAnswer) to the list to score
 Evaluate the AUC/Wilcoxon Rank Sum p-value of the resulting value/answer list

The AUC is evaluated by sorting the pair list by value, summing the ranks of positive pairs, counting the numbers of positive and negative pairs, and calculating (sum - (positive * (positive - 1)) / 2) / positive / negative.

Remarks:: DatData and DatAnswers must be of exactly the same size and have the same gene lists. If vecfGenesOfInterest is nonempty, it must also be of the same size and refer to the same gene list.

Definition at line 1187 of file statistics.cpp.

References Sleipnir::CDat::Get(), Sleipnir::CDat::GetGene(), Sleipnir::CDat::GetGenes(), Sleipnir::CMeta::IsNaN(), and Sleipnir::CMeta::SkipEdge().

template<class tType >

static bool Sleipnir::CStatistics::Winsorize	(	std::vector< tType > &	vecValues,
		size_t	iCount = `1`
	)		`[inline, static]`

Winsorizes the requested number of items at each end of the given vector.

Parameters:

vecValues	Vector to be sorted and Winsorized.
iCount	Number of items at each end of the vector to be Winsorized.

Returns:: True if Winsorization was necessary, false if there were too few values.

Winsorization is the process of replacing the n largest and smallest elements of a sequence with copies of the n-1st largest and n-1st smallest, respectively. For example, Winsorizing the sequence [1, 2, 3, 4, 5, 6, 7, 8] by two would return [3, 3, 3, 4, 5, 6, 6, 6]. This is a standard method for finding a robust average in the presence of outliers.

Remarks:: Note that the vector is modified (sorted) by this function.

Definition at line 356 of file statistics.h.

template<class tType >

static bool Sleipnir::CStatistics::Winsorize	(	tType	pBegin,
		tType	pEnd,
		size_t	iCount = `1`
	)		`[inline, static]`

Winsorizes the requested number of items at each end of the given sequence.

Parameters:

pBegin	Pointer/iterator to the beginning of the sequence.
pEnd	Pointer/iterator to the end of the sequence.
iCount	Number of items at each end of the sequence to be Winsorized.

Returns:: True if Winsorization was necessary, false if there were too few values.

Winsorization is the process of replacing the n largest and smallest elements of a sequence with copies of the n-1st largest and n-1st smallest, respectively. For example, Winsorizing the sequence [1, 2, 3, 4, 5, 6, 7, 8] by two would return [3, 3, 3, 4, 5, 6, 6, 6]. This is a standard method for finding a robust average in the presence of outliers.

Remarks:: Note that the sequence is modified (sorted) by this function.

Definition at line 386 of file statistics.h.

template<class tType >

static double Sleipnir::CStatistics::ZScore	(	const std::vector< tType > &	vecOne,
		const std::vector< tType > &	vecTwo
	)		`[inline, static]`

Calculates the z-score effect size measure.

Parameters:

vecOne	First array to be compared.
vecTwo	Second array to be compared.

Returns:: Z-score effect size measurement of the difference between input arrays.

For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2).

See also:: CohensD

Definition at line 505 of file statistics.h.

template<class tType >

static double Sleipnir::CStatistics::ZScore	(	const tType	BeginOne,
		const tType	EndOne,
		const tType	BeginTwo,
		const tType	EndTwo,
		double *	pdAverage = `NULL`
	)		`[inline, static]`

Calculates the z-score effect size measure.

Parameters:

BeginOne	First element of the first array to be compared.
EndOne	End of last element of the first array to be compared.
BeginTwo	First element of the second array to be compared.
EndTwo	End of last element of the second array to be compared.
pdAverage	If non-null, output average value of elements in both arrays.

Returns:: Z-score effect size measurement of the difference between input arrays.

For two input arrays A1 and A2 with means m(Ai) and standard deviations s(Ai), the standard z-score is (m(A1)-m(A2))/s(A1 u A2).

See also:: CohensD

Definition at line 541 of file statistics.h.

static double Sleipnir::CStatistics::ZTest	(	double	dZScore,
		size_t	iN
	)		`[inline, static]`

Returns the z-test p-value of the given z-score and element count.

Parameters:

dZScore	Z-score to be converted into a p-value.
iN	Number of elements used to generate the given z-score.

Returns:: P-value equivalent to the given z-score and element count.

Remarks:: Equivalent to 1 - normcdf( 0, sqrt( iN ), |dZScore| )

See also:: ZScore

Definition at line 579 of file statistics.h.

References Normal01CDF().

The documentation for this class was generated from the following files:

src/statistics.h
src/statistics.cpp

Static Public Member Functions

Detailed Description

Member Function Documentation