The code is an extension of the method of Knuth and Welford for computing standard deviation in one pass through the data. It computes skewness and kurtosis as well with a similar interface. In addition to only requiring one pass through the data, the algorithm is numerically stable and accurate.
To use the code, create a RunningStats object and use the Push method to insert data values. After accruing your data, you can obtain sample statistics by calling one of these methods:
MeanVarianceStandardDeviationSkewnessKurtosis
You can also combine two RunningStats objects by using the + and += operators. For example, you might accrue data on several different threads in parallel then add their RunningStats objects together to create a single object with the state that it would have had if all the data had been accumulated by it alone.
Update: Here is analogous code for computing a simple linear regression on data in one pass.
Update: Here is an implementation and extension in Julia by John Myles White.
Here is the header file RunningStats.h:
#ifndef RUNNINGSTATS_H
#define RUNNINGSTATS_H
class RunningStats
{
public:
RunningStats();
void Clear();
void Push(double x);
long long NumDataValues() const;
double Mean() const;
double Variance() const;
double StandardDeviation() const;
double Skewness() const;
double Kurtosis() const;
friend RunningStats operator+(const RunningStats a, const RunningStats b);
RunningStats& operator+=(const RunningStats &rhs);
private:
long long n;
double M1, M2, M3, M4;
};
#endif
And here is the implementation file RunningStats.cpp.
#include "RunningStats.h"
#include <cmath>
#include <vector>
RunningStats::RunningStats()
{
Clear();
}
void RunningStats::Clear()
{
n = 0;
M1 = M2 = M3 = M4 = 0.0;
}
void RunningStats::Push(double x)
{
double delta, delta_n, delta_n2, term1;
long long n1 = n;
n++;
delta = x - M1;
delta_n = delta / n;
delta_n2 = delta_n * delta_n;
term1 = delta * delta_n * n1;
M1 += delta_n;
M4 += term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3;
M3 += term1 * delta_n * (n - 2) - 3 * delta_n * M2;
M2 += term1;
}
long long RunningStats::NumDataValues() const
{
return n;
}
double RunningStats::Mean() const
{
return M1;
}
double RunningStats::Variance() const
{
return M2/(n-1.0);
}
double RunningStats::StandardDeviation() const
{
return sqrt( Variance() );
}
double RunningStats::Skewness() const
{
return sqrt(double(n)) * M3/ pow(M2, 1.5);
}
double RunningStats::Kurtosis() const
{
return double(n)*M4 / (M2*M2) - 3.0;
}
RunningStats operator+(const RunningStats a, const RunningStats b)
{
RunningStats combined;
combined.n = a.n + b.n;
double delta = b.M1 - a.M1;
double delta2 = delta*delta;
double delta3 = delta*delta2;
double delta4 = delta2*delta2;
combined.M1 = (a.n*a.M1 + b.n*b.M1) / combined.n;
combined.M2 = a.M2 + b.M2 +
delta2 * a.n * b.n / combined.n;
combined.M3 = a.M3 + b.M3 +
delta3 * a.n * b.n * (a.n - b.n)/(combined.n*combined.n);
combined.M3 += 3.0*delta * (a.n*b.M2 - b.n*a.M2) / combined.n;
combined.M4 = a.M4 + b.M4 + delta4*a.n*b.n * (a.n*a.n - a.n*b.n + b.n*b.n) /
(combined.n*combined.n*combined.n);
combined.M4 += 6.0*delta2 * (a.n*a.n*b.M2 + b.n*b.n*a.M2)/(combined.n*combined.n) +
4.0*delta*(a.n*b.M3 - b.n*a.M3) / combined.n;
return combined;
}
RunningStats& RunningStats::operator+=(const RunningStats& rhs)
{
RunningStats combined = *this + rhs;
*this = combined;
return *this;
}
References
- Wikipedia
- Timothy B. Terriberry. Computing Higher-Order Moments Online.
- Philippe Pébay. SANDIA REPORT SAND2008-6212 (2008). Formulas for Robust, One-Pass Parallel Computation of Co- variances and Arbitrary-Order Statistical Moments.