Computing linear regression in one pass

The RunningRegression class is the analog of the RunningStats class described here and uses that class. You add pairs of (x, y) values by using the Push method. At any point along the way you can call the Slope, Intercept, or Correlation functions to see the current value of these statistics.

You can also combine two RunningRegression objects by using the + and += operators. For example, you might accrue data on several different threads in parallel and then add their RunningRegression objects together.

Here is the header file RunningRegression.h:

    #ifndef RUNNINGREGRESSION
    #define RUNNINGREGRESSION

    #include "RunningStats.h"

    // One-pass accumulator for simple linear regression of y on x.
    // Feed it (x, y) pairs with Push(); Slope(), Intercept() and Correlation()
    // can be queried at any time and reflect the pairs pushed so far.
    class RunningRegression
    {
    public:
        // Starts out empty (the constructor calls Clear()).
        RunningRegression();

        // Discards all accumulated data.
        void Clear();

        // Adds one (x, y) observation.
        void Push(double x, double y);

        // Current pair count.
        long long NumDataValues() const;

        // Fitted line y = Intercept() + Slope() * x.
        double Slope() const;
        double Intercept() const;

        // Correlation between the x and y values pushed so far.
        double Correlation() const;

        // Merge two accumulators, e.g. partial results gathered separately.
        friend RunningRegression operator+(const RunningRegression a, const RunningRegression b);
        RunningRegression& operator+=(const RunningRegression &rhs);

    private:
        RunningStats x_stats;   // running statistics of the x values
        RunningStats y_stats;   // running statistics of the y values
        double S_xy;            // running sum of (x - mean_x) * (y - mean_y)
        long long n;            // number of (x, y) pairs accumulated
    };

    #endif

Here is the implementation file RunningRegression.cpp.

    #include "RunningRegression.h"
    
    // Builds an empty accumulator; all initialisation is delegated to Clear()
    // so construction and reset share a single code path.
    RunningRegression::RunningRegression()
    {
        this->Clear();
    }
    
    // Resets the accumulator: zeroes the pair count and the cross-product
    // sum, and clears the per-coordinate statistics.
    void RunningRegression::Clear()
    {
        n = 0;
        S_xy = 0.0;
        x_stats.Clear();
        y_stats.Clear();
    }
    
    // Adds one (x, y) observation.
    //   x, y - coordinates of the new data point.
    // The cross-product sum is updated from the means *before* the new point
    // is folded in, so this update must precede the pushes into x_stats and
    // y_stats.
    void RunningRegression::Push(double x, double y)
    {
        const double dev_x = x_stats.Mean() - x;
        const double dev_y = y_stats.Mean() - y;
        const double old_count = static_cast<double>(n);
        const double new_count = static_cast<double>(n + 1);
        S_xy += dev_x * dev_y * old_count / new_count;
        x_stats.Push(x);
        y_stats.Push(y);
        ++n;
    }
    
    long long RunningRegression::NumDataValues() const
    {
        return n;
    }
    
    double RunningRegression::Slope() const
    {
        double S_xx = x_stats.Variance()*(n - 1.0);
    
        return S_xy / S_xx;
    }
    
    double RunningRegression::Intercept() const
    {
        return y_stats.Mean() - Slope()*x_stats.Mean();
    }
    
    // Returns the correlation of the accumulated pairs, computed as
    // S_xy / ((n - 1) * sd_x * sd_y).
    double RunningRegression::Correlation() const
    {
        const double sd_product = x_stats.StandardDeviation() * y_stats.StandardDeviation();
        const double denominator = static_cast<double>(n - 1) * sd_product;
        return S_xy / denominator;
    }
    
    // Combines two accumulators into one that represents the union of their
    // data, as if every pair pushed into a and b had been pushed into a
    // single object.
    //   a, b - the partial results to merge (taken by value to match the
    //          friend declaration in the header).
    // Returns the merged accumulator; neither argument is modified.
    RunningRegression operator+(const RunningRegression a, const RunningRegression b)
    {
        // An empty operand contributes nothing, so return the other one
        // unchanged. This also avoids dividing by a combined count of zero
        // when both operands are empty, which would leave NaN in S_xy.
        if (a.n == 0)
        {
            return b;
        }
        if (b.n == 0)
        {
            return a;
        }

        RunningRegression combined;
        combined.x_stats = a.x_stats + b.x_stats;
        combined.y_stats = a.y_stats + b.y_stats;
        combined.n = a.n + b.n;

        // Pairwise update of the cross-product sum: the two partial sums
        // plus a correction for the difference between the two sets' means.
        const double delta_x = b.x_stats.Mean() - a.x_stats.Mean();
        const double delta_y = b.y_stats.Mean() - a.y_stats.Mean();

        // Multiply the counts as doubles; a.n * b.n in long long can
        // overflow (undefined behaviour) for very large data sets.
        const double count_product = static_cast<double>(a.n) * static_cast<double>(b.n);
        combined.S_xy = a.S_xy + b.S_xy +
            count_product * delta_x * delta_y / static_cast<double>(combined.n);

        return combined;
    }
    
    // Merges rhs into this accumulator by way of operator+ and returns a
    // reference to this object so that calls can be chained.
    RunningRegression& RunningRegression::operator+=(const RunningRegression &rhs)
    {
        *this = *this + rhs;
        return *this;
    }
    
    

This code depends on RunningStats.h and RunningStats.cpp which are available here.

Read more: Applied linear regression