// PearsonsCorrelation.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math4.legacy.stat.correlation;
import org.apache.commons.statistics.distribution.TDistribution;
import org.apache.commons.math4.legacy.exception.DimensionMismatchException;
import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException;
import org.apache.commons.math4.legacy.exception.NullArgumentException;
import org.apache.commons.math4.legacy.exception.util.LocalizedFormats;
import org.apache.commons.math4.legacy.linear.BlockRealMatrix;
import org.apache.commons.math4.legacy.linear.RealMatrix;
import org.apache.commons.math4.legacy.stat.regression.SimpleRegression;
import org.apache.commons.math4.core.jdkmath.JdkMath;
/**
 * Computes Pearson's product-moment correlation coefficients for pairs of arrays
 * or columns of a matrix.
 *
 * <p>The constructors that take <code>RealMatrix</code> or
 * <code>double[][]</code> arguments generate correlation matrices. The
 * columns of the input matrices are assumed to represent variable values.
 * Correlations are given by the formula</p>
 *
 * <p><code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
 * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
 * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p>
 *
 * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()}
 * to construct an instance with no data and then {@link #correlation(double[], double[])}.
 * Correlation matrices can also be computed directly from an instance with no data using
 * {@link #computeCorrelationMatrix(double[][])}. However, in order to use {@link #getCorrelationMatrix()},
 * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}, one of the
 * constructors supplying data or a covariance matrix must be used to create the instance.</p>
 *
 * @since 2.0
 */
public class PearsonsCorrelation {
    /** Correlation matrix; {@code null} when the instance was created without data. */
    private final RealMatrix correlationMatrix;
    /** Number of observations; {@code 0} when the instance was created without data. */
    private final int nObs;

    /**
     * Create a PearsonsCorrelation instance without data.
     */
    public PearsonsCorrelation() {
        correlationMatrix = null;
        nObs = 0;
    }

    /**
     * Create a PearsonsCorrelation from a rectangular array
     * whose columns represent values of variables to be correlated.
     *
     * Throws MathIllegalArgumentException if the input array does not have at least
     * two columns and two rows. Pairwise correlations are set to NaN if one
     * of the correlates has zero variance.
     *
     * @param data rectangular array with columns representing variables
     * @throws MathIllegalArgumentException if the input data array is not
     * rectangular with at least two rows and two columns.
     * @see #correlation(double[], double[])
     */
    public PearsonsCorrelation(double[][] data) {
        this(new BlockRealMatrix(data));
    }

    /**
     * Create a PearsonsCorrelation from a RealMatrix whose columns
     * represent variables to be correlated.
     *
     * Throws MathIllegalArgumentException if the matrix does not have at least
     * two columns and two rows. Pairwise correlations are set to NaN if one
     * of the correlates has zero variance.
     *
     * @param matrix matrix with columns representing variables to correlate
     * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
     * @see #correlation(double[], double[])
     */
    public PearsonsCorrelation(RealMatrix matrix) {
        nObs = matrix.getRowDimension();
        correlationMatrix = computeCorrelationMatrix(matrix);
    }

    /**
     * Create a PearsonsCorrelation from a {@link Covariance}. The correlation
     * matrix is computed by scaling the Covariance's covariance matrix.
     * The Covariance instance must have been created from a data matrix with
     * columns representing variable values.
     *
     * @param covariance Covariance instance
     * @throws NullArgumentException if the covariance matrix held by
     * {@code covariance} is {@code null}
     */
    public PearsonsCorrelation(Covariance covariance) {
        final RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
        if (covarianceMatrix == null) {
            throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX);
        }
        nObs = covariance.getN();
        correlationMatrix = covarianceToCorrelation(covarianceMatrix);
    }

    /**
     * Create a PearsonsCorrelation from a covariance matrix. The correlation
     * matrix is computed by scaling the covariance matrix.
     *
     * <p>The number of observations is stored as given, without validation; it is
     * only used by {@link #getCorrelationStandardErrors()} and
     * {@link #getCorrelationPValues()}.</p>
     *
     * @param covarianceMatrix covariance matrix
     * @param numberOfObservations the number of observations in the dataset used to compute
     * the covariance matrix
     */
    public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
        nObs = numberOfObservations;
        correlationMatrix = covarianceToCorrelation(covarianceMatrix);
    }

    /**
     * Returns the correlation matrix.
     *
     * <p>This method will return null if the no-argument constructor was used
     * to create this instance, even if {@link #computeCorrelationMatrix(double[][])}
     * has been called before this method is invoked.</p>
     *
     * @return correlation matrix, or {@code null} if this instance was created with no data
     */
    public RealMatrix getCorrelationMatrix() {
        return correlationMatrix;
    }

    /**
     * Returns a matrix of standard errors associated with the estimates
     * in the correlation matrix.<br>
     * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
     * error associated with <code>getCorrelationMatrix().getEntry(i,j)</code>
     *
     * <p>The formula used to compute the standard error is <br>
     * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
     * where <code>r</code> is the estimated correlation coefficient and
     * <code>n</code> is the number of observations in the source dataset.
     * No check is made on <code>n</code>; the formula is only meaningful
     * when <code>n &gt; 2</code>.</p>
     *
     * <p>To use this method, one of the constructors that supply an input
     * matrix must have been used to create this instance.</p>
     *
     * @return matrix of correlation standard errors
     * @throws NullPointerException if this instance was created with no data.
     */
    public RealMatrix getCorrelationStandardErrors() {
        final int nVars = correlationMatrix.getColumnDimension();
        // Promoted to double once; same value the division used on every entry before.
        final double degreesOfFreedom = nObs - 2;
        final double[][] out = new double[nVars][nVars];
        for (int i = 0; i < nVars; i++) {
            for (int j = 0; j < nVars; j++) {
                final double r = correlationMatrix.getEntry(i, j);
                out[i][j] = JdkMath.sqrt((1 - r * r) / degreesOfFreedom);
            }
        }
        return new BlockRealMatrix(out);
    }

    /**
     * Returns a matrix of p-values associated with the (two-sided) null
     * hypothesis that the corresponding correlation coefficient is zero.
     *
     * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
     * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
     * a value with absolute value greater than or equal to <br>
     * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
     *
     * <p>The values in the matrix are sometimes referred to as the
     * <i>significance</i> of the corresponding correlation coefficients.
     * Diagonal entries are always set to zero.</p>
     *
     * <p>To use this method, one of the constructors that supply an input
     * matrix must have been used to create this instance.</p>
     *
     * @return matrix of p-values
     * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException
     * if an error occurs estimating probabilities
     * @throws NullPointerException if this instance was created with no data.
     * @throws IllegalArgumentException if this instance holds data but
     * <code>n - 2</code> is not a valid (strictly positive) number of degrees
     * of freedom for the t distribution
     */
    public RealMatrix getCorrelationPValues() {
        // Dereference the correlation matrix before building the distribution so
        // that an instance created without data fails with the documented
        // NullPointerException instead of an error about the degrees of freedom.
        final int nVars = correlationMatrix.getColumnDimension();
        final int degreesOfFreedom = nObs - 2;
        final TDistribution tDistribution = TDistribution.of(degreesOfFreedom);
        final double[][] out = new double[nVars][nVars];
        for (int i = 0; i < nVars; i++) {
            for (int j = 0; j < nVars; j++) {
                if (i == j) {
                    out[i][j] = 0d;
                } else {
                    final double r = correlationMatrix.getEntry(i, j);
                    final double t = JdkMath.abs(r * JdkMath.sqrt(degreesOfFreedom / (1 - r * r)));
                    // Two-sided test: twice the lower-tail probability at -|t|.
                    out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
                }
            }
        }
        return new BlockRealMatrix(out);
    }

    /**
     * Computes the correlation matrix for the columns of the
     * input matrix, using {@link #correlation(double[], double[])}.
     *
     * Throws MathIllegalArgumentException if the matrix does not have at least
     * two columns and two rows. Pairwise correlations are set to NaN if one
     * of the correlates has zero variance.
     *
     * @param matrix matrix with columns representing variables to correlate
     * @return correlation matrix
     * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
     * @see #correlation(double[], double[])
     */
    public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
        checkSufficientData(matrix);
        final int nVars = matrix.getColumnDimension();
        final RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
        // checkSufficientData guarantees nVars >= 2, so entry (0, 0) exists.
        // Row 0 has no pairs below the diagonal, hence no column to extract.
        outMatrix.setEntry(0, 0, 1d);
        for (int i = 1; i < nVars; i++) {
            // Extract column i once per row rather than once per (i, j) pair.
            final double[] columnI = matrix.getColumn(i);
            for (int j = 0; j < i; j++) {
                final double corr = correlation(columnI, matrix.getColumn(j));
                // The correlation matrix is symmetric: fill both triangles.
                outMatrix.setEntry(i, j, corr);
                outMatrix.setEntry(j, i, corr);
            }
            outMatrix.setEntry(i, i, 1d);
        }
        return outMatrix;
    }

    /**
     * Computes the correlation matrix for the columns of the
     * input rectangular array. The columns of the array represent values
     * of variables to be correlated.
     *
     * Throws MathIllegalArgumentException if the matrix does not have at least
     * two columns and two rows or if the array is not rectangular. Pairwise
     * correlations are set to NaN if one of the correlates has zero variance.
     *
     * @param data matrix with columns representing variables to correlate
     * @return correlation matrix
     * @throws MathIllegalArgumentException if the array does not contain sufficient data
     * @see #correlation(double[], double[])
     */
    public RealMatrix computeCorrelationMatrix(double[][] data) {
        return computeCorrelationMatrix(new BlockRealMatrix(data));
    }

    /**
     * Computes the Pearson's product-moment correlation coefficient between two arrays.
     *
     * <p>Throws MathIllegalArgumentException if the arrays do not have the same length
     * or their common length is less than 2. Returns {@code NaN} if either of the arrays
     * has zero variance (i.e., if one of the arrays does not contain at least two distinct
     * values).</p>
     *
     * @param xArray first data array
     * @param yArray second data array
     * @return Returns Pearson's correlation coefficient for the two arrays
     * @throws DimensionMismatchException if the arrays lengths do not match
     * @throws MathIllegalArgumentException if there is insufficient data
     */
    public double correlation(final double[] xArray, final double[] yArray) {
        // Validate before allocating the regression helper.
        if (xArray.length != yArray.length) {
            throw new DimensionMismatchException(xArray.length, yArray.length);
        }
        if (xArray.length < 2) {
            throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION,
                                                   xArray.length, 2);
        }
        final SimpleRegression regression = new SimpleRegression();
        for (int i = 0; i < xArray.length; i++) {
            regression.addData(xArray[i], yArray[i]);
        }
        return regression.getR();
    }

    /**
     * Derives a correlation matrix from a covariance matrix.
     *
     * <p>Uses the formula <br>
     * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
     * <code>r(&middot;,&middot;)</code> is the correlation coefficient and
     * <code>s(&middot;)</code> means standard deviation.</p>
     *
     * <p>The standard deviations are taken as the square roots of the diagonal
     * entries. Only the entries on and below the diagonal of the input are read;
     * the result is filled symmetrically with ones on the diagonal. The input is
     * not validated.</p>
     *
     * @param covarianceMatrix the covariance matrix
     * @return correlation matrix
     */
    public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
        final int nVars = covarianceMatrix.getColumnDimension();
        final RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
        // Compute each standard deviation once instead of once per pair.
        final double[] sigma = new double[nVars];
        for (int i = 0; i < nVars; i++) {
            sigma[i] = JdkMath.sqrt(covarianceMatrix.getEntry(i, i));
        }
        for (int i = 0; i < nVars; i++) {
            outMatrix.setEntry(i, i, 1d);
            for (int j = 0; j < i; j++) {
                final double entry = covarianceMatrix.getEntry(i, j) /
                                     (sigma[i] * sigma[j]);
                outMatrix.setEntry(i, j, entry);
                outMatrix.setEntry(j, i, entry);
            }
        }
        return outMatrix;
    }

    /**
     * Throws MathIllegalArgumentException if the matrix does not have at least
     * two columns and two rows.
     *
     * @param matrix matrix to check for sufficiency
     * @throws MathIllegalArgumentException if there is insufficient data
     */
    private static void checkSufficientData(final RealMatrix matrix) {
        final int nRows = matrix.getRowDimension();
        final int nCols = matrix.getColumnDimension();
        if (nRows < 2 || nCols < 2) {
            throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
                                                   nRows, nCols);
        }
    }
}