001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.statistics.inference;
018
019import org.apache.commons.statistics.descriptive.LongMean;
020import org.apache.commons.statistics.distribution.ChiSquaredDistribution;
021
022/**
023 * Implements chi-square test statistics.
024 *
025 * <p>This implementation handles both known and unknown distributions.
026 *
027 * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
028 * but provided by one sample, or when the hypothesis under test is that the two
029 * samples come from the same underlying distribution.
030 *
031 * @see <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-square test (Wikipedia)</a>
032 * @since 1.1
033 */
034public final class ChiSquareTest {
035    /** Name for the row. */
036    private static final String ROW = "row";
037    /** Name for the column. */
038    private static final String COLUMN = "column";
039    /** Default instance. */
040    private static final ChiSquareTest DEFAULT = new ChiSquareTest(0);
041
042    /** Degrees of freedom adjustment. */
043    private final int degreesOfFreedomAdjustment;
044
045    /**
046     * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
047     */
048    private ChiSquareTest(int degreesOfFreedomAdjustment) {
049        this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
050    }
051
052    /**
053     * Return an instance using the default options.
054     *
055     * <ul>
056     * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
057     * </ul>
058     *
059     * @return default instance
060     */
061    public static ChiSquareTest withDefaults() {
062        return DEFAULT;
063    }
064
065    /**
066     * Return an instance with the configured degrees of freedom adjustment.
067     *
068     * <p>The default degrees of freedom for a sample of length {@code n} are
069     * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
070     * more parameters from the data in order to get the numbers for your null
071     * hypothesis. For a distribution with {@code p} parameters where up to
072     * {@code p} parameters have been estimated from the data the degrees of freedom
073     * is in the range {@code [n - 1 - p, n - 1]}.
074     *
075     * @param v Value.
076     * @return an instance
077     * @throws IllegalArgumentException if the value is negative
078     */
079    public ChiSquareTest withDegreesOfFreedomAdjustment(int v) {
080        return new ChiSquareTest(Arguments.checkNonNegative(v));
081    }
082
083    /**
084     * Computes the chi-square goodness-of-fit statistic comparing the {@code observed} counts to a
085     * uniform expected value (each category is equally likely).
086     *
087     * <p>Note: This is a specialized version of a comparison of {@code observed}
088     * with an {@code expected} array of uniform values. The result is faster than
089     * calling {@link #statistic(double[], long[])} and the statistic is the same,
090     * with an allowance for accumulated floating-point error due to the optimized
091     * routine.
092     *
093     * @param observed Observed frequency counts.
094     * @return Chi-square statistic
095     * @throws IllegalArgumentException if the sample size is less than 2;
096     * {@code observed} has negative entries; or all the observations are zero.
097     * @see #test(long[])
098     */
099    public double statistic(long[] observed) {
100        Arguments.checkValuesRequiredSize(observed.length, 2);
101        Arguments.checkNonNegative(observed);
102        final double e = LongMean.of(observed).getAsDouble();
103        if (e == 0) {
104            throw new InferenceException(InferenceException.NO_DATA);
105        }
106        // chi2 = sum{ (o-e)^2 / e }. Use a single division at the end.
107        double chi2 = 0;
108        for (final long o : observed) {
109            final double d = o - e;
110            chi2 += d * d;
111        }
112        return chi2 / e;
113    }
114
115    /**
116     * Computes the chi-square goodness-of-fit statistic comparing {@code observed} and
117     * {@code expected} frequency counts.
118     *
119     * <p><strong>Note:</strong>This implementation rescales the {@code expected}
120     * array if necessary to ensure that the sum of the expected and observed counts
121     * are equal.
122     *
123     * @param expected Expected frequency counts.
124     * @param observed Observed frequency counts.
125     * @return Chi-square statistic
126     * @throws IllegalArgumentException if the sample size is less than 2; the array
127     * sizes do not match; {@code expected} has entries that are not strictly
128     * positive; {@code observed} has negative entries; or all the observations are zero.
129     * @see #test(double[], long[])
130     */
131    public double statistic(double[] expected, long[] observed) {
132        final double ratio = StatisticUtils.computeRatio(expected, observed);
133        // chi2 = sum{ (o-e)^2 / e }
134        double chi2 = 0;
135        for (int i = 0; i < observed.length; i++) {
136            final double e = ratio * expected[i];
137            final double d = observed[i] - e;
138            chi2 += d * d / e;
139        }
140        return chi2;
141    }
142
143    /**
144     * Computes the chi-square statistic associated with a chi-square test of
145     * independence based on the input {@code counts} array, viewed as a two-way
146     * table in row-major format.
147     *
148     * @param counts 2-way table.
149     * @return Chi-square statistic
150     * @throws IllegalArgumentException if the number of rows or columns is less
151     * than 2; the array is non-rectangular; the array has negative entries; or the
152     * sum of a row or column is zero.
153     * @see #test(long[][])
154     */
155    public double statistic(long[][] counts) {
156        Arguments.checkCategoriesRequiredSize(counts.length, 2);
157        Arguments.checkValuesRequiredSize(counts[0].length, 2);
158        Arguments.checkRectangular(counts);
159        Arguments.checkNonNegative(counts);
160
161        final int nRows = counts.length;
162        final int nCols = counts[0].length;
163
164        // compute row, column and total sums
165        final double[] rowSum = new double[nRows];
166        final double[] colSum = new double[nCols];
167        double sum = 0;
168        for (int row = 0; row < nRows; row++) {
169            for (int col = 0; col < nCols; col++) {
170                rowSum[row] += counts[row][col];
171                colSum[col] += counts[row][col];
172            }
173            checkNonZero(rowSum[row], ROW, row);
174            sum += rowSum[row];
175        }
176
177        for (int col = 0; col < nCols; col++) {
178            checkNonZero(colSum[col], COLUMN, col);
179        }
180
181        // Compute expected counts and chi-square
182        double chi2 = 0;
183        for (int row = 0; row < nRows; row++) {
184            for (int col = 0; col < nCols; col++) {
185                final double e = (rowSum[row] * colSum[col]) / sum;
186                final double d = counts[row][col] - e;
187                chi2 += d * d / e;
188            }
189        }
190        return chi2;
191    }
192
193    /**
194     * Computes a chi-square statistic associated with a chi-square test of
195     * independence of frequency counts in {@code observed1} and {@code observed2}.
196     * The sums of frequency counts in the two samples are not required to be the
197     * same. The formula used to compute the test statistic is:
198     *
199     * <p>\[ \sum_i{ \frac{(K * a_i - b_i / K)^2}{a_i + b_i} } \]
200     *
201     * <p>where
202     *
203     * <p>\[ K = \sqrt{ \sum_i{a_i} / \sum_i{b_i} } \]
204     *
205     * <p>Note: This is a specialized version of a 2-by-n contingency table. The
206     * result is faster than calling {@link #statistic(long[][])} with the table
207     * composed as {@code new long[][]{observed1, observed2}}. The statistic is the
208     * same, with an allowance for accumulated floating-point error due to the
209     * optimized routine.
210     *
211     * @param observed1 Observed frequency counts of the first data set.
212     * @param observed2 Observed frequency counts of the second data set.
213     * @return Chi-square statistic
214     * @throws IllegalArgumentException if the sample size is less than 2; the array
215     * sizes do not match; either array has entries that are negative; either all
216     * counts of {@code observed1} or {@code observed2} are zero; or if the count at
217     * some index is zero for both arrays.
218     * @see ChiSquareTest#test(long[], long[])
219     */
220    public double statistic(long[] observed1, long[] observed2) {
221        Arguments.checkValuesRequiredSize(observed1.length, 2);
222        Arguments.checkValuesSizeMatch(observed1.length, observed2.length);
223        Arguments.checkNonNegative(observed1);
224        Arguments.checkNonNegative(observed2);
225
226        // Compute and compare count sums
227        long colSum1 = 0;
228        long colSum2 = 0;
229        for (int i = 0; i < observed1.length; i++) {
230            final long obs1 = observed1[i];
231            final long obs2 = observed2[i];
232            checkNonZero(obs1 | obs2, ROW, i);
233            colSum1 += obs1;
234            colSum2 += obs2;
235        }
236        // Create the same exception message as chiSquare(long[][])
237        checkNonZero(colSum1, COLUMN, 0);
238        checkNonZero(colSum2, COLUMN, 1);
239
240        // Compare and compute weight only if different
241        final boolean unequalCounts = colSum1 != colSum2;
242        final double weight = unequalCounts ?
243            Math.sqrt((double) colSum1 / colSum2) : 1;
244        // Compute chi-square
245        // This exploits an algebraic rearrangement of the generic n*m contingency table case
246        // for a single sum squared addition per row.
247        double chi2 = 0;
248        for (int i = 0; i < observed1.length; i++) {
249            final double obs1 = observed1[i];
250            final double obs2 = observed2[i];
251            // apply weights
252            final double d = unequalCounts ?
253                    obs1 / weight - obs2 * weight :
254                    obs1 - obs2;
255            chi2 += (d * d) / (obs1 + obs2);
256        }
257        return chi2;
258    }
259
260    /**
261     * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that
262     * the {@code observed} counts conform to a uniform distribution (each category
263     * is equally likely).
264     *
265     * @param observed Observed frequency counts.
266     * @return test result
267     * @throws IllegalArgumentException if the sample size is less than 2;
268     * {@code observed} has negative entries; or all the observations are zero
269     * @see #statistic(long[])
270     */
271    public SignificanceResult test(long[] observed) {
272        final int df = observed.length - 1;
273        final double chi2 = statistic(observed);
274        final double p = computeP(chi2, df);
275        return new BaseSignificanceResult(chi2, p);
276    }
277
278    /**
279     * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that the
280     * {@code observed} counts conform to the {@code expected} counts.
281     *
282     * <p>The test can be configured to apply an adjustment to the degrees of freedom
283     * if the observed data has been used to create the expected counts.
284     *
285     * @param expected Expected frequency counts.
286     * @param observed Observed frequency counts.
287     * @return test result
288     * @throws IllegalArgumentException if the sample size is less than 2; the array
289     * sizes do not match; {@code expected} has entries that are not strictly
290     * positive; {@code observed} has negative entries; all the observations are zero; or
291     * the adjusted degrees of freedom are not strictly positive
292     * @see #withDegreesOfFreedomAdjustment(int)
293     * @see #statistic(double[], long[])
294     */
295    public SignificanceResult test(double[] expected, long[] observed) {
296        final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
297        final double chi2 = statistic(expected, observed);
298        final double p = computeP(chi2, df);
299        return new BaseSignificanceResult(chi2, p);
300    }
301
302    /**
303     * Perform a chi-square test of independence based on the input {@code counts} array,
304     * viewed as a two-way table.
305     *
306     * @param counts 2-way table.
307     * @return test result
308     * @throws IllegalArgumentException if the number of rows or columns is less
309     * than 2; the array is non-rectangular; the array has negative entries; or the
310     * sum of a row or column is zero.
311     * @see #statistic(long[][])
312     */
313    public SignificanceResult test(long[][] counts) {
314        final double chi2 = statistic(counts);
315        final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
316        final double p = computeP(chi2, df);
317        return new BaseSignificanceResult(chi2, p);
318    }
319
320    /**
321     * Perform a chi-square test of independence of frequency counts in
322     * {@code observed1} and {@code observed2}.
323     *
324     * <p>Note: This is a specialized version of a 2-by-n contingency table.
325     *
326     * @param observed1 Observed frequency counts of the first data set.
327     * @param observed2 Observed frequency counts of the second data set.
328     * @return test result
329     * @throws IllegalArgumentException if the sample size is less than 2; the array
330     * sizes do not match; either array has entries that are negative; either all
331     * counts of {@code observed1} or {@code observed2} are zero; or if the count at
332     * some index is zero for both arrays.
333     * @see #statistic(long[], long[])
334     */
335    public SignificanceResult test(long[] observed1, long[] observed2) {
336        final double chi2 = statistic(observed1, observed2);
337        final double p = computeP(chi2, observed1.length - 1.0);
338        return new BaseSignificanceResult(chi2, p);
339    }
340
341    /**
342     * Compute the chi-square test p-value.
343     *
344     * @param chi2 Chi-square statistic.
345     * @param degreesOfFreedom Degrees of freedom.
346     * @return p-value
347     */
348    private static double computeP(double chi2, double degreesOfFreedom) {
349        return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(chi2);
350    }
351
352    /**
353     * Check the array value is non-zero.
354     *
355     * @param value Value
356     * @param name Name of the array
357     * @param index Index in the array
358     * @throws IllegalArgumentException if the value is zero
359     */
360    private static void checkNonZero(double value, String name, int index) {
361        if (value == 0) {
362            throw new InferenceException(InferenceException.ZERO_AT, name, index);
363        }
364    }
365}