/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import org.apache.commons.numbers.core.Sum;
import org.apache.commons.statistics.descriptive.LongMean;
import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

/**
 * Implements G-test (Generalized Log-Likelihood Ratio Test) statistics.
 *
 * <p>This is known in statistical genetics as the McDonald-Kreitman test.
 * The implementation handles both known and unknown distributions.
 *
 * <p>Two-sample tests can be used when the distribution is unknown <i>a priori</i>
 * but provided by one sample, or when the hypothesis under test is that the two
 * samples come from the same underlying distribution.
 *
 * @see <a href="https://en.wikipedia.org/wiki/G-test">G-test (Wikipedia)</a>
 * @since 1.1
 */
public final class GTest {
    // Note:
    // The g-test statistic is a summation of terms with positive and negative sign
    // and thus the sum may exhibit cancellation. This class uses separate high precision
    // sums of the positive and negative terms which are then combined.
    // Total cancellation for a large number of terms will not impact
    // p-values of interest around critical alpha values as the Chi^2
    // distribution exhibits strong concentration around its mean (degrees of freedom, k).
    // The summation need only maintain enough bits in the final sum to distinguish
    // g values around critical alpha values where 0 << chisq.sf(g, k) << 0.5: g > k,
    // with k = number of terms - 1.

    /** Default instance. */
    private static final GTest DEFAULT = new GTest(0);

    /** Degrees of freedom adjustment. */
    private final int degreesOfFreedomAdjustment;

    /**
     * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
     */
    private GTest(int degreesOfFreedomAdjustment) {
        this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
    }

    /**
     * Return an instance using the default options.
     *
     * <ul>
     * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
     * </ul>
     *
     * @return default instance
     */
    public static GTest withDefaults() {
        return DEFAULT;
    }

    /**
     * Return an instance with the configured degrees of freedom adjustment.
     *
     * <p>The default degrees of freedom for a sample of length {@code n} are
     * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
     * more parameters from the data in order to get the numbers for your null
     * hypothesis. For a distribution with {@code p} parameters where up to
     * {@code p} parameters have been estimated from the data, the degrees of freedom
     * are in the range {@code [n - 1 - p, n - 1]}.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the value is negative
     */
    public GTest withDegreesOfFreedomAdjustment(int v) {
        return new GTest(Arguments.checkNonNegative(v));
    }
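
    // Illustrative usage (comment-only sketch, not part of the class): obtaining
    // configured instances from the factory methods above. The adjustment of 1 is a
    // hypothetical choice for an intrinsic hypothesis where one parameter of the
    // expected distribution was estimated from the observed data.
    //
    //   GTest gTest = GTest.withDefaults();
    //   GTest adjusted = GTest.withDefaults().withDegreesOfFreedomAdjustment(1);
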
    /**
     * Computes the G-test goodness-of-fit statistic comparing the {@code observed} counts to
     * a uniform expected value (each category is equally likely).
     *
     * <p>Note: This is a specialized version of a comparison of {@code observed}
     * with an {@code expected} array of uniform values. It is faster than
     * calling {@link #statistic(double[], long[])} and the statistic is the same,
     * with an allowance for accumulated floating-point error due to the optimized
     * routine.
     *
     * @param observed Observed frequency counts.
     * @return G-test statistic
     * @throws IllegalArgumentException if the sample size is less than 2;
     * {@code observed} has negative entries; or all the observations are zero.
     * @see #test(long[])
     */
    public double statistic(long[] observed) {
        Arguments.checkValuesRequiredSize(observed.length, 2);
        Arguments.checkNonNegative(observed);
        final double e = LongMean.of(observed).getAsDouble();
        if (e == 0) {
            throw new InferenceException(InferenceException.NO_DATA);
        }
        // g = 2 * sum{o * ln(o/e)}
        //   = 2 * [ sum{o * ln(o)} - sum(o) * ln(e) ]
        // The second form has more cancellation as the sums are larger.
        // Separate sum for positive and negative terms.
        final Sum sum = Sum.create();
        final Sum sum2 = Sum.create();
        for (final double o : observed) {
            if (o > e) {
                // Positive term
                sum.add(o * Math.log(o / e));
            } else if (o > 0) {
                // Negative term
                // Process non-zero counts to avoid 0 * -inf = NaN
                sum2.add(o * Math.log(o / e));
            }
        }
        return sum.add(sum2).getAsDouble() * 2;
    }

    /**
     * Computes the G-test goodness-of-fit statistic comparing {@code observed} and {@code expected}
     * frequency counts.
     *
     * <p><strong>Note:</strong> This implementation rescales the values
     * if necessary to ensure that the sum of the expected and observed counts
     * are equal.
     *
     * @param expected Expected frequency counts.
     * @param observed Observed frequency counts.
     * @return G-test statistic
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; {@code expected} has entries that are not strictly
     * positive; {@code observed} has negative entries; or all the observations are zero.
     * @see #test(double[], long[])
     */
    public double statistic(double[] expected, long[] observed) {
        // g = 2 * sum{o * ln(o/e)}
        // The sum of o and e must be the same.
        final double ratio = StatisticUtils.computeRatio(expected, observed);
        // High precision sum to reduce cancellation.
        // Separate sum for positive and negative terms.
        final Sum sum = Sum.create();
        final Sum sum2 = Sum.create();
        for (int i = 0; i < observed.length; i++) {
            final long o = observed[i];
            // Process non-zero counts to avoid 0 * -inf = NaN
            if (o != 0) {
                final double term = o * Math.log(o / (ratio * expected[i]));
                if (term < 0) {
                    sum2.add(term);
                } else {
                    sum.add(term);
                }
            }
        }
        return sum.add(sum2).getAsDouble() * 2;
    }
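
    // Illustrative usage (comment-only sketch): computing the goodness-of-fit
    // statistic with the two methods above. The counts are hypothetical; the
    // expected array is rescaled internally when its sum differs from the
    // observed total.
    //
    //   long[] observed = {21, 18, 25, 16};
    //   // Uniform expected value (each category equally likely).
    //   double g1 = GTest.withDefaults().statistic(observed);
    //   // Explicit expected counts.
    //   double[] expected = {20, 20, 20, 20};
    //   double g2 = GTest.withDefaults().statistic(expected, observed);
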
    /**
     * Computes a G-test statistic associated with a G-test of
     * independence based on the input {@code counts} array, viewed as a two-way
     * table. The formula used to compute the test statistic is:
     *
     * <p>\[ G = 2 \cdot \sum_{ij}{O_{ij}} \cdot \left[ H(r) + H(c) - H(r,c) \right] \]
     *
     * <p>and \( H \) is the <a
     * href="https://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
     * Shannon Entropy</a> of the random variable formed by viewing the elements of
     * the argument array as incidence counts:
     *
     * <p>\[ H(X) = - {\sum_{x \in \text{Supp}(X)} p(x) \ln p(x)} \]
     *
     * @param counts 2-way table.
     * @return G-test statistic
     * @throws IllegalArgumentException if the number of rows or columns is less
     * than 2; the array is non-rectangular; the array has negative entries; or the
     * sum of a row or column is zero.
     * @see ChiSquareTest#test(long[][])
     */
    public double statistic(long[][] counts) {
        Arguments.checkCategoriesRequiredSize(counts.length, 2);
        Arguments.checkValuesRequiredSize(counts[0].length, 2);
        Arguments.checkRectangular(counts);
        Arguments.checkNonNegative(counts);

        final int ni = counts.length;
        final int nj = counts[0].length;

        // Compute row, column and total sums
        final double[] sumi = new double[ni];
        final double[] sumj = new double[nj];
        double n = 0;
        // We can sum data on the first pass. See below for computation details.
        final Sum sum = Sum.create();
        for (int i = 0; i < ni; i++) {
            for (int j = 0; j < nj; j++) {
                final long c = counts[i][j];
                sumi[i] += c;
                sumj[j] += c;
                if (c > 1) {
                    sum.add(c * Math.log(c));
                }
            }
            checkNonZero(sumi[i], "Row", i);
            n += sumi[i];
        }

        for (int j = 0; j < nj; j++) {
            checkNonZero(sumj[j], "Column", j);
        }

        // This computes a modified form of the Shannon entropy H without requiring
        // normalisation of observations to probabilities and without negation,
        // i.e. we compute n * [ H(r) + H(c) - H(r,c) ] as [ H'(r,c) - H'(r) - H'(c) ].

        // H  = -sum (p * log(p))
        // H' = n * sum (p * log(p))
        //    = n * sum (o/n * log(o/n))
        //    = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ]
        //    = sum(o * log(o)) - n log(n)

        // After 3 modified entropy sums H'(r,c) - H'(r) - H'(c) compensation is (-1 + 2) * n log(n)
        sum.addProduct(n, Math.log(n));
        // Negative terms
        final Sum sum2 = Sum.create();
        // All these counts are above zero so no check for zeros
        for (final double c : sumi) {
            sum2.add(c * -Math.log(c));
        }
        for (final double c : sumj) {
            sum2.add(c * -Math.log(c));
        }

        return sum.add(sum2).getAsDouble() * 2;
    }
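
    // Illustrative usage (comment-only sketch): the independence statistic for a
    // hypothetical 2x2 contingency table. Every row and column sum must be non-zero.
    //
    //   long[][] counts = {{43, 17}, {52, 24}};
    //   double g = GTest.withDefaults().statistic(counts);
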
    /**
     * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
     * counts conform to a uniform distribution (each category is equally likely).
     *
     * @param observed Observed frequency counts.
     * @return test result
     * @throws IllegalArgumentException if the sample size is less than 2;
     * {@code observed} has negative entries; or all the observations are zero
     * @see #statistic(long[])
     */
    public SignificanceResult test(long[] observed) {
        final int df = observed.length - 1;
        final double g = statistic(observed);
        final double p = computeP(g, df);
        return new BaseSignificanceResult(g, p);
    }

    /**
     * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
     * counts conform to the {@code expected} counts.
     *
     * <p>The test can be configured to apply an adjustment to the degrees of freedom
     * if the observed data has been used to create the expected counts.
     *
     * @param expected Expected frequency counts.
     * @param observed Observed frequency counts.
     * @return test result
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; {@code expected} has entries that are not strictly
     * positive; {@code observed} has negative entries; all the observations are zero; or
     * the adjusted degrees of freedom are not strictly positive
     * @see #withDegreesOfFreedomAdjustment(int)
     * @see #statistic(double[], long[])
     */
    public SignificanceResult test(double[] expected, long[] observed) {
        final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
        final double g = statistic(expected, observed);
        final double p = computeP(g, df);
        return new BaseSignificanceResult(g, p);
    }

    /**
     * Perform a G-test of independence based on the input
     * {@code counts} array, viewed as a two-way table.
     *
     * @param counts 2-way table.
     * @return test result
     * @throws IllegalArgumentException if the number of rows or columns is less
     * than 2; the array is non-rectangular; the array has negative entries; or the
     * sum of a row or column is zero.
     * @see #statistic(long[][])
     */
    public SignificanceResult test(long[][] counts) {
        final double g = statistic(counts);
        final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
        final double p = computeP(g, df);
        return new BaseSignificanceResult(g, p);
    }

    /**
     * Compute the G-test p-value.
     *
     * @param g G-test statistic.
     * @param degreesOfFreedom Degrees of freedom.
     * @return p-value
     */
    private static double computeP(double g, double degreesOfFreedom) {
        return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(g);
    }

    /**
     * Check the array value is non-zero.
     *
     * @param value Value
     * @param name Name of the array
     * @param index Index in the array
     * @throws IllegalArgumentException if the value is zero
     */
    private static void checkNonZero(double value, String name, int index) {
        if (value == 0) {
            throw new InferenceException(InferenceException.ZERO_AT, name, index);
        }
    }
}
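
// Illustrative usage (comment-only sketch, hypothetical data): performing the
// significance tests. This assumes SignificanceResult exposes the p-value via
// getPValue(); the p-value is the chi-squared survival probability of the
// statistic at the degrees of freedom described in the method javadoc above.
//
//   SignificanceResult r = GTest.withDefaults().test(new long[][] {{43, 17}, {52, 24}});
//   double p = r.getPValue();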