001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.statistics.inference;
018
019import java.util.Collection;
020import java.util.Iterator;
021import org.apache.commons.numbers.core.Sum;
022import org.apache.commons.statistics.distribution.FDistribution;
023
024/**
025 * Implements one-way ANOVA (analysis of variance) statistics.
026 *
027 * <p>Tests for differences between two or more categories of univariate data
028 * (for example, the body mass index of accountants, lawyers, doctors and
029 * computer programmers). When two categories are given, this is equivalent to
030 * the {@link TTest}.
031 *
032 * <p>This implementation computes the F statistic using the definitional formula:
033 *
034 * <p>\[ F = \frac{\text{between-group variability}}{\text{within-group variability}} \]
035 *
036 * @see <a href="https://en.wikipedia.org/wiki/Analysis_of_variance">Analysis of variance (Wikipedia)</a>
037 * @see <a href="https://en.wikipedia.org/wiki/F-test#Multiple-comparison_ANOVA_problems">
038 * Multiple-comparison ANOVA problems (Wikipedia)</a>
039 * @see <a href="https://www.biostathandbook.com/onewayanova.html">
040 * McDonald, J.H. 2014. Handbook of Biological Statistics (3rd ed.). Sparky House Publishing, Baltimore, Maryland.
041 * One-way anova. pp 145-156.</a>
042 * @since 1.1
043 */
044public final class OneWayAnova {
045    /** Default instance. */
046    private static final OneWayAnova DEFAULT = new OneWayAnova();
047
048    /**
049     * Result for the one-way ANOVA.
050     *
051     * <p>This class is immutable.
052     *
053     * @since 1.1
054     */
055    public static final class Result extends BaseSignificanceResult {
056        /** Degrees of freedom in numerator (between groups). */
057        private final int dfbg;
058        /** Degrees of freedom in denominator (within groups). */
059        private final long dfwg;
060        /** Mean square between groups. */
061        private final double msbg;
062        /** Mean square within groups. */
063        private final double mswg;
064        /** nO value used to partition the variance. */
065        private final double nO;
066
067        /**
068         * @param dfbg Degrees of freedom in numerator (between groups).
069         * @param dfwg Degrees of freedom in denominator (within groups).
070         * @param msbg Mean square between groups.
071         * @param mswg Mean square within groups.
072         * @param nO Factor for partitioning the variance.
073         * @param f F statistic
074         * @param p P-value.
075         */
076        Result(int dfbg, long dfwg, double msbg, double mswg, double nO, double f, double p) {
077            super(f, p);
078            this.dfbg = dfbg;
079            this.dfwg = dfwg;
080            this.msbg = msbg;
081            this.mswg = mswg;
082            this.nO = nO;
083        }
084
085        /**
086         * Gets the degrees of freedom in the numerator (between groups).
087         *
088         * @return degrees of freedom between groups
089         */
090        int getDFBG() {
091            return dfbg;
092        }
093
094        /**
095         * Gets the degrees of freedom in the denominator (within groups).
096         *
097         * @return degrees of freedom within groups
098         */
099        long getDFWG() {
100            return dfwg;
101        }
102
103        /**
104         * Gets the mean square between groups.
105         *
106         * @return mean square between groups
107         */
108        public double getMSBG() {
109            return msbg;
110        }
111
112        /**
113         * Gets the mean square within groups.
114         *
115         * @return mean square within groups
116         */
117        public double getMSWG() {
118            return mswg;
119        }
120
121        /**
122         * Gets the variance component between groups.
123         *
124         * <p>The value is a partitioning of the variance.
125         * It is the complement of {@link #getVCWG()}.
126         *
127         * <p>Partitioning the variance applies only to a model II
128         * (random effects) one-way anova. This applies when the
129         * groups are random samples from a larger set of groups;
130         * partitioning the variance allows comparison of the
131         * variation between groups to the variation within groups.
132         *
133         * <p>If the {@linkplain #getMSBG() MSBG} is less than the
134         * {@linkplain #getMSWG() MSWG} this returns 0. Otherwise this
135         * creates an estimate of the added variance component
136         * between groups as:
137         *
138         * <p>\[ \text{between-group variance} = A = (\text{MS}_{\text{bg}} - \text{MS}_{\text{wg}}) / n_o \]
139         *
140         * <p>where \( n_o \) is a number close to, but usually less than,
141         * the arithmetic mean of the sample size \(n_i\) of each
142         * of the \( a \) groups:
143         *
144         * <p>\[ n_o = \frac{1}{a-1} \left( \sum_i{n_i} - \frac{\sum_i{n_i^2}}{\sum_i{n_i}} \right) \]
145         *
146         * <p>The added variance component among groups \( A \) is expressed
147         * as a fraction of the total variance components \( A + B \) where
148         * \( B \) is the {@linkplain #getMSWG() MSWG}.
149         *
150         * @return variance component between groups (in [0, 1]).
151         */
152        public double getVCBG() {
153            if (msbg <= mswg) {
154                return 0;
155            }
156            // a is an estimate of the between-group variance
157            final double a = (msbg - mswg) / nO;
158            final double b = mswg;
159            return a / (a + b);
160        }
161
162        /**
163         * Gets the variance component within groups.
164         *
165         * <p>The value is a partitioning of the variance.
166         * It is the complement of {@link #getVCBG()}. See
167         * that method for details.
168         *
169         * @return variance component within groups (in [0, 1]).
170         */
171        public double getVCWG() {
172            if (msbg <= mswg) {
173                return 1;
174            }
175            final double a = (msbg - mswg) / nO;
176            final double b = mswg;
177            return b / (a + b);
178        }
179    }
180
181    /** Private constructor. */
182    private OneWayAnova() {
183        // Do nothing
184    }
185
186    /**
187     * Return an instance using the default options.
188     *
189     * @return default instance
190     */
191    public static OneWayAnova withDefaults() {
192        return DEFAULT;
193    }
194
195    /**
196     * Computes the F statistic for an ANOVA test for a collection of category data,
197     * evaluating the null hypothesis that there is no difference among the means of
198     * the data categories.
199     *
200     * <p>Special cases:
201     * <ul>
202     * <li>If the value in each category is the same (no variance within groups) but different
203     * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity}.
204     * <li>If the value in every group is the same (no variance within or between groups),
205     * the f-value is {@link Double#NaN NaN}.
206     * </ul>
207     *
208     * @param data Category summary data.
209     * @return F statistic
210     * @throws IllegalArgumentException if the number of categories is less than
211     * two; a contained category does not have at least one value; or all
212     * categories have only one value (zero degrees of freedom within groups)
213     */
214    public double statistic(Collection<double[]> data) {
215        final double[] f = new double[1];
216        aov(data, f);
217        return f[0];
218    }
219
220    /**
221     * Performs an ANOVA test for a collection of category data,
222     * evaluating the null hypothesis that there is no difference among the means of
223     * the data categories.
224     *
225     * <p>Special cases:
226     * <ul>
227     * <li>If the value in each category is the same (no variance within groups) but different
228     * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity} and the p-value is zero.
229     * <li>If the value in every group is the same (no variance within or between groups),
230     * the f-value and p-value are {@link Double#NaN NaN}.
231     * </ul>
232     *
233     * @param data Category summary data.
234     * @return test result
235     * @throws IllegalArgumentException if the number of categories is less than
236     * two; a contained category does not have at least one value; or all
237     * categories have only one value (zero degrees of freedom within groups)
238     */
239    public Result test(Collection<double[]> data) {
240        return aov(data, null);
241    }
242
243    /**
244     * Performs an ANOVA test for a collection of category data, evaluating the null
245     * hypothesis that there is no difference among the means of the data categories.
246     *
247     * <p>This is a utility method to allow computation of the F statistic without
248     * the p-value or partitioning of the variance. If the {@code statistic} is not null
249     * the method will record the F statistic in the array and return null.
250     *
251     * @param data Category summary data.
252     * @param statistic Result for the F statistic (or null).
253     * @return test result (or null)
254     * @throws IllegalArgumentException if the number of categories is less than two; a
255     * contained category does not have at least one value; or all categories have only
256     * one value (zero degrees of freedom within groups)
257     */
258    private static Result aov(Collection<double[]> data, double[] statistic) {
259        Arguments.checkCategoriesRequiredSize(data.size(), 2);
260        long n = 0;
261        for (final double[] array : data) {
262            n += array.length;
263            Arguments.checkValuesRequiredSize(array.length, 1);
264        }
265        final long dfwg = n - data.size();
266        if (dfwg == 0) {
267            throw new InferenceException(InferenceException.ZERO, "Degrees of freedom within groups");
268        }
269
270        // wg = within group
271        // bg = between group
272
273        // F = Var(bg) / Var(wg)
274        // Var = SS / df
275        // SStotal = sum((x - u)^2) = sum(x^2) - sum(x)^2/n
276        //         = SSwg + SSbg
277        // Some cancellation of terms reduces the computation to 3 sums:
278        // SSwg = [ sum(x^2) - sum(x)^2/n ] - [ sum_g { sum(sum(x^2) - sum(x)^2/n) } ]
279        // SSbg = SStotal - SSwg
280        //      = sum_g { sum(x)^2/n) } - sum(x)^2/n
281        // SSwg = SStotal - SSbg
282        //      = sum(x^2) - sum_g { sum(x)^2/n) }
283
284        // Stabilize the computation by shifting all to a common mean of zero.
285        // This minimise the magnitude of x^2 terms.
286        // The terms sum(x)^2/n -> 0. Included them to capture the round-off.
287        final double m = StatisticUtils.mean(data);
288        final Sum sxx = Sum.create();
289        final Sum sx = Sum.create();
290        final Sum sg = Sum.create();
291        // Track if each group had the same value
292        boolean eachSame = true;
293        for (final double[] array : data) {
294            eachSame = eachSame && allMatch(array[0], array);
295            final Sum s = Sum.create();
296            for (final double v : array) {
297                final double x = v - m;
298                s.add(x);
299                // sum(x)
300                sx.add(x);
301                // sum(x^2)
302                sxx.add(x * x);
303            }
304            // Create the negative sum so we can subtract it via 'add'
305            // -sum_g { sum(x)^2/n) }
306            sg.add(-pow2(s.getAsDouble()) / array.length);
307        }
308
309        // Note: SS terms should not be negative given:
310        // SS = sum((x - u)^2)
311        // This can happen due to floating-point error in sum(x^2) - sum(x)^2/n
312        final double sswg = Math.max(0, sxx.add(sg).getAsDouble());
313        // Flip the sign back
314        final double ssbg = Math.max(0, -sg.add(pow2(sx.getAsDouble()) / n).getAsDouble());
315        final int dfbg = data.size() - 1;
316        // Handle edge-cases:
317        // Note: 0 / 0 -> NaN : x / 0 -> inf
318        // These are documented results and should output p=NaN or 0.
319        // This result will occur naturally.
320        // However the SS totals may not be 0.0 so we correct these cases.
321        final boolean allSame = eachSame && allMatch(data);
322        final double msbg = allSame ? 0 : ssbg / dfbg;
323        final double mswg = eachSame ? 0 : sswg / dfwg;
324        final double f = msbg / mswg;
325        if (statistic != null) {
326            statistic[0] = f;
327            return null;
328        }
329        final double p = FDistribution.of(dfbg, dfwg).survivalProbability(f);
330
331        // Support partitioning the variance
332        // ni = size of each of the groups
333        // nO=(1/(a−1))*(sum(ni)−(sum(ni^2)/sum(ni))
334        final double nO = (n - data.stream()
335                .mapToDouble(x -> pow2(x.length)).sum() / n) / dfbg;
336
337        return new Result(dfbg, dfwg, msbg, mswg, nO, f, p);
338    }
339
340    /**
341     * Return true if all values in the array match the specified value.
342     *
343     * @param v Value.
344     * @param a Array.
345     * @return true if all match
346     */
347    private static boolean allMatch(double v, double[] a) {
348        for (final double w : a) {
349            if (v != w) {
350                return false;
351            }
352        }
353        return true;
354    }
355
356    /**
357     * Return true if all values in the arrays match.
358     *
359     * <p>Assumes that there are at least two arrays and that each array has the same
360     * value throughout. Thus only the first element in each array is checked.
361     *
362     * @param data Arrays.
363     * @return true if all match
364     */
365    private static boolean allMatch(Collection<double[]> data) {
366        final Iterator<double[]> iter = data.iterator();
367        final double v = iter.next()[0];
368        while (iter.hasNext()) {
369            if (iter.next()[0] != v) {
370                return false;
371            }
372        }
373        return true;
374    }
375
376    /**
377     * Compute {@code x^2}.
378     *
379     * @param x Value.
380     * @return {@code x^2}
381     */
382    private static double pow2(double x) {
383        return x * x;
384    }
385}