View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.statistics.inference;
18  
19  import java.util.Collection;
20  import java.util.Iterator;
21  import org.apache.commons.numbers.core.Sum;
22  import org.apache.commons.statistics.distribution.FDistribution;
23  
24  /**
25   * Implements one-way ANOVA (analysis of variance) statistics.
26   *
27   * <p>Tests for differences between two or more categories of univariate data
28   * (for example, the body mass index of accountants, lawyers, doctors and
29   * computer programmers). When two categories are given, this is equivalent to
30   * the {@link TTest}.
31   *
32   * <p>This implementation computes the F statistic using the definitional formula:
33   *
34   * <p>\[ F = \frac{\text{between-group variability}}{\text{within-group variability}} \]
35   *
36   * @see <a href="https://en.wikipedia.org/wiki/Analysis_of_variance">Analysis of variance (Wikipedia)</a>
37   * @see <a href="https://en.wikipedia.org/wiki/F-test#Multiple-comparison_ANOVA_problems">
38   * Multiple-comparison ANOVA problems (Wikipedia)</a>
39   * @see <a href="https://www.biostathandbook.com/onewayanova.html">
40   * McDonald, J.H. 2014. Handbook of Biological Statistics (3rd ed.). Sparky House Publishing, Baltimore, Maryland.
41   * One-way anova. pp 145-156.</a>
42   * @since 1.1
43   */
44  public final class OneWayAnova {
45      /** Default instance. */
46      private static final OneWayAnova DEFAULT = new OneWayAnova();
47  
48      /**
49       * Result for the one-way ANOVA.
50       *
51       * <p>This class is immutable.
52       *
53       * @since 1.1
54       */
55      public static final class Result extends BaseSignificanceResult {
56          /** Degrees of freedom in numerator (between groups). */
57          private final int dfbg;
58          /** Degrees of freedom in denominator (within groups). */
59          private final long dfwg;
60          /** Mean square between groups. */
61          private final double msbg;
62          /** Mean square within groups. */
63          private final double mswg;
64          /** nO value used to partition the variance. */
65          private final double nO;
66  
67          /**
68           * @param dfbg Degrees of freedom in numerator (between groups).
69           * @param dfwg Degrees of freedom in denominator (within groups).
70           * @param msbg Mean square between groups.
71           * @param mswg Mean square within groups.
72           * @param nO Factor for partitioning the variance.
73           * @param f F statistic
74           * @param p P-value.
75           */
76          Result(int dfbg, long dfwg, double msbg, double mswg, double nO, double f, double p) {
77              super(f, p);
78              this.dfbg = dfbg;
79              this.dfwg = dfwg;
80              this.msbg = msbg;
81              this.mswg = mswg;
82              this.nO = nO;
83          }
84  
85          /**
86           * Gets the degrees of freedom in the numerator (between groups).
87           *
88           * @return degrees of freedom between groups
89           */
90          int getDFBG() {
91              return dfbg;
92          }
93  
94          /**
95           * Gets the degrees of freedom in the denominator (within groups).
96           *
97           * @return degrees of freedom within groups
98           */
99          long getDFWG() {
100             return dfwg;
101         }
102 
103         /**
104          * Gets the mean square between groups.
105          *
106          * @return mean square between groups
107          */
108         public double getMSBG() {
109             return msbg;
110         }
111 
112         /**
113          * Gets the mean square within groups.
114          *
115          * @return mean square within groups
116          */
117         public double getMSWG() {
118             return mswg;
119         }
120 
121         /**
122          * Gets the variance component between groups.
123          *
124          * <p>The value is a partitioning of the variance.
125          * It is the complement of {@link #getVCWG()}.
126          *
127          * <p>Partitioning the variance applies only to a model II
128          * (random effects) one-way anova. This applies when the
129          * groups are random samples from a larger set of groups;
130          * partitioning the variance allows comparison of the
131          * variation between groups to the variation within groups.
132          *
133          * <p>If the {@linkplain #getMSBG() MSBG} is less than the
134          * {@linkplain #getMSWG() MSWG} this returns 0. Otherwise this
135          * creates an estimate of the added variance component
136          * between groups as:
137          *
138          * <p>\[ \text{between-group variance} = A = (\text{MS}_{\text{bg}} - \text{MS}_{\text{wg}}) / n_o \]
139          *
140          * <p>where \( n_o \) is a number close to, but usually less than,
141          * the arithmetic mean of the sample size \(n_i\) of each
142          * of the \( a \) groups:
143          *
144          * <p>\[ n_o = \frac{1}{a-1} \left( \sum_i{n_i} - \frac{\sum_i{n_i^2}}{\sum_i{n_i}} \right) \]
145          *
146          * <p>The added variance component among groups \( A \) is expressed
147          * as a fraction of the total variance components \( A + B \) where
148          * \( B \) is the {@linkplain #getMSWG() MSWG}.
149          *
150          * @return variance component between groups (in [0, 1]).
151          */
152         public double getVCBG() {
153             if (msbg <= mswg) {
154                 return 0;
155             }
156             // a is an estimate of the between-group variance
157             final double a = (msbg - mswg) / nO;
158             final double b = mswg;
159             return a / (a + b);
160         }
161 
162         /**
163          * Gets the variance component within groups.
164          *
165          * <p>The value is a partitioning of the variance.
166          * It is the complement of {@link #getVCBG()}. See
167          * that method for details.
168          *
169          * @return variance component within groups (in [0, 1]).
170          */
171         public double getVCWG() {
172             if (msbg <= mswg) {
173                 return 1;
174             }
175             final double a = (msbg - mswg) / nO;
176             final double b = mswg;
177             return b / (a + b);
178         }
179     }
180 
181     /** Private constructor. */
182     private OneWayAnova() {
183         // Do nothing
184     }
185 
186     /**
187      * Return an instance using the default options.
188      *
189      * @return default instance
190      */
191     public static OneWayAnova withDefaults() {
192         return DEFAULT;
193     }
194 
195     /**
196      * Computes the F statistic for an ANOVA test for a collection of category data,
197      * evaluating the null hypothesis that there is no difference among the means of
198      * the data categories.
199      *
200      * <p>Special cases:
201      * <ul>
202      * <li>If the value in each category is the same (no variance within groups) but different
203      * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity}.
204      * <li>If the value in every group is the same (no variance within or between groups),
205      * the f-value is {@link Double#NaN NaN}.
206      * </ul>
207      *
208      * @param data Category summary data.
209      * @return F statistic
210      * @throws IllegalArgumentException if the number of categories is less than
211      * two; a contained category does not have at least one value; or all
212      * categories have only one value (zero degrees of freedom within groups)
213      */
214     public double statistic(Collection<double[]> data) {
215         final double[] f = new double[1];
216         aov(data, f);
217         return f[0];
218     }
219 
220     /**
221      * Performs an ANOVA test for a collection of category data,
222      * evaluating the null hypothesis that there is no difference among the means of
223      * the data categories.
224      *
225      * <p>Special cases:
226      * <ul>
227      * <li>If the value in each category is the same (no variance within groups) but different
228      * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity} and the p-value is zero.
229      * <li>If the value in every group is the same (no variance within or between groups),
230      * the f-value and p-value are {@link Double#NaN NaN}.
231      * </ul>
232      *
233      * @param data Category summary data.
234      * @return test result
235      * @throws IllegalArgumentException if the number of categories is less than
236      * two; a contained category does not have at least one value; or all
237      * categories have only one value (zero degrees of freedom within groups)
238      */
239     public Result test(Collection<double[]> data) {
240         return aov(data, null);
241     }
242 
243     /**
244      * Performs an ANOVA test for a collection of category data, evaluating the null
245      * hypothesis that there is no difference among the means of the data categories.
246      *
247      * <p>This is a utility method to allow computation of the F statistic without
248      * the p-value or partitioning of the variance. If the {@code statistic} is not null
249      * the method will record the F statistic in the array and return null.
250      *
251      * @param data Category summary data.
252      * @param statistic Result for the F statistic (or null).
253      * @return test result (or null)
254      * @throws IllegalArgumentException if the number of categories is less than two; a
255      * contained category does not have at least one value; or all categories have only
256      * one value (zero degrees of freedom within groups)
257      */
258     private static Result aov(Collection<double[]> data, double[] statistic) {
259         Arguments.checkCategoriesRequiredSize(data.size(), 2);
260         long n = 0;
261         for (final double[] array : data) {
262             n += array.length;
263             Arguments.checkValuesRequiredSize(array.length, 1);
264         }
265         final long dfwg = n - data.size();
266         if (dfwg == 0) {
267             throw new InferenceException(InferenceException.ZERO, "Degrees of freedom within groups");
268         }
269 
270         // wg = within group
271         // bg = between group
272 
273         // F = Var(bg) / Var(wg)
274         // Var = SS / df
275         // SStotal = sum((x - u)^2) = sum(x^2) - sum(x)^2/n
276         //         = SSwg + SSbg
277         // Some cancellation of terms reduces the computation to 3 sums:
278         // SSwg = [ sum(x^2) - sum(x)^2/n ] - [ sum_g { sum(sum(x^2) - sum(x)^2/n) } ]
279         // SSbg = SStotal - SSwg
280         //      = sum_g { sum(x)^2/n) } - sum(x)^2/n
281         // SSwg = SStotal - SSbg
282         //      = sum(x^2) - sum_g { sum(x)^2/n) }
283 
284         // Stabilize the computation by shifting all to a common mean of zero.
285         // This minimise the magnitude of x^2 terms.
286         // The terms sum(x)^2/n -> 0. Included them to capture the round-off.
287         final double m = StatisticUtils.mean(data);
288         final Sum sxx = Sum.create();
289         final Sum sx = Sum.create();
290         final Sum sg = Sum.create();
291         // Track if each group had the same value
292         boolean eachSame = true;
293         for (final double[] array : data) {
294             eachSame = eachSame && allMatch(array[0], array);
295             final Sum s = Sum.create();
296             for (final double v : array) {
297                 final double x = v - m;
298                 s.add(x);
299                 // sum(x)
300                 sx.add(x);
301                 // sum(x^2)
302                 sxx.add(x * x);
303             }
304             // Create the negative sum so we can subtract it via 'add'
305             // -sum_g { sum(x)^2/n) }
306             sg.add(-pow2(s.getAsDouble()) / array.length);
307         }
308 
309         // Note: SS terms should not be negative given:
310         // SS = sum((x - u)^2)
311         // This can happen due to floating-point error in sum(x^2) - sum(x)^2/n
312         final double sswg = Math.max(0, sxx.add(sg).getAsDouble());
313         // Flip the sign back
314         final double ssbg = Math.max(0, -sg.add(pow2(sx.getAsDouble()) / n).getAsDouble());
315         final int dfbg = data.size() - 1;
316         // Handle edge-cases:
317         // Note: 0 / 0 -> NaN : x / 0 -> inf
318         // These are documented results and should output p=NaN or 0.
319         // This result will occur naturally.
320         // However the SS totals may not be 0.0 so we correct these cases.
321         final boolean allSame = eachSame && allMatch(data);
322         final double msbg = allSame ? 0 : ssbg / dfbg;
323         final double mswg = eachSame ? 0 : sswg / dfwg;
324         final double f = msbg / mswg;
325         if (statistic != null) {
326             statistic[0] = f;
327             return null;
328         }
329         final double p = FDistribution.of(dfbg, dfwg).survivalProbability(f);
330 
331         // Support partitioning the variance
332         // ni = size of each of the groups
333         // nO=(1/(a−1))*(sum(ni)−(sum(ni^2)/sum(ni))
334         final double nO = (n - data.stream()
335                 .mapToDouble(x -> pow2(x.length)).sum() / n) / dfbg;
336 
337         return new Result(dfbg, dfwg, msbg, mswg, nO, f, p);
338     }
339 
340     /**
341      * Return true if all values in the array match the specified value.
342      *
343      * @param v Value.
344      * @param a Array.
345      * @return true if all match
346      */
347     private static boolean allMatch(double v, double[] a) {
348         for (final double w : a) {
349             if (v != w) {
350                 return false;
351             }
352         }
353         return true;
354     }
355 
356     /**
357      * Return true if all values in the arrays match.
358      *
359      * <p>Assumes that there are at least two arrays and that each array has the same
360      * value throughout. Thus only the first element in each array is checked.
361      *
362      * @param data Arrays.
363      * @return true if all match
364      */
365     private static boolean allMatch(Collection<double[]> data) {
366         final Iterator<double[]> iter = data.iterator();
367         final double v = iter.next()[0];
368         while (iter.hasNext()) {
369             if (iter.next()[0] != v) {
370                 return false;
371             }
372         }
373         return true;
374     }
375 
376     /**
377      * Compute {@code x^2}.
378      *
379      * @param x Value.
380      * @return {@code x^2}
381      */
382     private static double pow2(double x) {
383         return x * x;
384     }
385 }