1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.statistics.inference;
18
19 import java.util.Collection;
20 import java.util.Iterator;
21 import org.apache.commons.numbers.core.Sum;
22 import org.apache.commons.statistics.distribution.FDistribution;
23
24 /**
25 * Implements one-way ANOVA (analysis of variance) statistics.
26 *
27 * <p>Tests for differences between two or more categories of univariate data
28 * (for example, the body mass index of accountants, lawyers, doctors and
29 * computer programmers). When two categories are given, this is equivalent to
30 * the {@link TTest}.
31 *
32 * <p>This implementation computes the F statistic using the definitional formula:
33 *
34 * <p>\[ F = \frac{\text{between-group variability}}{\text{within-group variability}} \]
35 *
36 * @see <a href="https://en.wikipedia.org/wiki/Analysis_of_variance">Analysis of variance (Wikipedia)</a>
37 * @see <a href="https://en.wikipedia.org/wiki/F-test#Multiple-comparison_ANOVA_problems">
38 * Multiple-comparison ANOVA problems (Wikipedia)</a>
39 * @see <a href="https://www.biostathandbook.com/onewayanova.html">
40 * McDonald, J.H. 2014. Handbook of Biological Statistics (3rd ed.). Sparky House Publishing, Baltimore, Maryland.
41 * One-way anova. pp 145-156.</a>
42 * @since 1.1
43 */
44 public final class OneWayAnova {
45 /** Default instance. */
46 private static final OneWayAnova DEFAULT = new OneWayAnova();
47
48 /**
49 * Result for the one-way ANOVA.
50 *
51 * <p>This class is immutable.
52 *
53 * @since 1.1
54 */
55 public static final class Result extends BaseSignificanceResult {
56 /** Degrees of freedom in numerator (between groups). */
57 private final int dfbg;
58 /** Degrees of freedom in denominator (within groups). */
59 private final long dfwg;
60 /** Mean square between groups. */
61 private final double msbg;
62 /** Mean square within groups. */
63 private final double mswg;
64 /** nO value used to partition the variance. */
65 private final double nO;
66
67 /**
68 * @param dfbg Degrees of freedom in numerator (between groups).
69 * @param dfwg Degrees of freedom in denominator (within groups).
70 * @param msbg Mean square between groups.
71 * @param mswg Mean square within groups.
72 * @param nO Factor for partitioning the variance.
73 * @param f F statistic
74 * @param p P-value.
75 */
76 Result(int dfbg, long dfwg, double msbg, double mswg, double nO, double f, double p) {
77 super(f, p);
78 this.dfbg = dfbg;
79 this.dfwg = dfwg;
80 this.msbg = msbg;
81 this.mswg = mswg;
82 this.nO = nO;
83 }
84
85 /**
86 * Gets the degrees of freedom in the numerator (between groups).
87 *
88 * @return degrees of freedom between groups
89 */
90 int getDFBG() {
91 return dfbg;
92 }
93
94 /**
95 * Gets the degrees of freedom in the denominator (within groups).
96 *
97 * @return degrees of freedom within groups
98 */
99 long getDFWG() {
100 return dfwg;
101 }
102
103 /**
104 * Gets the mean square between groups.
105 *
106 * @return mean square between groups
107 */
108 public double getMSBG() {
109 return msbg;
110 }
111
112 /**
113 * Gets the mean square within groups.
114 *
115 * @return mean square within groups
116 */
117 public double getMSWG() {
118 return mswg;
119 }
120
121 /**
122 * Gets the variance component between groups.
123 *
124 * <p>The value is a partitioning of the variance.
125 * It is the complement of {@link #getVCWG()}.
126 *
127 * <p>Partitioning the variance applies only to a model II
128 * (random effects) one-way anova. This applies when the
129 * groups are random samples from a larger set of groups;
130 * partitioning the variance allows comparison of the
131 * variation between groups to the variation within groups.
132 *
133 * <p>If the {@linkplain #getMSBG() MSBG} is less than the
134 * {@linkplain #getMSWG() MSWG} this returns 0. Otherwise this
135 * creates an estimate of the added variance component
136 * between groups as:
137 *
138 * <p>\[ \text{between-group variance} = A = (\text{MS}_{\text{bg}} - \text{MS}_{\text{wg}}) / n_o \]
139 *
140 * <p>where \( n_o \) is a number close to, but usually less than,
141 * the arithmetic mean of the sample size \(n_i\) of each
142 * of the \( a \) groups:
143 *
144 * <p>\[ n_o = \frac{1}{a-1} \left( \sum_i{n_i} - \frac{\sum_i{n_i^2}}{\sum_i{n_i}} \right) \]
145 *
146 * <p>The added variance component among groups \( A \) is expressed
147 * as a fraction of the total variance components \( A + B \) where
148 * \( B \) is the {@linkplain #getMSWG() MSWG}.
149 *
150 * @return variance component between groups (in [0, 1]).
151 */
152 public double getVCBG() {
153 if (msbg <= mswg) {
154 return 0;
155 }
156 // a is an estimate of the between-group variance
157 final double a = (msbg - mswg) / nO;
158 final double b = mswg;
159 return a / (a + b);
160 }
161
162 /**
163 * Gets the variance component within groups.
164 *
165 * <p>The value is a partitioning of the variance.
166 * It is the complement of {@link #getVCBG()}. See
167 * that method for details.
168 *
169 * @return variance component within groups (in [0, 1]).
170 */
171 public double getVCWG() {
172 if (msbg <= mswg) {
173 return 1;
174 }
175 final double a = (msbg - mswg) / nO;
176 final double b = mswg;
177 return b / (a + b);
178 }
179 }
180
181 /** Private constructor. */
182 private OneWayAnova() {
183 // Do nothing
184 }
185
186 /**
187 * Return an instance using the default options.
188 *
189 * @return default instance
190 */
191 public static OneWayAnova withDefaults() {
192 return DEFAULT;
193 }
194
195 /**
196 * Computes the F statistic for an ANOVA test for a collection of category data,
197 * evaluating the null hypothesis that there is no difference among the means of
198 * the data categories.
199 *
200 * <p>Special cases:
201 * <ul>
202 * <li>If the value in each category is the same (no variance within groups) but different
203 * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity}.
204 * <li>If the value in every group is the same (no variance within or between groups),
205 * the f-value is {@link Double#NaN NaN}.
206 * </ul>
207 *
208 * @param data Category summary data.
209 * @return F statistic
210 * @throws IllegalArgumentException if the number of categories is less than
211 * two; a contained category does not have at least one value; or all
212 * categories have only one value (zero degrees of freedom within groups)
213 */
214 public double statistic(Collection<double[]> data) {
215 final double[] f = new double[1];
216 aov(data, f);
217 return f[0];
218 }
219
220 /**
221 * Performs an ANOVA test for a collection of category data,
222 * evaluating the null hypothesis that there is no difference among the means of
223 * the data categories.
224 *
225 * <p>Special cases:
226 * <ul>
227 * <li>If the value in each category is the same (no variance within groups) but different
228 * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity} and the p-value is zero.
229 * <li>If the value in every group is the same (no variance within or between groups),
230 * the f-value and p-value are {@link Double#NaN NaN}.
231 * </ul>
232 *
233 * @param data Category summary data.
234 * @return test result
235 * @throws IllegalArgumentException if the number of categories is less than
236 * two; a contained category does not have at least one value; or all
237 * categories have only one value (zero degrees of freedom within groups)
238 */
239 public Result test(Collection<double[]> data) {
240 return aov(data, null);
241 }
242
243 /**
244 * Performs an ANOVA test for a collection of category data, evaluating the null
245 * hypothesis that there is no difference among the means of the data categories.
246 *
247 * <p>This is a utility method to allow computation of the F statistic without
248 * the p-value or partitioning of the variance. If the {@code statistic} is not null
249 * the method will record the F statistic in the array and return null.
250 *
251 * @param data Category summary data.
252 * @param statistic Result for the F statistic (or null).
253 * @return test result (or null)
254 * @throws IllegalArgumentException if the number of categories is less than two; a
255 * contained category does not have at least one value; or all categories have only
256 * one value (zero degrees of freedom within groups)
257 */
258 private static Result aov(Collection<double[]> data, double[] statistic) {
259 Arguments.checkCategoriesRequiredSize(data.size(), 2);
260 long n = 0;
261 for (final double[] array : data) {
262 n += array.length;
263 Arguments.checkValuesRequiredSize(array.length, 1);
264 }
265 final long dfwg = n - data.size();
266 if (dfwg == 0) {
267 throw new InferenceException(InferenceException.ZERO, "Degrees of freedom within groups");
268 }
269
270 // wg = within group
271 // bg = between group
272
273 // F = Var(bg) / Var(wg)
274 // Var = SS / df
275 // SStotal = sum((x - u)^2) = sum(x^2) - sum(x)^2/n
276 // = SSwg + SSbg
277 // Some cancellation of terms reduces the computation to 3 sums:
278 // SSwg = [ sum(x^2) - sum(x)^2/n ] - [ sum_g { sum(sum(x^2) - sum(x)^2/n) } ]
279 // SSbg = SStotal - SSwg
280 // = sum_g { sum(x)^2/n) } - sum(x)^2/n
281 // SSwg = SStotal - SSbg
282 // = sum(x^2) - sum_g { sum(x)^2/n) }
283
284 // Stabilize the computation by shifting all to a common mean of zero.
285 // This minimise the magnitude of x^2 terms.
286 // The terms sum(x)^2/n -> 0. Included them to capture the round-off.
287 final double m = StatisticUtils.mean(data);
288 final Sum sxx = Sum.create();
289 final Sum sx = Sum.create();
290 final Sum sg = Sum.create();
291 // Track if each group had the same value
292 boolean eachSame = true;
293 for (final double[] array : data) {
294 eachSame = eachSame && allMatch(array[0], array);
295 final Sum s = Sum.create();
296 for (final double v : array) {
297 final double x = v - m;
298 s.add(x);
299 // sum(x)
300 sx.add(x);
301 // sum(x^2)
302 sxx.add(x * x);
303 }
304 // Create the negative sum so we can subtract it via 'add'
305 // -sum_g { sum(x)^2/n) }
306 sg.add(-pow2(s.getAsDouble()) / array.length);
307 }
308
309 // Note: SS terms should not be negative given:
310 // SS = sum((x - u)^2)
311 // This can happen due to floating-point error in sum(x^2) - sum(x)^2/n
312 final double sswg = Math.max(0, sxx.add(sg).getAsDouble());
313 // Flip the sign back
314 final double ssbg = Math.max(0, -sg.add(pow2(sx.getAsDouble()) / n).getAsDouble());
315 final int dfbg = data.size() - 1;
316 // Handle edge-cases:
317 // Note: 0 / 0 -> NaN : x / 0 -> inf
318 // These are documented results and should output p=NaN or 0.
319 // This result will occur naturally.
320 // However the SS totals may not be 0.0 so we correct these cases.
321 final boolean allSame = eachSame && allMatch(data);
322 final double msbg = allSame ? 0 : ssbg / dfbg;
323 final double mswg = eachSame ? 0 : sswg / dfwg;
324 final double f = msbg / mswg;
325 if (statistic != null) {
326 statistic[0] = f;
327 return null;
328 }
329 final double p = FDistribution.of(dfbg, dfwg).survivalProbability(f);
330
331 // Support partitioning the variance
332 // ni = size of each of the groups
333 // nO=(1/(a−1))*(sum(ni)−(sum(ni^2)/sum(ni))
334 final double nO = (n - data.stream()
335 .mapToDouble(x -> pow2(x.length)).sum() / n) / dfbg;
336
337 return new Result(dfbg, dfwg, msbg, mswg, nO, f, p);
338 }
339
340 /**
341 * Return true if all values in the array match the specified value.
342 *
343 * @param v Value.
344 * @param a Array.
345 * @return true if all match
346 */
347 private static boolean allMatch(double v, double[] a) {
348 for (final double w : a) {
349 if (v != w) {
350 return false;
351 }
352 }
353 return true;
354 }
355
356 /**
357 * Return true if all values in the arrays match.
358 *
359 * <p>Assumes that there are at least two arrays and that each array has the same
360 * value throughout. Thus only the first element in each array is checked.
361 *
362 * @param data Arrays.
363 * @return true if all match
364 */
365 private static boolean allMatch(Collection<double[]> data) {
366 final Iterator<double[]> iter = data.iterator();
367 final double v = iter.next()[0];
368 while (iter.hasNext()) {
369 if (iter.next()[0] != v) {
370 return false;
371 }
372 }
373 return true;
374 }
375
376 /**
377 * Compute {@code x^2}.
378 *
379 * @param x Value.
380 * @return {@code x^2}
381 */
382 private static double pow2(double x) {
383 return x * x;
384 }
385 }