001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.inference; 018 019import java.util.Collection; 020import java.util.Iterator; 021import org.apache.commons.numbers.core.Sum; 022import org.apache.commons.statistics.distribution.FDistribution; 023 024/** 025 * Implements one-way ANOVA (analysis of variance) statistics. 026 * 027 * <p>Tests for differences between two or more categories of univariate data 028 * (for example, the body mass index of accountants, lawyers, doctors and 029 * computer programmers). When two categories are given, this is equivalent to 030 * the {@link TTest}. 031 * 032 * <p>This implementation computes the F statistic using the definitional formula: 033 * 034 * <p>\[ F = \frac{\text{between-group variability}}{\text{within-group variability}} \] 035 * 036 * @see <a href="https://en.wikipedia.org/wiki/Analysis_of_variance">Analysis of variance (Wikipedia)</a> 037 * @see <a href="https://en.wikipedia.org/wiki/F-test#Multiple-comparison_ANOVA_problems"> 038 * Multiple-comparison ANOVA problems (Wikipedia)</a> 039 * @see <a href="https://www.biostathandbook.com/onewayanova.html"> 040 * McDonald, J.H. 2014. Handbook of Biological Statistics (3rd ed.). Sparky House Publishing, Baltimore, Maryland. 041 * One-way anova. pp 145-156.</a> 042 * @since 1.1 043 */ 044public final class OneWayAnova { 045 /** Default instance. */ 046 private static final OneWayAnova DEFAULT = new OneWayAnova(); 047 048 /** 049 * Result for the one-way ANOVA. 050 * 051 * <p>This class is immutable. 052 * 053 * @since 1.1 054 */ 055 public static final class Result extends BaseSignificanceResult { 056 /** Degrees of freedom in numerator (between groups). */ 057 private final int dfbg; 058 /** Degrees of freedom in denominator (within groups). */ 059 private final long dfwg; 060 /** Mean square between groups. */ 061 private final double msbg; 062 /** Mean square within groups. */ 063 private final double mswg; 064 /** nO value used to partition the variance. */ 065 private final double nO; 066 067 /** 068 * @param dfbg Degrees of freedom in numerator (between groups). 069 * @param dfwg Degrees of freedom in denominator (within groups). 070 * @param msbg Mean square between groups. 071 * @param mswg Mean square within groups. 072 * @param nO Factor for partitioning the variance. 073 * @param f F statistic 074 * @param p P-value. 075 */ 076 Result(int dfbg, long dfwg, double msbg, double mswg, double nO, double f, double p) { 077 super(f, p); 078 this.dfbg = dfbg; 079 this.dfwg = dfwg; 080 this.msbg = msbg; 081 this.mswg = mswg; 082 this.nO = nO; 083 } 084 085 /** 086 * Gets the degrees of freedom in the numerator (between groups). 087 * 088 * @return degrees of freedom between groups 089 */ 090 int getDFBG() { 091 return dfbg; 092 } 093 094 /** 095 * Gets the degrees of freedom in the denominator (within groups). 096 * 097 * @return degrees of freedom within groups 098 */ 099 long getDFWG() { 100 return dfwg; 101 } 102 103 /** 104 * Gets the mean square between groups. 105 * 106 * @return mean square between groups 107 */ 108 public double getMSBG() { 109 return msbg; 110 } 111 112 /** 113 * Gets the mean square within groups. 114 * 115 * @return mean square within groups 116 */ 117 public double getMSWG() { 118 return mswg; 119 } 120 121 /** 122 * Gets the variance component between groups. 123 * 124 * <p>The value is a partitioning of the variance. 125 * It is the complement of {@link #getVCWG()}. 126 * 127 * <p>Partitioning the variance applies only to a model II 128 * (random effects) one-way anova. This applies when the 129 * groups are random samples from a larger set of groups; 130 * partitioning the variance allows comparison of the 131 * variation between groups to the variation within groups. 132 * 133 * <p>If the {@linkplain #getMSBG() MSBG} is less than the 134 * {@linkplain #getMSWG() MSWG} this returns 0. Otherwise this 135 * creates an estimate of the added variance component 136 * between groups as: 137 * 138 * <p>\[ \text{between-group variance} = A = (\text{MS}_{\text{bg}} - \text{MS}_{\text{wg}}) / n_o \] 139 * 140 * <p>where \( n_o \) is a number close to, but usually less than, 141 * the arithmetic mean of the sample size \(n_i\) of each 142 * of the \( a \) groups: 143 * 144 * <p>\[ n_o = \frac{1}{a-1} \left( \sum_i{n_i} - \frac{\sum_i{n_i^2}}{\sum_i{n_i}} \right) \] 145 * 146 * <p>The added variance component among groups \( A \) is expressed 147 * as a fraction of the total variance components \( A + B \) where 148 * \( B \) is the {@linkplain #getMSWG() MSWG}. 149 * 150 * @return variance component between groups (in [0, 1]). 151 */ 152 public double getVCBG() { 153 if (msbg <= mswg) { 154 return 0; 155 } 156 // a is an estimate of the between-group variance 157 final double a = (msbg - mswg) / nO; 158 final double b = mswg; 159 return a / (a + b); 160 } 161 162 /** 163 * Gets the variance component within groups. 164 * 165 * <p>The value is a partitioning of the variance. 166 * It is the complement of {@link #getVCBG()}. See 167 * that method for details. 168 * 169 * @return variance component within groups (in [0, 1]). 170 */ 171 public double getVCWG() { 172 if (msbg <= mswg) { 173 return 1; 174 } 175 final double a = (msbg - mswg) / nO; 176 final double b = mswg; 177 return b / (a + b); 178 } 179 } 180 181 /** Private constructor. */ 182 private OneWayAnova() { 183 // Do nothing 184 } 185 186 /** 187 * Return an instance using the default options. 188 * 189 * @return default instance 190 */ 191 public static OneWayAnova withDefaults() { 192 return DEFAULT; 193 } 194 195 /** 196 * Computes the F statistic for an ANOVA test for a collection of category data, 197 * evaluating the null hypothesis that there is no difference among the means of 198 * the data categories. 199 * 200 * <p>Special cases: 201 * <ul> 202 * <li>If the value in each category is the same (no variance within groups) but different 203 * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity}. 204 * <li>If the value in every group is the same (no variance within or between groups), 205 * the f-value is {@link Double#NaN NaN}. 206 * </ul> 207 * 208 * @param data Category summary data. 209 * @return F statistic 210 * @throws IllegalArgumentException if the number of categories is less than 211 * two; a contained category does not have at least one value; or all 212 * categories have only one value (zero degrees of freedom within groups) 213 */ 214 public double statistic(Collection<double[]> data) { 215 final double[] f = new double[1]; 216 aov(data, f); 217 return f[0]; 218 } 219 220 /** 221 * Performs an ANOVA test for a collection of category data, 222 * evaluating the null hypothesis that there is no difference among the means of 223 * the data categories. 224 * 225 * <p>Special cases: 226 * <ul> 227 * <li>If the value in each category is the same (no variance within groups) but different 228 * between groups, the f-value is {@linkplain Double#POSITIVE_INFINITY infinity} and the p-value is zero. 229 * <li>If the value in every group is the same (no variance within or between groups), 230 * the f-value and p-value are {@link Double#NaN NaN}. 231 * </ul> 232 * 233 * @param data Category summary data. 234 * @return test result 235 * @throws IllegalArgumentException if the number of categories is less than 236 * two; a contained category does not have at least one value; or all 237 * categories have only one value (zero degrees of freedom within groups) 238 */ 239 public Result test(Collection<double[]> data) { 240 return aov(data, null); 241 } 242 243 /** 244 * Performs an ANOVA test for a collection of category data, evaluating the null 245 * hypothesis that there is no difference among the means of the data categories. 246 * 247 * <p>This is a utility method to allow computation of the F statistic without 248 * the p-value or partitioning of the variance. If the {@code statistic} is not null 249 * the method will record the F statistic in the array and return null. 250 * 251 * @param data Category summary data. 252 * @param statistic Result for the F statistic (or null). 253 * @return test result (or null) 254 * @throws IllegalArgumentException if the number of categories is less than two; a 255 * contained category does not have at least one value; or all categories have only 256 * one value (zero degrees of freedom within groups) 257 */ 258 private static Result aov(Collection<double[]> data, double[] statistic) { 259 Arguments.checkCategoriesRequiredSize(data.size(), 2); 260 long n = 0; 261 for (final double[] array : data) { 262 n += array.length; 263 Arguments.checkValuesRequiredSize(array.length, 1); 264 } 265 final long dfwg = n - data.size(); 266 if (dfwg == 0) { 267 throw new InferenceException(InferenceException.ZERO, "Degrees of freedom within groups"); 268 } 269 270 // wg = within group 271 // bg = between group 272 273 // F = Var(bg) / Var(wg) 274 // Var = SS / df 275 // SStotal = sum((x - u)^2) = sum(x^2) - sum(x)^2/n 276 // = SSwg + SSbg 277 // Some cancellation of terms reduces the computation to 3 sums: 278 // SSwg = [ sum(x^2) - sum(x)^2/n ] - [ sum_g { sum(sum(x^2) - sum(x)^2/n) } ] 279 // SSbg = SStotal - SSwg 280 // = sum_g { sum(x)^2/n) } - sum(x)^2/n 281 // SSwg = SStotal - SSbg 282 // = sum(x^2) - sum_g { sum(x)^2/n) } 283 284 // Stabilize the computation by shifting all to a common mean of zero. 285 // This minimise the magnitude of x^2 terms. 286 // The terms sum(x)^2/n -> 0. Included them to capture the round-off. 287 final double m = StatisticUtils.mean(data); 288 final Sum sxx = Sum.create(); 289 final Sum sx = Sum.create(); 290 final Sum sg = Sum.create(); 291 // Track if each group had the same value 292 boolean eachSame = true; 293 for (final double[] array : data) { 294 eachSame = eachSame && allMatch(array[0], array); 295 final Sum s = Sum.create(); 296 for (final double v : array) { 297 final double x = v - m; 298 s.add(x); 299 // sum(x) 300 sx.add(x); 301 // sum(x^2) 302 sxx.add(x * x); 303 } 304 // Create the negative sum so we can subtract it via 'add' 305 // -sum_g { sum(x)^2/n) } 306 sg.add(-pow2(s.getAsDouble()) / array.length); 307 } 308 309 // Note: SS terms should not be negative given: 310 // SS = sum((x - u)^2) 311 // This can happen due to floating-point error in sum(x^2) - sum(x)^2/n 312 final double sswg = Math.max(0, sxx.add(sg).getAsDouble()); 313 // Flip the sign back 314 final double ssbg = Math.max(0, -sg.add(pow2(sx.getAsDouble()) / n).getAsDouble()); 315 final int dfbg = data.size() - 1; 316 // Handle edge-cases: 317 // Note: 0 / 0 -> NaN : x / 0 -> inf 318 // These are documented results and should output p=NaN or 0. 319 // This result will occur naturally. 320 // However the SS totals may not be 0.0 so we correct these cases. 321 final boolean allSame = eachSame && allMatch(data); 322 final double msbg = allSame ? 0 : ssbg / dfbg; 323 final double mswg = eachSame ? 0 : sswg / dfwg; 324 final double f = msbg / mswg; 325 if (statistic != null) { 326 statistic[0] = f; 327 return null; 328 } 329 final double p = FDistribution.of(dfbg, dfwg).survivalProbability(f); 330 331 // Support partitioning the variance 332 // ni = size of each of the groups 333 // nO=(1/(a−1))*(sum(ni)−(sum(ni^2)/sum(ni)) 334 final double nO = (n - data.stream() 335 .mapToDouble(x -> pow2(x.length)).sum() / n) / dfbg; 336 337 return new Result(dfbg, dfwg, msbg, mswg, nO, f, p); 338 } 339 340 /** 341 * Return true if all values in the array match the specified value. 342 * 343 * @param v Value. 344 * @param a Array. 345 * @return true if all match 346 */ 347 private static boolean allMatch(double v, double[] a) { 348 for (final double w : a) { 349 if (v != w) { 350 return false; 351 } 352 } 353 return true; 354 } 355 356 /** 357 * Return true if all values in the arrays match. 358 * 359 * <p>Assumes that there are at least two arrays and that each array has the same 360 * value throughout. Thus only the first element in each array is checked. 361 * 362 * @param data Arrays. 363 * @return true if all match 364 */ 365 private static boolean allMatch(Collection<double[]> data) { 366 final Iterator<double[]> iter = data.iterator(); 367 final double v = iter.next()[0]; 368 while (iter.hasNext()) { 369 if (iter.next()[0] != v) { 370 return false; 371 } 372 } 373 return true; 374 } 375 376 /** 377 * Compute {@code x^2}. 378 * 379 * @param x Value. 380 * @return {@code x^2} 381 */ 382 private static double pow2(double x) { 383 return x * x; 384 } 385}