001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.inference; 018 019import org.apache.commons.statistics.descriptive.LongMean; 020import org.apache.commons.statistics.distribution.ChiSquaredDistribution; 021 022/** 023 * Implements chi-square test statistics. 024 * 025 * <p>This implementation handles both known and unknown distributions. 026 * 027 * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i> 028 * but provided by one sample, or when the hypothesis under test is that the two 029 * samples come from the same underlying distribution. 030 * 031 * @see <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-square test (Wikipedia)</a> 032 * @since 1.1 033 */ 034public final class ChiSquareTest { 035 /** Name for the row. */ 036 private static final String ROW = "row"; 037 /** Name for the column. */ 038 private static final String COLUMN = "column"; 039 /** Default instance. */ 040 private static final ChiSquareTest DEFAULT = new ChiSquareTest(0); 041 042 /** Degrees of freedom adjustment. */ 043 private final int degreesOfFreedomAdjustment; 044 045 /** 046 * @param degreesOfFreedomAdjustment Degrees of freedom adjustment. 047 */ 048 private ChiSquareTest(int degreesOfFreedomAdjustment) { 049 this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment; 050 } 051 052 /** 053 * Return an instance using the default options. 054 * 055 * <ul> 056 * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0} 057 * </ul> 058 * 059 * @return default instance 060 */ 061 public static ChiSquareTest withDefaults() { 062 return DEFAULT; 063 } 064 065 /** 066 * Return an instance with the configured degrees of freedom adjustment. 067 * 068 * <p>The default degrees of freedom for a sample of length {@code n} are 069 * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or 070 * more parameters from the data in order to get the numbers for your null 071 * hypothesis. For a distribution with {@code p} parameters where up to 072 * {@code p} parameters have been estimated from the data the degrees of freedom 073 * is in the range {@code [n - 1 - p, n - 1]}. 074 * 075 * @param v Value. 076 * @return an instance 077 * @throws IllegalArgumentException if the value is negative 078 */ 079 public ChiSquareTest withDegreesOfFreedomAdjustment(int v) { 080 return new ChiSquareTest(Arguments.checkNonNegative(v)); 081 } 082 083 /** 084 * Computes the chi-square goodness-of-fit statistic comparing the {@code observed} counts to a 085 * uniform expected value (each category is equally likely). 086 * 087 * <p>Note: This is a specialized version of a comparison of {@code observed} 088 * with an {@code expected} array of uniform values. The result is faster than 089 * calling {@link #statistic(double[], long[])} and the statistic is the same, 090 * with an allowance for accumulated floating-point error due to the optimized 091 * routine. 092 * 093 * @param observed Observed frequency counts. 094 * @return Chi-square statistic 095 * @throws IllegalArgumentException if the sample size is less than 2; 096 * {@code observed} has negative entries; or all the observations are zero. 097 * @see #test(long[]) 098 */ 099 public double statistic(long[] observed) { 100 Arguments.checkValuesRequiredSize(observed.length, 2); 101 Arguments.checkNonNegative(observed); 102 final double e = LongMean.of(observed).getAsDouble(); 103 if (e == 0) { 104 throw new InferenceException(InferenceException.NO_DATA); 105 } 106 // chi2 = sum{ (o-e)^2 / e }. Use a single division at the end. 107 double chi2 = 0; 108 for (final long o : observed) { 109 final double d = o - e; 110 chi2 += d * d; 111 } 112 return chi2 / e; 113 } 114 115 /** 116 * Computes the chi-square goodness-of-fit statistic comparing {@code observed} and 117 * {@code expected} frequency counts. 118 * 119 * <p><strong>Note:</strong>This implementation rescales the {@code expected} 120 * array if necessary to ensure that the sum of the expected and observed counts 121 * are equal. 122 * 123 * @param expected Expected frequency counts. 124 * @param observed Observed frequency counts. 125 * @return Chi-square statistic 126 * @throws IllegalArgumentException if the sample size is less than 2; the array 127 * sizes do not match; {@code expected} has entries that are not strictly 128 * positive; {@code observed} has negative entries; or all the observations are zero. 129 * @see #test(double[], long[]) 130 */ 131 public double statistic(double[] expected, long[] observed) { 132 final double ratio = StatisticUtils.computeRatio(expected, observed); 133 // chi2 = sum{ (o-e)^2 / e } 134 double chi2 = 0; 135 for (int i = 0; i < observed.length; i++) { 136 final double e = ratio * expected[i]; 137 final double d = observed[i] - e; 138 chi2 += d * d / e; 139 } 140 return chi2; 141 } 142 143 /** 144 * Computes the chi-square statistic associated with a chi-square test of 145 * independence based on the input {@code counts} array, viewed as a two-way 146 * table in row-major format. 147 * 148 * @param counts 2-way table. 149 * @return Chi-square statistic 150 * @throws IllegalArgumentException if the number of rows or columns is less 151 * than 2; the array is non-rectangular; the array has negative entries; or the 152 * sum of a row or column is zero. 153 * @see #test(long[][]) 154 */ 155 public double statistic(long[][] counts) { 156 Arguments.checkCategoriesRequiredSize(counts.length, 2); 157 Arguments.checkValuesRequiredSize(counts[0].length, 2); 158 Arguments.checkRectangular(counts); 159 Arguments.checkNonNegative(counts); 160 161 final int nRows = counts.length; 162 final int nCols = counts[0].length; 163 164 // compute row, column and total sums 165 final double[] rowSum = new double[nRows]; 166 final double[] colSum = new double[nCols]; 167 double sum = 0; 168 for (int row = 0; row < nRows; row++) { 169 for (int col = 0; col < nCols; col++) { 170 rowSum[row] += counts[row][col]; 171 colSum[col] += counts[row][col]; 172 } 173 checkNonZero(rowSum[row], ROW, row); 174 sum += rowSum[row]; 175 } 176 177 for (int col = 0; col < nCols; col++) { 178 checkNonZero(colSum[col], COLUMN, col); 179 } 180 181 // Compute expected counts and chi-square 182 double chi2 = 0; 183 for (int row = 0; row < nRows; row++) { 184 for (int col = 0; col < nCols; col++) { 185 final double e = (rowSum[row] * colSum[col]) / sum; 186 final double d = counts[row][col] - e; 187 chi2 += d * d / e; 188 } 189 } 190 return chi2; 191 } 192 193 /** 194 * Computes a chi-square statistic associated with a chi-square test of 195 * independence of frequency counts in {@code observed1} and {@code observed2}. 196 * The sums of frequency counts in the two samples are not required to be the 197 * same. The formula used to compute the test statistic is: 198 * 199 * <p>\[ \sum_i{ \frac{(K * a_i - b_i / K)^2}{a_i + b_i} } \] 200 * 201 * <p>where 202 * 203 * <p>\[ K = \sqrt{ \sum_i{a_i} / \sum_i{b_i} } \] 204 * 205 * <p>Note: This is a specialized version of a 2-by-n contingency table. The 206 * result is faster than calling {@link #statistic(long[][])} with the table 207 * composed as {@code new long[][]{observed1, observed2}}. The statistic is the 208 * same, with an allowance for accumulated floating-point error due to the 209 * optimized routine. 210 * 211 * @param observed1 Observed frequency counts of the first data set. 212 * @param observed2 Observed frequency counts of the second data set. 213 * @return Chi-square statistic 214 * @throws IllegalArgumentException if the sample size is less than 2; the array 215 * sizes do not match; either array has entries that are negative; either all 216 * counts of {@code observed1} or {@code observed2} are zero; or if the count at 217 * some index is zero for both arrays. 218 * @see ChiSquareTest#test(long[], long[]) 219 */ 220 public double statistic(long[] observed1, long[] observed2) { 221 Arguments.checkValuesRequiredSize(observed1.length, 2); 222 Arguments.checkValuesSizeMatch(observed1.length, observed2.length); 223 Arguments.checkNonNegative(observed1); 224 Arguments.checkNonNegative(observed2); 225 226 // Compute and compare count sums 227 long colSum1 = 0; 228 long colSum2 = 0; 229 for (int i = 0; i < observed1.length; i++) { 230 final long obs1 = observed1[i]; 231 final long obs2 = observed2[i]; 232 checkNonZero(obs1 | obs2, ROW, i); 233 colSum1 += obs1; 234 colSum2 += obs2; 235 } 236 // Create the same exception message as chiSquare(long[][]) 237 checkNonZero(colSum1, COLUMN, 0); 238 checkNonZero(colSum2, COLUMN, 1); 239 240 // Compare and compute weight only if different 241 final boolean unequalCounts = colSum1 != colSum2; 242 final double weight = unequalCounts ? 243 Math.sqrt((double) colSum1 / colSum2) : 1; 244 // Compute chi-square 245 // This exploits an algebraic rearrangement of the generic n*m contingency table case 246 // for a single sum squared addition per row. 247 double chi2 = 0; 248 for (int i = 0; i < observed1.length; i++) { 249 final double obs1 = observed1[i]; 250 final double obs2 = observed2[i]; 251 // apply weights 252 final double d = unequalCounts ? 253 obs1 / weight - obs2 * weight : 254 obs1 - obs2; 255 chi2 += (d * d) / (obs1 + obs2); 256 } 257 return chi2; 258 } 259 260 /** 261 * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that 262 * the {@code observed} counts conform to a uniform distribution (each category 263 * is equally likely). 264 * 265 * @param observed Observed frequency counts. 266 * @return test result 267 * @throws IllegalArgumentException if the sample size is less than 2; 268 * {@code observed} has negative entries; or all the observations are zero 269 * @see #statistic(long[]) 270 */ 271 public SignificanceResult test(long[] observed) { 272 final int df = observed.length - 1; 273 final double chi2 = statistic(observed); 274 final double p = computeP(chi2, df); 275 return new BaseSignificanceResult(chi2, p); 276 } 277 278 /** 279 * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that the 280 * {@code observed} counts conform to the {@code expected} counts. 281 * 282 * <p>The test can be configured to apply an adjustment to the degrees of freedom 283 * if the observed data has been used to create the expected counts. 284 * 285 * @param expected Expected frequency counts. 286 * @param observed Observed frequency counts. 287 * @return test result 288 * @throws IllegalArgumentException if the sample size is less than 2; the array 289 * sizes do not match; {@code expected} has entries that are not strictly 290 * positive; {@code observed} has negative entries; all the observations are zero; or 291 * the adjusted degrees of freedom are not strictly positive 292 * @see #withDegreesOfFreedomAdjustment(int) 293 * @see #statistic(double[], long[]) 294 */ 295 public SignificanceResult test(double[] expected, long[] observed) { 296 final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment); 297 final double chi2 = statistic(expected, observed); 298 final double p = computeP(chi2, df); 299 return new BaseSignificanceResult(chi2, p); 300 } 301 302 /** 303 * Perform a chi-square test of independence based on the input {@code counts} array, 304 * viewed as a two-way table. 305 * 306 * @param counts 2-way table. 307 * @return test result 308 * @throws IllegalArgumentException if the number of rows or columns is less 309 * than 2; the array is non-rectangular; the array has negative entries; or the 310 * sum of a row or column is zero. 311 * @see #statistic(long[][]) 312 */ 313 public SignificanceResult test(long[][] counts) { 314 final double chi2 = statistic(counts); 315 final double df = (counts.length - 1.0) * (counts[0].length - 1.0); 316 final double p = computeP(chi2, df); 317 return new BaseSignificanceResult(chi2, p); 318 } 319 320 /** 321 * Perform a chi-square test of independence of frequency counts in 322 * {@code observed1} and {@code observed2}. 323 * 324 * <p>Note: This is a specialized version of a 2-by-n contingency table. 325 * 326 * @param observed1 Observed frequency counts of the first data set. 327 * @param observed2 Observed frequency counts of the second data set. 328 * @return test result 329 * @throws IllegalArgumentException if the sample size is less than 2; the array 330 * sizes do not match; either array has entries that are negative; either all 331 * counts of {@code observed1} or {@code observed2} are zero; or if the count at 332 * some index is zero for both arrays. 333 * @see #statistic(long[], long[]) 334 */ 335 public SignificanceResult test(long[] observed1, long[] observed2) { 336 final double chi2 = statistic(observed1, observed2); 337 final double p = computeP(chi2, observed1.length - 1.0); 338 return new BaseSignificanceResult(chi2, p); 339 } 340 341 /** 342 * Compute the chi-square test p-value. 343 * 344 * @param chi2 Chi-square statistic. 345 * @param degreesOfFreedom Degrees of freedom. 346 * @return p-value 347 */ 348 private static double computeP(double chi2, double degreesOfFreedom) { 349 return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(chi2); 350 } 351 352 /** 353 * Check the array value is non-zero. 354 * 355 * @param value Value 356 * @param name Name of the array 357 * @param index Index in the array 358 * @throws IllegalArgumentException if the value is zero 359 */ 360 private static void checkNonZero(double value, String name, int index) { 361 if (value == 0) { 362 throw new InferenceException(InferenceException.ZERO_AT, name, index); 363 } 364 } 365}