/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import org.apache.commons.numbers.core.Sum;
import org.apache.commons.statistics.descriptive.LongMean;
import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

/**
 * Implements G-test (Generalized Log-Likelihood Ratio Test) statistics.
 *
 * <p>This is known in statistical genetics as the McDonald-Kreitman test.
 * The implementation handles both known and unknown distributions.
 *
 * <p>Two-sample tests can be used when the distribution is unknown <i>a priori</i>
 * but provided by one sample, or when the hypothesis under test is that the two
 * samples come from the same underlying distribution.
 *
 * @see <a href="https://en.wikipedia.org/wiki/G-test">G-test (Wikipedia)</a>
 * @since 1.1
 */
public final class GTest {
    // Note:
    // The g-test statistic is a summation of terms with positive and negative sign
    // and thus the sum may exhibit cancellation. This class uses separate high precision
    // sums of the positive and negative terms which are then combined.
    // Total cancellation for a large number of terms will not impact
    // p-values of interest around critical alpha values as the Chi^2
    // distribution exhibits strong concentration around its mean (degrees of freedom, k).
    // The summation need only maintain enough bits in the final sum to distinguish
    // g values around critical alpha values where 0 << chisq.sf(g, k) << 0.5: g > k,
    // with k = number of terms - 1.

    /** Default instance. */
    private static final GTest DEFAULT = new GTest(0);

    /** Degrees of freedom adjustment. */
    private final int degreesOfFreedomAdjustment;

    /**
     * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
     */
    private GTest(int degreesOfFreedomAdjustment) {
        this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
    }

    /**
     * Return an instance using the default options.
     *
     * <ul>
     * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
     * </ul>
     *
     * @return default instance
     */
    public static GTest withDefaults() {
        return DEFAULT;
    }

    /**
     * Return an instance with the configured degrees of freedom adjustment.
     *
     * <p>The default degrees of freedom for a sample of length {@code n} are
     * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
     * more parameters from the data in order to get the numbers for your null
     * hypothesis. For a distribution with {@code p} parameters where up to
     * {@code p} parameters have been estimated from the data, the degrees of freedom
     * are in the range {@code [n - 1 - p, n - 1]}.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the value is negative
     */
    public GTest withDegreesOfFreedomAdjustment(int v) {
        return new GTest(Arguments.checkNonNegative(v));
    }
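
    // Illustrative usage (comment-only sketch, not part of the class): obtaining
    // configured instances from the factory methods above. The adjustment of 1 is a
    // hypothetical choice for an intrinsic hypothesis where one parameter of the
    // expected distribution was estimated from the observed data.
    //
    //   GTest gTest = GTest.withDefaults();
    //   GTest adjusted = GTest.withDefaults().withDegreesOfFreedomAdjustment(1);
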
    /**
     * Computes the G-test goodness-of-fit statistic comparing the {@code observed} counts to
     * a uniform expected value (each category is equally likely).
     *
     * <p>Note: This is a specialized version of a comparison of {@code observed}
     * with an {@code expected} array of uniform values. It is faster than
     * calling {@link #statistic(double[], long[])} and the statistic is the same,
     * with an allowance for accumulated floating-point error due to the optimized
     * routine.
     *
     * @param observed Observed frequency counts.
     * @return G-test statistic
     * @throws IllegalArgumentException if the sample size is less than 2;
     * {@code observed} has negative entries; or all the observations are zero.
     * @see #test(long[])
     */
    public double statistic(long[] observed) {
        Arguments.checkValuesRequiredSize(observed.length, 2);
        Arguments.checkNonNegative(observed);
        final double e = LongMean.of(observed).getAsDouble();
        if (e == 0) {
            throw new InferenceException(InferenceException.NO_DATA);
        }
        // g = 2 * sum{o * ln(o/e)}
        //   = 2 * [ sum{o * ln(o)} - sum(o) * ln(e) ]
        // The second form has more cancellation as the sums are larger.
        // Separate sum for positive and negative terms.
        final Sum sum = Sum.create();
        final Sum sum2 = Sum.create();
        for (final double o : observed) {
            if (o > e) {
                // Positive term
                sum.add(o * Math.log(o / e));
            } else if (o > 0) {
                // Negative term
                // Process non-zero counts to avoid 0 * -inf = NaN
                sum2.add(o * Math.log(o / e));
            }
        }
        return sum.add(sum2).getAsDouble() * 2;
    }

    /**
     * Computes the G-test goodness-of-fit statistic comparing {@code observed} and {@code expected}
     * frequency counts.
     *
     * <p><strong>Note:</strong> This implementation rescales the values
     * if necessary to ensure that the sum of the expected and observed counts
     * are equal.
     *
     * @param expected Expected frequency counts.
     * @param observed Observed frequency counts.
     * @return G-test statistic
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; {@code expected} has entries that are not strictly
     * positive; {@code observed} has negative entries; or all the observations are zero.
     * @see #test(double[], long[])
     */
    public double statistic(double[] expected, long[] observed) {
        // g = 2 * sum{o * ln(o/e)}
        // The sum of o and e must be the same.
        final double ratio = StatisticUtils.computeRatio(expected, observed);
        // High precision sum to reduce cancellation.
        // Separate sum for positive and negative terms.
        final Sum sum = Sum.create();
        final Sum sum2 = Sum.create();
        for (int i = 0; i < observed.length; i++) {
            final long o = observed[i];
            // Process non-zero counts to avoid 0 * -inf = NaN
            if (o != 0) {
                final double term = o * Math.log(o / (ratio * expected[i]));
                if (term < 0) {
                    sum2.add(term);
                } else {
                    sum.add(term);
                }
            }
        }
        return sum.add(sum2).getAsDouble() * 2;
    }
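
    // Illustrative usage (comment-only sketch): computing the goodness-of-fit
    // statistic with the two methods above. The counts are hypothetical; the
    // expected array is rescaled internally when its sum differs from the
    // observed total.
    //
    //   long[] observed = {21, 18, 25, 16};
    //   // Uniform expected value (each category equally likely).
    //   double g1 = GTest.withDefaults().statistic(observed);
    //   // Explicit expected counts.
    //   double[] expected = {20, 20, 20, 20};
    //   double g2 = GTest.withDefaults().statistic(expected, observed);
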
    /**
     * Computes a G-test statistic associated with a G-test of
     * independence based on the input {@code counts} array, viewed as a two-way
     * table. The formula used to compute the test statistic is:
     *
     * <p>\[ G = 2 \cdot \sum_{ij}{O_{ij}} \cdot \left[ H(r) + H(c) - H(r,c) \right] \]
     *
     * <p>and \( H \) is the <a
     * href="https://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
     * Shannon Entropy</a> of the random variable formed by viewing the elements of
     * the argument array as incidence counts:
     *
     * <p>\[ H(X) = - {\sum_{x \in \text{Supp}(X)} p(x) \ln p(x)} \]
     *
     * @param counts 2-way table.
     * @return G-test statistic
     * @throws IllegalArgumentException if the number of rows or columns is less
     * than 2; the array is non-rectangular; the array has negative entries; or the
     * sum of a row or column is zero.
     * @see ChiSquareTest#test(long[][])
     */
    public double statistic(long[][] counts) {
        Arguments.checkCategoriesRequiredSize(counts.length, 2);
        Arguments.checkValuesRequiredSize(counts[0].length, 2);
        Arguments.checkRectangular(counts);
        Arguments.checkNonNegative(counts);

        final int ni = counts.length;
        final int nj = counts[0].length;

        // Compute row, column and total sums
        final double[] sumi = new double[ni];
        final double[] sumj = new double[nj];
        double n = 0;
        // We can sum data on the first pass. See below for computation details.
        final Sum sum = Sum.create();
        for (int i = 0; i < ni; i++) {
            for (int j = 0; j < nj; j++) {
                final long c = counts[i][j];
                sumi[i] += c;
                sumj[j] += c;
                if (c > 1) {
                    sum.add(c * Math.log(c));
                }
            }
            checkNonZero(sumi[i], "Row", i);
            n += sumi[i];
        }

        for (int j = 0; j < nj; j++) {
            checkNonZero(sumj[j], "Column", j);
        }

        // This computes a modified form of the Shannon entropy H without requiring
        // normalisation of observations to probabilities and without negation,
        // i.e. we compute n * [ H(r) + H(c) - H(r,c) ] as [ H'(r,c) - H'(r) - H'(c) ].

        // H  = -sum (p * log(p))
        // H' = n * sum (p * log(p))
        //    = n * sum (o/n * log(o/n))
        //    = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ]
        //    = sum(o * log(o)) - n log(n)

        // After 3 modified entropy sums H'(r,c) - H'(r) - H'(c) compensation is (-1 + 2) * n log(n)
        sum.addProduct(n, Math.log(n));
        // Negative terms
        final Sum sum2 = Sum.create();
        // All these counts are above zero so no check for zeros
        for (final double c : sumi) {
            sum2.add(c * -Math.log(c));
        }
        for (final double c : sumj) {
            sum2.add(c * -Math.log(c));
        }

        return sum.add(sum2).getAsDouble() * 2;
    }
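
    // Illustrative usage (comment-only sketch): the independence statistic for a
    // hypothetical 2x2 contingency table. Every row and column sum must be non-zero.
    //
    //   long[][] counts = {{43, 17}, {52, 24}};
    //   double g = GTest.withDefaults().statistic(counts);
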
    /**
     * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
     * counts conform to a uniform distribution (each category is equally likely).
     *
     * @param observed Observed frequency counts.
     * @return test result
     * @throws IllegalArgumentException if the sample size is less than 2;
     * {@code observed} has negative entries; or all the observations are zero
     * @see #statistic(long[])
     */
    public SignificanceResult test(long[] observed) {
        final int df = observed.length - 1;
        final double g = statistic(observed);
        final double p = computeP(g, df);
        return new BaseSignificanceResult(g, p);
    }

    /**
     * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed}
     * counts conform to the {@code expected} counts.
     *
     * <p>The test can be configured to apply an adjustment to the degrees of freedom
     * if the observed data has been used to create the expected counts.
     *
     * @param expected Expected frequency counts.
     * @param observed Observed frequency counts.
     * @return test result
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; {@code expected} has entries that are not strictly
     * positive; {@code observed} has negative entries; all the observations are zero; or
     * the adjusted degrees of freedom are not strictly positive
     * @see #withDegreesOfFreedomAdjustment(int)
     * @see #statistic(double[], long[])
     */
    public SignificanceResult test(double[] expected, long[] observed) {
        final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
        final double g = statistic(expected, observed);
        final double p = computeP(g, df);
        return new BaseSignificanceResult(g, p);
    }

    /**
     * Perform a G-test of independence based on the input
     * {@code counts} array, viewed as a two-way table.
     *
     * @param counts 2-way table.
     * @return test result
     * @throws IllegalArgumentException if the number of rows or columns is less
     * than 2; the array is non-rectangular; the array has negative entries; or the
     * sum of a row or column is zero.
     * @see #statistic(long[][])
     */
    public SignificanceResult test(long[][] counts) {
        final double g = statistic(counts);
        final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
        final double p = computeP(g, df);
        return new BaseSignificanceResult(g, p);
    }

    /**
     * Compute the G-test p-value.
     *
     * @param g G-test statistic.
     * @param degreesOfFreedom Degrees of freedom.
     * @return p-value
     */
    private static double computeP(double g, double degreesOfFreedom) {
        return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(g);
    }

    /**
     * Check the array value is non-zero.
     *
     * @param value Value
     * @param name Name of the array
     * @param index Index in the array
     * @throws IllegalArgumentException if the value is zero
     */
    private static void checkNonZero(double value, String name, int index) {
        if (value == 0) {
            throw new InferenceException(InferenceException.ZERO_AT, name, index);
        }
    }
}
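
// Illustrative usage (comment-only sketch, hypothetical data): performing the
// significance tests. This assumes SignificanceResult exposes the p-value via
// getPValue(); the p-value is the chi-squared survival probability of the
// statistic at the degrees of freedom described in the method javadoc above.
//
//   SignificanceResult r = GTest.withDefaults().test(new long[][] {{43, 17}, {52, 24}});
//   double p = r.getPValue();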