001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.inference; 018 019import java.util.Objects; 020import org.apache.commons.statistics.distribution.HypergeometricDistribution; 021 022/** 023 * Implements Fisher's exact test. 024 * 025 * <p>Performs an exact test for the statistical significance of the association (contingency) 026 * between two kinds of categorical classification. 027 * 028 * <p>Fisher's test applies in the case that the row sums and column sums are fixed in advance 029 * and not random. 030 * 031 * @see <a href="https://en.wikipedia.org/wiki/Fisher%27s_exact_test">Fisher's exact test (Wikipedia)</a> 032 * @since 1.1 033 */ 034public final class FisherExactTest { 035 /** Default instance. */ 036 private static final FisherExactTest DEFAULT = new FisherExactTest(AlternativeHypothesis.TWO_SIDED); 037 038 /** Alternative hypothesis. */ 039 private final AlternativeHypothesis alternative; 040 041 /** 042 * @param alternative Alternative hypothesis. 043 */ 044 private FisherExactTest(AlternativeHypothesis alternative) { 045 this.alternative = alternative; 046 } 047 048 /** 049 * Return an instance using the default options. 050 * 051 * <ul> 052 * <li>{@link AlternativeHypothesis#TWO_SIDED} 053 * </ul> 054 * 055 * @return default instance 056 */ 057 public static FisherExactTest withDefaults() { 058 return DEFAULT; 059 } 060 061 /** 062 * Return an instance with the configured alternative hypothesis. 063 * 064 * @param v Value. 065 * @return an instance 066 */ 067 public FisherExactTest with(AlternativeHypothesis v) { 068 return new FisherExactTest(Objects.requireNonNull(v)); 069 } 070 071 /** 072 * Compute the prior odds ratio for the 2-by-2 contingency table. This is the 073 * "sample" or "unconditional" maximum likelihood estimate. For a table of: 074 * 075 * <p>\[ \left[ {\begin{array}{cc} 076 * a & b \\ 077 * c & d \\ 078 * \end{array} } \right] \] 079 * 080 * <p>this is: 081 * 082 * <p>\[ r = \frac{a d}{b c} \] 083 * 084 * <p>Special cases: 085 * <ul> 086 * <li>If the denominator is zero, the value is {@linkplain Double#POSITIVE_INFINITY infinity}. 087 * <li>If a row or column sum is zero, the value is {@link Double#NaN NaN}. 088 * </ul> 089 * 090 * <p>Note: This statistic is equal to the statistic computed by the SciPy function 091 * {@code scipy.stats.fisher_exact}. It is different to the conditional maximum 092 * likelihood estimate computed by R function {@code fisher.test}. 093 * 094 * @param table 2-by-2 contingency table. 095 * @return odds ratio 096 * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any 097 * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer. 098 * @see #test(int[][]) 099 */ 100 public double statistic(int[][] table) { 101 Arguments.checkTable(table); 102 final double a = table[0][0]; 103 final double b = table[0][1]; 104 final double c = table[1][0]; 105 final double d = table[1][1]; 106 return (a * d) / (b * c); 107 } 108 109 /** 110 * Performs Fisher's exact test on the 2-by-2 contingency table. 111 * 112 * <p>The test statistic is equal to the prior odds ratio. This is the 113 * "sample" or "unconditional" maximum likelihood estimate. 114 * 115 * <p>The test is defined by the {@link AlternativeHypothesis}. 116 * 117 * <p>For a table of [[a, b], [c, d]] the possible values of any table are conditioned 118 * with the same marginals (row and column totals). In this case the possible values {@code x} 119 * of the upper-left element {@code a} are {@code min(0, a - d) <= x <= a + min(b, c)}. 120 * <ul> 121 * <li>'two-sided': the odds ratio of the underlying population is not one; the p-value 122 * is the probability that a random table has probability equal to or less than the input table. 123 * <li>'greater': the odds ratio of the underlying population is greater than one; the p-value 124 * is the probability that a random table has {@code x >= a}. 125 * <li>'less': the odds ratio of the underlying population is less than one; the p-value 126 * is the probability that a random table has {@code x <= a}. 127 * </ul> 128 * 129 * @param table 2-by-2 contingency table. 130 * @return test result 131 * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any 132 * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer. 133 * @see #with(AlternativeHypothesis) 134 * @see #statistic(int[][]) 135 */ 136 public SignificanceResult test(int[][] table) { 137 Arguments.checkTable(table); 138 final int a = table[0][0]; 139 final int b = table[0][1]; 140 final int c = table[1][0]; 141 final int d = table[1][1]; 142 143 // Odd-ratio. 144 final double statistic = ((double) a * d) / ((double) b * c); 145 146 final int nn = a + b + c + d; 147 final int k = a + b; 148 final int n = a + c; 149 150 // Note: The distribution validates the population size is > 0 151 final HypergeometricDistribution distribution = HypergeometricDistribution.of(nn, k, n); 152 final double p; 153 if (alternative == AlternativeHypothesis.GREATER_THAN) { 154 p = distribution.survivalProbability(a - 1); 155 } else if (alternative == AlternativeHypothesis.LESS_THAN) { 156 p = distribution.cumulativeProbability(a); 157 } else { 158 p = twoSidedTest(a, distribution); 159 } 160 return new BaseSignificanceResult(statistic, p); 161 } 162 163 /** 164 * Returns the <i>observed significance level</i>, or p-value, associated with a 165 * two-sided test about the observed value. 166 * 167 * @param k Observed value. 168 * @param distribution Hypergeometric distribution. 169 * @return p-value 170 */ 171 private static double twoSidedTest(int k, HypergeometricDistribution distribution) { 172 // Find all i where Pr(X = i) <= Pr(X = k) and sum them. 173 // Exploit the known unimodal distribution to increase the 174 // search speed. Note the search depends only on magnitude differences. 175 // The current HypergeometricDistribution is faster using log probability 176 // as it omits a call to Math.exp. 177 178 // Use the mode as the point of largest probability. 179 // The lower or upper mode is important for the search below. 180 final int nn = distribution.getPopulationSize(); 181 final int kk = distribution.getNumberOfSuccesses(); 182 final int n = distribution.getSampleSize(); 183 final double v = ((double) n + 1) * ((double) kk + 1) / (nn + 2.0); 184 final int m1 = (int) Math.ceil(v) - 1; 185 final int m2 = (int) Math.floor(v); 186 if (k < m1) { 187 final double pk = distribution.logProbability(k); 188 // Lower half = cdf(k) 189 // Find upper half. As k < lower mode i should never 190 // reach the lower mode based on the probability alone. 191 // Bracket with the upper mode. 192 final int i = Searches.searchDescending(m2, distribution.getSupportUpperBound(), pk, 193 distribution::logProbability); 194 return distribution.cumulativeProbability(k) + 195 distribution.survivalProbability(i - 1); 196 } else if (k > m2) { 197 final double pk = distribution.logProbability(k); 198 // Upper half = sf(k - 1) 199 // Find lower half. As k > upper mode i should never 200 // reach the upper mode based on the probability alone. 201 // Bracket with the lower mode. 202 final int i = Searches.searchAscending(distribution.getSupportLowerBound(), m1, pk, 203 distribution::logProbability); 204 return distribution.cumulativeProbability(i) + 205 distribution.survivalProbability(k - 1); 206 } 207 // k == mode 208 // Edge case where the sum of probabilities will be either 209 // 1 or 1 - Pr(X = mode) where mode != k 210 final double pk = distribution.probability(k); 211 final double pm = distribution.probability(k == m1 ? m2 : m1); 212 return pm > pk ? 1 - pm : 1; 213 } 214}