/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import java.lang.ref.SoftReference;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Objects;
import java.util.stream.IntStream;
import org.apache.commons.numbers.combinatorics.BinomialCoefficientDouble;
import org.apache.commons.statistics.distribution.NormalDistribution;
import org.apache.commons.statistics.ranking.NaNStrategy;
import org.apache.commons.statistics.ranking.NaturalRanking;
import org.apache.commons.statistics.ranking.RankingAlgorithm;
import org.apache.commons.statistics.ranking.TiesStrategy;

/**
 * Implements the Mann-Whitney U test (also called Wilcoxon rank-sum test).
 *
 * @see <a href="https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test">
 * Mann-Whitney U test (Wikipedia)</a>
 * @since 1.1
 */
public final class MannWhitneyUTest {
    /** Limit on sample size for the exact p-value computation for the auto mode. */
    private static final int AUTO_LIMIT = 50;
    /** Ranking instance. */
    private static final RankingAlgorithm RANKING = new NaturalRanking(NaNStrategy.FAILED, TiesStrategy.AVERAGE);
    /** Value for an unset f computation. */
    private static final double UNSET = -1;
    /** An object to use for synchronization when accessing the cache of F. */
    private static final Object LOCK = new Object();
    /** A reference to a previously computed storage for f.
     * Use of a SoftReference ensures this is garbage collected before an OutOfMemoryError.
     * The value should only be accessed, checked for size and optionally
     * modified when holding the lock. When the storage is determined to be the correct
     * size it can be returned for read/write to the array when not holding the lock. */
    private static SoftReference<double[][][]> cacheF = new SoftReference<>(null);
    /** Default instance. */
    private static final MannWhitneyUTest DEFAULT = new MannWhitneyUTest(
        AlternativeHypothesis.TWO_SIDED, PValueMethod.AUTO, true, 0);

    /** Alternative hypothesis. */
    private final AlternativeHypothesis alternative;
    /** Method to compute the p-value. */
    private final PValueMethod pValueMethod;
    /** Perform continuity correction. */
    private final boolean continuityCorrection;
    /** Expected location shift. */
    private final double mu;

    /**
     * Result for the Mann-Whitney U test.
     *
     * <p>This class is immutable.
     *
     * @since 1.1
     */
    public static final class Result extends BaseSignificanceResult {
        /** Flag indicating the data has tied values. */
        private final boolean tiedValues;

        /**
         * Create an instance.
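         *
         * <p>The {@code statistic} should be the Mann-Whitney U<sub>1</sub> value
         * (see {@link #getStatistic()}).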
         *
         * @param statistic Test statistic.
         * @param tiedValues Flag indicating the data has tied values.
         * @param p Result p-value.
         */
        Result(double statistic, boolean tiedValues, double p) {
            super(statistic, p);
            this.tiedValues = tiedValues;
        }

        /**
         * {@inheritDoc}
         *
         * <p>This is the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic from
         * the original sample lengths {@code n} and {@code m} using:
         * <pre>
         * u2 = (long) n * m - u1;
         * </pre>
         */
        @Override
        public double getStatistic() {
            // Note: This method is here for documentation
            return super.getStatistic();
        }

        /**
         * Return {@code true} if the data had tied values.
         *
         * <p>Note: The exact computation cannot be used when there are tied values.
         *
         * @return {@code true} if there were tied values
         */
        public boolean hasTiedValues() {
            return tiedValues;
        }
    }

    /**
     * @param alternative Alternative hypothesis.
     * @param method P-value method.
     * @param continuityCorrection true to perform continuity correction.
     * @param mu Expected location shift.
     */
    private MannWhitneyUTest(AlternativeHypothesis alternative, PValueMethod method,
        boolean continuityCorrection, double mu) {
        this.alternative = alternative;
        this.pValueMethod = method;
        this.continuityCorrection = continuityCorrection;
        this.mu = mu;
    }

    /**
     * Return an instance using the default options.
     *
     * <ul>
     * <li>{@link AlternativeHypothesis#TWO_SIDED}
     * <li>{@link PValueMethod#AUTO}
     * <li>{@link ContinuityCorrection#ENABLED}
     * <li>{@linkplain #withMu(double) mu = 0}
     * </ul>
     *
     * @return default instance
     */
    public static MannWhitneyUTest withDefaults() {
        return DEFAULT;
    }

    /**
     * Return an instance with the configured alternative hypothesis.
     *
     * @param v Value.
     * @return an instance
     */
    public MannWhitneyUTest with(AlternativeHypothesis v) {
        return new MannWhitneyUTest(Objects.requireNonNull(v), pValueMethod, continuityCorrection, mu);
    }

    /**
     * Return an instance with the configured p-value method.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the value is not in the allowed options or is null
     */
    public MannWhitneyUTest with(PValueMethod v) {
        return new MannWhitneyUTest(alternative,
            Arguments.checkOption(v, EnumSet.of(PValueMethod.AUTO, PValueMethod.EXACT, PValueMethod.ASYMPTOTIC)),
            continuityCorrection, mu);
    }

    /**
     * Return an instance with the configured continuity correction.
     *
     * <p>If {@link ContinuityCorrection#ENABLED ENABLED}, adjust the U rank statistic by
     * 0.5 towards the mean value when computing the z-statistic if a normal approximation is used
     * to compute the p-value.
     *
     * @param v Value.
     * @return an instance
     */
    public MannWhitneyUTest with(ContinuityCorrection v) {
        return new MannWhitneyUTest(alternative, pValueMethod,
            Objects.requireNonNull(v) == ContinuityCorrection.ENABLED, mu);
    }

    /**
     * Return an instance with the configured location shift {@code mu}.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the value is not finite
     */
    public MannWhitneyUTest withMu(double v) {
        return new MannWhitneyUTest(alternative, pValueMethod, continuityCorrection, Arguments.checkFinite(v));
    }

    /**
     * Computes the Mann-Whitney U statistic comparing two independent
     * samples possibly of different length.
     *
     * <p>This statistic can be used to perform a Mann-Whitney U test evaluating the
     * null hypothesis that the two independent samples differ by a location shift of {@code mu}.
     *
     * <p>This returns the U<sub>1</sub> statistic. Compute the U<sub>2</sub> statistic using:
     * <pre>
     * u2 = (long) x.length * y.length - u1;
     * </pre>
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return Mann-Whitney U<sub>1</sub> statistic
     * @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
     * NaN values.
     * @see #withMu(double)
     */
    public double statistic(double[] x, double[] y) {
        checkSamples(x, y);

        final double[] z = concatenateSamples(mu, x, y);
        final double[] ranks = RANKING.apply(z);

        // The ranks for x are in the first x.length entries in ranks because x
        // is in the first x.length entries in z
        final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();

        // U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is the sum of ranks for sample 1,
        // e.g. x, and n1 is the number of observations in sample 1.
        return sumRankX - ((long) x.length * (x.length + 1)) * 0.5;
    }

    /**
     * Performs a Mann-Whitney U test comparing the location for two independent
     * samples. The location is specified using {@link #withMu(double) mu}.
     *
     * <p>The test is defined by the {@link AlternativeHypothesis}.
     * <ul>
     * <li>'two-sided': the distribution underlying {@code (x - mu)} is not equal to the
     * distribution underlying {@code y}.
     * <li>'greater': the distribution underlying {@code (x - mu)} is stochastically greater than
     * the distribution underlying {@code y}.
     * <li>'less': the distribution underlying {@code (x - mu)} is stochastically less than
     * the distribution underlying {@code y}.
     * </ul>
     *
     * <p>If the p-value method is {@linkplain PValueMethod#AUTO auto} an exact p-value is
     * computed if both samples contain fewer than 50 values; otherwise a normal
     * approximation is used.
     *
     * <p>Computation of the exact p-value is only valid if there are no tied
     * ranks in the data; otherwise the computation falls back to the asymptotic
     * approximation using a tie correction and an optional continuity correction.
     *
     * <p><strong>Note: </strong>
     * Exact computation requires tabulation of values not exceeding size
     * {@code (n+1)*(m+1)*(u+1)} where {@code u} is the minimum of the U<sub>1</sub> and
     * U<sub>2</sub> statistics and {@code n} and {@code m} are the sample sizes.
     * This may use a very large amount of memory and result in an {@link OutOfMemoryError}.
     * Exact computation requires a finite binomial coefficient {@code binom(n+m, m)}
     * which is limited to {@code n+m <= 1029} for any {@code n} and {@code m},
     * or {@code min(n, m) <= 37} for any {@code max(n, m)}.
     * An {@link OutOfMemoryError} is not expected using the
     * limits configured for the {@linkplain PValueMethod#AUTO auto} p-value computation
     * as the maximum required memory is approximately 23 MiB.
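     *
     * <p>Example usage (a minimal sketch; the sample data is illustrative only and the
     * p-value accessor is assumed from the inherited significance result API):
     * <pre>{@code
     * double[] x = {6.8, 3.1, 5.8, 4.5, 3.3, 4.7, 4.2, 4.9};
     * double[] y = {4.4, 2.5, 2.8, 2.1, 6.6, 1.0, 4.8, 2.3};
     * MannWhitneyUTest.Result r = MannWhitneyUTest.withDefaults()
     *     .with(AlternativeHypothesis.GREATER_THAN)
     *     .test(x, y);
     * double u1 = r.getStatistic();     // Mann-Whitney U1
     * double p = r.getPValue();         // one-sided p-value
     * boolean ties = r.hasTiedValues();
     * }</pre>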
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return test result
     * @throws IllegalArgumentException if {@code x} or {@code y} are zero-length; or contain
     * NaN values.
     * @throws OutOfMemoryError if the exact computation is <em>user-requested</em> for
     * large samples and there is not enough memory.
     * @see #statistic(double[], double[])
     * @see #withMu(double)
     * @see #with(AlternativeHypothesis)
     * @see #with(ContinuityCorrection)
     */
    public Result test(double[] x, double[] y) {
        // Computation as above. The ranks are required for the tie correction.
        checkSamples(x, y);
        final double[] z = concatenateSamples(mu, x, y);
        final double[] ranks = RANKING.apply(z);
        final double sumRankX = Arrays.stream(ranks).limit(x.length).sum();
        final double u1 = sumRankX - ((long) x.length * (x.length + 1)) * 0.5;

        final double c = WilcoxonSignedRankTest.calculateTieCorrection(ranks);
        final boolean tiedValues = c != 0;

        PValueMethod method = pValueMethod;
        final int n = x.length;
        final int m = y.length;
        if (method == PValueMethod.AUTO && Math.max(n, m) < AUTO_LIMIT) {
            method = PValueMethod.EXACT;
        }
        // Exact p requires no ties.
        // The method will fail-fast if the computation is not possible due
        // to the size of the data.
        double p = method == PValueMethod.EXACT && !tiedValues ?
            calculateExactPValue(u1, n, m, alternative) : -1;
        if (p < 0) {
            p = calculateAsymptoticPValue(u1, n, m, c);
        }
        return new Result(u1, tiedValues, p);
    }

    /**
     * Ensures that the provided arrays fulfil the assumptions.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @throws IllegalArgumentException if {@code x} or {@code y} are zero-length.
     */
    private static void checkSamples(double[] x, double[] y) {
        Arguments.checkValuesRequiredSize(x.length, 1);
        Arguments.checkValuesRequiredSize(y.length, 1);
    }

    /**
     * Concatenate the samples into one array. Subtract {@code mu} from the first sample.
     *
     * @param mu Expected location shift.
     * @param x First sample values.
     * @param y Second sample values.
     * @return concatenated array
     */
    private static double[] concatenateSamples(double mu, double[] x, double[] y) {
        final double[] z = new double[x.length + y.length];
        System.arraycopy(x, 0, z, 0, x.length);
        System.arraycopy(y, 0, z, x.length, y.length);
        if (mu != 0) {
            for (int i = 0; i < x.length; i++) {
                z[i] -= mu;
            }
        }
        return z;
    }

    /**
     * Calculate the asymptotic p-value using a normal approximation.
     *
     * @param u Mann-Whitney U value.
     * @param n1 Number of subjects in first sample.
     * @param n2 Number of subjects in second sample.
     * @param c Tie correction.
     * @return asymptotic p-value (two-sided, greater, or less using the options)
     */
    private double calculateAsymptoticPValue(double u, int n1, int n2, double c) {
        // Use long to avoid overflow
        final long n1n2 = (long) n1 * n2;
        final long n = (long) n1 + n2;

        // https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Normal_approximation_and_tie_correction
        final double e = n1n2 * 0.5;
        final double variance = (n1n2 / 12.0) * ((n + 1.0) - c / n / (n - 1));

        double z = u - e;
        if (continuityCorrection) {
            // +/- 0.5 is a continuity correction towards the expected.
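            // The adjustment is applied before dividing by the standard deviation and in
            // each branch it increases the resulting p-value, making the normal
            // approximation more conservative.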
            if (alternative == AlternativeHypothesis.GREATER_THAN) {
                z -= 0.5;
            } else if (alternative == AlternativeHypothesis.LESS_THAN) {
                z += 0.5;
            } else {
                // two-sided. Shift towards the expected of zero.
                // Use of signum ignores x==0 (i.e. not copySign(0.5, z))
                z -= Math.signum(z) * 0.5;
            }
        }
        z /= Math.sqrt(variance);

        final NormalDistribution standardNormal = NormalDistribution.of(0, 1);
        if (alternative == AlternativeHypothesis.GREATER_THAN) {
            return standardNormal.survivalProbability(z);
        }
        if (alternative == AlternativeHypothesis.LESS_THAN) {
            return standardNormal.cumulativeProbability(z);
        }
        // two-sided
        return 2 * standardNormal.survivalProbability(Math.abs(z));
    }

    /**
     * Calculate the exact p-value. If the value cannot be computed this returns -1.
     *
     * <p>Note: Computation may run out of memory during array allocation, or method
     * recursion.
     *
     * @param u Mann-Whitney U value.
     * @param m Number of subjects in first sample.
     * @param n Number of subjects in second sample.
     * @param alternative Alternative hypothesis.
     * @return exact p-value (or -1) (two-sided, greater, or less using the options)
     */
    // package-private for testing
    static double calculateExactPValue(double u, int m, int n, AlternativeHypothesis alternative) {
        // Check the computation can be attempted.
        // u must be an integer
        if ((int) u != u) {
            return -1;
        }
        // Note: n+m will not overflow as we concatenated the samples to a single array.
        final double binom = BinomialCoefficientDouble.value(n + m, m);
        if (binom == Double.POSITIVE_INFINITY) {
            return -1;
        }

        // Use u_min for the CDF.
        final int u1 = (int) u;
        final int u2 = (int) ((long) m * n - u1);
        // Use m < n to support symmetry.
        final int n1 = Math.min(m, n);
        final int n2 = Math.max(m, n);

        // Return the correct side:
        if (alternative == AlternativeHypothesis.GREATER_THAN) {
            // sf(u1 - 1)
            return sf(u1 - 1, u2 + 1, n1, n2, binom);
        }
        if (alternative == AlternativeHypothesis.LESS_THAN) {
            // cdf(u1)
            return cdf(u1, u2, n1, n2, binom);
        }
        // two-sided: 2 * sf(max(u1, u2) - 1) or 2 * cdf(min(u1, u2))
        final double p = 2 * computeCdf(Math.min(u1, u2), n1, n2, binom);
        // Clip to range: [0, 1]
        return Math.min(1, p);
    }

    /**
     * Compute the cumulative distribution function (CDF) of the Mann-Whitney U1 statistic.
     * The U2 statistic is passed for convenience to exploit symmetry in the distribution.
     *
     * @param u1 Mann-Whitney U1 statistic.
     * @param u2 Mann-Whitney U2 statistic.
     * @param m First sample size.
     * @param n Second sample size.
     * @param binom binom(n+m, m) (must be finite)
     * @return {@code Pr(X <= k)}
     */
    private static double cdf(int u1, int u2, int m, int n, double binom) {
        // Exploit symmetry. Note the distribution is discrete thus requiring (u2 - 1).
        return u2 > u1 ?
            computeCdf(u1, m, n, binom) :
            1 - computeCdf(u2 - 1, m, n, binom);
    }

    /**
     * Compute the survival function of the Mann-Whitney U1 statistic.
     * The U2 statistic is passed for convenience to exploit symmetry in the distribution.
     *
     * @param u1 Mann-Whitney U1 statistic.
     * @param u2 Mann-Whitney U2 statistic.
     * @param m First sample size.
     * @param n Second sample size.
     * @param binom binom(n+m, m) (must be finite)
     * @return {@code Pr(X > k)}
     */
    private static double sf(int u1, int u2, int m, int n, double binom) {
        // Opposite of the CDF
        return u2 > u1 ?
            1 - computeCdf(u1, m, n, binom) :
            computeCdf(u2 - 1, m, n, binom);
    }

    /**
     * Compute the cumulative distribution function (CDF) of the Mann-Whitney U statistic.
     *
     * <p>This should be called with the lower of U1 or U2 for computational efficiency.
     *
     * <p>Uses the recursive formula provided in Di Bucchianico, A. (1999).
     * Combinatorics, computer algebra and the Wilcoxon-Mann-Whitney test, Journal
     * of Statistical Planning and Inference, Volume 79, Issue 2, 349-364.
     *
     * @param k Mann-Whitney U statistic.
     * @param m First sample size.
     * @param n Second sample size.
     * @param binom binom(n+m, m) (must be finite)
     * @return {@code Pr(X <= k)}
     */
    private static double computeCdf(int k, int m, int n, double binom) {
        // Theorem 2.5:
        // f(m, n, k) = 0 if k < 0, m < 0, n < 0, k > nm
        if (k < 0) {
            return 0;
        }
        // Recursively compute f(m, n, k)
        final double[][][] f = getF(m, n, k);

        // P(X=k) = f(m, n, k) / binom(m+n, m)
        // P(X<=k) = sum_0^k (P(X=i))

        // Called with k = min(u1, u2) : max(p) ~ 0.5 so no need to clip to [0, 1]
        return IntStream.rangeClosed(0, k).mapToDouble(i -> fmnk(f, m, n, i)).sum() / binom;
    }

    /**
     * Gets the storage for f(m, n, k).
     *
     * <p>This may be cached for performance.
     *
     * @param m M.
     * @param n N.
     * @param k K.
     * @return the storage for f
     */
    private static double[][][] getF(int m, int n, int k) {
        // Obtain any previous computation of f and expand it if required.
        // F is only modified within this synchronized block.
        // Any concurrent threads using a reference returned by this method
        // will not receive an index out-of-bounds as f is only ever expanded.
        synchronized (LOCK) {
            // Note: f(x<m, y<n, z<k) is always the same.
            // Cache the array and re-use any previous computation.
            double[][][] f = cacheF.get();

            // Require:
            // f = new double[m + 1][n + 1][k + 1]
            // f(m, n, 0) == 1; otherwise -1 if not computed
            // m+n <= 1029 for any m,n; k <= mn/2 (due to symmetry using min(u1, u2))
            // Size m=n=515: approximately 516^2 * 515^2 / 2 ~ 3.5e10 doubles ~ 263 GiB
            if (f == null) {
                f = new double[m + 1][n + 1][k + 1];
                for (final double[][] a : f) {
                    for (final double[] b : a) {
                        initialize(b);
                    }
                }
                // Cache for reuse.
                cacheF = new SoftReference<>(f);
                return f;
            }

            // Grow if required: m1 < m+1 => m1-(m+1) < 0 => m1 - m < 1
            final int m1 = f.length;
            final int n1 = f[0].length;
            final int k1 = f[0][0].length;
            final boolean growM = m1 - m < 1;
            final boolean growN = n1 - n < 1;
            final boolean growK = k1 - k < 1;
            if (growM | growN | growK) {
                // Some part of the previous f is too small.
                // Atomically grow without destroying the previous computation.
                // Any other thread using the previous f will not go out of bounds
                // by keeping the new f dimensions at least as large.
                // Note: Doing this in-place allows the memory to be gradually
                // increased rather than allocating a new [m + 1][n + 1][k + 1]
                // and copying all old values.
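                // The grown dimensions are the maximum of the old and requested sizes:
                // new m entries are allocated at full [sn][sk] size; pre-existing entries
                // are then expanded in the n and k dimensions as required.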
                final int sn = Math.max(n1, n + 1);
                final int sk = Math.max(k1, k + 1);
                if (growM) {
                    // Entirely new region
                    f = Arrays.copyOf(f, m + 1);
                    for (int x = m1; x <= m; x++) {
                        f[x] = new double[sn][sk];
                        for (final double[] b : f[x]) {
                            initialize(b);
                        }
                    }
                }
                // Expand previous in place if required
                if (growN) {
                    for (int x = 0; x < m1; x++) {
                        f[x] = Arrays.copyOf(f[x], sn);
                        for (int y = n1; y < sn; y++) {
                            final double[] b = f[x][y] = new double[sk];
                            initialize(b);
                        }
                    }
                }
                if (growK) {
                    for (int x = 0; x < m1; x++) {
                        for (int y = 0; y < n1; y++) {
                            final double[] b = f[x][y] = Arrays.copyOf(f[x][y], sk);
                            for (int z = k1; z < sk; z++) {
                                b[z] = UNSET;
                            }
                        }
                    }
                }
                // Avoided an OutOfMemoryError. Cache for reuse.
                cacheF = new SoftReference<>(f);
            }
            return f;
        }
    }

    /**
     * Initialize the array for f(m, n, x).
     * Set the value to 1 for x=0; otherwise {@link #UNSET}.
     *
     * @param fmn Array.
     */
    private static void initialize(double[] fmn) {
        Arrays.fill(fmn, UNSET);
        // f(m, n, 0) == 1 if m >= 0, n >= 0
        fmn[0] = 1;
    }

    /**
     * Compute f(m, n, k), the number of subsets of {0, 1, ..., n} with m elements such
     * that the elements of this subset add up to k.
     *
     * <p>The function is computed recursively.
     *
     * @param f Tabulated values of f[m][n][k].
     * @param m M.
     * @param n N.
     * @param k K.
     * @return f(m, n, k)
     */
    private static double fmnk(double[][][] f, int m, int n, int k) {
        // Theorem 2.5:
        // Omit conditions that will not be met: k > mn
        // f(m, n, k) = 0 if k < 0, m < 0, n < 0
        if ((k | m | n) < 0) {
            return 0;
        }
        // Compute on demand
        double fmnk = f[m][n][k];
        if (fmnk < 0) {
            // f(m, n, 0) == 1 if m >= 0, n >= 0
            // This is already computed.

            // Recursion from formula (3):
            // f(m, n, k) = f(m-1, n, k-n) + f(m, n-1, k)
            f[m][n][k] = fmnk = fmnk(f, m - 1, n, k - n) + fmnk(f, m, n - 1, k);
        }
        return fmnk;
    }
}