/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import java.util.EnumSet;
import java.util.Objects;
import org.apache.commons.statistics.descriptive.DoubleStatistics;
import org.apache.commons.statistics.descriptive.Statistic;
import org.apache.commons.statistics.distribution.TDistribution;

/**
 * Implements Student's t-test statistics.
 *
 * <p>Tests can be:
 * <ul>
 * <li>One-sample or two-sample
 * <li>One-sided or two-sided
 * <li>Paired or unpaired (for two-sample tests)
 * <li>Homoscedastic (equal variance assumption) or heteroscedastic (for two sample tests)
 * </ul>
 *
 * <p>Input to tests can be either {@code double[]} arrays or the mean, variance, and size
 * of the sample.
 *
 * <p>The options are held in final fields; the {@code with} methods return a new
 * instance and do not modify the instance they are called on.
 *
 * @see <a href="https://en.wikipedia.org/wiki/Student%27s_t-test">Student's t-test (Wikipedia)</a>
 * @since 1.1
 */
public final class TTest {
    /** Default instance. */
    private static final TTest DEFAULT = new TTest(AlternativeHypothesis.TWO_SIDED, false, 0);

    /** Alternative hypothesis. */
    private final AlternativeHypothesis alternative;
    /** Assume the two samples have the same population variance. */
    private final boolean equalVariances;
    /** The true value of the mean (or difference in means for a two sample test). */
    private final double mu;

    /**
     * Result for the t-test.
     *
     * <p>This class is immutable.
     */
    public static final class Result extends BaseSignificanceResult {
        /** Degrees of freedom. */
        private final double degreesOfFreedom;

        /**
         * Create an instance.
         *
         * @param statistic Test statistic.
         * @param degreesOfFreedom Degrees of freedom.
         * @param p Result p-value.
         */
        Result(double statistic, double degreesOfFreedom, double p) {
            super(statistic, p);
            this.degreesOfFreedom = degreesOfFreedom;
        }

        /**
         * Gets the degrees of freedom.
         *
         * @return the degrees of freedom
         */
        public double getDegreesOfFreedom() {
            return degreesOfFreedom;
        }
    }

    /**
     * Create an instance.
     *
     * <p>No validation is performed here; arguments are checked by the public
     * {@code with} methods before they reach this constructor.
     *
     * @param alternative Alternative hypothesis.
     * @param equalVariances Assume the two samples have the same population variance.
     * @param mu true value of the mean (or difference in means for a two sample test).
     */
    private TTest(AlternativeHypothesis alternative, boolean equalVariances, double mu) {
        this.alternative = alternative;
        this.equalVariances = equalVariances;
        this.mu = mu;
    }

    /**
     * Return an instance using the default options.
     *
     * <ul>
     * <li>{@link AlternativeHypothesis#TWO_SIDED}
     * <li>{@link DataDispersion#HETEROSCEDASTIC}
     * <li>{@linkplain #withMu(double) mu = 0}
     * </ul>
     *
     * @return default instance
     */
    public static TTest withDefaults() {
        return DEFAULT;
    }

    /**
     * Return an instance with the configured alternative hypothesis.
     *
     * @param v Value.
     * @return an instance
     * @throws NullPointerException if the value is null
     */
    public TTest with(AlternativeHypothesis v) {
        return new TTest(Objects.requireNonNull(v), equalVariances, mu);
    }

    /**
     * Return an instance with the configured assumption on the data dispersion.
     *
     * <p>Applies to the two-sample independent t-test.
     * The statistic can compare the means without the assumption of equal
     * sub-population variances (heteroscedastic); otherwise the means are compared
     * under the assumption of equal sub-population variances (homoscedastic).
     *
     * @param v Value.
     * @return an instance
     * @throws NullPointerException if the value is null
     * @see #test(double[], double[])
     * @see #test(double, double, long, double, double, long)
     */
    public TTest with(DataDispersion v) {
        return new TTest(alternative, Objects.requireNonNull(v) == DataDispersion.HOMOSCEDASTIC, mu);
    }

    /**
     * Return an instance with the configured {@code mu}.
     *
     * <p>For the one-sample test this is the expected mean.
     *
     * <p>For the two-sample test this is the expected difference between the means.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the value is not finite
     */
    public TTest withMu(double v) {
        return new TTest(alternative, equalVariances, Arguments.checkFinite(v));
    }

    /**
     * Computes a one-sample t statistic comparing the mean of the dataset to {@code mu}.
     *
     * <p>The returned t-statistic is:
     *
     * <p>\[ t = \frac{m - \mu}{ \sqrt{ \frac{v}{n} } } \]
     *
     * @param m Sample mean.
     * @param v Sample variance.
     * @param n Sample size.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
     * variance is negative
     * @see #withMu(double)
     */
    public double statistic(double m, double v, long n) {
        Arguments.checkNonNegative(v);
        checkSampleSize(n);
        return computeT(m - mu, v, n);
    }

    /**
     * Computes a one-sample t statistic comparing the mean of the sample to {@code mu}.
     *
     * @param x Sample values.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples is {@code < 2}
     * @see #statistic(double, double, long)
     * @see #withMu(double)
     */
    public double statistic(double[] x) {
        final long n = checkSampleSize(x.length);
        final DoubleStatistics s = DoubleStatistics.of(
            EnumSet.of(Statistic.MEAN, Statistic.VARIANCE), x);
        final double m = s.getAsDouble(Statistic.MEAN);
        final double v = s.getAsDouble(Statistic.VARIANCE);
        return computeT(m - mu, v, n);
    }

    /**
     * Computes a paired two-sample t-statistic on related samples comparing the mean difference
     * between the samples to {@code mu}.
     *
     * <p>The t-statistic returned is functionally equivalent to what would be returned by computing
     * the one-sample t-statistic {@link #statistic(double[])}, with
     * the sample array consisting of the (signed) differences between corresponding
     * entries in {@code x} and {@code y}.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or
     * the size of the samples is not equal
     * @see #withMu(double)
     */
    public double pairedStatistic(double[] x, double[] y) {
        // Only the length of x is checked here; the comparison against the length of y
        // is left to the StatisticUtils difference helpers.
        final long n = checkSampleSize(x.length);
        final double m = StatisticUtils.meanDifference(x, y);
        final double v = StatisticUtils.varianceDifference(x, y, m);
        return computeT(m - mu, v, n);
    }

    /**
     * Computes a two-sample t statistic on independent samples comparing the difference in means
     * of the datasets to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * <p>The heteroscedastic t-statistic is:
     *
     * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ \frac{v_1}{n_1} + \frac{v_2}{n_2} } } \]
     *
     * <p>The homoscedastic t-statistic is:
     *
     * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ v (\frac{1}{n_1} + \frac{1}{n_2}) } } \]
     *
     * <p>where \( v \) is the pooled variance estimate:
     *
     * <p>\[ v = \frac{(n_1-1)v_1 + (n_2-1)v_2}{n_1 + n_2 - 2} \]
     *
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples in either dataset is
     * {@code < 2}; or the variances are negative.
     * @see #withMu(double)
     * @see #with(DataDispersion)
     */
    public double statistic(double m1, double v1, long n1,
                            double m2, double v2, long n2) {
        Arguments.checkNonNegative(v1);
        Arguments.checkNonNegative(v2);
        checkSampleSize(n1);
        checkSampleSize(n2);
        return equalVariances ?
            computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
            computeT(mu, m1, v1, n1, m2, v2, n2);
    }

    /**
     * Computes a two-sample t statistic on independent samples comparing the difference
     * in means of the samples to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return t statistic
     * @throws IllegalArgumentException if the number of samples in either dataset is {@code < 2}
     * @see #withMu(double)
     * @see #with(DataDispersion)
     */
    public double statistic(double[] x, double[] y) {
        final long n1 = checkSampleSize(x.length);
        final long n2 = checkSampleSize(y.length);
        // The same builder is used to create the statistics for each sample.
        final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
        final DoubleStatistics s1 = b.build(x);
        final double m1 = s1.getAsDouble(Statistic.MEAN);
        final double v1 = s1.getAsDouble(Statistic.VARIANCE);
        final DoubleStatistics s2 = b.build(y);
        final double m2 = s2.getAsDouble(Statistic.MEAN);
        final double v2 = s2.getAsDouble(Statistic.VARIANCE);
        return equalVariances ?
            computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
            computeT(mu, m1, v1, n1, m2, v2, n2);
    }

    /**
     * Perform a one-sample t-test comparing the mean of the dataset to {@code mu}.
     *
     * <p>Degrees of freedom are \( v = n - 1 \).
     *
     * @param m Sample mean.
     * @param v Sample variance.
     * @param n Sample size.
     * @return test result
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
     * variance is negative
     * @see #statistic(double, double, long)
     */
    public Result test(double m, double v, long n) {
        final double t = statistic(m, v, n);
        final double df = n - 1.0;
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a one-sample t-test comparing the mean of the sample to {@code mu}.
     *
     * <p>Degrees of freedom are \( v = n - 1 \).
     *
     * @param sample Sample values.
     * @return the test result
     * @throws IllegalArgumentException if the number of samples is {@code < 2}
     * @see #statistic(double[])
     */
    public Result test(double[] sample) {
        final double t = statistic(sample);
        final double df = sample.length - 1.0;
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a paired two-sample t-test on related samples comparing the mean difference between
     * the samples to {@code mu}.
     *
     * <p>The test is functionally equivalent to what would be returned by computing
     * the one-sample t-test {@link #test(double[])}, with
     * the sample array consisting of the (signed) differences between corresponding
     * entries in {@code x} and {@code y}.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return the test result
     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or
     * the size of the samples is not equal
     * @see #pairedStatistic(double[], double[])
     */
    public Result pairedTest(double[] x, double[] y) {
        final double t = pairedStatistic(x, y);
        final double df = x.length - 1.0;
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a two-sample t-test on independent samples comparing the difference in means of the
     * datasets to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * <p>The heteroscedastic degrees of freedom are estimated using the
     * Welch-Satterthwaite approximation:
     *
     * <p>\[ v = \frac{ (\frac{v_1}{n_1} + \frac{v_2}{n_2})^2 }
     *                { \frac{(v_1/n_1)^2}{n_1-1} + \frac{(v_2/n_2)^2}{n_2-1} } \]
     *
     * <p>The homoscedastic degrees of freedom are \( v = n_1 + n_2 - 2 \).
     *
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return test result
     * @throws IllegalArgumentException if the number of samples in either dataset is
     * {@code < 2}; or the variances are negative.
     * @see #statistic(double, double, long, double, double, long)
     */
    public Result test(double m1, double v1, long n1,
                       double m2, double v2, long n2) {
        final double t = statistic(m1, v1, n1, m2, v2, n2);
        // Evaluated left-to-right in double arithmetic: n1 + n2 is never summed as a long.
        final double df = equalVariances ?
            -2.0 + n1 + n2 :
            computeDf(v1, n1, v2, n2);
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Performs a two-sample t-test on independent samples comparing the difference in means of
     * the samples to {@code mu}.
     *
     * <p>Use the {@link DataDispersion} to control the computation of the variance.
     *
     * @param x First sample values.
     * @param y Second sample values.
     * @return the test result
     * @throws IllegalArgumentException if the number of samples in either dataset
     * is {@code < 2}
     * @see #statistic(double[], double[])
     * @see #test(double, double, long, double, double, long)
     */
    public Result test(double[] x, double[] y) {
        // Here we do not call statistic(double[], double[]) because the degreesOfFreedom
        // requires the variance. So repeat the computation and compute p.
        final long n1 = checkSampleSize(x.length);
        final long n2 = checkSampleSize(y.length);
        final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
        final DoubleStatistics s1 = b.build(x);
        final double m1 = s1.getAsDouble(Statistic.MEAN);
        final double v1 = s1.getAsDouble(Statistic.VARIANCE);
        final DoubleStatistics s2 = b.build(y);
        final double m2 = s2.getAsDouble(Statistic.MEAN);
        final double v2 = s2.getAsDouble(Statistic.VARIANCE);
        final double t;
        final double df;
        if (equalVariances) {
            t = computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2);
            // Evaluated left-to-right in double arithmetic: n1 + n2 is never summed as a long.
            df = -2.0 + n1 + n2;
        } else {
            t = computeT(mu, m1, v1, n1, m2, v2, n2);
            df = computeDf(v1, n1, v2, n2);
        }
        final double p = computeP(t, df);
        return new Result(t, df, p);
    }

    /**
     * Computes t statistic for one-sample t-test.
     *
     * <p>Callers pass the mean with the expected value {@code mu} already subtracted.
     *
     * @param m Sample mean (offset by the expected mean).
     * @param v Sample variance.
     * @param n Sample size.
     * @return t test statistic
     */
    private static double computeT(double m, double v, long n) {
        return m / Math.sqrt(v / n);
    }

    /**
     * Computes t statistic for two-sample t-test without the assumption of equal
     * samples sizes or sub-population variances.
     *
     * @param mu Expected difference between means.
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return t test statistic
     */
    private static double computeT(double mu,
                                   double m1, double v1, long n1,
                                   double m2, double v2, long n2) {
        return (m1 - m2 - mu) / Math.sqrt((v1 / n1) + (v2 / n2));
    }

    /**
     * Computes approximate degrees of freedom for two-sample t-test without the
     * assumption of equal samples sizes or sub-population variances.
     *
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return approximate degrees of freedom
     */
    private static double computeDf(double v1, long n1,
                                    double v2, long n2) {
        // Sample sizes are specified as a double to avoid integer overflow
        final double d1 = n1;
        final double d2 = n2;
        return (((v1 / d1) + (v2 / d2)) * ((v1 / d1) + (v2 / d2))) /
            ((v1 * v1) / (d1 * d1 * (n1 - 1)) + (v2 * v2) / (d2 * d2 * (n2 - 1)));
    }

    /**
     * Computes t statistic for two-sample t-test under the hypothesis of equal
     * sub-population variances.
     *
     * @param mu Expected difference between means.
     * @param m1 First sample mean.
     * @param v1 First sample variance.
     * @param n1 First sample size.
     * @param m2 Second sample mean.
     * @param v2 Second sample variance.
     * @param n2 Second sample size.
     * @return t test statistic
     */
    private static double computeHomoscedasticT(double mu,
                                                double m1, double v1, long n1,
                                                double m2, double v2, long n2) {
        // Pooled variance: each sample variance weighted by (n - 1), divided by (n1 + n2 - 2).
        final double pooledVariance = ((n1 - 1) * v1 + (n2 - 1) * v2) / (-2.0 + n1 + n2);
        return (m1 - m2 - mu) / Math.sqrt(pooledVariance * (1.0 / n1 + 1.0 / n2));
    }

    /**
     * Computes p-value for the specified t statistic.
     *
     * <p>The tail used depends on the configured alternative hypothesis: the lower tail
     * for {@link AlternativeHypothesis#LESS_THAN}; the upper tail for
     * {@link AlternativeHypothesis#GREATER_THAN}; otherwise twice the upper tail
     * of {@code |t|}.
     *
     * @param t T statistic.
     * @param degreesOfFreedom Degrees of freedom.
     * @return p-value for t-test
     */
    private double computeP(double t, double degreesOfFreedom) {
        if (alternative == AlternativeHypothesis.LESS_THAN) {
            return TDistribution.of(degreesOfFreedom).cumulativeProbability(t);
        }
        if (alternative == AlternativeHypothesis.GREATER_THAN) {
            return TDistribution.of(degreesOfFreedom).survivalProbability(t);
        }
        // two-sided
        return 2.0 * TDistribution.of(degreesOfFreedom).survivalProbability(Math.abs(t));
    }

    /**
     * Check sample data size.
     *
     * @param n Data size.
     * @return the sample size
     * @throws IllegalArgumentException if the number of samples {@code < 2}
     */
    private static long checkSampleSize(long n) {
        if (n <= 1) {
            throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
        }
        return n;
    }
}