Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.statistics.inference;
018
019import java.util.EnumSet;
020import java.util.Objects;
021import org.apache.commons.statistics.descriptive.DoubleStatistics;
022import org.apache.commons.statistics.descriptive.Statistic;
023import org.apache.commons.statistics.distribution.TDistribution;
024
025/**
026 * Implements Student's t-test statistics.
027 *
028 * <p>Tests can be:
029 * <ul>
030 * <li>One-sample or two-sample
031 * <li>One-sided or two-sided
032 * <li>Paired or unpaired (for two-sample tests)
033 * <li>Homoscedastic (equal variance assumption) or heteroscedastic (for two sample tests)
034 * </ul>
035 *
036 * <p>Input to tests can be either {@code double[]} arrays or the mean, variance, and size
037 * of the sample.
038 *
039 * @see <a href="https://en.wikipedia.org/wiki/Student%27s_t-test">Student&#39;s t-test (Wikipedia)</a>
040 * @since 1.1
041 */
042public final class TTest {
043    /** Default instance. */
044    private static final TTest DEFAULT = new TTest(AlternativeHypothesis.TWO_SIDED, false, 0);
045
046    /** Alternative hypothesis. */
047    private final AlternativeHypothesis alternative;
048    /** Assume the two samples have the same population variance. */
049    private final boolean equalVariances;
050    /** The true value of the mean (or difference in means for a two sample test). */
051    private final double mu;
052
053    /**
054     * Result for the t-test.
055     *
056     * <p>This class is immutable.
057     */
058    public static final class Result extends BaseSignificanceResult {
059        /** Degrees of freedom. */
060        private final double degreesOfFreedom;
061
062        /**
063         * Create an instance.
064         *
065         * @param statistic Test statistic.
066         * @param degreesOfFreedom Degrees of freedom.
067         * @param p Result p-value.
068         */
069        Result(double statistic, double degreesOfFreedom, double p) {
070            super(statistic, p);
071            this.degreesOfFreedom = degreesOfFreedom;
072        }
073
074        /**
075         * Gets the degrees of freedom.
076         *
077         * @return the degrees of freedom
078         */
079        public double getDegreesOfFreedom() {
080            return degreesOfFreedom;
081        }
082    }
083
084    /**
085     * @param alternative Alternative hypothesis.
086     * @param equalVariances Assume the two samples have the same population variance.
087     * @param mu true value of the mean (or difference in means for a two sample test).
088     */
089    private TTest(AlternativeHypothesis alternative, boolean equalVariances, double mu) {
090        this.alternative = alternative;
091        this.equalVariances = equalVariances;
092        this.mu = mu;
093    }
094
095    /**
096     * Return an instance using the default options.
097     *
098     * <ul>
099     * <li>{@link AlternativeHypothesis#TWO_SIDED}
100     * <li>{@link DataDispersion#HETEROSCEDASTIC}
101     * <li>{@linkplain #withMu(double) mu = 0}
102     * </ul>
103     *
104     * @return default instance
105     */
106    public static TTest withDefaults() {
107        return DEFAULT;
108    }
109
110    /**
111     * Return an instance with the configured alternative hypothesis.
112     *
113     * @param v Value.
114     * @return an instance
115     */
116    public TTest with(AlternativeHypothesis v) {
117        return new TTest(Objects.requireNonNull(v), equalVariances, mu);
118    }
119
120    /**
121     * Return an instance with the configured assumption on the data dispersion.
122     *
123     * <p>Applies to the two-sample independent t-test.
124     * The statistic can compare the means without the assumption of equal
125     * sub-population variances (heteroscedastic); otherwise the means are compared
126     * under the assumption of equal sub-population variances (homoscedastic).
127     *
128     * @param v Value.
129     * @return an instance
130     * @see #test(double[], double[])
131     * @see #test(double, double, long, double, double, long)
132     */
133    public TTest with(DataDispersion v) {
134        return new TTest(alternative, Objects.requireNonNull(v) == DataDispersion.HOMOSCEDASTIC, mu);
135    }
136
137    /**
138     * Return an instance with the configured {@code mu}.
139     *
140     * <p>For the one-sample test this is the expected mean.
141     *
142     * <p>For the two-sample test this is the expected difference between the means.
143     *
144     * @param v Value.
145     * @return an instance
146     * @throws IllegalArgumentException if the value is not finite
147     */
148    public TTest withMu(double v) {
149        return new TTest(alternative, equalVariances, Arguments.checkFinite(v));
150    }
151
152    /**
153     * Computes a one-sample t statistic comparing the mean of the dataset to {@code mu}.
154     *
155     * <p>The returned t-statistic is:
156     *
157     * <p>\[ t = \frac{m - \mu}{ \sqrt{ \frac{v}{n} } } \]
158     *
159     * @param m Sample mean.
160     * @param v Sample variance.
161     * @param n Sample size.
162     * @return t statistic
163     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
164     * variance is negative
165     * @see #withMu(double)
166     */
167    public double statistic(double m, double v, long n) {
168        Arguments.checkNonNegative(v);
169        checkSampleSize(n);
170        return computeT(m - mu, v, n);
171    }
172
173    /**
174     * Computes a one-sample t statistic comparing the mean of the sample to {@code mu}.
175     *
176     * @param x Sample values.
177     * @return t statistic
178     * @throws IllegalArgumentException if the number of samples is {@code < 2}
179     * @see #statistic(double, double, long)
180     * @see #withMu(double)
181     */
182    public double statistic(double[] x) {
183        final long n = checkSampleSize(x.length);
184        final DoubleStatistics s = DoubleStatistics.of(
185            EnumSet.of(Statistic.MEAN, Statistic.VARIANCE), x);
186        final double m = s.getAsDouble(Statistic.MEAN);
187        final double v = s.getAsDouble(Statistic.VARIANCE);
188        return computeT(m - mu, v, n);
189    }
190
191    /**
192     * Computes a paired two-sample t-statistic on related samples comparing the mean difference
193     * between the samples to {@code mu}.
194     *
195     * <p>The t-statistic returned is functionally equivalent to what would be returned by computing
196     * the one-sample t-statistic {@link #statistic(double[])}, with
197     * the sample array consisting of the (signed) differences between corresponding
198     * entries in {@code x} and {@code y}.
199     *
200     * @param x First sample values.
201     * @param y Second sample values.
202     * @return t statistic
203     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
204     * the size of the samples is not equal
205     * @see #withMu(double)
206     */
207    public double pairedStatistic(double[] x, double[] y) {
208        final long n = checkSampleSize(x.length);
209        final double m = StatisticUtils.meanDifference(x, y);
210        final double v = StatisticUtils.varianceDifference(x, y, m);
211        return computeT(m - mu, v, n);
212    }
213
214    /**
215     * Computes a two-sample t statistic on independent samples comparing the difference in means
216     * of the datasets to {@code mu}.
217     *
218     * <p>Use the {@link DataDispersion} to control the computation of the variance.
219     *
220     * <p>The heteroscedastic t-statistic is:
221     *
222     * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ \frac{v_1}{n_1} + \frac{v_2}{n_2} } } \]
223     *
224     * <p>The homoscedastic t-statistic is:
225     *
226     * <p>\[ t = \frac{m1 - m2 - \mu}{ \sqrt{ v (\frac{1}{n_1} + \frac{1}{n_2}) } } \]
227     *
228     * <p>where \( v \) is the pooled variance estimate:
229     *
230     * <p>\[ v = \frac{(n_1-1)v_1 + (n_2-1)v_2}{n_1 + n_2 - 2} \]
231     *
232     * @param m1 First sample mean.
233     * @param v1 First sample variance.
234     * @param n1 First sample size.
235     * @param m2 Second sample mean.
236     * @param v2 Second sample variance.
237     * @param n2 Second sample size.
238     * @return t statistic
239     * @throws IllegalArgumentException if the number of samples in either dataset is
240     * {@code < 2}; or the variances are negative.
241     * @see #withMu(double)
242     * @see #with(DataDispersion)
243     */
244    public double statistic(double m1, double v1, long n1,
245                            double m2, double v2, long n2) {
246        Arguments.checkNonNegative(v1);
247        Arguments.checkNonNegative(v2);
248        checkSampleSize(n1);
249        checkSampleSize(n2);
250        return equalVariances ?
251            computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
252            computeT(mu, m1, v1, n1, m2, v2, n2);
253    }
254
255    /**
256     * Computes a two-sample t statistic on independent samples comparing the difference
257     * in means of the samples to {@code mu}.
258     *
259     * <p>Use the {@link DataDispersion} to control the computation of the variance.
260     *
261     * @param x First sample values.
262     * @param y Second sample values.
263     * @return t statistic
264     * @throws IllegalArgumentException if the number of samples in either dataset is {@code < 2}
265     * @see #withMu(double)
266     * @see #with(DataDispersion)
267     */
268    public double statistic(double[] x, double[] y) {
269        final long n1 = checkSampleSize(x.length);
270        final long n2 = checkSampleSize(y.length);
271        final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
272        final DoubleStatistics s1 = b.build(x);
273        final double m1 = s1.getAsDouble(Statistic.MEAN);
274        final double v1 = s1.getAsDouble(Statistic.VARIANCE);
275        final DoubleStatistics s2 = b.build(y);
276        final double m2 = s2.getAsDouble(Statistic.MEAN);
277        final double v2 = s2.getAsDouble(Statistic.VARIANCE);
278        return equalVariances ?
279            computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2) :
280            computeT(mu, m1, v1, n1, m2, v2, n2);
281    }
282
283    /**
284     * Perform a one-sample t-test comparing the mean of the dataset to {@code mu}.
285     *
286     * <p>Degrees of freedom are \( v = n - 1 \).
287     *
288     * @param m Sample mean.
289     * @param v Sample variance.
290     * @param n Sample size.
291     * @return test result
292     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
293     * variance is negative
294     * @see #statistic(double, double, long)
295     */
296    public Result test(double m, double v, long n) {
297        final double t = statistic(m, v, n);
298        final double df = n - 1.0;
299        final double p = computeP(t, df);
300        return new Result(t, df, p);
301    }
302
303    /**
304     * Performs a one-sample t-test comparing the mean of the sample to {@code mu}.
305     *
306     * <p>Degrees of freedom are \( v = n - 1 \).
307     *
308     * @param sample Sample values.
309     * @return the test result
310     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
311     * the size of the samples is not equal
312     * @see #statistic(double[])
313     */
314    public Result test(double[] sample) {
315        final double t = statistic(sample);
316        final double df = sample.length - 1.0;
317        final double p = computeP(t, df);
318        return new Result(t, df, p);
319    }
320
321    /**
322     * Performs a paired two-sample t-test on related samples comparing the mean difference between
323     * the samples to {@code mu}.
324     *
325     * <p>The test is functionally equivalent to what would be returned by computing
326     * the one-sample t-test {@link #test(double[])}, with
327     * the sample array consisting of the (signed) differences between corresponding
328     * entries in {@code x} and {@code y}.
329     *
330     * @param x First sample values.
331     * @param y Second sample values.
332     * @return the test result
333     * @throws IllegalArgumentException if the number of samples is {@code < 2}; or the
334     * the size of the samples is not equal
335     * @see #pairedStatistic(double[], double[])
336     */
337    public Result pairedTest(double[] x, double[] y) {
338        final double t = pairedStatistic(x, y);
339        final double df = x.length - 1.0;
340        final double p = computeP(t, df);
341        return new Result(t, df, p);
342    }
343
344    /**
345     * Performs a two-sample t-test on independent samples comparing the difference in means of the
346     * datasets to {@code mu}.
347     *
348     * <p>Use the {@link DataDispersion} to control the computation of the variance.
349     *
350     * <p>The heteroscedastic degrees of freedom are estimated using the
351     * Welch-Satterthwaite approximation:
352     *
353     * <p>\[ v = \frac{ (\frac{v_1}{n_1} + \frac{v_2}{n_2})^2 }
354     *                { \frac{(v_1/n_1)^2}{n_1-1} + \frac{(v_2/n_2)^2}{n_2-1} } \]
355     *
356     * <p>The homoscedastic degrees of freedom are \( v = n_1 + n_2 - 2 \).
357     *
358     * @param m1 First sample mean.
359     * @param v1 First sample variance.
360     * @param n1 First sample size.
361     * @param m2 Second sample mean.
362     * @param v2 Second sample variance.
363     * @param n2 Second sample size.
364     * @return test result
365     * @throws IllegalArgumentException if the number of samples in either dataset is
366     * {@code < 2}; or the variances are negative.
367     * @see #statistic(double, double, long, double, double, long)
368     */
369    public Result test(double m1, double v1, long n1,
370                       double m2, double v2, long n2) {
371        final double t = statistic(m1, v1, n1, m2, v2, n2);
372        final double df = equalVariances ?
373                -2.0 + n1 + n2 :
374                computeDf(v1, n1, v2, n2);
375        final double p = computeP(t, df);
376        return new Result(t, df, p);
377    }
378
379    /**
380     * Performs a two-sample t-test on independent samples comparing the difference in means of
381     * the samples to {@code mu}.
382     *
383     * <p>Use the {@link DataDispersion} to control the computation of the variance.
384     *
385     * @param x First sample values.
386     * @param y Second sample values.
387     * @return the test result
388     * @throws IllegalArgumentException if the number of samples in either dataset
389     * is {@code < 2}
390     * @see #statistic(double[], double[])
391     * @see #test(double, double, long, double, double, long)
392     */
393    public Result test(double[] x, double[] y) {
394        // Here we do not call statistic(double[], double[]) because the degreesOfFreedom
395        // requires the variance. So repeat the computation and compute p.
396        final long n1 = checkSampleSize(x.length);
397        final long n2 = checkSampleSize(y.length);
398        final DoubleStatistics.Builder b = DoubleStatistics.builder(Statistic.MEAN, Statistic.VARIANCE);
399        final DoubleStatistics s1 = b.build(x);
400        final double m1 = s1.getAsDouble(Statistic.MEAN);
401        final double v1 = s1.getAsDouble(Statistic.VARIANCE);
402        final DoubleStatistics s2 = b.build(y);
403        final double m2 = s2.getAsDouble(Statistic.MEAN);
404        final double v2 = s2.getAsDouble(Statistic.VARIANCE);
405        final double t;
406        final double df;
407        if (equalVariances) {
408            t = computeHomoscedasticT(mu, m1, v1, n1, m2, v2, n2);
409            df = -2.0 + n1 + n2;
410        } else {
411            t = computeT(mu, m1, v1, n1, m2, v2, n2);
412            df = computeDf(v1, n1, v2, n2);
413        }
414        final double p = computeP(t, df);
415        return new Result(t, df, p);
416    }
417
418    /**
419     * Computes t statistic for one-sample t-test.
420     *
421     * @param m Sample mean.
422     * @param v Sample variance.
423     * @param n Sample size.
424     * @return t test statistic
425     */
426    private static double computeT(double m, double v, long n) {
427        return m / Math.sqrt(v / n);
428    }
429
430    /**
431     * Computes t statistic for two-sample t-test without the assumption of equal
432     * samples sizes or sub-population variances.
433     *
434     * @param mu Expected difference between means.
435     * @param m1 First sample mean.
436     * @param v1 First sample variance.
437     * @param n1 First sample size.
438     * @param m2 Second sample mean.
439     * @param v2 Second sample variance.
440     * @param n2 Second sample size.
441     * @return t test statistic
442     */
443    private static double computeT(double mu,
444                                   double m1, double v1, long n1,
445                                   double m2, double v2, long n2)  {
446        return (m1 - m2 - mu) / Math.sqrt((v1 / n1) + (v2 / n2));
447    }
448
449    /**
450     * Computes approximate degrees of freedom for two-sample t-test without the
451     * assumption of equal samples sizes or sub-population variances.
452     *
453     * @param v1 First sample variance.
454     * @param n1 First sample size.
455     * @param v2 Second sample variance.
456     * @param n2 Second sample size.
457     * @return approximate degrees of freedom
458     */
459    private static double computeDf(double v1, long n1,
460                                    double v2, long n2) {
461        // Sample sizes are specified as a double to avoid integer overflow
462        final double d1 = n1;
463        final double d2 = n2;
464        return (((v1 / d1) + (v2 / d2)) * ((v1 / d1) + (v2 / d2))) /
465            ((v1 * v1) / (d1 * d1 * (n1 - 1)) + (v2 * v2) / (d2 * d2 * (n2 - 1)));
466    }
467
468    /**
469     * Computes t statistic for two-sample t-test under the hypothesis of equal
470     * sub-population variances.
471     *
472     * @param mu Expected difference between means.
473     * @param m1 First sample mean.
474     * @param v1 First sample variance.
475     * @param n1 First sample size.
476     * @param m2 Second sample mean.
477     * @param v2 Second sample variance.
478     * @param n2 Second sample size.
479     * @return t test statistic
480     */
481    private static double computeHomoscedasticT(double mu,
482                                                double m1, double v1, long n1,
483                                                double m2, double v2, long n2)  {
484        final double pooledVariance = ((n1 - 1) * v1 + (n2 - 1) * v2) / (-2.0 + n1 + n2);
485        return (m1 - m2 - mu) / Math.sqrt(pooledVariance * (1.0 / n1 + 1.0 / n2));
486    }
487
488    /**
489     * Computes p-value for the specified t statistic.
490     *
491     * @param t T statistic.
492     * @param degreesOfFreedom Degrees of freedom.
493     * @return p-value for t-test
494     */
495    private double computeP(double t, double degreesOfFreedom) {
496        if (alternative == AlternativeHypothesis.LESS_THAN) {
497            return TDistribution.of(degreesOfFreedom).cumulativeProbability(t);
498        }
499        if (alternative == AlternativeHypothesis.GREATER_THAN) {
500            return TDistribution.of(degreesOfFreedom).survivalProbability(t);
501        }
502        // two-sided
503        return 2.0 * TDistribution.of(degreesOfFreedom).survivalProbability(Math.abs(t));
504    }
505
506    /**
507     * Check sample data size.
508     *
509     * @param n Data size.
510     * @return the sample size
511     * @throws IllegalArgumentException if the number of samples {@code < 2}
512     */
513    private static long checkSampleSize(long n) {
514        if (n <= 1) {
515            throw new InferenceException(InferenceException.TWO_VALUES_REQUIRED, n);
516        }
517        return n;
518    }
519}