1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.statistics.descriptive; 18 19 /** 20 * Computes the variance of the available values. The default implementation uses the 21 * following definition of the <em>sample variance</em>: 22 * 23 * <p>\[ \tfrac{1}{n-1} \sum_{i=1}^n (x_i-\overline{x})^2 \] 24 * 25 * <p>where \( \overline{x} \) is the sample mean, and \( n \) is the number of samples. 26 * 27 * <ul> 28 * <li>The result is {@code NaN} if no values are added. 29 * <li>The result is {@code NaN} if any of the values is {@code NaN} or infinite. 30 * <li>The result is {@code NaN} if the sum of the squared deviations from the mean is infinite. 31 * <li>The result is zero if there is one finite value in the data set. 32 * </ul> 33 * 34 * <p>The use of the term \( n − 1 \) is called Bessel's correction. This is an unbiased 35 * estimator of the variance of a hypothetical infinite population. If the 36 * {@link #setBiased(boolean) biased} option is enabled the normalisation factor is 37 * changed to \( \frac{1}{n} \) for a biased estimator of the <em>sample variance</em>. 38 * 39 * <p>The {@link #accept(double)} method uses a recursive updating algorithm based on West's 40 * algorithm (see Chan and Lewis (1979)). 41 * 42 * <p>The {@link #of(double...)} method uses the corrected two-pass algorithm from 43 * Chan <i>et al</i>, (1983). 44 * 45 * <p>Note that adding values using {@link #accept(double) accept} and then executing 46 * {@link #getAsDouble() getAsDouble} will 47 * sometimes give a different, less accurate, result than executing 48 * {@link #of(double...) of} with the full array of values. The former approach 49 * should only be used when the full array of values is not available. 50 * 51 * <p>Supports up to 2<sup>63</sup> (exclusive) observations. 52 * This implementation does not check for overflow of the count. 53 * 54 * <p>This class is designed to work with (though does not require) 55 * {@linkplain java.util.stream streams}. 56 * 57 * <p><strong>Note that this instance is not synchronized.</strong> If 58 * multiple threads access an instance of this class concurrently, and at least 59 * one of the threads invokes the {@link java.util.function.DoubleConsumer#accept(double) accept} or 60 * {@link StatisticAccumulator#combine(StatisticResult) combine} method, it must be synchronized externally. 61 * 62 * <p>However, it is safe to use {@link java.util.function.DoubleConsumer#accept(double) accept} 63 * and {@link StatisticAccumulator#combine(StatisticResult) combine} 64 * as {@code accumulator} and {@code combiner} functions of 65 * {@link java.util.stream.Collector Collector} on a parallel stream, 66 * because the parallel instance of {@link java.util.stream.Stream#collect Stream.collect()} 67 * provides the necessary partitioning, isolation, and merging of results for 68 * safe and efficient parallel execution. 69 * 70 * <p>References: 71 * <ul> 72 * <li>Chan and Lewis (1979) 73 * Computing standard deviations: accuracy. 74 * Communications of the ACM, 22, 526-531. 75 * <a href="http://doi.acm.org/10.1145/359146.359152">doi: 10.1145/359146.359152</a> 76 * <li>Chan, Golub and Levesque (1983) 77 * Algorithms for Computing the Sample Variance: Analysis and Recommendations. 78 * American Statistician, 37, 242-247. 79 * <a href="https://doi.org/10.2307/2683386">doi: 10.2307/2683386</a> 80 * </ul> 81 * 82 * @see <a href="https://en.wikipedia.org/wiki/Variance">Variance (Wikipedia)</a> 83 * @see <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a> 84 * @see StandardDeviation 85 * @since 1.1 86 */ 87 public final class Variance implements DoubleStatistic, StatisticAccumulator<Variance> { 88 89 /** 90 * An instance of {@link SumOfSquaredDeviations}, which is used to 91 * compute the variance. 92 */ 93 private final SumOfSquaredDeviations ss; 94 95 /** Flag to control if the statistic is biased, or should use a bias correction. */ 96 private boolean biased; 97 98 /** 99 * Create an instance. 100 */ 101 private Variance() { 102 this(new SumOfSquaredDeviations()); 103 } 104 105 /** 106 * Creates an instance with the sum of squared deviations from the mean. 107 * 108 * @param ss Sum of squared deviations. 109 */ 110 Variance(SumOfSquaredDeviations ss) { 111 this.ss = ss; 112 } 113 114 /** 115 * Creates an instance. 116 * 117 * <p>The initial result is {@code NaN}. 118 * 119 * @return {@code Variance} instance. 120 */ 121 public static Variance create() { 122 return new Variance(); 123 } 124 125 /** 126 * Returns an instance populated using the input {@code values}. 127 * 128 * <p>Note: {@code Variance} computed using {@link #accept(double) accept} may be 129 * different from this variance. 130 * 131 * <p>See {@link Variance} for details on the computing algorithm. 132 * 133 * @param values Values. 134 * @return {@code Variance} instance. 135 */ 136 public static Variance of(double... values) { 137 return new Variance(SumOfSquaredDeviations.of(values)); 138 } 139 140 /** 141 * Updates the state of the statistic to reflect the addition of {@code value}. 142 * 143 * @param value Value. 144 */ 145 @Override 146 public void accept(double value) { 147 ss.accept(value); 148 } 149 150 /** 151 * Gets the variance of all input values. 152 * 153 * <p>When no values have been added, the result is {@code NaN}. 154 * 155 * @return variance of all values. 156 */ 157 @Override 158 public double getAsDouble() { 159 // This method checks the sum of squared is finite 160 // to provide a consistent NaN when the computation is not possible. 161 // Note: The SS checks for n=0 and returns NaN. 162 final double m2 = ss.getSumOfSquaredDeviations(); 163 if (!Double.isFinite(m2)) { 164 return Double.NaN; 165 } 166 final long n = ss.n; 167 // Avoid a divide by zero 168 if (n == 1) { 169 return 0; 170 } 171 return biased ? m2 / n : m2 / (n - 1); 172 } 173 174 @Override 175 public Variance combine(Variance other) { 176 ss.combine(other.ss); 177 return this; 178 } 179 180 /** 181 * Sets the value of the biased flag. The default value is {@code false}. 182 * 183 * <p>If {@code false} the sum of squared deviations from the sample mean is normalised by 184 * {@code n - 1} where {@code n} is the number of samples. This is Bessel's correction 185 * for an unbiased estimator of the variance of a hypothetical infinite population. 186 * 187 * <p>If {@code true} the sum of squared deviations is normalised by the number of samples 188 * {@code n}. 189 * 190 * <p>Note: This option only applies when {@code n > 1}. The variance of {@code n = 1} is 191 * always 0. 192 * 193 * <p>This flag only controls the final computation of the statistic. The value of this flag 194 * will not affect compatibility between instances during a {@link #combine(Variance) combine} 195 * operation. 196 * 197 * @param v Value. 198 * @return {@code this} instance 199 */ 200 public Variance setBiased(boolean v) { 201 biased = v; 202 return this; 203 } 204 }