001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.statistics.descriptive; 018 019/** 020 * Computes the variance of the available values. The default implementation uses the 021 * following definition of the <em>sample variance</em>: 022 * 023 * <p>\[ \tfrac{1}{n-1} \sum_{i=1}^n (x_i-\overline{x})^2 \] 024 * 025 * <p>where \( \overline{x} \) is the sample mean, and \( n \) is the number of samples. 026 * 027 * <ul> 028 * <li>The result is {@code NaN} if no values are added. 029 * <li>The result is {@code NaN} if any of the values is {@code NaN} or infinite. 030 * <li>The result is {@code NaN} if the sum of the squared deviations from the mean is infinite. 031 * <li>The result is zero if there is one finite value in the data set. 032 * </ul> 033 * 034 * <p>The use of the term \( n − 1 \) is called Bessel's correction. This is an unbiased 035 * estimator of the variance of a hypothetical infinite population. If the 036 * {@link #setBiased(boolean) biased} option is enabled the normalisation factor is 037 * changed to \( \frac{1}{n} \) for a biased estimator of the <em>sample variance</em>. 038 * 039 * <p>The {@link #accept(double)} method uses a recursive updating algorithm based on West's 040 * algorithm (see Chan and Lewis (1979)). 041 * 042 * <p>The {@link #of(double...)} method uses the corrected two-pass algorithm from 043 * Chan <i>et al</i>, (1983). 044 * 045 * <p>Note that adding values using {@link #accept(double) accept} and then executing 046 * {@link #getAsDouble() getAsDouble} will 047 * sometimes give a different, less accurate, result than executing 048 * {@link #of(double...) of} with the full array of values. The former approach 049 * should only be used when the full array of values is not available. 050 * 051 * <p>Supports up to 2<sup>63</sup> (exclusive) observations. 052 * This implementation does not check for overflow of the count. 053 * 054 * <p>This class is designed to work with (though does not require) 055 * {@linkplain java.util.stream streams}. 056 * 057 * <p><strong>Note that this instance is not synchronized.</strong> If 058 * multiple threads access an instance of this class concurrently, and at least 059 * one of the threads invokes the {@link java.util.function.DoubleConsumer#accept(double) accept} or 060 * {@link StatisticAccumulator#combine(StatisticResult) combine} method, it must be synchronized externally. 061 * 062 * <p>However, it is safe to use {@link java.util.function.DoubleConsumer#accept(double) accept} 063 * and {@link StatisticAccumulator#combine(StatisticResult) combine} 064 * as {@code accumulator} and {@code combiner} functions of 065 * {@link java.util.stream.Collector Collector} on a parallel stream, 066 * because the parallel instance of {@link java.util.stream.Stream#collect Stream.collect()} 067 * provides the necessary partitioning, isolation, and merging of results for 068 * safe and efficient parallel execution. 069 * 070 * <p>References: 071 * <ul> 072 * <li>Chan and Lewis (1979) 073 * Computing standard deviations: accuracy. 074 * Communications of the ACM, 22, 526-531. 075 * <a href="http://doi.acm.org/10.1145/359146.359152">doi: 10.1145/359146.359152</a> 076 * <li>Chan, Golub and Levesque (1983) 077 * Algorithms for Computing the Sample Variance: Analysis and Recommendations. 078 * American Statistician, 37, 242-247. 079 * <a href="https://doi.org/10.2307/2683386">doi: 10.2307/2683386</a> 080 * </ul> 081 * 082 * @see <a href="https://en.wikipedia.org/wiki/Variance">Variance (Wikipedia)</a> 083 * @see <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a> 084 * @see StandardDeviation 085 * @since 1.1 086 */ 087public final class Variance implements DoubleStatistic, StatisticAccumulator<Variance> { 088 089 /** 090 * An instance of {@link SumOfSquaredDeviations}, which is used to 091 * compute the variance. 092 */ 093 private final SumOfSquaredDeviations ss; 094 095 /** Flag to control if the statistic is biased, or should use a bias correction. */ 096 private boolean biased; 097 098 /** 099 * Create an instance. 100 */ 101 private Variance() { 102 this(new SumOfSquaredDeviations()); 103 } 104 105 /** 106 * Creates an instance with the sum of squared deviations from the mean. 107 * 108 * @param ss Sum of squared deviations. 109 */ 110 Variance(SumOfSquaredDeviations ss) { 111 this.ss = ss; 112 } 113 114 /** 115 * Creates an instance. 116 * 117 * <p>The initial result is {@code NaN}. 118 * 119 * @return {@code Variance} instance. 120 */ 121 public static Variance create() { 122 return new Variance(); 123 } 124 125 /** 126 * Returns an instance populated using the input {@code values}. 127 * 128 * <p>Note: {@code Variance} computed using {@link #accept(double) accept} may be 129 * different from this variance. 130 * 131 * <p>See {@link Variance} for details on the computing algorithm. 132 * 133 * @param values Values. 134 * @return {@code Variance} instance. 135 */ 136 public static Variance of(double... values) { 137 return new Variance(SumOfSquaredDeviations.of(values)); 138 } 139 140 /** 141 * Updates the state of the statistic to reflect the addition of {@code value}. 142 * 143 * @param value Value. 144 */ 145 @Override 146 public void accept(double value) { 147 ss.accept(value); 148 } 149 150 /** 151 * Gets the variance of all input values. 152 * 153 * <p>When no values have been added, the result is {@code NaN}. 154 * 155 * @return variance of all values. 156 */ 157 @Override 158 public double getAsDouble() { 159 // This method checks the sum of squared is finite 160 // to provide a consistent NaN when the computation is not possible. 161 // Note: The SS checks for n=0 and returns NaN. 162 final double m2 = ss.getSumOfSquaredDeviations(); 163 if (!Double.isFinite(m2)) { 164 return Double.NaN; 165 } 166 final long n = ss.n; 167 // Avoid a divide by zero 168 if (n == 1) { 169 return 0; 170 } 171 return biased ? m2 / n : m2 / (n - 1); 172 } 173 174 @Override 175 public Variance combine(Variance other) { 176 ss.combine(other.ss); 177 return this; 178 } 179 180 /** 181 * Sets the value of the biased flag. The default value is {@code false}. 182 * 183 * <p>If {@code false} the sum of squared deviations from the sample mean is normalised by 184 * {@code n - 1} where {@code n} is the number of samples. This is Bessel's correction 185 * for an unbiased estimator of the variance of a hypothetical infinite population. 186 * 187 * <p>If {@code true} the sum of squared deviations is normalised by the number of samples 188 * {@code n}. 189 * 190 * <p>Note: This option only applies when {@code n > 1}. The variance of {@code n = 1} is 191 * always 0. 192 * 193 * <p>This flag only controls the final computation of the statistic. The value of this flag 194 * will not affect compatibility between instances during a {@link #combine(Variance) combine} 195 * operation. 196 * 197 * @param v Value. 198 * @return {@code this} instance 199 */ 200 public Variance setBiased(boolean v) { 201 biased = v; 202 return this; 203 } 204}