/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.descriptive;

import java.math.BigInteger;

/**
 * Computes the variance of the available values. The default implementation uses the
 * following definition of the <em>sample variance</em>:
 *
 * <p>\[ \tfrac{1}{n-1} \sum_{i=1}^n (x_i-\overline{x})^2 \]
 *
 * <p>where \( \overline{x} \) is the sample mean, and \( n \) is the number of samples.
 *
 * <ul>
 *   <li>The result is {@code NaN} if no values are added.
 *   <li>The result is zero if there is one value in the data set.
 * </ul>
 *
 * <p>The use of the term \( n − 1 \) is called Bessel's correction. This is an unbiased
 * estimator of the variance of a hypothetical infinite population. If the
 * {@link #setBiased(boolean) biased} option is enabled the normalisation factor is
 * changed to \( \frac{1}{n} \) for a biased estimator of the <em>sample variance</em>.
 *
 * <p>The implementation uses an exact integer sum to compute the scaled (by \( n \))
 * sum of squared deviations from the mean; this is normalised by the scaled correction factor.
 *
 * <p>\[ \frac {n \times \sum_{i=1}^n x_i^2 - (\sum_{i=1}^n x_i)^2}{n \times (n - 1)} \]
 *
 * <p>Supports up to 2<sup>63</sup> (exclusive) observations.
 * This implementation does not check for overflow of the count.
 *
 * <p>This class is designed to work with (though does not require)
 * {@linkplain java.util.stream streams}.
 *
 * <p><strong>This implementation is not thread safe.</strong>
 * If multiple threads access an instance of this class concurrently,
 * and at least one of the threads invokes the {@link java.util.function.LongConsumer#accept(long) accept} or
 * {@link StatisticAccumulator#combine(StatisticResult) combine} method, it must be synchronized externally.
 *
 * <p>However, it is safe to use {@link java.util.function.LongConsumer#accept(long) accept}
 * and {@link StatisticAccumulator#combine(StatisticResult) combine}
 * as {@code accumulator} and {@code combiner} functions of
 * {@link java.util.stream.Collector Collector} on a parallel stream,
 * because the parallel implementation of {@link java.util.stream.Stream#collect Stream.collect()}
 * provides the necessary partitioning, isolation, and merging of results for
 * safe and efficient parallel execution.
 *
 * @see <a href="https://en.wikipedia.org/wiki/variance">variance (Wikipedia)</a>
 * @see <a href="https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance">
 *   Algorithms for computing the variance (Wikipedia)</a>
 * @see <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a>
 * @since 1.1
 */
public final class LongVariance implements LongStatistic, StatisticAccumulator<LongVariance> {

    /** Sum of the squared values. */
    private final UInt192 sumSq;
    /** Sum of the values. */
    private final Int128 sum;
    /** Count of values that have been added. */
    private long n;

    /** Flag to control if the statistic is biased, or should use a bias correction. */
    private boolean biased;

    /**
     * Create an instance with empty sums and a zero count.
     */
    private LongVariance() {
        this(UInt192.create(), Int128.create(), 0);
    }

    /**
     * Create an instance.
     *
     * <p>The supplied accumulators are stored by reference (not copied) and are
     * updated in-place by subsequent calls to {@link #accept(long)} and
     * {@link #combine(LongVariance)}.
     *
     * @param sumSq Sum of the squared values.
     * @param sum Sum of the values.
     * @param n Count of values that have been added. Declared as {@code long} to match
     * the width of the count field.
     */
    private LongVariance(UInt192 sumSq, Int128 sum, long n) {
        this.sumSq = sumSq;
        this.sum = sum;
        this.n = n;
    }

    /**
     * Creates an instance.
     *
     * <p>The initial result is {@code NaN}.
     *
     * @return {@code LongVariance} instance.
     */
    public static LongVariance create() {
        return new LongVariance();
    }

    /**
     * Returns an instance populated using the input {@code values}.
     *
     * @param values Values.
     * @return {@code LongVariance} instance.
     */
    public static LongVariance of(long... values) {
        // Note: Arrays could be processed using specialised counts knowing the maximum limit
        // for an array is 2^31 values. Requires a UInt160.

        final Int128 s = Int128.create();
        final UInt192 ss = UInt192.create();
        for (final long x : values) {
            s.add(x);
            ss.addSquare(x);
        }
        return new LongVariance(ss, s, values.length);
    }

    /**
     * Updates the state of the statistic to reflect the addition of {@code value}.
     *
     * @param value Value.
     */
    @Override
    public void accept(long value) {
        sumSq.addSquare(value);
        sum.add(value);
        n++;
    }

    /**
     * Gets the variance of all input values.
     *
     * <p>When no values have been added, the result is {@code NaN}.
     *
     * @return variance of all values.
     */
    @Override
    public double getAsDouble() {
        return computeVarianceOrStd(sumSq, sum, n, biased, false);
    }

    /**
     * Compute the variance (or standard deviation).
     *
     * <p>The {@code std} flag controls if the result is returned as the standard deviation
     * using the {@link Math#sqrt(double) square root} function.
     *
     * <p>The result is {@code NaN} when {@code n == 0} and zero when {@code n == 1},
     * irrespective of the {@code biased} and {@code std} flags.
     *
     * @param sumSq Sum of the squared values.
     * @param sum Sum of the values.
     * @param n Count of values that have been added.
     * @param biased Flag to control if the statistic is biased, or should use a bias correction.
     * @param std Flag to control if the statistic is the standard deviation.
     * @return the variance (or standard deviation)
     */
    static double computeVarianceOrStd(UInt192 sumSq, Int128 sum, long n, boolean biased, boolean std) {
        if (n == 0) {
            return Double.NaN;
        }
        // Avoid a divide by zero
        if (n == 1) {
            return 0;
        }
        // Sum-of-squared deviations: sum(x^2) - sum(x)^2 / n
        // Sum-of-squared deviations precursor: n * sum(x^2) - sum(x)^2
        // The precursor is computed in integer precision.
        // The divide uses double precision.
        // This ensures we avoid cancellation in the difference and use a fast divide.
        // The result is limited by the rounding in the double computation.
        final double diff = computeSSDevN(sumSq, sum, n);
        // The precursor is scaled by n, so the divisor is n * n (biased) or n * (n - 1) (unbiased).
        final long n0 = biased ? n : n - 1;
        final double v = diff / IntMath.unsignedMultiplyToDouble(n, n0);
        if (std) {
            return Math.sqrt(v);
        }
        return v;
    }

    /**
     * Compute the sum-of-squared deviations multiplied by the count of values:
     * {@code n * sum(x^2) - sum(x)^2}.
     *
     * @param sumSq Sum of the squared values.
     * @param sum Sum of the values.
     * @param n Count of values that have been added.
     * @return the sum-of-squared deviations precursor
     */
    private static double computeSSDevN(UInt192 sumSq, Int128 sum, long n) {
        // Compute the term if possible using fast integer arithmetic.
        // 192-bit sum(x^2) * n will be OK when the upper 32-bits are zero.
        // 128-bit sum(x)^2 will be OK when the upper 64-bits are zero.
        // The first is safe when n < 2^32 but we must check the sum high bits.
        if (((n >>> Integer.SIZE) | sum.hi64()) == 0) {
            return sumSq.unsignedMultiply((int) n).subtract(sum.squareLow()).toDouble();
        } else {
            // Fallback to arbitrary precision when the fast path conditions are not met.
            return sumSq.toBigInteger().multiply(BigInteger.valueOf(n))
                .subtract(square(sum.toBigInteger())).doubleValue();
        }
    }

    /**
     * Compute the sum of the squared deviations from the mean.
     *
     * <p>This is a helper method used in higher order moments.
     *
     * @return the sum of the squared deviations
     */
    double computeSumOfSquaredDeviations() {
        return computeSSDevN(sumSq, sum, n) / n;
    }

    /**
     * Compute the mean.
     *
     * <p>This is a helper method used in higher order moments.
     *
     * @return the mean
     */
    double computeMean() {
        return LongMean.computeMean(sum, n);
    }

    /**
     * Convenience method to square a BigInteger.
     *
     * @param x Value
     * @return x^2
     */
    private static BigInteger square(BigInteger x) {
        return x.multiply(x);
    }

    /**
     * Adds the sums and the count of the {@code other} instance to this instance.
     *
     * <p>The biased flag of this instance is not changed.
     *
     * @param other Instance to combine into this instance.
     * @return {@code this} instance
     */
    @Override
    public LongVariance combine(LongVariance other) {
        sumSq.add(other.sumSq);
        sum.add(other.sum);
        n += other.n;
        return this;
    }

    /**
     * Sets the value of the biased flag. The default value is {@code false}.
     *
     * <p>If {@code false} the sum of squared deviations from the sample mean is normalised by
     * {@code n - 1} where {@code n} is the number of samples. This is Bessel's correction
     * for an unbiased estimator of the variance of a hypothetical infinite population.
     *
     * <p>If {@code true} the sum of squared deviations is normalised by the number of samples
     * {@code n}.
     *
     * <p>Note: This option only applies when {@code n > 1}. The variance of {@code n = 1} is
     * always 0.
     *
     * <p>This flag only controls the final computation of the statistic. The value of this flag
     * will not affect compatibility between instances during a {@link #combine(LongVariance) combine}
     * operation.
     *
     * @param v Value.
     * @return {@code this} instance
     */
    public LongVariance setBiased(boolean v) {
        biased = v;
        return this;
    }
}