001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.statistics.distribution; 019 020import org.apache.commons.rng.UniformRandomProvider; 021import org.apache.commons.rng.sampling.distribution.RejectionInversionZipfSampler; 022 023/** 024 * Implementation of the Zipf distribution. 025 * 026 * <p>The probability mass function of \( X \) is: 027 * 028 * <p>\[ f(k; N, s) = \frac{1/k^s}{H_{N,s}} \] 029 * 030 * <p>for \( N \in \{1, 2, 3, \dots\} \) the number of elements, 031 * \( s \gt 0 \) the exponent characterizing the distribution, 032 * \( k \in \{1, 2, \dots, N\} \) the element rank, and 033 * \( H_{N,s} \) is the normalizing constant which corresponds to the 034 * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers"> 035 * generalized harmonic number</a> of order N of s. 036 * 037 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf distribution (Wikipedia)</a> 038 */ 039public final class ZipfDistribution extends AbstractDiscreteDistribution { 040 /** Number of elements. */ 041 private final int numberOfElements; 042 /** Exponent parameter of the distribution. */ 043 private final double exponent; 044 /** Cached value of the nth generalized harmonic. */ 045 private final double nthHarmonic; 046 /** Cached value of the log of the nth generalized harmonic. */ 047 private final double logNthHarmonic; 048 049 /** 050 * @param numberOfElements Number of elements. 051 * @param exponent Exponent. 052 */ 053 private ZipfDistribution(int numberOfElements, 054 double exponent) { 055 this.numberOfElements = numberOfElements; 056 this.exponent = exponent; 057 this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent); 058 logNthHarmonic = Math.log(nthHarmonic); 059 } 060 061 /** 062 * Creates a Zipf distribution. 063 * 064 * @param numberOfElements Number of elements. 065 * @param exponent Exponent. 066 * @return the distribution 067 * @exception IllegalArgumentException if {@code numberOfElements <= 0} 068 * or {@code exponent <= 0}. 069 */ 070 public static ZipfDistribution of(int numberOfElements, 071 double exponent) { 072 if (numberOfElements <= 0) { 073 throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE, 074 numberOfElements); 075 } 076 if (exponent < 0) { 077 throw new DistributionException(DistributionException.NEGATIVE, 078 exponent); 079 } 080 return new ZipfDistribution(numberOfElements, exponent); 081 } 082 083 /** 084 * Gets the number of elements parameter of this distribution. 085 * 086 * @return the number of elements. 087 */ 088 public int getNumberOfElements() { 089 return numberOfElements; 090 } 091 092 /** 093 * Gets the exponent parameter of this distribution. 094 * 095 * @return the exponent. 096 */ 097 public double getExponent() { 098 return exponent; 099 } 100 101 /** {@inheritDoc} */ 102 @Override 103 public double probability(final int x) { 104 if (x <= 0 || x > numberOfElements) { 105 return 0; 106 } 107 108 return Math.pow(x, -exponent) / nthHarmonic; 109 } 110 111 /** {@inheritDoc} */ 112 @Override 113 public double logProbability(int x) { 114 if (x <= 0 || x > numberOfElements) { 115 return Double.NEGATIVE_INFINITY; 116 } 117 118 return -Math.log(x) * exponent - logNthHarmonic; 119 } 120 121 /** {@inheritDoc} */ 122 @Override 123 public double cumulativeProbability(final int x) { 124 if (x <= 0) { 125 return 0; 126 } else if (x >= numberOfElements) { 127 return 1; 128 } 129 130 return generalizedHarmonic(x, exponent) / nthHarmonic; 131 } 132 133 /** {@inheritDoc} */ 134 @Override 135 public double survivalProbability(int x) { 136 if (x <= 0) { 137 return 1; 138 } else if (x >= numberOfElements) { 139 return 0; 140 } 141 142 // See http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf 143 // S(x) = P(X > x) = ((x+1)^a Hn,a - (x+1)^a Hx+1,a + 1) / ((x+1)^a Hn,a) 144 // where a = exponent and Hx,a is the generalized harmonic for x with exponent a. 145 final double z = Math.pow(x + 1.0, exponent); 146 // Compute generalizedHarmonic(x, exponent) and generalizedHarmonic(x+1, exponent) 147 final double hx = generalizedHarmonic(x, exponent); 148 final double hx1 = hx + Math.pow(x + 1.0, -exponent); 149 // Compute the survival function 150 final double p = (z * (nthHarmonic - hx1) + 1) / (z * nthHarmonic); 151 // May overflow for large exponent so validate the probability. 152 // If this occurs revert to 1 - CDF(x), reusing the generalized harmonic for x 153 return p <= 1.0 ? p : 1.0 - hx / nthHarmonic; 154 } 155 156 /** 157 * {@inheritDoc} 158 * 159 * <p>For number of elements \( N \) and exponent \( s \), the mean is: 160 * 161 * <p>\[ \frac{H_{N,s-1}}{H_{N,s}} \] 162 * 163 * <p>where \( H_{N,k} \) is the 164 * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers"> 165 * generalized harmonic number</a> of order \( N \) of \( k \). 166 */ 167 @Override 168 public double getMean() { 169 final int N = getNumberOfElements(); 170 final double s = getExponent(); 171 172 final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1); 173 174 return Hs1 / nthHarmonic; 175 } 176 177 /** 178 * {@inheritDoc} 179 * 180 * <p>For number of elements \( N \) and exponent \( s \), the variance is: 181 * 182 * <p>\[ \frac{H_{N,s-2}}{H_{N,s}} - \frac{H_{N,s-1}^2}{H_{N,s}^2} \] 183 * 184 * <p>where \( H_{N,k} \) is the 185 * <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers"> 186 * generalized harmonic number</a> of order \( N \) of \( k \). 187 */ 188 @Override 189 public double getVariance() { 190 final int N = getNumberOfElements(); 191 final double s = getExponent(); 192 193 final double Hs2 = generalizedHarmonicAscendingSum(N, s - 2); 194 final double Hs1 = generalizedHarmonicAscendingSum(N, s - 1); 195 final double Hs = nthHarmonic; 196 197 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs)); 198 } 199 200 /** 201 * Calculates the Nth generalized harmonic number. See 202 * <a href="https://mathworld.wolfram.com/HarmonicSeries.html">Harmonic 203 * Series</a>. 204 * 205 * <p>Assumes {@code exponent > 0} to arrange the terms to sum from small to large. 206 * 207 * @param n Term in the series to calculate (must be larger than 1) 208 * @param m Exponent (special case {@code m = 1} is the harmonic series). 209 * @return the n<sup>th</sup> generalized harmonic number. 210 */ 211 private static double generalizedHarmonic(final int n, final double m) { 212 double value = 0; 213 // Sum small to large 214 for (int k = n; k >= 1; k--) { 215 value += Math.pow(k, -m); 216 } 217 return value; 218 } 219 220 /** 221 * Calculates the Nth generalized harmonic number. 222 * 223 * <p>Checks the value of the {@code exponent} to arrange the terms to sum from from small to large. 224 * 225 * @param n Term in the series to calculate (must be larger than 1) 226 * @param m Exponent (special case {@code m = 1} is the harmonic series). 227 * @return the n<sup>th</sup> generalized harmonic number. 228 */ 229 private static double generalizedHarmonicAscendingSum(final int n, final double m) { 230 double value = 0; 231 // Sum small to large 232 // If m < 0 then sum ascending, otherwise descending 233 if (m < 0) { 234 for (int k = 1; k <= n; k++) { 235 value += Math.pow(k, -m); 236 } 237 } else { 238 for (int k = n; k >= 1; k--) { 239 value += Math.pow(k, -m); 240 } 241 } 242 return value; 243 } 244 245 /** 246 * {@inheritDoc} 247 * 248 * <p>The lower bound of the support is always 1. 249 * 250 * @return 1. 251 */ 252 @Override 253 public int getSupportLowerBound() { 254 return 1; 255 } 256 257 /** 258 * {@inheritDoc} 259 * 260 * <p>The upper bound of the support is the number of elements. 261 * 262 * @return number of elements. 263 */ 264 @Override 265 public int getSupportUpperBound() { 266 return getNumberOfElements(); 267 } 268 269 /** {@inheritDoc} */ 270 @Override 271 public DiscreteDistribution.Sampler createSampler(final UniformRandomProvider rng) { 272 // Zipf distribution sampler. 273 return RejectionInversionZipfSampler.of(rng, numberOfElements, exponent)::sample; 274 } 275}