001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.text.similarity; 018 019import java.util.HashSet; 020import java.util.Map; 021import java.util.Set; 022 023/** 024 * Measures the Cosine similarity of two vectors of an inner product space and compares the angle between them. 025 * <p> 026 * For further explanation about the Cosine Similarity, refer to https://en.wikipedia.org/wiki/Cosine_similarity. 027 * </p> 028 * <p> 029 * Instances of this class are immutable and are safe for use by multiple concurrent threads. 030 * </p> 031 * 032 * @since 1.0 033 */ 034public class CosineSimilarity { 035 036 /** 037 * Singleton instance. 038 */ 039 static final CosineSimilarity INSTANCE = new CosineSimilarity(); 040 041 /** 042 * Calculates the cosine similarity for two given vectors. 043 * 044 * @param leftVector left vector 045 * @param rightVector right vector 046 * @return cosine similarity between the two vectors 047 */ 048 public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector, 049 final Map<CharSequence, Integer> rightVector) { 050 if (leftVector == null || rightVector == null) { 051 throw new IllegalArgumentException("Vectors must not be null"); 052 } 053 054 final Set<CharSequence> intersection = getIntersection(leftVector, rightVector); 055 056 final double dotProduct = dot(leftVector, rightVector, intersection); 057 double d1 = 0.0d; 058 for (final Integer value : leftVector.values()) { 059 d1 += Math.pow(value, 2); 060 } 061 double d2 = 0.0d; 062 for (final Integer value : rightVector.values()) { 063 d2 += Math.pow(value, 2); 064 } 065 final double cosineSimilarity; 066 if (d1 <= 0.0 || d2 <= 0.0) { 067 cosineSimilarity = 0.0; 068 } else { 069 cosineSimilarity = dotProduct / (Math.sqrt(d1) * Math.sqrt(d2)); 070 } 071 return cosineSimilarity; 072 } 073 074 /** 075 * Computes the dot product of two vectors. It ignores remaining elements. It means 076 * that if a vector is longer than other, then a smaller part of it will be used to compute 077 * the dot product. 078 * 079 * @param leftVector left vector 080 * @param rightVector right vector 081 * @param intersection common elements 082 * @return The dot product 083 */ 084 private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector, 085 final Set<CharSequence> intersection) { 086 long dotProduct = 0; 087 for (final CharSequence key : intersection) { 088 dotProduct += leftVector.get(key) * (long) rightVector.get(key); 089 } 090 return dotProduct; 091 } 092 093 /** 094 * Returns a set with strings common to the two given maps. 095 * 096 * @param leftVector left vector map 097 * @param rightVector right vector map 098 * @return common strings 099 */ 100 private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector, 101 final Map<CharSequence, Integer> rightVector) { 102 final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet()); 103 intersection.retainAll(rightVector.keySet()); 104 return intersection; 105 } 106 107}