/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.similarity;

import java.util.HashSet;
import java.util.Set;

/**
 * Computes the Jaccard similarity (also known as the Jaccard index) between two inputs, each treated as a set of its distinct elements. The score is the
 * number of elements common to both sets divided by the number of elements found in at least one of them.
 *
 * <p>
 * For background on the Jaccard index, see https://en.wikipedia.org/wiki/Jaccard_index
 * </p>
 *
 * @since 1.0
 */
public class JaccardSimilarity implements SimilarityScore<Double> {

    /**
     * Shared instance.
     */
    static final JaccardSimilarity INSTANCE = new JaccardSimilarity();

    /**
     * Collects the elements at indices {@code 0} (inclusive) to {@code length} (exclusive) of the given input into a new set, dropping duplicates.
     *
     * @param <E> The element type of the input.
     * @param input the input to read elements from.
     * @param length the number of leading elements to read.
     * @return a new set holding the distinct elements read.
     */
    private static <E> Set<E> distinctElements(final SimilarityInput<E> input, final int length) {
        final Set<E> elements = new HashSet<>();
        int index = 0;
        while (index < length) {
            elements.add(input.at(index));
            index++;
        }
        return elements;
    }

    /**
     * Constructs a new instance.
     */
    public JaccardSimilarity() {
        // nothing to initialize
    }

    /**
     * Computes the Jaccard similarity of two character sequences by wrapping each one as a {@link SimilarityInput} and delegating to
     * {@link #apply(SimilarityInput, SimilarityInput)}.
     *
     * @param left first input sequence.
     * @param right second input sequence.
     * @return the Jaccard index of the two inputs.
     * @throws IllegalArgumentException if either String input {@code null}.
     */
    @Override
    public Double apply(final CharSequence left, final CharSequence right) {
        final SimilarityInput<Character> leftInput = SimilarityInput.input(left);
        final SimilarityInput<Character> rightInput = SimilarityInput.input(right);
        return apply(leftInput, rightInput);
    }

    /**
     * Computes the Jaccard similarity of two inputs. Each input is reduced to the set of its distinct elements; the result is the size of the intersection
     * of those sets divided by the size of their union. Two empty inputs score {@code 1}; exactly one empty input scores {@code 0}.
     *
     * @param <E> The type of similarity score unit.
     * @param left first input sequence.
     * @param right second input sequence.
     * @return the Jaccard index of the two inputs.
     * @throws IllegalArgumentException if either input is {@code null}.
     * @since 1.13.0
     */
    public <E> Double apply(final SimilarityInput<E> left, final SimilarityInput<E> right) {
        if (left == null || right == null) {
            throw new IllegalArgumentException("Input cannot be null");
        }
        final int leftCount = left.length();
        final int rightCount = right.length();
        final boolean leftEmpty = leftCount == 0;
        final boolean rightEmpty = rightCount == 0;
        if (leftEmpty || rightEmpty) {
            // Two empty inputs are treated as identical; an empty input shares nothing with a non-empty one.
            return leftEmpty && rightEmpty ? 1d : 0d;
        }
        final Set<E> leftElements = distinctElements(left, leftCount);
        final Set<E> rightElements = distinctElements(right, rightCount);
        final Set<E> combined = new HashSet<>(leftElements);
        combined.addAll(rightElements);
        final int unionSize = combined.size();
        // Inclusion-exclusion: |A intersect B| = |A| + |B| - |A union B|.
        final int sharedSize = leftElements.size() + rightElements.size() - unionSize;
        return (double) sharedSize / unionSize;
    }
}