/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text.similarity;

import java.util.Map;

/**
 * Computes the cosine distance between two character sequences.
 *
 * <p>The distance is derived from {@link CosineSimilarity}: each input is split into
 * tokens by a regular-expression based tokenizer, the tokens of each input are turned
 * into a vector, and the result is {@code 1.0} minus the similarity of the two vectors.</p>
 *
 * <p>
 * Background on cosine similarity and cosine distance is available at
 * <a href="https://en.wikipedia.org/wiki/Cosine_similarity">https://en.wikipedia.org/wiki/Cosine_similarity</a>.
 * </p>
 *
 * @since 1.0
 * @see CosineSimilarity
 */
public class CosineDistance implements EditDistance<Double> {

    /**
     * Returns the cosine distance between {@code left} and {@code right}.
     *
     * <p>Both arguments are handed unchanged to {@code RegexTokenizer.INSTANCE}; this
     * method performs no validation of its own.</p>
     *
     * @param left the first character sequence
     * @param right the second character sequence
     * @return {@code 1.0} minus the cosine similarity of the two token vectors
     */
    @Override
    public Double apply(final CharSequence left, final CharSequence right) {
        // Tokenize both inputs up front (left first), before any vector is built.
        final CharSequence[] tokensOfLeft = RegexTokenizer.INSTANCE.apply(left);
        final CharSequence[] tokensOfRight = RegexTokenizer.INSTANCE.apply(right);

        // Each token array becomes a token-to-integer map via Counter.of.
        final Map<CharSequence, Integer> leftVec = Counter.of(tokensOfLeft);
        final Map<CharSequence, Integer> rightVec = Counter.of(tokensOfRight);

        // Distance is the complement of similarity.
        return 1.0 - CosineSimilarity.INSTANCE.cosineSimilarity(leftVec, rightVec);
    }

}