File: WordSimilarityChecker.cs
Web Access
Project: ..\..\..\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System;
using System.Collections.Generic;
using System.Diagnostics;
 
namespace Roslyn.Utilities
{
    internal class WordSimilarityChecker
    {
        private readonly struct CacheResult
        {
            public readonly string CandidateText;
            public readonly bool AreSimilar;
            public readonly double SimilarityWeight;
 
            public CacheResult(string candidate, bool areSimilar, double similarityWeight)
            {
                CandidateText = candidate;
                AreSimilar = areSimilar;
                SimilarityWeight = similarityWeight;
            }
        }
 
        // Cache the result of the last call to AreSimilar.  We'll often be called with the same
        // value multiple times in a row, so we can avoid expensive computation by returning the
        // same value immediately.
        private CacheResult _lastAreSimilarResult;
 
        private string _source;
        private EditDistance _editDistance;
        private int _threshold;
 
        /// <summary>
        /// Whether or words should be considered similar if one is contained within the other
        /// (regardless of edit distance).  For example if is true then IService would be considered
        /// similar to IServiceFactory despite the edit distance being quite high at 7.
        /// </summary>
        private bool _substringsAreSimilar;
 
        private static readonly object s_poolGate = new();
        private static readonly Stack<WordSimilarityChecker> s_pool = new();
 
        public static WordSimilarityChecker Allocate(string text, bool substringsAreSimilar)
        {
            WordSimilarityChecker checker;
            lock (s_poolGate)
            {
                checker = s_pool.Count > 0
                    ? s_pool.Pop()
                    : new WordSimilarityChecker();
            }
 
            checker.Initialize(text, substringsAreSimilar);
            return checker;
        }
 
        private WordSimilarityChecker()
        {
            // These are initialized by 'Initialize'
            _source = null!;
            _editDistance = null!;
        }
 
        private void Initialize(string text, bool substringsAreSimilar)
        {
            _source = text ?? throw new ArgumentNullException(nameof(text));
            _threshold = GetThreshold(_source);
            _editDistance = new EditDistance(text);
            _substringsAreSimilar = substringsAreSimilar;
        }
 
        public void Free()
        {
            _editDistance?.Dispose();
            _source = null!;
            _editDistance = null!;
            _lastAreSimilarResult = default;
            lock (s_poolGate)
            {
                s_pool.Push(this);
            }
        }
 
        public static bool AreSimilar(string originalText, string candidateText)
            => AreSimilar(originalText, candidateText, substringsAreSimilar: false);
 
        public static bool AreSimilar(string originalText, string candidateText, bool substringsAreSimilar)
            => AreSimilar(originalText, candidateText, substringsAreSimilar, out _);
 
        public static bool AreSimilar(string originalText, string candidateText, out double similarityWeight)
        {
            return AreSimilar(
                originalText, candidateText,
                substringsAreSimilar: false, similarityWeight: out similarityWeight);
        }
 
        /// <summary>
        /// Returns true if 'originalText' and 'candidateText' are likely a misspelling of each other.
        /// Returns false otherwise.  If it is a likely misspelling a similarityWeight is provided
        /// to help rank the match.  Lower costs mean it was a better match.
        /// </summary>
        public static bool AreSimilar(string originalText, string candidateText, bool substringsAreSimilar, out double similarityWeight)
        {
            var checker = Allocate(originalText, substringsAreSimilar);
            var result = checker.AreSimilar(candidateText, out similarityWeight);
            checker.Free();
 
            return result;
        }
 
        internal static int GetThreshold(string value)
            => value.Length <= 4 ? 1 : 2;
 
        public bool AreSimilar(string candidateText)
            => AreSimilar(candidateText, out _);
 
        public bool AreSimilar(string candidateText, out double similarityWeight)
        {
            if (_source.Length < 3)
            {
                // If we're comparing strings that are too short, we'll find 
                // far too many spurious hits.  Don't even bother in this case.
                similarityWeight = double.MaxValue;
                return false;
            }
 
            if (_lastAreSimilarResult.CandidateText == candidateText)
            {
                similarityWeight = _lastAreSimilarResult.SimilarityWeight;
                return _lastAreSimilarResult.AreSimilar;
            }
 
            var result = AreSimilarWorker(candidateText, out similarityWeight);
            _lastAreSimilarResult = new CacheResult(candidateText, result, similarityWeight);
            return result;
        }
 
        private bool AreSimilarWorker(string candidateText, out double similarityWeight)
        {
            similarityWeight = double.MaxValue;
 
            // If the two strings differ by more characters than the cost threshold, then there's 
            // no point in even computing the edit distance as it would necessarily take at least
            // that many additions/deletions.
            if (Math.Abs(_source.Length - candidateText.Length) <= _threshold)
            {
                similarityWeight = _editDistance.GetEditDistance(candidateText, _threshold);
            }
 
            if (similarityWeight > _threshold)
            {
                // it had a high cost.  However, the string the user typed was contained
                // in the string we're currently looking at.  That's enough to consider it
                // although we place it just at the threshold (i.e. it's worse than all
                // other matches).
                if (_substringsAreSimilar && candidateText.IndexOf(_source, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    similarityWeight = _threshold;
                }
                else
                {
                    return false;
                }
            }
 
            Debug.Assert(similarityWeight <= _threshold);
 
            similarityWeight += Penalty(candidateText, _source);
            return true;
        }
 
        private static double Penalty(string candidateText, string originalText)
        {
            var lengthDifference = Math.Abs(originalText.Length - candidateText.Length);
            if (lengthDifference != 0)
            {
                // For all items of the same edit cost, we penalize those that are 
                // much longer than the original text versus those that are only 
                // a little longer.
                //
                // Note: even with this penalty, all matches of cost 'X' will all still
                // cost less than matches of cost 'X + 1'.  i.e. the penalty is in the 
                // range [0, 1) and only serves to order matches of the same cost.
                //
                // Here's the relation of the first few values of length diff and penalty:
                // LengthDiff   -> Penalty
                // 1            -> .5
                // 2            -> .66
                // 3            -> .75
                // 4            -> .8
                // And so on and so forth.
                var penalty = 1.0 - (1.0 / (lengthDifference + 1));
                return penalty;
            }
 
            return 0;
        }
    }
}