File: SectionMatcher_TryParseSection.cs
Web Access
Project: ..\..\..\src\Workspaces\Core\Portable\Microsoft.CodeAnalysis.Workspaces.csproj (Microsoft.CodeAnalysis.Workspaces)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
 
using System.Collections.Immutable;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.CodeAnalysis.PooledObjects;
using Roslyn.Utilities;
 
namespace Microsoft.CodeAnalysis.EditorConfig.Parsing
{
    internal readonly partial struct SectionMatcher
    {
        public static bool TryParseSection(string headerText, out SectionMatcher matcher)
        {
            // An editorconfig section name is a language for recognizing file paths
            // defined by the following grammar:
            //
            // <path> ::= <path-list>
            // <path-list> ::= <path-item> | <path-item> <path-list>
            // <path-item> ::= "*"  | "**" | "?" | <char> | <choice> | <range>
            // <char> ::= any unicode character
            // <choice> ::= "{" <choice-list> "}"
            // <choice-list> ::= <path-list> | <path-list> "," <choice-list>
            // <range> ::= "{" <integer> ".." <integer> "}"
            // <integer> ::= "-" <digit-list> | <digit-list>
            // <digit-list> ::= <digit> | <digit> <digit-list>
            // <digit> ::= 0-9
 
            matcher = default;
            using var _0 = PooledStringBuilder.GetInstance(out var sb);
            sb.Append('^');
 
            // EditorConfig matching depends on the whether or not there are
            // directory separators and where they are located in the section
            // name. Specifically, the editorconfig core parser says:
            // https://github.com/editorconfig/editorconfig-core-c/blob/5d3996811e962a717a7d7fdd0a941192382241a7/src/lib/editorconfig.c#L231
            //
            //     Pattern would be:
            //     /dir/of/editorconfig/file[double_star]/[section] if section does not contain '/',
            //     /dir/of/editorconfig/file[section] if section starts with a '/', or
            //     /dir/of/editorconfig/file/[section] if section contains '/' but does not start with '/'.
 
            if (!headerText.Contains("/"))
            {
                sb.Append(".*/");
            }
            else if (headerText[0] != '/')
            {
                sb.Append('/');
            }
 
            var lexer = new Lexer(headerText);
            using var _1 = ArrayBuilder<(int minValue, int maxValue)>.GetInstance(out var numberRangePairs);
            if (!TryCompilePathList(ref lexer, sb, parsingChoice: false, numberRangePairs))
            {
                return false;
            }
 
            sb.Append('$');
            var pattern = sb.ToString();
            matcher = new SectionMatcher(new Regex(pattern), headerText, numberRangePairs.ToImmutableArray());
            return true;
        }
 
        private static bool TryCompilePathList(
            ref Lexer lexer,
            StringBuilder sb,
            bool parsingChoice,
            ArrayBuilder<(int minValue, int maxValue)> numberRangePairs)
        {
            while (!lexer.IsDone)
            {
                var tokenKind = lexer.Lex();
                switch (tokenKind)
                {
                    case TokenKind.BadToken:
                        // Parsing failure
                        return false;
                    case TokenKind.SimpleCharacter:
                        // Matches just this character
                        sb.Append(Regex.Escape(lexer.EatCurrentCharacter().ToString()));
                        break;
                    case TokenKind.Question:
                        // '?' matches any single character
                        sb.Append('.');
                        break;
                    case TokenKind.Star:
                        // Matches any string of characters except directory separator
                        // Directory separator is defined in editorconfig spec as '/'
                        sb.Append("[^/]*");
                        break;
                    case TokenKind.StarStar:
                        // Matches any string of characters
                        sb.Append(".*");
                        break;
                    case TokenKind.OpenCurly:
                        // Back up token stream. The following helpers all expect a '{'
                        lexer.Position--;
                        // This is ambiguous between {num..num} and {item1,item2}
                        // We need to look ahead to disambiguate. Looking for {num..num}
                        // is easier because it can't be recursive.
                        var numberRange = TryParseNumberRange(ref lexer);
                        if (numberRange is null)
                        {
                            // Not a number range. Try a choice expression
                            if (!TryCompileChoice(ref lexer, sb, numberRangePairs))
                            {
                                return false;
                            }
                            // Keep looping. There may be more after the '}'.
                            break;
                        }
                        else
                        {
                            var (numStart, numEnd) = numberRange.Value;
                            if (int.TryParse(numStart, out var intStart) && int.TryParse(numEnd, out var intEnd))
                            {
                                var pair = intStart < intEnd ? (intStart, intEnd) : (intEnd, intStart);
                                numberRangePairs.Add(pair);
                                // Group allowing any digit sequence. The validity will be checked outside of the regex
                                sb.Append("(-?[0-9]+)");
                                // Keep looping
                                break;
                            }
 
                            return false;
                        }
                    case TokenKind.CloseCurly:
                        // Either the end of a choice, or a failed parse
                        return parsingChoice;
                    case TokenKind.Comma:
                        // The end of a choice section, or a failed parse
                        return parsingChoice;
                    case TokenKind.OpenBracket:
                        sb.Append('[');
                        if (!TryCompileCharacterClass(ref lexer, sb))
                        {
                            return false;
                        }
 
                        break;
                    default:
                        throw ExceptionUtilities.UnexpectedValue(tokenKind);
                }
            }
            // If we're parsing a choice we should not exit without a closing '}'
            return !parsingChoice;
        }
 
        private static (string numStart, string numEnd)? TryParseNumberRange(ref Lexer lexer)
        {
            var saved = lexer.Position;
            if (lexer.Lex() != TokenKind.OpenCurly)
            {
                lexer.Position = saved;
                return null;
            }
 
            var numStart = lexer.TryLexNumber();
            if (numStart is null)
            {
                // Not a number
                lexer.Position = saved;
                return null;
            }
 
            // The next two characters must be ".."
            if (!lexer.TryEatCurrentCharacter(out var c) || c != '.' ||
                !lexer.TryEatCurrentCharacter(out c) || c != '.')
            {
                lexer.Position = saved;
                return null;
            }
 
            // Now another number
            var numEnd = lexer.TryLexNumber();
            if (numEnd is null || lexer.IsDone || lexer.Lex() != TokenKind.CloseCurly)
            {
                // Not a number or no '}'
                lexer.Position = saved;
                return null;
            }
 
            return (numStart, numEnd);
        }
 
        private static bool TryCompileCharacterClass(ref Lexer lexer, StringBuilder sb)
        {
            // [...] should match any of the characters in the brackets, with special
            // behavior for four characters: '!' immediately after the opening bracket
            // implies the negation of the character class, '-' implies matching
            // between the locale-dependent range of the previous and next characters,
            // '\' escapes the following character, and ']' ends the range
            if (!lexer.IsDone && lexer.CurrentCharacter == '!')
            {
                sb.Append('^');
                lexer.Position++;
            }
 
            while (!lexer.IsDone)
            {
                var currentChar = lexer.EatCurrentCharacter();
                switch (currentChar)
                {
                    case '-':
                        // '-' means the same thing in regex as it does in the glob, so
                        // put it in verbatim
                        sb.Append(currentChar);
                        break;
 
                    case '\\':
                        // Escape the next char
                        if (lexer.IsDone)
                        {
                            return false;
                        }
 
                        sb.Append('\\');
                        sb.Append(lexer.EatCurrentCharacter());
                        break;
 
                    case ']':
                        sb.Append(currentChar);
                        return true;
 
                    default:
                        sb.Append(Regex.Escape(currentChar.ToString()));
                        break;
                }
            }
            // Stream ended without a closing bracket
            return false;
        }
 
        private static bool TryCompileChoice(
            ref Lexer lexer,
            StringBuilder sb,
            ArrayBuilder<(int, int)> numberRangePairs)
        {
            if (lexer.Lex() != TokenKind.OpenCurly)
            {
                return false;
            }
 
            // Start a non-capturing group for the choice
            sb.Append("(?:");
 
            // We start immediately after a '{'
            // Try to compile the nested <path-list>
            while (TryCompilePathList(ref lexer, sb, parsingChoice: true, numberRangePairs))
            {
                // If we've successfully compiled a <path-list> the last token should
                // have been a ',' or a '}'
                var lastChar = lexer[lexer.Position - 1];
                if (lastChar == ',')
                {
                    // Another option
                    sb.Append('|');
                }
                else if (lastChar == '}')
                {
                    // Close out the capture group
                    sb.Append(')');
                    return true;
                }
                else
                {
                    throw ExceptionUtilities.UnexpectedValue(lastChar);
                }
            }
 
            // Propagate failure
            return false;
        }
    }
}