// Copyright 2023 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package glob import ( "errors" "fmt" "strings" "unicode/utf8" ) // A Glob is an LSP-compliant glob pattern, as defined by the spec: // https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#documentFilter // // NOTE: this implementation is currently only intended for testing. In order // to make it production ready, we'd need to: // - verify it against the VS Code implementation // - add more tests // - microbenchmark, likely avoiding the element interface // - resolve the question of what is meant by "character". If it's a UTF-16 // code (as we suspect) it'll be a bit more work. // // Quoting from the spec: // Glob patterns can have the following syntax: // - `*` to match one or more characters in a path segment // - `?` to match on one character in a path segment // - `**` to match any number of path segments, including none // - `{}` to group sub patterns into an OR expression. (e.g. `**/*.{ts,js}` // matches all TypeScript and JavaScript files) // - `[]` to declare a range of characters to match in a path segment // (e.g., `example.[0-9]` to match on `example.0`, `example.1`, …) // - `[!...]` to negate a range of characters to match in a path segment // (e.g., `example.[!0-9]` to match on `example.a`, `example.b`, but // not `example.0`) // // Expanding on this: // - '/' matches one or more literal slashes. // - any other character matches itself literally. type Glob struct { elems []element // pattern elements } // Parse builds a Glob for the given pattern, returning an error if the pattern // is invalid. func Parse(pattern string) (*Glob, error) { g, _, err := parse(pattern, false) return g, err } func parse(pattern string, nested bool) (*Glob, string, error) { g := new(Glob) for len(pattern) > 0 { switch pattern[0] { case '/': pattern = pattern[1:] g.elems = append(g.elems, slash{}) case '*': if len(pattern) > 1 && pattern[1] == '*' { if (len(g.elems) > 0 && g.elems[len(g.elems)-1] != slash{}) || (len(pattern) > 2 && pattern[2] != '/') { return nil, "", errors.New("** may only be adjacent to '/'") } pattern = pattern[2:] g.elems = append(g.elems, starStar{}) break } pattern = pattern[1:] g.elems = append(g.elems, star{}) case '?': pattern = pattern[1:] g.elems = append(g.elems, anyChar{}) case '{': var gs group for pattern[0] != '}' { pattern = pattern[1:] groupG, pat, err := parse(pattern, true) if err != nil { return nil, "", err } if len(pat) == 0 { return nil, "", errors.New("unmatched '{'") } pattern = pat gs = append(gs, groupG) } pattern = pattern[1:] g.elems = append(g.elems, gs) case '}', ',': if nested { return g, pattern, nil } pattern = g.parseLiteral(pattern, false) case '[': pattern = pattern[1:] if len(pattern) == 0 { return nil, "", errBadRange } negate := false if pattern[0] == '!' { pattern = pattern[1:] negate = true } low, sz, err := readRangeRune(pattern) if err != nil { return nil, "", err } pattern = pattern[sz:] if len(pattern) == 0 || pattern[0] != '-' { return nil, "", errBadRange } pattern = pattern[1:] high, sz, err := readRangeRune(pattern) if err != nil { return nil, "", err } pattern = pattern[sz:] if len(pattern) == 0 || pattern[0] != ']' { return nil, "", errBadRange } pattern = pattern[1:] g.elems = append(g.elems, charRange{negate, low, high}) default: pattern = g.parseLiteral(pattern, nested) } } return g, "", nil } // helper for decoding a rune in range elements, e.g. [a-z] func readRangeRune(input string) (rune, int, error) { r, sz := utf8.DecodeRuneInString(input) var err error if r == utf8.RuneError { // See the documentation for DecodeRuneInString. switch sz { case 0: err = errBadRange case 1: err = errInvalidUTF8 } } return r, sz, err } var ( errBadRange = errors.New("'[' patterns must be of the form [x-y]") errInvalidUTF8 = errors.New("invalid UTF-8 encoding") ) func (g *Glob) parseLiteral(pattern string, nested bool) string { var specialChars string if nested { specialChars = "*?{[/}," } else { specialChars = "*?{[/" } end := strings.IndexAny(pattern, specialChars) if end == -1 { end = len(pattern) } g.elems = append(g.elems, literal(pattern[:end])) return pattern[end:] } func (g *Glob) String() string { var b strings.Builder for _, e := range g.elems { fmt.Fprint(&b, e) } return b.String() } // element holds a glob pattern element, as defined below. type element fmt.Stringer // element types. type ( slash struct{} // One or more '/' separators literal string // string literal, not containing /, *, ?, {}, or [] star struct{} // * anyChar struct{} // ? starStar struct{} // ** group []*Glob // {foo, bar, ...} grouping charRange struct { // [a-z] character range negate bool low, high rune } ) func (s slash) String() string { return "/" } func (l literal) String() string { return string(l) } func (s star) String() string { return "*" } func (a anyChar) String() string { return "?" } func (s starStar) String() string { return "**" } func (g group) String() string { var parts []string for _, g := range g { parts = append(parts, g.String()) } return "{" + strings.Join(parts, ",") + "}" } func (r charRange) String() string { return "[" + string(r.low) + "-" + string(r.high) + "]" } // Match reports whether the input string matches the glob pattern. func (g *Glob) Match(input string) bool { return match(g.elems, input) } func match(elems []element, input string) (ok bool) { var elem interface{} for len(elems) > 0 { elem, elems = elems[0], elems[1:] switch elem := elem.(type) { case slash: if len(input) == 0 || input[0] != '/' { return false } for input[0] == '/' { input = input[1:] } case starStar: // Special cases: // - **/a matches "a" // - **/ matches everything // // Note that if ** is followed by anything, it must be '/' (this is // enforced by Parse). if len(elems) > 0 { elems = elems[1:] } // A trailing ** matches anything. if len(elems) == 0 { return true } // Backtracking: advance pattern segments until the remaining pattern // elements match. for len(input) != 0 { if match(elems, input) { return true } _, input = split(input) } return false case literal: if !strings.HasPrefix(input, string(elem)) { return false } input = input[len(elem):] case star: var segInput string segInput, input = split(input) elemEnd := len(elems) for i, e := range elems { if e == (slash{}) { elemEnd = i break } } segElems := elems[:elemEnd] elems = elems[elemEnd:] // A trailing * matches the entire segment. if len(segElems) == 0 { break } // Backtracking: advance characters until remaining subpattern elements // match. matched := false for i := range segInput { if match(segElems, segInput[i:]) { matched = true break } } if !matched { return false } case anyChar: if len(input) == 0 || input[0] == '/' { return false } input = input[1:] case group: // Append remaining pattern elements to each group member looking for a // match. var branch []element for _, m := range elem { branch = branch[:0] branch = append(branch, m.elems...) branch = append(branch, elems...) if match(branch, input) { return true } } return false case charRange: if len(input) == 0 || input[0] == '/' { return false } c, sz := utf8.DecodeRuneInString(input) if c < elem.low || c > elem.high { return false } input = input[sz:] default: panic(fmt.Sprintf("segment type %T not implemented", elem)) } } return len(input) == 0 } // split returns the portion before and after the first slash // (or sequence of consecutive slashes). If there is no slash // it returns (input, nil). func split(input string) (first, rest string) { i := strings.IndexByte(input, '/') if i < 0 { return input, "" } first = input[:i] for j := i; j < len(input); j++ { if input[j] != '/' { return first, input[j:] } } return first, "" }