reference, declarationdefinition
definition → references, declarations, derived classes, virtual overrides
reference to multiple definitions → definitions
unreferenced
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements fuzzy-matching of strings against identifiers.
// It indicates both the existence and quality of a match:
// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

namespace clang {
namespace clangd {

// Utilities for word segmentation.
// FuzzyMatcher already incorporates this logic, so most users don't need this.
//
// A name like "fooBar_baz" consists of several parts foo, bar, baz.
// Aligning segmentation of word and pattern improves the fuzzy-match.
// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
//
// First we classify each character into types (uppercase, lowercase, etc).
// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.

// We distinguish the types of characters that affect segmentation.
// It's not obvious how to segment digits, we treat them as lowercase letters.
// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
// This means we require exact (case-sensitive) match for those characters.
enum CharType : unsigned char {
  Empty = 0,       // Before-the-start and after-the-end (and control chars).
  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
  Upper = 2,       // Uppercase letters.
  Punctuation = 3, // ASCII punctuation (including Space)
};
// A CharTypeSet is a bitfield representing all the character types in a word.
// Its bits are 1<<Empty, 1<<Lower, etc.
using CharTypeSet = unsigned char;

// Each character's Role is the Head or Tail of a segment, or a Separator.
// e.g. XMLHttpRequest_Async
//      +--+---+------ +----
//      ^Head   ^Tail ^Separator
enum CharRole : unsigned char {
  Unknown = 0,   // Stray control characters or impossible states.
  Tail = 1,      // Part of a word segment, but not the first character.
  Head = 2,      // The first character of a word segment.
  Separator = 3, // Punctuation characters that separate word segments.
};

// Compute segmentation of Text.
// Character roles are stored in Roles (Roles.size() must equal Text.size()).
// The set of character types encountered is returned, this may inform
// heuristics for dealing with poorly-segmented identifiers like "strndup".
CharTypeSet calculateRoles(llvm::StringRef Text,
                           llvm::MutableArrayRef<CharRole> Roles);

// A matcher capable of matching and scoring strings against a single pattern.
// It's optimized for matching against many strings - match() does not allocate.
class FuzzyMatcher {
public:
  // Characters beyond MaxPat are ignored.
  FuzzyMatcher(llvm::StringRef Pattern);

  // If Word matches the pattern, return a score indicating the quality match.
  // Scores usually fall in a [0,1] range, with 1 being a very good score.
  // "Super" scores in (1,2] are possible if the pattern is the full word.
  // Characters beyond MaxWord are ignored.
  llvm::Optional<float> match(llvm::StringRef Word);

  llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); }
  bool empty() const { return PatN == 0; }

  // Dump internal state from the last match() to the stream, for debugging.
  // Returns the pattern with [] around matched characters, e.g.
  //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
  llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;

private:
  // We truncate the pattern and the word to bound the cost of matching.
  constexpr static int MaxPat = 63, MaxWord = 127;
  // Action describes how a word character was matched to the pattern.
  // It should be an enum, but this causes bitfield problems:
  //   - for MSVC the enum type must be explicitly unsigned for correctness
  //   - GCC 4.8 complains not all values fit if the type is unsigned
  using Action = bool;
  constexpr static Action Miss = false; // Word character was skipped.
  constexpr static Action Match = true; // Matched against a pattern character.

  bool init(llvm::StringRef Word);
  void buildGraph();
  bool allowMatch(int P, int W, Action Last) const;
  int skipPenalty(int W, Action Last) const;
  int matchBonus(int P, int W, Action Last) const;

  // Pattern data is initialized by the constructor, then constant.
  char Pat[MaxPat];         // Pattern data
  int PatN;                 // Length
  char LowPat[MaxPat];      // Pattern in lowercase
  CharRole PatRole[MaxPat]; // Pattern segmentation info
  CharTypeSet PatTypeSet;   // Bitmask of 1<<CharType for all Pattern characters
  float ScoreScale;         // Normalizes scores for the pattern length.

  // Word data is initialized on each call to match(), mostly by init().
  char Word[MaxWord];         // Word data
  int WordN;                  // Length
  char LowWord[MaxWord];      // Word in lowercase
  CharRole WordRole[MaxWord]; // Word segmentation info
  CharTypeSet WordTypeSet;    // Bitmask of 1<<CharType for all Word characters
  bool WordContainsPattern;   // Simple substring check

  // Cumulative best-match score table.
  // Boundary conditions are filled in by the constructor.
  // The rest is repopulated for each match(), by buildGraph().
  struct ScoreInfo {
    signed int Score : 15;
    Action Prev : 1;
  };
  ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];
};

} // namespace clangd
} // namespace clang

#endif