Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ require (
github.com/stretchr/testify v1.11.1
github.com/uber/jaeger-client-go v2.30.0+incompatible
github.com/uber/jaeger-lib v2.4.1+incompatible
github.com/wasilibs/go-re2 v1.10.0
github.com/xeipuuv/gojsonschema v1.2.0
gitlab.com/gitlab-org/api/client-go v1.46.0
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0
Expand Down Expand Up @@ -63,6 +64,8 @@ require (
github.com/go-fed/httpsig v1.1.0 // indirect
github.com/hashicorp/go-version v1.7.0 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/tetratelabs/wazero v1.9.0 // indirect
github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
)

Expand All @@ -85,7 +88,7 @@ require (
github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506 // indirect
github.com/cockroachdb/redact v1.1.5 // indirect
github.com/cyphar/filepath-securejoin v0.4.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/emirpasic/gods v1.18.1 // indirect
github.com/fatih/color v1.18.0 // indirect
github.com/getsentry/sentry-go v0.31.1 // indirect
Expand Down Expand Up @@ -118,7 +121,7 @@ require (
github.com/mschoch/smat v0.2.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pjbgf/sha1cd v0.3.2 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.62.0 // indirect
Expand Down
12 changes: 10 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s=
github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davidmz/go-pageant v1.0.2 h1:bPblRCh5jGU+Uptpz6LgMZGD5hJoOt7otgT454WvHn0=
github.com/davidmz/go-pageant v1.0.2/go.mod h1:P2EDDnMqIwG5Rrp05dTRITj9z2zpGcD9efWSkTNKLIE=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
Expand Down Expand Up @@ -273,8 +274,9 @@ github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsK
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
Expand Down Expand Up @@ -323,10 +325,16 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/tetratelabs/wazero v1.9.0 h1:IcZ56OuxrtaEz8UYNRHBrUa9bYeX9oVY93KspZZBf/I=
github.com/tetratelabs/wazero v1.9.0/go.mod h1:TSbcXCfFP0L2FGkRPxHphadXPjo1T6W+CseNNY7EkjM=
github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o=
github.com/uber/jaeger-client-go v2.30.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk=
github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg=
github.com/uber/jaeger-lib v2.4.1+incompatible/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U=
github.com/wasilibs/go-re2 v1.10.0 h1:vQZEBYZOCA9jdBMmrO4+CvqyCj0x4OomXTJ4a5/urQ0=
github.com/wasilibs/go-re2 v1.10.0/go.mod h1:k+5XqO2bCJS+QpGOnqugyfwC04nw0jaglmjrrkG8U6o=
github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52 h1:OvLBa8SqJnZ6P+mjlzc2K7PM22rRUPE1x32G9DTPrC4=
github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52/go.mod h1:jMeV4Vpbi8osrE/pKUxRZkVaA0EX7NZN0A9/oRzgpgY=
github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM=
github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw=
github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU=
Expand Down
33 changes: 29 additions & 4 deletions index/matchtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/grafana/regexp"

"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/internal/hybridre2"
"github.com/sourcegraph/zoekt/internal/syntaxutil"
"github.com/sourcegraph/zoekt/query"
)
Expand Down Expand Up @@ -187,6 +188,11 @@ type noVisitMatchTree struct {
type regexpMatchTree struct {
regexp *regexp.Regexp

// hybridRegexp is a size-aware wrapper that dispatches to go-re2 for
// large file content when ZOEKT_RE2_THRESHOLD_BYTES is configured.
// For small inputs and filename matches, regexp is used directly.
hybridRegexp *hybridre2.Regexp

// origRegexp is the original parsed regexp from the query structure. It
// does not include mutations such as case sensitivity.
origRegexp *syntax.Regexp
Expand All @@ -207,10 +213,19 @@ func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
prefix = "(?i)"
}

pattern := prefix + syntaxutil.RegexpString(s.Regexp)

// hybridRegexp is only used for file content matching; skip the RE2
// compilation overhead for filename-only regexps.
var hr *hybridre2.Regexp
if !s.FileName {
hr = hybridre2.MustCompile(pattern)
}
return &regexpMatchTree{
regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
origRegexp: s.Regexp,
fileName: s.FileName,
regexp: regexp.MustCompile(pattern),
hybridRegexp: hr,
origRegexp: s.Regexp,
fileName: s.FileName,
}
}

Expand Down Expand Up @@ -802,7 +817,17 @@ func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[match
}

cp.stats.RegexpsConsidered++
idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
data := cp.data(t.fileName)
// For file content, use hybridRegexp which dispatches to go-re2 when
// len(data) >= ZOEKT_RE2_THRESHOLD_BYTES. For filename matching, use
// grafana/regexp directly: filenames are always short, so the WASM
// call overhead of go-re2 outweighs any benefit.
var idxs [][]int
if t.fileName {
idxs = t.regexp.FindAllIndex(data, -1)
} else {
idxs = t.hybridRegexp.FindAllIndex(data, -1)
}
found := t.found[:0]
for _, idx := range idxs {
cm := &candidateMatch{
Expand Down
148 changes: 148 additions & 0 deletions internal/hybridre2/hybridre2.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// Copyright 2026 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hybridre2 provides a hybrid regex engine that switches between
// grafana/regexp (an optimized fork of Go's stdlib regexp) and
// wasilibs/go-re2 (RE2 via WebAssembly) based on input size.
//
// Motivation: Go's regexp engine lacks a lazy DFA, making it O(n·m) for
// hard patterns. RE2's lazy DFA provides linear-time matching, which is
// dramatically faster for large inputs (>32KB) or complex patterns. For
// small inputs the WASM call overhead of go-re2 exceeds the savings,
// so grafana/regexp remains the better choice there.
//
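// The threshold is controlled by the ZOEKT_RE2_THRESHOLD_BYTES environment
// variable. It is read lazily, the first time the threshold is needed, and
// the result is cached for the rest of the process lifetime:
//
// - unset or not a valid integer: treated as -1
// - -1 (default): disabled, always use grafana/regexp
// - 0: always use go-re2
// - N > 0: use go-re2 when len(input) >= N bytes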
//
// # Known tradeoffs
//
// Memory: each Regexp holds compiled state for both engines when RE2 is
// enabled. Patterns are compiled per-search (not cached globally), so under
// high concurrency with many unique patterns the WASM heap adds up. Monitor
// RSS when first enabling the threshold in production.
//
// UTF-8 semantics: go-re2 stops at invalid UTF-8; grafana/regexp replaces
// invalid bytes with U+FFFD and continues. Results may differ on binary
// content that slips past content-type detection. See FindAllIndex for
// details.
//
// RE2 compilation failure: if RE2 rejects a pattern that grafana/regexp
// accepts (due to syntax differences between the two engines), Compile
// returns an error rather than silently falling back to grafana/regexp.
// This is intentional (fail-fast), but it means enabling the threshold
// could surface errors for edge-case patterns that work today. Patterns
// sourced from zoekt query parsing are validated before reaching this
// package, so this is unlikely in practice.
package hybridre2

import (
"os"
"strconv"
"sync"

grafanaregexp "github.com/grafana/regexp"
re2regexp "github.com/wasilibs/go-re2"
)

const (
	// envThreshold names the environment variable holding the input size, in
	// bytes, from which go-re2 takes over from grafana/regexp. A value of -1
	// (also the fallback when the variable is unset) turns go-re2 off, and 0
	// routes every input to it.
	envThreshold = "ZOEKT_RE2_THRESHOLD_BYTES"

	// disabled is the threshold sentinel for "never use go-re2".
	disabled int64 = -1
)

// threshold reports the byte threshold configured through
// ZOEKT_RE2_THRESHOLD_BYTES. The environment is consulted on the first call
// only; every later call returns the same cached value.
//
// A negative result means RE2 is disabled and zero means RE2 is always used.
// When the variable is unset, or its value is not a base-10 int64, the result
// is disabled.
//
// threshold is a variable so that tests can swap in their own function.
var threshold = sync.OnceValue(func() int64 {
	raw, present := os.LookupEnv(envThreshold)
	if !present {
		return disabled
	}
	parsed, err := strconv.ParseInt(raw, 10, 64)
	if err != nil {
		// Unparsable values are treated the same as an unset variable.
		return disabled
	}
	return parsed
})

// Regexp holds one pattern compiled for both engines and chooses between
// grafana/regexp and go-re2 on each match call, according to the size of the
// input.
type Regexp struct {
	// grafana is the grafana/regexp form of the pattern. Compile always sets it.
	grafana *grafanaregexp.Regexp

	// re2 is the go-re2 form of the pattern. Compile leaves it nil when
	// threshold() is negative, i.e. when RE2 is disabled.
	re2 *re2regexp.Regexp
}

// Compile parses pattern and returns a Regexp for it.
//
// The pattern is always compiled with grafana/regexp. It is additionally
// compiled with go-re2 only when the configured threshold is non-negative, so
// the disabled (default) configuration never pays for an RE2 compilation.
//
// An error from either engine is returned as-is together with a nil Regexp;
// an RE2 failure is not papered over by falling back to grafana/regexp.
func Compile(pattern string) (*Regexp, error) {
	primary, err := grafanaregexp.Compile(pattern)
	if err != nil {
		return nil, err
	}

	// RE2 disabled: the grafana/regexp program is all that is needed.
	if threshold() < 0 {
		return &Regexp{grafana: primary}, nil
	}

	secondary, err := re2regexp.Compile(pattern)
	if err != nil {
		return nil, err
	}
	return &Regexp{grafana: primary, re2: secondary}, nil
}

// MustCompile is like Compile but panics if the pattern cannot be compiled.
//
// The panic value is a string of the form
//
//	hybridre2: Compile("<pattern>"): <error>
//
// The pattern is rendered with strconv.Quote so that patterns containing
// parentheses, newlines or other control characters cannot make the message
// ambiguous or break up log output.
func MustCompile(pattern string) *Regexp {
	re, err := Compile(pattern)
	if err != nil {
		panic("hybridre2: Compile(" + strconv.Quote(pattern) + "): " + err.Error())
	}
	return re
}

// useRE2 reports whether an input of inputLen bytes should be matched with
// the RE2 engine under the current threshold: never when the threshold is
// negative, otherwise whenever inputLen is at least the threshold.
func useRE2(inputLen int) bool {
	limit := threshold()
	if limit < 0 {
		return false
	}
	return int64(inputLen) >= limit
}

// FindAllIndex returns the successive non-overlapping matches of the
// expression in b as index pairs relative to b. Both b and n are passed
// through unchanged to whichever engine handles the call.
//
// go-re2 handles the call only when it was compiled for this Regexp and
// useRE2 accepts len(b); every other call goes to grafana/regexp.
//
// NOTE: the two engines treat invalid UTF-8 differently. go-re2 stops
// matching at an invalid byte, while grafana/regexp substitutes U+FFFD and
// carries on, so results can differ on binary or non-UTF-8 content once RE2
// is active. With the default threshold of -1 RE2 is never active and
// existing deployments see no change in behaviour.
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
	if re.re2 == nil || !useRE2(len(b)) {
		return re.grafana.FindAllIndex(b, n)
	}
	return re.re2.FindAllIndex(b, n)
}

// String returns the pattern text as reported by the grafana/regexp program,
// which is compiled for every Regexp regardless of the RE2 setting.
func (re *Regexp) String() string {
	source := re.grafana.String()
	return source
}
Loading
Loading