diff --git a/go.mod b/go.mod index c4dd2daa6..9b5d676e0 100644 --- a/go.mod +++ b/go.mod @@ -34,6 +34,7 @@ require ( github.com/stretchr/testify v1.11.1 github.com/uber/jaeger-client-go v2.30.0+incompatible github.com/uber/jaeger-lib v2.4.1+incompatible + github.com/wasilibs/go-re2 v1.10.0 github.com/xeipuuv/gojsonschema v1.2.0 gitlab.com/gitlab-org/api/client-go v1.46.0 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 @@ -63,6 +64,8 @@ require ( github.com/go-fed/httpsig v1.1.0 // indirect github.com/hashicorp/go-version v1.7.0 // indirect github.com/kylelemons/godebug v1.1.0 // indirect + github.com/tetratelabs/wazero v1.9.0 // indirect + github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect ) @@ -85,7 +88,7 @@ require ( github.com/cockroachdb/logtags v0.0.0-20241215232642-bb51bb14a506 // indirect github.com/cockroachdb/redact v1.1.5 // indirect github.com/cyphar/filepath-securejoin v0.4.1 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emirpasic/gods v1.18.1 // indirect github.com/fatih/color v1.18.0 // indirect github.com/getsentry/sentry-go v0.31.1 // indirect @@ -118,7 +121,7 @@ require ( github.com/mschoch/smat v0.2.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pjbgf/sha1cd v0.3.2 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect diff --git a/go.sum b/go.sum index 989fd215a..4ccb07313 100644 --- a/go.sum +++ b/go.sum @@ -74,8 +74,9 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3 
github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davidmz/go-pageant v1.0.2 h1:bPblRCh5jGU+Uptpz6LgMZGD5hJoOt7otgT454WvHn0= github.com/davidmz/go-pageant v1.0.2/go.mod h1:P2EDDnMqIwG5Rrp05dTRITj9z2zpGcD9efWSkTNKLIE= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= @@ -273,8 +274,9 @@ github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsK github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prashantv/gostub v1.1.0 
h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= @@ -323,10 +325,16 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tetratelabs/wazero v1.9.0 h1:IcZ56OuxrtaEz8UYNRHBrUa9bYeX9oVY93KspZZBf/I= +github.com/tetratelabs/wazero v1.9.0/go.mod h1:TSbcXCfFP0L2FGkRPxHphadXPjo1T6W+CseNNY7EkjM= github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= github.com/uber/jaeger-client-go v2.30.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/uber/jaeger-lib v2.4.1+incompatible h1:td4jdvLcExb4cBISKIpHuGoVXh+dVKhn2Um6rjCsSsg= github.com/uber/jaeger-lib v2.4.1+incompatible/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U= +github.com/wasilibs/go-re2 v1.10.0 h1:vQZEBYZOCA9jdBMmrO4+CvqyCj0x4OomXTJ4a5/urQ0= +github.com/wasilibs/go-re2 v1.10.0/go.mod h1:k+5XqO2bCJS+QpGOnqugyfwC04nw0jaglmjrrkG8U6o= +github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52 h1:OvLBa8SqJnZ6P+mjlzc2K7PM22rRUPE1x32G9DTPrC4= +github.com/wasilibs/wazero-helpers v0.0.0-20240620070341-3dff1577cd52/go.mod h1:jMeV4Vpbi8osrE/pKUxRZkVaA0EX7NZN0A9/oRzgpgY= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= diff --git a/index/matchtree.go b/index/matchtree.go index 92492f6ad..6b0b91bf9 100644 --- a/index/matchtree.go +++ b/index/matchtree.go @@ -27,6 +27,7 @@ import ( "github.com/grafana/regexp" "github.com/sourcegraph/zoekt" + "github.com/sourcegraph/zoekt/internal/hybridre2" 
"github.com/sourcegraph/zoekt/internal/syntaxutil" "github.com/sourcegraph/zoekt/query" ) @@ -187,6 +188,11 @@ type noVisitMatchTree struct { type regexpMatchTree struct { regexp *regexp.Regexp + // hybridRegexp is a size-aware wrapper that dispatches to go-re2 for + // large file content when ZOEKT_RE2_THRESHOLD_BYTES is configured. + // For small inputs and filename matches, regexp is used directly. + hybridRegexp *hybridre2.Regexp + // origRegexp is the original parsed regexp from the query structure. It // does not include mutations such as case sensitivity. origRegexp *syntax.Regexp @@ -207,10 +213,19 @@ func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree { prefix = "(?i)" } + pattern := prefix + syntaxutil.RegexpString(s.Regexp) + + // hybridRegexp is only used for file content matching; skip the RE2 + // compilation overhead for filename-only regexps. + var hr *hybridre2.Regexp + if !s.FileName { + hr = hybridre2.MustCompile(pattern) + } return &regexpMatchTree{ - regexp: regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)), - origRegexp: s.Regexp, - fileName: s.FileName, + regexp: regexp.MustCompile(pattern), + hybridRegexp: hr, + origRegexp: s.Regexp, + fileName: s.FileName, } } @@ -802,7 +817,17 @@ func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[match } cp.stats.RegexpsConsidered++ - idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1) + data := cp.data(t.fileName) + // For file content, use hybridRegexp which dispatches to go-re2 when + // len(data) >= ZOEKT_RE2_THRESHOLD_BYTES. For filename matching, use + // grafana/regexp directly: filenames are always short, so the WASM + // call overhead of go-re2 outweighs any benefit. 
+ var idxs [][]int + if t.fileName { + idxs = t.regexp.FindAllIndex(data, -1) + } else { + idxs = t.hybridRegexp.FindAllIndex(data, -1) + } found := t.found[:0] for _, idx := range idxs { cm := &candidateMatch{ diff --git a/internal/hybridre2/hybridre2.go b/internal/hybridre2/hybridre2.go new file mode 100644 index 000000000..8d946523c --- /dev/null +++ b/internal/hybridre2/hybridre2.go @@ -0,0 +1,148 @@ +// Copyright 2026 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hybridre2 provides a hybrid regex engine that switches between +// grafana/regexp (an optimized fork of Go's stdlib regexp) and +// wasilibs/go-re2 (RE2 via WebAssembly) based on input size. +// +// Motivation: Go's regexp engine lacks a lazy DFA, making it O(n·m) for +// hard patterns. RE2's lazy DFA provides linear-time matching, which is +// dramatically faster for large inputs (>32KB) or complex patterns. For +// small inputs the WASM call overhead of go-re2 exceeds the savings, +// so grafana/regexp remains the better choice there. +// +// The threshold is controlled by the ZOEKT_RE2_THRESHOLD_BYTES environment +// variable, read once at program startup: +// +// - -1 (default): disabled, always use grafana/regexp +// - 0: always use go-re2 +// - N > 0: use go-re2 when len(input) >= N bytes +// +// # Known tradeoffs +// +// Memory: each Regexp holds compiled state for both engines when RE2 is +// enabled. 
+// Patterns are compiled per-search (not cached globally), so under
+// high concurrency with many unique patterns the WASM heap adds up. Monitor
+// RSS when first enabling the threshold in production.
+//
+// UTF-8 semantics: go-re2 stops at invalid UTF-8; grafana/regexp replaces
+// invalid bytes with U+FFFD and continues. Results may differ on binary
+// content that slips past content-type detection. See FindAllIndex for
+// details.
+//
+// RE2 compilation failure: if RE2 rejects a pattern that grafana/regexp
+// accepts (due to syntax differences between the two engines), Compile
+// returns an error rather than silently falling back to grafana/regexp.
+// This is intentional (fail-fast), but it means enabling the threshold
+// could surface errors for edge-case patterns that work today. Patterns
+// sourced from zoekt query parsing are validated before reaching this
+// package, so this is unlikely in practice.
+package hybridre2
+
+import (
+	"os"
+	"strconv"
+	"sync"
+
+	grafanaregexp "github.com/grafana/regexp"
+	re2regexp "github.com/wasilibs/go-re2"
+)
+
+const (
+	// envThreshold names the environment variable holding the input size,
+	// in bytes, from which go-re2 takes over from grafana/regexp. A value
+	// of -1 (the default) keeps go-re2 off; 0 routes every input to it.
+	envThreshold = "ZOEKT_RE2_THRESHOLD_BYTES"
+
+	// disabled is the threshold value that keeps go-re2 switched off.
+	disabled int64 = -1
+)
+
+// threshold reports the byte threshold configured through
+// ZOEKT_RE2_THRESHOLD_BYTES; the environment is consulted at most once.
+// A negative result means RE2 is disabled; zero means RE2 is always used.
+//
+// Tests may reassign this variable to override the threshold.
+var threshold = sync.OnceValue(func() int64 {
+	if val, ok := os.LookupEnv(envThreshold); ok {
+		if n, err := strconv.ParseInt(val, 10, 64); err == nil {
+			return n
+		}
+	}
+	// Unset or unparsable value: keep go-re2 switched off.
+	return disabled
+})
+
+// Regexp is a compiled regular expression that dispatches to either
+// grafana/regexp or go-re2 at match time, based on input size.
+type Regexp struct {
+	grafana *grafanaregexp.Regexp
+	re2     *re2regexp.Regexp // nil when threshold() < 0 (disabled)
+}
+
+// Compile returns a new Regexp. The grafana/regexp variant is always compiled.
+// The go-re2 variant is only compiled when threshold() is non-negative at the
+// time of the call; otherwise the re2 field stays nil and no RE2 work is done.
+// An error from either engine's compiler is returned unchanged.
+func Compile(pattern string) (*Regexp, error) {
+	g, err := grafanaregexp.Compile(pattern)
+	if err != nil {
+		return nil, err
+	}
+	result := &Regexp{grafana: g}
+	if threshold() >= 0 {
+		r, err := re2regexp.Compile(pattern)
+		if err != nil {
+			return nil, err
+		}
+		result.re2 = r
+	}
+	return result, nil
+}
+
+// MustCompile is like Compile but panics on error. The pattern is quoted in
+// the panic message so that embedded quotes, newlines, or control bytes
+// cannot make the message ambiguous.
+func MustCompile(pattern string) *Regexp {
+	re, err := Compile(pattern)
+	if err != nil {
+		panic("hybridre2: Compile(" + strconv.Quote(pattern) + "): " + err.Error())
+	}
+	return re
+}
+
+// useRE2 reports whether the RE2 engine should be used for an input of the
+// given length, based on the current threshold setting.
+func useRE2(inputLen int) bool {
+	t := threshold()
+	return t >= 0 && int64(inputLen) >= t
+}
+
+// FindAllIndex returns successive non-overlapping matches of the expression
+// in b. It uses go-re2 when len(b) >= threshold() (and RE2 was compiled in),
+// and grafana/regexp otherwise. Match indices are relative to b, and n is
+// passed through unchanged to whichever engine is selected.
+//
+// The threshold is read again here, at match time, while the re2 field was
+// decided at compile time; a Regexp compiled with RE2 disabled therefore
+// always falls back to grafana/regexp.
+//
+// NOTE: go-re2 stops matching at invalid UTF-8 bytes, whereas grafana/regexp
+// replaces them with U+FFFD and continues. This means results may differ on
+// binary or non-UTF-8 content when RE2 is active. The default threshold of -1
+// (disabled) ensures zero behaviour change for existing deployments; operators
+// enabling the threshold should be aware of this distinction.
+func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
+	if re.re2 != nil && useRE2(len(b)) {
+		return re.re2.FindAllIndex(b, n)
+	}
+	return re.grafana.FindAllIndex(b, n)
+}
+
+// String returns the source text used to compile the regular expression.
+func (re *Regexp) String() string {
+	return re.grafana.String()
+}
diff --git a/internal/hybridre2/hybridre2_test.go b/internal/hybridre2/hybridre2_test.go
new file mode 100644
index 000000000..3c3d508c7
--- /dev/null
+++ b/internal/hybridre2/hybridre2_test.go
@@ -0,0 +1,358 @@
+// Copyright 2026 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hybridre2
+
+import (
+	"fmt"
+	"testing"
+
+	grafanaregexp "github.com/grafana/regexp"
+)
+
+// withThreshold overrides the effective threshold for the duration of the test
+// and registers a t.Cleanup to restore it afterwards.
+//
+// NOT safe for concurrent use: do not call t.Parallel() after withThreshold,
+// and do not use it from TestMain or init().
+func withThreshold(tb testing.TB, thresh int64) {
+	tb.Helper()
+	// Remember the current provider first so the cleanup can put it back.
+	prev := threshold
+	tb.Cleanup(func() { threshold = prev })
+	threshold = func() int64 { return thresh }
+}
+
+// ---- unit tests ----
+
+// TestCompileValid checks that a well-formed pattern compiles without error.
+func TestCompileValid(t *testing.T) {
+	if _, err := Compile(`foo.*bar`); err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+}
+
+// TestCompileInvalid checks that a malformed pattern is rejected by Compile.
+func TestCompileInvalid(t *testing.T) {
+	if _, err := Compile(`[invalid`); err == nil {
+		t.Fatal("expected error for invalid pattern, got nil")
+	}
+}
+
+// TestMustCompilePanics checks that MustCompile panics on a malformed pattern.
+func TestMustCompilePanics(t *testing.T) {
+	defer func() {
+		if recover() == nil {
+			t.Fatal("MustCompile should panic on invalid pattern")
+		}
+	}()
+	MustCompile(`[invalid`)
+}
+
+// TestString checks that String echoes the pattern given to MustCompile.
+func TestString(t *testing.T) {
+	const pat = `foo.*bar`
+	compiled := MustCompile(pat)
+	if compiled.String() != pat {
+		t.Fatalf("String() = %q, want %q", compiled.String(), pat)
+	}
+}
+
+// TestFindAllIndexDisabled checks that, with the threshold set to disabled,
+// the hybrid matcher returns the same indices as grafana/regexp.
+func TestFindAllIndexDisabled(t *testing.T) {
+	input := []byte("func main() { fmt.Println(\"hello world\") }")
+
+	withThreshold(t, disabled)
+	for _, expr := range []string{`\w+`, `fmt\.\w+`, `(?i)MAIN`, `"[^"]*"`} {
+		got := MustCompile(expr).FindAllIndex(input, -1)
+		want := grafanaregexp.MustCompile(expr).FindAllIndex(input, -1)
+		if equalIndexSlices(got, want) {
+			continue
+		}
+		t.Errorf("disabled mode, pattern %q: hybrid=%v grafana=%v", expr, got, want)
+	}
+}
+
+// TestFindAllIndexForcedRE2 sets the threshold to 0 so that go-re2 is selected
+// and checks that its match indices agree with grafana/regexp.
+func TestFindAllIndexForcedRE2(t *testing.T) {
+	corpus := []byte("func main() { fmt.Println(\"hello world\") }")
+	patterns := []string{`\w+`, `fmt\.\w+`, `(?i)MAIN`, `"[^"]*"`}
+
+	// Threshold 0 makes every input length qualify for the RE2 path.
+	withThreshold(t, 0)
+	for _, pat := range patterns {
+		hybrid := MustCompile(pat)
+		grafana := grafanaregexp.MustCompile(pat)
+		got := hybrid.FindAllIndex(corpus, -1)
+		want := grafana.FindAllIndex(corpus, -1)
+		if !equalIndexSlices(got, want) {
+			t.Errorf("forced-re2 mode, pattern %q: hybrid=%v grafana=%v", pat, got, want)
+		}
+	}
+}
+
+// TestThresholdSwitching verifies the engine switches at the configured byte
+// boundary: inputs shorter than the threshold must stay on grafana/regexp,
+// inputs at or above it must select go-re2, and both must return the same
+// match indices as grafana/regexp.
+func TestThresholdSwitching(t *testing.T) {
+	const thresh = int64(512)
+	pattern := `func\s+\w+`
+	grafana := grafanaregexp.MustCompile(pattern)
+
+	smallCorpus := makeCorpus(300) // < 512
+	largeCorpus := makeCorpus(600) // >= 512
+
+	// Override before compiling so that Compile also builds the RE2 variant.
+	withThreshold(t, thresh)
+	hybrid := MustCompile(pattern)
+
+	for _, tc := range []struct {
+		name    string
+		corpus  []byte
+		wantRE2 bool
+	}{
+		{"small(<threshold)", smallCorpus, false},
+		{"large(>=threshold)", largeCorpus, true},
+	} {
+		// Check the dispatch decision itself, not only the match output.
+		if gotRE2 := useRE2(len(tc.corpus)); gotRE2 != tc.wantRE2 {
+			t.Errorf("%s: useRE2(%d)=%v, want %v", tc.name, len(tc.corpus), gotRE2, tc.wantRE2)
+		}
+		got := hybrid.FindAllIndex(tc.corpus, -1)
+		want := grafana.FindAllIndex(tc.corpus, -1)
+		if !equalIndexSlices(got, want) {
+			t.Errorf("%s: hybrid=%v grafana=%v", tc.name, got, want)
+		}
+	}
+}
+
+// TestFindAllIndexIdenticalResults is a comprehensive correctness sweep across
+// pattern types and input sizes, asserting identical match positions.
+func TestFindAllIndexIdenticalResults(t *testing.T) {
+	cases := []struct {
+		name    string
+		pattern string
+	}{
+		{"literal", `hello`},
+		{"case-insensitive", `(?i)Hello`},
+		{"word-boundary", `\bfunc\b`},
+		{"alternation", `foo|bar|baz`},
+		{"char-class", `[a-zA-Z_]\w*`},
+		{"complex", `(func|var|const)\s+[A-Z]\w*`},
+		{"dot-plus", `.+`},
+		{"anchored-line", `(?m)^package\s+\w+`},
+		{"no-match", `XYZZY_NEVER_MATCHES`},
+	}
+
+	inputs := []struct {
+		name string
+		size int
+	}{
+		{"64B", 64},
+		{"512B", 512},
+		{"4KB", 4 * 1024},
+		{"64KB", 64 * 1024},
+		{"256KB", 256 * 1024},
+	}
+
+	// Threshold 0 sends every input, whatever its size, down the RE2 path.
+	withThreshold(t, 0)
+	for _, in := range inputs {
+		corpus := makeCorpus(in.size)
+		for _, c := range cases {
+			t.Run(in.name+"/"+c.name, func(t *testing.T) {
+				got := MustCompile(c.pattern).FindAllIndex(corpus, -1)
+				want := grafanaregexp.MustCompile(c.pattern).FindAllIndex(corpus, -1)
+				if equalIndexSlices(got, want) {
+					return
+				}
+
+				t.Errorf("pattern=%q size=%d: len(hybrid)=%d len(grafana)=%d",
+					c.pattern, in.size, len(got), len(want))
+				if len(got) > 0 && len(want) > 0 {
+					t.Errorf(" first hybrid=%v first grafana=%v", got[0], want[0])
+				}
+			})
+		}
+	}
+}
+
+// TestFindAllIndexLimitN checks that a match-count limit of n=1 yields the
+// same result from the hybrid matcher (RE2 forced) as from grafana/regexp.
+func TestFindAllIndexLimitN(t *testing.T) {
+	const limit = 1
+	corpus := makeCorpus(64 * 1024) // large enough to have many matches
+
+	withThreshold(t, 0) // force re2 path
+	for _, expr := range []string{`func\s+\w+`, `\bvar\b`, `[A-Z]\w*`} {
+		got := MustCompile(expr).FindAllIndex(corpus, limit)
+		want := grafanaregexp.MustCompile(expr).FindAllIndex(corpus, limit)
+
+		if !equalIndexSlices(got, want) {
+			t.Errorf("n=1, pattern=%q: hybrid=%v grafana=%v", expr, got, want)
+		}
+		// Independent of the comparison above: the limit itself must hold.
+		if len(got) > limit {
+			t.Errorf("n=1, pattern=%q: got %d matches, want <= 1", expr, len(got))
+		}
+	}
+}
+
+// TestNoMatchReturnsEmpty checks that a pattern with no occurrence in the
+// corpus yields a zero-length result with RE2 both disabled and forced.
+func TestNoMatchReturnsEmpty(t *testing.T) {
+	corpus := makeCorpus(1024)
+
+	for _, limit := range []int64{disabled, 0} {
+		t.Run(fmt.Sprintf("thresh=%d", limit), func(t *testing.T) {
+			// Override first: Compile decides whether to build the RE2
+			// variant from the threshold in effect when it is called.
+			withThreshold(t, limit)
+			matches := MustCompile(`XYZZY_NEVER_MATCHES`).FindAllIndex(corpus, -1)
+			if len(matches) == 0 {
+				return
+			}
+			t.Errorf("thresh=%d: expected empty, got %v", limit, matches)
+		})
+	}
+}
+
+// ---- benchmarks ----
+
+// BenchmarkEngines times FindAllIndex for grafana/regexp and for the hybrid
+// matcher with RE2 forced, over a grid of patterns and input sizes.
+//
+// Run with:
+//
+//	go test -bench=BenchmarkEngines -benchmem -benchtime=3s ./internal/hybridre2/
+func BenchmarkEngines(b *testing.B) {
+	cases := []struct {
+		name    string
+		pattern string
+	}{
+		{"literal", `main`},
+		{"case-insensitive", `(?i)func`},
+		{"alternation-5", `func|var|const|type|import`},
+		{"complex", `(func|var)\s+[A-Z]\w*\s*\(`},
+		{"hard-no-match", `XYZZY_NEVER_MATCHES_AT_ALL`},
+	}
+
+	inputs := []struct {
+		name string
+		size int
+	}{
+		{"512B", 512},
+		{"4KB", 4 * 1024},
+		{"32KB", 32 * 1024},
+		{"128KB", 128 * 1024},
+		{"512KB", 512 * 1024},
+	}
+
+	// Build each corpus once, up front, so no sub-benchmark pays for it.
+	corpora := make([][]byte, len(inputs))
+	for idx, in := range inputs {
+		corpora[idx] = makeCorpus(in.size)
+	}
+
+	for _, c := range cases {
+		baseline := grafanaregexp.MustCompile(c.pattern)
+
+		for idx, in := range inputs {
+			corpus := corpora[idx]
+			label := c.name + "/" + in.name
+
+			b.Run("grafana/"+label, func(b *testing.B) {
+				b.SetBytes(int64(len(corpus)))
+				b.ResetTimer()
+				for iter := 0; iter < b.N; iter++ {
+					_ = baseline.FindAllIndex(corpus, -1)
+				}
+			})
+
+			b.Run("go-re2/"+label, func(b *testing.B) {
+				withThreshold(b, 0) // force re2 for all sizes
+				// Compile only after the override: whether the RE2
+				// variant exists is fixed when Compile runs, not when
+				// FindAllIndex is called.
+				forced := MustCompile(c.pattern)
+				b.SetBytes(int64(len(corpus)))
+				b.ResetTimer()
+				for iter := 0; iter < b.N; iter++ {
+					_ = forced.FindAllIndex(corpus, -1)
+				}
+			})
+		}
+	}
+}
+
+// ---- helpers ----
+
+// makeCorpus returns exactly size bytes of Go-like source text, produced by
+// repeating a fixed template and truncating the result.
+func makeCorpus(size int) []byte {
+	const template = `package main
+
+import (
+	"fmt"
+	"strings"
+)
+
+// Foo is an exported function that transforms its input.
+func Foo(input string) string {
+	return strings.ToUpper(input)
+}
+
+// Bar demonstrates calling Foo.
+func Bar() {
+	result := Foo("hello world")
+	fmt.Println(result)
+}
+
+var globalVar = "some value"
+const MaxItems = 100
+
+type MyStruct struct {
+	Name  string
+	Value int
+}
+
+func (m MyStruct) String() string {
+	return fmt.Sprintf("%s=%d", m.Name, m.Value)
+}
+
+`
+	out := make([]byte, 0, size)
+	for len(out) < size {
+		out = append(out, template...)
+	}
+	return out[:size]
+}
+
+// equalIndexSlices reports whether a and b hold the same index pairs in the
+// same order. Lengths are compared first, so nil and empty compare equal.
+func equalIndexSlices(a, b [][]int) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i, pair := range a {
+		if !equalIntSlice(pair, b[i]) {
+			return false
+		}
+	}
+	return true
+}
+
+// equalIntSlice reports whether a and b have equal length and equal elements.
+func equalIntSlice(a, b []int) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i, v := range a {
+		if v != b[i] {
+			return false
+		}
+	}
+	return true
+}