Skip to content

Commit c405695

Browse files
authored
Merge pull request #1048 from GitGuardian/salomevoltz/scrt-5247-ggshield-crashed-on-windows-because-of-git-command-too-long
fix: secret scan pre-commit crashing on big merges
2 parents 8b6464d + 4af42f9 commit c405695

File tree

4 files changed

+58
-17
lines changed

4 files changed

+58
-17
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
### Fixed
2+
3+
- Fix `secret scan pre-commit` crashing on big merges (#1032).

ggshield/core/scan/commit.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .commit_information import CommitInformation
99
from .commit_utils import (
1010
DIFF_EMPTY_COMMIT_INFO_BLOCK,
11+
MAX_FILES_PER_GIT_COMMAND,
1112
PATCH_COMMON_ARGS,
1213
PATCH_PREFIX,
1314
STAGED_PREFIX,
@@ -96,14 +97,13 @@ def from_merge(
9697
files_to_scan.add(path)
9798

9899
def parser_merge(commit: "Commit") -> Iterable[Scannable]:
99-
patch = git(
100-
["diff", "--staged", *PATCH_COMMON_ARGS, *files_to_scan], cwd=cwd
101-
)
102-
yield from parse_patch(
103-
STAGED_PREFIX,
104-
DIFF_EMPTY_COMMIT_INFO_BLOCK + patch,
105-
exclusion_regexes,
106-
)
100+
for files in batched(files_to_scan, MAX_FILES_PER_GIT_COMMAND):
101+
patch = git(["diff", "--staged", *PATCH_COMMON_ARGS, *files], cwd=cwd)
102+
yield from parse_patch(
103+
STAGED_PREFIX,
104+
DIFF_EMPTY_COMMIT_INFO_BLOCK + patch,
105+
exclusion_regexes,
106+
)
107107

108108
info = CommitInformation.from_staged(cwd)
109109
return Commit(sha=None, patch_parser=parser_merge, info=info)

ggshield/core/scan/commit_utils.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from ggshield.utils.files import is_path_excluded
77
from ggshield.utils.git_shell import Filemode, git
8+
from ggshield.utils.itertools import batched
89

910
from .scannable import Scannable
1011

@@ -43,6 +44,8 @@
4344
r"^(?P<at>@@+) (?P<from>-\d+(?:,\d+)?) .* (?P<to>\+\d+(?:,\d+)?) @@+(?P<trailing_content>.+)?"
4445
)
4546

47+
MAX_FILES_PER_GIT_COMMAND = 100
48+
4649

4750
class PatchParseError(Exception):
4851
"""
@@ -370,10 +373,11 @@ def get_file_sha_in_ref(
370373
"""
371374
Helper function to get the shas of files in the git reference.
372375
"""
373-
output = git(["ls-tree", "-z", ref] + files, cwd=cwd)
374-
for line in output.split("\0")[:-1]:
375-
_, _, sha, path = line.split(maxsplit=3)
376-
yield (path, sha)
376+
for files in batched(files, MAX_FILES_PER_GIT_COMMAND):
377+
output = git(["ls-tree", "-z", ref] + files, cwd=cwd)
378+
for line in output.split("\0")[:-1]:
379+
_, _, sha, path = line.split(maxsplit=3)
380+
yield (path, sha)
377381

378382

379383
def get_file_sha_stage(
@@ -382,10 +386,11 @@ def get_file_sha_stage(
382386
"""
383387
Helper function to get the shas currently staged of files.
384388
"""
385-
output = git(["ls-files", "--stage", "-z"] + files, cwd=cwd)
386-
for line in output.split("\0")[:-1]:
387-
_, sha, _, path = line.split(maxsplit=3)
388-
yield (path, sha)
389+
for files in batched(files, MAX_FILES_PER_GIT_COMMAND):
390+
output = git(["ls-files", "--stage", "-z"] + files, cwd=cwd)
391+
for line in output.split("\0")[:-1]:
392+
_, sha, _, path = line.split(maxsplit=3)
393+
yield (path, sha)
389394

390395

391396
def get_diff_files(cwd: Optional[Path] = None) -> List[str]:

tests/unit/core/scan/test_commit_utils.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
1+
import tempfile
12
from pathlib import Path
23
from typing import Optional, Tuple
34

45
import pytest
56

6-
from ggshield.core.scan.commit_utils import PatchFileInfo, convert_multi_parent_diff
7+
from ggshield.core.scan.commit_utils import (
8+
PatchFileInfo,
9+
convert_multi_parent_diff,
10+
get_file_sha_in_ref,
11+
)
712
from ggshield.utils.git_shell import Filemode
13+
from tests.repository import Repository
814

915

1016
@pytest.mark.parametrize(
@@ -124,3 +130,30 @@ def test_convert_multi_parent_diff(diff: str, expected: str):
124130
expected = expected.strip()
125131
result = convert_multi_parent_diff(diff)
126132
assert result == expected
133+
134+
135+
def test_get_file_sha_in_ref():
136+
"""
137+
Assert that get_file_sha_in_ref doesn't crash when called
138+
with a large number of files
139+
"""
140+
with tempfile.TemporaryDirectory() as tmp_path_str:
141+
142+
tmp_path = Path(tmp_path_str)
143+
repo = Repository.create(tmp_path)
144+
145+
for i in range(200):
146+
file_path = tmp_path / f"{i:0200d}.txt"
147+
file_path.write_text("dummy content")
148+
149+
repo.add(".")
150+
repo.create_commit("Add 200 dummy files")
151+
152+
try:
153+
files = [f"{i:0200d}.txt" for i in range(200)]
154+
result = list(get_file_sha_in_ref("HEAD", files))
155+
156+
assert isinstance(result, list), "The result should be a list."
157+
158+
except Exception as e:
159+
assert False, f"get_file_sha_in_ref crashed with error: {e}"

0 commit comments

Comments
 (0)