Skip to content

Commit 34a1a4b

Browse files
CM-55207-Fix Secrets documents collecting for diff scan
1 parent 700654f commit 34a1a4b

File tree

3 files changed

+100
-10
lines changed

3 files changed

+100
-10
lines changed

cycode/cli/apps/scan/commit_range_scanner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def _scan_commit_range_documents(
186186
def _scan_sca_commit_range(ctx: typer.Context, repo_path: str, commit_range: str, **_) -> None:
187187
scan_parameters = get_scan_parameters(ctx, (repo_path,))
188188

189-
from_commit_rev, to_commit_rev = parse_commit_range(commit_range, repo_path)
189+
from_commit_rev, to_commit_rev, _ = parse_commit_range(commit_range, repo_path)
190190
from_commit_documents, to_commit_documents, _ = get_commit_range_modified_documents(
191191
ctx.obj['progress_bar'], ScanProgressBarSection.PREPARE_LOCAL_FILES, repo_path, from_commit_rev, to_commit_rev
192192
)
@@ -227,7 +227,7 @@ def _scan_secret_commit_range(
227227
def _scan_sast_commit_range(ctx: typer.Context, repo_path: str, commit_range: str, **_) -> None:
228228
scan_parameters = get_scan_parameters(ctx, (repo_path,))
229229

230-
from_commit_rev, to_commit_rev = parse_commit_range(commit_range, repo_path)
230+
from_commit_rev, to_commit_rev, _ = parse_commit_range(commit_range, repo_path)
231231
_, commit_documents, diff_documents = get_commit_range_modified_documents(
232232
ctx.obj['progress_bar'],
233233
ScanProgressBarSection.PREPARE_LOCAL_FILES,

cycode/cli/files_collector/commit_range_documents.py

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,15 @@ def collect_commit_range_diff_documents(
6767
commit_documents_to_scan = []
6868

6969
repo = git_proxy.get_repo(path)
70-
total_commits_count = int(repo.git.rev_list('--count', commit_range))
71-
logger.debug('Calculating diffs for %s commits in the commit range %s', total_commits_count, commit_range)
70+
71+
normalized_commit_range = normalize_commit_range(commit_range, path)
72+
73+
total_commits_count = int(repo.git.rev_list('--count', normalized_commit_range))
74+
logger.debug('Calculating diffs for %s commits in the commit range %s', total_commits_count, normalized_commit_range)
7275

7376
progress_bar.set_section_length(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count)
7477

75-
for scanned_commits_count, commit in enumerate(repo.iter_commits(rev=commit_range)):
78+
for scanned_commits_count, commit in enumerate(repo.iter_commits(rev=normalized_commit_range)):
7679
if _does_reach_to_max_commits_to_scan_limit(commit_ids_to_scan, max_commits_count):
7780
logger.debug('Reached to max commits to scan count. Going to scan only %s last commits', max_commits_count)
7881
progress_bar.update(ScanProgressBarSection.PREPARE_LOCAL_FILES, total_commits_count - scanned_commits_count)
@@ -96,7 +99,12 @@ def collect_commit_range_diff_documents(
9699

97100
logger.debug(
98101
'Found all relevant files in commit %s',
99-
{'path': path, 'commit_range': commit_range, 'commit_id': commit_id},
102+
{
103+
'path': path,
104+
'commit_range': commit_range,
105+
'normalized_commit_range': normalized_commit_range,
106+
'commit_id': commit_id,
107+
},
100108
)
101109

102110
logger.debug('List of commit ids to scan, %s', {'commit_ids': commit_ids_to_scan})
@@ -428,8 +436,8 @@ def get_pre_commit_modified_documents(
428436
return git_head_documents, pre_committed_documents, diff_documents
429437

430438

431-
def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Optional[str]]:
432-
"""Parses a git commit range string and returns the full SHAs for the 'from' and 'to' commits.
439+
def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
440+
"""Parses a git commit range string and returns the full SHAs for the 'from' and 'to' commits as well as the separator.
433441
434442
Supports:
435443
- 'from..to'
@@ -442,8 +450,10 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt
442450

443451
if '...' in commit_range:
444452
from_spec, to_spec = commit_range.split('...', 1)
453+
separator = '...'
445454
elif '..' in commit_range:
446455
from_spec, to_spec = commit_range.split('..', 1)
456+
separator = '..'
447457
else:
448458
# Git commands like 'git diff <commit>' compare against HEAD.
449459
from_spec = commit_range
@@ -459,7 +469,32 @@ def parse_commit_range(commit_range: str, path: str) -> tuple[Optional[str], Opt
459469
# Use rev_parse to resolve each specifier to its full commit SHA
460470
from_commit_rev = repo.rev_parse(from_spec).hexsha
461471
to_commit_rev = repo.rev_parse(to_spec).hexsha
462-
return from_commit_rev, to_commit_rev
472+
return from_commit_rev, to_commit_rev, separator
463473
except git_proxy.get_git_command_error() as e:
464474
logger.warning("Failed to parse commit range '%s'", commit_range, exc_info=e)
465-
return None, None
475+
return None, None, None
476+
477+
def normalize_commit_range(commit_range: str, path: str) -> str:
478+
"""Normalize a commit range string to handle various formats consistently with all scan types.
479+
480+
Returns:
481+
A normalized commit range string suitable for Git operations (e.g., 'full_sha1..full_sha2')
482+
"""
483+
from_commit_rev, to_commit_rev, separator = parse_commit_range(commit_range, path)
484+
if from_commit_rev is None or to_commit_rev is None:
485+
logger.warning(
486+
'Failed to parse commit range "%s", falling back to raw string. This may cause unexpected behavior.',
487+
commit_range
488+
)
489+
# Fall back to using the raw commit_range string
490+
return commit_range
491+
492+
# Construct a normalized range string using the original separator for iter_commits
493+
# This preserves the semantics of two-dot vs three-dot syntax
494+
normalized_commit_range = f'{from_commit_rev}{separator}{to_commit_rev}'
495+
logger.debug(
496+
'Normalized commit range "%s" to "%s"',
497+
commit_range,
498+
normalized_commit_range,
499+
)
500+
return normalized_commit_range

tests/cli/files_collector/test_commit_range_documents.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
_get_default_branches_for_merge_base,
1414
calculate_pre_push_commit_range,
1515
calculate_pre_receive_commit_range,
16+
collect_commit_range_diff_documents,
1617
get_diff_file_path,
1718
get_safe_head_reference_for_diff,
1819
parse_commit_range,
@@ -1047,3 +1048,57 @@ def test_initial_oldest_commit_without_parent_with_two_commits_returns_single_co
10471048
work_repo.close()
10481049
finally:
10491050
server_repo.close()
1051+
1052+
1053+
class TestCollectCommitRangeDiffDocuments:
1054+
"""Test the collect_commit_range_diff_documents function with various commit range formats."""
1055+
1056+
def test_collect_with_various_commit_range_formats(self) -> None:
1057+
"""Test that different commit range formats are normalized and work correctly."""
1058+
with temporary_git_repository() as (temp_dir, repo):
1059+
# Create three commits
1060+
a_file = os.path.join(temp_dir, 'a.txt')
1061+
with open(a_file, 'w') as f:
1062+
f.write('A')
1063+
repo.index.add(['a.txt'])
1064+
a_commit = repo.index.commit('A')
1065+
1066+
with open(a_file, 'a') as f:
1067+
f.write('B')
1068+
repo.index.add(['a.txt'])
1069+
b_commit = repo.index.commit('B')
1070+
1071+
with open(a_file, 'a') as f:
1072+
f.write('C')
1073+
repo.index.add(['a.txt'])
1074+
c_commit = repo.index.commit('C')
1075+
1076+
# Create mock context
1077+
mock_ctx = Mock()
1078+
mock_progress_bar = Mock()
1079+
mock_progress_bar.set_section_length = Mock()
1080+
mock_progress_bar.update = Mock()
1081+
mock_ctx.obj = {'progress_bar': mock_progress_bar}
1082+
1083+
# Test two-dot range - should collect documents from commits B and C (2 commits, 2 documents)
1084+
commit_range = f'{a_commit.hexsha}..{c_commit.hexsha}'
1085+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1086+
assert len(documents) == 2, f'Expected 2 documents from range A..C, got {len(documents)}'
1087+
commit_ids_in_documents = {doc.unique_id for doc in documents if doc.unique_id}
1088+
assert b_commit.hexsha in commit_ids_in_documents and c_commit.hexsha in commit_ids_in_documents
1089+
1090+
# Test three-dot range - should collect documents from commits B and C (2 commits, 2 documents)
1091+
commit_range = f'{a_commit.hexsha}...{c_commit.hexsha}'
1092+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1093+
assert len(documents) == 2, f'Expected 2 documents from range A...C, got {len(documents)}'
1094+
1095+
# Test parent notation with three-dot - should collect document from commit C (1 commit, 1 document)
1096+
commit_range = f'{c_commit.hexsha}~1...{c_commit.hexsha}'
1097+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1098+
assert len(documents) == 1, f'Expected 1 document from range C~1...C, got {len(documents)}'
1099+
assert documents[0].unique_id == c_commit.hexsha
1100+
1101+
# Test single commit spec - should be interpreted as A..HEAD (commits B and C, 2 documents)
1102+
commit_range = a_commit.hexsha
1103+
documents = collect_commit_range_diff_documents(mock_ctx, temp_dir, commit_range)
1104+
assert len(documents) == 2, f'Expected 2 documents from single commit A (interpreted as A..HEAD), got {len(documents)}'

0 commit comments

Comments
 (0)