From 5cae09d6a6a0dbc8bc63f0fca9dced904c524de6 Mon Sep 17 00:00:00 2001
From: Vladimir Sitnikov
Date: Wed, 3 Dec 2025 13:53:53 +0300
Subject: [PATCH] feat: use git ls-files to skip files ignored by git

Ideally, Files.walkFileTree could be replaced with iteration over the
detected list of files; however, the current change is a minimal one
to improve the performance.

Fixes https://github.com/exadmin/CyberFerret/issues/7
---
Review notes (ignored by git am):
 * A failing "git ls-files" (for example, not a git repository) now triggers
   the full directory scan instead of producing an empty file set.
 * File names are read NUL-separated (-z) as UTF-8, stderr is discarded.
 * Paths are no longer normalized, so they match Files.walkFileTree output.
 * Unused helpers were removed; the blob id of the new file below is stale.

 .../cyberferret/async/RunnableScanner.java    |  22 ++-
 .../cyberferret/utils/GitFileDiscovery.java   | 121 ++++++++++++++++++
 2 files changed, 139 insertions(+), 4 deletions(-)
 create mode 100644 src/main/java/com/github/exadmin/cyberferret/utils/GitFileDiscovery.java

diff --git a/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java b/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java
index d557c2e..218030c 100644
--- a/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java
+++ b/src/main/java/com/github/exadmin/cyberferret/async/RunnableScanner.java
@@ -9,6 +9,7 @@
 import com.github.exadmin.cyberferret.model.FoundPathItem;
 import com.github.exadmin.cyberferret.model.ItemType;
 import com.github.exadmin.cyberferret.utils.FileUtils;
+import com.github.exadmin.cyberferret.utils.GitFileDiscovery;
 import com.github.exadmin.cyberferret.utils.MiscUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -17,10 +18,7 @@
 import java.io.IOException;
 import java.nio.file.*;
 import java.nio.file.attribute.BasicFileAttributes;
-import java.util.ArrayDeque;
-import java.util.Deque;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.Executors;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.regex.Matcher;
@@ -94,6 +92,16 @@ protected void _run() throws IOException {
 
         final ExcludeFileModel excludeFileModel = tmpExcludeFileModel;
 
+        // Try to discover files using git (respects .gitignore)
+        Optional<Set<Path>> gitFilesOpt = GitFileDiscovery.discoverGitFiles(rootDir);
+        Set<Path> gitFiles = gitFilesOpt.orElse(null);
+
+        if (gitFiles != null) {
+            log.info("Using git-based file discovery. Found {} files to scan (respecting .gitignore)", gitFiles.size());
+        } else {
+            log.info("Git not available or not a git repository. Using full directory scan.");
+        }
+
         // load files first
         Deque<FoundPathItem> parentsDeque = new ArrayDeque<>();
         Files.walkFileTree(rootDir, new FileVisitor<>() {
@@ -118,6 +126,12 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) {
             public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                 log.debug("Visiting file {}", file);
 
+                // If using git-based discovery, skip files not in the git list
+                if (gitFiles != null && !gitFiles.contains(file)) {
+                    log.debug("Skipping git-ignored file: {}", file);
+                    return FileVisitResult.CONTINUE;
+                }
+
                 FoundPathItem parent = parentsDeque.peekLast();
                 FoundPathItem foundPathItem = new FoundPathItem(file, ItemType.FILE, parent);
                 foundItemsContainer.addItem(foundPathItem);
diff --git a/src/main/java/com/github/exadmin/cyberferret/utils/GitFileDiscovery.java b/src/main/java/com/github/exadmin/cyberferret/utils/GitFileDiscovery.java
new file mode 100644
index 0000000..14d40db
--- /dev/null
+++ b/src/main/java/com/github/exadmin/cyberferret/utils/GitFileDiscovery.java
@@ -0,0 +1,121 @@
+package com.github.exadmin.cyberferret.utils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+
+/**
+ * Utility class for discovering files in a git repository while respecting .gitignore rules.
+ * Uses git CLI commands to get the list of tracked and untracked (non-ignored) files.
+ *
+ * <p>Limitation: {@code git ls-files} reports a submodule as a single entry, so files inside
+ * submodules are not part of the discovered set.
+ */
+public final class GitFileDiscovery {
+    private static final Logger log = LoggerFactory.getLogger(GitFileDiscovery.class);
+
+    private GitFileDiscovery() {
+        // Static utility class, not meant to be instantiated
+    }
+
+    /**
+     * Discovers all files that should be scanned in a git repository.
+     * This includes:
+     * - All tracked files (files known to the git index)
+     * - All untracked files that are NOT ignored by .gitignore
+     *
+     * <p>Every returned path is {@code repositoryRoot} resolved with the relative name printed by
+     * git, without further normalization. This is the same form in which
+     * {@code Files.walkFileTree(repositoryRoot, ...)} reports files, so the set can be queried
+     * with the paths handed to a file visitor.
+     *
+     * @param repositoryRoot the root directory to scan
+     * @return Optional containing a Set of file Paths to scan, or empty if git is unavailable,
+     *         the directory is not a git work tree, or the git command fails
+     */
+    public static Optional<Set<Path>> discoverGitFiles(Path repositoryRoot) {
+        try {
+            // Get tracked files
+            Set<Path> trackedFiles = executeGitLsFiles(repositoryRoot, false);
+            Set<Path> allFiles = new HashSet<>(trackedFiles);
+            log.info("Found {} tracked files", trackedFiles.size());
+
+            // Get untracked but not ignored files
+            Set<Path> untrackedFiles = executeGitLsFiles(repositoryRoot, true);
+            allFiles.addAll(untrackedFiles);
+            log.info("Found {} untracked (non-ignored) files", untrackedFiles.size());
+
+            log.info("Total files to scan from git: {}", allFiles.size());
+            return Optional.of(allFiles);
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so callers can still observe the interruption
+            Thread.currentThread().interrupt();
+            log.warn("Interrupted while discovering files via git, will fall back to regular file scan");
+            return Optional.empty();
+        } catch (IOException | RuntimeException e) {
+            log.warn("Error discovering files via git, will fall back to regular file scan: {}", e.getMessage());
+            return Optional.empty();
+        }
+    }
+
+    /**
+     * Executes git ls-files command to get list of files.
+     *
+     * @param gitRoot the directory git is run in; printed names are resolved against it
+     * @param untrackedOnly if true, gets untracked non-ignored files; if false, gets tracked files
+     * @return Set of existing regular files, each in the form {@code gitRoot.resolve(name)}
+     * @throws IOException if git cannot be started or exits with a non-zero code
+     * @throws InterruptedException if the current thread is interrupted while waiting for git
+     */
+    private static Set<Path> executeGitLsFiles(Path gitRoot, boolean untrackedOnly) throws IOException, InterruptedException {
+        // -z prints NUL-terminated names without quoting, so names that contain spaces,
+        // quotes or non-ASCII characters are received exactly as they are on disk
+        List<String> command = new ArrayList<>(List.of("git", "ls-files", "-z"));
+        if (untrackedOnly) {
+            // Get untracked files that are NOT ignored
+            command.add("--others");
+            command.add("--exclude-standard");
+        }
+
+        ProcessBuilder pb = new ProcessBuilder(command);
+        pb.directory(gitRoot.toFile());
+        // Keep git's diagnostics out of the file list; a failure is detected via the exit code
+        pb.redirectError(ProcessBuilder.Redirect.DISCARD);
+
+        Process process = pb.start();
+        try {
+            byte[] rawOutput = process.getInputStream().readAllBytes();
+            int exitCode = process.waitFor();
+            if (exitCode != 0) {
+                // For example "not a git repository": returning an empty set here would make the
+                // caller skip every file, so report the failure and let it fall back to a full scan
+                throw new IOException("git ls-files exited with code " + exitCode);
+            }
+
+            Set<Path> files = new HashSet<>();
+            for (String name : new String(rawOutput, StandardCharsets.UTF_8).split("\0")) {
+                if (name.isEmpty()) {
+                    continue;
+                }
+                Path filePath = gitRoot.resolve(name);
+                // Only include if it's a regular file (not a directory, not deleted from disk)
+                if (Files.isRegularFile(filePath)) {
+                    files.add(filePath);
+                }
+            }
+            return files;
+        } finally {
+            // No-op after a normal exit; stops git if reading or waiting was aborted
+            process.destroy();
+        }
+    }
+}