From 7a68f465d9f41cd70d5b8b0cd8a5cd72b06dc27a Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Tue, 23 Dec 2025 12:04:47 +0000 Subject: [PATCH 1/2] irept::string_hash: irept hashing using string content Compute hash values with respect to string content, not just string container indices (as irept::hash and irept::full_hash do). --- src/util/irep.cpp | 27 +++++++++++++++++++++++++++ src/util/irep.h | 6 ++++++ 2 files changed, 33 insertions(+) diff --git a/src/util/irep.cpp b/src/util/irep.cpp index 36e103ea8ff..467e08e17d4 100644 --- a/src/util/irep.cpp +++ b/src/util/irep.cpp @@ -450,6 +450,33 @@ std::size_t irept::hash() const return result; } +std::size_t irept::string_hash() const +{ + const irept::subt &sub = get_sub(); + const irept::named_subt &named_sub = get_named_sub(); + + std::size_t result = hash_string(id2string(id())); + + for(const auto &irep : sub) + result = hash_combine(result, irep.string_hash()); + + std::size_t number_of_named_ireps = 0; + + for(const auto &irep_entry : named_sub) + { + if(!is_comment(irep_entry.first)) // this variant ignores comments + { + result = hash_combine(result, hash_string(id2string(irep_entry.first))); + result = hash_combine(result, irep_entry.second.string_hash()); + number_of_named_ireps++; + } + } + + result = hash_finalize(result, sub.size() + number_of_named_ireps); + + return result; +} + std::size_t irept::full_hash() const { const irept::subt &sub=get_sub(); diff --git a/src/util/irep.h b/src/util/irep.h index 28f87062c48..d942b7e9f72 100644 --- a/src/util/irep.h +++ b/src/util/irep.h @@ -452,6 +452,12 @@ class irept std::size_t hash() const; std::size_t full_hash() const; + /// Compute hash value while hashing the actual string content instead of + /// string indices. These hash values remain stable even when the string + /// container is built in a different order. Note that this can be much slower + /// than using `hash()` as string lookups are required, strings need to be + /// iterated over, and the results are not cached. + std::size_t string_hash() const; bool full_eq(const irept &other) const; From 66ee0c0b532e67016fcfd73b872817e42b283f33 Mon Sep 17 00:00:00 2001 From: Michael Tautschnig Date: Tue, 23 Dec 2025 11:09:53 +0100 Subject: [PATCH 2/2] Hash-based loop identifiers compute_loop_hashes() uses AST fingerprinting based on loop structure. This yields loop identifiers that are stable across unrelated code changes. --- regression/cbmc/loop-hash-nested/test.c | 19 + regression/cbmc/loop-hash-nested/test.desc | 8 + regression/cbmc/loop-hash-stability/test.c | 24 ++ regression/cbmc/loop-hash-stability/test.desc | 8 + .../cbmc/loop-hash-stability/test_modified.c | 38 ++ .../loop-hash-stability/test_modified.desc | 8 + regression/cbmc/loop-hash-types/test.c | 31 ++ regression/cbmc/loop-hash-types/test.desc | 8 + regression/goto-programs/CMakeLists.txt | 3 + regression/goto-programs/Makefile | 19 + regression/goto-programs/chain.sh | 7 + .../loop-hash-nested-01/chain.sh | 17 + .../goto-programs/loop-hash-nested-01/main.c | 25 ++ .../loop-hash-nested-01/test.desc | 7 + .../loop-hash-sensitivity-01/chain.sh | 17 + .../loop-hash-sensitivity-01/main.c | 24 ++ .../loop-hash-sensitivity-01/test.desc | 7 + .../loop-hash-stability-01/chain.sh | 18 + .../loop-hash-stability-01/main.c | 31 ++ .../loop-hash-stability-01/test.desc | 8 + .../loop-hash-stability-02/chain.sh | 30 ++ .../loop-hash-stability-02/original.c | 24 ++ .../loop-hash-stability-02/shifted.c | 43 ++ .../loop-hash-stability-02/test.desc | 7 + src/goto-programs/goto_functions.cpp | 8 + src/goto-programs/goto_functions.h | 2 + src/goto-programs/goto_program.cpp | 72 ++++ src/goto-programs/goto_program.h | 15 +- src/goto-programs/loop_ids.h | 33 +- unit/Makefile | 1 + unit/goto-programs/loop_hash.cpp | 400 ++++++++++++++++++ 31 files changed, 955 insertions(+), 7 deletions(-) create mode 100644 regression/cbmc/loop-hash-nested/test.c create mode 100644 regression/cbmc/loop-hash-nested/test.desc create mode 100644 regression/cbmc/loop-hash-stability/test.c create mode 100644 regression/cbmc/loop-hash-stability/test.desc create mode 100644 regression/cbmc/loop-hash-stability/test_modified.c create mode 100644 regression/cbmc/loop-hash-stability/test_modified.desc create mode 100644 regression/cbmc/loop-hash-types/test.c create mode 100644 regression/cbmc/loop-hash-types/test.desc create mode 100644 regression/goto-programs/CMakeLists.txt create mode 100644 regression/goto-programs/Makefile create mode 100755 regression/goto-programs/chain.sh create mode 100755 regression/goto-programs/loop-hash-nested-01/chain.sh create mode 100644 regression/goto-programs/loop-hash-nested-01/main.c create mode 100644 regression/goto-programs/loop-hash-nested-01/test.desc create mode 100755 regression/goto-programs/loop-hash-sensitivity-01/chain.sh create mode 100644 regression/goto-programs/loop-hash-sensitivity-01/main.c create mode 100644 regression/goto-programs/loop-hash-sensitivity-01/test.desc create mode 100755 regression/goto-programs/loop-hash-stability-01/chain.sh create mode 100644 regression/goto-programs/loop-hash-stability-01/main.c create mode 100644 regression/goto-programs/loop-hash-stability-01/test.desc create mode 100755 regression/goto-programs/loop-hash-stability-02/chain.sh create mode 100644 regression/goto-programs/loop-hash-stability-02/original.c create mode 100644 regression/goto-programs/loop-hash-stability-02/shifted.c create mode 100644 regression/goto-programs/loop-hash-stability-02/test.desc create mode 100644 unit/goto-programs/loop_hash.cpp diff --git a/regression/cbmc/loop-hash-nested/test.c b/regression/cbmc/loop-hash-nested/test.c new file mode 100644 index 00000000000..8eb42c853b8 --- /dev/null +++ b/regression/cbmc/loop-hash-nested/test.c @@ -0,0 +1,19 @@ +// Test program with nested loops +// Each loop should have a unique hash identifier + +int main() +{ + int sum = 0; + + // Outer loop + for(int i = 0; i < 3; i++) + { + // Inner loop + for(int j = 0; j < 3; j++) + { + sum += i * j; + } + } + + return sum; +} diff --git a/regression/cbmc/loop-hash-nested/test.desc b/regression/cbmc/loop-hash-nested/test.desc new file mode 100644 index 00000000000..044b3a5e406 --- /dev/null +++ b/regression/cbmc/loop-hash-nested/test.desc @@ -0,0 +1,8 @@ +CORE +test.c +--show-loops --unwind 10 +^EXIT=0$ +^SIGNAL=0$ +^main\.hash_[0-9]+$ +-- +^warning: ignoring diff --git a/regression/cbmc/loop-hash-stability/test.c b/regression/cbmc/loop-hash-stability/test.c new file mode 100644 index 00000000000..6eef65c668c --- /dev/null +++ b/regression/cbmc/loop-hash-stability/test.c @@ -0,0 +1,24 @@ +// Test program to demonstrate hash-based loop identification +// The loop identifiers should remain stable even when we add +// unrelated code before or after the loops. + +int main() +{ + int sum = 0; + + // Loop 1: Simple for loop + for(int i = 0; i < 10; i++) + { + sum += i; + } + + // Loop 2: While loop + int j = 0; + while(j < 5) + { + sum += j * 2; + j++; + } + + return sum; +} diff --git a/regression/cbmc/loop-hash-stability/test.desc b/regression/cbmc/loop-hash-stability/test.desc new file mode 100644 index 00000000000..044b3a5e406 --- /dev/null +++ b/regression/cbmc/loop-hash-stability/test.desc @@ -0,0 +1,8 @@ +CORE +test.c +--show-loops --unwind 10 +^EXIT=0$ +^SIGNAL=0$ +^main\.hash_[0-9]+$ +-- +^warning: ignoring diff --git a/regression/cbmc/loop-hash-stability/test_modified.c b/regression/cbmc/loop-hash-stability/test_modified.c new file mode 100644 index 00000000000..afa5e43a376 --- /dev/null +++ b/regression/cbmc/loop-hash-stability/test_modified.c @@ -0,0 +1,38 @@ +// Modified test program with pseudo-loops added before the real loops +// The real loops should maintain their hash identifiers + +int main() +{ + int sum = 0; + + // Add some pseudo-loops (do-while-0 pattern) that should not affect + // the hash values of the real loops below + do + { + sum += 1; + } while(0); + do + { + sum += 2; + } while(0); + do + { + sum += 3; + } while(0); + + // Loop 1: Simple for loop (same as before) + for(int i = 0; i < 10; i++) + { + sum += i; + } + + // Loop 2: While loop (same as before) + int j = 0; + while(j < 5) + { + sum += j * 2; + j++; + } + + return sum; +} diff --git a/regression/cbmc/loop-hash-stability/test_modified.desc b/regression/cbmc/loop-hash-stability/test_modified.desc new file mode 100644 index 00000000000..abc1893f302 --- /dev/null +++ b/regression/cbmc/loop-hash-stability/test_modified.desc @@ -0,0 +1,8 @@ +CORE +test_modified.c +--show-loops --unwind 10 +^EXIT=0$ +^SIGNAL=0$ +^main\.hash_[0-9]+$ +-- +^warning: ignoring diff --git a/regression/cbmc/loop-hash-types/test.c b/regression/cbmc/loop-hash-types/test.c new file mode 100644 index 00000000000..7a88917969d --- /dev/null +++ b/regression/cbmc/loop-hash-types/test.c @@ -0,0 +1,31 @@ +// Test program with different loop types +// Each loop should have a unique hash identifier + +int main() +{ + int sum = 0; + + // For loop + for(int i = 0; i < 5; i++) + { + sum += i; + } + + // While loop + int j = 0; + while(j < 5) + { + sum += j; + j++; + } + + // Do-while loop + int k = 0; + do + { + sum += k; + k++; + } while(k < 5); + + return sum; +} diff --git a/regression/cbmc/loop-hash-types/test.desc b/regression/cbmc/loop-hash-types/test.desc new file mode 100644 index 00000000000..044b3a5e406 --- /dev/null +++ b/regression/cbmc/loop-hash-types/test.desc @@ -0,0 +1,8 @@ +CORE +test.c +--show-loops --unwind 10 +^EXIT=0$ +^SIGNAL=0$ +^main\.hash_[0-9]+$ +-- +^warning: ignoring diff --git a/regression/goto-programs/CMakeLists.txt b/regression/goto-programs/CMakeLists.txt new file mode 100644 index 00000000000..12f9cc5be08 --- /dev/null +++ b/regression/goto-programs/CMakeLists.txt @@ -0,0 +1,3 @@ +# CMake configuration for goto-programs regression tests + +add_subdirectory(loop-hash-regression) diff --git a/regression/goto-programs/Makefile b/regression/goto-programs/Makefile new file mode 100644 index 00000000000..71fb22e780d --- /dev/null +++ b/regression/goto-programs/Makefile @@ -0,0 +1,19 @@ +# For goto-programs regression tests, see ../Makefile + +default: test + +include ../../src/config.inc +include ../../src/common + +ifeq ($(BUILD_ENV_),MSVC) + exe=../../../src/goto-cc/goto-cl +else + exe=../../../src/goto-cc/goto-cc +endif + +test: + @../test.pl -p -c "../chain.sh $(exe) ../../../src/cbmc/cbmc" + +clean: + find -name '*.out' -execdir $(RM) '{}' \; + find -name '*.gb' -execdir $(RM) '{}' \; diff --git a/regression/goto-programs/chain.sh b/regression/goto-programs/chain.sh new file mode 100755 index 00000000000..17f63ba94be --- /dev/null +++ b/regression/goto-programs/chain.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -e +goto_cc=$1 +cbmc=$2 +is_windows=$3 +shift 3 +exec "$@" diff --git a/regression/goto-programs/loop-hash-nested-01/chain.sh b/regression/goto-programs/loop-hash-nested-01/chain.sh new file mode 100755 index 00000000000..c812f4fac62 --- /dev/null +++ b/regression/goto-programs/loop-hash-nested-01/chain.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +goto_cc=$1 +cbmc=$2 +is_windows=$3 + +name="main" + +if [[ "${is_windows}" == "true" ]]; then + ${goto_cc} ${name}.c "/Fe${name}.gb" +else + ${goto_cc} ${name}.c -o ${name}.gb +fi + +${cbmc} --show-loops ${name}.gb | grep -E "Loop [0-9]+ hash:" | sed 's/.*hash: //' | sort -n diff --git a/regression/goto-programs/loop-hash-nested-01/main.c b/regression/goto-programs/loop-hash-nested-01/main.c new file mode 100644 index 00000000000..2e3dc118a24 --- /dev/null +++ b/regression/goto-programs/loop-hash-nested-01/main.c @@ -0,0 +1,25 @@ +// Test program with nested loops for hash uniqueness verification +// Each loop should have a unique hash identifier + +int main() +{ + int sum = 0; + + // Outer loop + for(int i = 0; i < 3; i++) + { + // Inner loop 1 + for(int j = 0; j < 3; j++) + { + sum += i * j; + } + + // Inner loop 2 (sibling to inner loop 1) + for(int k = 0; k < 2; k++) + { + sum += i + k; + } + } + + return sum; +} diff --git a/regression/goto-programs/loop-hash-nested-01/test.desc b/regression/goto-programs/loop-hash-nested-01/test.desc new file mode 100644 index 00000000000..4214819ae90 --- /dev/null +++ b/regression/goto-programs/loop-hash-nested-01/test.desc @@ -0,0 +1,7 @@ +CORE +main.c +--chain chain.sh +^EXIT=0$ +^SIGNAL=0$ +^[0-9]+$ +^[0-9]+$ diff --git a/regression/goto-programs/loop-hash-sensitivity-01/chain.sh b/regression/goto-programs/loop-hash-sensitivity-01/chain.sh new file mode 100755 index 00000000000..c812f4fac62 --- /dev/null +++ b/regression/goto-programs/loop-hash-sensitivity-01/chain.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -e + +goto_cc=$1 +cbmc=$2 +is_windows=$3 + +name="main" + +if [[ "${is_windows}" == "true" ]]; then + ${goto_cc} ${name}.c "/Fe${name}.gb" +else + ${goto_cc} ${name}.c -o ${name}.gb +fi + +${cbmc} --show-loops ${name}.gb | grep -E "Loop [0-9]+ hash:" | sed 's/.*hash: //' | sort -n diff --git a/regression/goto-programs/loop-hash-sensitivity-01/main.c b/regression/goto-programs/loop-hash-sensitivity-01/main.c new file mode 100644 index 00000000000..7d72d58dbcf --- /dev/null +++ b/regression/goto-programs/loop-hash-sensitivity-01/main.c @@ -0,0 +1,24 @@ +// Test case: Loops with CHANGED CONDITIONS +// These loops should have DIFFERENT hashes compared to test_location_stability.c +// because the loop conditions are different + +int main() +{ + int sum = 0; + + // Test loop 1: Changed upper bound from 10 to 20 + for(int i = 0; i < 20; i++) + { // Changed: was i < 10 + sum += i; + } + + // Test loop 2: Changed condition operator + int j = 0; + while(j <= 5) + { // Changed: was j < 5 + sum += j; + j++; + } + + return sum; +} diff --git a/regression/goto-programs/loop-hash-sensitivity-01/test.desc b/regression/goto-programs/loop-hash-sensitivity-01/test.desc new file mode 100644 index 00000000000..4214819ae90 --- /dev/null +++ b/regression/goto-programs/loop-hash-sensitivity-01/test.desc @@ -0,0 +1,7 @@ +CORE +main.c +--chain chain.sh +^EXIT=0$ +^SIGNAL=0$ +^[0-9]+$ +^[0-9]+$ diff --git a/regression/goto-programs/loop-hash-stability-01/chain.sh b/regression/goto-programs/loop-hash-stability-01/chain.sh new file mode 100755 index 00000000000..e160e288b1e --- /dev/null +++ b/regression/goto-programs/loop-hash-stability-01/chain.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -e + +goto_cc=$1 +cbmc=$2 +is_windows=$3 + +name="main" + +if [[ "${is_windows}" == "true" ]]; then + ${goto_cc} ${name}.c "/Fe${name}.gb" +else + ${goto_cc} ${name}.c -o ${name}.gb +fi + +# Show loops and extract hash values +${cbmc} --show-loops ${name}.gb | grep -E "Loop [0-9]+ hash:" | sed 's/.*hash: //' | sort -n diff --git a/regression/goto-programs/loop-hash-stability-01/main.c b/regression/goto-programs/loop-hash-stability-01/main.c new file mode 100644 index 00000000000..587ec26d212 --- /dev/null +++ b/regression/goto-programs/loop-hash-stability-01/main.c @@ -0,0 +1,31 @@ +// Test program with simple loops for hash stability verification +// The loop hashes should remain stable across multiple runs + +int main() +{ + int sum = 0; + + // Loop 1: Simple for loop + for(int i = 0; i < 10; i++) + { + sum += i; + } + + // Loop 2: While loop + int j = 0; + while(j < 5) + { + sum += j * 2; + j++; + } + + // Loop 3: Do-while loop + int k = 0; + do + { + sum += k; + k++; + } while(k < 3); + + return sum; +} diff --git a/regression/goto-programs/loop-hash-stability-01/test.desc b/regression/goto-programs/loop-hash-stability-01/test.desc new file mode 100644 index 00000000000..3340972f770 --- /dev/null +++ b/regression/goto-programs/loop-hash-stability-01/test.desc @@ -0,0 +1,8 @@ +CORE +main.c +--chain chain.sh +^EXIT=0$ +^SIGNAL=0$ +^[0-9]+$ +^[0-9]+$ +^[0-9]+$ diff --git a/regression/goto-programs/loop-hash-stability-02/chain.sh b/regression/goto-programs/loop-hash-stability-02/chain.sh new file mode 100755 index 00000000000..49377d2ba01 --- /dev/null +++ b/regression/goto-programs/loop-hash-stability-02/chain.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -e + +goto_cc=$1 +cbmc=$2 +is_windows=$3 + +if [[ "${is_windows}" == "true" ]]; then + ${goto_cc} original.c "/Feoriginal.gb" + ${goto_cc} shifted.c "/Feshifted.gb" +else + ${goto_cc} original.c -o original.gb + ${goto_cc} shifted.c -o shifted.gb +fi + +# Extract hashes from both versions +hash1=$(${cbmc} --show-loops original.gb | grep -E "Loop [0-9]+ hash:" | sed 's/.*hash: //' | sort -n) +hash2=$(${cbmc} --show-loops shifted.gb | grep -E "Loop [0-9]+ hash:" | sed 's/.*hash: //' | sort -n) + +# Compare - should be identical +if [ "$hash1" == "$hash2" ]; then + echo "STABLE: Hashes match" + echo "$hash1" +else + echo "UNSTABLE: Hashes differ" + echo "Original: $hash1" + echo "Shifted: $hash2" + exit 1 +fi diff --git a/regression/goto-programs/loop-hash-stability-02/original.c b/regression/goto-programs/loop-hash-stability-02/original.c new file mode 100644 index 00000000000..43340660005 --- /dev/null +++ b/regression/goto-programs/loop-hash-stability-02/original.c @@ -0,0 +1,24 @@ +// Test case: Verify loop hashes are stable when source locations change +// This file is designed to be modified by adding comments/whitespace before loops +// to verify that line number changes don't affect loop hashes + +int main() +{ + int sum = 0; + + // Test loop 1: Basic for loop + for(int i = 0; i < 10; i++) + { + sum += i; + } + + // Test loop 2: While loop + int j = 0; + while(j < 5) + { + sum += j; + j++; + } + + return sum; +} diff --git a/regression/goto-programs/loop-hash-stability-02/shifted.c b/regression/goto-programs/loop-hash-stability-02/shifted.c new file mode 100644 index 00000000000..a1145285721 --- /dev/null +++ b/regression/goto-programs/loop-hash-stability-02/shifted.c @@ -0,0 +1,43 @@ +// Test case: Same loops as test_location_stability.c but shifted by adding comments +// The loop hashes should remain IDENTICAL despite line number changes + +// Adding multiple comment lines to shift all code down +// Comment line 2 +// Comment line 3 +// Comment line 4 +// Comment line 5 +// Comment line 6 +// Comment line 7 +// Comment line 8 +// Comment line 9 +// Comment line 10 + +int main() +{ + int sum = 0; + + // Adding more comments before the loop to shift its location + // Comment A + // Comment B + // Comment C + + // Test loop 1: Basic for loop (now at a different line number) + for(int i = 0; i < 10; i++) + { + sum += i; + } + + // More comments between loops + // Comment D + // Comment E + + // Test loop 2: While loop (now at a different line number) + int j = 0; + while(j < 5) + { + sum += j; + j++; + } + + return sum; +} diff --git a/regression/goto-programs/loop-hash-stability-02/test.desc b/regression/goto-programs/loop-hash-stability-02/test.desc new file mode 100644 index 00000000000..f0e95359323 --- /dev/null +++ b/regression/goto-programs/loop-hash-stability-02/test.desc @@ -0,0 +1,7 @@ +CORE +original.c shifted.c +--chain chain.sh +^EXIT=0$ +^SIGNAL=0$ +^STABLE: Hashes match$ +^[0-9]+$ diff --git a/src/goto-programs/goto_functions.cpp b/src/goto-programs/goto_functions.cpp index f4ac42e6d4a..186592322c5 100644 --- a/src/goto-programs/goto_functions.cpp +++ b/src/goto-programs/goto_functions.cpp @@ -60,6 +60,14 @@ void goto_functionst::compute_loop_numbers() } } +void goto_functionst::compute_loop_hashes() +{ + for(auto &func : function_map) + { + func.second.body.compute_loop_hashes(); + } +} + /// returns a vector of the iterators in alphabetical order std::vector goto_functionst::sorted() const diff --git a/src/goto-programs/goto_functions.h b/src/goto-programs/goto_functions.h index 73f8d06ba1d..b2f685197dc 100644 --- a/src/goto-programs/goto_functions.h +++ b/src/goto-programs/goto_functions.h @@ -82,6 +82,7 @@ class goto_functionst void compute_location_numbers(); void compute_location_numbers(goto_programt &); void compute_loop_numbers(); + void compute_loop_hashes(); void compute_target_numbers(); void compute_incoming_edges(); @@ -91,6 +92,7 @@ class goto_functionst compute_target_numbers(); compute_location_numbers(); compute_loop_numbers(); + compute_loop_hashes(); } /// Get the identifier of the entry point to a goto model diff --git a/src/goto-programs/goto_program.cpp b/src/goto-programs/goto_program.cpp index 2539394b69a..cd6a51d2888 100644 --- a/src/goto-programs/goto_program.cpp +++ b/src/goto-programs/goto_program.cpp @@ -16,6 +16,7 @@ Author: Daniel Kroening, kroening@kroening.com #include #include #include +#include #include #include #include @@ -616,12 +617,83 @@ void goto_programt::compute_loop_numbers() i.loop_number=nr++; } +/// Compute a hash for an instruction based on its structural properties. +/// This hash excludes position-dependent information like location_number, +/// target_number, and source location, focusing on the semantic content. +static std::size_t +hash_instruction_structure(const goto_programt::instructiont &instruction) +{ + std::size_t result = 0; + + // Hash the instruction type + result = hash_combine(result, static_cast(instruction.type())); + + // Hash the guard/condition expression if present + if(instruction.has_condition()) + result = hash_combine(result, instruction.condition().string_hash()); + + // Hash the code + result = hash_combine(result, instruction.code().string_hash()); + + return result; +} + +/// Compute hash-based loop identifiers that are stable across unrelated +/// code changes. This implementation follows the AST fingerprinting approach, +/// hashing the structural properties of the loop including: +/// - The instruction types and semantic content within the loop +/// - The control flow structure of the loop body +/// - The loop condition +void goto_programt::compute_loop_hashes() +{ + for(auto it = instructions.begin(); it != instructions.end(); ++it) + { + if(!it->is_backwards_goto()) + continue; + + std::size_t loop_hash = 0; + + // Hash the condition of the backwards goto (loop exit condition) + if(it->has_condition()) + loop_hash = hash_combine(loop_hash, it->condition().string_hash()); + + // Find the loop target (loop header) + if(!it->targets.empty()) + { + auto loop_head = it->targets.front(); + + // Hash the structure of instructions in the loop body + // We iterate from the loop header to the backwards goto + std::size_t body_hash = 0; + std::size_t instruction_count = 0; + + for(auto body_it = loop_head; body_it != it; ++body_it) + { + body_hash = + hash_combine(body_hash, hash_instruction_structure(*body_it)); + ++instruction_count; + } + + // Incorporate the body hash and instruction count + loop_hash = hash_combine(loop_hash, body_hash); + loop_hash = hash_combine(loop_hash, instruction_count); + } + + // Finalize the hash + loop_hash = hash_finalize(loop_hash, 0); + + // Store the hash in the instruction + it->loop_hash = loop_hash; + } +} + void goto_programt::update() { compute_incoming_edges(); compute_target_numbers(); compute_location_numbers(); compute_loop_numbers(); + compute_loop_hashes(); } std::ostream &goto_programt::output(std::ostream &out) const diff --git a/src/goto-programs/goto_program.h b/src/goto-programs/goto_program.h index 234d759f6c9..4a0c6f7737f 100644 --- a/src/goto-programs/goto_program.h +++ b/src/goto-programs/goto_program.h @@ -557,9 +557,12 @@ class goto_programt /// one goto_program. unsigned location_number = 0; - /// Number unique per function to identify loops + /// Number unique per function to identify loops (deprecated) unsigned loop_number = 0; + /// Hash-based identifier for loops, stable across unrelated code changes + std::size_t loop_hash = 0; + /// A number to identify branch targets. /// This is \ref nil_target if it's not a target. unsigned target_number = nil_target; @@ -783,6 +786,9 @@ class goto_programt /// Compute loop numbers void compute_loop_numbers(); + /// Compute hash-based loop identifiers + void compute_loop_hashes(); + /// Update all indices void update(); @@ -790,6 +796,13 @@ class goto_programt static irep_idt loop_id(const irep_idt &function_id, const instructiont &instruction) { + // Prefer hash-based identifier when available + if(instruction.loop_hash != 0) + { + return id2string(function_id) + ".hash_" + + std::to_string(instruction.loop_hash); + } + // Fall back to ordinal number for backward compatibility return id2string(function_id) + "." + std::to_string(instruction.loop_number); } diff --git a/src/goto-programs/loop_ids.h b/src/goto-programs/loop_ids.h index dc8860fabb5..01d88d3fa59 100644 --- a/src/goto-programs/loop_ids.h +++ b/src/goto-programs/loop_ids.h @@ -18,16 +18,27 @@ class goto_functionst; class goto_modelt; class goto_programt; -/// Loop id used to identify loops. It consists of two arguments: +/// Loop id used to identify loops. It consists of three arguments: /// `function_id` -/// the function id stored as keys of `function_mapt`; and +/// the function id stored as keys of `function_mapt`; /// `loop_number` /// the index of loop indicated by `loop_number` of backward -/// goto instruction. +/// goto instruction (deprecated, for backward compatibility); and +/// `loop_hash` +/// a hash-based identifier computed from the structural properties +/// of the loop, stable across unrelated code changes. struct loop_idt { loop_idt(const irep_idt &function_id, const unsigned int loop_number) - : function_id(function_id), loop_number(loop_number) + : function_id(function_id), loop_number(loop_number), loop_hash(0) + { + } + + loop_idt( + const irep_idt &function_id, + const unsigned int loop_number, + const std::size_t loop_hash) + : function_id(function_id), loop_number(loop_number), loop_hash(loop_hash) { } @@ -35,9 +46,14 @@ struct loop_idt irep_idt function_id; unsigned int loop_number; + std::size_t loop_hash; bool operator==(const loop_idt &o) const { + // Prefer hash-based comparison if both have non-zero hashes + if(loop_hash != 0 && o.loop_hash != 0) + return function_id == o.function_id && loop_hash == o.loop_hash; + // Fall back to loop_number for backward compatibility return function_id == o.function_id && loop_number == o.loop_number; } @@ -48,8 +64,13 @@ struct loop_idt bool operator<(const loop_idt &o) const { - return function_id < o.function_id || - (function_id == o.function_id && loop_number < o.loop_number); + if(function_id != o.function_id) + return function_id < o.function_id; + // Prefer hash-based comparison if both have non-zero hashes + if(loop_hash != 0 && o.loop_hash != 0) + return loop_hash < o.loop_hash; + // Fall back to loop_number for backward compatibility + return loop_number < o.loop_number; } }; diff --git a/unit/Makefile b/unit/Makefile index 20c48c51f15..42580684eab 100644 --- a/unit/Makefile +++ b/unit/Makefile @@ -73,6 +73,7 @@ SRC += analyses/ai/ai.cpp \ goto-programs/goto_trace_output.cpp \ goto-programs/is_goto_binary.cpp \ goto-programs/label_function_pointer_call_sites.cpp \ + goto-programs/loop_hash.cpp \ goto-programs/osx_fat_reader.cpp \ goto-programs/restrict_function_pointers.cpp \ goto-programs/structured_trace_util.cpp \ diff --git a/unit/goto-programs/loop_hash.cpp b/unit/goto-programs/loop_hash.cpp new file mode 100644 index 00000000000..69b6cccec48 --- /dev/null +++ b/unit/goto-programs/loop_hash.cpp @@ -0,0 +1,400 @@ +/*******************************************************************\ + +Module: Unit tests for loop hash computation + +Author: CBMC Team + +\*******************************************************************/ + +#include +#include +#include + +#include +#include + +SCENARIO( + "Loop hash computation produces stable identifiers", + "[core][goto-programs][loop-hash]") +{ + GIVEN("A program with two loops") + { + const std::string code = R"( + int main() { + int sum = 0; + + // Loop 1: Simple for loop + for(int i = 0; i < 10; i++) { + sum += i; + } + + // Loop 2: While loop + int j = 0; + while(j < 5) { + sum += j * 2; + j++; + } + + return sum; + } + )"; + + auto goto_model = get_goto_model_from_c(code); + + WHEN("Loop hashes are computed") + { + goto_model.goto_functions.update(); + + THEN("Each backwards goto has a non-zero hash") + { + const auto &main_func = + goto_model.goto_functions.function_map.at("main"); + + int loop_count = 0; + for(const auto &inst : main_func.body.instructions) + { + if(inst.is_backwards_goto()) + { + loop_count++; + REQUIRE(inst.loop_hash != 0); + } + } + + REQUIRE(loop_count > 0); + } + } + } + + GIVEN("A program with nested loops") + { + const std::string code = R"( + int main() { + int sum = 0; + + for(int i = 0; i < 3; i++) { + for(int j = 0; j < 3; j++) { + sum += i * j; + } + } + + return sum; + } + )"; + + auto goto_model = get_goto_model_from_c(code); + + WHEN("Loop hashes are computed") + { + goto_model.goto_functions.update(); + + THEN("Inner and outer loops have different hashes") + { + const auto &main_func = + goto_model.goto_functions.function_map.at("main"); + + std::vector hashes; + for(const auto &inst : main_func.body.instructions) + { + if(inst.is_backwards_goto()) + { + hashes.push_back(inst.loop_hash); + } + } + + REQUIRE(hashes.size() == 2); + REQUIRE(hashes[0] != hashes[1]); + REQUIRE(hashes[0] != 0); + REQUIRE(hashes[1] != 0); + } + } + } +} + +SCENARIO( + "Loop hashes remain stable across unrelated code changes", + "[core][goto-programs][loop-hash]") +{ + GIVEN("A program with two loops") + { + const std::string original_code = R"( + int main() { + int sum = 0; + + // Loop 1: Simple for loop + for(int i = 0; i < 10; i++) { + sum += i; + } + + // Loop 2: While loop + int j = 0; + while(j < 5) { + sum += j * 2; + j++; + } + + return sum; + } + )"; + + auto original_model = get_goto_model_from_c(original_code); + original_model.goto_functions.update(); + + const auto &original_main = + original_model.goto_functions.function_map.at("main"); + + std::vector original_hashes; + for(const auto &inst : original_main.body.instructions) + { + if(inst.is_backwards_goto()) + original_hashes.push_back(inst.loop_hash); + } + + WHEN("Pseudo-loops are added before the real loops") + { + const std::string modified_code = R"( + int main() { + int sum = 0; + + // Add some pseudo-loops (do-while-0 pattern) + do { sum += 1; } while(0); + do { sum += 2; } while(0); + do { sum += 3; } while(0); + + // Loop 1: Simple for loop (same as before) + for(int i = 0; i < 10; i++) { + sum += i; + } + + // Loop 2: While loop (same as before) + int j = 0; + while(j < 5) { + sum += j * 2; + j++; + } + + return sum; + } + )"; + + auto modified_model = get_goto_model_from_c(modified_code); + modified_model.goto_functions.update(); + + const auto &modified_main = + modified_model.goto_functions.function_map.at("main"); + + std::vector modified_hashes; + for(const auto &inst : modified_main.body.instructions) + { + if(inst.is_backwards_goto()) + modified_hashes.push_back(inst.loop_hash); + } + + THEN("The original loops maintain their hash values") + { + // The modified version should have more loops total (pseudo-loops + + // real loops) + REQUIRE(modified_hashes.size() > original_hashes.size()); + + // Find the real loops in the modified version + // They should still have the same hashes as in the original + bool found_first = false; + bool found_second = false; + + for(std::size_t hash : modified_hashes) + { + if(hash == original_hashes[0]) + found_first = true; + if(original_hashes.size() > 1 && hash == original_hashes[1]) + found_second = true; + } + + REQUIRE(found_first); + if(original_hashes.size() > 1) + REQUIRE(found_second); + } + } + } +} + +SCENARIO( + "Loop hash computation for different loop structures", + "[core][goto-programs][loop-hash]") +{ + GIVEN("Different types of loops") + { + WHEN("Testing a for loop") + { + const std::string code = R"( + int main() { + int sum = 0; + for(int i = 0; i < 10; i++) { + sum += i; + } + return sum; + } + )"; + + auto goto_model = get_goto_model_from_c(code); + goto_model.goto_functions.update(); + + THEN("The loop has a valid hash") + { + const auto &main_func = + goto_model.goto_functions.function_map.at("main"); + + bool found_loop = false; + for(const auto &inst : main_func.body.instructions) + { + if(inst.is_backwards_goto()) + { + found_loop = true; + REQUIRE(inst.loop_hash != 0); + } + } + REQUIRE(found_loop); + } + } + + WHEN("Testing a while loop") + { + const std::string code = R"( + int main() { + int sum = 0; + int i = 0; + while(i < 10) { + sum += i; + i++; + } + return sum; + } + )"; + + auto goto_model = get_goto_model_from_c(code); + goto_model.goto_functions.update(); + + THEN("The loop has a valid hash") + { + const auto &main_func = + goto_model.goto_functions.function_map.at("main"); + + bool found_loop = false; + for(const auto &inst : main_func.body.instructions) + { + if(inst.is_backwards_goto()) + { + found_loop = true; + REQUIRE(inst.loop_hash != 0); + } + } + REQUIRE(found_loop); + } + } + + WHEN("Testing a do-while loop") + { + const std::string code = R"( + int main() { + int sum = 0; + int i = 0; + do { + sum += i; + i++; + } while(i < 10); + return sum; + } + )"; + + auto goto_model = get_goto_model_from_c(code); + goto_model.goto_functions.update(); + + THEN("The loop has a valid hash") + { + const auto &main_func = + goto_model.goto_functions.function_map.at("main"); + + bool found_loop = false; + for(const auto &inst : main_func.body.instructions) + { + if(inst.is_backwards_goto()) + { + found_loop = true; + REQUIRE(inst.loop_hash != 0); + } + } + REQUIRE(found_loop); + } + } + } +} + +SCENARIO( + "Loop hash stability with body changes", + "[core][goto-programs][loop-hash]") +{ + GIVEN("A loop with a specific structure") + { + const std::string original_code = R"( + int main() { + int sum = 0; + for(int i = 0; i < 10; i++) { + sum += i; + } + return sum; + } + )"; + + auto original_model = get_goto_model_from_c(original_code); + original_model.goto_functions.update(); + + const auto &original_main = + original_model.goto_functions.function_map.at("main"); + + std::size_t original_hash = 0; + for(const auto &inst : original_main.body.instructions) + { + if(inst.is_backwards_goto()) + { + original_hash = inst.loop_hash; + break; + } + } + REQUIRE(original_hash != 0); + + WHEN("The loop body changes") + { + const std::string modified_code = R"( + int main() { + int sum = 0; + for(int i = 0; i < 10; i++) { + sum += i * 2; // Changed operation + } + return sum; + } + )"; + + auto modified_model = get_goto_model_from_c(modified_code); + modified_model.goto_functions.update(); + + const auto &modified_main = + modified_model.goto_functions.function_map.at("main"); + + std::size_t modified_hash = 0; + for(const auto &inst : modified_main.body.instructions) + { + if(inst.is_backwards_goto()) + { + modified_hash = inst.loop_hash; + break; + } + } + REQUIRE(modified_hash != 0); + + THEN("The hash changes because the loop structure changed") + { + // When the loop body changes, the hash should also change + // This test verifies that the hash is sensitive to structural changes + REQUIRE(modified_hash != original_hash); + } + } + } +}