From 6c71be2cd0f5ee6cfab54496e1ada4f5ba54390f Mon Sep 17 00:00:00 2001 From: haochengxia Date: Sat, 2 Aug 2025 02:09:26 +0000 Subject: [PATCH 1/3] Clean up --- .github/workflows/build.yml | 12 +- README.md | 341 ++++++------------ examples/README.md | 280 -------------- examples/basic_usage.py | 29 ++ examples/demo_unified_interface.py | 131 ------- .../{plugin_cache.py => plugin_cache/lru.py} | 0 examples/plugin_cache/s3fifo.py | 204 +++++++++++ examples/python_hook_cache_example.py | 178 --------- examples/stream_request_example.py | 154 -------- examples/zipf_trace_example.py | 243 ------------- libcachesim/__init__.py | 2 + libcachesim/protocols.py | 2 +- libcachesim/synthetic_reader.py | 8 +- libcachesim/trace_reader.py | 20 +- src/export_reader.cpp | 2 +- tests/test_reader.py | 40 +- 16 files changed, 397 insertions(+), 1249 deletions(-) delete mode 100644 examples/README.md create mode 100644 examples/basic_usage.py delete mode 100644 examples/demo_unified_interface.py rename examples/{plugin_cache.py => plugin_cache/lru.py} (100%) create mode 100644 examples/plugin_cache/s3fifo.py delete mode 100644 examples/python_hook_cache_example.py delete mode 100644 examples/stream_request_example.py delete mode 100644 examples/zipf_trace_example.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8b67b2b..bc97a64 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,6 +1,16 @@ name: Build -on: [push, pull_request] +on: + push: + paths: + - 'src/**' + - 'libcachesim/**' + - 'tests/**' + pull_request: + paths: + - 'src/**' + - 'libcachesim/**' + - 'tests/**' permissions: contents: read diff --git a/README.md b/README.md index 6a04cdb..0a6afc5 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,10 @@ # libCacheSim Python Binding [![Build](https://github.com/cacheMon/libCacheSim-python/actions/workflows/build.yml/badge.svg)](https://github.com/cacheMon/libCacheSim-python/actions/workflows/build.yml) -[![Documentation](https://github.com/cacheMon/libCacheSim-python/actions/workflows/docs.yml/badge.svg)](https://github.com/cacheMon/libCacheSim-python/actions/workflows/docs.yml) +[![Documentation](https://github.com/cacheMon/libCacheSim-python/actions/workflows/docs.yml/badge.svg)](docs.libcachesim.com/python) Python bindings for [libCacheSim](https://github.com/1a1a11a/libCacheSim), a high-performance cache simulator and analysis library. -## 📚 Documentation - -- **[English Documentation](https://cacheMon.github.io/libCacheSim-python/en/)** - Complete API reference, tutorials, and examples -- **[中文文档](https://cacheMon.github.io/libCacheSim-python/zh/)** - 完整的API参考、教程和示例 - ## Installation Binary installers for the latest released version are available at the [Python Package Index (PyPI)](https://pypi.org/project/libcachesim). @@ -32,16 +27,6 @@ Run all tests to ensure the package works. python -m pytest tests/ ``` -## 🚀 Features - -- **High-Performance Cache Simulation**: Built on the proven libCacheSim C++ library -- **Multiple Cache Algorithms**: LRU, LFU, FIFO, ARC, S3FIFO, Sieve, TinyLFU, and more -- **Trace Processing**: Support for various trace formats (CSV, binary, Oracle, etc.) 
-- **Synthetic Workload Generation**: Zipf, uniform, and custom distributions
-- **Trace Analysis**: Comprehensive workload analysis and visualization tools
-- **Custom Cache Policies**: Implement new algorithms using Python hooks
-- **Multi-language Documentation**: English and Chinese documentation with examples
-
## Quick Start

### Basic Usage

```python
import libcachesim as lcs

# Create cache and request
cache = lcs.LRU(cache_size=1024*1024)
req = lcs.Request()
req.obj_id = 1
req.obj_size = 100

# Process single request
print(cache.get(req))  # False (first access)
print(cache.get(req))  # True (second access)
```

### Trace Processing

-To simulate with traces, we need to read the request of traces correctly. `open_trace` is an unified interface for trace reading, which accepet three parameters:
-
-- `trace_path`: trace path, can be relative or absolutive path.
-- `type` (optional): if not given, we will automatically infer the type of trace according to the suffix of the trace file.
-- `params` (optional): if not given, default params are applied.
-
```python
import libcachesim as lcs

-# Open trace and process efficiently
-reader = lcs.open_trace(
-    trace_path = "./data/cloudPhysicsIO.oracleGeneral.bin",
-    type = lcs.TraceType.ORACLE_GENERAL_TRACE,
-    params = lcs.ReaderInitParam(ignore_obj_size=True)
-)
-cache = lcs.S3FIFO(cache_size=1024*1024)
+# Step 1: Get one trace from S3 bucket
+URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst"
+dl = lcs.DataLoader()
+dl.load(URI)

-# Process entire trace efficiently (C++ backend)
-obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader)
-print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}")
+# Step 2: Open trace and process efficiently
+reader = lcs.TraceReader(dl.get_cache_path(URI))

+# Step 3: Initialize cache
cache = lcs.S3FIFO(cache_size=1024*1024)
-# Process with limits and time ranges
-obj_miss_ratio, byte_miss_ratio = cache.process_trace(
-    reader,
-    start_req=0,
-    max_req=1000
-)
+
+# Step 4: Process entire trace efficiently (C++ backend)
+obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader)
print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}")
```

+> [!NOTE]
+> We DO NOT ignore the object size by default. To ignore it, pass `reader_init_params = lcs.ReaderInitParam(ignore_obj_size=True)` when initializing the `TraceReader`.
+
## Custom Cache Policies

Implement custom cache replacement algorithms using pure Python functions - **no C/C++ compilation required**.

### Python Hook Cache Overview

-The `PluginCache` allows you to define custom caching behavior through Python callback functions. This is perfect for:
-- Prototyping new cache algorithms
-- Educational purposes and learning
-- Research and experimentation
-- Custom business logic implementation
+The `PluginCache` allows you to define custom caching behavior through Python callback functions without without any C/C++ compilation.

### Hook Functions

You need to implement these callback functions:

-- **`init_hook(cache_size: int) -> Any`**: Initialize your data structure
-- **`hit_hook(data: Any, obj_id: int, obj_size: int) -> None`**: Handle cache hits
-- **`miss_hook(data: Any, obj_id: int, obj_size: int) -> None`**: Handle cache misses
-- **`eviction_hook(data: Any, obj_id: int, obj_size: int) -> int`**: Return object ID to evict
-- **`remove_hook(data: Any, obj_id: int) -> None`**: Clean up when object removed
-- **`free_hook(data: Any) -> None`**: [Optional] Final cleanup
+| Function | Signature | Description |
+|----------|-----------|-------------|
+| `init_hook` | `(common_cache_params: CommonCacheParams) -> Any` | Initialize your data structure |
+| `hit_hook` | `(data: Any, request: Request) -> None` | Handle cache hits |
+| `miss_hook` | `(data: Any, request: Request) -> None` | Handle cache misses |
+| `eviction_hook` | `(data: Any, request: Request) -> int` | Return object ID to evict |
+| `remove_hook` | `(data: Any, obj_id: int) -> None` | Clean up when object removed |
+| `free_hook` | `(data: Any) -> None` | [Optional] Final cleanup |

-### Example: Custom LRU Implementation
+
+An example for LRU -```python -import libcachesim as lcs +``` from collections import OrderedDict +from libcachesim import PluginCache, CommonCacheParams, Request, SyntheticReader, LRU -# Create a Python hook-based cache -cache = lcs.PluginCache(cache_size=1024*1024, cache_name="MyLRU") - -# Define LRU policy hooks -def init_hook(cache_size): - return OrderedDict() # Track access order -def hit_hook(lru_dict, obj_id, obj_size): - lru_dict.move_to_end(obj_id) # Move to most recent +class StandaloneLRU: + def __init__(self): + self.cache_data = OrderedDict() -def miss_hook(lru_dict, obj_id, obj_size): - lru_dict[obj_id] = True # Add to end + def cache_hit(self, obj_id): + if obj_id in self.cache_data: + obj_size = self.cache_data.pop(obj_id) + self.cache_data[obj_id] = obj_size -def eviction_hook(lru_dict, obj_id, obj_size): - return next(iter(lru_dict)) # Return least recent + def cache_miss(self, obj_id, obj_size): + self.cache_data[obj_id] = obj_size -def remove_hook(lru_dict, obj_id): - lru_dict.pop(obj_id, None) + def cache_eviction(self): + evicted_id, _ = self.cache_data.popitem(last=False) + return evicted_id -# Set the hooks -cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - -# Use it like any other cache -req = lcs.Request() -req.obj_id = 1 -req.obj_size = 100 -hit = cache.get(req) -print(f"Cache hit: {hit}") # Should be False (miss) -``` + def cache_remove(self, obj_id): + if obj_id in self.cache_data: + del self.cache_data[obj_id] -### Example: Custom FIFO Implementation -```python -import libcachesim as lcs -from collections import deque -from contextlib import suppress +def cache_init_hook(common_cache_params: CommonCacheParams): + return StandaloneLRU() -cache = lcs.PluginCache(cache_size=1024, cache_name="CustomFIFO") -def init_hook(cache_size): - return deque() # Use deque for FIFO order +def cache_hit_hook(cache, request: Request): + cache.cache_hit(request.obj_id) -def hit_hook(fifo_queue, obj_id, obj_size): - pass # FIFO doesn't reorder on hit -def miss_hook(fifo_queue, obj_id, obj_size): - fifo_queue.append(obj_id) # Add to end of queue +def cache_miss_hook(cache, request: Request): + cache.cache_miss(request.obj_id, request.obj_size) -def eviction_hook(fifo_queue, obj_id, obj_size): - return fifo_queue[0] # Return first item (oldest) -def remove_hook(fifo_queue, obj_id): - with suppress(ValueError): - fifo_queue.remove(obj_id) +def cache_eviction_hook(cache, request: Request): + return cache.cache_eviction() -# Set the hooks and test -cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) -req = lcs.Request(obj_id=1, obj_size=100) -hit = cache.get(req) -print(f"Cache hit: {hit}") # Should be False (miss) -``` +def cache_remove_hook(cache, obj_id): + cache.cache_remove(obj_id) -## Available Algorithms - -### Built-in Cache Algorithms - -#### Basic Algorithms -- **FIFO**: First-In-First-Out -- **LRU**: Least Recently Used -- **LFU**: Least Frequently Used -- **LFUDA**: LFU with Dynamic Aging -- **Clock**: Clock/Second-chance algorithm - -#### Advanced Algorithms -- **QDLP**: Queue Demotion with Lazy Promotion -- **S3FIFO**: Simple, Fast, Fair FIFO (recommended for most workloads) -- **Sieve**: High-performance eviction algorithm -- **ARC**: Adaptive Replacement Cache -- **TwoQ**: Two-Queue algorithm -- **SLRU**: Segmented LRU -- **TinyLFU**: TinyLFU with window -- **WTinyLFU**: Windowed TinyLFU - -#### Research/ML Algorithms -- **LeCaR**: Learning Cache Replacement (adaptive) -- **Cacheus**: Cache replacement policy -- 
**LRB**: Learning-based cache (if enabled) -- **GLCache**: Machine learning-based cache -- **ThreeLCache**: Three-level cache hierarchy (if enabled) - -#### Optimal Algorithms (for analysis) -- **Belady**: Optimal offline algorithm -- **BeladySize**: Size-aware optimal algorithm -```python -import libcachesim as lcs +def cache_free_hook(cache): + cache.cache_data.clear() -# All algorithms use the same unified interface -cache_size = 1024 * 1024 # 1MB -lru_cache = lcs.LRU(cache_size) -s3fifo_cache = lcs.S3FIFO(cache_size) -sieve_cache = lcs.Sieve(cache_size) -arc_cache = lcs.ARC(cache_size) - -# All caches work identically -req = lcs.Request() -req.obj_id = 1 -req.obj_size = 100 -hit = lru_cache.get(req) -print(hit) -``` - -## Examples and Testing - -### Algorithm Comparison -```python -import libcachesim as lcs - -def compare_algorithms(trace_path): - reader = lcs.open_trace(trace_path, lcs.TraceType.VSCSI_TRACE) - algorithms = ['LRU', 'S3FIFO', 'Sieve', 'ARC'] - for algo_name in algorithms: - cache = getattr(lcs, algo_name)(cache_size=1024*1024) - obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) - print(f"{algo_name}\t\tObj: {obj_miss_ratio:.4f}, Byte: {byte_miss_ratio:.4f}") - -compare_algorithms("./data/cloudPhysicsIO.vscsi") +plugin_lru_cache = PluginCache( + cache_size=1024, + cache_init_hook=cache_init_hook, + cache_hit_hook=cache_hit_hook, + cache_miss_hook=cache_miss_hook, + cache_eviction_hook=cache_eviction_hook, + cache_remove_hook=cache_remove_hook, + cache_free_hook=cache_free_hook, + cache_name="CustomizedLRU", +) ``` +
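+
+To sanity-check a plugin, replay a synthetic trace through it and a built-in policy of the same size, then compare the hit/miss decisions. Below is a minimal sketch mirroring the check in `examples/plugin_cache/s3fifo.py`; the reader parameters are illustrative:
+
+```python
+reader = SyntheticReader(
+    num_of_req=100000,
+    num_objects=10000,
+    obj_size=100,
+    seed=42,
+    alpha=0.8,
+    dist="zipf",
+)
+ref_lru = LRU(cache_size=1024)
+
+for req in reader:
+    # Both caches see the identical request stream; a correct plugin LRU
+    # must agree with the built-in LRU on every hit/miss decision.
+    plugin_hit = plugin_lru_cache.get(req)
+    ref_hit = ref_lru.get(req)
+    assert plugin_hit == ref_hit, f"Cache hit mismatch: {plugin_hit} != {ref_hit}"
+```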
-### Performance Benchmarking -```python -import time - -def benchmark_cache(cache, num_requests=100000): - """Benchmark cache performance""" - start_time = time.time() - for i in range(num_requests): - req = lcs.Request() - req.obj_id = i % 1000 # Working set of 1000 objects - req.obj_size = 100 - cache.get(req) - end_time = time.time() - throughput = num_requests / (end_time - start_time) - print(f"Processed {num_requests} requests in {end_time - start_time:.2f}s") - print(f"Throughput: {throughput:.0f} requests/sec") - -# Compare performance -lru_cache = lcs.LRU(cache_size=1024*1024) -s3fifo_cache = lcs.S3FIFO(cache_size=1024*1024) - -print("LRU Performance:") -benchmark_cache(lru_cache) - -print("\nS3FIFO Performance:") -benchmark_cache(s3fifo_cache) -``` -## Advanced Usage +Another simple implementation via hook functions for S3FIFO respectively is given in [examples](examples/plugin_cache/s3fifo.py). -### Multi-Format Trace Processing +### Getting Help -```python -import libcachesim as lcs +- Check [project documentation](docs.libcachesim.com/python) for detailed guides +- Open issues on [GitHub](https://github.com/cacheMon/libCacheSim-python/issues) +- Review [examples](/example) in the main repository -# Supported trace types -trace_types = { - "oracle": lcs.TraceType.ORACLE_GENERAL_TRACE, - "csv": lcs.TraceType.CSV_TRACE, - "vscsi": lcs.TraceType.VSCSI_TRACE, - "txt": lcs.TraceType.PLAIN_TXT_TRACE -} +--- +## Reference +
+ Please cite the following papers if you use libCacheSim. -# Open different trace formats -oracle_reader = lcs.open_trace("./data/cloudPhysicsIO.oracleGeneral.bin", trace_types["oracle"]) -csv_reader = lcs.open_trace("./data/cloudPhysicsIO.txt", trace_types["txt"]) - -# Process traces with different caches -caches = [ - lcs.LRU(cache_size=1024*1024), - lcs.S3FIFO(cache_size=1024*1024), - lcs.Sieve(cache_size=1024*1024) -] - -for i, cache in enumerate(caches): - miss_ratio_oracle = cache.process_trace(oracle_reader)[0] - miss_ratio_csv = cache.process_trace(csv_reader)[0] - print(f"Cache {i} miss ratio: {miss_ratio_oracle:.4f}, {miss_ratio_csv:.4f}") ``` +@inproceedings{yang2020-workload, + author = {Juncheng Yang and Yao Yue and K. V. Rashmi}, + title = {A large-scale analysis of hundreds of in-memory cache clusters at Twitter}, + booktitle = {14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)}, + year = {2020}, + isbn = {978-1-939133-19-9}, + pages = {191--208}, + url = {https://www.usenix.org/conference/osdi20/presentation/yang}, + publisher = {USENIX Association}, +} -## Troubleshooting - -### Common Issues +@inproceedings{yang2023-s3fifo, + title = {FIFO Queues Are All You Need for Cache Eviction}, + author = {Juncheng Yang and Yazhuo Zhang and Ziyue Qiu and Yao Yue and K.V. Rashmi}, + isbn = {9798400702297}, + publisher = {Association for Computing Machinery}, + booktitle = {Symposium on Operating Systems Principles (SOSP'23)}, + pages = {130–149}, + numpages = {20}, + year={2023} +} -**Import Error**: Make sure libCacheSim C++ library is built first: -```bash -cmake -G Ninja -B build && ninja -C build +@inproceedings{yang2023-qdlp, + author = {Juncheng Yang and Ziyue Qiu and Yazhuo Zhang and Yao Yue and K.V. Rashmi}, + title = {FIFO Can Be Better than LRU: The Power of Lazy Promotion and Quick Demotion}, + year = {2023}, + isbn = {9798400701955}, + publisher = {Association for Computing Machinery}, + doi = {10.1145/3593856.3595887}, + booktitle = {Proceedings of the 19th Workshop on Hot Topics in Operating Systems (HotOS23)}, + pages = {70–79}, + numpages = {10}, +} ``` +If you used libCacheSim in your research, please cite the above papers. -**Performance Issues**: Use `process_trace()` for large workloads instead of individual `get()` calls for better performance. +
-**Memory Usage**: Monitor cache statistics (`cache.occupied_byte`) and ensure proper cache size limits for your system. +--- -**Custom Cache Issues**: Validate your custom implementation against built-in algorithms using the test functions above. -**Install with uv**: Since automatically building with `uv` will fail due to incomplete source code, please force install the binary file via `uv pip install libcachesim --only-binary=:all:`. +## License +See [LICENSE](LICENSE) for details. -### Getting Help - -- Check the [main documentation](../doc/) for detailed guides -- Open issues on [GitHub](https://github.com/1a1a11a/libCacheSim/issues) -- Review [examples](/example) in the main repository +--- \ No newline at end of file diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 9f9f2e0..0000000 --- a/examples/README.md +++ /dev/null @@ -1,280 +0,0 @@ -# libCacheSim Python Examples - -This directory contains examples demonstrating how to use libCacheSim Python bindings for cache simulation and trace generation. - -## Overview - -libCacheSim Python bindings provide a powerful interface for: - -- Cache simulation with various eviction policies (LRU, FIFO, ARC, etc.) -- Synthetic trace generation (Zipf and Uniform distributions) -- Real trace analysis and processing -- Custom cache policy implementation with Python hooks -- Unified interface supporting all cache algorithms - -## Example Files - -### 1. Stream Request Generation (`stream_request_example.py`) - -Demonstrates how to generate synthetic request traces and use them for cache simulation: - -```python -import libcachesim as lcs - -# Create Zipf-distributed requests -zipf_generator = lcs.create_zipf_requests( - num_objects=1000, # 1000 unique objects - num_requests=10000, # 10000 requests - alpha=1.0, # Zipf skewness - obj_size=4000, # Object size in bytes - seed=42 # For reproducibility -) - -# Test with LRU cache -cache = lcs.LRU(cache_size=50*1024*1024) # 50MB cache for better hit ratio -miss_count = sum(1 for req in zipf_generator if not cache.get(req)) -print(f"Final miss ratio: {miss_count / 10000:.3f}") -``` - -**Features**: -- Memory efficient: No temporary files created -- Fast: Direct Request object generation -- Reproducible: Support for random seeds -- Flexible: Easy parameter adjustment - -### 2. Unified Interface Demo (`demo_unified_interface.py`) - -Shows the unified interface for all cache policies, including built-in and custom Python hook caches: - -```python -import libcachesim as lcs - -cache_size = 1024 * 1024 # 1MB - -# Create different cache policies -caches = { - "LRU": lcs.LRU(cache_size), - "FIFO": lcs.FIFO(cache_size), - "ARC": lcs.ARC(cache_size), -} - -# Create Python hook cache -python_cache = lcs.PluginCache(cache_size, "CustomLRU") -# Set hook functions... -caches["Custom Python LRU"] = python_cache - -# Unified interface testing -test_req = lcs.Request() -test_req.obj_id = 1 -test_req.obj_size = 1024 - -for name, cache in caches.items(): - result = cache.get(test_req) - print(f"{name}: {'HIT' if result else 'MISS'}") -``` - -**Benefits of Unified Interface**: -- Same API for all cache policies -- Easy to switch between different algorithms -- Efficient C++ backend trace processing -- Consistent properties and statistics - -### 3. 
Python Hook Cache (`python_hook_cache_example.py`) - -Demonstrates how to create custom cache policies using Python hooks: - -```python -import libcachesim as lcs -from collections import OrderedDict - -class LRUPolicy: - def __init__(self, cache_size): - self.access_order = OrderedDict() - - def on_hit(self, obj_id, obj_size): - self.access_order.move_to_end(obj_id) - - def on_miss(self, obj_id, obj_size): - self.access_order[obj_id] = True - - def evict(self, obj_id, obj_size): - return next(iter(self.access_order)) - -def create_lru_cache(cache_size): - cache = lcs.PluginCache(cache_size, "PythonLRU") - - def init_hook(cache_size): - return LRUPolicy(cache_size) - - # Set other hooks... - cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - return cache -``` - -**Custom Policy Features**: -- Pure Python cache logic implementation -- Support for LRU, FIFO and other policies -- Flexible hook system -- Same interface as built-in policies - -### 4. Zipf Trace Examples (`zipf_trace_example.py`) - -Shows synthetic trace generation methods and algorithm comparison: - -```python -import libcachesim as lcs - -# Method 1: Create Zipf-distributed request generator -zipf_generator = lcs.create_zipf_requests( - num_objects=1000, - num_requests=10000, - alpha=1.0, - obj_size=1024, - seed=42 -) - -# Method 2: Create uniform-distributed request generator -uniform_generator = lcs.create_uniform_requests( - num_objects=1000, - num_requests=10000, - obj_size=1024, - seed=42 -) - -# Compare different Zipf parameters -alphas = [0.5, 1.0, 1.5, 2.0] -for alpha in alphas: - generator = lcs.create_zipf_requests(1000, 10000, alpha=alpha, seed=42) - cache = lcs.LRU(1024*1024) - hit_count = sum(1 for req in generator if cache.get(req)) - hit_ratio = hit_count / 10000 - print(f"α={alpha}: Hit ratio={hit_ratio:.4f}") -``` - -**Synthetic Trace Features**: -- Higher α values create more skewed access patterns -- Memory efficient: No temporary files created -- Request generators for flexible processing -- Suitable for simulating real workloads - -## Key Features - -### Trace Generation -- `create_zipf_requests()`: Create Zipf-distributed request generator -- `create_uniform_requests()`: Create uniform-distributed request generator - -### Cache Algorithms -- **Classic algorithms**: `LRU()`, `FIFO()`, `ARC()`, `Clock()` -- **Modern algorithms**: `S3FIFO()`, `Sieve()`, `TinyLFU()` -- **Custom policies**: `PluginCache()` - -### Trace Processing -- `open_trace()`: Open real trace files -- `process_trace()`: High-performance trace processing - -## Basic Usage Examples - -### 1. Compare Cache Algorithms - -```python -import libcachesim as lcs - -# Test different algorithms -algorithms = ['LRU', 'FIFO', 'ARC', 'S3FIFO'] -cache_size = 1024*1024 - -for algo_name in algorithms: - # Create fresh workload for each algorithm - generator = lcs.create_zipf_requests(1000, 10000, alpha=1.0, seed=42) - cache = getattr(lcs, algo_name)(cache_size) - hit_count = sum(1 for req in generator if cache.get(req)) - print(f"{algo_name}: {hit_count/10000:.3f}") -``` - -### 2. 
Parameter Sensitivity Analysis - -```python -import libcachesim as lcs - -# Test different Zipf parameters -for alpha in [0.5, 1.0, 1.5, 2.0]: - generator = lcs.create_zipf_requests(1000, 10000, alpha=alpha, seed=42) - cache = lcs.LRU(cache_size=512*1024) - - hit_count = sum(1 for req in generator if cache.get(req)) - print(f"α={alpha}: Hit ratio={hit_count/10000:.3f}") -``` - -## Parameters - -### Trace Generation Parameters -- `num_objects`: Number of unique objects -- `num_requests`: Number of requests to generate -- `alpha`: Zipf skewness (α=1.0 for classic Zipf) -- `obj_size`: Object size in bytes (default: 4000) -- `seed`: Random seed for reproducibility - -### Cache Parameters -- `cache_size`: Cache capacity in bytes -- Algorithm-specific parameters (e.g.,`fifo_size_ratio` for S3FIFO) - -## Running Examples - -```bash -# Navigate to examples directory -cd libCacheSim-python/examples - -# Run stream-based trace generation -python stream_request_example.py - -# Run unified interface demo -python demo_unified_interface.py - -# Run Python hook cache example -python python_hook_cache_example.py - -# Run Zipf trace examples -python zipf_trace_example.py - -# Run all tests -python -m pytest ../tests/ -v -``` - -## Performance Tips - -1. **Use appropriate cache and object sizes**: - ```python - # Good: cache can hold multiple objects - cache = lcs.LRU(cache_size=1024*1024) # 1MB - generator = lcs.create_zipf_requests(1000, 10000, obj_size=1024) # 1KB objects - ``` - -2. **Use seeds for reproducible experiments**: - ```python - generator = lcs.create_zipf_requests(1000, 10000, seed=42) - ``` - -3. **Process large traces with C++ backend**: - ```python - # Fast: C++ processing - obj_miss_ratio, byte_miss_ratio = lcs.process_trace(cache, reader) - - # Slow: Python loop - for req in reader: - cache.get(req) - ``` - -4. 
**Understand Zipf parameter effects**: - - α=0.5: Slightly skewed, close to uniform distribution - - α=1.0: Classic Zipf distribution - - α=2.0: Highly skewed, few objects get most accesses - -## Testing - -Run comprehensive tests: - -```bash -python -m pytest ../tests/test_trace_generator.py -v -python -m pytest ../tests/test_eviction.py -v -python -m pytest ../tests/test_process_trace.py -v -``` diff --git a/examples/basic_usage.py b/examples/basic_usage.py new file mode 100644 index 0000000..e8dd208 --- /dev/null +++ b/examples/basic_usage.py @@ -0,0 +1,29 @@ +import libcachesim as lcs + +# Step 1: Get one trace from S3 bucket +URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst" +dl = lcs.DataLoader() +dl.load(URI) + +# Step 2: Open trace and process efficiently +reader = lcs.TraceReader( + trace = dl.get_cache_path(URI), + trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE, + reader_init_params = lcs.ReaderInitParam(ignore_obj_size=False) +) + +# Step 3: Initialize cache +cache = lcs.S3FIFO(cache_size=1024*1024) + +# Step 4: Process entire trace efficiently (C++ backend) +obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader) +print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") + +# Step 4.1: Process with limited number of requests +cache = lcs.S3FIFO(cache_size=1024*1024) +obj_miss_ratio, byte_miss_ratio = cache.process_trace( + reader, + start_req=0, + max_req=1000 +) +print(f"Object miss ratio: {obj_miss_ratio:.4f}, Byte miss ratio: {byte_miss_ratio:.4f}") \ No newline at end of file diff --git a/examples/demo_unified_interface.py b/examples/demo_unified_interface.py deleted file mode 100644 index 0cb629f..0000000 --- a/examples/demo_unified_interface.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -""" -Demo script showing the unified interface for all cache policies. -This demonstrates how to use both native and Python hook-based caches -with the same API for seamless algorithm comparison and switching. 
-""" - -import sys -import os - -# Add parent directory for development testing -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) - -try: - import libcachesim as lcs -except ImportError as e: - print(f"Error importing libcachesim: {e}") - print("Make sure the Python binding is built and installed") - sys.exit(1) - -from collections import OrderedDict - - -def create_trace_reader(): - """Helper function to create a trace reader.""" - data_file = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "cloudPhysicsIO.oracleGeneral.bin" - ) - if not os.path.exists(data_file): - print(f"Warning: Trace file not found at {data_file}") - return None - return lcs.open_trace(data_file, lcs.TraceType.ORACLE_GENERAL_TRACE) - - -def create_demo_lru_hooks(): - """Create demo LRU hooks for Python-based cache policy.""" - - def init_hook(cache_size): - print(f" Initializing custom LRU with {cache_size} bytes") - return OrderedDict() - - def hit_hook(lru_dict, obj_id, obj_size): - if obj_id in lru_dict: - lru_dict.move_to_end(obj_id) - - def miss_hook(lru_dict, obj_id, obj_size): - lru_dict[obj_id] = obj_size - - def eviction_hook(lru_dict, obj_id, obj_size): - if lru_dict: - return next(iter(lru_dict)) - return obj_id - - def remove_hook(lru_dict, obj_id): - lru_dict.pop(obj_id, None) - - return init_hook, hit_hook, miss_hook, eviction_hook, remove_hook - - -def demo_unified_interface(): - """Demonstrate the unified interface across different cache policies.""" - print("libCacheSim Python Binding - Unified Interface Demo") - print("=" * 60) - - cache_size = 1024 * 1024 # 1MB - - # Create different cache policies - caches = { - "LRU": lcs.LRU(cache_size), - "FIFO": lcs.FIFO(cache_size), - "ARC": lcs.ARC(cache_size), - } - - # Create Python hook-based LRU - python_cache = lcs.PluginCache(cache_size, "CustomLRU") - init_hook, hit_hook, miss_hook, eviction_hook, remove_hook = create_demo_lru_hooks() - python_cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - caches["Custom Python LRU"] = python_cache - - print(f"Testing {len(caches)} different cache policies with unified interface:") - - # Demo 1: Single request interface - print("1. Single Request Interface:") - print(" All caches use: cache.get(request)") - - test_req = lcs.Request() - test_req.obj_id = 1 - test_req.obj_size = 1024 - - for name, cache in caches.items(): - result = cache.get(test_req) - print(f" {name:20s}: {'HIT' if result else 'MISS'}") - - # Demo 2: Unified properties interface - print("\n2. Unified Properties Interface:") - print(" All caches provide: cache_size, n_obj, occupied_byte, n_req") - - for name, cache in caches.items(): - print( - f" {name:20s}: size={cache.cache_size}, objs={cache.n_obj}, " - f"bytes={cache.occupied_byte}, reqs={cache.n_req}" - ) - - # Demo 3: Efficient trace processing - print("\n3. 
Efficient Trace Processing Interface:") - print(" All caches use: cache.process_trace(reader, max_req=N)") - - max_requests = 1000 - - for name, cache in caches.items(): - # Create fresh reader for each cache - reader = create_trace_reader() - if not reader: - print(f" {name:20s}: trace file not available") - continue - - obj_miss_ratio, byte_miss_ratio = cache.process_trace(reader, max_req=max_requests) - print(f" {name:20s}: obj_miss_ratio={obj_miss_ratio:.4f}, byte_miss_ratio={byte_miss_ratio:.4f}") - - print("\nKey Benefits of Unified Interface:") - print(" • Same API for all cache policies (built-in + custom)") - print(" • Easy to switch between different algorithms") - print(" • Efficient trace processing in C++ (no Python overhead)") - print(" • Consistent properties and statistics") - print(" • Type-safe and well-documented") - - print("\nDemo completed! All cache policies work with the same interface.") - - -if __name__ == "__main__": - demo_unified_interface() diff --git a/examples/plugin_cache.py b/examples/plugin_cache/lru.py similarity index 100% rename from examples/plugin_cache.py rename to examples/plugin_cache/lru.py diff --git a/examples/plugin_cache/s3fifo.py b/examples/plugin_cache/s3fifo.py new file mode 100644 index 0000000..576d841 --- /dev/null +++ b/examples/plugin_cache/s3fifo.py @@ -0,0 +1,204 @@ +# An example of plugin for s3fifo +from collections import OrderedDict +from libcachesim import PluginCache, CommonCacheParams, Request, S3FIFO, SyntheticReader + +# NOTE(haocheng): we only support ignore object size for now +class StandaloneS3FIFO: + def __init__(self, + small_size_ratio: float = 0.1, + ghost_size_ratio: float = 0.9, + move_to_main_threshold: int = 2, + cache_size: int = 1024): + # S3-FIFO uses three queues with OrderedDict for O(1) operations + self.small_fifo = OrderedDict() + self.main_fifo = OrderedDict() + self.ghost_fifo = OrderedDict() + + # Size limits + self.small_max_size = int(small_size_ratio * cache_size) + self.main_max_size = int(cache_size - small_size_ratio * cache_size) + self.ghost_max_size = int(ghost_size_ratio * cache_size) + + # Frequency tracking + self.small_freq = {} + self.main_freq = {} + self.ghost_freq = {} + + # Other parameters + self.max_freq = 3 + self.move_to_main_threshold = move_to_main_threshold + + def cache_hit(self, obj_id): + """ + Cache hit can happen in two cases: + 1. Small FIFO cache hit (small_fifo) + 2. Main FIFO cache hit (main_fifo) + """ + if obj_id in self.main_fifo: + self.main_freq[obj_id] += 1 + elif obj_id in self.small_fifo: + self.small_freq[obj_id] += 1 + else: + print(f"Cache hit for obj_id {obj_id} but not found in any queue") + print(f"small_fifo: {list(self.small_fifo.keys())}") + print(f"main_fifo: {list(self.main_fifo.keys())}") + print(f"ghost_fifo: {list(self.ghost_fifo.keys())}") + assert False, "Cache hit should happen in small_fifo or main_fifo" + + def cache_miss(self, obj_id, obj_size=1): + """ + Cache miss can happen in three cases: + 1. Miss in small and main but hit in ghost + 2. 
Miss all three queues + """ + if obj_id in self.ghost_fifo: + del self.ghost_fifo[obj_id] + del self.ghost_freq[obj_id] + self.insert_to_main(obj_id) + else: + # Miss all three queues + cond = (obj_id not in self.small_fifo) and (obj_id not in self.main_fifo) + assert cond, "Should not be in small_fifo or main_fifo" + + # Then we need to insert to small fifo queue + self.insert_to_small(obj_id) + + def insert_to_small(self, obj_id): + if len(self.small_fifo) >= self.small_max_size: + self.cache_evict_small() + self.small_fifo[obj_id] = None # OrderedDict value doesn't matter + self.small_freq[obj_id] = 0 + + def insert_to_main(self, obj_id): + if len(self.main_fifo) >= self.main_max_size: + self.cache_evict_main() + self.main_fifo[obj_id] = None + self.main_freq[obj_id] = 0 + + def insert_to_ghost(self, obj_id, original_freq=0): + if len(self.ghost_fifo) >= self.ghost_max_size: + # Remove oldest item + oldest_id = next(iter(self.ghost_fifo)) + del self.ghost_fifo[oldest_id] + del self.ghost_freq[oldest_id] + self.ghost_fifo[obj_id] = None + self.ghost_freq[obj_id] = original_freq + + def cache_evict_small(self): + has_evicted = False + evicted_id = None + while not has_evicted and len(self.small_fifo) > 0: + obj_to_evict = next(iter(self.small_fifo)) # Get first item + if self.small_freq[obj_to_evict] >= self.move_to_main_threshold: + # Move to main fifo cache (not real evict, just move) + del self.small_fifo[obj_to_evict] + del self.small_freq[obj_to_evict] + self.insert_to_main(obj_to_evict) + else: + evicted_id = obj_to_evict + # Insert to ghost fifo cache (real evict) + del self.small_fifo[obj_to_evict] + del self.small_freq[obj_to_evict] + self.insert_to_ghost(obj_to_evict) + has_evicted = True + return evicted_id + + def cache_evict_main(self): + has_evicted = False + evicted_id = None + while not has_evicted and len(self.main_fifo) > 0: + obj_to_evict = next(iter(self.main_fifo)) # Get first item + freq = self.main_freq[obj_to_evict] + if freq >= 1: + # Reinsert with decremented frequency + del self.main_fifo[obj_to_evict] + del self.main_freq[obj_to_evict] + self.insert_to_main(obj_to_evict) + self.main_freq[obj_to_evict] = min(freq, self.max_freq) - 1 + else: + evicted_id = obj_to_evict + # Real eviction + del self.main_fifo[obj_to_evict] + del self.main_freq[obj_to_evict] + has_evicted = True + return evicted_id + + def cache_evict(self): + evicted_id = None + # if main is full or small is empty, evict main + if len(self.main_fifo) >= self.main_max_size or len(self.small_fifo) == 0: + evicted_id = self.cache_evict_main() + # if small is not empty, evict small + else: + evicted_id = self.cache_evict_small() + if evicted_id is None: + assert False, "Should not be None" + return evicted_id + + def cache_remove(self, obj_id): + removed = False + if obj_id in self.small_fifo: + del self.small_fifo[obj_id] + del self.small_freq[obj_id] + removed = True + elif obj_id in self.ghost_fifo: + del self.ghost_fifo[obj_id] + del self.ghost_freq[obj_id] + removed = True + elif obj_id in self.main_fifo: + del self.main_fifo[obj_id] + del self.main_freq[obj_id] + removed = True + return removed + +def cache_init_hook(common_cache_params: CommonCacheParams): + return StandaloneS3FIFO(cache_size=common_cache_params.cache_size) + +def cache_hit_hook(cache, request: Request): + cache.cache_hit(request.obj_id) + +def cache_miss_hook(cache, request: Request): + cache.cache_miss(request.obj_id, request.obj_size) + +def cache_eviction_hook(cache, request: Request): + # NOTE(haocheng): never called + pass 
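+    # Never triggered in this example: the standalone S3-FIFO evicts
+    # internally (insert_to_small/insert_to_main call cache_evict_small/
+    # cache_evict_main once a queue fills), so this hook has nothing to do.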
+ +def cache_remove_hook(cache, obj_id): + cache.cache_remove(obj_id) + +def cache_free_hook(cache): + cache.small_fifo.clear() + cache.small_freq.clear() + cache.ghost_fifo.clear() + cache.ghost_freq.clear() + cache.main_fifo.clear() + cache.main_freq.clear() + +cache = PluginCache( + cache_size=1024*1024, + cache_init_hook=cache_init_hook, + cache_hit_hook=cache_hit_hook, + cache_miss_hook=cache_miss_hook, + cache_eviction_hook=cache_eviction_hook, + cache_remove_hook=cache_remove_hook, + cache_free_hook=cache_free_hook, + cache_name="S3FIFO") + +ref_s3fifo = S3FIFO(cache_size=1024) + +reader = SyntheticReader( + num_of_req=1000000, + num_objects=100, + obj_size=1, + seed=42, + alpha=0.8, + dist="zipf", +) + +for req in reader: + plugin_hit = cache.get(req) + ref_hit = ref_s3fifo.get(req) + assert plugin_hit == ref_hit, f"Cache hit mismatch: {plugin_hit} != {ref_hit}" + +print("All requests processed successfully. Plugin cache matches reference S3FIFO cache.") \ No newline at end of file diff --git a/examples/python_hook_cache_example.py b/examples/python_hook_cache_example.py deleted file mode 100644 index fa309d4..0000000 --- a/examples/python_hook_cache_example.py +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env python3 -""" -Example demonstrating how to create custom cache policies using Python hooks. - -This example shows how to implement LRU and FIFO cache policies using the -PluginCache class, which allows users to define cache behavior using -pure Python functions instead of C/C++ plugins. -""" - -import libcachesim as lcs -from collections import OrderedDict, deque -from contextlib import suppress - - -class LRUPolicy: - """LRU (Least Recently Used) cache policy implementation.""" - - def __init__(self, cache_size): - self.cache_size = cache_size - self.access_order = OrderedDict() # obj_id -> True (for ordering) - - def on_hit(self, obj_id, obj_size): - """Move accessed object to end (most recent).""" - if obj_id in self.access_order: - # Move to end (most recent) - self.access_order.move_to_end(obj_id) - - def on_miss(self, obj_id, obj_size): - """Add new object to end (most recent).""" - self.access_order[obj_id] = True - - def evict(self, obj_id, obj_size): - """Return the least recently used object ID.""" - if self.access_order: - # Return first item (least recent) - victim_id = next(iter(self.access_order)) - return victim_id - raise RuntimeError("No objects to evict") - - def on_remove(self, obj_id): - """Remove object from tracking.""" - self.access_order.pop(obj_id, None) - - -class FIFOPolicy: - """FIFO (First In First Out) cache policy implementation.""" - - def __init__(self, cache_size): - self.cache_size = cache_size - self.insertion_order = deque() # obj_id queue - - def on_hit(self, obj_id, obj_size): - """FIFO doesn't change order on hits.""" - pass - - def on_miss(self, obj_id, obj_size): - """Add new object to end of queue.""" - self.insertion_order.append(obj_id) - - def evict(self, obj_id, obj_size): - """Return the first inserted object ID.""" - if self.insertion_order: - victim_id = self.insertion_order.popleft() - return victim_id - raise RuntimeError("No objects to evict") - - def on_remove(self, obj_id): - """Remove object from tracking.""" - with suppress(ValueError): - self.insertion_order.remove(obj_id) - - -def create_lru_cache(cache_size): - """Create an LRU cache using Python hooks.""" - cache = lcs.PluginCache(cache_size, "PythonLRU") - - def init_hook(cache_size): - return LRUPolicy(cache_size) - - def hit_hook(policy, obj_id, obj_size): - 
policy.on_hit(obj_id, obj_size) - - def miss_hook(policy, obj_id, obj_size): - policy.on_miss(obj_id, obj_size) - - def eviction_hook(policy, obj_id, obj_size): - return policy.evict(obj_id, obj_size) - - def remove_hook(policy, obj_id): - policy.on_remove(obj_id) - - def free_hook(policy): - # Python garbage collection handles cleanup - pass - - cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook, free_hook) - return cache - - -def create_fifo_cache(cache_size): - """Create a FIFO cache using Python hooks.""" - cache = lcs.PluginCache(cache_size, "PythonFIFO") - - def init_hook(cache_size): - return FIFOPolicy(cache_size) - - def hit_hook(policy, obj_id, obj_size): - policy.on_hit(obj_id, obj_size) - - def miss_hook(policy, obj_id, obj_size): - policy.on_miss(obj_id, obj_size) - - def eviction_hook(policy, obj_id, obj_size): - return policy.evict(obj_id, obj_size) - - def remove_hook(policy, obj_id): - policy.on_remove(obj_id) - - cache.set_hooks(init_hook, hit_hook, miss_hook, eviction_hook, remove_hook) - return cache - - -def test_cache_policy(cache, name): - """Test a cache policy with sample requests.""" - print(f"\n=== Testing {name} Cache ===") - - # Test requests: obj_id, obj_size - test_requests = [ - (1, 100), - (2, 100), - (3, 100), - (4, 100), - (5, 100), # Fill cache - (1, 100), # Hit - (6, 100), # Miss, should evict something - (2, 100), # Hit or miss depending on policy - (7, 100), # Miss, should evict something - ] - - hits = 0 - misses = 0 - - for obj_id, obj_size in test_requests: - req = lcs.Request() - req.obj_id = obj_id - req.obj_size = obj_size - - hit = cache.get(req) - if hit: - hits += 1 - print(f"Request {obj_id}: HIT") - else: - misses += 1 - print(f"Request {obj_id}: MISS") - - print(f"Total: {hits} hits, {misses} misses") - print(f"Cache stats: {cache.n_obj} objects, {cache.occupied_byte} bytes occupied") - - -def main(): - """Main example function.""" - cache_size = 500 # Bytes (can hold 5 objects of size 100 each) - - # Test LRU cache - lru_cache = create_lru_cache(cache_size) - test_cache_policy(lru_cache, "LRU") - - # Test FIFO cache - fifo_cache = create_fifo_cache(cache_size) - test_cache_policy(fifo_cache, "FIFO") - - print("\n=== Comparison ===") - print("LRU keeps recently accessed items, evicting least recently used") - print("FIFO keeps items in insertion order, evicting oldest inserted") - - -if __name__ == "__main__": - main() diff --git a/examples/stream_request_example.py b/examples/stream_request_example.py deleted file mode 100644 index eed213b..0000000 --- a/examples/stream_request_example.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python3 -""" -Example: Using stream request generators for cache simulation. - -This example demonstrates how to use the stream request generators -to create synthetic traces and run cache simulations without creating -temporary files. -""" - -import libcachesim as lcs - - -def main(): - """Demonstrate stream request generators.""" - print("libCacheSim Stream Request Generation Example") - print("=" * 50) - - # Example 1: Basic Zipf generation with appropriate cache size - print("\n1. 
Basic Zipf Request Generation") - print("-" * 30) - - # Use reasonable cache and object sizes - cache_size = 50 * 1024 * 1024 # 50MB cache - obj_size = 1024 # 1KB objects - num_objects = 1000 - num_requests = 10000 - - # Create a cache - cache = lcs.LRU(cache_size=cache_size) - - # Create a Zipf-distributed request generator - zipf_generator = lcs.create_zipf_requests( - num_objects=num_objects, - num_requests=num_requests, - alpha=1.0, # Zipf skewness - obj_size=obj_size, # Object size in bytes - seed=42, # For reproducibility - ) - - print(f"Cache size: {cache_size // 1024 // 1024}MB") - print(f"Object size: {obj_size}B") - print(f"Generated {num_requests} Zipf requests for {num_objects} objects") - - # Process the requests directly - hit_count = 0 - for i, req in enumerate(zipf_generator): - if cache.get(req): - hit_count += 1 - - # Print progress every 2000 requests - if (i + 1) % 2000 == 0: - current_hit_ratio = hit_count / (i + 1) - print(f"Processed {i + 1} requests, hit ratio: {current_hit_ratio:.3f}") - - final_hit_ratio = hit_count / num_requests - print(f"Final hit ratio: {final_hit_ratio:.3f}") - - # Example 2: Uniform distribution comparison - print("\n2. Uniform Request Generation") - print("-" * 30) - - # Create a uniform-distributed request generator - uniform_generator = lcs.create_uniform_requests( - num_objects=num_objects, num_requests=num_requests, obj_size=obj_size, seed=42 - ) - - print(f"Generated {num_requests} uniform requests for {num_objects} objects") - - # Reset cache and process uniform requests - cache = lcs.LRU(cache_size=cache_size) - hit_count = 0 - - for i, req in enumerate(uniform_generator): - if cache.get(req): - hit_count += 1 - - if (i + 1) % 2000 == 0: - current_hit_ratio = hit_count / (i + 1) - print(f"Processed {i + 1} requests, hit ratio: {current_hit_ratio:.3f}") - - final_hit_ratio = hit_count / num_requests - print(f"Final hit ratio: {final_hit_ratio:.3f}") - - # Example 3: Compare different Zipf alpha values - print("\n3. Zipf Alpha Parameter Comparison") - print("-" * 30) - - alphas = [0.5, 1.0, 1.5, 2.0] - print(f"{'Alpha':<8} {'Hit Ratio':<12} {'Description'}") - print("-" * 40) - - for alpha in alphas: - generator = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=alpha, obj_size=obj_size, seed=42 - ) - - cache = lcs.LRU(cache_size=cache_size) - hit_count = sum(1 for req in generator if cache.get(req)) - hit_ratio = hit_count / num_requests - - # Describe the skewness - if alpha < 0.8: - description = "Low skew (nearly uniform)" - elif alpha < 1.2: - description = "Classic Zipf" - elif alpha < 1.8: - description = "High skew" - else: - description = "Very high skew" - - print(f"{alpha:<8.1f} {hit_ratio:<12.3f} {description}") - - # Example 4: Cache size sensitivity - print("\n4. 
Cache Size Sensitivity") - print("-" * 30) - - # Fixed workload - generator = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=1.0, obj_size=obj_size, seed=42 - ) - - cache_sizes = [ - 1 * 1024 * 1024, # 1MB - 5 * 1024 * 1024, # 5MB - 10 * 1024 * 1024, # 10MB - 50 * 1024 * 1024, # 50MB - ] - - print(f"{'Cache Size':<12} {'Hit Ratio':<12} {'Objects Fit'}") - print("-" * 36) - - for cache_size in cache_sizes: - cache = lcs.LRU(cache_size=cache_size) - - # Create fresh generator for each test - test_generator = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=1.0, obj_size=obj_size, seed=42 - ) - - hit_count = sum(1 for req in test_generator if cache.get(req)) - hit_ratio = hit_count / num_requests - objects_fit = cache_size // obj_size - - print(f"{cache_size // 1024 // 1024}MB{'':<8} {hit_ratio:<12.3f} ~{objects_fit}") - - print("\nNotes:") - print("- Higher α values create more skewed access patterns") - print("- Skewed patterns generally have higher hit ratios") - print("- Cache size affects performance, but beyond a point diminishing returns") - print(f"- Working set: {num_objects} objects × {obj_size}B = {num_objects * obj_size // 1024}KB") - - -if __name__ == "__main__": - main() diff --git a/examples/zipf_trace_example.py b/examples/zipf_trace_example.py deleted file mode 100644 index 662ae0f..0000000 --- a/examples/zipf_trace_example.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python3 -""" -Example demonstrating trace generation and cache simulation in libCacheSim Python bindings. - -This example shows how to: -1. Generate synthetic request traces using available APIs -2. Use the generated traces with cache simulations -3. Compare different algorithms and parameters -""" - -import libcachesim as lcs - - -def example_basic_trace_generation(): - """Basic example of generating synthetic traces.""" - print("=== Basic Synthetic Trace Generation ===") - - # Generate Zipf requests using available API - num_objects = 1000 - num_requests = 10000 - alpha = 1.0 - obj_size = 1024 # 1KB objects - - # Create Zipf-distributed requests - zipf_requests = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=alpha, obj_size=obj_size, seed=42 - ) - - print(f"Generated {num_requests} Zipf requests with α={alpha}") - print(f"Object size: {obj_size}B, Number of unique objects: {num_objects}") - - # Use the requests with a cache - cache = lcs.LRU(cache_size=50 * 1024 * 1024) # 50MB cache - hit_count = sum(1 for req in zipf_requests if cache.get(req)) - hit_ratio = hit_count / num_requests - print(f"LRU cache hit ratio: {hit_ratio:.4f}") - - return hit_ratio - - -def example_compare_zipf_parameters(): - """Compare different Zipf parameters.""" - print("\n=== Comparing Zipf Parameters ===") - - num_objects = 1000 - num_requests = 10000 - cache_size = 50 * 1024 * 1024 # 50MB - obj_size = 1024 # 1KB objects - - alphas = [0.5, 1.0, 1.5, 2.0] - results = {} - - print(f"{'Alpha':<8} {'LRU':<8} {'FIFO':<8} {'ARC':<8} {'Clock':<8}") - print("-" * 40) - - for alpha in alphas: - # Test with different cache policies - policies = { - "LRU": lcs.LRU(cache_size), - "FIFO": lcs.FIFO(cache_size), - "ARC": lcs.ARC(cache_size), - "Clock": lcs.Clock(cache_size), - } - - results[alpha] = {} - hit_ratios = [] - for name, cache in policies.items(): - # Create fresh request iterator for each cache - test_requests = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=alpha, 
obj_size=obj_size, seed=42 - ) - hit_count = sum(1 for req in test_requests if cache.get(req)) - hit_ratio = hit_count / num_requests - results[alpha][name] = hit_ratio - hit_ratios.append(f"{hit_ratio:.3f}") - - print(f"{alpha:<8.1f} {hit_ratios[0]:<8} {hit_ratios[1]:<8} {hit_ratios[2]:<8} {hit_ratios[3]:<8}") - - return results - - -def example_algorithm_comparison(): - """Compare different cache algorithms.""" - print("\n=== Cache Algorithm Comparison ===") - - # Fixed workload parameters - num_objects = 1000 - num_requests = 10000 - alpha = 1.0 - obj_size = 1024 - cache_size = 10 * 1024 * 1024 # 10MB - - # Available algorithms - algorithms = { - "LRU": lcs.LRU, - "FIFO": lcs.FIFO, - "ARC": lcs.ARC, - "Clock": lcs.Clock, - "S3FIFO": lcs.S3FIFO, - "Sieve": lcs.Sieve, - } - - print(f"Testing with: {num_objects} objects, {num_requests} requests") - print(f"Cache size: {cache_size // 1024 // 1024}MB, Object size: {obj_size}B") - print(f"Zipf alpha: {alpha}") - print() - - print(f"{'Algorithm':<10} {'Hit Ratio':<12} {'Description'}") - print("-" * 45) - - results = {} - for name, cache_class in algorithms.items(): - try: - # Create fresh requests for each algorithm - requests = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=alpha, obj_size=obj_size, seed=42 - ) - - cache = cache_class(cache_size) - hit_count = sum(1 for req in requests if cache.get(req)) - hit_ratio = hit_count / num_requests - results[name] = hit_ratio - - # Add descriptions - descriptions = { - "LRU": "Least Recently Used", - "FIFO": "First In First Out", - "ARC": "Adaptive Replacement Cache", - "Clock": "Clock/Second Chance", - "S3FIFO": "Simple Scalable FIFO", - "Sieve": "Lazy Promotion", - } - - print(f"{name:<10} {hit_ratio:<12.4f} {descriptions.get(name, '')}") - - except Exception as e: - print(f"{name:<10} {'ERROR':<12} {str(e)}") - - return results - - -def example_uniform_vs_zipf(): - """Compare uniform vs Zipf distributions.""" - print("\n=== Uniform vs Zipf Distribution Comparison ===") - - num_objects = 1000 - num_requests = 10000 - obj_size = 1024 - cache_size = 10 * 1024 * 1024 - - # Test uniform distribution - uniform_requests = lcs.create_uniform_requests( - num_objects=num_objects, num_requests=num_requests, obj_size=obj_size, seed=42 - ) - - cache = lcs.LRU(cache_size) - uniform_hits = sum(1 for req in uniform_requests if cache.get(req)) - uniform_hit_ratio = uniform_hits / num_requests - - # Test Zipf distribution - zipf_requests = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=1.0, obj_size=obj_size, seed=42 - ) - - cache = lcs.LRU(cache_size) - zipf_hits = sum(1 for req in zipf_requests if cache.get(req)) - zipf_hit_ratio = zipf_hits / num_requests - - print(f"{'Distribution':<12} {'Hit Ratio':<12} {'Description'}") - print("-" * 45) - print(f"{'Uniform':<12} {uniform_hit_ratio:<12.4f} {'All objects equally likely'}") - print(f"{'Zipf (α=1.0)':<12} {zipf_hit_ratio:<12.4f} {'Some objects much more popular'}") - - print( - f"\nObservation: Zipf typically shows{'higher' if zipf_hit_ratio > uniform_hit_ratio else 'lower'} hit ratios" - ) - print("due to locality of reference (hot objects get cached)") - - -def example_cache_size_analysis(): - """Analyze the effect of different cache sizes.""" - print("\n=== Cache Size Sensitivity Analysis ===") - - num_objects = 1000 - num_requests = 10000 - alpha = 1.0 - obj_size = 1024 - - cache_sizes = [ - 1 * 1024 * 1024, # 1MB - 5 * 1024 * 1024, # 5MB - 10 * 1024 * 1024, # 10MB - 25 * 1024 * 
1024, # 25MB - 50 * 1024 * 1024, # 50MB - ] - - print(f"{'Cache Size':<12} {'Objects Fit':<12} {'Hit Ratio':<12} {'Efficiency'}") - print("-" * 55) - - for cache_size in cache_sizes: - requests = lcs.create_zipf_requests( - num_objects=num_objects, num_requests=num_requests, alpha=alpha, obj_size=obj_size, seed=42 - ) - - cache = lcs.LRU(cache_size) - hit_count = sum(1 for req in requests if cache.get(req)) - hit_ratio = hit_count / num_requests - objects_fit = cache_size // obj_size - efficiency = hit_ratio / (cache_size / (1024 * 1024)) # hit ratio per MB - - print(f"{cache_size // 1024 // 1024}MB{'':<8} {objects_fit:<12} {hit_ratio:<12.4f} {efficiency:<12.4f}") - - -def main(): - """Run all examples.""" - print("libCacheSim Python Bindings - Trace Generation Examples") - print("=" * 60) - - try: - # Run examples - example_basic_trace_generation() - example_compare_zipf_parameters() - example_algorithm_comparison() - example_uniform_vs_zipf() - example_cache_size_analysis() - - print("\n" + "=" * 60) - print("All examples completed successfully!") - print("\nKey Takeaways:") - print("• Higher Zipf α values create more skewed access patterns") - print("• Skewed patterns generally result in higher cache hit ratios") - print("• Different algorithms perform differently based on workload") - print("• Cache size has diminishing returns beyond working set size") - - except Exception as e: - print(f"Error running examples: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/libcachesim/__init__.py b/libcachesim/__init__.py index bd194bf..c9fc1e7 100644 --- a/libcachesim/__init__.py +++ b/libcachesim/__init__.py @@ -6,6 +6,7 @@ Cache, Request, ReqOp, + ReaderInitParam, TraceType, SamplerType, AnalysisParam, @@ -65,6 +66,7 @@ "Cache", "Request", "ReqOp", + "ReaderInitParam", "TraceType", "SamplerType", "AnalysisParam", diff --git a/libcachesim/protocols.py b/libcachesim/protocols.py index 58eeddb..74a45f8 100644 --- a/libcachesim/protocols.py +++ b/libcachesim/protocols.py @@ -23,7 +23,7 @@ class ReaderProtocol(Protocol): """ def get_num_of_req(self) -> int: ... - def read_one_req(self, req: Request) -> Request: ... + def read_one_req(self) -> Request: ... def skip_n_req(self, n: int) -> int: ... def reset(self) -> None: ... def close(self) -> None: ... 
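
For reference, the `ReaderProtocol` change above moves `Request` allocation from the caller into the reader, and the reader diffs below update the implementations to match. A minimal sketch of the call-site difference (`reader` is any object implementing `ReaderProtocol`):

```python
# Old protocol: the caller pre-allocates a Request and passes it in
req = Request()
req = reader.read_one_req(req)

# New protocol: the reader allocates and returns the Request
req = reader.read_one_req()

# End-of-trace handling differs by reader: SyntheticReader returns a
# Request with req.valid == False, while TraceReader raises RuntimeError.
```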
diff --git a/libcachesim/synthetic_reader.py b/libcachesim/synthetic_reader.py
index 16f8a10..b429242 100644
--- a/libcachesim/synthetic_reader.py
+++ b/libcachesim/synthetic_reader.py
@@ -85,11 +85,12 @@ def obj_ids(self) -> np.ndarray:
     def get_num_of_req(self) -> int:
         return self.num_of_req
 
-    def read_one_req(self, req: Request) -> Request:
+    def read_one_req(self) -> Request:
         """Read one request and fill Request object"""
+        req = Request()
         if self.current_pos >= self.num_of_req:
             req.valid = False
-            return req
+            return req  # return invalid request
 
         obj_id = self.obj_ids[self.current_pos]
         req.obj_id = obj_id
@@ -194,8 +195,7 @@ def __next__(self) -> Request:
         if self.current_pos >= self.num_of_req:
             raise StopIteration
 
-        req = Request()
-        return self.read_one_req(req)
+        return self.read_one_req()
 
     def __getitem__(self, index: int) -> Request:
         """Support index access"""
diff --git a/libcachesim/trace_reader.py b/libcachesim/trace_reader.py
index 8d70741..20a2aba 100644
--- a/libcachesim/trace_reader.py
+++ b/libcachesim/trace_reader.py
@@ -167,8 +167,12 @@ def read_direction(self) -> ReadDirection:
     def get_num_of_req(self) -> int:
         return self._reader.get_num_of_req()
 
-    def read_one_req(self, req: Request) -> Request:
-        return self._reader.read_one_req(req)
+    def read_one_req(self) -> Request:
+        req = Request()
+        ret = self._reader.read_one_req(req)  # 0 on success; the binding raises RuntimeError on failure
+        if ret != 0:
+            raise RuntimeError("Failed to read one request")
+        return req
 
     def reset(self) -> None:
         self._reader.reset()
@@ -198,19 +202,26 @@ def set_read_pos(self, pos: float) -> None:
         self._reader.set_read_pos(pos)
 
     def __iter__(self) -> Iterator[Request]:
-        return self._reader.__iter__()
+        self._reader.reset()
+        return self
 
     def __len__(self) -> int:
         return self._reader.get_num_of_req()
 
     def __next__(self) -> Request:
-        if self._reader.n_req_left == 0:
+        req = Request()
+        try:
+            self._reader.read_one_req(req)
+        except RuntimeError:
+            # the binding raises RuntimeError when no request is left
             raise StopIteration
-        return self._reader.read_one_req()
+        return req
 
     def __getitem__(self, index: int) -> Request:
         if index < 0 or index >= self._reader.get_num_of_req():
             raise IndexError("Index out of range")
         self._reader.reset()
         self._reader.skip_n_req(index)
-        return self._reader.read_one_req()
+        req = Request()
+        self._reader.read_one_req(req)
+        return req
diff --git a/src/export_reader.cpp b/src/export_reader.cpp
index 5eb6ce5..8f286f3 100644
--- a/src/export_reader.cpp
+++ b/src/export_reader.cpp
@@ -264,7 +264,7 @@ void export_reader(py::module& m) {
           if (ret != 0) {
             throw std::runtime_error("Failed to read request");
           }
-          return req;
+          return ret;
         },
         "req"_a)
     .def("reset", [](reader_t& self) { reset_reader(&self); })
diff --git a/tests/test_reader.py b/tests/test_reader.py
index bb3cb7b..688217a 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -30,7 +30,7 @@ def test_zipf_distribution(self):
 
         # Read some requests and verify they are valid
         req = Request()
-        first_req = reader.read_one_req(req)
+        first_req = reader.read_one_req()
         assert first_req.obj_id >= 0
         assert first_req.obj_size == 1024
         assert hasattr(first_req, "op")  # Just check it has op attribute
@@ -44,7 +44,7 @@ def test_uniform_distribution(self):
         # Read some requests
         req = Request()
         for _ in range(10):
-            read_req = reader.read_one_req(req)
+            read_req = reader.read_one_req()
             assert read_req.obj_size == 512
             assert hasattr(read_req, "op")  # Just check it has op attribute
 
@@ -68,13 +68,13 @@ def test_reader_reset(self):
 
         # Read some requests
         req = Request()
-        first_read = reader.read_one_req(req)
-        reader.read_one_req(req)
-        reader.read_one_req(req)
+        first_read = reader.read_one_req()
+        reader.read_one_req()
+        reader.read_one_req()
 
         # Reset and read again
         reader.reset()
-        reset_read = reader.read_one_req(req)
+        reset_read = reader.read_one_req()
 
         # Should get the same first request after reset
         assert first_read.obj_id == reset_read.obj_id
@@ -89,7 +89,7 @@ def test_skip_requests(self):
 
         # Verify we can still read remaining requests
         req = Request()
-        read_req = reader.read_one_req(req)
+        read_req = reader.read_one_req()
         assert read_req.valid == True  # Should still be able to read
 
     def test_clone_reader(self):
@@ -98,8 +98,8 @@ def test_clone_reader(self):
 
         # Read some requests
         req = Request()
-        reader.read_one_req(req)
-        reader.read_one_req(req)
+        reader.read_one_req()
+        reader.read_one_req()
 
         # Clone the reader
         cloned_reader = reader.clone()
@@ -160,7 +160,7 @@ def test_csv_trace_creation(self):
 
             # Read first request
             req = Request()
-            first_req = reader.read_one_req(req)
+            first_req = reader.read_one_req()
             assert first_req.obj_id == 100
             assert first_req.obj_size == 1024
 
@@ -192,11 +192,11 @@ def test_trace_reader_iteration(self):
 
             # Read requests one by one instead of using list()
             req = Request()
-            first_req = reader.read_one_req(req)
+            first_req = reader.read_one_req()
             assert first_req.obj_id == 100
             assert first_req.obj_size == 1024
 
-            second_req = reader.read_one_req(req)
+            second_req = reader.read_one_req()
             assert second_req.obj_id == 101
             assert second_req.obj_size == 2048
 
@@ -226,21 +226,21 @@ def test_trace_reader_reset_and_skip(self):
 
             # Read some requests
             req = Request()
-            first_req = reader.read_one_req(req)
-            reader.read_one_req(req)
+            first_req = reader.read_one_req()
+            reader.read_one_req()
 
             # Reset and verify we get same first request
             reader.reset()
-            reset_req = reader.read_one_req(req)
+            reset_req = reader.read_one_req()
             assert first_req.obj_id == reset_req.obj_id
 
             # Test skip functionality
             reader.reset()
             # Instead of using skip_n_req which might fail, just read requests one by one
             for _ in range(5):
-                reader.read_one_req(req)
+                reader.read_one_req()
 
-            next_req = reader.read_one_req(req)
+            next_req = reader.read_one_req()
             assert next_req.obj_id == 105  # Should be 6th request (100+5)
 
         finally:
@@ -276,7 +276,7 @@ def test_trace_reader_sampling(self):
 
             # Read a few requests to verify it works
             req = Request()
-            first_req = reader.read_one_req(req)
+            first_req = reader.read_one_req()
             assert first_req.valid == True
 
         finally:
@@ -411,8 +411,8 @@ def test_request_format_consistency(self):
 
         # Get requests from both readers
         req = Request()
-        synthetic_req = synthetic_reader.read_one_req(req)
-        trace_req = trace_reader.read_one_req(req)
+        synthetic_req = synthetic_reader.read_one_req()
+        trace_req = trace_reader.read_one_req()
 
         # Both should produce Request objects with same attributes
         assert hasattr(synthetic_req, "obj_id")

From a51375c133f0af30672018162fa7f16d279c9c66 Mon Sep 17 00:00:00 2001
From: haochengxia
Date: Sat, 2 Aug 2025 02:16:47 +0000
Subject: [PATCH 2/3] Add code type

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0a6afc5..4cb6202 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,7 @@ You need to implement these callback functions:
 An example for LRU
 
-```
+```python
 from collections import OrderedDict
 from libcachesim import PluginCache, CommonCacheParams, Request, SyntheticReader, LRU

From 211d4717c52d78997dc7a3df0f3790a51f7dfe2a Mon Sep 17 00:00:00 2001
From: haochengxia
Date: Sat, 2 Aug 2025 23:35:52 +0000
Subject: [PATCH 3/3] Fix s3fifo plugin cache

---
 README.md                       |   6 +-
 docs/src/en/index.md            |   2 +-
 docs/src/zh/index.md            |   2 +-
 examples/plugin_cache/lru.py    |   2 +-
 examples/plugin_cache/s3fifo.py | 281 ++++++++++++++++++---------------
 libcachesim/cache.py            |   2 +-
 src/export_cache.cpp            |   2 +-
 7 files changed, 155 insertions(+), 142 deletions(-)

diff --git a/README.md b/README.md
index 4cb6202..462bb65 100644
--- a/README.md
+++ b/README.md
@@ -76,11 +76,7 @@ Implement custom cache replacement algorithms using pure Python functions - **no
 
 ### Python Hook Cache Overview
 
-The `PluginCache` allows you to define custom caching behavior through Python callback functions without without any C/C++ compilation.
-
-### Hook Functions
-
-You need to implement these callback functions:
+The `PluginCache` allows you to define custom caching behavior through Python callback functions. You need to implement these callback functions:
 
 | Function | Signature | Description |
 |----------|-----------|-------------|
diff --git a/docs/src/en/index.md b/docs/src/en/index.md
index 0b0e732..2eba51f 100644
--- a/docs/src/en/index.md
+++ b/docs/src/en/index.md
@@ -65,4 +65,4 @@ We welcome contributions! Please see our [GitHub repository](https://github.com/
 
 ## License
 
-This project is licensed under the Apache License 2.0.
+This project is licensed under the GPL-3.0 License.
diff --git a/docs/src/zh/index.md b/docs/src/zh/index.md
index d900ad6..997399a 100644
--- a/docs/src/zh/index.md
+++ b/docs/src/zh/index.md
@@ -65,4 +65,4 @@ pip install -e .
 
 ## 许可证
 
-本项目采用 Apache License 2.0 许可证。
+本项目采用 GPL-3.0 许可证。
diff --git a/examples/plugin_cache/lru.py b/examples/plugin_cache/lru.py
index 04940b3..da17836 100644
--- a/examples/plugin_cache/lru.py
+++ b/examples/plugin_cache/lru.py
@@ -62,7 +62,7 @@ def cache_free_hook(cache):
 
 reader = SyntheticReader(
     num_of_req=100000,
-    num_objects=100,
+    num_objects=10000,
     obj_size=100,
     seed=42,
     alpha=0.8,
diff --git a/examples/plugin_cache/s3fifo.py b/examples/plugin_cache/s3fifo.py
index 576d841..829710d 100644
--- a/examples/plugin_cache/s3fifo.py
+++ b/examples/plugin_cache/s3fifo.py
@@ -1,6 +1,17 @@
 # An example of plugin for s3fifo
+
+# NOTE(haocheng): this example shows that, with the plugin system, caches can be composed like Lego blocks
+# Happy caching!
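+#
+# The idea, roughly: a small FIFO admits newly seen objects, a main FIFO keeps
+# objects that were re-referenced at least `move_to_main_threshold` times, and
+# a ghost FIFO remembers recently evicted object IDs so that a re-requested
+# object can be promoted directly into the main queue.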
+
+import libcachesim as lcs
 from collections import OrderedDict
-from libcachesim import PluginCache, CommonCacheParams, Request, S3FIFO, SyntheticReader
+from collections import deque
+from libcachesim import PluginCache, CommonCacheParams, Request, S3FIFO, FIFO, SyntheticReader
 
 # NOTE(haocheng): we only support ignore object size for now
 class StandaloneS3FIFO:
@@ -9,160 +20,157 @@ def __init__(self,
                  ghost_size_ratio: float = 0.9,
                  move_to_main_threshold: int = 2,
                  cache_size: int = 1024):
-        # S3-FIFO uses three queues with OrderedDict for O(1) operations
-        self.small_fifo = OrderedDict()
-        self.main_fifo = OrderedDict()
-        self.ghost_fifo = OrderedDict()
-
-        # Size limits
-        self.small_max_size = int(small_size_ratio * cache_size)
-        self.main_max_size = int(cache_size - small_size_ratio * cache_size)
-        self.ghost_max_size = int(ghost_size_ratio * cache_size)
+        self.cache_size = cache_size
+        small_fifo_size = int(small_size_ratio * cache_size)
+        main_fifo_size = cache_size - small_fifo_size
+        ghost_fifo_size = int(ghost_size_ratio * cache_size)
+
+        self.small_set = set()
+        self.main_set = set()
+        self.ghost_set = deque(maxlen=ghost_fifo_size)
+
+        self.small_fifo = FIFO(small_fifo_size)
+        self.main_fifo = FIFO(main_fifo_size)
+        self.ghost_fifo = FIFO(ghost_fifo_size)
 
         # Frequency tracking
-        self.small_freq = {}
-        self.main_freq = {}
-        self.ghost_freq = {}
+        self.freq = {}
 
         # Other parameters
         self.max_freq = 3
         self.move_to_main_threshold = move_to_main_threshold
 
-    def cache_hit(self, obj_id):
-        """
-        Cache hit can happen in two cases:
-        1. Small FIFO cache hit (small_fifo)
-        2. Main FIFO cache hit (main_fifo)
-        """
-        if obj_id in self.main_fifo:
-            self.main_freq[obj_id] += 1
-        elif obj_id in self.small_fifo:
-            self.small_freq[obj_id] += 1
-        else:
-            print(f"Cache hit for obj_id {obj_id} but not found in any queue")
-            print(f"small_fifo: {list(self.small_fifo.keys())}")
-            print(f"main_fifo: {list(self.main_fifo.keys())}")
-            print(f"ghost_fifo: {list(self.ghost_fifo.keys())}")
-            assert False, "Cache hit should happen in small_fifo or main_fifo"
+        self.has_evicted = False  # Set once the cache fills up and the first real eviction happens
+        self.hit_on_ghost = False
+
+    def cache_hit(self, req: Request):
+        # A hit may come from either queue; bump the object's frequency in
+        # whichever FIFO currently holds it
+        if self.small_fifo.find(req, update_cache=False):
+            self.freq[req.obj_id] += 1
+
+        if self.main_fifo.find(req, update_cache=False):
+            self.freq[req.obj_id] += 1
 
-    def cache_miss(self, obj_id, obj_size=1):
-        """
-        Cache miss can happen in three cases:
-        1. Miss in small and main but hit in ghost
-        2. Miss all three queues
-        """
-        if obj_id in self.ghost_fifo:
-            del self.ghost_fifo[obj_id]
-            del self.ghost_freq[obj_id]
-            self.insert_to_main(obj_id)
+    def cache_miss(self, req: Request):
+        if not self.hit_on_ghost:
+            obj = self.ghost_fifo.find(req, update_cache=False)
+            if obj is not None:
+                self.hit_on_ghost = True
+                # remove from ghost set
+                self.ghost_fifo.remove(req.obj_id)
+                self.ghost_set.remove(req.obj_id)
+
+
+        # NOTE(haocheng): first check whether this missed object still has a record in the ghost queue
+        if not self.hit_on_ghost:
+            if req.obj_size >= self.small_fifo.cache_size:
+                # Objects larger than the small FIFO are not admitted
+                return
+
+            # During warm-up (before the first eviction), once the small FIFO
+            # is full, new objects go directly into the main FIFO
+            if not self.has_evicted and self.small_fifo.get_occupied_byte() >= self.small_fifo.cache_size:
+                obj = self.main_fifo.insert(req)
+                self.main_set.add(obj.obj_id)
+            else:
+                obj = self.small_fifo.insert(req)
+                self.small_set.add(obj.obj_id)
         else:
-            # Miss all three queues
-            cond = (obj_id not in self.small_fifo) and (obj_id not in self.main_fifo)
-            assert cond, "Should not be in small_fifo or main_fifo"
-
-            # Then we need to insert to small fifo queue
-            self.insert_to_small(obj_id)
-
-    def insert_to_small(self, obj_id):
-        if len(self.small_fifo) >= self.small_max_size:
-            self.cache_evict_small()
-        self.small_fifo[obj_id] = None  # OrderedDict value doesn't matter
-        self.small_freq[obj_id] = 0
-
-    def insert_to_main(self, obj_id):
-        if len(self.main_fifo) >= self.main_max_size:
-            self.cache_evict_main()
-        self.main_fifo[obj_id] = None
-        self.main_freq[obj_id] = 0
-
-    def insert_to_ghost(self, obj_id, original_freq=0):
-        if len(self.ghost_fifo) >= self.ghost_max_size:
-            # Remove oldest item
-            oldest_id = next(iter(self.ghost_fifo))
-            del self.ghost_fifo[oldest_id]
-            del self.ghost_freq[oldest_id]
-        self.ghost_fifo[obj_id] = None
-        self.ghost_freq[obj_id] = original_freq
+            obj = self.main_fifo.insert(req)
+            self.main_set.add(req.obj_id)
+        self.hit_on_ghost = False
+        self.freq[obj.obj_id] = 0
 
-    def cache_evict_small(self):
+    def cache_evict_small(self, req: Request):
         has_evicted = False
         evicted_id = None
-        while not has_evicted and len(self.small_fifo) > 0:
-            obj_to_evict = next(iter(self.small_fifo))  # Get first item
-            if self.small_freq[obj_to_evict] >= self.move_to_main_threshold:
-                # Move to main fifo cache (not real evict, just move)
-                del self.small_fifo[obj_to_evict]
-                del self.small_freq[obj_to_evict]
-                self.insert_to_main(obj_to_evict)
+        real_evicted_id = None
+        while not has_evicted and self.small_fifo.get_occupied_byte() > 0:
+            obj_to_evict = self.small_fifo.to_evict(req)
+            evicted_id = obj_to_evict.obj_id  # Store the ID before any operations
+            if self.freq[obj_to_evict.obj_id] >= self.move_to_main_threshold:
+                new_req = Request(obj_id=evicted_id, obj_size=1)
+                self.main_fifo.insert(new_req)
+                self.main_set.add(evicted_id)
+                # Reset frequency
+                self.freq[evicted_id] = 0
             else:
-                evicted_id = obj_to_evict
-                # Insert to ghost fifo cache (real evict)
-                del self.small_fifo[obj_to_evict]
-                del self.small_freq[obj_to_evict]
-                self.insert_to_ghost(obj_to_evict)
+                new_req = Request(obj_id=evicted_id, obj_size=1)
+                self.ghost_fifo.get(new_req)
+                self.ghost_set.append(evicted_id)
                 has_evicted = True
-        return evicted_id
+                real_evicted_id = evicted_id
+            flag = self.small_fifo.remove(evicted_id)
+            self.small_set.remove(evicted_id)
+            assert flag, "Should be able to remove"
+        return real_evicted_id
 
-    def cache_evict_main(self):
+    def cache_evict_main(self, req: Request):
         has_evicted = False
         evicted_id = None
-        while not has_evicted and len(self.main_fifo) > 0:
-            obj_to_evict = next(iter(self.main_fifo))  # Get first item
-            freq = self.main_freq[obj_to_evict]
+        while not has_evicted and self.main_fifo.get_occupied_byte() > 0:
+            obj_to_evict = self.main_fifo.to_evict(req)
+            assert obj_to_evict is not None
+            evicted_id = obj_to_evict.obj_id  # Store the ID before any operations
+            freq = self.freq[evicted_id]
             if freq >= 1:
                 # Reinsert with decremented frequency
-                del self.main_fifo[obj_to_evict]
-                del self.main_freq[obj_to_evict]
-                self.insert_to_main(obj_to_evict)
-                self.main_freq[obj_to_evict] = min(freq, self.max_freq) - 1
+                self.main_fifo.remove(evicted_id)
+                self.main_set.remove(evicted_id)
+                new_req = Request(obj_id=evicted_id, obj_size=1)
+                self.main_fifo.insert(new_req)
+                self.main_set.add(evicted_id)
+                self.freq[evicted_id] = min(freq, self.max_freq) - 1
             else:
-                evicted_id = obj_to_evict
-                # Real eviction
-                del self.main_fifo[obj_to_evict]
-                del self.main_freq[obj_to_evict]
+                flag = self.main_fifo.remove(evicted_id)
+                self.main_set.remove(evicted_id)
                 has_evicted = True
+                # a real eviction happened; the caller drops its frequency entry
         return evicted_id
 
-    def cache_evict(self):
-        evicted_id = None
-        # if main is full or small is empty, evict main
-        if len(self.main_fifo) >= self.main_max_size or len(self.small_fifo) == 0:
-            evicted_id = self.cache_evict_main()
-        # if small is not empty, evict small
+    def cache_evict(self, req: Request):
+        if not self.hit_on_ghost:
+            obj = self.ghost_fifo.find(req, update_cache=False)
+            if obj is not None:
+                self.hit_on_ghost = True
+                # remove from ghost set
+                self.ghost_fifo.remove(req.obj_id)
+                self.ghost_set.remove(req.obj_id)
+
+        self.has_evicted = True
+        cond = (self.main_fifo.get_occupied_byte() > self.main_fifo.cache_size)
+        if (cond or (self.small_fifo.get_occupied_byte() == 0)):
+            obj_id = self.cache_evict_main(req)
         else:
-            evicted_id = self.cache_evict_small()
-        if evicted_id is None:
-            assert False, "Should not be None"
-        return evicted_id
+            obj_id = self.cache_evict_small(req)
+
+        if obj_id is not None:
+            del self.freq[obj_id]
+
+        return obj_id
 
     def cache_remove(self, obj_id):
         removed = False
-        if obj_id in self.small_fifo:
-            del self.small_fifo[obj_id]
-            del self.small_freq[obj_id]
-            removed = True
-        elif obj_id in self.ghost_fifo:
-            del self.ghost_fifo[obj_id]
-            del self.ghost_freq[obj_id]
-            removed = True
-        elif obj_id in self.main_fifo:
-            del self.main_fifo[obj_id]
-            del self.main_freq[obj_id]
-            removed = True
+        removed |= self.small_fifo.remove(obj_id)
+        removed |= self.ghost_fifo.remove(obj_id)
+        removed |= self.main_fifo.remove(obj_id)
         return removed
 
 def cache_init_hook(common_cache_params: CommonCacheParams):
     return StandaloneS3FIFO(cache_size=common_cache_params.cache_size)
 
 def cache_hit_hook(cache, request: Request):
-    cache.cache_hit(request.obj_id)
+    cache.cache_hit(request)
 
 def cache_miss_hook(cache, request: Request):
-    cache.cache_miss(request.obj_id, request.obj_size)
+    cache.cache_miss(request)
 
 def cache_eviction_hook(cache, request: Request):
-    # NOTE(haocheng): never called
-    pass
+    evicted_id = None
+    while evicted_id is None:
+        evicted_id = cache.cache_evict(request)
+    return evicted_id
 
 def cache_remove_hook(cache, obj_id):
     cache.cache_remove(obj_id)
@@ -176,7 +184,7 @@ def cache_free_hook(cache):
-    cache.main_freq.clear()
+    cache.freq.clear()
 
 cache = PluginCache(
-    cache_size=1024*1024,
+    cache_size=1024,
     cache_init_hook=cache_init_hook,
     cache_hit_hook=cache_hit_hook,
     cache_miss_hook=cache_miss_hook,
@@ -185,20 +193,29 @@ def cache_free_hook(cache):
     cache_free_hook=cache_free_hook,
     cache_name="S3FIFO")
 
-ref_s3fifo = S3FIFO(cache_size=1024)
+URI = "cache_dataset_oracleGeneral/2007_msr/msr_hm_0.oracleGeneral.zst"
+dl = lcs.DataLoader()
+dl.load(URI)
 
-reader = SyntheticReader(
-    num_of_req=1000000,
-    num_objects=100,
-    obj_size=1,
-    seed=42,
-    alpha=0.8,
-    dist="zipf",
+# Step 2: Open trace and process efficiently
+reader = lcs.TraceReader(
+    trace = dl.get_cache_path(URI),
+    trace_type = lcs.TraceType.ORACLE_GENERAL_TRACE,
+    reader_init_params = lcs.ReaderInitParam(ignore_obj_size=True)
 )
 
-for req in reader:
-    plugin_hit = cache.get(req)
-    ref_hit = ref_s3fifo.get(req)
-    assert plugin_hit == ref_hit, f"Cache hit mismatch: {plugin_hit} != {ref_hit}"
+ref_s3fifo = S3FIFO(cache_size=1024, small_size_ratio=0.1, ghost_size_ratio=0.9, move_to_main_threshold=2)
+
+# for req in reader:
+#     hit = cache.get(req)
+#     ref_hit = ref_s3fifo.get(req)
+#     assert hit == ref_hit, f"Cache hit mismatch: {hit} != {ref_hit}"
+
+req_miss_ratio, byte_miss_ratio = cache.process_trace(reader)
+ref_req_miss_ratio, ref_byte_miss_ratio = ref_s3fifo.process_trace(reader)
+print(f"Plugin req miss ratio: {req_miss_ratio}, ref req miss ratio: {ref_req_miss_ratio}")
+print(f"Plugin byte miss ratio: {byte_miss_ratio}, ref byte miss ratio: {ref_byte_miss_ratio}")
+assert req_miss_ratio == ref_req_miss_ratio
+assert byte_miss_ratio == ref_byte_miss_ratio
 
 print("All requests processed successfully. Plugin cache matches reference S3FIFO cache.")
\ No newline at end of file
diff --git a/libcachesim/cache.py b/libcachesim/cache.py
index 99a17aa..b61a512 100644
--- a/libcachesim/cache.py
+++ b/libcachesim/cache.py
@@ -54,7 +54,7 @@ def __init__(self, _cache: Cache):
     def get(self, req: Request) -> bool:
         return self._cache.get(req)
 
-    def find(self, req: Request, update_cache: bool = True) -> CacheObject:
+    def find(self, req: Request, update_cache: bool = True) -> Optional[CacheObject]:
         return self._cache.find(req, update_cache)
 
     def can_insert(self, req: Request) -> bool:
diff --git a/src/export_cache.cpp b/src/export_cache.cpp
index cff2031..8fb3f04 100644
--- a/src/export_cache.cpp
+++ b/src/export_cache.cpp
@@ -281,7 +281,7 @@ void export_cache(py::module& m) {
         "find",
         [](cache_t& self, const request_t& req, const bool update_cache) {
           cache_obj_t* obj = self.find(&self, &req, update_cache);
-          return py::cast(obj, py::return_value_policy::reference);
+          return obj ? py::cast(obj, py::return_value_policy::reference) : py::none();
         },
         "req"_a, "update_cache"_a = true)
     .def(