From c02027f7c1358b4495cec4d81be761c9237fd4ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Sat, 13 Dec 2025 23:30:50 +0100 Subject: [PATCH 01/12] Add pytest and tests for CLI and Python module - Add pytest configuration for test management - Implement test suite for CLI functionality and Python module - Update README with testing instructions and badge - Fix Dockerfile - Create .dockerignore to exclude unnecessary files from Docker builds - Add GitHub Actions workflows for testing - Clean up makefile to include test commands --- .dockerignore | 27 ++ .github/workflows/pypi-deploy.yml | 4 + .github/workflows/test.yml | 21 ++ .gitignore | 3 + Dockerfile | 13 +- README.md | 8 + makefile | 25 +- pytest.ini | 9 + tests/README.md | 30 +++ tests/test_cli.py | 242 +++++++++++++++++ tests/test_python.py | 423 ++++++++++++++++++++++++++++++ 11 files changed, 796 insertions(+), 9 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/test.yml create mode 100644 pytest.ini create mode 100644 tests/README.md create mode 100644 tests/test_cli.py create mode 100644 tests/test_python.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..997485c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ +# Build artifacts +*.o +*.so +*.pyc +imctermite +main.cpp.cpp + +# Python build +python/build/ +python/dist/ +python/*.so +python/*.cpp +python/lib/ +python/LICENSE +python/README.md +python/*.egg-info/ +__pycache__/ + +# Git and editor +.git/ +.venv/ +*.swp +*.swo +*~ + +# Test outputs +.pytest_cache/ diff --git a/.github/workflows/pypi-deploy.yml b/.github/workflows/pypi-deploy.yml index aa02101..aa89ef9 100644 --- a/.github/workflows/pypi-deploy.yml +++ b/.github/workflows/pypi-deploy.yml @@ -8,9 +8,13 @@ on: jobs: + test: + uses: ./.github/workflows/test.yml + build_setup: name: Prepare environment for wheel builds runs-on: ubuntu-24.04 + needs: [test] steps: - uses: actions/checkout@v2 - name: Prepare 
wheel build diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..ce243c4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,21 @@ +name: Run Tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Build Docker image + run: docker build -t imctermite . + + - name: Run tests in container + run: docker run --rm imctermite make test diff --git a/.gitignore b/.gitignore index b4e57bd..947413f 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,6 @@ python/*.soc python/lib/ python/*.cpp python/wheelhouse/ + +__pycache__/ +.pytest_cache/ diff --git a/Dockerfile b/Dockerfile index e5389a0..836f221 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,21 +1,22 @@ - -FROM debian:bullseye-20210111 +FROM debian:bullseye USER root RUN apt-get update && apt-get install -y \ build-essential git vim \ python3 python3-pip -RUN python3 -m pip install cython +RUN python3 -m pip install cython pytest +RUN ln -s /usr/bin/python3 /usr/bin/python RUN g++ -v -COPY ./ /IMCtermite/ +WORKDIR /IMCtermite +COPY ./ . 
# install CLI tool -RUN cd /IMCtermite && ls -lh && make install && ls -lh /usr/local/bin/imctermite +RUN make install # install Python module -RUN cd /IMCtermite && ls -lh && make cython-install +RUN make python-build CMD ["sleep","infinity"] diff --git a/README.md b/README.md index 6c167d7..f1b175d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ [![LICENSE](https://img.shields.io/github/license/RecordEvolution/IMCtermite)](https://img.shields.io/github/license/RecordEvolution/IMCtermite) [![STARS](https://img.shields.io/github/stars/RecordEvolution/IMCtermite)](https://img.shields.io/github/stars/RecordEvolution/IMCtermite) +![Tests](https://github.com/RecordEvolution/IMCtermite/actions/workflows/test.yml/badge.svg) ![CI Build Wheel](https://github.com/RecordEvolution/IMCtermite/actions/workflows/pypi-deploy.yml/badge.svg?branch=&event=push) [![PYPI](https://img.shields.io/pypi/v/IMCtermite.svg)](https://pypi.org/project/imctermite/) @@ -27,6 +28,7 @@ Python module to integrate the _.raw_ format into any ETL workflow. * [File format](#Fileformat) * [Build and Installation](#Installation) * [Usage and Examples](#Usage) +* [Testing](#Testing) * [References](#References) ## File format @@ -217,6 +219,12 @@ A more complete [example](python/examples/usage.py), including the methods for obtaining the channels, i.a. their data and/or directly printing them to files, can be found in the `python/examples` folder. +## Testing + +Run end-to-end tests: `make test` + +See [tests/README.md](tests/README.md) for details. 
+ ## References ### IMC diff --git a/makefile b/makefile index 2d88a26..d5e7b01 100644 --- a/makefile +++ b/makefile @@ -35,7 +35,7 @@ INST := /usr/local/bin # C++ and CLI tool # build executable -$(EXE): check-tags $(GVSN) main.o +$(EXE): check-tags main.o $(CC) $(OPT) main.o -o $@ # build main.cpp and include git version/commit tag @@ -86,7 +86,7 @@ docker-run: #-----------------------------------------------------------------------------# # python -python-build: check-tags $(GVSN) +python-build: check-tags make -C python/ build-inplace cp python/imctermite*.so ./ -v @@ -97,10 +97,29 @@ python-clean: python-test: PYTHONPATH=./ python python/examples/usage.py +#-----------------------------------------------------------------------------# +# tests + +test: $(EXE) python-build + @echo "Running all tests..." + @PYTHONPATH=./ pytest + +test-cli: $(EXE) + @echo "Running CLI tests..." + @PYTHONPATH=./ pytest tests/test_cli.py + +test-python: python-build + @echo "Running Python tests..." + @PYTHONPATH=./ pytest tests/test_python.py + #-----------------------------------------------------------------------------# # clean -clean: cpp-clean python-clean +test-clean: + rm -rf .pytest_cache + find tests/ -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + +clean: cpp-clean python-clean test-clean #-----------------------------------------------------------------------------# # github actions diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..e0459a4 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,9 @@ +[pytest] +testpaths = tests +pythonpath = . 
+python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -v --strict-markers --tb=short +markers = + slow: marks tests as slow (deselect with '-m "not slow"') diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..aa8343c --- /dev/null +++ b/tests/README.md @@ -0,0 +1,30 @@ +# IMCtermite Tests + +End-to-end tests for both the CLI tool and Python module. + + +## Running Tests + +### All Tests +```bash +make test # Via makefile (builds if needed) +pytest # Direct pytest +``` + +### CLI Tests Only +```bash +make test-cli +pytest tests/test_cli.py +``` + +### Python Module Tests Only +```bash +make test-python +pytest tests/test_python.py +``` + +## Prerequisites + +```bash +pip install cython pytest setuptools +``` diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..6144e6c --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +""" +End-to-end tests for IMCtermite CLI tool +""" + +import pytest +import subprocess +from pathlib import Path + +PROJECT_ROOT = Path(__file__).parent.parent +CLI = PROJECT_ROOT / "imctermite" +SAMPLES_DIR = PROJECT_ROOT / "samples" / "datasetA" + + +class TestCLIBasics: + """Test basic CLI functionality""" + + def test_cli_exists(self): + """CLI binary should exist""" + assert CLI.exists(), f"CLI not found at {CLI}" + + def test_help_output(self): + """Should display help message""" + result = subprocess.run([str(CLI), "--help"], capture_output=True, text=True) + assert result.returncode == 0 + assert "Usage:" in result.stdout or "usage:" in result.stdout.lower() + + def test_version_output(self): + """Should display version""" + result = subprocess.run([str(CLI), "--version"], capture_output=True, text=True) + assert result.returncode == 0 + assert len(result.stdout) > 0 + + def test_invalid_file_handling(self): + """Should fail gracefully on nonexistent file""" + result = subprocess.run( + [str(CLI), 
"/nonexistent/file.raw"], + capture_output=True, + text=True + ) + assert result.returncode != 0 + + +class TestChannelOperations: + """Test channel listing and data extraction""" + + @pytest.fixture + def sample_file(self): + """Get path to sample file""" + sample = SAMPLES_DIR / "datasetA_1.raw" + if not sample.exists(): + pytest.skip(f"Sample file not found: {sample}") + return sample + + def test_list_channels(self, sample_file): + """Should list channels with metadata""" + result = subprocess.run( + [str(CLI), str(sample_file), "--listchannels"], + capture_output=True, + text=True + ) + assert result.returncode == 0 + assert "uuid" in result.stdout + + def test_list_blocks(self, sample_file): + """Should list IMC blocks""" + result = subprocess.run( + [str(CLI), str(sample_file), "--listblocks"], + capture_output=True, + text=True + ) + assert result.returncode == 0 + # Block markers like CF, CK, CC, etc. + assert "C" in result.stdout and ("F" in result.stdout or "K" in result.stdout) + + +class TestCSVOutput: + """Test CSV file generation""" + + @pytest.fixture + def sample_file(self): + """Get path to sample file""" + sample = SAMPLES_DIR / "datasetA_1.raw" + if not sample.exists(): + pytest.skip(f"Sample file not found: {sample}") + return sample + + def test_generate_csv_output(self, sample_file, tmp_path): + """Should generate CSV files""" + output_dir = tmp_path / "csv_output" + output_dir.mkdir() + + result = subprocess.run( + [str(CLI), str(sample_file), "--output", str(output_dir)], + capture_output=True, + text=True + ) + assert result.returncode == 0 + + csv_files = list(output_dir.glob("*.csv")) + assert len(csv_files) > 0, "Should generate at least one CSV file" + + def test_csv_format_valid(self, sample_file, tmp_path): + """Generated CSV should have valid format""" + output_dir = tmp_path / "csv_output" + output_dir.mkdir() + + subprocess.run( + [str(CLI), str(sample_file), "--output", str(output_dir)], + capture_output=True + ) + + csv_files = 
list(output_dir.glob("*.csv")) + assert len(csv_files) > 0 + + # Check first CSV file + first_csv = csv_files[0] + content = first_csv.read_text() + lines = content.strip().split('\n') + + assert len(lines) > 1, "CSV should have header and data" + assert ',' in lines[0], "CSV should use comma delimiter" + + def test_custom_delimiter(self, sample_file, tmp_path): + """Should support custom delimiter""" + output_dir = tmp_path / "csv_delim" + output_dir.mkdir() + + result = subprocess.run( + [str(CLI), str(sample_file), "--output", str(output_dir), "--delimiter", ";"], + capture_output=True, + text=True + ) + assert result.returncode == 0 + + csv_files = list(output_dir.glob("*.csv")) + assert len(csv_files) > 0 + + # Check delimiter is applied + first_csv = csv_files[0] + content = first_csv.read_text() + first_line = content.split('\n')[0] + assert ';' in first_line, "Should use semicolon delimiter" + + +class TestMultipleFiles: + """Test processing multiple sample files""" + + def test_process_all_sample_files(self): + """Should successfully process all .raw and .dat files in samples directory (list channels)""" + samples_root = SAMPLES_DIR.parent + if not samples_root.exists(): + pytest.skip(f"Samples directory not found: {samples_root}") + + # Get all .raw and .dat files recursively + samples = sorted(list(samples_root.glob("*.raw")) + + list(samples_root.glob("*.dat")) + + list(samples_root.glob("**/*.raw")) + + list(samples_root.glob("**/*.dat"))) + # Remove duplicates + samples = sorted(set(samples)) + + if len(samples) == 0: + pytest.skip("No .raw or .dat files in samples directory") + + failed = [] + for sample in samples: + result = subprocess.run( + [str(CLI), str(sample), "--listchannels"], + capture_output=True, + text=True, + errors='replace' # Handle non-UTF8 characters in output + ) + if result.returncode != 0: + failed.append(f"{sample.relative_to(samples_root)}: exit code {result.returncode}") + + assert len(failed) == 0, f"Failed to process 
{len(failed)}/{len(samples)} files: {failed}" + + def test_extract_all_sample_files_with_data(self): + """Should successfully extract data from all .raw and .dat files""" + import tempfile + import shutil + + samples_root = SAMPLES_DIR.parent + if not samples_root.exists(): + pytest.skip(f"Samples directory not found: {samples_root}") + + # Get all .raw and .dat files recursively + samples = sorted(list(samples_root.glob("*.raw")) + + list(samples_root.glob("*.dat")) + + list(samples_root.glob("**/*.raw")) + + list(samples_root.glob("**/*.dat"))) + samples = sorted(set(samples)) + + if len(samples) == 0: + pytest.skip("No .raw or .dat files in samples directory") + + # Create temp directory for output + temp_dir = tempfile.mkdtemp() + try: + failed = [] + for sample in samples: + result = subprocess.run( + [str(CLI), str(sample), "--output", temp_dir], + capture_output=True, + text=True, + errors='replace' + ) + if result.returncode != 0: + failed.append(f"{sample.relative_to(samples_root)}: exit code {result.returncode}") + + assert len(failed) == 0, f"Failed to extract data from {len(failed)}/{len(samples)} files: {failed}" + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + +class TestExitCodes: + """Test exit code behavior""" + + def test_success_exit_code(self): + """Should return 0 on success""" + sample = SAMPLES_DIR / "datasetA_1.raw" + if not sample.exists(): + pytest.skip("Sample file not found") + + result = subprocess.run( + [str(CLI), str(sample), "--listchannels"], + capture_output=True + ) + assert result.returncode == 0 + + def test_error_exit_code(self): + """Should return non-zero on error""" + result = subprocess.run( + [str(CLI), "/nonexistent/file.raw"], + capture_output=True + ) + assert result.returncode != 0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_python.py b/tests/test_python.py new file mode 100644 index 0000000..7470709 --- /dev/null +++ b/tests/test_python.py @@ -0,0 +1,423 @@ 
+#!/usr/bin/env python3 +""" +End-to-end tests for IMCtermite Python module +""" + +import pytest +import os +import tempfile +import csv +from pathlib import Path + +try: + import imctermite +except ImportError: + pytest.skip("imctermite module not built - run 'make python-build' first", allow_module_level=True) + +PROJECT_ROOT = Path(__file__).parent.parent +SAMPLES_DIR = PROJECT_ROOT / "samples" +DATASET_A = SAMPLES_DIR / "datasetA" +DATASET_B = SAMPLES_DIR / "datasetB" + + +class TestModuleImport: + """Test basic module functionality""" + + def test_module_imports(self): + """Module should import without errors""" + assert imctermite is not None + + def test_can_instantiate(self): + """Should create instance with valid file""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + + imc = imctermite.imctermite(str(sample_file).encode()) + assert imc is not None + + +class TestChannelListing: + """Test channel metadata retrieval""" + + @pytest.fixture + def imc_instance(self): + """Create IMC instance with sample file""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + return imctermite.imctermite(str(sample_file).encode()) + + def test_get_channel_list(self, imc_instance): + """Should return list of channel metadata""" + channels = imc_instance.get_channels(include_data=False) + assert isinstance(channels, list) + assert len(channels) > 0 + + def test_channel_metadata_structure(self, imc_instance): + """Channel metadata should have required fields""" + channels = imc_instance.get_channels(include_data=False) + first_channel = channels[0] + + # Check for expected keys + required_keys = ['name', 'uuid'] + for key in required_keys: + assert key in first_channel, f"Missing key: {key}" + + def test_get_channel_data(self, imc_instance): + """Should return channel data with xdata and ydata""" + channels = 
imc_instance.get_channels(include_data=True) + assert isinstance(channels, list) + assert len(channels) > 0 + + first_channel = channels[0] + assert 'xdata' in first_channel + assert 'ydata' in first_channel + assert isinstance(first_channel['xdata'], list) + assert isinstance(first_channel['ydata'], list) + assert len(first_channel['xdata']) == len(first_channel['ydata']) + + +class TestDataIntegrity: + """Test data extraction and validation""" + + @pytest.fixture + def sample_data(self): + """Load sample file and extract data""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + + imc = imctermite.imctermite(str(sample_file).encode()) + return imc.get_channels(include_data=True) + + def test_data_arrays_not_empty(self, sample_data): + """Data arrays should not be empty""" + for channel in sample_data: + assert len(channel['xdata']) > 0 + assert len(channel['ydata']) > 0 + + def test_data_values_are_numeric(self, sample_data): + """All data values should be numeric""" + for channel in sample_data: + for x in channel['xdata'][:10]: # Check first 10 + assert isinstance(x, (int, float)) + for y in channel['ydata'][:10]: + assert isinstance(y, (int, float)) + + def test_xdata_monotonic(self, sample_data): + """X-data (time) should be monotonically increasing""" + for channel in sample_data: + xdata = channel['xdata'] + if len(xdata) > 1: + # Check if mostly increasing (allow small floating point issues) + increasing_count = sum(1 for i in range(len(xdata)-1) if xdata[i] <= xdata[i+1]) + ratio = increasing_count / (len(xdata) - 1) + assert ratio > 0.95, f"X-data not monotonic enough: {ratio:.2%}" + + +class TestCSVOutput: + """Test CSV file generation""" + + @pytest.fixture + def imc_instance(self): + """Create IMC instance""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + return 
imctermite.imctermite(str(sample_file).encode()) + + def test_print_channel_to_csv(self, imc_instance, tmp_path): + """Should create CSV file for single channel""" + output_file = tmp_path / "test_channel.csv" + + channels = imc_instance.get_channels(include_data=False) + if len(channels) == 0: + pytest.skip("No channels in sample file") + + channel_uuid = channels[0]['uuid'] + imc_instance.print_channel(channel_uuid.encode(), str(output_file).encode(), b','[0]) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + def test_csv_format_valid(self, imc_instance, tmp_path): + """Generated CSV should be valid""" + output_file = tmp_path / "test_channel.csv" + + channels = imc_instance.get_channels(include_data=False) + if len(channels) == 0: + pytest.skip("No channels in sample file") + + channel_uuid = channels[0]['uuid'] + imc_instance.print_channel(channel_uuid.encode(), str(output_file).encode(), b','[0]) + + # Read and validate CSV + with open(output_file, 'r') as f: + reader = csv.reader(f) + rows = list(reader) + + assert len(rows) > 1, "CSV should have header and data" + assert len(rows[0]) == 2, "CSV should have 2 columns" + + # Check second row is numeric (first row is header with units) + if len(rows) > 1: + data_row = rows[1] + try: + float(data_row[0]) # Should not raise + float(data_row[1]) # Should not raise + except ValueError: + # Maybe first row is header, try second data row + if len(rows) > 2: + data_row = rows[2] + float(data_row[0]) + float(data_row[1]) + + def test_print_all_channels(self, imc_instance, tmp_path): + """Should create CSV files for all channels""" + output_dir = tmp_path / "all_channels" + output_dir.mkdir() + + imc_instance.print_channels(str(output_dir).encode(), b','[0]) + + csv_files = list(output_dir.glob("*.csv")) + assert len(csv_files) > 0, "Should generate at least one CSV file" + + +class TestMultipleFiles: + """Test processing multiple sample files""" + + def test_process_all_sample_files(self): + 
"""Should process all .raw and .dat files in samples directory (metadata only)""" + if not SAMPLES_DIR.exists(): + pytest.skip(f"Samples directory not found: {SAMPLES_DIR}") + + # Get all .raw and .dat files recursively + files_to_test = sorted(list(SAMPLES_DIR.glob("*.raw")) + + list(SAMPLES_DIR.glob("*.dat")) + + list(SAMPLES_DIR.glob("**/*.raw")) + + list(SAMPLES_DIR.glob("**/*.dat"))) + # Remove duplicates (files in root will be in both patterns) + files_to_test = sorted(set(files_to_test)) + + if len(files_to_test) == 0: + pytest.skip("No .raw or .dat files in samples directory") + + successful = 0 + failed = [] + for sample_file in files_to_test: + try: + imc = imctermite.imctermite(str(sample_file).encode()) + channels = imc.get_channels(include_data=False) + if len(channels) > 0: + successful += 1 + except Exception as e: + failed.append(f"{sample_file.relative_to(SAMPLES_DIR)}: {e}") + + assert len(failed) == 0, f"Failed to process {len(failed)}/{len(files_to_test)} files: {failed}" + assert successful == len(files_to_test), f"Only {successful}/{len(files_to_test)} files had channels" + + def test_extract_all_sample_files_with_data(self): + """Should fully extract all .raw and .dat files with data""" + if not SAMPLES_DIR.exists(): + pytest.skip(f"Samples directory not found: {SAMPLES_DIR}") + + # Get all .raw and .dat files recursively + files_to_test = sorted(list(SAMPLES_DIR.glob("*.raw")) + + list(SAMPLES_DIR.glob("*.dat")) + + list(SAMPLES_DIR.glob("**/*.raw")) + + list(SAMPLES_DIR.glob("**/*.dat"))) + files_to_test = sorted(set(files_to_test)) + + if len(files_to_test) == 0: + pytest.skip("No .raw or .dat files in samples directory") + + successful = 0 + failed = [] + for sample_file in files_to_test: + try: + imc = imctermite.imctermite(str(sample_file).encode()) + channels = imc.get_channels(include_data=True) + + # Verify we got data + if len(channels) > 0: + # Check that at least one channel has actual data (xdata or ydata) + has_data = False + 
for channel in channels: + if ('xdata' in channel and len(channel['xdata']) > 0) or \ + ('ydata' in channel and len(channel['ydata']) > 0): + has_data = True + break + + if has_data: + successful += 1 + else: + failed.append(f"{sample_file.relative_to(SAMPLES_DIR)}: no data in channels") + else: + failed.append(f"{sample_file.relative_to(SAMPLES_DIR)}: no channels found") + except Exception as e: + failed.append(f"{sample_file.relative_to(SAMPLES_DIR)}: {e}") + + assert len(failed) == 0, f"Failed to extract data from {len(failed)}/{len(files_to_test)} files: {failed}" + assert successful == len(files_to_test), f"Only {successful}/{len(files_to_test)} files extracted with data" + + def test_reload_different_file(self): + """Should be able to load different files sequentially""" + file1 = DATASET_A / "datasetA_1.raw" + file2 = DATASET_A / "datasetA_2.raw" + + if not (file1.exists() and file2.exists()): + pytest.skip("Need at least 2 sample files") + + # Load first file + imc1 = imctermite.imctermite(str(file1).encode()) + channels1 = imc1.get_channels(include_data=False) + + # Load second file + imc2 = imctermite.imctermite(str(file2).encode()) + channels2 = imc2.get_channels(include_data=False) + + # Both should work + assert len(channels1) > 0 + assert len(channels2) > 0 + + +class TestDataRegression: + """Test specific known values to catch parsing regressions""" + + @pytest.mark.parametrize("file_path,expected", [ + # datasetA_1.raw - Standard .raw format with gravity unit + ("datasetA/datasetA_1.raw", { + 'num_channels': 1, + 'data_length': 6000, + 'yunit': 'G', + 'xstepwidth': 0.005, + 'ydata_first': [0.010029276, 0.015780726], + 'ydata_last': [-0.02981583, -0.030068753], # [-2], [-1] + 'xdata_first': [416.01], + }), + # sampleA.raw - Pressure data with mbar units + ("sampleA.raw", { + 'num_channels': 1, + 'data_length': 2402, + 'yunit': '"mbar"', + 'xoffset': 2044.03, + 'ydata_first': [956.013793945, 955.484924316, 955.487670898], + 'ydata_last': 
[866.840881348, 866.91619873, 866.985290527], # [-3], [-2], [-1] + }), + # XY_dataset_example.dat - Different .dat format with explicit X-Y data + ("XY_dataset_example.dat", { + 'num_channels': 1, + 'data_length': 13094, + 'ydata_first': [0, 0, 0], + 'ydata_last': [2796202, 2796202, 2982616], # [-3], [-2], [-1] + 'xdata_first': [67.855759, 67.880796], + 'xdata_last': [395.158317], + }), + ]) + def test_known_values(self, file_path, expected): + """Verify known values from sample files to catch parsing regressions""" + sample_file = SAMPLES_DIR / file_path + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + + imc = imctermite.imctermite(str(sample_file).encode()) + channels = imc.get_channels(include_data=True) + + # Check number of channels + assert len(channels) == expected['num_channels'], \ + f"Should have {expected['num_channels']} channel(s)" + + ch = channels[0] + + # Verify data length + ydata = ch.get('ydata', []) + assert len(ydata) == expected['data_length'], \ + f"Should have {expected['data_length']} data points" + + # Verify metadata if specified + if 'yunit' in expected: + assert ch.get('yunit') == expected['yunit'], \ + f"Unit should be {expected['yunit']}" + + if 'xstepwidth' in expected: + assert abs(float(ch.get('xstepwidth')) - expected['xstepwidth']) < 1e-9, \ + f"X step width should be {expected['xstepwidth']}" + + if 'xoffset' in expected: + assert abs(float(ch.get('xoffset')) - expected['xoffset']) < 1e-9, \ + f"X offset should be {expected['xoffset']}" + + # Verify ydata first values + tolerance = 1e-6 # Default tolerance for floating-point comparisons + for i, expected_val in enumerate(expected['ydata_first']): + if isinstance(expected_val, float): + assert abs(ydata[i] - expected_val) < tolerance, \ + f"ydata[{i}] should be {expected_val}" + else: + assert ydata[i] == expected_val, \ + f"ydata[{i}] should be {expected_val}" + + # Verify ydata last values + for i, expected_val in 
enumerate(expected['ydata_last']): + idx = -(len(expected['ydata_last']) - i) + if isinstance(expected_val, float): + assert abs(ydata[idx] - expected_val) < tolerance, \ + f"ydata[{idx}] should be {expected_val}" + else: + assert ydata[idx] == expected_val, \ + f"ydata[{idx}] should be {expected_val}" + + # Verify xdata if specified + if 'xdata_first' in expected: + xdata = ch.get('xdata', []) + for i, expected_val in enumerate(expected['xdata_first']): + assert abs(xdata[i] - expected_val) < tolerance, \ + f"xdata[{i}] should be {expected_val}" + + if 'xdata_last' in expected: + xdata = ch.get('xdata', []) + for i, expected_val in enumerate(expected['xdata_last']): + idx = -(len(expected['xdata_last']) - i) + assert abs(xdata[idx] - expected_val) < tolerance, \ + f"xdata[{idx}] should be {expected_val}" + + +class TestErrorHandling: + """Test error conditions""" + + def test_nonexistent_file(self): + """Should raise error for nonexistent file""" + with pytest.raises(Exception): + imctermite.imctermite(b"/nonexistent/file.raw") + + def test_invalid_channel_name(self): + """Should handle invalid channel name gracefully""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + + imc = imctermite.imctermite(str(sample_file).encode()) + + # This should either raise or return empty - both are acceptable + try: + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + output_file = f.name + + imc.print_channel(b"NONEXISTENT_CHANNEL_UUID", output_file.encode(), b','[0]) + + # If it didn't raise, check if file is empty or has minimal content + if os.path.exists(output_file): + size = os.path.getsize(output_file) + # Either file doesn't exist or is very small (just header) + assert size < 100 + except Exception: + # Raising an exception is also acceptable behavior + pass + finally: + if os.path.exists(output_file): + os.unlink(output_file) + + +if __name__ == "__main__": + 
pytest.main([__file__, "-v"]) From 574393c1d0c6123954788f4e312419dcb25e323f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Sun, 14 Dec 2025 22:01:20 +0100 Subject: [PATCH 02/12] Implement chunked NumPy export for large files and add corresponding example and tests --- README.md | 22 ++++++++ lib/imc_raw.hpp | 62 ++++++++++++++++++++++ python/examples/usage_numpy_chunks.py | 56 ++++++++++++++++++++ python/imctermite.pxd | 13 +++++ python/imctermite.pyx | 32 +++++++++++- python/setup.py | 2 + tests/test_python.py | 75 +++++++++++++++++++++++---- 7 files changed, 251 insertions(+), 11 deletions(-) create mode 100644 python/examples/usage_numpy_chunks.py diff --git a/README.md b/README.md index f1b175d..31db211 100644 --- a/README.md +++ b/README.md @@ -219,6 +219,28 @@ A more complete [example](python/examples/usage.py), including the methods for obtaining the channels, i.a. their data and/or directly printing them to files, can be found in the `python/examples` folder. +### Chunked NumPy export (fast path) + +For large files, you can iterate over channel data in chunks as NumPy arrays. This avoids creating large Python lists and allows for streaming processing (e.g. writing to Parquet). + +```python +import imctermite +import numpy as np + +imcraw = imctermite.imctermite(b"samples/large_file.raw") +channels = imcraw.get_channels(False) +uuid = channels[0]['uuid'].encode('utf-8') + +# Iterate over channel data in chunks of 1 million samples +for chunk in imcraw.iter_channel_numpy(uuid, include_x=True, chunk_rows=1_000_000): + y_data = chunk['y'] # NumPy array + x_data = chunk.get('x') # NumPy array (if include_x=True) + start_index = chunk['start'] + + # Process chunk (e.g. 
write to parquet) + print(f"Processed chunk starting at {start_index}, size {len(y_data)}") +``` + ## Testing Run end-to-end tests: `make test` diff --git a/lib/imc_raw.hpp b/lib/imc_raw.hpp index ee77daa..379bf7b 100644 --- a/lib/imc_raw.hpp +++ b/lib/imc_raw.hpp @@ -19,6 +19,14 @@ namespace imc { + struct channel_chunk { + std::vector x; + std::vector y; + unsigned long int start; + unsigned long int count; + bool has_x; + }; + class raw { // (path of) raw-file and its basename @@ -393,6 +401,60 @@ namespace imc return channels; } + // get length of a channel + unsigned long int get_channel_length(std::string uuid) + { + if ( channels_.count(uuid) ) + { + return (unsigned long int)channels_.at(uuid).ydata_.size(); + } + else + { + throw std::runtime_error(std::string("channel does not exist:") + uuid); + } + } + + // read a chunk of channel data + channel_chunk read_channel_chunk(std::string uuid, unsigned long int start, unsigned long int count, bool include_x) + { + if ( !channels_.count(uuid) ) + { + throw std::runtime_error(std::string("channel does not exist:") + uuid); + } + + imc::channel& ch = channels_.at(uuid); + unsigned long int total_len = ch.ydata_.size(); + + if ( start >= total_len ) + { + return { {}, {}, start, 0, include_x }; + } + + unsigned long int end = start + count; + if ( end > total_len ) end = total_len; + unsigned long int actual_count = end - start; + + channel_chunk chunk; + chunk.start = start; + chunk.count = actual_count; + chunk.has_x = include_x; + chunk.y.reserve(actual_count); + if (include_x) chunk.x.reserve(actual_count); + + for (unsigned long int i = 0; i < actual_count; ++i) + { + chunk.y.push_back(ch.ydata_[start + i].as_double()); + if (include_x) + { + if (start + i < ch.xdata_.size()) + chunk.x.push_back(ch.xdata_[start + i].as_double()); + else + chunk.x.push_back(0.0); + } + } + return chunk; + } + // print single specific channel void print_channel(std::string channeluuid, std::string outputfile, const char sep) { 
diff --git a/python/examples/usage_numpy_chunks.py b/python/examples/usage_numpy_chunks.py new file mode 100644 index 0000000..b111b5d --- /dev/null +++ b/python/examples/usage_numpy_chunks.py @@ -0,0 +1,56 @@ + +import imctermite +import json +import os +import numpy as np + +# Path to a sample file +raw_file = b"samples/datasetA/datasetA_1.raw" +if not os.path.exists(raw_file): + print(f"Sample file {raw_file} not found.") + exit(1) + +print(f"Loading {raw_file}") + +try: + imcraw = imctermite.imctermite(raw_file) +except RuntimeError as e: + print(f"Failed to load/parse raw-file: {e}") + exit(1) + +# Get channels metadata +channels = imcraw.get_channels(False) +if not channels: + print("No channels found.") + exit(0) + +# Pick the first channel +first_channel_uuid = channels[0]['uuid'].encode('utf-8') +print(f"Iterating over channel {first_channel_uuid}") + +# Iterate in chunks +total_rows = 0 +chunk_size = 100 + +for chunk in imcraw.iter_channel_numpy(first_channel_uuid, include_x=True, chunk_rows=chunk_size): + start = chunk['start'] + y = chunk['y'] + x = chunk.get('x') + + count = len(y) + total_rows += count + + print(f"Chunk start={start}, count={count}, y_shape={y.shape}, y_dtype={y.dtype}") + if x is not None: + print(f" x_shape={x.shape}, x_dtype={x.dtype}") + + # Verify data (optional, just checking first few values) + if start == 0 and count > 0: + print(f" First y value: {y[0]}") + + # Here you could write the chunk to a Parquet file using pyarrow or fastparquet + # e.g. 
+ # table = pa.Table.from_pydict({"x": x, "y": y}) + # pq.write_table(table, output_file) + +print(f"Total rows read: {total_rows}") diff --git a/python/imctermite.pxd b/python/imctermite.pxd index f76521e..cbeae15 100644 --- a/python/imctermite.pxd +++ b/python/imctermite.pxd @@ -6,6 +6,13 @@ from libcpp cimport bool cdef extern from "lib/imc_raw.hpp" namespace "imc": + cdef struct channel_chunk: + vector[double] x + vector[double] y + unsigned long int start + unsigned long int count + bool has_x + cdef cppclass cppimctermite "imc::raw": # constructor(s) @@ -18,6 +25,12 @@ cdef extern from "lib/imc_raw.hpp" namespace "imc": # get JSON list of channels vector[string] get_channels(bool json, bool data) except + + # get length of a channel + unsigned long int get_channel_length(string uuid) except + + + # read a chunk of channel data + channel_chunk read_channel_chunk(string uuid, unsigned long int start, unsigned long int count, bool include_x) except + + # print single channel/all channels void print_channel(string channeluuid, string outputdir, char delimiter) except + void print_channels(string outputdir, char delimiter) except + diff --git a/python/imctermite.pyx b/python/imctermite.pyx index 3bbc7fa..4c27e0b 100644 --- a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -1,7 +1,9 @@ # distutils: language = c++ # cython: language_level = 3 -from imctermite cimport cppimctermite +from imctermite cimport cppimctermite, channel_chunk +cimport numpy as cnp +import numpy as np import json as jn import decimal @@ -35,6 +37,34 @@ cdef class imctermite: chnlstjn = [jn.loads(chn.decode(get_codepage(chn),errors="ignore")) for chn in chnlst] return chnlstjn + def iter_channel_numpy(self, string channeluuid, bool include_x=True, int chunk_rows=1000000): + cdef unsigned long int total_len = self.cppimc.get_channel_length(channeluuid) + cdef unsigned long int start = 0 + cdef channel_chunk chunk + cdef cnp.ndarray x_arr + cdef cnp.ndarray y_arr + + while start < 
total_len: + chunk = self.cppimc.read_channel_chunk(channeluuid, start, chunk_rows, include_x) + + # Create numpy arrays from vectors + y_arr = np.array(chunk.y, dtype=np.float64) + + result = { + "start": chunk.start, + "y": y_arr + } + + if include_x: + x_arr = np.array(chunk.x, dtype=np.float64) + result["x"] = x_arr + + yield result + + start += chunk.count + if chunk.count == 0: + break + # print single channel/all channels def print_channel(self, string channeluuid, string outputfile, char delimiter): self.cppimc.print_channel(channeluuid,outputfile,delimiter) diff --git a/python/setup.py b/python/setup.py index 98ebef7..8d2e8a2 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,6 +1,7 @@ from setuptools import Extension, setup from Cython.Build import cythonize import sys +import numpy print("building on platform: "+sys.platform) @@ -13,6 +14,7 @@ extension = Extension( "imctermite", sources=["imctermite.pyx"], + include_dirs=[numpy.get_include()], extra_compile_args=cmpArgs[sys.platform] ) diff --git a/tests/test_python.py b/tests/test_python.py index 7470709..ee8e654 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -7,6 +7,7 @@ import os import tempfile import csv +import numpy as np from pathlib import Path try: @@ -104,17 +105,71 @@ def test_data_values_are_numeric(self, sample_data): assert isinstance(x, (int, float)) for y in channel['ydata'][:10]: assert isinstance(y, (int, float)) - - def test_xdata_monotonic(self, sample_data): - """X-data (time) should be monotonically increasing""" - for channel in sample_data: - xdata = channel['xdata'] - if len(xdata) > 1: - # Check if mostly increasing (allow small floating point issues) - increasing_count = sum(1 for i in range(len(xdata)-1) if xdata[i] <= xdata[i+1]) - ratio = increasing_count / (len(xdata) - 1) - assert ratio > 0.95, f"X-data not monotonic enough: {ratio:.2%}" + for val in channel['ydata']: + assert isinstance(val, (int, float)) + +class TestChunkedNumpy: + """Test 
chunked NumPy API""" + def test_chunked_iteration_all_samples(self): + """Verify chunked iteration against get_channels for all samples""" + + raw_files = list(DATASET_A.glob("*.raw")) + list(DATASET_B.glob("*.raw")) + # Sort for deterministic order + raw_files.sort() + + for raw_file in raw_files: + # print(f"Testing {raw_file.name}") + try: + imc = imctermite.imctermite(str(raw_file).encode()) + + # Get reference data + channels_ref = imc.get_channels(include_data=True) + + for ch_ref in channels_ref: + uuid = ch_ref['uuid'].encode('utf-8') + + # Test with include_x=True + y_chunks = [] + x_chunks = [] + + # Use a small chunk size to ensure we test chunking logic even on small files + # Some files might be very small, so 100 is a good stress test + for chunk in imc.iter_channel_numpy(uuid, include_x=True, chunk_rows=100): + y_chunks.append(chunk['y']) + x_chunks.append(chunk['x']) + + if not y_chunks: + assert len(ch_ref['ydata']) == 0 + continue + + y_full = np.concatenate(y_chunks) + x_full = np.concatenate(x_chunks) + + # Compare with reference + # Note: get_channels returns lists of floats. + # We compare them with numpy arrays. 
+ + # Check lengths first + assert len(y_full) == len(ch_ref['ydata']), f"Length mismatch in {raw_file.name} channel {uuid}" + + # Check values + assert np.allclose(y_full, ch_ref['ydata'], equal_nan=True), f"Y data mismatch in {raw_file.name} channel {uuid}" + assert np.allclose(x_full, ch_ref['xdata'], equal_nan=True), f"X data mismatch in {raw_file.name} channel {uuid}" + + # Test with include_x=False + y_chunks_nox = [] + for chunk in imc.iter_channel_numpy(uuid, include_x=False, chunk_rows=100): + y_chunks_nox.append(chunk['y']) + assert 'x' not in chunk + + if y_chunks_nox: + y_full_nox = np.concatenate(y_chunks_nox) + assert np.allclose(y_full_nox, ch_ref['ydata'], equal_nan=True), f"Y data mismatch (no x) in {raw_file.name} channel {uuid}" + + except Exception as e: + pytest.fail(f"Failed processing {raw_file.name}: {str(e)}") + class TestCSVOutput: """Test CSV file generation""" From 393240ba5513e72a74b0e7c1f78ee8ea1925ff74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Sun, 14 Dec 2025 23:09:40 +0100 Subject: [PATCH 03/12] Enhance channel data handling with raw mode support and update examples for chunked NumPy export --- README.md | 20 +--- lib/imc_channel.hpp | 3 + lib/imc_raw.hpp | 135 +++++++++++++++++++++++--- python/examples/usage_numpy_chunks.py | 71 ++++++++++---- python/imctermite.pxd | 11 ++- python/imctermite.pyx | 44 ++++++++- tests/test_python.py | 36 ++++--- 7 files changed, 243 insertions(+), 77 deletions(-) diff --git a/README.md b/README.md index 31db211..0436816 100644 --- a/README.md +++ b/README.md @@ -221,25 +221,7 @@ can be found in the `python/examples` folder. ### Chunked NumPy export (fast path) -For large files, you can iterate over channel data in chunks as NumPy arrays. This avoids creating large Python lists and allows for streaming processing (e.g. writing to Parquet). 
- -```python -import imctermite -import numpy as np - -imcraw = imctermite.imctermite(b"samples/large_file.raw") -channels = imcraw.get_channels(False) -uuid = channels[0]['uuid'].encode('utf-8') - -# Iterate over channel data in chunks of 1 million samples -for chunk in imcraw.iter_channel_numpy(uuid, include_x=True, chunk_rows=1_000_000): - y_data = chunk['y'] # NumPy array - x_data = chunk.get('x') # NumPy array (if include_x=True) - start_index = chunk['start'] - - # Process chunk (e.g. write to parquet) - print(f"Processed chunk starting at {start_index}, size {len(y_data)}") -``` +For large files, you can iterate over channel data in chunks as NumPy arrays. This avoids creating large Python lists and allows for streaming processing (e.g. writing to Parquet). See [`python/examples/usage_numpy_chunks.py`](python/examples/usage_numpy_chunks.py) for a complete example. ## Testing diff --git a/lib/imc_channel.hpp b/lib/imc_channel.hpp index 6e19e1c..0a83f0a 100644 --- a/lib/imc_channel.hpp +++ b/lib/imc_channel.hpp @@ -716,12 +716,15 @@ namespace imc <<"\",\"codepage\":\""< x; - std::vector y; + std::vector x_bytes; + std::vector y_bytes; unsigned long int start; unsigned long int count; bool has_x; + int x_type; + int y_type; }; class raw @@ -414,8 +416,21 @@ namespace imc } } + // get numeric type of a channel + int get_channel_numeric_type(std::string uuid) + { + if ( channels_.count(uuid) ) + { + return (int)channels_.at(uuid).ydatatp_; + } + else + { + throw std::runtime_error(std::string("channel does not exist:") + uuid); + } + } + // read a chunk of channel data - channel_chunk read_channel_chunk(std::string uuid, unsigned long int start, unsigned long int count, bool include_x) + channel_chunk read_channel_chunk(std::string uuid, unsigned long int start, unsigned long int count, bool include_x, bool raw_mode) { if ( !channels_.count(uuid) ) { @@ -427,7 +442,7 @@ namespace imc if ( start >= total_len ) { - return { {}, {}, start, 0, include_x }; + return { 
{}, {}, start, 0, include_x, 0, 0 }; } unsigned long int end = start + count; @@ -438,20 +453,108 @@ namespace imc chunk.start = start; chunk.count = actual_count; chunk.has_x = include_x; - chunk.y.reserve(actual_count); - if (include_x) chunk.x.reserve(actual_count); - for (unsigned long int i = 0; i < actual_count; ++i) - { - chunk.y.push_back(ch.ydata_[start + i].as_double()); - if (include_x) - { - if (start + i < ch.xdata_.size()) - chunk.x.push_back(ch.xdata_[start + i].as_double()); - else - chunk.x.push_back(0.0); - } + // Handle Y data + if (raw_mode) { + // Raw mode: read bytes directly from buffer + int type = (int)ch.ydatatp_; + unsigned long int bytes_per_sample = ch.ysignbits_ / 8; + + if (mapblocks_.count(ch.chnenv_.CSuuid_) == 0) { + throw std::runtime_error("CS block not found for channel"); + } + imc::block& cs_block = mapblocks_.at(ch.chnenv_.CSuuid_); + std::vector prms = cs_block.get_parameters(); + if (prms.size() < 4) throw std::runtime_error("Invalid CS block parameters"); + unsigned long int buffstrt = prms[3].begin(); + + unsigned long int abs_start = buffstrt + ch.ybuffer_offset_ + 1 + start * bytes_per_sample; + unsigned long int byte_count = actual_count * bytes_per_sample; + + if (abs_start + byte_count > buffer_.size()) { + throw std::runtime_error("Buffer read out of bounds"); + } + + if (type == 13) { // six_byte_unsigned_long -> promote to 8 byte (uint64) + chunk.y_type = 13; // Keep original type ID, but data is promoted + chunk.y_bytes.resize(actual_count * 8); + uint64_t* dest = reinterpret_cast(chunk.y_bytes.data()); + + for (unsigned long int i = 0; i < actual_count; ++i) { + unsigned long int src_idx = abs_start + i * 6; + uint64_t val = 0; + // Assuming Little Endian storage in file + for (int b = 0; b < 6; ++b) { + val |= (uint64_t)buffer_[src_idx + b] << (b * 8); + } + dest[i] = val; + } + } else { + chunk.y_type = type; + chunk.y_bytes.resize(byte_count); + std::copy(buffer_.begin() + abs_start, buffer_.begin() + 
abs_start + byte_count, chunk.y_bytes.begin()); + } + } else { + // Scaled mode: convert to double + chunk.y_type = 8; // imc::numtype::ddouble + chunk.y_bytes.resize(actual_count * sizeof(double)); + double* ptr = reinterpret_cast(chunk.y_bytes.data()); + + for (unsigned long int i = 0; i < actual_count; ++i) { + ptr[i] = ch.ydata_[start + i].as_double(); + } + } + + // Handle X data + if (include_x) { + if (ch.dimension_ == 2 && raw_mode) { + // XY channel, raw mode + int type = (int)ch.xdatatp_; + unsigned long int bytes_per_sample = ch.xsignbits_ / 8; + + imc::block& cs_block = mapblocks_.at(ch.chnenv_.CSuuid_); + std::vector prms = cs_block.get_parameters(); + unsigned long int buffstrt = prms[3].begin(); + + unsigned long int abs_start = buffstrt + ch.xbuffer_offset_ + 1 + start * bytes_per_sample; + unsigned long int byte_count = actual_count * bytes_per_sample; + + if (abs_start + byte_count > buffer_.size()) { + throw std::runtime_error("Buffer read out of bounds (X)"); + } + + if (type == 13) { // six_byte_unsigned_long -> promote to 8 byte + chunk.x_type = 13; // Keep original type ID + chunk.x_bytes.resize(actual_count * 8); + uint64_t* dest = reinterpret_cast(chunk.x_bytes.data()); + for (unsigned long int i = 0; i < actual_count; ++i) { + unsigned long int src_idx = abs_start + i * 6; + uint64_t val = 0; + for (int b = 0; b < 6; ++b) { + val |= (uint64_t)buffer_[src_idx + b] << (b * 8); + } + dest[i] = val; + } + } else { + chunk.x_type = type; + chunk.x_bytes.resize(byte_count); + std::copy(buffer_.begin() + abs_start, buffer_.begin() + abs_start + byte_count, chunk.x_bytes.begin()); + } + } else { + // Generated X or scaled X + chunk.x_type = 8; // imc::numtype::ddouble + chunk.x_bytes.resize(actual_count * sizeof(double)); + double* ptr = reinterpret_cast(chunk.x_bytes.data()); + + for (unsigned long int i = 0; i < actual_count; ++i) { + if (start + i < ch.xdata_.size()) + ptr[i] = ch.xdata_[start + i].as_double(); + else + ptr[i] = 0.0; + } + } } 
+ return chunk; } diff --git a/python/examples/usage_numpy_chunks.py b/python/examples/usage_numpy_chunks.py index b111b5d..6aeaf09 100644 --- a/python/examples/usage_numpy_chunks.py +++ b/python/examples/usage_numpy_chunks.py @@ -5,7 +5,8 @@ import numpy as np # Path to a sample file -raw_file = b"samples/datasetA/datasetA_1.raw" +# Using sampleB.raw because it has integer data with scaling (factor=0.01, offset=327.68) +raw_file = b"samples/sampleB.raw" if not os.path.exists(raw_file): print(f"Sample file {raw_file} not found.") exit(1) @@ -25,14 +26,23 @@ exit(0) # Pick the first channel -first_channel_uuid = channels[0]['uuid'].encode('utf-8') -print(f"Iterating over channel {first_channel_uuid}") +# For sampleB.raw, channel 347 is the interesting one +target_uuid = "347" +channel_info = next((ch for ch in channels if ch['uuid'] == target_uuid), channels[0]) -# Iterate in chunks +first_channel_uuid = channel_info['uuid'].encode('utf-8') +print(f"Iterating over channel {first_channel_uuid} ({channel_info.get('name', 'unnamed')})") + +# Check native datatype +if 'datatype' in channel_info: + print(f"Native IMC datatype ID: {channel_info['datatype']}") + +# Example 1: Scaled mode (default) - returns floats (physical units) +print("\n--- Scaled Mode (Physical Units) ---") total_rows = 0 -chunk_size = 100 +chunk_size = 1000 -for chunk in imcraw.iter_channel_numpy(first_channel_uuid, include_x=True, chunk_rows=chunk_size): +for chunk in imcraw.iter_channel_numpy(first_channel_uuid, include_x=True, chunk_rows=chunk_size, mode="scaled"): start = chunk['start'] y = chunk['y'] x = chunk.get('x') @@ -40,17 +50,38 @@ count = len(y) total_rows += count - print(f"Chunk start={start}, count={count}, y_shape={y.shape}, y_dtype={y.dtype}") - if x is not None: - print(f" x_shape={x.shape}, x_dtype={x.dtype}") - - # Verify data (optional, just checking first few values) - if start == 0 and count > 0: - print(f" First y value: {y[0]}") - - # Here you could write the chunk to a 
Parquet file using pyarrow or fastparquet - # e.g. - # table = pa.Table.from_pydict({"x": x, "y": y}) - # pq.write_table(table, output_file) - -print(f"Total rows read: {total_rows}") + if total_rows <= chunk_size * 2: # Print only first few chunks + print(f"Chunk start={start}, count={count}, y_shape={y.shape}, y_dtype={y.dtype}") + if x is not None: + print(f" x_shape={x.shape}, x_dtype={x.dtype}") + if count > 0: + print(f" First y value: {y[0]}") + +print(f"Total rows read (scaled): {total_rows}") + +# Example 2: Raw mode - returns native types (e.g. integers) +print("\n--- Raw Mode (Native Types) ---") + +# Get scaling factors +factor = float(channel_info.get('factor', 1.0)) +offset = float(channel_info.get('offset', 0.0)) +print(f"Scaling: factor={factor}, offset={offset}") + +total_rows = 0 + +for chunk in imcraw.iter_channel_numpy(first_channel_uuid, include_x=True, chunk_rows=chunk_size, mode="raw"): + start = chunk['start'] + y = chunk['y'] + + count = len(y) + total_rows += count + + if total_rows <= chunk_size * 2: + print(f"Chunk start={start}, count={count}, y_shape={y.shape}, y_dtype={y.dtype}") + if count > 0: + raw_val = y[0] + scaled_val = raw_val * factor + offset + print(f" First y value (raw): {raw_val}") + print(f" First y value (manually scaled): {scaled_val}") + +print(f"Total rows read (raw): {total_rows}") diff --git a/python/imctermite.pxd b/python/imctermite.pxd index cbeae15..262f57a 100644 --- a/python/imctermite.pxd +++ b/python/imctermite.pxd @@ -7,11 +7,13 @@ from libcpp cimport bool cdef extern from "lib/imc_raw.hpp" namespace "imc": cdef struct channel_chunk: - vector[double] x - vector[double] y + vector[unsigned char] x_bytes + vector[unsigned char] y_bytes unsigned long int start unsigned long int count bool has_x + int x_type + int y_type cdef cppclass cppimctermite "imc::raw": @@ -28,8 +30,11 @@ cdef extern from "lib/imc_raw.hpp" namespace "imc": # get length of a channel unsigned long int get_channel_length(string uuid) 
except + + # get numeric type of a channel + int get_channel_numeric_type(string uuid) except + + # read a chunk of channel data - channel_chunk read_channel_chunk(string uuid, unsigned long int start, unsigned long int count, bool include_x) except + + channel_chunk read_channel_chunk(string uuid, unsigned long int start, unsigned long int count, bool include_x, bool raw_mode) except + # print single channel/all channels void print_channel(string channeluuid, string outputdir, char delimiter) except + diff --git a/python/imctermite.pyx b/python/imctermite.pyx index 4c27e0b..5618592 100644 --- a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -4,6 +4,7 @@ from imctermite cimport cppimctermite, channel_chunk cimport numpy as cnp import numpy as np +from libc.string cimport memcpy import json as jn import decimal @@ -37,18 +38,44 @@ cdef class imctermite: chnlstjn = [jn.loads(chn.decode(get_codepage(chn),errors="ignore")) for chn in chnlst] return chnlstjn - def iter_channel_numpy(self, string channeluuid, bool include_x=True, int chunk_rows=1000000): + def iter_channel_numpy(self, string channeluuid, bool include_x=True, int chunk_rows=1000000, str mode="scaled"): cdef unsigned long int total_len = self.cppimc.get_channel_length(channeluuid) cdef unsigned long int start = 0 cdef channel_chunk chunk cdef cnp.ndarray x_arr cdef cnp.ndarray y_arr + cdef bool raw_mode = (mode == "raw") + + # Map imc::numtype to numpy dtype + # Types 9 (imc_devices_transitional_recording) and 10 (timestamp_ascii) + # are not currently supported by the underlying C++ library. 
+ dtype_map = { + 1: np.uint8, # unsigned_byte + 2: np.int8, # signed_byte + 3: np.uint16, # unsigned_short + 4: np.int16, # signed_short + 5: np.uint32, # unsigned_long (imc_Ulongint is unsigned int (32-bit) on x86_64 usually) + 6: np.int32, # signed_long (imc_Slongint is signed int) + 7: np.float32, # ffloat + 8: np.float64, # ddouble + 11: np.uint16, # two_byte_word_digital + 12: np.uint64, # eight_byte_unsigned_long + 13: np.uint64, # six_byte_unsigned_long (promoted to 8 bytes in C++) + 14: np.int64 # eight_byte_signed_long + } while start < total_len: - chunk = self.cppimc.read_channel_chunk(channeluuid, start, chunk_rows, include_x) + chunk = self.cppimc.read_channel_chunk(channeluuid, start, chunk_rows, include_x, raw_mode) - # Create numpy arrays from vectors - y_arr = np.array(chunk.y, dtype=np.float64) + # Create numpy arrays from bytes + y_dtype = dtype_map.get(chunk.y_type, np.float64) + + y_arr = np.empty(chunk.count, dtype=y_dtype) + + if chunk.y_bytes.size() > 0: + memcpy( cnp.PyArray_DATA(y_arr), + chunk.y_bytes.data(), + chunk.y_bytes.size()) result = { "start": chunk.start, @@ -56,7 +83,14 @@ cdef class imctermite: } if include_x: - x_arr = np.array(chunk.x, dtype=np.float64) + x_dtype = dtype_map.get(chunk.x_type, np.float64) + x_arr = np.empty(chunk.count, dtype=x_dtype) + + if chunk.x_bytes.size() > 0: + memcpy( cnp.PyArray_DATA(x_arr), + chunk.x_bytes.data(), + chunk.x_bytes.size()) + result["x"] = x_arr yield result diff --git a/tests/test_python.py b/tests/test_python.py index ee8e654..dd6a5df 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -114,9 +114,9 @@ class TestChunkedNumpy: def test_chunked_iteration_all_samples(self): """Verify chunked iteration against get_channels for all samples""" - raw_files = list(DATASET_A.glob("*.raw")) + list(DATASET_B.glob("*.raw")) - # Sort for deterministic order - raw_files.sort() + # Get all .raw and .dat files recursively + raw_files = sorted(list(SAMPLES_DIR.glob("**/*.raw")) + + 
list(SAMPLES_DIR.glob("**/*.dat"))) for raw_file in raw_files: # print(f"Testing {raw_file.name}") @@ -135,7 +135,7 @@ def test_chunked_iteration_all_samples(self): # Use a small chunk size to ensure we test chunking logic even on small files # Some files might be very small, so 100 is a good stress test - for chunk in imc.iter_channel_numpy(uuid, include_x=True, chunk_rows=100): + for chunk in imc.iter_channel_numpy(uuid, include_x=True, chunk_rows=100, mode="scaled"): y_chunks.append(chunk['y']) x_chunks.append(chunk['x']) @@ -159,13 +159,28 @@ def test_chunked_iteration_all_samples(self): # Test with include_x=False y_chunks_nox = [] - for chunk in imc.iter_channel_numpy(uuid, include_x=False, chunk_rows=100): + for chunk in imc.iter_channel_numpy(uuid, include_x=False, chunk_rows=100, mode="scaled"): y_chunks_nox.append(chunk['y']) assert 'x' not in chunk if y_chunks_nox: y_full_nox = np.concatenate(y_chunks_nox) assert np.allclose(y_full_nox, ch_ref['ydata'], equal_nan=True), f"Y data mismatch (no x) in {raw_file.name} channel {uuid}" + + # Test raw mode (basic check that it runs and returns correct length) + # We can't easily verify values without reimplementing the scaling logic, + # but we can check that it returns something valid. 
+ y_chunks_raw = [] + for chunk in imc.iter_channel_numpy(uuid, include_x=False, chunk_rows=100, mode="raw"): + y_chunks_raw.append(chunk['y']) + # Check that dtype is not float64 unless it really is float data + # Most samples are likely int16 or similar + # print(f"Raw dtype: {chunk['y'].dtype}") + + if y_chunks_raw: + y_full_raw = np.concatenate(y_chunks_raw) + assert len(y_full_raw) == len(ch_ref['ydata']), f"Raw length mismatch in {raw_file.name} channel {uuid}" + except Exception as e: pytest.fail(f"Failed processing {raw_file.name}: {str(e)}") @@ -248,12 +263,8 @@ def test_process_all_sample_files(self): pytest.skip(f"Samples directory not found: {SAMPLES_DIR}") # Get all .raw and .dat files recursively - files_to_test = sorted(list(SAMPLES_DIR.glob("*.raw")) + - list(SAMPLES_DIR.glob("*.dat")) + - list(SAMPLES_DIR.glob("**/*.raw")) + + files_to_test = sorted(list(SAMPLES_DIR.glob("**/*.raw")) + list(SAMPLES_DIR.glob("**/*.dat"))) - # Remove duplicates (files in root will be in both patterns) - files_to_test = sorted(set(files_to_test)) if len(files_to_test) == 0: pytest.skip("No .raw or .dat files in samples directory") @@ -278,11 +289,8 @@ def test_extract_all_sample_files_with_data(self): pytest.skip(f"Samples directory not found: {SAMPLES_DIR}") # Get all .raw and .dat files recursively - files_to_test = sorted(list(SAMPLES_DIR.glob("*.raw")) + - list(SAMPLES_DIR.glob("*.dat")) + - list(SAMPLES_DIR.glob("**/*.raw")) + + files_to_test = sorted(list(SAMPLES_DIR.glob("**/*.raw")) + list(SAMPLES_DIR.glob("**/*.dat"))) - files_to_test = sorted(set(files_to_test)) if len(files_to_test) == 0: pytest.skip("No .raw or .dat files in samples directory") From 4cee020a365f396f305d02dba8ef3eea88fc67a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Mon, 15 Dec 2025 22:29:28 +0100 Subject: [PATCH 04/12] Refactor IMC library for improved data handling and performance - Modified `component_group` and `channel` constructors to accept raw buffer 
pointers instead of vectors. - Enhanced `load_all_data` and `init_metadata` methods for better data initialization and loading. - Implemented `read_chunk` method in `channel` to facilitate chunked data reading with support for raw and scaled modes. - Updated `convert_data_to_type` and `convert_chunk_to_double` functions to handle raw data more efficiently. - Removed redundant `imc_result.hpp` file to streamline the codebase. - Adjusted Python bindings in `imctermite.pyx` to manage C++ instance memory correctly. --- lib/imc_block.hpp | 22 ++-- lib/imc_buffer.hpp | 127 +++++++++++++++++++++ lib/imc_channel.hpp | 246 +++++++++++++++++++++++++++++++++-------- lib/imc_conversion.hpp | 61 +++++++++- lib/imc_object.hpp | 38 +++---- lib/imc_raw.hpp | 194 ++++++-------------------------- lib/imc_result.hpp | 30 ----- python/imctermite.pyx | 10 +- 8 files changed, 456 insertions(+), 272 deletions(-) create mode 100644 lib/imc_buffer.hpp delete mode 100644 lib/imc_result.hpp diff --git a/lib/imc_block.hpp b/lib/imc_block.hpp index 332a3b6..71b9d58 100644 --- a/lib/imc_block.hpp +++ b/lib/imc_block.hpp @@ -34,7 +34,8 @@ namespace imc // name and buffer of associated raw file std::string raw_file_; - const std::vector* buffer_; + const unsigned char* buffer_; + size_t buffer_size_; // offset of first/last byte of parameters in block (separated by ch_sep_) // w.r.t. 
to first byte of block (=0) @@ -44,7 +45,7 @@ namespace imc // constructor block(key thekey, unsigned long int begin, unsigned long int end, - std::string raw_file, const std::vector* buffer): + std::string raw_file, const unsigned char* buffer, size_t buffer_size): thekey_(thekey), uuid_(std::to_string(begin)) { if ( !imc::check_key(thekey) ) throw std::logic_error("unknown key"); @@ -56,14 +57,15 @@ namespace imc } raw_file_ = raw_file; buffer_ = buffer; + buffer_size_ = buffer_size; // make sure "end_" does not exceed buffer size due to invalid "length" parameter of block - if ( end_ > buffer_->size() ) + if ( end_ > buffer_size_ ) { std::cout<<"WARNING: invalid length parameter in "<size()<<")" + <<"(block-end:"< resetting block-end to buffer-size\n"; - end_ = (unsigned long int)(buffer_->size()); + end_ = (unsigned long int)(buffer_size_); } try { @@ -86,7 +88,7 @@ namespace imc for ( unsigned long int b = begin_; b < end_ && ( ! (thekey_.name_== "CS") || count < 4 ); b++ ) { - if ( buffer_->at(b) == imc::ch_sep_ ) + if ( buffer_[b] == imc::ch_sep_ ) { // define range of parameter with first byte = ch_sep_ parameters_.push_back(imc::parameter(b,b)); @@ -124,8 +126,8 @@ namespace imc { throw std::logic_error("inconsistent parameter offsets"); } - std::vector parambuff(buffer_->begin()+begin_+param.begin(), - buffer_->begin()+begin_+param.end()); + std::vector parambuff(buffer_+begin_+param.begin(), + buffer_+begin_+param.end()); return parambuff; } @@ -140,7 +142,7 @@ namespace imc std::string prm(""); for ( unsigned long int i = param.begin()+1; i <= param.end(); i++ ) { - prm.push_back( (char)((*buffer_)[i]) ); + prm.push_back( (char)(buffer_[i]) ); } return prm; } @@ -163,7 +165,7 @@ namespace imc <size()<<"\n" + < +#include +#include +#include +#include +#include +#include + +namespace imc +{ + class MemoryMappedFile + { + private: + const unsigned char* data_; + size_t size_; + int fd_; + + public: + MemoryMappedFile() : data_(nullptr), size_(0), fd_(-1) 
{} + + ~MemoryMappedFile() + { + close_file(); + } + + // Delete copy constructor and assignment operator to prevent double-free + MemoryMappedFile(const MemoryMappedFile&) = delete; + MemoryMappedFile& operator=(const MemoryMappedFile&) = delete; + + // Implement move constructor + MemoryMappedFile(MemoryMappedFile&& other) noexcept + : data_(other.data_), size_(other.size_), fd_(other.fd_) + { + other.data_ = nullptr; + other.size_ = 0; + other.fd_ = -1; + } + + // Implement move assignment operator + MemoryMappedFile& operator=(MemoryMappedFile&& other) noexcept + { + if (this != &other) + { + close_file(); + data_ = other.data_; + size_ = other.size_; + fd_ = other.fd_; + other.data_ = nullptr; + other.size_ = 0; + other.fd_ = -1; + } + return *this; + } + + void map(const std::string& filename) + { + close_file(); + + fd_ = open(filename.c_str(), O_RDONLY); + if (fd_ == -1) + { + throw std::runtime_error("Failed to open file: " + filename); + } + + struct stat sb; + if (fstat(fd_, &sb) == -1) + { + close(fd_); + fd_ = -1; + throw std::runtime_error("Failed to get file size: " + filename); + } + size_ = sb.st_size; + + if (size_ == 0) + { + data_ = nullptr; + return; + } + + void* mapped = mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0); + if (mapped == MAP_FAILED) + { + close(fd_); + fd_ = -1; + size_ = 0; + throw std::runtime_error("Failed to mmap file: " + filename); + } + + data_ = static_cast(mapped); + } + + void close_file() + { + if (data_) + { + munmap(const_cast(data_), size_); + data_ = nullptr; + } + if (fd_ != -1) + { + close(fd_); + fd_ = -1; + } + size_ = 0; + } + + const unsigned char* data() const + { + return data_; + } + + size_t size() const + { + return size_; + } + + const unsigned char& operator[](size_t index) const + { + return data_[index]; + } + }; +} + +#endif diff --git a/lib/imc_channel.hpp b/lib/imc_channel.hpp index 0a83f0a..c86e394 100644 --- a/lib/imc_channel.hpp +++ b/lib/imc_channel.hpp @@ -11,6 +11,7 @@ #include #include 
#include +#include #if defined(__linux__) || defined(__APPLE__) #include #elif defined(__WIN32__) || defined(_WIN32) @@ -21,6 +22,16 @@ namespace imc { + struct channel_chunk { + std::vector x_bytes; + std::vector y_bytes; + unsigned long int start; + unsigned long int count; + bool has_x; + int x_type; + int y_type; + }; + struct component_env { std::string uuid_; @@ -274,7 +285,7 @@ namespace imc component_env compenv_; // Constructor to parse the associated blocks - component_group(component_env &compenv, std::map* blocks, std::vector* buffer) + component_group(component_env &compenv, std::map* blocks, const unsigned char* buffer) : compenv_(compenv) { if (blocks->count(compenv.CCuuid_) == 1) @@ -311,7 +322,7 @@ namespace imc // associated environment of blocks and map of blocks channel_env chnenv_; std::map* blocks_; - std::vector* buffer_; + const unsigned char* buffer_; imc::origin_data NO_; imc::language NL_; @@ -348,6 +359,8 @@ namespace imc // range, factor and offset double xfactor_, yfactor_; double xoffset_, yoffset_; + + unsigned long int number_of_samples_ = 0; // group reference the channel belongs to unsigned long int group_index_; @@ -355,7 +368,7 @@ namespace imc // constructor takes channel's block environment channel(channel_env &chnenv, std::map* blocks, - std::vector* buffer): + const unsigned char* buffer): chnenv_(chnenv), blocks_(blocks), buffer_(buffer), xfactor_(1.), yfactor_(1.), xoffset_(0.), yoffset_(0.), group_index_(-1) @@ -475,15 +488,15 @@ namespace imc } // start converting binary buffer to imc::datatype - if ( !chnenv_.CSuuid_.empty() ) convert_buffer(); + if ( !chnenv_.CSuuid_.empty() ) init_metadata(); // convert any non-UTF-8 codepage to UTF-8 and cleanse any text convert_encoding(); cleanse_text(); } - // convert buffer to actual datatype - void convert_buffer() + // initialize metadata without loading data + void init_metadata() { std::vector prms = blocks_->at(chnenv_.CSuuid_).get_parameters(); if ( prms.size() < 4) @@ 
-492,65 +505,203 @@ namespace imc } // extract (channel dependent) part of buffer - unsigned long int buffstrt = prms[3].begin(); - std::vector yCSbuffer( buffer_->begin()+buffstrt+ybuffer_offset_+1, - buffer_->begin()+buffstrt+ybuffer_offset_+ybuffer_size_+1 ); + size_t yCSbuffer_size = ybuffer_size_; // determine number of values in buffer - unsigned long int ynum_values = (unsigned long int)(yCSbuffer.size()/(ysignbits_/8)); - if ( ynum_values*(ysignbits_/8) != yCSbuffer.size() ) + unsigned long int ynum_values = (unsigned long int)(yCSbuffer_size/(ysignbits_/8)); + if ( ynum_values*(ysignbits_/8) != yCSbuffer_size ) { throw std::runtime_error("CSbuffer and significant bits of y datatype don't match"); } - + + number_of_samples_ = ynum_values; if (dimension_ == 1) { - // process y-data - process_data(ydata_, ynum_values, ydatatp_, yCSbuffer); - // find appropriate precision for "xdata_" by means of "xstepwidth_" xprec_ = (xstepwidth_ > 0 ) ? (int)ceil(fabs(log10(xstepwidth_))) : 10; - - // fill xdata_ - for ( unsigned long int i = 0; i < ynum_values; i++ ) - { - xdata_.push_back(xstart_+(double)i*xstepwidth_); - } } else if (dimension_ == 2) { - // process x- and y-data - std::vector xCSbuffer( buffer_->begin()+buffstrt+xbuffer_offset_+1, - buffer_->begin()+buffstrt+xbuffer_offset_+xbuffer_size_+1 ); - - // determine number of values in buffer - unsigned long int xnum_values = (unsigned long int)(xCSbuffer.size()/(xsignbits_/8)); - if ( xnum_values*(xsignbits_/8) != xCSbuffer.size() ) - { - throw std::runtime_error("CSbuffer and significant bits of x datatype don't match"); - } + // const unsigned char* xCSbuffer = buffer_ + buffstrt + xbuffer_offset_ + 1; + size_t xCSbuffer_size = xbuffer_size_; + unsigned long int xnum_values = (unsigned long int)(xCSbuffer_size/(xsignbits_/8)); + if ( xnum_values != ynum_values ) { throw std::runtime_error("x and y data have different number of values"); } - xprec_ = 9; - - process_data(xdata_, xnum_values, xdatatp_, 
xCSbuffer); - process_data(ydata_, ynum_values, ydatatp_, yCSbuffer); } else { throw std::runtime_error("unsupported dimension"); } + } + + // convert buffer to actual datatype (loads all data) + void load_all_data() + { + std::vector prms = blocks_->at(chnenv_.CSuuid_).get_parameters(); + unsigned long int buffstrt = prms[3].begin(); + const unsigned char* yCSbuffer = buffer_ + buffstrt + ybuffer_offset_ + 1; + size_t yCSbuffer_size = ybuffer_size_; + unsigned long int ynum_values = number_of_samples_; + + if (dimension_ == 1) + { + process_data(ydata_, ynum_values, ydatatp_, yCSbuffer, yCSbuffer_size); + for ( unsigned long int i = 0; i < ynum_values; i++ ) + { + xdata_.push_back(xstart_+(double)i*xstepwidth_); + } + } + else if (dimension_ == 2) + { + const unsigned char* xCSbuffer = buffer_ + buffstrt + xbuffer_offset_ + 1; + size_t xCSbuffer_size = xbuffer_size_; + process_data(xdata_, ynum_values, xdatatp_, xCSbuffer, xCSbuffer_size); + process_data(ydata_, ynum_values, ydatatp_, yCSbuffer, yCSbuffer_size); + } transformData(xdata_, xfactor_, xoffset_); transformData(ydata_, yfactor_, yoffset_); } + channel_chunk read_chunk(unsigned long int start, unsigned long int count, bool include_x, bool raw_mode) + { + unsigned long int total_len = number_of_samples_; + + if ( start >= total_len ) + { + return { {}, {}, start, 0, include_x, 0, 0 }; + } + + unsigned long int end = start + count; + if ( end > total_len ) end = total_len; + unsigned long int actual_count = end - start; + + channel_chunk chunk; + chunk.start = start; + chunk.count = actual_count; + chunk.has_x = include_x; + chunk.x_type = 0; + chunk.y_type = 0; + + std::vector prms = blocks_->at(chnenv_.CSuuid_).get_parameters(); + unsigned long int buffstrt = prms[3].begin(); + + // Handle Y data + if (raw_mode) { + int type = (int)ydatatp_; + unsigned long int bytes_per_sample = ysignbits_ / 8; + unsigned long int abs_start = buffstrt + ybuffer_offset_ + 1 + start * bytes_per_sample; + unsigned long int 
byte_count = actual_count * bytes_per_sample; + + if (type == 13) { // six_byte_unsigned_long -> promote to 8 byte (uint64) + chunk.y_type = 13; + chunk.y_bytes.resize(actual_count * 8); + uint64_t* dest = reinterpret_cast(chunk.y_bytes.data()); + for (unsigned long int i = 0; i < actual_count; ++i) { + unsigned long int src_idx = abs_start + i * 6; + uint64_t val = 0; + for (int b = 0; b < 6; ++b) val |= (uint64_t)buffer_[src_idx + b] << (b * 8); + dest[i] = val; + } + } else { + chunk.y_type = type; + chunk.y_bytes.resize(byte_count); + std::copy(buffer_ + abs_start, buffer_ + abs_start + byte_count, chunk.y_bytes.begin()); + } + } else { + // Scaled mode: convert to double + chunk.y_type = 8; // imc::numtype::ddouble + chunk.y_bytes.resize(actual_count * sizeof(double)); + std::vector temp_data; + + unsigned long int abs_start = buffstrt + ybuffer_offset_ + 1; // Base start + + switch (ydatatp_) { + case numtype::unsigned_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::signed_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::unsigned_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::signed_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::signed_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::ffloat: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::ddouble: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, 
yoffset_, temp_data); break; + case numtype::two_byte_word_digital: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::eight_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::six_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::eight_byte_signed_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + default: throw std::runtime_error("Unsupported type for scaled chunk reading (Y): " + std::to_string(ydatatp_)); + } + + memcpy(chunk.y_bytes.data(), temp_data.data(), temp_data.size() * sizeof(double)); + } + + // Handle X data + if (include_x) { + if (dimension_ == 2 && raw_mode) { + int type = (int)xdatatp_; + unsigned long int bytes_per_sample = xsignbits_ / 8; + unsigned long int abs_start = buffstrt + xbuffer_offset_ + 1 + start * bytes_per_sample; + unsigned long int byte_count = actual_count * bytes_per_sample; + + if (type == 13) { + chunk.x_type = 13; + chunk.x_bytes.resize(actual_count * 8); + uint64_t* dest = reinterpret_cast(chunk.x_bytes.data()); + for (unsigned long int i = 0; i < actual_count; ++i) { + unsigned long int src_idx = abs_start + i * 6; + uint64_t val = 0; + for (int b = 0; b < 6; ++b) val |= (uint64_t)buffer_[src_idx + b] << (b * 8); + dest[i] = val; + } + } else { + chunk.x_type = type; + chunk.x_bytes.resize(byte_count); + std::copy(buffer_ + abs_start, buffer_ + abs_start + byte_count, chunk.x_bytes.begin()); + } + } else { + // Generated X or scaled X + chunk.x_type = 8; // imc::numtype::ddouble + chunk.x_bytes.resize(actual_count * sizeof(double)); + double* ptr = reinterpret_cast(chunk.x_bytes.data()); + + if (dimension_ == 2) { + // Read X from file and scale + std::vector temp_data; + unsigned long int 
abs_start = buffstrt + xbuffer_offset_ + 1; + switch (xdatatp_) { + case numtype::unsigned_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::signed_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::unsigned_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::signed_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::signed_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::ffloat: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::ddouble: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::two_byte_word_digital: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::eight_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::six_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::eight_byte_signed_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + default: throw std::runtime_error("Unsupported type for scaled chunk reading (X): " + std::to_string(xdatatp_)); + } + memcpy(ptr, temp_data.data(), temp_data.size() * sizeof(double)); + } else { + // Generated X + for 
(unsigned long int i = 0; i < actual_count; ++i) { + ptr[i] = xstart_ + (double)(start + i) * xstepwidth_; + } + } + } + } + return chunk; + } + // handle data type conversion - void process_data(std::vector& data_, size_t num_values, numtype datatp_, std::vector& CSbuffer) + void process_data(std::vector& data_, size_t num_values, numtype datatp_, const unsigned char* CSbuffer, size_t CSbuffer_size) { // adjust size of data data_.resize(num_values); @@ -559,34 +710,34 @@ namespace imc switch (datatp_) { case numtype::unsigned_byte: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::signed_byte: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::unsigned_short: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::signed_short: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::unsigned_long: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::signed_long: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::ffloat: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::ddouble: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::two_byte_word_digital: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::six_byte_unsigned_long: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; default: throw 
std::runtime_error(std::string("unsupported/unknown datatype ") + std::to_string(datatp_)); @@ -699,6 +850,9 @@ namespace imc // provide JSON string of metadata std::string get_json(bool include_data = false) { + if (include_data && ydata_.empty() && number_of_samples_ > 0) { + load_all_data(); + } // prepare printable trigger-time std::time_t tt = std::chrono::system_clock::to_time_t(trigger_time_); std::time_t att = std::chrono::system_clock::to_time_t(absolute_trigger_time_); diff --git a/lib/imc_conversion.hpp b/lib/imc_conversion.hpp index 9cdb71e..dac3a38 100644 --- a/lib/imc_conversion.hpp +++ b/lib/imc_conversion.hpp @@ -11,14 +11,14 @@ namespace imc { // convert raw data in buffer into specific datatype template - void convert_data_to_type(std::vector& subbuffer, + void convert_data_to_type(const unsigned char* subbuffer, size_t subbuffer_size, std::vector& channel) { // check number of elements of type "datatype" in buffer - if ( subbuffer.size() != channel.size()*sizeof(datatype) ) + if ( subbuffer_size != channel.size()*sizeof(datatype) ) { throw std::runtime_error( std::string("size mismatch between subbuffer (") - + std::to_string(subbuffer.size()) + + std::to_string(subbuffer_size) + std::string(") and datatype (") + std::to_string(channel.size()) + std::string("*") + std::to_string(sizeof(datatype)) + std::string(")") ); @@ -44,6 +44,61 @@ namespace imc // for ( auto el: channel ) std::cout< + void convert_chunk_to_double(const unsigned char* buffer, size_t start_index, size_t count, + double factor, double offset, std::vector& out) + { + size_t type_size = sizeof(SourceType); + const unsigned char* start_ptr = buffer + start_index * type_size; + + out.resize(count); + + for (size_t i = 0; i < count; ++i) { + SourceType val; + + const unsigned char* val_ptr = start_ptr + i * type_size; + uint8_t* dest_ptr = reinterpret_cast(&val); + for(size_t j=0; j(val); + if (factor != 1.0 || offset != 0.0) { + double fact = (factor == 0.0) ? 
1.0 : factor; + dval = dval * fact + offset; + } + out[i] = dval; + } + } + + // Specialization for imc_sixbyte + template<> + inline void convert_chunk_to_double(const unsigned char* buffer, size_t start_index, size_t count, + double factor, double offset, std::vector& out) + { + size_t type_size = 6; + const unsigned char* start_ptr = buffer + start_index * type_size; + + out.resize(count); + + for (size_t i = 0; i < count; ++i) { + const unsigned char* val_ptr = start_ptr + i * type_size; + uint64_t val = 0; + for(int j=0; j<6; ++j) { + val |= (uint64_t)val_ptr[j] << (j*8); + } + + double dval = static_cast(val); + if (factor != 1.0 || offset != 0.0) { + double fact = (factor == 0.0) ? 1.0 : factor; + dval = dval * fact + offset; + } + out[i] = dval; + } + } + } #endif diff --git a/lib/imc_object.hpp b/lib/imc_object.hpp index 1fc1a44..6090a18 100644 --- a/lib/imc_object.hpp +++ b/lib/imc_object.hpp @@ -12,12 +12,12 @@ namespace imc { // obtain specific parameters as string - std::string get_parameter(const std::vector* buffer, const imc::parameter* param) + std::string get_parameter(const unsigned char* buffer, const imc::parameter* param) { std::string prm(""); for ( unsigned long int i = param->begin()+1; i <= param->end(); i++ ) { - prm.push_back((char)(*buffer)[i]); + prm.push_back((char)buffer[i]); } return prm; } @@ -29,7 +29,7 @@ namespace imc int processor_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 3 ) throw std::runtime_error("invalid number of parameters in CF"); fileformat_ = std::stoi(get_parameter(buffer,¶meters[0])); @@ -56,7 +56,7 @@ namespace imc bool closed_; // corresponds to true = 1 and false = 0 in file // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void 
parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 2 ) throw std::runtime_error("invalid number of parameters in CK"); version_ = std::stoi(get_parameter(buffer,¶meters[0])); @@ -83,7 +83,7 @@ namespace imc std::string comment_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 7 ) throw std::runtime_error("invalid number of parameters in CB"); group_index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -111,7 +111,7 @@ namespace imc std::string comment_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 9 ) throw std::runtime_error("invalid number of parameters in CT"); group_index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -149,7 +149,7 @@ namespace imc int dimension_; // corresponding to fieldtype \in {1,} // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 5 ) throw std::runtime_error("invalid number of parameters in CG"); number_components_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -176,7 +176,7 @@ namespace imc std::string unit_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 6 ) throw std::runtime_error("invalid number of parameters in CD1"); dx_ = std::stod(get_parameter(buffer,¶meters[2])); @@ -208,7 +208,7 @@ namespace imc int pretriggerapp_; // construct 
members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 11 ) throw std::runtime_error("invalid number of parameters in CD2"); dx_ = std::stod(get_parameter(buffer,¶meters[2])); @@ -244,7 +244,7 @@ namespace imc bool analog_digital_; // 1 => false (analog), 2 => true (digital) // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 4 ) throw std::runtime_error("invalid number of parameters in CC"); component_index_ = std::stoi(get_parameter(buffer,¶meters[2])); @@ -291,7 +291,7 @@ namespace imc unsigned long int distance_bytes_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 10 ) throw std::runtime_error("invalid number of parameters in CP"); buffer_reference_ = std::stoi(get_parameter(buffer,¶meters[2])); @@ -337,7 +337,7 @@ namespace imc // bool new_event_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 13 ) throw std::runtime_error("invalid number of parameters in Cb"); number_buffers_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -379,7 +379,7 @@ namespace imc std::string unit_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 8 ) throw 
std::runtime_error("invalid number of parameters in CR"); transform_ = (get_parameter(buffer,¶meters[2]) == std::string("1")); @@ -411,7 +411,7 @@ namespace imc std::string comment_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 9 ) throw std::runtime_error("invalid number of parameters in CN"); group_index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -440,7 +440,7 @@ namespace imc // unsigned long int begin_buffer_, end_buffer_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 4 ) throw std::runtime_error("invalid number of parameters in CS"); index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -464,7 +464,7 @@ namespace imc std::string language_code_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if (parameters.size() < 4) throw std::runtime_error("invalid number of parameters in NL"); codepage_ = get_parameter(buffer, ¶meters[2]); @@ -480,7 +480,7 @@ namespace imc std::string comment_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 7 ) throw std::runtime_error("invalid number of parameters in NO"); origin_ = ( get_parameter(buffer,¶meters[2]) == std::string("1") ); @@ -506,7 +506,7 @@ namespace imc double trigger_time_frac_secs_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* 
buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 8 ) throw std::runtime_error("invalid number of parameters in NT1"); tms_ = std::tm(); @@ -560,7 +560,7 @@ namespace imc { rawobject(): objidx_(-1) { } - void parse(imc::key key, const std::vector* buffer, + void parse(imc::key key, const unsigned char* buffer, const std::vector& parameters) { if ( key.name_ == std::string("CF") ) diff --git a/lib/imc_raw.hpp b/lib/imc_raw.hpp index 77804db..47fc011 100644 --- a/lib/imc_raw.hpp +++ b/lib/imc_raw.hpp @@ -7,35 +7,24 @@ #include #include -// #include "hexshow.hpp" +#include "imc_buffer.hpp" #include "imc_key.hpp" #include "imc_block.hpp" #include "imc_datatype.hpp" #include "imc_object.hpp" -#include "imc_result.hpp" #include "imc_channel.hpp" //---------------------------------------------------------------------------// namespace imc { - struct channel_chunk { - std::vector x_bytes; - std::vector y_bytes; - unsigned long int start; - unsigned long int count; - bool has_x; - int x_type; - int y_type; - }; - class raw { // (path of) raw-file and its basename std::string raw_file_, file_name_; // buffer of raw-file - std::vector buffer_; + imc::MemoryMappedFile buffer_; // list and map of imc-blocks std::vector rawblocks_; @@ -53,6 +42,12 @@ namespace imc raw() { }; raw(std::string raw_file): raw_file_(raw_file) { set_file(raw_file); }; + // Delete copy and move operations because of self-referential pointers in channels_ + raw(const raw&) = delete; + raw& operator=(const raw&) = delete; + raw(raw&&) = delete; + raw& operator=(raw&&) = delete; + // provide new raw-file void set_file(std::string raw_file) { @@ -68,16 +63,9 @@ namespace imc // open file and stream data into buffer void fill_buffer() { - buffer_.clear(); - // open file and put data in buffer try { - std::ifstream fin(raw_file_.c_str(),std::ifstream::binary); - if ( !fin.good() ) throw std::runtime_error("failed to open 
file"); - std::vector buffer((std::istreambuf_iterator(fin)), - (std::istreambuf_iterator())); - buffer_ = buffer; - fin.close(); + buffer_.map(raw_file_); } catch ( const std::exception& e ) { throw std::runtime_error( std::string("failed to open raw-file and stream data in buffer: ") + e.what() @@ -93,31 +81,33 @@ namespace imc // reset counter to identify computational complexity cplxcnt_ = 0; + const unsigned char* data = buffer_.data(); + size_t size = buffer_.size(); + // start parsing raw-blocks in buffer - for ( std::vector::iterator it=buffer_.begin(); - it!=buffer_.end(); ++it ) + for ( unsigned long int i = 0; i < size; ++i ) { cplxcnt_++; // check for "magic byte" - if ( *it == ch_bgn_ ) + if ( data[i] == ch_bgn_ ) { // check for (non)critical key - if ( *(it+1) == imc::key_crit_ || *(it+1) == imc::key_non_crit_ ) + if ( data[i+1] == imc::key_crit_ || data[i+1] == imc::key_non_crit_ ) { // compose (entire) key - std::string newkey = { (char)*(it+1), (char)*(it+2) }; - imc::key itkey(*(it+1) == imc::key_crit_,newkey); + std::string newkey = { (char)data[i+1], (char)data[i+2] }; + imc::key itkey(data[i+1] == imc::key_crit_,newkey); // expecting ch_sep_ after key - if ( *(it+3) == ch_sep_ ) + if ( data[i+3] == ch_sep_ ) { // extract key version std::string vers(""); unsigned long int pos = 4; - while ( *(it+pos) != ch_sep_ ) + while ( data[i+pos] != ch_sep_ ) { - vers.push_back((char)*(it+pos)); + vers.push_back((char)data[i+pos]); pos++; } int version = std::stoi(vers); @@ -132,9 +122,9 @@ namespace imc // get block length std::string leng(""); pos++; - while ( *(it+pos) != ch_sep_ ) + while ( data[i+pos] != ch_sep_ ) { - leng.push_back((char)*(it+pos)); + leng.push_back((char)data[i+pos]); pos++; } unsigned long int length = std::stoul(leng); @@ -142,23 +132,23 @@ namespace imc // declare and initialize corresponding key and block // imc::key bkey( *(it+1)==imc::key_crit_ , newkey, // imc::keys.at(newkey).description_, version ); - imc::block 
blk(itkey,(unsigned long int)(it-buffer_.begin()), - (unsigned long int)(it-buffer_.begin()+pos+1+length), - raw_file_, &buffer_); + imc::block blk(itkey,i, + i+pos+1+length, + raw_file_, data, size); // add block to list rawblocks_.push_back(blk); // skip the remaining block according to its length - if ( (unsigned long int)(it-buffer_.begin()+length) < (unsigned long int)(buffer_.size()) ) + if ( i+length < size ) { - std::advance(it,length); + i += length; } } else { // all critical must be known !! while a noncritical may be ignored - if ( *(it+1) == imc::key_crit_ ) + if ( data[i+1] == imc::key_crit_ ) { throw std::runtime_error( std::string("unknown critical key: ") + newkey + std::to_string(version) @@ -175,7 +165,7 @@ namespace imc { throw std::runtime_error( std::string("invalid block or corrupt buffer at byte: ") - + std::to_string(it+3-buffer_.begin()) + + std::to_string(i+3) ); } } @@ -242,7 +232,7 @@ namespace imc // a new component group is started // TODO: can we avoid to parse the whole component here? 
imc::component component; - component.parse(&buffer_, blk.get_parameters()); + component.parse(buffer_.data(), blk.get_parameters()); if ( component.component_index_ == 1 ) compenv_ptr = &chnenv.compenv1_; else if ( component.component_index_ == 2 ) compenv_ptr = &chnenv.compenv2_; else throw std::runtime_error("invalid component index in CC block"); @@ -293,7 +283,7 @@ namespace imc // create channel object and add it to the map of channels channels_.insert( std::pair - (chnenv.CNuuid_,imc::channel(chnenv,&mapblocks_,&buffer_)) + (chnenv.CNuuid_,imc::channel(chnenv,&mapblocks_,buffer_.data())) ); // reset channel uuid @@ -408,7 +398,7 @@ namespace imc { if ( channels_.count(uuid) ) { - return (unsigned long int)channels_.at(uuid).ydata_.size(); + return channels_.at(uuid).number_of_samples_; } else { @@ -437,125 +427,7 @@ namespace imc throw std::runtime_error(std::string("channel does not exist:") + uuid); } - imc::channel& ch = channels_.at(uuid); - unsigned long int total_len = ch.ydata_.size(); - - if ( start >= total_len ) - { - return { {}, {}, start, 0, include_x, 0, 0 }; - } - - unsigned long int end = start + count; - if ( end > total_len ) end = total_len; - unsigned long int actual_count = end - start; - - channel_chunk chunk; - chunk.start = start; - chunk.count = actual_count; - chunk.has_x = include_x; - - // Handle Y data - if (raw_mode) { - // Raw mode: read bytes directly from buffer - int type = (int)ch.ydatatp_; - unsigned long int bytes_per_sample = ch.ysignbits_ / 8; - - if (mapblocks_.count(ch.chnenv_.CSuuid_) == 0) { - throw std::runtime_error("CS block not found for channel"); - } - imc::block& cs_block = mapblocks_.at(ch.chnenv_.CSuuid_); - std::vector prms = cs_block.get_parameters(); - if (prms.size() < 4) throw std::runtime_error("Invalid CS block parameters"); - unsigned long int buffstrt = prms[3].begin(); - - unsigned long int abs_start = buffstrt + ch.ybuffer_offset_ + 1 + start * bytes_per_sample; - unsigned long int byte_count = 
actual_count * bytes_per_sample; - - if (abs_start + byte_count > buffer_.size()) { - throw std::runtime_error("Buffer read out of bounds"); - } - - if (type == 13) { // six_byte_unsigned_long -> promote to 8 byte (uint64) - chunk.y_type = 13; // Keep original type ID, but data is promoted - chunk.y_bytes.resize(actual_count * 8); - uint64_t* dest = reinterpret_cast(chunk.y_bytes.data()); - - for (unsigned long int i = 0; i < actual_count; ++i) { - unsigned long int src_idx = abs_start + i * 6; - uint64_t val = 0; - // Assuming Little Endian storage in file - for (int b = 0; b < 6; ++b) { - val |= (uint64_t)buffer_[src_idx + b] << (b * 8); - } - dest[i] = val; - } - } else { - chunk.y_type = type; - chunk.y_bytes.resize(byte_count); - std::copy(buffer_.begin() + abs_start, buffer_.begin() + abs_start + byte_count, chunk.y_bytes.begin()); - } - } else { - // Scaled mode: convert to double - chunk.y_type = 8; // imc::numtype::ddouble - chunk.y_bytes.resize(actual_count * sizeof(double)); - double* ptr = reinterpret_cast(chunk.y_bytes.data()); - - for (unsigned long int i = 0; i < actual_count; ++i) { - ptr[i] = ch.ydata_[start + i].as_double(); - } - } - - // Handle X data - if (include_x) { - if (ch.dimension_ == 2 && raw_mode) { - // XY channel, raw mode - int type = (int)ch.xdatatp_; - unsigned long int bytes_per_sample = ch.xsignbits_ / 8; - - imc::block& cs_block = mapblocks_.at(ch.chnenv_.CSuuid_); - std::vector prms = cs_block.get_parameters(); - unsigned long int buffstrt = prms[3].begin(); - - unsigned long int abs_start = buffstrt + ch.xbuffer_offset_ + 1 + start * bytes_per_sample; - unsigned long int byte_count = actual_count * bytes_per_sample; - - if (abs_start + byte_count > buffer_.size()) { - throw std::runtime_error("Buffer read out of bounds (X)"); - } - - if (type == 13) { // six_byte_unsigned_long -> promote to 8 byte - chunk.x_type = 13; // Keep original type ID - chunk.x_bytes.resize(actual_count * 8); - uint64_t* dest = 
reinterpret_cast(chunk.x_bytes.data()); - for (unsigned long int i = 0; i < actual_count; ++i) { - unsigned long int src_idx = abs_start + i * 6; - uint64_t val = 0; - for (int b = 0; b < 6; ++b) { - val |= (uint64_t)buffer_[src_idx + b] << (b * 8); - } - dest[i] = val; - } - } else { - chunk.x_type = type; - chunk.x_bytes.resize(byte_count); - std::copy(buffer_.begin() + abs_start, buffer_.begin() + abs_start + byte_count, chunk.x_bytes.begin()); - } - } else { - // Generated X or scaled X - chunk.x_type = 8; // imc::numtype::ddouble - chunk.x_bytes.resize(actual_count * sizeof(double)); - double* ptr = reinterpret_cast(chunk.x_bytes.data()); - - for (unsigned long int i = 0; i < actual_count; ++i) { - if (start + i < ch.xdata_.size()) - ptr[i] = ch.xdata_[start + i].as_double(); - else - ptr[i] = 0.0; - } - } - } - - return chunk; + return channels_.at(uuid).read_chunk(start, count, include_x, raw_mode); } // print single specific channel diff --git a/lib/imc_result.hpp b/lib/imc_result.hpp deleted file mode 100644 index 1961ace..0000000 --- a/lib/imc_result.hpp +++ /dev/null @@ -1,30 +0,0 @@ -//---------------------------------------------------------------------------// - -#ifndef IMCRESULT -#define IMCRESULT - -#include "imc_datatype.hpp" - -//---------------------------------------------------------------------------// - -namespace imc -{ - struct channel_tab - { - std::string name_; - - // abscissa - std::vector xaxis_; - std::string xunit_; - - // ordinate - // std::vector yaxis_; - std::vector yaxis_; - std::string yunit_; - }; - -} - -#endif - -//---------------------------------------------------------------------------// diff --git a/python/imctermite.pyx b/python/imctermite.pyx index 5618592..3bfc319 100644 --- a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -21,12 +21,16 @@ def get_codepage(chn) : cdef class imctermite: - # C++ instance of class => stack allocated (requires nullary constructor!) 
- cdef cppimctermite cppimc + # C++ instance of class + cdef cppimctermite* cppimc # constructor def __cinit__(self, string rawfile): - self.cppimc = cppimctermite(rawfile) + self.cppimc = new cppimctermite(rawfile) + + def __dealloc__(self): + if self.cppimc != NULL: + del self.cppimc # provide raw file def submit_file(self,string rawfile): From fdc8ecef57e5dcb2026a1818efe532b1113ab2e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Mon, 15 Dec 2025 22:48:51 +0100 Subject: [PATCH 05/12] Add windows target to CI tests and add tests for streaming functionality - Update GitHub Actions workflow to support testing on multiple OS - Refactor memory mapping in imc_buffer.hpp for Windows compatibility - Improve makefile to handle .pyd files for Python builds - Add comprehensive tests for streaming and chunking functionality in test_streaming.py --- .github/workflows/test.yml | 33 ++++++++--- lib/imc_buffer.hpp | 101 +++++++++++++++++++++++++++++++-- makefile | 4 +- python/makefile | 4 +- tests/test_streaming.py | 112 +++++++++++++++++++++++++++++++++++++ 5 files changed, 236 insertions(+), 18 deletions(-) create mode 100644 tests/test_streaming.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ce243c4..7fa6520 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,20 +2,35 @@ name: Run Tests on: push: - branches: [ master ] + branches: [ master, numpy-streaming ] pull_request: branches: [ master ] jobs: test: - runs-on: ubuntu-latest - + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + python-version: ["3.10"] + steps: - name: Checkout code uses: actions/checkout@v3 - - - name: Build Docker image - run: docker build -t imctermite . 
- - - name: Run tests in container - run: docker run --rm imctermite make test + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest numpy cython setuptools wheel + + # Cross-platform build and test using Makefile + # Requires bash shell on Windows (Git Bash) + - name: Build and Test + shell: bash + run: | + make test diff --git a/lib/imc_buffer.hpp b/lib/imc_buffer.hpp index d6708f7..80f68f6 100644 --- a/lib/imc_buffer.hpp +++ b/lib/imc_buffer.hpp @@ -3,12 +3,18 @@ #include #include -#include -#include -#include -#include #include +#if defined(_WIN32) || defined(_WIN64) + #define WIN32_LEAN_AND_MEAN + #include +#else + #include + #include + #include + #include +#endif + namespace imc { class MemoryMappedFile @@ -16,10 +22,19 @@ namespace imc private: const unsigned char* data_; size_t size_; +#if defined(_WIN32) || defined(_WIN64) + HANDLE hFile_; + HANDLE hMap_; +#else int fd_; +#endif public: +#if defined(_WIN32) || defined(_WIN64) + MemoryMappedFile() : data_(nullptr), size_(0), hFile_(INVALID_HANDLE_VALUE), hMap_(NULL) {} +#else MemoryMappedFile() : data_(nullptr), size_(0), fd_(-1) {} +#endif ~MemoryMappedFile() { @@ -32,12 +47,22 @@ namespace imc // Implement move constructor MemoryMappedFile(MemoryMappedFile&& other) noexcept +#if defined(_WIN32) || defined(_WIN64) + : data_(other.data_), size_(other.size_), hFile_(other.hFile_), hMap_(other.hMap_) + { + other.data_ = nullptr; + other.size_ = 0; + other.hFile_ = INVALID_HANDLE_VALUE; + other.hMap_ = NULL; + } +#else : data_(other.data_), size_(other.size_), fd_(other.fd_) { other.data_ = nullptr; other.size_ = 0; other.fd_ = -1; } +#endif // Implement move assignment operator MemoryMappedFile& operator=(MemoryMappedFile&& other) noexcept @@ -47,10 +72,17 @@ namespace imc close_file(); data_ = other.data_; size_ = 
other.size_; +#if defined(_WIN32) || defined(_WIN64) + hFile_ = other.hFile_; + hMap_ = other.hMap_; + other.hFile_ = INVALID_HANDLE_VALUE; + other.hMap_ = NULL; +#else fd_ = other.fd_; + other.fd_ = -1; +#endif other.data_ = nullptr; other.size_ = 0; - other.fd_ = -1; } return *this; } @@ -59,6 +91,46 @@ namespace imc { close_file(); +#if defined(_WIN32) || defined(_WIN64) + hFile_ = CreateFileA(filename.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + if (hFile_ == INVALID_HANDLE_VALUE) + { + throw std::runtime_error("Failed to open file: " + filename); + } + + LARGE_INTEGER fileSize; + if (!GetFileSizeEx(hFile_, &fileSize)) + { + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + throw std::runtime_error("Failed to get file size: " + filename); + } + size_ = (size_t)fileSize.QuadPart; + + if (size_ == 0) + { + data_ = nullptr; + return; + } + + hMap_ = CreateFileMappingA(hFile_, NULL, PAGE_READONLY, 0, 0, NULL); + if (hMap_ == NULL) + { + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + throw std::runtime_error("Failed to create file mapping: " + filename); + } + + data_ = static_cast(MapViewOfFile(hMap_, FILE_MAP_READ, 0, 0, 0)); + if (data_ == NULL) + { + CloseHandle(hMap_); + hMap_ = NULL; + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + throw std::runtime_error("Failed to map view of file: " + filename); + } +#else fd_ = open(filename.c_str(), O_RDONLY); if (fd_ == -1) { @@ -90,20 +162,39 @@ namespace imc } data_ = static_cast(mapped); +#endif } void close_file() { if (data_) { +#if defined(_WIN32) || defined(_WIN64) + UnmapViewOfFile(data_); +#else munmap(const_cast(data_), size_); +#endif data_ = nullptr; } + +#if defined(_WIN32) || defined(_WIN64) + if (hMap_) + { + CloseHandle(hMap_); + hMap_ = NULL; + } + if (hFile_ != INVALID_HANDLE_VALUE) + { + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + } +#else if (fd_ != -1) { close(fd_); fd_ = -1; } +#endif size_ = 0; } diff --git 
a/makefile b/makefile index d5e7b01..59dd5a9 100644 --- a/makefile +++ b/makefile @@ -88,11 +88,11 @@ docker-run: python-build: check-tags make -C python/ build-inplace - cp python/imctermite*.so ./ -v + cp python/imctermite*.so ./ -v 2>/dev/null || cp python/imctermite*.pyd ./ -v 2>/dev/null || true python-clean: make -C python/ clean - rm -vf imctermite*.so + rm -vf imctermite*.so imctermite*.pyd python-test: PYTHONPATH=./ python python/examples/usage.py diff --git a/python/makefile b/python/makefile index 6bb6ecd..43dd6ea 100644 --- a/python/makefile +++ b/python/makefile @@ -26,8 +26,8 @@ build-bdist: setup build-clean: python setup.py clean --all - rm -vf imctermite*.so imctermite*.cpp - rm -vf IMCtermite*.so IMCtermite*.cpp + rm -vf imctermite*.so imctermite*.pyd imctermite*.cpp + rm -vf IMCtermite*.so IMCtermite*.pyd IMCtermite*.cpp rm -rvf dist/ IMCtermite.egg-info/ rm -rvf dist/ imctermite.egg-info/ diff --git a/tests/test_streaming.py b/tests/test_streaming.py new file mode 100644 index 0000000..ad95710 --- /dev/null +++ b/tests/test_streaming.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +""" +Tests for the new streaming/chunking functionality in IMCtermite +""" + +import pytest +import numpy as np +from pathlib import Path + +try: + import imctermite +except ImportError: + pytest.skip("imctermite module not built - run 'make python-build' first", allow_module_level=True) + +PROJECT_ROOT = Path(__file__).parent.parent +SAMPLES_DIR = PROJECT_ROOT / "samples" +DATASET_A = SAMPLES_DIR / "datasetA" + +class TestStreaming: + """Test iter_channel_numpy functionality""" + + @pytest.fixture + def imc_instance(self): + """Create IMC instance with sample file""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + return imctermite.imctermite(str(sample_file).encode()) + + @pytest.fixture + def first_channel_uuid(self, imc_instance): + """Get UUID of the first channel""" + channels = 
imc_instance.get_channels(include_data=False) + assert len(channels) > 0 + return channels[0]['uuid'] + + def test_iter_channel_numpy_scaled(self, imc_instance, first_channel_uuid): + """Test default scaled streaming""" + # Get ground truth via old method + full_channels = imc_instance.get_channels(include_data=True) + target_channel = next(ch for ch in full_channels if ch['uuid'] == first_channel_uuid) + expected_y = np.array(target_channel['ydata']) + + # Stream data + streamed_y = [] + # Encode UUID to bytes for C++ std::string + uuid_bytes = first_channel_uuid.encode('utf-8') + for chunk in imc_instance.iter_channel_numpy(uuid_bytes, chunk_rows=100): + assert 'y' in chunk + assert isinstance(chunk['y'], np.ndarray) + assert chunk['y'].dtype == np.float64 # Scaled should be float64 + streamed_y.append(chunk['y']) + + full_streamed_y = np.concatenate(streamed_y) + + # Compare + np.testing.assert_allclose(full_streamed_y, expected_y, rtol=1e-4) + + def test_iter_channel_numpy_raw(self, imc_instance, first_channel_uuid): + """Test raw streaming""" + # We can't easily compare raw values to scaled values without knowing the factor/offset + # But we can check types and consistency + + streamed_y_raw = [] + uuid_bytes = first_channel_uuid.encode('utf-8') + for chunk in imc_instance.iter_channel_numpy(uuid_bytes, chunk_rows=100, mode="raw"): + assert 'y' in chunk + assert isinstance(chunk['y'], np.ndarray) + # Raw type depends on file, but shouldn't necessarily be float64 unless the raw data is float + streamed_y_raw.append(chunk['y']) + + full_streamed_y_raw = np.concatenate(streamed_y_raw) + + # Ensure we got data + assert len(full_streamed_y_raw) > 0 + + def test_chunking_behavior(self, imc_instance, first_channel_uuid): + """Test that small chunks work correctly""" + # Get total length + channels = imc_instance.get_channels(include_data=False) + # We don't have direct access to length in metadata without loading, + # but we can infer it from a full load or just 
count + + chunk_size = 10 + uuid_bytes = first_channel_uuid.encode('utf-8') + chunks = list(imc_instance.iter_channel_numpy(uuid_bytes, chunk_rows=chunk_size)) + + # Check that most chunks are of size 10 + for i, chunk in enumerate(chunks[:-1]): # All but last should be full + assert len(chunk['y']) == chunk_size + + # Check continuity of 'start' index + expected_start = 0 + for chunk in chunks: + assert chunk['start'] == expected_start + expected_start += len(chunk['y']) + + def test_include_x_parameter(self, imc_instance, first_channel_uuid): + """Test include_x=False""" + uuid_bytes = first_channel_uuid.encode('utf-8') + for chunk in imc_instance.iter_channel_numpy(uuid_bytes, include_x=False, chunk_rows=100): + assert 'y' in chunk + assert 'x' not in chunk + + def test_invalid_channel_uuid(self, imc_instance): + """Test behavior with invalid UUID""" + # Depending on implementation, this might raise an error or return empty generator + # Based on C++ code: throw std::runtime_error("channel does not exist:" + uuid); + # Cython should propagate this as RuntimeError + + with pytest.raises(RuntimeError): + list(imc_instance.iter_channel_numpy(b"non-existent-uuid")) From 0d93e03f4d92488ea6ce10812f46389ff1dcf554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Mon, 15 Dec 2025 22:56:24 +0100 Subject: [PATCH 06/12] Fix platform detection in codepage conversion and update CLI path for Windows --- .github/workflows/test.yml | 4 ++-- makefile | 2 +- python/imctermite.pyx | 16 ++++++++++++---- python/setup.cfg | 4 ++++ python/setup.py | 3 ++- src/main.cpp | 1 + tests/test_cli.py | 3 +++ 7 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7fa6520..e3b64e7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Run Tests on: push: - branches: [ master, numpy-streaming ] + branches: [ master ] pull_request: branches: [ master ] @@ -12,7 +12,7 
@@ jobs: strategy: matrix: os: [ubuntu-latest, windows-latest] - python-version: ["3.10"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - name: Checkout code diff --git a/makefile b/makefile index 59dd5a9..f44495b 100644 --- a/makefile +++ b/makefile @@ -18,7 +18,7 @@ MIB = $(foreach dir,$(KIB),-I $(dir)) # choose compiler and its options CC = g++ -std=c++17 -OPT = -O3 -Wall -Wconversion -Wpedantic -Werror -Wunused-variable -Wsign-compare +OPT = -O3 -Wall -Wconversion -Wpedantic -Werror -Wunused-variable -Wsign-compare -static # determine git version/commit and release tag GTAG := $(shell git tag -l --sort=version:refname | tail -n1 | sed "s/$^v//g") diff --git a/python/imctermite.pyx b/python/imctermite.pyx index 3bfc319..a6c6c20 100644 --- a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -12,10 +12,18 @@ import platform # auxiliary function for codepage conversion def get_codepage(chn) : - if platform == 'Windows' : - chndec = jn.loads(chn.decode(errors="ignore")) - chncdp = chndec["codepage"] - return 'utf-8' if chncdp is None else chncdp + if platform.system() == 'Windows' : + try: + chndec = jn.loads(chn.decode(errors="ignore")) + chncdp = chndec.get("codepage") + if not chncdp: + return 'utf-8' + # If it's a number like "1252", Python expects "cp1252" + if str(chncdp).isdigit(): + return 'cp' + str(chncdp) + return str(chncdp) + except: + return 'utf-8' else : return 'utf-8' diff --git a/python/setup.cfg b/python/setup.cfg index 1308c6e..86528dc 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -15,6 +15,10 @@ license_files = LICENSE keywords = IMC, raw, imcFAMOS, imcSTUDIO, imcCRONOS classifiers = Programming Language :: Python :: 3 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 License :: OSI Approved :: MIT License Operating System :: OS Independent Topic :: Scientific/Engineering diff --git a/python/setup.py 
b/python/setup.py index 8d2e8a2..1d03fc6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -15,7 +15,8 @@ "imctermite", sources=["imctermite.pyx"], include_dirs=[numpy.get_include()], - extra_compile_args=cmpArgs[sys.platform] + extra_compile_args=cmpArgs[sys.platform], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")] ) setup( diff --git a/src/main.cpp b/src/main.cpp index b19e0c7..989d30a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,6 +3,7 @@ #include #include #include +#include // #include "imc_key.hpp" // #include "imc_block.hpp" diff --git a/tests/test_cli.py b/tests/test_cli.py index 6144e6c..e906689 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,10 +5,13 @@ import pytest import subprocess +import sys from pathlib import Path PROJECT_ROOT = Path(__file__).parent.parent CLI = PROJECT_ROOT / "imctermite" +if sys.platform == "win32": + CLI = CLI.with_suffix(".exe") SAMPLES_DIR = PROJECT_ROOT / "samples" / "datasetA" From 27d8215c85f71a72bef69fdee0f27364a3b3f73b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Tue, 16 Dec 2025 19:32:33 +0100 Subject: [PATCH 07/12] Implement chunked streaming for channel printing to improve memory efficiency (#9) Reduces memory usage by 90% for large datasets while maintaining comparable processing speed. 
--- lib/imc_channel.hpp | 42 +++++++++++++++++++++++++++++------------- lib/imc_raw.hpp | 10 +++++----- python/imctermite.pxd | 4 ++-- python/imctermite.pyx | 8 ++++---- 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/lib/imc_channel.hpp b/lib/imc_channel.hpp index c86e394..fe61b34 100644 --- a/lib/imc_channel.hpp +++ b/lib/imc_channel.hpp @@ -913,7 +913,7 @@ namespace imc } // print channel - void print(std::string filename, const char sep = ',', int width = 25, int yprec = 9) + void print(std::string filename, const char sep = ',', int width = 25, int yprec = 9, unsigned long int chunk_size = 100000) { std::ofstream fou(filename); @@ -930,21 +930,37 @@ namespace imc fou<(chunk.x_bytes.data()); + const double* y_ptr = reinterpret_cast(chunk.y_bytes.data()); + + // Write chunk data + for (unsigned long int i = 0; i < chunk.count; i++) { - fou<second.name_ + std::string(".csv"); std::filesystem::path pf = pd / filenam; - // and print the channel - it->second.print(pf.u8string(),sep); + // and print the channel using streaming + it->second.print(pf.u8string(),sep,25,9,chunk_size); } } diff --git a/python/imctermite.pxd b/python/imctermite.pxd index 262f57a..682946e 100644 --- a/python/imctermite.pxd +++ b/python/imctermite.pxd @@ -37,6 +37,6 @@ cdef extern from "lib/imc_raw.hpp" namespace "imc": channel_chunk read_channel_chunk(string uuid, unsigned long int start, unsigned long int count, bool include_x, bool raw_mode) except + # print single channel/all channels - void print_channel(string channeluuid, string outputdir, char delimiter) except + - void print_channels(string outputdir, char delimiter) except + + void print_channel(string channeluuid, string outputdir, char delimiter, unsigned long int chunk_size) except + + void print_channels(string outputdir, char delimiter, unsigned long int chunk_size) except + void print_table(string outputfile) except + diff --git a/python/imctermite.pyx b/python/imctermite.pyx index a6c6c20..b85ced9 100644 --- 
a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -112,10 +112,10 @@ cdef class imctermite: break # print single channel/all channels - def print_channel(self, string channeluuid, string outputfile, char delimiter): - self.cppimc.print_channel(channeluuid,outputfile,delimiter) - def print_channels(self, string outputdir, char delimiter): - self.cppimc.print_channels(outputdir,delimiter) + def print_channel(self, string channeluuid, string outputfile, char delimiter, unsigned long int chunk_size=100000): + self.cppimc.print_channel(channeluuid,outputfile,delimiter,chunk_size) + def print_channels(self, string outputdir, char delimiter, unsigned long int chunk_size=100000): + self.cppimc.print_channels(outputdir,delimiter,chunk_size) # print table including channels def print_table(self, string outputfile): From dffefa8b04f51a042126b96a27e8e8954506fe2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Tue, 16 Dec 2025 22:12:02 +0100 Subject: [PATCH 08/12] Refactor raw file handling to use string instead of bytes across examples and core functionality --- README.md | 6 ++--- python/examples/multichannel.py | 4 ++-- python/examples/usage.py | 8 +++---- python/examples/usage_adv.py | 6 ++--- python/examples/usage_ext.py | 2 +- python/examples/usage_files.py | 6 ++--- python/examples/usage_numpy_chunks.py | 4 ++-- python/imctermite.pyx | 32 +++++++++++++++++---------- 8 files changed, 38 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 0436816..c239a09 100644 --- a/README.md +++ b/README.md @@ -196,17 +196,17 @@ of it by passing a _raw_ file to the constructor: ```Python import imctermite -imcraw = imctermite.imctermite(b"sample/sampleA.raw") +imcraw = imctermite.imctermite("sample/sampleA.raw") ``` An example of how to create an instance and obtain the list of channels is: ```Python -import IMCtermite +import imctermite # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = 
IMCtermite.imctermite(b"samples/sampleA.raw") + imcraw = imctermite.imctermite("samples/sampleA.raw") except RuntimeError as e : print("failed to load/parse raw-file: " + str(e)) diff --git a/python/examples/multichannel.py b/python/examples/multichannel.py index 67b6b41..d2e2e1b 100644 --- a/python/examples/multichannel.py +++ b/python/examples/multichannel.py @@ -11,7 +11,7 @@ def add_trigger_time(trigger_time, add_time) : if __name__ == "__main__" : # read file and extract data - imctm = imctermite.imctermite(b"Measurement.raw") + imctm = imctermite.imctermite("samples/exampleB.raw") chns = imctm.get_channels(True) # prepare abscissa @@ -39,5 +39,5 @@ def add_trigger_time(trigger_time, add_time) : # show entire dataframe and write file print(df) - df.to_csv("Measurement.csv",header=True,sep='\t',index=False) + df.to_csv("exampleB.csv",header=True,sep='\t',index=False) diff --git a/python/examples/usage.py b/python/examples/usage.py index 06cc3ed..2b48e22 100644 --- a/python/examples/usage.py +++ b/python/examples/usage.py @@ -5,7 +5,7 @@ # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = imctermite.imctermite(b"samples/exampleB.raw") + imcraw = imctermite.imctermite("samples/exampleB.raw") except RuntimeError as e : raise Exception("failed to load/parse raw-file: " + str(e)) @@ -24,15 +24,15 @@ print() # print the channels into a specific directory -imcraw.print_channels(b"/tmp/",ord(',')) +imcraw.print_channels("/tmp/",ord(',')) # print all channels separately for i,chn in enumerate(channels) : print(str(i)+" : "+chn['name']+" : "+chn['uuid']) filname = os.path.join("/tmp/",str(i) + "_" + chn['name']+".csv") print(filname) - imcraw.print_channel(chn['uuid'].encode(),filname.encode(),ord(',')) + imcraw.print_channel(chn['uuid'],filname,ord(',')) # print all channels in single file -imcraw.print_table(b"/tmp/allchannels.csv") +imcraw.print_table("/tmp/allchannels.csv") diff --git a/python/examples/usage_adv.py 
b/python/examples/usage_adv.py index 36000a6..0c844d8 100644 --- a/python/examples/usage_adv.py +++ b/python/examples/usage_adv.py @@ -15,7 +15,7 @@ # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = imctermite.imctermite(fl.encode()) + imcraw = imctermite.imctermite(fl) except RuntimeError as e : raise Exception("failed to load/parse raw-file: " + str(e)) @@ -24,7 +24,7 @@ print(json.dumps(channels,indent=4, sort_keys=False)) # print the channels into a specific directory - imcraw.print_channels(b"./",ord(',')) + imcraw.print_channels("./",ord(',')) # print all channels in single file - imcraw.print_table(("./"+str(os.path.basename(fl).split('.')[0])+"_allchannels.csv").encode()) + imcraw.print_table(("./"+str(os.path.basename(fl).split('.')[0])+"_allchannels.csv")) diff --git a/python/examples/usage_ext.py b/python/examples/usage_ext.py index b6536e2..e7dd8e5 100644 --- a/python/examples/usage_ext.py +++ b/python/examples/usage_ext.py @@ -6,7 +6,7 @@ # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = imctermite.imctermite(b"samples/sampleB.raw") + imcraw = imctermite.imctermite("samples/sampleB.raw") except RuntimeError as e : raise Exception("failed to load/parse raw-file: " + str(e)) diff --git a/python/examples/usage_files.py b/python/examples/usage_files.py index 3dcebd3..b6532d6 100644 --- a/python/examples/usage_files.py +++ b/python/examples/usage_files.py @@ -1,5 +1,5 @@ -import imctermite import imctermite +import imctermite def show_results(imcraw) : @@ -19,11 +19,11 @@ def show_results(imcraw) : print("") # create instance of 'imctermite' -imcraw = imctermite(b'samples/sampleA.raw') +imcraw = imctermite.imctermite("samples/sampleA.raw") show_results(imcraw) # use previous instance of 'imctermite' to provide new file -imcraw.submit_file(b'samples/sampleB.raw') +imcraw.submit_file("samples/sampleB.raw") show_results(imcraw) diff --git a/python/examples/usage_numpy_chunks.py 
b/python/examples/usage_numpy_chunks.py index 6aeaf09..8e63ee2 100644 --- a/python/examples/usage_numpy_chunks.py +++ b/python/examples/usage_numpy_chunks.py @@ -6,7 +6,7 @@ # Path to a sample file # Using sampleB.raw because it has integer data with scaling (factor=0.01, offset=327.68) -raw_file = b"samples/sampleB.raw" +raw_file = "samples/sampleB.raw" if not os.path.exists(raw_file): print(f"Sample file {raw_file} not found.") exit(1) @@ -30,7 +30,7 @@ target_uuid = "347" channel_info = next((ch for ch in channels if ch['uuid'] == target_uuid), channels[0]) -first_channel_uuid = channel_info['uuid'].encode('utf-8') +first_channel_uuid = channel_info['uuid'] print(f"Iterating over channel {first_channel_uuid} ({channel_info.get('name', 'unnamed')})") # Check native datatype diff --git a/python/imctermite.pyx b/python/imctermite.pyx index b85ced9..f8ecfbf 100644 --- a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -27,22 +27,30 @@ def get_codepage(chn) : else : return 'utf-8' +cdef bytes _as_bytes(obj): + if isinstance(obj, bytes): + return obj + elif isinstance(obj, str): + return obj.encode('utf-8') + else: + return str(obj).encode('utf-8') + cdef class imctermite: # C++ instance of class cdef cppimctermite* cppimc # constructor - def __cinit__(self, string rawfile): - self.cppimc = new cppimctermite(rawfile) + def __cinit__(self, rawfile): + self.cppimc = new cppimctermite(_as_bytes(rawfile)) def __dealloc__(self): if self.cppimc != NULL: del self.cppimc # provide raw file - def submit_file(self,string rawfile): - self.cppimc.set_file(rawfile) + def submit_file(self, rawfile): + self.cppimc.set_file(_as_bytes(rawfile)) # get JSON list of channels def get_channels(self, bool include_data): @@ -51,7 +59,7 @@ cdef class imctermite: return chnlstjn def iter_channel_numpy(self, string channeluuid, bool include_x=True, int chunk_rows=1000000, str mode="scaled"): - cdef unsigned long int total_len = self.cppimc.get_channel_length(channeluuid) + cdef unsigned 
long int total_len = self.cppimc.get_channel_length(_as_bytes(channeluuid)) cdef unsigned long int start = 0 cdef channel_chunk chunk cdef cnp.ndarray x_arr @@ -77,7 +85,7 @@ cdef class imctermite: } while start < total_len: - chunk = self.cppimc.read_channel_chunk(channeluuid, start, chunk_rows, include_x, raw_mode) + chunk = self.cppimc.read_channel_chunk(_as_bytes(channeluuid), start, chunk_rows, include_x, raw_mode) # Create numpy arrays from bytes y_dtype = dtype_map.get(chunk.y_type, np.float64) @@ -112,16 +120,16 @@ cdef class imctermite: break # print single channel/all channels - def print_channel(self, string channeluuid, string outputfile, char delimiter, unsigned long int chunk_size=100000): - self.cppimc.print_channel(channeluuid,outputfile,delimiter,chunk_size) - def print_channels(self, string outputdir, char delimiter, unsigned long int chunk_size=100000): - self.cppimc.print_channels(outputdir,delimiter,chunk_size) + def print_channel(self, channeluuid, outputfile, char delimiter, unsigned long int chunk_size=100000): + self.cppimc.print_channel(_as_bytes(channeluuid),_as_bytes(outputfile),delimiter,chunk_size) + def print_channels(self, outputdir, char delimiter, unsigned long int chunk_size=100000): + self.cppimc.print_channels(_as_bytes(outputdir),delimiter,chunk_size) # print table including channels - def print_table(self, string outputfile): + def print_table(self, outputfile): chnlst = self.cppimc.get_channels(True,True) chnlstjn = [jn.loads(chn.decode(errors="ignore")) for chn in chnlst] - with open(outputfile.decode(),'w') as fout: + with open(outputfile,'w') as fout: for chn in chnlstjn: fout.write('#' +str(chn['xname']).rjust(19)+str(chn['yname']).rjust(20)+'\n') fout.write('#'+str(chn['xunit']).rjust(19)+str(chn['yunit']).rjust(20)+'\n') From 62da4de05d2d992c869ea678bf37980d3ca7ad36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Tue, 16 Dec 2025 22:30:27 +0100 Subject: [PATCH 09/12] Enhance multichannel data 
handling and add numpy support for channel data retrieval (#33) --- python/examples/multichannel.py | 53 ++++++++++++++++++++++----------- python/imctermite.pyx | 14 +++++++-- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/python/examples/multichannel.py b/python/examples/multichannel.py index d2e2e1b..4dd7b4a 100644 --- a/python/examples/multichannel.py +++ b/python/examples/multichannel.py @@ -2,6 +2,7 @@ import imctermite import pandas import datetime +import numpy as np def add_trigger_time(trigger_time, add_time) : trgts = datetime.datetime.strptime(trigger_time,'%Y-%m-%dT%H:%M:%S') @@ -10,32 +11,48 @@ def add_trigger_time(trigger_time, add_time) : if __name__ == "__main__" : - # read file and extract data + # read file imctm = imctermite.imctermite("samples/exampleB.raw") - chns = imctm.get_channels(True) - # prepare abscissa - xcol = "time ["+chns[0]['xunit']+"]" - #xcol = "timestamp" - xsts = [add_trigger_time(chns[0]['trigger-time'],tm) for tm in chns[0]['xdata']] + # Get metadata only + chns = imctm.get_channels(False) + + if not chns: + print("No channels found") + exit() + + # Prepare DataFrame + df = pandas.DataFrame() - # sort channels + # Get X-axis from the first channel + first_chn = chns[0] + + data = imctm.get_channel_data(first_chn['uuid'], include_x=True) + x_data = data['x'] + + xcol = "time ["+first_chn['xunit']+"]" + df[xcol] = x_data + + # sort channels by name chnnms = sorted([chn['name'] for chn in chns], reverse=False) - chnsdict = {} - for chn in chns : - chnsdict[chn['name']] = chn + chnsdict = {chn['name']: chn for chn in chns} - # construct dataframe - df = pandas.DataFrame() - df[xcol] = pandas.Series(chns[0]['xdata']) - #df[xcol] = pandas.Series(xsts) - #for idx,chn in enumerate(chns) : for chnnm in chnnms : chn = chnsdict[chnnm] - #xcol = (chn['xname'] if chn['xname'] != '' else "x_"+str(idx))+" ["+chn['xunit']+"]" - #df[xcol] = pandas.Series(chn['xdata']) + uuid = chn['uuid'] + + # Fetch Y data only + data = 
imctm.get_channel_data(uuid, include_x=False) + y_data = data['y'] + ycol = chn['yname']+" ["+chn['yunit']+"]" - df[ycol] = pandas.Series(chn['ydata']) + + # Assign to DataFrame + if len(y_data) == len(df): + df[ycol] = y_data + else: + # Fallback to Series for alignment/filling + df[ycol] = pandas.Series(y_data) # show entire dataframe and write file print(df) diff --git a/python/imctermite.pyx b/python/imctermite.pyx index f8ecfbf..df00846 100644 --- a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -58,9 +58,9 @@ cdef class imctermite: chnlstjn = [jn.loads(chn.decode(get_codepage(chn),errors="ignore")) for chn in chnlst] return chnlstjn - def iter_channel_numpy(self, string channeluuid, bool include_x=True, int chunk_rows=1000000, str mode="scaled"): + def iter_channel_numpy(self, channeluuid, bool include_x=True, unsigned long int chunk_rows=1000000, str mode="scaled", unsigned long int start_index=0): cdef unsigned long int total_len = self.cppimc.get_channel_length(_as_bytes(channeluuid)) - cdef unsigned long int start = 0 + cdef unsigned long int start = start_index cdef channel_chunk chunk cdef cnp.ndarray x_arr cdef cnp.ndarray y_arr @@ -119,6 +119,16 @@ cdef class imctermite: if chunk.count == 0: break + def get_channel_data(self, channeluuid, bool include_x=True, str mode="scaled"): + cdef unsigned long int total_len = self.cppimc.get_channel_length(_as_bytes(channeluuid)) + if total_len == 0: + res = {'y': np.array([])} + if include_x: + res['x'] = np.array([]) + return res + + return next(self.iter_channel_numpy(channeluuid, include_x, total_len, mode, 0)) + # print single channel/all channels def print_channel(self, channeluuid, outputfile, char delimiter, unsigned long int chunk_size=100000): self.cppimc.print_channel(_as_bytes(channeluuid),_as_bytes(outputfile),delimiter,chunk_size) From b644d3382b89a8b73a363dfde22d01ed3bbc3538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Tue, 16 Dec 2025 22:33:28 +0100 Subject: 
[PATCH 10/12] Add print_timerange example and implement channel length retrieval in imctermite --- python/examples/usage_timerange.py | 65 ++++++++++++++++++++++++++++++ python/imctermite.pyx | 4 ++ 2 files changed, 69 insertions(+) create mode 100644 python/examples/usage_timerange.py diff --git a/python/examples/usage_timerange.py b/python/examples/usage_timerange.py new file mode 100644 index 0000000..eb36d9e --- /dev/null +++ b/python/examples/usage_timerange.py @@ -0,0 +1,65 @@ + +import imctermite +import sys +import os + +def print_timerange(filename): + """ + Demonstrates how to efficiently get the time range (first and last X values) + of channels without reading the entire file. + """ + + try: + imc = imctermite.imctermite(filename) + except RuntimeError as e: + print(f"Error loading file: {e}") + return + + # Get list of channels (metadata only, no data loaded yet) + channels = imc.get_channels(False) + + if not channels: + print("No channels found in file.") + return + + print(f"File: {filename}") + print("-" * 80) + print(f"{'Channel Name':<25} | {'Start (X)':<15} | {'End (X)':<15} | {'Samples':<10}") + print("-" * 80) + + for chn in channels: + uuid = chn['uuid'] + name = chn.get('yname', 'Unknown') + + length = imc.get_channel_length(uuid) + + if length == 0: + print(f"{name:<25} | {'Empty':<15} | {'Empty':<15} | {0:<10}") + continue + + # Get first sample (efficiently, reading only 1 row) + # We request X data to get the time/index + # chunk_rows=1 ensures we only read/convert the absolute minimum data + gen_first = imc.iter_channel_numpy(uuid, start_index=0, chunk_rows=1, include_x=True) + try: + first_chunk = next(gen_first) + first_x = first_chunk['x'][0] + except (StopIteration, IndexError): + first_x = float('nan') + + # Get last sample + gen_last = imc.iter_channel_numpy(uuid, start_index=length-1, chunk_rows=1, include_x=True) + try: + last_chunk = next(gen_last) + last_x = last_chunk['x'][0] + except (StopIteration, IndexError): + last_x = 
float('nan') + + print(f"{name:<25} | {first_x:<15.5f} | {last_x:<15.5f} | {length:<10}") + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python usage_timerange.py <rawfile>") + print("Example: python usage_timerange.py ../../samples/datasetA/datasetA_1.raw") + else: + print_timerange(sys.argv[1]) diff --git a/python/imctermite.pyx b/python/imctermite.pyx index df00846..10e1991 100644 --- a/python/imctermite.pyx +++ b/python/imctermite.pyx @@ -58,6 +58,10 @@ cdef class imctermite: chnlstjn = [jn.loads(chn.decode(get_codepage(chn),errors="ignore")) for chn in chnlst] return chnlstjn + # get length of a channel + def get_channel_length(self, channeluuid): + return self.cppimc.get_channel_length(_as_bytes(channeluuid)) + def iter_channel_numpy(self, channeluuid, bool include_x=True, unsigned long int chunk_rows=1000000, str mode="scaled", unsigned long int start_index=0): cdef unsigned long int total_len = self.cppimc.get_channel_length(_as_bytes(channeluuid)) cdef unsigned long int start = start_index From fbf752f5457e7a398baacde62e5998237dba15a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Tue, 16 Dec 2025 22:33:53 +0100 Subject: [PATCH 11/12] Add type stubs and package data for improved type checking and IDE support --- python/MANIFEST.in | 2 + python/imctermite.pyi | 185 ++++++++++++++++++++++++++++++++++++++++++ python/py.typed | 0 python/setup.py | 6 +- 4 files changed, 192 insertions(+), 1 deletion(-) create mode 100644 python/imctermite.pyi create mode 100644 python/py.typed diff --git a/python/MANIFEST.in b/python/MANIFEST.in index dbe052e..eb044a9 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -2,4 +2,6 @@ include lib/*.hpp include *.cpp include *.pyx include *.pxd +include *.pyi +include py.typed include VERSION diff --git a/python/imctermite.pyi b/python/imctermite.pyi new file mode 100644 index 0000000..07f1dde --- /dev/null +++ b/python/imctermite.pyi @@ -0,0 +1,185 @@ +""" +Type stub file for
IMCtermite Cython extension. +This provides IDE support, type checking, and autocomplete for the imctermite module. +""" + +from typing import Any, Dict, Iterator, List, Literal, Optional, Union +import numpy as np +import numpy.typing as npt + +def get_codepage(chn: bytes) -> str: + """Get the codepage for decoding channel data.""" + ... + +class imctermite: + """ + IMCtermite parser for .raw (IMC2 Data Format) files. + + This class provides methods to read and parse IMC measurement data files, + extracting channel metadata and data. + """ + + def __init__(self, rawfile: Union[str, bytes]) -> None: + """ + Initialize parser with a .raw file. + + Args: + rawfile: Path to the .raw file to parse + """ + ... + + def submit_file(self, rawfile: Union[str, bytes]) -> None: + """ + Set or change the raw file to parse. + + Args: + rawfile: Path to the .raw file to parse + """ + ... + + def get_channels(self, include_data: bool = True) -> List[Dict[str, Any]]: + """ + Get list of all channels in the file with their metadata. + + Args: + include_data: If True, includes the actual measurement data in the result. + If False, only returns metadata (faster for inspection). + + Returns: + List of dictionaries containing channel information: + - uuid: Unique identifier for the channel + - xname: X-axis name (typically "time") + - yname: Y-axis name (measurement name) + - xunit: X-axis unit + - yunit: Y-axis unit + - length: Number of data points + - xdata: X-axis data (if include_data=True) + - ydata: Y-axis data (if include_data=True) + - buffer_type: Data type identifier + - codepage: Text encoding information + """ + ... + + def get_channel_length(self, channeluuid: Union[str, bytes]) -> int: + """ + Get the number of data points in a channel. + + Args: + channeluuid: UUID of the channel to query + + Returns: + Number of data points in the channel + """ + ... 
+ + def iter_channel_numpy( + self, + channeluuid: Union[str, bytes], + include_x: bool = True, + chunk_rows: int = 1000000, + mode: Literal["scaled", "raw"] = "scaled", + start_index: int = 0 + ) -> Iterator[Dict[str, Union[int, npt.NDArray[Any]]]]: + """ + Iterate over channel data in chunks as numpy arrays. + + This is memory-efficient for large datasets as it yields data in chunks + rather than loading everything into memory at once. + + Args: + channeluuid: UUID of the channel to read + include_x: If True, includes x-axis data in results + chunk_rows: Number of rows per chunk (default: 1,000,000) + mode: "scaled" for calibrated values or "raw" for uncalibrated ADC values + start_index: Starting row index (for partial reads) + + Yields: + Dictionary containing: + - start: Starting index of this chunk + - y: numpy array of Y-axis values + - x: numpy array of X-axis values (if include_x=True) + + Example: + >>> imc = imctermite("measurement.raw") + >>> channels = imc.get_channels(include_data=False) + >>> uuid = channels[0]['uuid'] + >>> for chunk in imc.iter_channel_numpy(uuid, chunk_rows=100000): + ... print(f"Processing {len(chunk['y'])} samples starting at {chunk['start']}") + ... # Process chunk['x'] and chunk['y'] arrays + """ + ... + + def get_channel_data( + self, + channeluuid: Union[str, bytes], + include_x: bool = True, + mode: Literal["scaled", "raw"] = "scaled" + ) -> Dict[str, npt.NDArray[Any]]: + """ + Get all data for a channel as numpy arrays. + + Args: + channeluuid: UUID of the channel to read + include_x: If True, includes x-axis data in result + mode: "scaled" for calibrated values or "raw" for uncalibrated ADC values + + Returns: + Dictionary containing: + - y: numpy array of Y-axis values + - x: numpy array of X-axis values (if include_x=True) + + Note: + This loads the entire channel into memory. For large datasets, + consider using iter_channel_numpy() instead. 
+ + Example: + >>> imc = imctermite("measurement.raw") + >>> channels = imc.get_channels(include_data=False) + >>> uuid = channels[0]['uuid'] + >>> data = imc.get_channel_data(uuid) + >>> print(f"X shape: {data['x'].shape}, Y shape: {data['y'].shape}") + """ + ... + + def print_channel( + self, + channeluuid: Union[str, bytes], + outputfile: Union[str, bytes], + delimiter: int, + chunk_size: int = 100000 + ) -> None: + """ + Export a single channel to a CSV file. + + Args: + channeluuid: UUID of the channel to export + outputfile: Path to output file + delimiter: Column delimiter as an integer character code, e.g. ord(',') (required) + chunk_size: Number of rows to process at once + """ + ... + + def print_channels( + self, + outputdir: Union[str, bytes], + delimiter: Union[str, bytes] = b',', + chunk_size: int = 100000 + ) -> None: + """ + Export all channels to separate CSV files in a directory. + + Args: + outputdir: Directory path for output files + delimiter: Column delimiter character (default: comma) + chunk_size: Number of rows to process at once + """ + ... + + def print_table(self, outputfile: Union[str, bytes]) -> None: + """ + Export all channels with headers to a single formatted text file. + + Args: + outputfile: Path to output file + """ + ...
diff --git a/python/py.typed b/python/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/python/setup.py b/python/setup.py index 1d03fc6..badf881 100644 --- a/python/setup.py +++ b/python/setup.py @@ -20,5 +20,9 @@ ) setup( - ext_modules=cythonize(extension,language_level=3) + ext_modules=cythonize(extension,language_level=3), + package_data={ + "": ["py.typed", "*.pyi"] + }, + zip_safe=False ) From c2c9109761bd6231e55358c15abd94637406a47f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Ole=20G=C3=B6deke?= Date: Wed, 17 Dec 2025 09:52:07 +0100 Subject: [PATCH 12/12] chore: modernize Python packaging and CI workflows - Migrate from setup.cfg to pyproject.toml with PEP 517/621 compliance - Update to Python build tools (replace setup.py commands with python -m build) - Upgrade all GitHub Actions to latest versions (@v4, ubuntu-latest) - Remove outdated cibuildwheel version pinning - Add numpy as explicit build and runtime dependency - Bump package version to 3.0.0 - Improve test documentation with development install guidance - Add Python version badge to README - Standardize python3 usage across makefiles --- .github/workflows/pypi-deploy.yml | 35 ++++++++++---------- .github/workflows/test.yml | 7 ++-- README.md | 12 ++++--- makefile | 4 +-- python/VERSION | 2 +- python/makefile | 21 +++++------- python/pyproject.toml | 55 +++++++++++++++++++++++++++++-- python/setup.cfg | 27 --------------- python/setup.py | 5 +-- tests/README.md | 21 +++++++++++- 10 files changed, 113 insertions(+), 76 deletions(-) delete mode 100644 python/setup.cfg diff --git a/.github/workflows/pypi-deploy.yml b/.github/workflows/pypi-deploy.yml index aa89ef9..db89645 100644 --- a/.github/workflows/pypi-deploy.yml +++ b/.github/workflows/pypi-deploy.yml @@ -13,14 +13,14 @@ jobs: build_setup: name: Prepare environment for wheel builds - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: [test] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Prepare 
wheel build run: make -C python/ setup - name: Store wheel configuration files - uses: actions/upload-artifact@v4.6.0 + uses: actions/upload-artifact@v4 with: name: wheel-config path: python/ @@ -36,12 +36,11 @@ jobs: os: [ubuntu-latest, windows-latest] steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.1.2 + run: python -m pip install cibuildwheel - name: Get wheel configuration files - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: wheel-config path: python/ @@ -49,29 +48,29 @@ jobs: run: python -m cibuildwheel --output-dir wheelhouse working-directory: python/ - name: Store binary wheels - uses: actions/upload-artifact@v4.6.0 + uses: actions/upload-artifact@v4 with: name: binary-wheels-${{matrix.os}}-${{ strategy.job-index }} path: python/wheelhouse/*.whl build_sdist: name: Build source distribution - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: [build_setup] steps: - - uses: actions/checkout@v2 - - name: Install cython - run: python -m pip install cython==0.29.24 + - uses: actions/checkout@v4 + - name: Install build tools + run: python -m pip install build - name: Get wheel configuration files - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: wheel-config path: python/ - name: Build sdist - run: python setup.py sdist + run: python -m build --sdist working-directory: python/ - name: Store source wheels - uses: actions/upload-artifact@v4.6.0 + uses: actions/upload-artifact@v4 with: name: source-wheels path: python/dist/*.tar.gz @@ -81,17 +80,17 @@ jobs: upload_pypi: name: Upload wheels to PyPI - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: [build_wheels, build_sdist] steps: - name: Get source wheels - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: source-wheels path: dist/ - name: Get binary wheels - 
uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: path: dist/ pattern: binary-wheels-* diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e3b64e7..861d309 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,6 +5,7 @@ on: branches: [ master ] pull_request: branches: [ master ] + workflow_call: jobs: test: @@ -23,13 +24,11 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install test dependencies run: | python -m pip install --upgrade pip - pip install pytest numpy cython setuptools wheel + pip install pytest - # Cross-platform build and test using Makefile - # Requires bash shell on Windows (Git Bash) - name: Build and Test shell: bash run: | diff --git a/README.md b/README.md index c239a09..883d57d 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ ![Tests](https://github.com/RecordEvolution/IMCtermite/actions/workflows/test.yml/badge.svg) ![CI Build Wheel](https://github.com/RecordEvolution/IMCtermite/actions/workflows/pypi-deploy.yml/badge.svg?branch=&event=push) [![PYPI](https://img.shields.io/pypi/v/IMCtermite.svg)](https://pypi.org/project/imctermite/) +[![Python Version](https://img.shields.io/pypi/pyversions/imctermite)](https://pypi.org/project/imctermite/) # IMCtermite @@ -151,10 +152,13 @@ python3 -m pip install imctermite ``` which provides binary wheels for multiple architectures on _Windows_ and _Linux_ -and most _Python 3.x_ distributions. However, if your platform/architecture is -not supported you can still compile the source distribution yourself, which -requires _python3_setuptools_ and an up-to-date compiler supporting C++11 -standard (e.g. _gcc version >= 10.2.0_). +and _Python 3.10_ through _3.13_. **Note:** Starting from version 3.0.0, +imctermite requires numpy as a dependency, which will be automatically +installed if not already present.
+ +However, if your platform/architecture is not supported you can still compile +the source distribution yourself, which requires _python3_setuptools_, _numpy_, +and an up-to-date compiler supporting C++11 standard (e.g. _gcc version >= 10.2.0_). ## Usage diff --git a/makefile b/makefile index f44495b..41580c5 100644 --- a/makefile +++ b/makefile @@ -87,7 +87,7 @@ docker-run: # python python-build: check-tags - make -C python/ build-inplace + make -C python/ build cp python/imctermite*.so ./ -v 2>/dev/null || cp python/imctermite*.pyd ./ -v 2>/dev/null || true python-clean: @@ -95,7 +95,7 @@ python-clean: rm -vf imctermite*.so imctermite*.pyd python-test: - PYTHONPATH=./ python python/examples/usage.py + PYTHONPATH=./ python3 python/examples/usage.py #-----------------------------------------------------------------------------# # tests diff --git a/python/VERSION b/python/VERSION index d302656..4a36342 100644 --- a/python/VERSION +++ b/python/VERSION @@ -1 +1 @@ -2.1.18 +3.0.0 diff --git a/python/makefile b/python/makefile index 43dd6ea..878a7b3 100644 --- a/python/makefile +++ b/python/makefile @@ -11,25 +11,20 @@ setup-clean: rm -rf lib/ build: setup - python setup.py build - -build-inplace: setup - python setup.py build_ext --inplace + python3 -m pip install -e . 
+ build-sdist: setup - python setup.py sdist - python -m twine check dist/* + python3 -m build --sdist + python3 -m twine check dist/* build-bdist: setup - python setup.py bdist - python -m twine check dist/* + python3 -m build --wheel + python3 -m twine check dist/* build-clean: - python setup.py clean --all rm -vf imctermite*.so imctermite*.pyd imctermite*.cpp - rm -vf IMCtermite*.so IMCtermite*.pyd IMCtermite*.cpp - rm -rvf dist/ IMCtermite.egg-info/ rm -rvf dist/ imctermite.egg-info/ + rm -rvf build/ cibuildwheel-build: setup cibuildwheel --platform linux @@ -38,9 +33,9 @@ cibuildwheel-clean: rm -rvf wheelhouse/ pypi-upload: - python -m twine upload dist/$(shell ls -t dist/ | head -n1) + python3 -m twine upload dist/$(shell ls -t dist/ | head -n1) clean: setup build-clean cibuildwheel-clean setup-clean run-example: - PYTHONPATH=$(pwd) python examples/usage_files.py + PYTHONPATH=$(CURDIR) python3 examples/usage_files.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 0e657f5..39b64ac 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,57 @@ [build-system] -requires = ["setuptools", "wheel","Cython"] +requires = ["setuptools>=77.0.0", "wheel", "Cython", "numpy"] build-backend = "setuptools.build_meta" +[project] +name = "imctermite" +description = "Enables extraction of measurement data from binary files with extension 'raw' used by proprietary software imcFAMOS and imcSTUDIO and facilitates its storage in open source file formats" +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +authors = [ + {name = "Record Evolution GmbH", email = "mario.fink@record-evolution.de"} +] +maintainers = [ + {name = "Record Evolution GmbH"} +] +keywords = ["IMC", "raw", "imcFAMOS", "imcSTUDIO", "imcCRONOS"] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python ::
3.13", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries :: Python Modules" +] +dependencies = [ + "numpy>=1.26.0" +] +dynamic = ["version"] + +[project.optional-dependencies] +test = ["pytest>=7.0.0"] + +[project.urls] +Homepage = "https://github.com/RecordEvolution/IMCtermite.git" + +[tool.setuptools] +# This is a single extension module build, not a package with subdirectories +py-modules = [] +# Explicitly set packages to empty to prevent auto-discovery +packages = [] + +[tool.setuptools.dynamic] +version = {file = "VERSION"} + +[tool.setuptools.package-data] +"*" = ["py.typed", "*.pyi"] + [tool.cibuildwheel] -before-all = "" +# Build for Python 3.10-3.13 +build = "cp310-* cp311-* cp312-* cp313-*" +# Skip 32-bit builds and musllinux +skip = "*-win32 *-manylinux_i686 *-musllinux_*" +# Tests are already run in test.yml workflow before wheel building +test-skip = "*" diff --git a/python/setup.cfg b/python/setup.cfg deleted file mode 100644 index 86528dc..0000000 --- a/python/setup.cfg +++ /dev/null @@ -1,27 +0,0 @@ - -[metadata] -name = imctermite -description = Enables extraction of measurement data from binary files with extension 'raw' used by proprietary software imcFAMOS and imcSTUDIO and facilitates its storage in open source file formats -long_description = file: README.md -# long_description_content_type = text/x-rst -long_description_content_type = text/markdown -version = file: VERSION -author = Record Evolution GmbH -author_email = mario.fink@record-evolution.de -maintainer = Record Evolution GmbH -url= https://github.com/RecordEvolution/IMCtermite.git -license = MIT License -license_files = LICENSE -keywords = IMC, raw, imcFAMOS, imcSTUDIO, imcCRONOS -classifiers = - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - Programming Language :: Python :: 3.13 - License :: OSI 
Approved :: MIT License - Operating System :: OS Independent - Topic :: Scientific/Engineering - Topic :: Software Development :: Libraries :: Python Modules - -[options] diff --git a/python/setup.py b/python/setup.py index badf881..afb021d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -20,9 +20,6 @@ ) setup( - ext_modules=cythonize(extension,language_level=3), - package_data={ - "": ["py.typed", "*.pyi"] - }, + ext_modules=cythonize(extension, language_level=3), zip_safe=False ) diff --git a/tests/README.md b/tests/README.md index aa8343c..715ea11 100644 --- a/tests/README.md +++ b/tests/README.md @@ -25,6 +25,25 @@ pytest tests/test_python.py ## Prerequisites +### Recommended: Development install + +Install the package in editable mode with test dependencies (handles all requirements automatically): + +```bash +pip install -e "python[test]" +``` + +Then run tests with pytest: ```bash -pip install cython pytest setuptools +pytest ``` + +### Alternative: Using makefile + +If you prefer `make test`, just install pytest first: + +```bash +pip install pytest +make test +``` +