diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..997485c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ +# Build artifacts +*.o +*.so +*.pyc +imctermite +main.cpp.cpp + +# Python build +python/build/ +python/dist/ +python/*.so +python/*.cpp +python/lib/ +python/LICENSE +python/README.md +python/*.egg-info/ +__pycache__/ + +# Git and editor +.git/ +.venv/ +*.swp +*.swo +*~ + +# Test outputs +.pytest_cache/ diff --git a/.github/workflows/pypi-deploy.yml b/.github/workflows/pypi-deploy.yml index aa02101..db89645 100644 --- a/.github/workflows/pypi-deploy.yml +++ b/.github/workflows/pypi-deploy.yml @@ -8,15 +8,19 @@ on: jobs: + test: + uses: ./.github/workflows/test.yml + build_setup: name: Prepare environment for wheel builds - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest + needs: [test] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Prepare wheel build run: make -C python/ setup - name: Store wheel configuration files - uses: actions/upload-artifact@v4.6.0 + uses: actions/upload-artifact@v4 with: name: wheel-config path: python/ @@ -32,12 +36,11 @@ jobs: os: [ubuntu-latest, windows-latest] steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 - name: Install cibuildwheel - run: python -m pip install cibuildwheel==2.1.2 + run: python -m pip install cibuildwheel - name: Get wheel configuration files - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: wheel-config path: python/ @@ -45,29 +48,29 @@ jobs: run: python -m cibuildwheel --output-dir wheelhouse working-directory: python/ - name: Store binary wheels - uses: actions/upload-artifact@v4.6.0 + uses: actions/upload-artifact@v4 with: name: binary-wheels-${{matrix.os}}-${{ strategy.job-index }} path: python/wheelhouse/*.whl build_sdist: name: Build source distribution - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: [build_setup] steps: - - uses: 
actions/checkout@v2 - - name: Install cython - run: python -m pip install cython==0.29.24 + - uses: actions/checkout@v4 + - name: Install build tools + run: python -m pip install build - name: Get wheel configuration files - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: wheel-config path: python/ - name: Build sdist - run: python setup.py sdist + run: python -m build --sdist working-directory: python/ - name: Store source wheels - uses: actions/upload-artifact@v4.6.0 + uses: actions/upload-artifact@v4 with: name: source-wheels path: python/dist/*.tar.gz @@ -77,17 +80,17 @@ jobs: upload_pypi: name: Upload wheels to PyPI - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: [build_wheels, build_sdist] steps: - name: Get source wheels - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: source-wheels path: dist/ - name: Get binary wheels - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: path: dist/ pattern: binary-wheels-* diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..861d309 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,35 @@ +name: Run Tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + workflow_call: + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install test dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + + - name: Build and Test + shell: bash + run: | + make test diff --git a/.gitignore b/.gitignore index b4e57bd..947413f 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,6 @@ python/*.soc 
python/lib/ python/*.cpp python/wheelhouse/ + +__pycache__/ +.pytest_cache/ diff --git a/Dockerfile b/Dockerfile index e5389a0..836f221 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,21 +1,22 @@ - -FROM debian:bullseye-20210111 +FROM debian:bullseye USER root RUN apt-get update && apt-get install -y \ build-essential git vim \ python3 python3-pip -RUN python3 -m pip install cython +RUN python3 -m pip install cython pytest +RUN ln -s /usr/bin/python3 /usr/bin/python RUN g++ -v -COPY ./ /IMCtermite/ +WORKDIR /IMCtermite +COPY ./ . # install CLI tool -RUN cd /IMCtermite && ls -lh && make install && ls -lh /usr/local/bin/imctermite +RUN make install # install Python module -RUN cd /IMCtermite && ls -lh && make cython-install +RUN make python-build CMD ["sleep","infinity"] diff --git a/README.md b/README.md index 6c167d7..883d57d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ [![LICENSE](https://img.shields.io/github/license/RecordEvolution/IMCtermite)](https://img.shields.io/github/license/RecordEvolution/IMCtermite) [![STARS](https://img.shields.io/github/stars/RecordEvolution/IMCtermite)](https://img.shields.io/github/stars/RecordEvolution/IMCtermite) +![Tests](https://github.com/RecordEvolution/IMCtermite/actions/workflows/test.yml/badge.svg) ![CI Build Wheel](https://github.com/RecordEvolution/IMCtermite/actions/workflows/pypi-deploy.yml/badge.svg?branch=&event=push) [![PYPI](https://img.shields.io/pypi/v/IMCtermite.svg)](https://pypi.org/project/imctermite/) +[![Python Version](https://img.shields.io/pypi/pyversions/imctermite)](https://pypi.org/project/imctermite/) # IMCtermite @@ -27,6 +29,7 @@ Python module to integrate the _.raw_ format into any ETL workflow. 
* [File format](#Fileformat) * [Build and Installation](#Installation) * [Usage and Examples](#Usage) +* [Testing](#Testing) * [References](#References) ## File format @@ -149,10 +152,13 @@ python3 -m pip install imctermite ``` which provides binary wheels for multiple architectures on _Windows_ and _Linux_ -and most _Python 3.x_ distributions. However, if your platform/architecture is -not supported you can still compile the source distribution yourself, which -requires _python3_setuptools_ and an up-to-date compiler supporting C++11 -standard (e.g. _gcc version >= 10.2.0_). +and most _Python 3.x_ distributions. **Note:** Starting from version 3.0.0, +imctermite requires numpy as a dependency, which will be automatically +installed if not already present. + +However, if your platform/architecture is not supported you can still compile +the source distribution yourself, which requires _python3_setuptools_, _numpy_, +and an up-to-date compiler supporting C++11 standard (e.g. _gcc version >= 10.2.0_). ## Usage @@ -194,17 +200,17 @@ of it by passing a _raw_ file to the constructor: ```Python import imctermite -imcraw = imctermite.imctermite(b"sample/sampleA.raw") +imcraw = imctermite.imctermite("sample/sampleA.raw") ``` An example of how to create an instance and obtain the list of channels is: ```Python -import IMCtermite +import imctermite # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = IMCtermite.imctermite(b"samples/sampleA.raw") + imcraw = imctermite.imctermite("samples/sampleA.raw") except RuntimeError as e : print("failed to load/parse raw-file: " + str(e)) @@ -217,6 +223,16 @@ A more complete [example](python/examples/usage.py), including the methods for obtaining the channels, i.a. their data and/or directly printing them to files, can be found in the `python/examples` folder. +### Chunked NumPy export (fast path) + +For large files, you can iterate over channel data in chunks as NumPy arrays. 
This avoids creating large Python lists and allows for streaming processing (e.g. writing to Parquet). See [`python/examples/usage_numpy_chunks.py`](python/examples/usage_numpy_chunks.py) for a complete example. + +## Testing + +Run end-to-end tests: `make test` + +See [tests/README.md](tests/README.md) for details. + ## References ### IMC diff --git a/lib/imc_block.hpp b/lib/imc_block.hpp index 332a3b6..71b9d58 100644 --- a/lib/imc_block.hpp +++ b/lib/imc_block.hpp @@ -34,7 +34,8 @@ namespace imc // name and buffer of associated raw file std::string raw_file_; - const std::vector* buffer_; + const unsigned char* buffer_; + size_t buffer_size_; // offset of first/last byte of parameters in block (separated by ch_sep_) // w.r.t. to first byte of block (=0) @@ -44,7 +45,7 @@ namespace imc // constructor block(key thekey, unsigned long int begin, unsigned long int end, - std::string raw_file, const std::vector* buffer): + std::string raw_file, const unsigned char* buffer, size_t buffer_size): thekey_(thekey), uuid_(std::to_string(begin)) { if ( !imc::check_key(thekey) ) throw std::logic_error("unknown key"); @@ -56,14 +57,15 @@ namespace imc } raw_file_ = raw_file; buffer_ = buffer; + buffer_size_ = buffer_size; // make sure "end_" does not exceed buffer size due to invalid "length" parameter of block - if ( end_ > buffer_->size() ) + if ( end_ > buffer_size_ ) { std::cout<<"WARNING: invalid length parameter in "<size()<<")" + <<"(block-end:"< resetting block-end to buffer-size\n"; - end_ = (unsigned long int)(buffer_->size()); + end_ = (unsigned long int)(buffer_size_); } try { @@ -86,7 +88,7 @@ namespace imc for ( unsigned long int b = begin_; b < end_ && ( ! 
(thekey_.name_== "CS") || count < 4 ); b++ ) { - if ( buffer_->at(b) == imc::ch_sep_ ) + if ( buffer_[b] == imc::ch_sep_ ) { // define range of parameter with first byte = ch_sep_ parameters_.push_back(imc::parameter(b,b)); @@ -124,8 +126,8 @@ namespace imc { throw std::logic_error("inconsistent parameter offsets"); } - std::vector parambuff(buffer_->begin()+begin_+param.begin(), - buffer_->begin()+begin_+param.end()); + std::vector parambuff(buffer_+begin_+param.begin(), + buffer_+begin_+param.end()); return parambuff; } @@ -140,7 +142,7 @@ namespace imc std::string prm(""); for ( unsigned long int i = param.begin()+1; i <= param.end(); i++ ) { - prm.push_back( (char)((*buffer_)[i]) ); + prm.push_back( (char)(buffer_[i]) ); } return prm; } @@ -163,7 +165,7 @@ namespace imc <size()<<"\n" + < +#include +#include + +#if defined(_WIN32) || defined(_WIN64) + #define WIN32_LEAN_AND_MEAN + #include +#else + #include + #include + #include + #include +#endif + +namespace imc +{ + class MemoryMappedFile + { + private: + const unsigned char* data_; + size_t size_; +#if defined(_WIN32) || defined(_WIN64) + HANDLE hFile_; + HANDLE hMap_; +#else + int fd_; +#endif + + public: +#if defined(_WIN32) || defined(_WIN64) + MemoryMappedFile() : data_(nullptr), size_(0), hFile_(INVALID_HANDLE_VALUE), hMap_(NULL) {} +#else + MemoryMappedFile() : data_(nullptr), size_(0), fd_(-1) {} +#endif + + ~MemoryMappedFile() + { + close_file(); + } + + // Delete copy constructor and assignment operator to prevent double-free + MemoryMappedFile(const MemoryMappedFile&) = delete; + MemoryMappedFile& operator=(const MemoryMappedFile&) = delete; + + // Implement move constructor + MemoryMappedFile(MemoryMappedFile&& other) noexcept +#if defined(_WIN32) || defined(_WIN64) + : data_(other.data_), size_(other.size_), hFile_(other.hFile_), hMap_(other.hMap_) + { + other.data_ = nullptr; + other.size_ = 0; + other.hFile_ = INVALID_HANDLE_VALUE; + other.hMap_ = NULL; + } +#else + : data_(other.data_), 
size_(other.size_), fd_(other.fd_) + { + other.data_ = nullptr; + other.size_ = 0; + other.fd_ = -1; + } +#endif + + // Implement move assignment operator + MemoryMappedFile& operator=(MemoryMappedFile&& other) noexcept + { + if (this != &other) + { + close_file(); + data_ = other.data_; + size_ = other.size_; +#if defined(_WIN32) || defined(_WIN64) + hFile_ = other.hFile_; + hMap_ = other.hMap_; + other.hFile_ = INVALID_HANDLE_VALUE; + other.hMap_ = NULL; +#else + fd_ = other.fd_; + other.fd_ = -1; +#endif + other.data_ = nullptr; + other.size_ = 0; + } + return *this; + } + + void map(const std::string& filename) + { + close_file(); + +#if defined(_WIN32) || defined(_WIN64) + hFile_ = CreateFileA(filename.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + if (hFile_ == INVALID_HANDLE_VALUE) + { + throw std::runtime_error("Failed to open file: " + filename); + } + + LARGE_INTEGER fileSize; + if (!GetFileSizeEx(hFile_, &fileSize)) + { + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + throw std::runtime_error("Failed to get file size: " + filename); + } + size_ = (size_t)fileSize.QuadPart; + + if (size_ == 0) + { + data_ = nullptr; + return; + } + + hMap_ = CreateFileMappingA(hFile_, NULL, PAGE_READONLY, 0, 0, NULL); + if (hMap_ == NULL) + { + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + throw std::runtime_error("Failed to create file mapping: " + filename); + } + + data_ = static_cast(MapViewOfFile(hMap_, FILE_MAP_READ, 0, 0, 0)); + if (data_ == NULL) + { + CloseHandle(hMap_); + hMap_ = NULL; + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + throw std::runtime_error("Failed to map view of file: " + filename); + } +#else + fd_ = open(filename.c_str(), O_RDONLY); + if (fd_ == -1) + { + throw std::runtime_error("Failed to open file: " + filename); + } + + struct stat sb; + if (fstat(fd_, &sb) == -1) + { + close(fd_); + fd_ = -1; + throw std::runtime_error("Failed to get file size: " + filename); + } 
+ size_ = sb.st_size; + + if (size_ == 0) + { + data_ = nullptr; + return; + } + + void* mapped = mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0); + if (mapped == MAP_FAILED) + { + close(fd_); + fd_ = -1; + size_ = 0; + throw std::runtime_error("Failed to mmap file: " + filename); + } + + data_ = static_cast(mapped); +#endif + } + + void close_file() + { + if (data_) + { +#if defined(_WIN32) || defined(_WIN64) + UnmapViewOfFile(data_); +#else + munmap(const_cast(data_), size_); +#endif + data_ = nullptr; + } + +#if defined(_WIN32) || defined(_WIN64) + if (hMap_) + { + CloseHandle(hMap_); + hMap_ = NULL; + } + if (hFile_ != INVALID_HANDLE_VALUE) + { + CloseHandle(hFile_); + hFile_ = INVALID_HANDLE_VALUE; + } +#else + if (fd_ != -1) + { + close(fd_); + fd_ = -1; + } +#endif + size_ = 0; + } + + const unsigned char* data() const + { + return data_; + } + + size_t size() const + { + return size_; + } + + const unsigned char& operator[](size_t index) const + { + return data_[index]; + } + }; +} + +#endif diff --git a/lib/imc_channel.hpp b/lib/imc_channel.hpp index 6e19e1c..fe61b34 100644 --- a/lib/imc_channel.hpp +++ b/lib/imc_channel.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #if defined(__linux__) || defined(__APPLE__) #include #elif defined(__WIN32__) || defined(_WIN32) @@ -21,6 +22,16 @@ namespace imc { + struct channel_chunk { + std::vector x_bytes; + std::vector y_bytes; + unsigned long int start; + unsigned long int count; + bool has_x; + int x_type; + int y_type; + }; + struct component_env { std::string uuid_; @@ -274,7 +285,7 @@ namespace imc component_env compenv_; // Constructor to parse the associated blocks - component_group(component_env &compenv, std::map* blocks, std::vector* buffer) + component_group(component_env &compenv, std::map* blocks, const unsigned char* buffer) : compenv_(compenv) { if (blocks->count(compenv.CCuuid_) == 1) @@ -311,7 +322,7 @@ namespace imc // associated environment of blocks and map of blocks channel_env 
chnenv_; std::map* blocks_; - std::vector* buffer_; + const unsigned char* buffer_; imc::origin_data NO_; imc::language NL_; @@ -348,6 +359,8 @@ namespace imc // range, factor and offset double xfactor_, yfactor_; double xoffset_, yoffset_; + + unsigned long int number_of_samples_ = 0; // group reference the channel belongs to unsigned long int group_index_; @@ -355,7 +368,7 @@ namespace imc // constructor takes channel's block environment channel(channel_env &chnenv, std::map* blocks, - std::vector* buffer): + const unsigned char* buffer): chnenv_(chnenv), blocks_(blocks), buffer_(buffer), xfactor_(1.), yfactor_(1.), xoffset_(0.), yoffset_(0.), group_index_(-1) @@ -475,15 +488,15 @@ namespace imc } // start converting binary buffer to imc::datatype - if ( !chnenv_.CSuuid_.empty() ) convert_buffer(); + if ( !chnenv_.CSuuid_.empty() ) init_metadata(); // convert any non-UTF-8 codepage to UTF-8 and cleanse any text convert_encoding(); cleanse_text(); } - // convert buffer to actual datatype - void convert_buffer() + // initialize metadata without loading data + void init_metadata() { std::vector prms = blocks_->at(chnenv_.CSuuid_).get_parameters(); if ( prms.size() < 4) @@ -492,65 +505,203 @@ namespace imc } // extract (channel dependent) part of buffer - unsigned long int buffstrt = prms[3].begin(); - std::vector yCSbuffer( buffer_->begin()+buffstrt+ybuffer_offset_+1, - buffer_->begin()+buffstrt+ybuffer_offset_+ybuffer_size_+1 ); + size_t yCSbuffer_size = ybuffer_size_; // determine number of values in buffer - unsigned long int ynum_values = (unsigned long int)(yCSbuffer.size()/(ysignbits_/8)); - if ( ynum_values*(ysignbits_/8) != yCSbuffer.size() ) + unsigned long int ynum_values = (unsigned long int)(yCSbuffer_size/(ysignbits_/8)); + if ( ynum_values*(ysignbits_/8) != yCSbuffer_size ) { throw std::runtime_error("CSbuffer and significant bits of y datatype don't match"); } - + + number_of_samples_ = ynum_values; if (dimension_ == 1) { - // process y-data - 
process_data(ydata_, ynum_values, ydatatp_, yCSbuffer); - // find appropriate precision for "xdata_" by means of "xstepwidth_" xprec_ = (xstepwidth_ > 0 ) ? (int)ceil(fabs(log10(xstepwidth_))) : 10; - - // fill xdata_ - for ( unsigned long int i = 0; i < ynum_values; i++ ) - { - xdata_.push_back(xstart_+(double)i*xstepwidth_); - } } else if (dimension_ == 2) { - // process x- and y-data - std::vector xCSbuffer( buffer_->begin()+buffstrt+xbuffer_offset_+1, - buffer_->begin()+buffstrt+xbuffer_offset_+xbuffer_size_+1 ); - - // determine number of values in buffer - unsigned long int xnum_values = (unsigned long int)(xCSbuffer.size()/(xsignbits_/8)); - if ( xnum_values*(xsignbits_/8) != xCSbuffer.size() ) - { - throw std::runtime_error("CSbuffer and significant bits of x datatype don't match"); - } + // const unsigned char* xCSbuffer = buffer_ + buffstrt + xbuffer_offset_ + 1; + size_t xCSbuffer_size = xbuffer_size_; + unsigned long int xnum_values = (unsigned long int)(xCSbuffer_size/(xsignbits_/8)); + if ( xnum_values != ynum_values ) { throw std::runtime_error("x and y data have different number of values"); } - xprec_ = 9; - - process_data(xdata_, xnum_values, xdatatp_, xCSbuffer); - process_data(ydata_, ynum_values, ydatatp_, yCSbuffer); } else { throw std::runtime_error("unsupported dimension"); } + } + + // convert buffer to actual datatype (loads all data) + void load_all_data() + { + std::vector prms = blocks_->at(chnenv_.CSuuid_).get_parameters(); + unsigned long int buffstrt = prms[3].begin(); + const unsigned char* yCSbuffer = buffer_ + buffstrt + ybuffer_offset_ + 1; + size_t yCSbuffer_size = ybuffer_size_; + unsigned long int ynum_values = number_of_samples_; + + if (dimension_ == 1) + { + process_data(ydata_, ynum_values, ydatatp_, yCSbuffer, yCSbuffer_size); + for ( unsigned long int i = 0; i < ynum_values; i++ ) + { + xdata_.push_back(xstart_+(double)i*xstepwidth_); + } + } + else if (dimension_ == 2) + { + const unsigned char* xCSbuffer = buffer_ + 
buffstrt + xbuffer_offset_ + 1; + size_t xCSbuffer_size = xbuffer_size_; + process_data(xdata_, ynum_values, xdatatp_, xCSbuffer, xCSbuffer_size); + process_data(ydata_, ynum_values, ydatatp_, yCSbuffer, yCSbuffer_size); + } transformData(xdata_, xfactor_, xoffset_); transformData(ydata_, yfactor_, yoffset_); } + channel_chunk read_chunk(unsigned long int start, unsigned long int count, bool include_x, bool raw_mode) + { + unsigned long int total_len = number_of_samples_; + + if ( start >= total_len ) + { + return { {}, {}, start, 0, include_x, 0, 0 }; + } + + unsigned long int end = start + count; + if ( end > total_len ) end = total_len; + unsigned long int actual_count = end - start; + + channel_chunk chunk; + chunk.start = start; + chunk.count = actual_count; + chunk.has_x = include_x; + chunk.x_type = 0; + chunk.y_type = 0; + + std::vector prms = blocks_->at(chnenv_.CSuuid_).get_parameters(); + unsigned long int buffstrt = prms[3].begin(); + + // Handle Y data + if (raw_mode) { + int type = (int)ydatatp_; + unsigned long int bytes_per_sample = ysignbits_ / 8; + unsigned long int abs_start = buffstrt + ybuffer_offset_ + 1 + start * bytes_per_sample; + unsigned long int byte_count = actual_count * bytes_per_sample; + + if (type == 13) { // six_byte_unsigned_long -> promote to 8 byte (uint64) + chunk.y_type = 13; + chunk.y_bytes.resize(actual_count * 8); + uint64_t* dest = reinterpret_cast(chunk.y_bytes.data()); + for (unsigned long int i = 0; i < actual_count; ++i) { + unsigned long int src_idx = abs_start + i * 6; + uint64_t val = 0; + for (int b = 0; b < 6; ++b) val |= (uint64_t)buffer_[src_idx + b] << (b * 8); + dest[i] = val; + } + } else { + chunk.y_type = type; + chunk.y_bytes.resize(byte_count); + std::copy(buffer_ + abs_start, buffer_ + abs_start + byte_count, chunk.y_bytes.begin()); + } + } else { + // Scaled mode: convert to double + chunk.y_type = 8; // imc::numtype::ddouble + chunk.y_bytes.resize(actual_count * sizeof(double)); + std::vector 
temp_data; + + unsigned long int abs_start = buffstrt + ybuffer_offset_ + 1; // Base start + + switch (ydatatp_) { + case numtype::unsigned_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::signed_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::unsigned_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::signed_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::signed_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::ffloat: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::ddouble: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::two_byte_word_digital: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::eight_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::six_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + case numtype::eight_byte_signed_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, yfactor_, yoffset_, temp_data); break; + default: throw std::runtime_error("Unsupported type for scaled chunk reading (Y): " + std::to_string(ydatatp_)); + } + + memcpy(chunk.y_bytes.data(), temp_data.data(), 
temp_data.size() * sizeof(double)); + } + + // Handle X data + if (include_x) { + if (dimension_ == 2 && raw_mode) { + int type = (int)xdatatp_; + unsigned long int bytes_per_sample = xsignbits_ / 8; + unsigned long int abs_start = buffstrt + xbuffer_offset_ + 1 + start * bytes_per_sample; + unsigned long int byte_count = actual_count * bytes_per_sample; + + if (type == 13) { + chunk.x_type = 13; + chunk.x_bytes.resize(actual_count * 8); + uint64_t* dest = reinterpret_cast(chunk.x_bytes.data()); + for (unsigned long int i = 0; i < actual_count; ++i) { + unsigned long int src_idx = abs_start + i * 6; + uint64_t val = 0; + for (int b = 0; b < 6; ++b) val |= (uint64_t)buffer_[src_idx + b] << (b * 8); + dest[i] = val; + } + } else { + chunk.x_type = type; + chunk.x_bytes.resize(byte_count); + std::copy(buffer_ + abs_start, buffer_ + abs_start + byte_count, chunk.x_bytes.begin()); + } + } else { + // Generated X or scaled X + chunk.x_type = 8; // imc::numtype::ddouble + chunk.x_bytes.resize(actual_count * sizeof(double)); + double* ptr = reinterpret_cast(chunk.x_bytes.data()); + + if (dimension_ == 2) { + // Read X from file and scale + std::vector temp_data; + unsigned long int abs_start = buffstrt + xbuffer_offset_ + 1; + switch (xdatatp_) { + case numtype::unsigned_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::signed_byte: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::unsigned_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::signed_short: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::signed_long: 
imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::ffloat: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::ddouble: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::two_byte_word_digital: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::eight_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::six_byte_unsigned_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + case numtype::eight_byte_signed_long: imc::convert_chunk_to_double(buffer_ + abs_start, start, actual_count, xfactor_, xoffset_, temp_data); break; + default: throw std::runtime_error("Unsupported type for scaled chunk reading (X): " + std::to_string(xdatatp_)); + } + memcpy(ptr, temp_data.data(), temp_data.size() * sizeof(double)); + } else { + // Generated X + for (unsigned long int i = 0; i < actual_count; ++i) { + ptr[i] = xstart_ + (double)(start + i) * xstepwidth_; + } + } + } + } + return chunk; + } + // handle data type conversion - void process_data(std::vector& data_, size_t num_values, numtype datatp_, std::vector& CSbuffer) + void process_data(std::vector& data_, size_t num_values, numtype datatp_, const unsigned char* CSbuffer, size_t CSbuffer_size) { // adjust size of data data_.resize(num_values); @@ -559,34 +710,34 @@ namespace imc switch (datatp_) { case numtype::unsigned_byte: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::signed_byte: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case 
numtype::unsigned_short: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::signed_short: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::unsigned_long: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::signed_long: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::ffloat: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::ddouble: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::two_byte_word_digital: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; case numtype::six_byte_unsigned_long: - imc::convert_data_to_type(CSbuffer, data_); + imc::convert_data_to_type(CSbuffer, CSbuffer_size, data_); break; default: throw std::runtime_error(std::string("unsupported/unknown datatype ") + std::to_string(datatp_)); @@ -699,6 +850,9 @@ namespace imc // provide JSON string of metadata std::string get_json(bool include_data = false) { + if (include_data && ydata_.empty() && number_of_samples_ > 0) { + load_all_data(); + } // prepare printable trigger-time std::time_t tt = std::chrono::system_clock::to_time_t(trigger_time_); std::time_t att = std::chrono::system_clock::to_time_t(absolute_trigger_time_); @@ -716,12 +870,15 @@ namespace imc <<"\",\"codepage\":\""<(chunk.x_bytes.data()); + const double* y_ptr = reinterpret_cast(chunk.y_bytes.data()); + + // Write chunk data + for (unsigned long int i = 0; i < chunk.count; i++) { - fou< - void convert_data_to_type(std::vector& subbuffer, + void convert_data_to_type(const unsigned char* subbuffer, size_t 
subbuffer_size, std::vector& channel) { // check number of elements of type "datatype" in buffer - if ( subbuffer.size() != channel.size()*sizeof(datatype) ) + if ( subbuffer_size != channel.size()*sizeof(datatype) ) { throw std::runtime_error( std::string("size mismatch between subbuffer (") - + std::to_string(subbuffer.size()) + + std::to_string(subbuffer_size) + std::string(") and datatype (") + std::to_string(channel.size()) + std::string("*") + std::to_string(sizeof(datatype)) + std::string(")") ); @@ -44,6 +44,61 @@ namespace imc // for ( auto el: channel ) std::cout< + void convert_chunk_to_double(const unsigned char* buffer, size_t start_index, size_t count, + double factor, double offset, std::vector& out) + { + size_t type_size = sizeof(SourceType); + const unsigned char* start_ptr = buffer + start_index * type_size; + + out.resize(count); + + for (size_t i = 0; i < count; ++i) { + SourceType val; + + const unsigned char* val_ptr = start_ptr + i * type_size; + uint8_t* dest_ptr = reinterpret_cast(&val); + for(size_t j=0; j(val); + if (factor != 1.0 || offset != 0.0) { + double fact = (factor == 0.0) ? 1.0 : factor; + dval = dval * fact + offset; + } + out[i] = dval; + } + } + + // Specialization for imc_sixbyte + template<> + inline void convert_chunk_to_double(const unsigned char* buffer, size_t start_index, size_t count, + double factor, double offset, std::vector& out) + { + size_t type_size = 6; + const unsigned char* start_ptr = buffer + start_index * type_size; + + out.resize(count); + + for (size_t i = 0; i < count; ++i) { + const unsigned char* val_ptr = start_ptr + i * type_size; + uint64_t val = 0; + for(int j=0; j<6; ++j) { + val |= (uint64_t)val_ptr[j] << (j*8); + } + + double dval = static_cast(val); + if (factor != 1.0 || offset != 0.0) { + double fact = (factor == 0.0) ? 
1.0 : factor; + dval = dval * fact + offset; + } + out[i] = dval; + } + } + } #endif diff --git a/lib/imc_object.hpp b/lib/imc_object.hpp index 1fc1a44..6090a18 100644 --- a/lib/imc_object.hpp +++ b/lib/imc_object.hpp @@ -12,12 +12,12 @@ namespace imc { // obtain specific parameters as string - std::string get_parameter(const std::vector* buffer, const imc::parameter* param) + std::string get_parameter(const unsigned char* buffer, const imc::parameter* param) { std::string prm(""); for ( unsigned long int i = param->begin()+1; i <= param->end(); i++ ) { - prm.push_back((char)(*buffer)[i]); + prm.push_back((char)buffer[i]); } return prm; } @@ -29,7 +29,7 @@ namespace imc int processor_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 3 ) throw std::runtime_error("invalid number of parameters in CF"); fileformat_ = std::stoi(get_parameter(buffer,¶meters[0])); @@ -56,7 +56,7 @@ namespace imc bool closed_; // corresponds to true = 1 and false = 0 in file // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 2 ) throw std::runtime_error("invalid number of parameters in CK"); version_ = std::stoi(get_parameter(buffer,¶meters[0])); @@ -83,7 +83,7 @@ namespace imc std::string comment_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 7 ) throw std::runtime_error("invalid number of parameters in CB"); group_index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -111,7 +111,7 @@ namespace imc std::string comment_; // 
construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 9 ) throw std::runtime_error("invalid number of parameters in CT"); group_index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -149,7 +149,7 @@ namespace imc int dimension_; // corresponding to fieldtype \in {1,} // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 5 ) throw std::runtime_error("invalid number of parameters in CG"); number_components_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -176,7 +176,7 @@ namespace imc std::string unit_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 6 ) throw std::runtime_error("invalid number of parameters in CD1"); dx_ = std::stod(get_parameter(buffer,¶meters[2])); @@ -208,7 +208,7 @@ namespace imc int pretriggerapp_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 11 ) throw std::runtime_error("invalid number of parameters in CD2"); dx_ = std::stod(get_parameter(buffer,¶meters[2])); @@ -244,7 +244,7 @@ namespace imc bool analog_digital_; // 1 => false (analog), 2 => true (digital) // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 4 ) throw 
std::runtime_error("invalid number of parameters in CC"); component_index_ = std::stoi(get_parameter(buffer,¶meters[2])); @@ -291,7 +291,7 @@ namespace imc unsigned long int distance_bytes_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 10 ) throw std::runtime_error("invalid number of parameters in CP"); buffer_reference_ = std::stoi(get_parameter(buffer,¶meters[2])); @@ -337,7 +337,7 @@ namespace imc // bool new_event_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 13 ) throw std::runtime_error("invalid number of parameters in Cb"); number_buffers_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -379,7 +379,7 @@ namespace imc std::string unit_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 8 ) throw std::runtime_error("invalid number of parameters in CR"); transform_ = (get_parameter(buffer,¶meters[2]) == std::string("1")); @@ -411,7 +411,7 @@ namespace imc std::string comment_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 9 ) throw std::runtime_error("invalid number of parameters in CN"); group_index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -440,7 +440,7 @@ namespace imc // unsigned long int begin_buffer_, end_buffer_; // construct members by parsing particular parameters from buffer - void 
parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 4 ) throw std::runtime_error("invalid number of parameters in CS"); index_ = std::stoul(get_parameter(buffer,¶meters[2])); @@ -464,7 +464,7 @@ namespace imc std::string language_code_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if (parameters.size() < 4) throw std::runtime_error("invalid number of parameters in NL"); codepage_ = get_parameter(buffer, ¶meters[2]); @@ -480,7 +480,7 @@ namespace imc std::string comment_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 7 ) throw std::runtime_error("invalid number of parameters in NO"); origin_ = ( get_parameter(buffer,¶meters[2]) == std::string("1") ); @@ -506,7 +506,7 @@ namespace imc double trigger_time_frac_secs_; // construct members by parsing particular parameters from buffer - void parse(const std::vector* buffer, const std::vector& parameters) + void parse(const unsigned char* buffer, const std::vector& parameters) { if ( parameters.size() < 8 ) throw std::runtime_error("invalid number of parameters in NT1"); tms_ = std::tm(); @@ -560,7 +560,7 @@ namespace imc { rawobject(): objidx_(-1) { } - void parse(imc::key key, const std::vector* buffer, + void parse(imc::key key, const unsigned char* buffer, const std::vector& parameters) { if ( key.name_ == std::string("CF") ) diff --git a/lib/imc_raw.hpp b/lib/imc_raw.hpp index ee77daa..1ab74b2 100644 --- a/lib/imc_raw.hpp +++ b/lib/imc_raw.hpp @@ -7,12 +7,11 @@ #include #include -// #include "hexshow.hpp" +#include "imc_buffer.hpp" #include "imc_key.hpp" 
#include "imc_block.hpp" #include "imc_datatype.hpp" #include "imc_object.hpp" -#include "imc_result.hpp" #include "imc_channel.hpp" //---------------------------------------------------------------------------// @@ -25,7 +24,7 @@ namespace imc std::string raw_file_, file_name_; // buffer of raw-file - std::vector buffer_; + imc::MemoryMappedFile buffer_; // list and map of imc-blocks std::vector rawblocks_; @@ -43,6 +42,12 @@ namespace imc raw() { }; raw(std::string raw_file): raw_file_(raw_file) { set_file(raw_file); }; + // Delete copy and move operations because of self-referential pointers in channels_ + raw(const raw&) = delete; + raw& operator=(const raw&) = delete; + raw(raw&&) = delete; + raw& operator=(raw&&) = delete; + // provide new raw-file void set_file(std::string raw_file) { @@ -58,16 +63,9 @@ namespace imc // open file and stream data into buffer void fill_buffer() { - buffer_.clear(); - // open file and put data in buffer try { - std::ifstream fin(raw_file_.c_str(),std::ifstream::binary); - if ( !fin.good() ) throw std::runtime_error("failed to open file"); - std::vector buffer((std::istreambuf_iterator(fin)), - (std::istreambuf_iterator())); - buffer_ = buffer; - fin.close(); + buffer_.map(raw_file_); } catch ( const std::exception& e ) { throw std::runtime_error( std::string("failed to open raw-file and stream data in buffer: ") + e.what() @@ -83,31 +81,33 @@ namespace imc // reset counter to identify computational complexity cplxcnt_ = 0; + const unsigned char* data = buffer_.data(); + size_t size = buffer_.size(); + // start parsing raw-blocks in buffer - for ( std::vector::iterator it=buffer_.begin(); - it!=buffer_.end(); ++it ) + for ( unsigned long int i = 0; i < size; ++i ) { cplxcnt_++; // check for "magic byte" - if ( *it == ch_bgn_ ) + if ( data[i] == ch_bgn_ ) { // check for (non)critical key - if ( *(it+1) == imc::key_crit_ || *(it+1) == imc::key_non_crit_ ) + if ( data[i+1] == imc::key_crit_ || data[i+1] == imc::key_non_crit_ ) { // 
compose (entire) key - std::string newkey = { (char)*(it+1), (char)*(it+2) }; - imc::key itkey(*(it+1) == imc::key_crit_,newkey); + std::string newkey = { (char)data[i+1], (char)data[i+2] }; + imc::key itkey(data[i+1] == imc::key_crit_,newkey); // expecting ch_sep_ after key - if ( *(it+3) == ch_sep_ ) + if ( data[i+3] == ch_sep_ ) { // extract key version std::string vers(""); unsigned long int pos = 4; - while ( *(it+pos) != ch_sep_ ) + while ( data[i+pos] != ch_sep_ ) { - vers.push_back((char)*(it+pos)); + vers.push_back((char)data[i+pos]); pos++; } int version = std::stoi(vers); @@ -122,9 +122,9 @@ namespace imc // get block length std::string leng(""); pos++; - while ( *(it+pos) != ch_sep_ ) + while ( data[i+pos] != ch_sep_ ) { - leng.push_back((char)*(it+pos)); + leng.push_back((char)data[i+pos]); pos++; } unsigned long int length = std::stoul(leng); @@ -132,23 +132,23 @@ namespace imc // declare and initialize corresponding key and block // imc::key bkey( *(it+1)==imc::key_crit_ , newkey, // imc::keys.at(newkey).description_, version ); - imc::block blk(itkey,(unsigned long int)(it-buffer_.begin()), - (unsigned long int)(it-buffer_.begin()+pos+1+length), - raw_file_, &buffer_); + imc::block blk(itkey,i, + i+pos+1+length, + raw_file_, data, size); // add block to list rawblocks_.push_back(blk); // skip the remaining block according to its length - if ( (unsigned long int)(it-buffer_.begin()+length) < (unsigned long int)(buffer_.size()) ) + if ( i+length < size ) { - std::advance(it,length); + i += length; } } else { // all critical must be known !! 
while a noncritical may be ignored - if ( *(it+1) == imc::key_crit_ ) + if ( data[i+1] == imc::key_crit_ ) { throw std::runtime_error( std::string("unknown critical key: ") + newkey + std::to_string(version) @@ -165,7 +165,7 @@ namespace imc { throw std::runtime_error( std::string("invalid block or corrupt buffer at byte: ") - + std::to_string(it+3-buffer_.begin()) + + std::to_string(i+3) ); } } @@ -232,7 +232,7 @@ namespace imc // a new component group is started // TODO: can we avoid to parse the whole component here? imc::component component; - component.parse(&buffer_, blk.get_parameters()); + component.parse(buffer_.data(), blk.get_parameters()); if ( component.component_index_ == 1 ) compenv_ptr = &chnenv.compenv1_; else if ( component.component_index_ == 2 ) compenv_ptr = &chnenv.compenv2_; else throw std::runtime_error("invalid component index in CC block"); @@ -283,7 +283,7 @@ namespace imc // create channel object and add it to the map of channels channels_.insert( std::pair - (chnenv.CNuuid_,imc::channel(chnenv,&mapblocks_,&buffer_)) + (chnenv.CNuuid_,imc::channel(chnenv,&mapblocks_,buffer_.data())) ); // reset channel uuid @@ -393,8 +393,45 @@ namespace imc return channels; } + // get length of a channel + unsigned long int get_channel_length(std::string uuid) + { + if ( channels_.count(uuid) ) + { + return channels_.at(uuid).number_of_samples_; + } + else + { + throw std::runtime_error(std::string("channel does not exist:") + uuid); + } + } + + // get numeric type of a channel + int get_channel_numeric_type(std::string uuid) + { + if ( channels_.count(uuid) ) + { + return (int)channels_.at(uuid).ydatatp_; + } + else + { + throw std::runtime_error(std::string("channel does not exist:") + uuid); + } + } + + // read a chunk of channel data + channel_chunk read_channel_chunk(std::string uuid, unsigned long int start, unsigned long int count, bool include_x, bool raw_mode) + { + if ( !channels_.count(uuid) ) + { + throw 
std::runtime_error(std::string("channel does not exist:") + uuid); + } + + return channels_.at(uuid).read_chunk(start, count, include_x, raw_mode); + } + // print single specific channel - void print_channel(std::string channeluuid, std::string outputfile, const char sep) + void print_channel(std::string channeluuid, std::string outputfile, const char sep, unsigned long int chunk_size = 100000) { // check for given parent directory of output file std::filesystem::path pdf = outputfile; @@ -407,7 +444,7 @@ namespace imc // find channel with given name if ( channels_.count(channeluuid) == 1 ) { - channels_.at(channeluuid).print(outputfile,sep); + channels_.at(channeluuid).print(outputfile,sep,25,9,chunk_size); } else { @@ -417,7 +454,7 @@ namespace imc } // print all channels into given directory - void print_channels(std::string output, const char sep) + void print_channels(std::string output, const char sep, unsigned long int chunk_size = 100000) { // check for given directory std::filesystem::path pd = output; @@ -436,8 +473,8 @@ namespace imc : it->second.name_ + std::string(".csv"); std::filesystem::path pf = pd / filenam; - // and print the channel - it->second.print(pf.u8string(),sep); + // and print the channel using streaming + it->second.print(pf.u8string(),sep,25,9,chunk_size); } } diff --git a/lib/imc_result.hpp b/lib/imc_result.hpp deleted file mode 100644 index 1961ace..0000000 --- a/lib/imc_result.hpp +++ /dev/null @@ -1,30 +0,0 @@ -//---------------------------------------------------------------------------// - -#ifndef IMCRESULT -#define IMCRESULT - -#include "imc_datatype.hpp" - -//---------------------------------------------------------------------------// - -namespace imc -{ - struct channel_tab - { - std::string name_; - - // abscissa - std::vector xaxis_; - std::string xunit_; - - // ordinate - // std::vector yaxis_; - std::vector yaxis_; - std::string yunit_; - }; - -} - -#endif - 
-//---------------------------------------------------------------------------// diff --git a/makefile b/makefile index 2d88a26..41580c5 100644 --- a/makefile +++ b/makefile @@ -18,7 +18,7 @@ MIB = $(foreach dir,$(KIB),-I $(dir)) # choose compiler and its options CC = g++ -std=c++17 -OPT = -O3 -Wall -Wconversion -Wpedantic -Werror -Wunused-variable -Wsign-compare +OPT = -O3 -Wall -Wconversion -Wpedantic -Werror -Wunused-variable -Wsign-compare -static # determine git version/commit and release tag GTAG := $(shell git tag -l --sort=version:refname | tail -n1 | sed "s/$^v//g") @@ -35,7 +35,7 @@ INST := /usr/local/bin # C++ and CLI tool # build executable -$(EXE): check-tags $(GVSN) main.o +$(EXE): check-tags main.o $(CC) $(OPT) main.o -o $@ # build main.cpp and include git version/commit tag @@ -86,21 +86,40 @@ docker-run: #-----------------------------------------------------------------------------# # python -python-build: check-tags $(GVSN) - make -C python/ build-inplace - cp python/imctermite*.so ./ -v +python-build: check-tags + make -C python/ build + cp python/imctermite*.so ./ -v 2>/dev/null || cp python/imctermite*.pyd ./ -v 2>/dev/null || true python-clean: make -C python/ clean - rm -vf imctermite*.so + rm -vf imctermite*.so imctermite*.pyd python-test: - PYTHONPATH=./ python python/examples/usage.py + PYTHONPATH=./ python3 python/examples/usage.py + +#-----------------------------------------------------------------------------# +# tests + +test: $(EXE) python-build + @echo "Running all tests..." + @PYTHONPATH=./ pytest + +test-cli: $(EXE) + @echo "Running CLI tests..." + @PYTHONPATH=./ pytest tests/test_cli.py + +test-python: python-build + @echo "Running Python tests..." 
+ @PYTHONPATH=./ pytest tests/test_python.py #-----------------------------------------------------------------------------# # clean -clean: cpp-clean python-clean +test-clean: + rm -rf .pytest_cache + find tests/ -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + +clean: cpp-clean python-clean test-clean #-----------------------------------------------------------------------------# # github actions diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..e0459a4 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,9 @@ +[pytest] +testpaths = tests +pythonpath = . +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -v --strict-markers --tb=short +markers = + slow: marks tests as slow (deselect with '-m "not slow"') diff --git a/python/MANIFEST.in b/python/MANIFEST.in index dbe052e..eb044a9 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -2,4 +2,6 @@ include lib/*.hpp include *.cpp include *.pyx include *.pxd +include *.pyi +include py.typed include VERSION diff --git a/python/VERSION b/python/VERSION index d302656..4a36342 100644 --- a/python/VERSION +++ b/python/VERSION @@ -1 +1 @@ -2.1.18 +3.0.0 diff --git a/python/examples/multichannel.py b/python/examples/multichannel.py index 67b6b41..4dd7b4a 100644 --- a/python/examples/multichannel.py +++ b/python/examples/multichannel.py @@ -2,6 +2,7 @@ import imctermite import pandas import datetime +import numpy as np def add_trigger_time(trigger_time, add_time) : trgts = datetime.datetime.strptime(trigger_time,'%Y-%m-%dT%H:%M:%S') @@ -10,34 +11,50 @@ def add_trigger_time(trigger_time, add_time) : if __name__ == "__main__" : - # read file and extract data - imctm = imctermite.imctermite(b"Measurement.raw") - chns = imctm.get_channels(True) + # read file + imctm = imctermite.imctermite("samples/exampleB.raw") - # prepare abscissa - xcol = "time ["+chns[0]['xunit']+"]" - #xcol = "timestamp" - xsts = [add_trigger_time(chns[0]['trigger-time'],tm) for tm in 
chns[0]['xdata']] + # Get metadata only + chns = imctm.get_channels(False) + + if not chns: + print("No channels found") + exit() + + # Prepare DataFrame + df = pandas.DataFrame() - # sort channels + # Get X-axis from the first channel + first_chn = chns[0] + + data = imctm.get_channel_data(first_chn['uuid'], include_x=True) + x_data = data['x'] + + xcol = "time ["+first_chn['xunit']+"]" + df[xcol] = x_data + + # sort channels by name chnnms = sorted([chn['name'] for chn in chns], reverse=False) - chnsdict = {} - for chn in chns : - chnsdict[chn['name']] = chn + chnsdict = {chn['name']: chn for chn in chns} - # construct dataframe - df = pandas.DataFrame() - df[xcol] = pandas.Series(chns[0]['xdata']) - #df[xcol] = pandas.Series(xsts) - #for idx,chn in enumerate(chns) : for chnnm in chnnms : chn = chnsdict[chnnm] - #xcol = (chn['xname'] if chn['xname'] != '' else "x_"+str(idx))+" ["+chn['xunit']+"]" - #df[xcol] = pandas.Series(chn['xdata']) + uuid = chn['uuid'] + + # Fetch Y data only + data = imctm.get_channel_data(uuid, include_x=False) + y_data = data['y'] + ycol = chn['yname']+" ["+chn['yunit']+"]" - df[ycol] = pandas.Series(chn['ydata']) + + # Assign to DataFrame + if len(y_data) == len(df): + df[ycol] = y_data + else: + # Fallback to Series for alignment/filling + df[ycol] = pandas.Series(y_data) # show entire dataframe and write file print(df) - df.to_csv("Measurement.csv",header=True,sep='\t',index=False) + df.to_csv("exampleB.csv",header=True,sep='\t',index=False) diff --git a/python/examples/usage.py b/python/examples/usage.py index 06cc3ed..2b48e22 100644 --- a/python/examples/usage.py +++ b/python/examples/usage.py @@ -5,7 +5,7 @@ # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = imctermite.imctermite(b"samples/exampleB.raw") + imcraw = imctermite.imctermite("samples/exampleB.raw") except RuntimeError as e : raise Exception("failed to load/parse raw-file: " + str(e)) @@ -24,15 +24,15 @@ print() # print the channels 
into a specific directory -imcraw.print_channels(b"/tmp/",ord(',')) +imcraw.print_channels("/tmp/",ord(',')) # print all channels separately for i,chn in enumerate(channels) : print(str(i)+" : "+chn['name']+" : "+chn['uuid']) filname = os.path.join("/tmp/",str(i) + "_" + chn['name']+".csv") print(filname) - imcraw.print_channel(chn['uuid'].encode(),filname.encode(),ord(',')) + imcraw.print_channel(chn['uuid'],filname,ord(',')) # print all channels in single file -imcraw.print_table(b"/tmp/allchannels.csv") +imcraw.print_table("/tmp/allchannels.csv") diff --git a/python/examples/usage_adv.py b/python/examples/usage_adv.py index 36000a6..0c844d8 100644 --- a/python/examples/usage_adv.py +++ b/python/examples/usage_adv.py @@ -15,7 +15,7 @@ # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = imctermite.imctermite(fl.encode()) + imcraw = imctermite.imctermite(fl) except RuntimeError as e : raise Exception("failed to load/parse raw-file: " + str(e)) @@ -24,7 +24,7 @@ print(json.dumps(channels,indent=4, sort_keys=False)) # print the channels into a specific directory - imcraw.print_channels(b"./",ord(',')) + imcraw.print_channels("./",ord(',')) # print all channels in single file - imcraw.print_table(("./"+str(os.path.basename(fl).split('.')[0])+"_allchannels.csv").encode()) + imcraw.print_table(("./"+str(os.path.basename(fl).split('.')[0])+"_allchannels.csv")) diff --git a/python/examples/usage_ext.py b/python/examples/usage_ext.py index b6536e2..e7dd8e5 100644 --- a/python/examples/usage_ext.py +++ b/python/examples/usage_ext.py @@ -6,7 +6,7 @@ # declare and initialize instance of "imctermite" by passing a raw-file try : - imcraw = imctermite.imctermite(b"samples/sampleB.raw") + imcraw = imctermite.imctermite("samples/sampleB.raw") except RuntimeError as e : raise Exception("failed to load/parse raw-file: " + str(e)) diff --git a/python/examples/usage_files.py b/python/examples/usage_files.py index 3dcebd3..b6532d6 100644 --- 
a/python/examples/usage_files.py +++ b/python/examples/usage_files.py @@ -1,5 +1,5 @@ -import imctermite import imctermite +import imctermite def show_results(imcraw) : @@ -19,11 +19,11 @@ def show_results(imcraw) : print("") # create instance of 'imctermite' -imcraw = imctermite(b'samples/sampleA.raw') +imcraw = imctermite.imctermite("samples/sampleA.raw") show_results(imcraw) # use previous instance of 'imctermite' to provide new file -imcraw.submit_file(b'samples/sampleB.raw') +imcraw.submit_file("samples/sampleB.raw") show_results(imcraw) diff --git a/python/examples/usage_numpy_chunks.py b/python/examples/usage_numpy_chunks.py new file mode 100644 index 0000000..8e63ee2 --- /dev/null +++ b/python/examples/usage_numpy_chunks.py @@ -0,0 +1,87 @@ + +import imctermite +import json +import os +import numpy as np + +# Path to a sample file +# Using sampleB.raw because it has integer data with scaling (factor=0.01, offset=327.68) +raw_file = "samples/sampleB.raw" +if not os.path.exists(raw_file): + print(f"Sample file {raw_file} not found.") + exit(1) + +print(f"Loading {raw_file}") + +try: + imcraw = imctermite.imctermite(raw_file) +except RuntimeError as e: + print(f"Failed to load/parse raw-file: {e}") + exit(1) + +# Get channels metadata +channels = imcraw.get_channels(False) +if not channels: + print("No channels found.") + exit(0) + +# Pick the first channel +# For sampleB.raw, channel 347 is the interesting one +target_uuid = "347" +channel_info = next((ch for ch in channels if ch['uuid'] == target_uuid), channels[0]) + +first_channel_uuid = channel_info['uuid'] +print(f"Iterating over channel {first_channel_uuid} ({channel_info.get('name', 'unnamed')})") + +# Check native datatype +if 'datatype' in channel_info: + print(f"Native IMC datatype ID: {channel_info['datatype']}") + +# Example 1: Scaled mode (default) - returns floats (physical units) +print("\n--- Scaled Mode (Physical Units) ---") +total_rows = 0 +chunk_size = 1000 + +for chunk in 
imcraw.iter_channel_numpy(first_channel_uuid, include_x=True, chunk_rows=chunk_size, mode="scaled"): + start = chunk['start'] + y = chunk['y'] + x = chunk.get('x') + + count = len(y) + total_rows += count + + if total_rows <= chunk_size * 2: # Print only first few chunks + print(f"Chunk start={start}, count={count}, y_shape={y.shape}, y_dtype={y.dtype}") + if x is not None: + print(f" x_shape={x.shape}, x_dtype={x.dtype}") + if count > 0: + print(f" First y value: {y[0]}") + +print(f"Total rows read (scaled): {total_rows}") + +# Example 2: Raw mode - returns native types (e.g. integers) +print("\n--- Raw Mode (Native Types) ---") + +# Get scaling factors +factor = float(channel_info.get('factor', 1.0)) +offset = float(channel_info.get('offset', 0.0)) +print(f"Scaling: factor={factor}, offset={offset}") + +total_rows = 0 + +for chunk in imcraw.iter_channel_numpy(first_channel_uuid, include_x=True, chunk_rows=chunk_size, mode="raw"): + start = chunk['start'] + y = chunk['y'] + + count = len(y) + total_rows += count + + if total_rows <= chunk_size * 2: + print(f"Chunk start={start}, count={count}, y_shape={y.shape}, y_dtype={y.dtype}") + if count > 0: + raw_val = y[0] + scaled_val = raw_val * factor + offset + print(f" First y value (raw): {raw_val}") + print(f" First y value (manually scaled): {scaled_val}") + +print(f"Total rows read (raw): {total_rows}") diff --git a/python/examples/usage_timerange.py b/python/examples/usage_timerange.py new file mode 100644 index 0000000..eb36d9e --- /dev/null +++ b/python/examples/usage_timerange.py @@ -0,0 +1,65 @@ + +import imctermite +import sys +import os + +def print_timerange(filename): + """ + Demonstrates how to efficiently get the time range (first and last X values) + of channels without reading the entire file. 
+ """ + + try: + imc = imctermite.imctermite(filename) + except RuntimeError as e: + print(f"Error loading file: {e}") + return + + # Get list of channels (metadata only, no data loaded yet) + channels = imc.get_channels(False) + + if not channels: + print("No channels found in file.") + return + + print(f"File: {filename}") + print("-" * 80) + print(f"{'Channel Name':<25} | {'Start (X)':<15} | {'End (X)':<15} | {'Samples':<10}") + print("-" * 80) + + for chn in channels: + uuid = chn['uuid'] + name = chn.get('yname', 'Unknown') + + length = imc.get_channel_length(uuid) + + if length == 0: + print(f"{name:<25} | {'Empty':<15} | {'Empty':<15} | {0:<10}") + continue + + # Get first sample (efficiently, reading only 1 row) + # We request X data to get the time/index + # chunk_rows=1 ensures we only read/convert the absolute minimum data + gen_first = imc.iter_channel_numpy(uuid, start_index=0, chunk_rows=1, include_x=True) + try: + first_chunk = next(gen_first) + first_x = first_chunk['x'][0] + except (StopIteration, IndexError): + first_x = float('nan') + + # Get last sample + gen_last = imc.iter_channel_numpy(uuid, start_index=length-1, chunk_rows=1, include_x=True) + try: + last_chunk = next(gen_last) + last_x = last_chunk['x'][0] + except (StopIteration, IndexError): + last_x = float('nan') + + print(f"{name:<25} | {first_x:<15.5f} | {last_x:<15.5f} | {length:<10}") + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python usage_timerange.py ") + print("Example: python usage_timerange.py ../../samples/datasetA/datasetA_1.raw") + else: + print_timerange(sys.argv[1]) diff --git a/python/imctermite.pxd b/python/imctermite.pxd index f76521e..682946e 100644 --- a/python/imctermite.pxd +++ b/python/imctermite.pxd @@ -6,6 +6,15 @@ from libcpp cimport bool cdef extern from "lib/imc_raw.hpp" namespace "imc": + cdef struct channel_chunk: + vector[unsigned char] x_bytes + vector[unsigned char] y_bytes + unsigned long int start + unsigned long int count + 
bool has_x + int x_type + int y_type + cdef cppclass cppimctermite "imc::raw": # constructor(s) @@ -18,7 +27,16 @@ cdef extern from "lib/imc_raw.hpp" namespace "imc": # get JSON list of channels vector[string] get_channels(bool json, bool data) except + + # get length of a channel + unsigned long int get_channel_length(string uuid) except + + + # get numeric type of a channel + int get_channel_numeric_type(string uuid) except + + + # read a chunk of channel data + channel_chunk read_channel_chunk(string uuid, unsigned long int start, unsigned long int count, bool include_x, bool raw_mode) except + + # print single channel/all channels - void print_channel(string channeluuid, string outputdir, char delimiter) except + - void print_channels(string outputdir, char delimiter) except + + void print_channel(string channeluuid, string outputdir, char delimiter, unsigned long int chunk_size) except + + void print_channels(string outputdir, char delimiter, unsigned long int chunk_size) except + void print_table(string outputfile) except + diff --git a/python/imctermite.pyi b/python/imctermite.pyi new file mode 100644 index 0000000..07f1dde --- /dev/null +++ b/python/imctermite.pyi @@ -0,0 +1,185 @@ +""" +Type stub file for IMCtermite Cython extension. +This provides IDE support, type checking, and autocomplete for the imctermite module. +""" + +from typing import Any, Dict, Iterator, List, Literal, Optional, Union +import numpy as np +import numpy.typing as npt + +def get_codepage(chn: bytes) -> str: + """Get the codepage for decoding channel data.""" + ... + +class imctermite: + """ + IMCtermite parser for .raw (IMC2 Data Format) files. + + This class provides methods to read and parse IMC measurement data files, + extracting channel metadata and data. + """ + + def __init__(self, rawfile: Union[str, bytes]) -> None: + """ + Initialize parser with a .raw file. + + Args: + rawfile: Path to the .raw file to parse + """ + ... 
def submit_file(self, rawfile: Union[str, bytes]) -> None:
    """
    Set or change the raw file to parse.

    Args:
        rawfile: Path to the .raw file to parse
    """
    ...


def get_channels(self, include_data: bool) -> List[Dict[str, Any]]:
    """
    Get list of all channels in the file with their metadata.

    Args:
        include_data: If True, includes the actual measurement data in the result.
                     If False, only returns metadata (faster for inspection).
                     The extension module declares this parameter without a
                     default, so it must always be passed explicitly.

    Returns:
        List of dictionaries containing channel information:
        - uuid: Unique identifier for the channel
        - xname: X-axis name (typically "time")
        - yname: Y-axis name (measurement name)
        - xunit: X-axis unit
        - yunit: Y-axis unit
        - length: Number of data points
        - xdata: X-axis data (if include_data=True)
        - ydata: Y-axis data (if include_data=True)
        - buffer_type: Data type identifier
        - codepage: Text encoding information
    """
    ...


def get_channel_length(self, channeluuid: Union[str, bytes]) -> int:
    """
    Get the number of data points in a channel.

    Args:
        channeluuid: UUID of the channel to query

    Returns:
        Number of data points in the channel
    """
    ...


def iter_channel_numpy(
    self,
    channeluuid: Union[str, bytes],
    include_x: bool = True,
    chunk_rows: int = 1000000,
    mode: Literal["scaled", "raw"] = "scaled",
    start_index: int = 0
) -> Iterator[Dict[str, Union[int, npt.NDArray[Any]]]]:
    """
    Iterate over channel data in chunks as numpy arrays.

    This is memory-efficient for large datasets as it yields data in chunks
    rather than loading everything into memory at once.

    Args:
        channeluuid: UUID of the channel to read
        include_x: If True, includes x-axis data in results
        chunk_rows: Number of rows per chunk (default: 1,000,000)
        mode: "scaled" for calibrated values or "raw" for uncalibrated ADC values
        start_index: Starting row index (for partial reads)

    Yields:
        Dictionary containing:
        - start: Starting index of this chunk
        - y: numpy array of Y-axis values
        - x: numpy array of X-axis values (if include_x=True)

    Example:
        >>> imc = imctermite("measurement.raw")
        >>> channels = imc.get_channels(include_data=False)
        >>> uuid = channels[0]['uuid']
        >>> for chunk in imc.iter_channel_numpy(uuid, chunk_rows=100000):
        ...     print(f"Processing {len(chunk['y'])} samples starting at {chunk['start']}")
        ...     # Process chunk['x'] and chunk['y'] arrays
    """
    ...


def get_channel_data(
    self,
    channeluuid: Union[str, bytes],
    include_x: bool = True,
    mode: Literal["scaled", "raw"] = "scaled"
) -> Dict[str, npt.NDArray[Any]]:
    """
    Get all data for a channel as numpy arrays.

    Args:
        channeluuid: UUID of the channel to read
        include_x: If True, includes x-axis data in result
        mode: "scaled" for calibrated values or "raw" for uncalibrated ADC values

    Returns:
        Dictionary containing:
        - y: numpy array of Y-axis values
        - x: numpy array of X-axis values (if include_x=True)

        For a non-empty channel the implementation hands back a chunk
        dictionary from iter_channel_numpy(), so an integer "start" entry
        is present as well.

    Note:
        This loads the entire channel into memory. For large datasets,
        consider using iter_channel_numpy() instead.

    Example:
        >>> imc = imctermite("measurement.raw")
        >>> channels = imc.get_channels(include_data=False)
        >>> uuid = channels[0]['uuid']
        >>> data = imc.get_channel_data(uuid)
        >>> print(f"X shape: {data['x'].shape}, Y shape: {data['y'].shape}")
    """
    ...
def get_codepage(chn):
    """
    Return the codec name to use when decoding a channel's JSON blob.

    Args:
        chn: bytes holding one channel description as JSON; only its
             optional "codepage" entry is consulted.

    Returns:
        'utf-8' on every non-Windows platform. On Windows, the channel's
        codepage as a Python codec name (a purely numeric codepage such as
        "1252" becomes "cp1252"). Falls back to 'utf-8' when the blob cannot
        be parsed, carries no codepage, or names a codec Python does not know.
    """
    import codecs

    if platform.system() != 'Windows':
        return 'utf-8'

    # Best effort: anything that is not a JSON object with a usable
    # "codepage" entry falls back to UTF-8. Only the failures this parsing
    # can actually produce are caught, so KeyboardInterrupt/SystemExit
    # still propagate (the former bare `except:` swallowed them).
    try:
        chndec = jn.loads(chn.decode(errors="ignore"))
        chncdp = chndec.get("codepage")
    except (AttributeError, TypeError, ValueError):
        return 'utf-8'

    if not chncdp:
        return 'utf-8'

    # If it's a number like "1252", Python expects "cp1252"
    name = str(chncdp)
    if name.isdigit():
        name = 'cp' + name

    # The caller passes the result straight to bytes.decode(); an unknown
    # codec name would raise LookupError there, so validate it here.
    try:
        codecs.lookup(name)
    except LookupError:
        return 'utf-8'
    return name
def iter_channel_numpy(self, channeluuid, bool include_x=True, unsigned long int chunk_rows=1000000, str mode="scaled", unsigned long int start_index=0):
    """
    Yield the data of one channel chunk by chunk as NumPy arrays.

    Args:
        channeluuid: channel UUID (str or bytes).
        include_x: also return the X values of every chunk.
        chunk_rows: number of rows requested from the C++ reader per chunk.
        mode: "raw" requests unscaled values; every other value is treated
              as "scaled" (no validation is performed).
        start_index: row index of the first sample to read.

    Yields:
        dict with "start" (row index of the chunk), "y" and, when
        include_x is true, "x".

    Raises:
        ValueError: when the byte buffer returned by the reader does not
            match count * itemsize of the dtype selected for it.
    """
    # Encode the UUID once instead of on every loop iteration.
    cdef bytes uuid = _as_bytes(channeluuid)
    cdef unsigned long int total_len = self.cppimc.get_channel_length(uuid)
    cdef unsigned long int start = start_index
    cdef channel_chunk chunk
    cdef cnp.ndarray x_arr
    cdef cnp.ndarray y_arr
    cdef bool raw_mode = (mode == "raw")

    # Map imc::numtype to numpy dtype
    # Types 9 (imc_devices_transitional_recording) and 10 (timestamp_ascii)
    # are not currently supported by the underlying C++ library.
    dtype_map = {
        1: np.uint8,    # unsigned_byte
        2: np.int8,     # signed_byte
        3: np.uint16,   # unsigned_short
        4: np.int16,    # signed_short
        5: np.uint32,   # unsigned_long (imc_Ulongint is unsigned int (32-bit) on x86_64 usually)
        6: np.int32,    # signed_long (imc_Slongint is signed int)
        7: np.float32,  # ffloat
        8: np.float64,  # ddouble
        11: np.uint16,  # two_byte_word_digital
        12: np.uint64,  # eight_byte_unsigned_long
        13: np.uint64,  # six_byte_unsigned_long (promoted to 8 bytes in C++)
        14: np.int64    # eight_byte_signed_long
    }

    while start < total_len:
        chunk = self.cppimc.read_channel_chunk(uuid, start, chunk_rows, include_x, raw_mode)

        # Create numpy arrays from bytes
        y_dtype = dtype_map.get(chunk.y_type, np.float64)
        y_arr = np.empty(chunk.count, dtype=y_dtype)

        # memcpy copies y_bytes.size() bytes: a larger buffer would write
        # past the array, a smaller one would leave uninitialised values.
        if chunk.y_bytes.size() != <size_t>cnp.PyArray_NBYTES(y_arr):
            raise ValueError(
                f"y buffer of {chunk.y_bytes.size()} bytes does not match "
                f"{chunk.count} values of dtype {np.dtype(y_dtype).name}"
            )
        if chunk.y_bytes.size() > 0:
            memcpy(cnp.PyArray_DATA(y_arr),
                   chunk.y_bytes.data(),
                   chunk.y_bytes.size())

        result = {
            "start": chunk.start,
            "y": y_arr
        }

        if include_x:
            x_dtype = dtype_map.get(chunk.x_type, np.float64)
            x_arr = np.empty(chunk.count, dtype=x_dtype)

            # Same size guard as for the Y buffer.
            if chunk.x_bytes.size() != <size_t>cnp.PyArray_NBYTES(x_arr):
                raise ValueError(
                    f"x buffer of {chunk.x_bytes.size()} bytes does not match "
                    f"{chunk.count} values of dtype {np.dtype(x_dtype).name}"
                )
            if chunk.x_bytes.size() > 0:
                memcpy(cnp.PyArray_DATA(x_arr),
                       chunk.x_bytes.data(),
                       chunk.x_bytes.size())

            result["x"] = x_arr

        yield result

        start += chunk.count
        # A chunk without rows cannot advance `start`; stop instead of looping forever.
        if chunk.count == 0:
            break


def get_channel_data(self, channeluuid, bool include_x=True, str mode="scaled"):
    """
    Return the complete data of one channel as NumPy arrays.

    Args:
        channeluuid: channel UUID (str or bytes).
        include_x: also return the X values.
        mode: forwarded unchanged to iter_channel_numpy().

    Returns:
        dict with "y" and, when include_x is true, "x". For a non-empty
        channel the dict also carries "start" (index of the first row).
        An empty channel yields empty arrays and no "start" entry.
    """
    cdef unsigned long int total_len = self.cppimc.get_channel_length(_as_bytes(channeluuid))
    if total_len == 0:
        res = {'y': np.array([])}
        if include_x:
            res['x'] = np.array([])
        return res

    # Ask for everything in one chunk, but drain the iterator: if the reader
    # delivers fewer rows per call than requested, taking only the first
    # chunk would silently truncate the channel.
    chunks = list(self.iter_channel_numpy(channeluuid, include_x, total_len, mode, 0))
    if len(chunks) == 1:
        return chunks[0]

    merged = {
        "start": chunks[0]["start"],
        "y": np.concatenate([part["y"] for part in chunks])
    }
    if include_x:
        merged["x"] = np.concatenate([part["x"] for part in chunks])
    return merged
channeluuid, outputfile, char delimiter, unsigned long int chunk_size=100000): + self.cppimc.print_channel(_as_bytes(channeluuid),_as_bytes(outputfile),delimiter,chunk_size) + def print_channels(self, outputdir, char delimiter, unsigned long int chunk_size=100000): + self.cppimc.print_channels(_as_bytes(outputdir),delimiter,chunk_size) # print table including channels - def print_table(self, string outputfile): + def print_table(self, outputfile): chnlst = self.cppimc.get_channels(True,True) chnlstjn = [jn.loads(chn.decode(errors="ignore")) for chn in chnlst] - with open(outputfile.decode(),'w') as fout: + with open(outputfile,'w') as fout: for chn in chnlstjn: fout.write('#' +str(chn['xname']).rjust(19)+str(chn['yname']).rjust(20)+'\n') fout.write('#'+str(chn['xunit']).rjust(19)+str(chn['yunit']).rjust(20)+'\n') diff --git a/python/makefile b/python/makefile index 6bb6ecd..878a7b3 100644 --- a/python/makefile +++ b/python/makefile @@ -11,25 +11,20 @@ setup-clean: rm -rf lib/ build: setup - python setup.py build - -build-inplace: setup - python setup.py build_ext --inplace + python3 -m pip install -e . 
build-sdist: setup - python setup.py sdist - python -m twine check dist/* + python3 -m build --sdist + python3 -m twine check dist/* build-bdist: setup - python setup.py bdist - python -m twine check dist/* + python3 -m build --wheel + python3 -m twine check dist/* build-clean: - python setup.py clean --all - rm -vf imctermite*.so imctermite*.cpp - rm -vf IMCtermite*.so IMCtermite*.cpp - rm -rvf dist/ IMCtermite.egg-info/ + rm -vf imctermite*.so imctermite*.pyd imctermite*.cpp rm -rvf dist/ imctermite.egg-info/ + rm -rvf build/ cibuildwheel-build: setup cibuildwheel --platform linux @@ -38,9 +33,9 @@ cibuildwheel-clean: rm -rvf wheelhouse/ pypi-upload: - python -m twine upload dist/$(shell ls -t dist/ | head -n1) + python3 -m twine upload dist/$(shell ls -t dist/ | head -n1) clean: setup build-clean cibuildwheel-clean setup-clean run-example: - PYTHONPATH=$(pwd) python examples/usage_files.py + PYTHONPATH=$(pwd) python3 examples/usage_files.py diff --git a/python/py.typed b/python/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/python/pyproject.toml b/python/pyproject.toml index 0e657f5..39b64ac 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,57 @@ [build-system] -requires = ["setuptools", "wheel","Cython"] +requires = ["setuptools>=77.0.0", "wheel", "Cython", "numpy"] build-backend = "setuptools.build_meta" +[project] +name = "imctermite" +description = "Enables extraction of measurement data from binary files with extension 'raw' used by proprietary software imcFAMOS and imcSTUDIO and facilitates its storage in open source file formats" +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +authors = [ + {name = "Record Evolution GmbH", email = "mario.fink@record-evolution.de"} +] +maintainers = [ + {name = "Record Evolution GmbH"} +] +keywords = ["IMC", "raw", "imcFAMOS", "imcSTUDIO", "imcCRONOS"] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + 
"Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering", + "Topic :: Software Development :: Libraries :: Python Modules" +] +dependencies = [ + "numpy>=1.26.0" +] +dynamic = ["version"] + +[project.optional-dependencies] +test = ["pytest>=7.0.0"] + +[project.urls] +Homepage = "https://github.com/RecordEvolution/IMCtermite.git" + +[tool.setuptools] +# This is a single extension module build, not a package with subdirectories +py-modules = [] +# Explicitly set packages to empty to prevent auto-discovery +packages = [] + +[tool.setuptools.dynamic] +version = {file = "VERSION"} + +[tool.setuptools.package-data] +"*" = ["py.typed", "*.pyi"] + [tool.cibuildwheel] -before-all = "" +# Build for Python 3.10-3.13 +build = "cp310-* cp311-* cp312-* cp313-*" +# Skip 32-bit builds and musllinux +skip = "*-win32 *-manylinux_i686 *-musllinux_*" +# Tests are already run in test.yml workflow before wheel building +test-skip = "*" diff --git a/python/setup.cfg b/python/setup.cfg deleted file mode 100644 index 1308c6e..0000000 --- a/python/setup.cfg +++ /dev/null @@ -1,23 +0,0 @@ - -[metadata] -name = imctermite -description = Enables extraction of measurement data from binary files with extension 'raw' used by proprietary software imcFAMOS and imcSTUDIO and facilitates its storage in open source file formats -long_description = file: README.md -# long_description_content_type = text/x-rst -long_description_content_type = text/markdown -version = file: VERSION -author = Record Evolution GmbH -author_email = mario.fink@record-evolution.de -maintainer = Record Evolution GmbH -url= https://github.com/RecordEvolution/IMCtermite.git -license = MIT License -license_files = LICENSE -keywords = IMC, raw, imcFAMOS, imcSTUDIO, imcCRONOS -classifiers = - Programming Language :: Python :: 3 - License :: OSI Approved :: MIT License - Operating 
System :: OS Independent - Topic :: Scientific/Engineering - Topic :: Software Development :: Libraries :: Python Modules - -[options] diff --git a/python/setup.py b/python/setup.py index 98ebef7..afb021d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,6 +1,7 @@ from setuptools import Extension, setup from Cython.Build import cythonize import sys +import numpy print("building on platform: "+sys.platform) @@ -13,9 +14,12 @@ extension = Extension( "imctermite", sources=["imctermite.pyx"], - extra_compile_args=cmpArgs[sys.platform] + include_dirs=[numpy.get_include()], + extra_compile_args=cmpArgs[sys.platform], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")] ) setup( - ext_modules=cythonize(extension,language_level=3) + ext_modules=cythonize(extension, language_level=3), + zip_safe=False ) diff --git a/src/main.cpp b/src/main.cpp index b19e0c7..989d30a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -3,6 +3,7 @@ #include #include #include +#include // #include "imc_key.hpp" // #include "imc_block.hpp" diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..715ea11 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,49 @@ +# IMCtermite Tests + +End-to-end tests for both the CLI tool and Python module. 
+ + +## Running Tests + +### All Tests +```bash +make test # Via makefile (builds if needed) +pytest # Direct pytest +``` + +### CLI Tests Only +```bash +make test-cli +pytest tests/test_cli.py +``` + +### Python Module Tests Only +```bash +make test-python +pytest tests/test_python.py +``` + +## Prerequisites + +### Recommended: Development install + +Install the package in editable mode with test dependencies (handles all requirements automatically): + +```bash +pip install -e "python[test]" +``` + +Then run tests with pytest: +```bash +pytest +``` + +### Alternative: Using makefile + +If you prefer `make test`, just install pytest first: + +```bash +pip install pytest +make test +``` + diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..e906689 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +""" +End-to-end tests for IMCtermite CLI tool +""" + +import pytest +import subprocess +import sys +from pathlib import Path + +PROJECT_ROOT = Path(__file__).parent.parent +CLI = PROJECT_ROOT / "imctermite" +if sys.platform == "win32": + CLI = CLI.with_suffix(".exe") +SAMPLES_DIR = PROJECT_ROOT / "samples" / "datasetA" + + +class TestCLIBasics: + """Test basic CLI functionality""" + + def test_cli_exists(self): + """CLI binary should exist""" + assert CLI.exists(), f"CLI not found at {CLI}" + + def test_help_output(self): + """Should display help message""" + result = subprocess.run([str(CLI), "--help"], capture_output=True, text=True) + assert result.returncode == 0 + assert "Usage:" in result.stdout or "usage:" in result.stdout.lower() + + def test_version_output(self): + """Should display version""" + result = subprocess.run([str(CLI), "--version"], capture_output=True, text=True) + assert result.returncode == 0 + assert len(result.stdout) > 0 + + def test_invalid_file_handling(self): + """Should fail gracefully on nonexistent file""" + result = subprocess.run( + [str(CLI), "/nonexistent/file.raw"], + 
capture_output=True, + text=True + ) + assert result.returncode != 0 + + +class TestChannelOperations: + """Test channel listing and data extraction""" + + @pytest.fixture + def sample_file(self): + """Get path to sample file""" + sample = SAMPLES_DIR / "datasetA_1.raw" + if not sample.exists(): + pytest.skip(f"Sample file not found: {sample}") + return sample + + def test_list_channels(self, sample_file): + """Should list channels with metadata""" + result = subprocess.run( + [str(CLI), str(sample_file), "--listchannels"], + capture_output=True, + text=True + ) + assert result.returncode == 0 + assert "uuid" in result.stdout + + def test_list_blocks(self, sample_file): + """Should list IMC blocks""" + result = subprocess.run( + [str(CLI), str(sample_file), "--listblocks"], + capture_output=True, + text=True + ) + assert result.returncode == 0 + # Block markers like CF, CK, CC, etc. + assert "C" in result.stdout and ("F" in result.stdout or "K" in result.stdout) + + +class TestCSVOutput: + """Test CSV file generation""" + + @pytest.fixture + def sample_file(self): + """Get path to sample file""" + sample = SAMPLES_DIR / "datasetA_1.raw" + if not sample.exists(): + pytest.skip(f"Sample file not found: {sample}") + return sample + + def test_generate_csv_output(self, sample_file, tmp_path): + """Should generate CSV files""" + output_dir = tmp_path / "csv_output" + output_dir.mkdir() + + result = subprocess.run( + [str(CLI), str(sample_file), "--output", str(output_dir)], + capture_output=True, + text=True + ) + assert result.returncode == 0 + + csv_files = list(output_dir.glob("*.csv")) + assert len(csv_files) > 0, "Should generate at least one CSV file" + + def test_csv_format_valid(self, sample_file, tmp_path): + """Generated CSV should have valid format""" + output_dir = tmp_path / "csv_output" + output_dir.mkdir() + + subprocess.run( + [str(CLI), str(sample_file), "--output", str(output_dir)], + capture_output=True + ) + + csv_files = 
def _sample_files(samples_root):
    """Return every .raw/.dat file below *samples_root* (recursively), sorted and de-duplicated."""
    # "**/" also matches the root directory itself, so no separate
    # non-recursive glob is needed.
    return sorted({*samples_root.glob("**/*.raw"), *samples_root.glob("**/*.dat")})


def _failed_runs(samples, samples_root, cli_args):
    """Run the CLI with *cli_args* on each sample and describe every non-zero exit."""
    failed = []
    for sample in samples:
        result = subprocess.run(
            [str(CLI), str(sample), *cli_args],
            capture_output=True,
            text=True,
            errors='replace'  # Handle non-UTF8 characters in output
        )
        if result.returncode != 0:
            failed.append(f"{sample.relative_to(samples_root)}: exit code {result.returncode}")
    return failed


class TestMultipleFiles:
    """Test processing multiple sample files"""

    @pytest.fixture
    def samples(self):
        """All sample files below the samples directory; skips when there are none."""
        samples_root = SAMPLES_DIR.parent
        if not samples_root.exists():
            pytest.skip(f"Samples directory not found: {samples_root}")

        found = _sample_files(samples_root)
        if not found:
            pytest.skip("No .raw or .dat files in samples directory")
        return found

    def test_process_all_sample_files(self, samples):
        """Should successfully process all .raw and .dat files in samples directory (list channels)"""
        failed = _failed_runs(samples, SAMPLES_DIR.parent, ["--listchannels"])
        assert len(failed) == 0, f"Failed to process {len(failed)}/{len(samples)} files: {failed}"

    def test_extract_all_sample_files_with_data(self, samples, tmp_path):
        """Should successfully extract data from all .raw and .dat files"""
        # tmp_path is created and cleaned up by pytest, replacing the manual
        # mkdtemp()/rmtree() pair.
        failed = _failed_runs(samples, SAMPLES_DIR.parent, ["--output", str(tmp_path)])
        assert len(failed) == 0, f"Failed to extract data from {len(failed)}/{len(samples)} files: {failed}"


class TestExitCodes:
    """Test exit code behavior"""

    def test_success_exit_code(self):
        """Should return 0 on success"""
        sample = SAMPLES_DIR / "datasetA_1.raw"
        if not sample.exists():
            pytest.skip("Sample file not found")

        result = subprocess.run(
            [str(CLI), str(sample), "--listchannels"],
            capture_output=True
        )
        assert result.returncode == 0

    def test_error_exit_code(self):
        """Should return non-zero on error"""
        result = subprocess.run(
            [str(CLI), "/nonexistent/file.raw"],
            capture_output=True
        )
        assert result.returncode != 0


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
+#!/usr/bin/env python3 +""" +End-to-end tests for IMCtermite Python module +""" + +import pytest +import os +import tempfile +import csv +import numpy as np +from pathlib import Path + +try: + import imctermite +except ImportError: + pytest.skip("imctermite module not built - run 'make python-build' first", allow_module_level=True) + +PROJECT_ROOT = Path(__file__).parent.parent +SAMPLES_DIR = PROJECT_ROOT / "samples" +DATASET_A = SAMPLES_DIR / "datasetA" +DATASET_B = SAMPLES_DIR / "datasetB" + + +class TestModuleImport: + """Test basic module functionality""" + + def test_module_imports(self): + """Module should import without errors""" + assert imctermite is not None + + def test_can_instantiate(self): + """Should create instance with valid file""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + + imc = imctermite.imctermite(str(sample_file).encode()) + assert imc is not None + + +class TestChannelListing: + """Test channel metadata retrieval""" + + @pytest.fixture + def imc_instance(self): + """Create IMC instance with sample file""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + return imctermite.imctermite(str(sample_file).encode()) + + def test_get_channel_list(self, imc_instance): + """Should return list of channel metadata""" + channels = imc_instance.get_channels(include_data=False) + assert isinstance(channels, list) + assert len(channels) > 0 + + def test_channel_metadata_structure(self, imc_instance): + """Channel metadata should have required fields""" + channels = imc_instance.get_channels(include_data=False) + first_channel = channels[0] + + # Check for expected keys + required_keys = ['name', 'uuid'] + for key in required_keys: + assert key in first_channel, f"Missing key: {key}" + + def test_get_channel_data(self, imc_instance): + """Should return channel data with xdata and 
class TestDataIntegrity:
    """Test data extraction and validation"""

    @pytest.fixture
    def sample_data(self):
        """Load sample file and extract data"""
        sample_file = DATASET_A / "datasetA_1.raw"
        if not sample_file.exists():
            pytest.skip(f"Sample file not found: {sample_file}")

        imc = imctermite.imctermite(str(sample_file).encode())
        return imc.get_channels(include_data=True)

    def test_data_arrays_not_empty(self, sample_data):
        """Data arrays should not be empty"""
        for channel in sample_data:
            assert len(channel['xdata']) > 0
            assert len(channel['ydata']) > 0

    def test_data_values_are_numeric(self, sample_data):
        """All data values should be numeric"""
        for channel in sample_data:
            # X values are spot-checked on the first 10 samples only.
            for x in channel['xdata'][:10]:
                assert isinstance(x, (int, float)), f"Non-numeric x value: {x!r}"
            # Y values are checked in full; the separate first-10 pass over
            # ydata was redundant with this loop and has been dropped.
            for y in channel['ydata']:
                assert isinstance(y, (int, float)), f"Non-numeric y value: {y!r}"
x_chunks = [] + + # Use a small chunk size to ensure we test chunking logic even on small files + # Some files might be very small, so 100 is a good stress test + for chunk in imc.iter_channel_numpy(uuid, include_x=True, chunk_rows=100, mode="scaled"): + y_chunks.append(chunk['y']) + x_chunks.append(chunk['x']) + + if not y_chunks: + assert len(ch_ref['ydata']) == 0 + continue + + y_full = np.concatenate(y_chunks) + x_full = np.concatenate(x_chunks) + + # Compare with reference + # Note: get_channels returns lists of floats. + # We compare them with numpy arrays. + + # Check lengths first + assert len(y_full) == len(ch_ref['ydata']), f"Length mismatch in {raw_file.name} channel {uuid}" + + # Check values + assert np.allclose(y_full, ch_ref['ydata'], equal_nan=True), f"Y data mismatch in {raw_file.name} channel {uuid}" + assert np.allclose(x_full, ch_ref['xdata'], equal_nan=True), f"X data mismatch in {raw_file.name} channel {uuid}" + + # Test with include_x=False + y_chunks_nox = [] + for chunk in imc.iter_channel_numpy(uuid, include_x=False, chunk_rows=100, mode="scaled"): + y_chunks_nox.append(chunk['y']) + assert 'x' not in chunk + + if y_chunks_nox: + y_full_nox = np.concatenate(y_chunks_nox) + assert np.allclose(y_full_nox, ch_ref['ydata'], equal_nan=True), f"Y data mismatch (no x) in {raw_file.name} channel {uuid}" + + # Test raw mode (basic check that it runs and returns correct length) + # We can't easily verify values without reimplementing the scaling logic, + # but we can check that it returns something valid. 
+ y_chunks_raw = [] + for chunk in imc.iter_channel_numpy(uuid, include_x=False, chunk_rows=100, mode="raw"): + y_chunks_raw.append(chunk['y']) + # Check that dtype is not float64 unless it really is float data + # Most samples are likely int16 or similar + # print(f"Raw dtype: {chunk['y'].dtype}") + + if y_chunks_raw: + y_full_raw = np.concatenate(y_chunks_raw) + assert len(y_full_raw) == len(ch_ref['ydata']), f"Raw length mismatch in {raw_file.name} channel {uuid}" + + + except Exception as e: + pytest.fail(f"Failed processing {raw_file.name}: {str(e)}") + + +class TestCSVOutput: + """Test CSV file generation""" + + @pytest.fixture + def imc_instance(self): + """Create IMC instance""" + sample_file = DATASET_A / "datasetA_1.raw" + if not sample_file.exists(): + pytest.skip(f"Sample file not found: {sample_file}") + return imctermite.imctermite(str(sample_file).encode()) + + def test_print_channel_to_csv(self, imc_instance, tmp_path): + """Should create CSV file for single channel""" + output_file = tmp_path / "test_channel.csv" + + channels = imc_instance.get_channels(include_data=False) + if len(channels) == 0: + pytest.skip("No channels in sample file") + + channel_uuid = channels[0]['uuid'] + imc_instance.print_channel(channel_uuid.encode(), str(output_file).encode(), b','[0]) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + def test_csv_format_valid(self, imc_instance, tmp_path): + """Generated CSV should be valid""" + output_file = tmp_path / "test_channel.csv" + + channels = imc_instance.get_channels(include_data=False) + if len(channels) == 0: + pytest.skip("No channels in sample file") + + channel_uuid = channels[0]['uuid'] + imc_instance.print_channel(channel_uuid.encode(), str(output_file).encode(), b','[0]) + + # Read and validate CSV + with open(output_file, 'r') as f: + reader = csv.reader(f) + rows = list(reader) + + assert len(rows) > 1, "CSV should have header and data" + assert len(rows[0]) == 2, "CSV should have 2 
class TestMultipleFiles:
    """Run the parser over the sample files shipped in the samples directory"""

    def test_process_all_sample_files(self):
        """Every .raw/.dat sample should open and report channels (metadata only)"""
        if not SAMPLES_DIR.exists():
            pytest.skip(f"Samples directory not found: {SAMPLES_DIR}")

        # Recursive discovery of both extensions, sorted for a stable order
        candidates = sorted([*SAMPLES_DIR.glob("**/*.raw"),
                             *SAMPLES_DIR.glob("**/*.dat")])
        if not candidates:
            pytest.skip("No .raw or .dat files in samples directory")

        total = len(candidates)
        ok_count = 0
        problems = []
        for path in candidates:
            try:
                reader = imctermite.imctermite(str(path).encode())
                if len(reader.get_channels(include_data=False)) > 0:
                    ok_count += 1
            except Exception as e:
                problems.append(f"{path.relative_to(SAMPLES_DIR)}: {e}")

        assert len(problems) == 0, f"Failed to process {len(problems)}/{total} files: {problems}"
        assert ok_count == total, f"Only {ok_count}/{total} files had channels"

    def test_extract_all_sample_files_with_data(self):
        """Every .raw/.dat sample should yield at least one channel carrying data"""
        if not SAMPLES_DIR.exists():
            pytest.skip(f"Samples directory not found: {SAMPLES_DIR}")

        # Recursive discovery of both extensions, sorted for a stable order
        candidates = sorted([*SAMPLES_DIR.glob("**/*.raw"),
                             *SAMPLES_DIR.glob("**/*.dat")])
        if not candidates:
            pytest.skip("No .raw or .dat files in samples directory")

        total = len(candidates)
        ok_count = 0
        problems = []
        for path in candidates:
            try:
                reader = imctermite.imctermite(str(path).encode())
                loaded = reader.get_channels(include_data=True)

                if len(loaded) == 0:
                    problems.append(f"{path.relative_to(SAMPLES_DIR)}: no channels found")
                    continue

                # A single channel with non-empty xdata or ydata is sufficient
                has_payload = any(
                    axis in entry and len(entry[axis]) > 0
                    for entry in loaded
                    for axis in ('xdata', 'ydata')
                )
                if has_payload:
                    ok_count += 1
                else:
                    problems.append(f"{path.relative_to(SAMPLES_DIR)}: no data in channels")
            except Exception as e:
                problems.append(f"{path.relative_to(SAMPLES_DIR)}: {e}")

        assert len(problems) == 0, f"Failed to extract data from {len(problems)}/{total} files: {problems}"
        assert ok_count == total, f"Only {ok_count}/{total} files extracted with data"

    def test_reload_different_file(self):
        """Two different files should load one after the other without interfering"""
        first_path = DATASET_A / "datasetA_1.raw"
        second_path = DATASET_A / "datasetA_2.raw"

        if not first_path.exists() or not second_path.exists():
            pytest.skip("Need at least 2 sample files")

        # Keep both reader objects referenced while the second file is loaded
        loaded = []
        for path in (first_path, second_path):
            reader = imctermite.imctermite(str(path).encode())
            loaded.append((reader, reader.get_channels(include_data=False)))

        # Both loads must have produced channels
        for _reader, channel_list in loaded:
            assert len(channel_list) > 0
class TestDataRegression:
    """Pin known values from sample files so parsing regressions are caught"""

    @pytest.mark.parametrize("file_path,expected", [
        # datasetA_1.raw - Standard .raw format with gravity unit
        ("datasetA/datasetA_1.raw", {
            'num_channels': 1,
            'data_length': 6000,
            'yunit': 'G',
            'xstepwidth': 0.005,
            'ydata_first': [0.010029276, 0.015780726],
            'ydata_last': [-0.02981583, -0.030068753],  # [-2], [-1]
            'xdata_first': [416.01],
        }),
        # sampleA.raw - Pressure data with mbar units
        ("sampleA.raw", {
            'num_channels': 1,
            'data_length': 2402,
            'yunit': '"mbar"',
            'xoffset': 2044.03,
            'ydata_first': [956.013793945, 955.484924316, 955.487670898],
            'ydata_last': [866.840881348, 866.91619873, 866.985290527],  # [-3], [-2], [-1]
        }),
        # XY_dataset_example.dat - Different .dat format with explicit X-Y data
        ("XY_dataset_example.dat", {
            'num_channels': 1,
            'data_length': 13094,
            'ydata_first': [0, 0, 0],
            'ydata_last': [2796202, 2796202, 2982616],  # [-3], [-2], [-1]
            'xdata_first': [67.855759, 67.880796],
            'xdata_last': [395.158317],
        }),
    ])
    def test_known_values(self, file_path, expected):
        """Compare the first channel of a sample file against recorded values.

        ``expected`` always carries ``num_channels``, ``data_length``,
        ``ydata_first`` and ``ydata_last``; the metadata and xdata keys are
        checked only when present.
        """
        sample_file = SAMPLES_DIR / file_path
        if not sample_file.exists():
            pytest.skip(f"Sample file not found: {sample_file}")

        reader = imctermite.imctermite(str(sample_file).encode())
        channels = reader.get_channels(include_data=True)

        # Channel count
        want_channels = expected['num_channels']
        assert len(channels) == want_channels, \
            f"Should have {want_channels} channel(s)"

        channel = channels[0]

        # Sample count
        ydata = channel.get('ydata', [])
        want_length = expected['data_length']
        assert len(ydata) == want_length, \
            f"Should have {want_length} data points"

        # Optional metadata
        if 'yunit' in expected:
            assert channel.get('yunit') == expected['yunit'], \
                f"Unit should be {expected['yunit']}"

        for key, label in (('xstepwidth', "X step width"), ('xoffset', "X offset")):
            if key in expected:
                assert abs(float(channel.get(key)) - expected[key]) < 1e-9, \
                    f"{label} should be {expected[key]}"

        tolerance = 1e-6  # Default tolerance for floating-point comparisons

        def check_y(index, want):
            # Float expectations are compared within tolerance, anything else exactly
            if isinstance(want, float):
                assert abs(ydata[index] - want) < tolerance, \
                    f"ydata[{index}] should be {want}"
            else:
                assert ydata[index] == want, \
                    f"ydata[{index}] should be {want}"

        def check_x(xdata, index, want):
            # xdata expectations are always compared within tolerance
            assert abs(xdata[index] - want) < tolerance, \
                f"xdata[{index}] should be {want}"

        # Leading ydata values use indices 0, 1, ...
        for index, want in enumerate(expected['ydata_first']):
            check_y(index, want)

        # Trailing ydata values use negative indices ending at -1
        y_tail = expected['ydata_last']
        for index, want in zip(range(-len(y_tail), 0), y_tail):
            check_y(index, want)

        # Optional xdata checks, indexed the same way as ydata
        if 'xdata_first' in expected:
            xdata = channel.get('xdata', [])
            for index, want in enumerate(expected['xdata_first']):
                check_x(xdata, index, want)

        if 'xdata_last' in expected:
            xdata = channel.get('xdata', [])
            x_tail = expected['xdata_last']
            for index, want in zip(range(-len(x_tail), 0), x_tail):
                check_x(xdata, index, want)
class TestErrorHandling:
    """Test error conditions"""

    def test_nonexistent_file(self):
        """Should raise an error for a nonexistent file"""
        with pytest.raises(Exception):
            imctermite.imctermite(b"/nonexistent/file.raw")

    def test_invalid_channel_name(self):
        """Should handle an invalid channel name gracefully.

        Two outcomes are accepted: ``print_channel`` raises, or it returns and
        leaves at most a tiny (header-only) output file behind.
        """
        sample_file = DATASET_A / "datasetA_1.raw"
        if not sample_file.exists():
            pytest.skip(f"Sample file not found: {sample_file}")

        imc = imctermite.imctermite(str(sample_file).encode())

        # Reserve a path outside the try block so the cleanup below never
        # refers to an unbound name if temp-file creation itself fails.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
            output_file = f.name

        try:
            try:
                imc.print_channel(b"NONEXISTENT_CHANNEL_UUID", output_file.encode(), b','[0])
            except Exception:
                # Raising an exception is acceptable behavior
                return

            # Only the call above is guarded: AssertionError derives from
            # Exception, so asserting inside that handler's scope would make
            # the size check impossible to fail.
            if os.path.exists(output_file):
                size = os.path.getsize(output_file)
                # Either file doesn't exist or is very small (just header)
                assert size < 100
        finally:
            if os.path.exists(output_file):
                os.unlink(output_file)
class TestStreaming:
    """Test iter_channel_numpy functionality"""

    @pytest.fixture
    def imc_instance(self):
        """Create an IMC instance for datasetA_1.raw, skipping if it is absent"""
        sample_file = DATASET_A / "datasetA_1.raw"
        if not sample_file.exists():
            pytest.skip(f"Sample file not found: {sample_file}")
        return imctermite.imctermite(str(sample_file).encode())

    @pytest.fixture
    def first_channel_uuid(self, imc_instance):
        """Return the UUID of the first channel in the sample file"""
        channels = imc_instance.get_channels(include_data=False)
        assert len(channels) > 0
        return channels[0]['uuid']

    def test_iter_channel_numpy_scaled(self, imc_instance, first_channel_uuid):
        """Streamed (default, scaled) y data should match get_channels output"""
        # Ground truth from the non-streaming API
        full_channels = imc_instance.get_channels(include_data=True)
        target_channel = next(ch for ch in full_channels if ch['uuid'] == first_channel_uuid)
        expected_y = np.array(target_channel['ydata'])

        # The UUID is passed to the binding as bytes
        uuid_bytes = first_channel_uuid.encode('utf-8')
        streamed_y = []
        for chunk in imc_instance.iter_channel_numpy(uuid_bytes, chunk_rows=100):
            assert 'y' in chunk
            assert isinstance(chunk['y'], np.ndarray)
            assert chunk['y'].dtype == np.float64  # Scaled should be float64
            streamed_y.append(chunk['y'])

        # np.concatenate raises an opaque ValueError on an empty list, so fail
        # with a clear message first.
        assert streamed_y, "iter_channel_numpy yielded no chunks"
        full_streamed_y = np.concatenate(streamed_y)

        np.testing.assert_allclose(full_streamed_y, expected_y, rtol=1e-4)

    def test_iter_channel_numpy_raw(self, imc_instance, first_channel_uuid):
        """Raw-mode streaming should yield numpy arrays containing data"""
        # Raw values are not compared against scaled ones here because that
        # would need the channel's factor/offset; only types and presence of
        # data are checked.
        uuid_bytes = first_channel_uuid.encode('utf-8')
        streamed_y_raw = []
        for chunk in imc_instance.iter_channel_numpy(uuid_bytes, chunk_rows=100, mode="raw"):
            assert 'y' in chunk
            assert isinstance(chunk['y'], np.ndarray)
            # The raw dtype depends on the file, so it is deliberately not asserted
            streamed_y_raw.append(chunk['y'])

        assert streamed_y_raw, "iter_channel_numpy yielded no chunks"
        full_streamed_y_raw = np.concatenate(streamed_y_raw)

        # Ensure we got data
        assert len(full_streamed_y_raw) > 0

    def test_chunking_behavior(self, imc_instance, first_channel_uuid):
        """Small chunks should be full-sized and have contiguous 'start' indices"""
        chunk_size = 10
        uuid_bytes = first_channel_uuid.encode('utf-8')
        chunks = list(imc_instance.iter_channel_numpy(uuid_bytes, chunk_rows=chunk_size))

        # Without this the loops below would pass vacuously on an empty stream
        assert chunks, "iter_channel_numpy yielded no chunks"

        # Every chunk except the last must be exactly chunk_size long
        for chunk in chunks[:-1]:
            assert len(chunk['y']) == chunk_size

        # The last chunk may be shorter but must never exceed the request
        assert len(chunks[-1]['y']) <= chunk_size

        # Each chunk must start where the previous one ended
        expected_start = 0
        for chunk in chunks:
            assert chunk['start'] == expected_start
            expected_start += len(chunk['y'])

    def test_include_x_parameter(self, imc_instance, first_channel_uuid):
        """With include_x=False chunks should carry 'y' but no 'x'"""
        uuid_bytes = first_channel_uuid.encode('utf-8')
        chunks = list(imc_instance.iter_channel_numpy(uuid_bytes, include_x=False, chunk_rows=100))

        # Guard against a vacuous pass when nothing is yielded
        assert chunks, "iter_channel_numpy yielded no chunks"
        for chunk in chunks:
            assert 'y' in chunk
            assert 'x' not in chunk

    def test_invalid_channel_uuid(self, imc_instance):
        """An unknown UUID should surface as RuntimeError"""
        # Based on C++ code: throw std::runtime_error("channel does not exist:" + uuid);
        # Cython should propagate this as RuntimeError. list() forces the
        # iterator to run in case the error is only raised on first iteration.
        with pytest.raises(RuntimeError):
            list(imc_instance.iter_channel_numpy(b"non-existent-uuid"))