From 1a9490cf606d4eb466feeb5e04a602ad5f653a3f Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Fri, 16 Jan 2026 06:24:21 +0100 Subject: [PATCH 1/5] feat: add TextExtractor and TableExtractor traits to nvisy-document - Add TextExtractor trait for native text extraction from documents - extract_text() returns ExtractedText with raw, by-page, by-region text - extract_text_for_page() for single page extraction - needs_ocr() heuristic check for scanned documents - Add TableExtractor trait for table extraction and normalization - extract_tables() returns Vec<NormalizedTable> - NormalizedTable, NormalizedRow, NormalizedCell types - CellDataType inference (Text, Number, Date, Boolean, Formula, Empty) - Add prelude.rs to nvisy-archive crate - Refactor nvisy-core: - ContentData now supports Bytes or HipStr via ContentBytes enum - Content struct wraps ContentData + optional ContentMetadata - Remove from_file_extension/common_extensions from ContentKind - Move extension-to-kind mapping to nvisy-archive - Add runtime documentation: - docs/README.md - overview and crate structure - docs/PIPELINE.md - processing stages with pseudocode - docs/DATATYPES.md - core data structures --- .github/workflows/build.yml | 3 +- .github/workflows/security.yml | 3 +- Cargo.lock | 3 + Cargo.toml | 5 +- crates/nvisy-archive/Cargo.toml | 4 + crates/nvisy-archive/README.md | 11 +- crates/nvisy-archive/src/file/archive_type.rs | 29 +- crates/nvisy-archive/src/file/mod.rs | 112 ++- crates/nvisy-archive/src/handler/mod.rs | 233 ++++++- crates/nvisy-archive/src/lib.rs | 81 +-- crates/nvisy-archive/src/prelude.rs | 12 + crates/nvisy-core/Cargo.toml | 1 + crates/nvisy-core/README.md | 37 +- crates/nvisy-core/src/error/error_source.rs | 37 +- crates/nvisy-core/src/error/error_type.rs | 27 +- crates/nvisy-core/src/error/mod.rs | 193 +++--- crates/nvisy-core/src/fs/content_file.rs | 27 +- crates/nvisy-core/src/fs/content_kind.rs | 110 +-- crates/nvisy-core/src/fs/content_metadata.rs | 29 +- crates/nvisy-core/src/fs/data_sensitivity.rs | 108 +-- .../nvisy-core/src/fs/data_structure_kind.rs | 130 ---- crates/nvisy-core/src/fs/mod.rs | 79 +-- crates/nvisy-core/src/io/content.rs | 291 +++++--- crates/nvisy-core/src/io/content_data.rs | 378 +++++++++-- crates/nvisy-core/src/io/content_read.rs | 8 +- crates/nvisy-core/src/io/data_reference.rs | 38 +- crates/nvisy-core/src/io/mod.rs | 2 +- crates/nvisy-core/src/lib.rs | 18 - crates/nvisy-core/src/prelude.rs | 4 +- crates/nvisy-document/README.md | 44 +- crates/nvisy-document/src/lib.rs | 39 +- crates/nvisy-document/src/table/mod.rs | 86 +++ crates/nvisy-document/src/table/types.rs | 446 ++++++++++++ crates/nvisy-document/src/text/mod.rs | 69 ++ crates/nvisy-document/src/text/types.rs | 162 +++++ crates/nvisy-docx/src/lib.rs | 16 +- crates/nvisy-pdf/src/lib.rs | 14 +- crates/nvisy-text/src/lib.rs | 15 +- docs/DATATYPES.md | 445 ++++++++++++ docs/PIPELINE.md | 635 ++++++++++++++++++ docs/README.md | 31 + 41 files changed, 3002 insertions(+), 1013 deletions(-) create mode 100644 crates/nvisy-archive/src/prelude.rs delete mode 100644 crates/nvisy-core/src/fs/data_structure_kind.rs create mode 100644 crates/nvisy-document/src/table/mod.rs create mode 100644 crates/nvisy-document/src/table/types.rs create mode 100644 crates/nvisy-document/src/text/mod.rs create mode 100644 crates/nvisy-document/src/text/types.rs create mode 100644 docs/DATATYPES.md create mode 100644 docs/PIPELINE.md create mode 100644 docs/README.md diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index
8a068ef..d4f3796 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,7 +2,8 @@ name: Build on: push: - branches: [main, release] + branches: + - "main" paths: - "crates/**" - "Cargo.toml" diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 835a818..1e84721 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -2,7 +2,8 @@ name: Security on: push: - branches: [main, release] + branches: + - "main" paths: - "crates/**" - "Cargo.toml" diff --git a/Cargo.lock b/Cargo.lock index 33d3ba1..0fc610e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -614,10 +614,12 @@ dependencies = [ name = "nvisy-archive" version = "0.1.0" dependencies = [ + "bytes", "bzip2", "flate2", "nvisy-core", "sevenz-rust", + "strum", "tar", "tempfile", "tokio", @@ -631,6 +633,7 @@ name = "nvisy-core" version = "0.1.0" dependencies = [ "bytes", + "derive_more", "hex", "hipstr", "jiff", diff --git a/Cargo.toml b/Cargo.toml index 8092a32..c81f9cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,8 +14,8 @@ members = [ [workspace.package] version = "0.1.0" -rust-version = "1.89" -edition = "2021" +rust-version = "1.92" +edition = "2024" license = "MIT" publish = false @@ -76,6 +76,7 @@ isolang = { version = "2.4", default-features = false, features = ["english_name # Text processing and pattern matching regex = { version = "1.11", default-features = false, features = [] } +regex-lite = { version = "0.1", default-features = false, features = ["std"] } fancy-regex = { version = "0.16", default-features = false, features = [] } aho-corasick = { version = "1.1", default-features = false, features = [] } unicode-segmentation = { version = "1.10", default-features = false, features = [] } diff --git a/crates/nvisy-archive/Cargo.toml b/crates/nvisy-archive/Cargo.toml index 38742ae..706468b 100644 --- a/crates/nvisy-archive/Cargo.toml +++ b/crates/nvisy-archive/Cargo.toml @@ -28,6 +28,10 @@ xz = ["dep:xz2"] [dependencies] # Core dependencies nvisy-core = { workspace = true } +bytes = { workspace = true } + +# Utilities +strum = { workspace = true, features = ["derive"] } # Async and I/O tokio = { workspace = true, features = ["fs", "io-util", "rt"] } diff --git a/crates/nvisy-archive/README.md b/crates/nvisy-archive/README.md index 05cdbf7..fce88ab 100644 --- a/crates/nvisy-archive/README.md +++ b/crates/nvisy-archive/README.md @@ -6,6 +6,15 @@ Archive handling and compression library for the Nvisy runtime. ## Features +- `zip` - ZIP archive support (enabled by default) +- `tar` - TAR archive support (enabled by default) +- `sevenz` - 7z archive support +- `gzip` - GZIP compression support (enabled by default) +- `bzip2` - BZIP2 compression support (enabled by default) +- `xz` - XZ/LZMA compression support (enabled by default) + +## Capabilities + - **Multiple Formats** - ZIP, TAR, TAR.GZ, TAR.BZ2, TAR.XZ, GZIP, BZIP2, and XZ - **Async Operations** - Full async/await support with Tokio - **Flexible Loading** - Load from file paths, memory, or byte streams @@ -13,7 +22,7 @@ Archive handling and compression library for the Nvisy runtime. 
- **Memory Efficient** - Stream-based processing for large archives - **Cross-Platform** - Works on Windows, macOS, and Linux -## Key Dependencies +## Dependencies - `tokio` - Async runtime for I/O operations - `tar` - TAR archive format support diff --git a/crates/nvisy-archive/src/file/archive_type.rs b/crates/nvisy-archive/src/file/archive_type.rs index 2ccda40..fdcaa58 100644 --- a/crates/nvisy-archive/src/file/archive_type.rs +++ b/crates/nvisy-archive/src/file/archive_type.rs @@ -4,7 +4,8 @@ //! and provides utilities for working with archive types. use std::ffi::OsStr; -use std::fmt; + +use strum::{AsRefStr, Display, EnumIter, EnumString}; /// Supported archive types /// @@ -12,24 +13,34 @@ use std::fmt; /// It provides methods to determine the archive type from file extensions /// and to get the supported extensions for each type. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(AsRefStr, Display, EnumIter, EnumString)] pub enum ArchiveType { /// ZIP archive format + #[strum(serialize = "ZIP")] Zip, /// TAR archive format (uncompressed) + #[strum(serialize = "TAR")] Tar, /// GZIP compressed TAR archive + #[strum(serialize = "TAR.GZ")] TarGz, /// BZIP2 compressed TAR archive + #[strum(serialize = "TAR.BZ2")] TarBz2, /// XZ compressed TAR archive + #[strum(serialize = "TAR.XZ")] TarXz, /// GZIP compression (single file) + #[strum(serialize = "GZIP")] Gz, /// BZIP2 compression (single file) + #[strum(serialize = "BZIP2")] Bz2, /// XZ compression (single file) + #[strum(serialize = "XZ")] Xz, /// 7-Zip archive format + #[strum(serialize = "7Z")] SevenZ, } @@ -127,22 +138,6 @@ impl ArchiveType { } } -impl fmt::Display for ArchiveType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Zip => write!(f, "ZIP"), - Self::Tar => write!(f, "TAR"), - Self::TarGz => write!(f, "TAR.GZ"), - Self::TarBz2 => write!(f, "TAR.BZ2"), - Self::TarXz => write!(f, "TAR.XZ"), - Self::Gz => write!(f, "GZIP"), - Self::Bz2 => write!(f, "BZIP2"), - Self::Xz => write!(f, "XZ"), - Self::SevenZ => write!(f, "7Z"), - } - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/nvisy-archive/src/file/mod.rs b/crates/nvisy-archive/src/file/mod.rs index a1abe1c..c2eeef9 100644 --- a/crates/nvisy-archive/src/file/mod.rs +++ b/crates/nvisy-archive/src/file/mod.rs @@ -10,22 +10,27 @@ use std::io::Cursor; use std::path::{Path, PathBuf}; pub use archive_type::ArchiveType; +use bytes::Bytes; use tempfile::TempDir; use tokio::fs; use crate::handler::ArchiveHandler; #[cfg(feature = "zip")] use crate::ZipResultExt; -use crate::{ArchiveErrorExt, Error, Result}; +use crate::{ArchiveErrorExt, ContentData, ContentSource, Error, Result}; /// Represents an archive file that can be loaded from various sources /// /// This struct encapsulates an archive and provides methods for /// extracting its contents to a temporary directory for processing. +/// It integrates with nvisy-core's `ContentData` and `ContentSource` +/// for content tracking and integrity verification. 
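As a quick orientation before the struct definition, here is a minimal usage sketch. It assumes the `from_path` constructor implied by the doc comments below (shown here as synchronous and fallible) and a placeholder `input/archive.zip`; only `unpack()`, `file_count()`, and extension-based type detection are confirmed by this patch.

```rust
use nvisy_archive::prelude::*;

async fn unpack_zip() -> Result<usize> {
    // `from_path` (assumed name) detects ArchiveType::Zip from the ".zip"
    // extension and assigns a fresh ContentSource for lineage tracking.
    let archive = ArchiveFile::from_path("input/archive.zip")?;

    // `unpack` extracts into a managed temporary directory and hands back
    // an ArchiveHandler carrying the same ContentSource.
    let handler = archive.unpack().await?;
    Ok(handler.file_count())
}
```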
#[derive(Debug)] pub struct ArchiveFile { + /// Unique identifier for this archive content + content_source: ContentSource, /// Type of archive - pub archive_type: ArchiveType, + archive_type: ArchiveType, /// Source data for the archive source: ArchiveSource, } @@ -35,16 +40,15 @@ pub struct ArchiveFile { enum ArchiveSource { /// Archive loaded from a file path Path(PathBuf), - /// Archive loaded from memory - Memory(Vec<u8>), - /// Archive loaded from an iterator - Iterator(Vec<u8>), + /// Archive loaded from ContentData (memory with metadata) + ContentData(ContentData), } impl ArchiveFile { /// Create a new archive file from a file path /// /// The archive type is automatically detected from the file extension. + /// A new `ContentSource` is generated to track this archive. /// /// # Example /// /// ``` @@ -82,45 +86,51 @@ impl ArchiveFile { .ok_or_else(|| Error::unsupported_format(extension.to_string_lossy().to_string()))?; Ok(Self { + content_source: ContentSource::new(), archive_type, source: ArchiveSource::Path(path.to_path_buf()), }) } - /// Create a new archive file from memory with explicit archive type + /// Create a new archive file from ContentData + /// + /// This preserves the content source from the provided ContentData, + /// maintaining content lineage tracking. /// /// # Example /// /// ``` - /// use nvisy_archive::{ArchiveFile, ArchiveType}; + /// use nvisy_archive::{ArchiveFile, ArchiveType, ContentData}; /// - /// let data = vec![0x50, 0x4B, 0x03, 0x04]; // ZIP signature - /// let archive = ArchiveFile::from_memory(ArchiveType::Zip, data); + /// let data = ContentData::from(vec![0x50, 0x4B, 0x03, 0x04]); // ZIP signature + /// let archive = ArchiveFile::from_content_data(ArchiveType::Zip, data); /// ``` - pub fn from_memory(archive_type: ArchiveType, data: Vec<u8>) -> Self { + pub fn from_content_data(archive_type: ArchiveType, content_data: ContentData) -> Self { Self { + content_source: content_data.content_source, archive_type, - source: ArchiveSource::Memory(data), + source: ArchiveSource::ContentData(content_data), } } - /// Create a new archive file from an iterator of bytes + /// Create a new archive file from raw bytes with explicit archive type /// - /// The iterator will be consumed immediately and stored in memory. + /// A new `ContentSource` is generated to track this archive.
/// /// # Example /// /// ``` /// use nvisy_archive::{ArchiveFile, ArchiveType}; /// - /// let data = [0x50, 0x4B, 0x03, 0x04]; // ZIP signature - /// let archive = ArchiveFile::from_iterator(ArchiveType::Zip, data.into_iter()); + /// let data = vec![0x50, 0x4B, 0x03, 0x04]; // ZIP signature + /// let archive = ArchiveFile::from_bytes(ArchiveType::Zip, data); /// ``` - pub fn from_iterator(archive_type: ArchiveType, data: impl Iterator<Item = u8>) -> Self { - let data: Vec<u8> = data.collect(); + pub fn from_bytes(archive_type: ArchiveType, data: impl Into<Bytes>) -> Self { + let content_data = ContentData::from(data.into()); Self { + content_source: content_data.content_source, archive_type, - source: ArchiveSource::Iterator(data), + source: ArchiveSource::ContentData(content_data), } } @@ -130,6 +140,11 @@ impl ArchiveFile { self } + /// Get the content source identifier for this archive + pub fn content_source(&self) -> ContentSource { + self.content_source + } + /// Get the archive type pub fn archive_type(&self) -> ArchiveType { self.archive_type } @@ -139,7 +154,7 @@ pub async fn exists(&self) -> bool { match &self.source { ArchiveSource::Path(path) => fs::try_exists(path).await.unwrap_or(false), - ArchiveSource::Memory(_) | ArchiveSource::Iterator(_) => true, + ArchiveSource::ContentData(_) => true, } } @@ -147,18 +162,33 @@ pub fn path(&self) -> Option<&Path> { match &self.source { ArchiveSource::Path(path) => Some(path), - _ => None, + ArchiveSource::ContentData(_) => None, } } - /// Get the size of the archive data + /// Get the size of the archive data in bytes pub async fn size(&self) -> Result<u64> { match &self.source { ArchiveSource::Path(path) => { let metadata = fs::metadata(path).await?; Ok(metadata.len()) } - ArchiveSource::Memory(data) | ArchiveSource::Iterator(data) => Ok(data.len() as u64), + ArchiveSource::ContentData(data) => Ok(data.size() as u64), } } + + /// Get the SHA256 hash of the archive content + /// + /// For file-based archives, this reads the file first. + /// For memory-based archives, the hash is computed lazily.
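A short usage sketch of the `size()` and `sha256()` helpers; `zip_bytes` stands in for real archive bytes, and the caller is assumed to return the crate's `Result`.

```rust
use nvisy_archive::prelude::*;

async fn inspect(zip_bytes: Vec<u8>) -> Result<()> {
    // Both helpers behave the same for path-backed and memory-backed archives.
    let archive = ArchiveFile::from_bytes(ArchiveType::Zip, zip_bytes);
    let size = archive.size().await?;     // length of the backing data in bytes
    let digest = archive.sha256().await?; // hex-encoded SHA-256 of the content
    println!("{size} bytes, sha256 = {digest}");
    Ok(())
}
```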
+ pub async fn sha256(&self) -> Result<String> { + match &self.source { + ArchiveSource::Path(path) => { + let data = fs::read(path).await?; + let content_data = ContentData::from(data); + Ok(content_data.sha256_hex()) + } + ArchiveSource::ContentData(data) => Ok(data.sha256_hex()), } } @@ -198,14 +228,15 @@ Error::invalid_archive(format!("Failed to create temporary directory: {}", e)) })?; - // Get archive data as bytes - let data = self.get_data().await?; - let cursor = Cursor::new(data); + // Get archive data as ContentData + let content_data = self.get_content_data().await?; + let cursor = Cursor::new(content_data.as_bytes().to_vec()); // Extract based on archive type let files = self.extract_archive(cursor, temp_dir.path()).await?; Ok(ArchiveHandler::new( + self.content_source, self.archive_type, self.path().map(|p| p.to_path_buf()), temp_dir, @@ -213,11 +244,14 @@ )) } - /// Get the archive data as bytes - async fn get_data(&self) -> Result<Vec<u8>> { + /// Get the archive data as ContentData + async fn get_content_data(&self) -> Result<ContentData> { match &self.source { - ArchiveSource::Path(path) => fs::read(path).await.map_err(Into::into), - ArchiveSource::Memory(data) | ArchiveSource::Iterator(data) => Ok(data.clone()), + ArchiveSource::Path(path) => { + let data = fs::read(path).await?; + Ok(ContentData::new(self.content_source, data.into())) + } + ArchiveSource::ContentData(data) => Ok(data.clone()), } } @@ -597,18 +631,22 @@ mod tests { use super::*; #[test] - fn test_archive_file_from_memory() { + fn test_archive_file_from_bytes() { let data = vec![0x50, 0x4B, 0x03, 0x04]; // ZIP signature - let archive = ArchiveFile::from_memory(ArchiveType::Zip, data); + let archive = ArchiveFile::from_bytes(ArchiveType::Zip, data); assert_eq!(archive.archive_type(), ArchiveType::Zip); assert!(archive.path().is_none()); + // Content source should be valid + assert!(!archive.content_source().as_uuid().is_nil()); } #[test] - fn test_archive_file_from_iterator() { - let data = [0x50, 0x4B, 0x03, 0x04]; // ZIP signature - let archive = ArchiveFile::from_iterator(ArchiveType::Zip, data.into_iter()); - assert_eq!(archive.archive_type(), ArchiveType::Zip); + fn test_archive_file_from_content_data() { + let content_data = ContentData::from(vec![0x50, 0x4B, 0x03, 0x04]); + let original_source = content_data.content_source; + let archive = ArchiveFile::from_content_data(ArchiveType::Zip, content_data); + // Should preserve the original content source + assert_eq!(archive.content_source(), original_source); } #[test] @@ -635,7 +673,7 @@ #[tokio::test] async fn test_memory_size() { let data = vec![1, 2, 3, 4, 5]; - let archive = ArchiveFile::from_memory(ArchiveType::Zip, data); + let archive = ArchiveFile::from_bytes(ArchiveType::Zip, data); assert_eq!(archive.size().await.unwrap(), 5); } } diff --git a/crates/nvisy-archive/src/handler/mod.rs b/crates/nvisy-archive/src/handler/mod.rs index 40a8398..ef3415b 100644 --- a/crates/nvisy-archive/src/handler/mod.rs +++ b/crates/nvisy-archive/src/handler/mod.rs @@ -15,7 +15,39 @@ pub use tar_handler::{TarArchiveBuilder, TarArchiveHandler, TarDirectoryBuilder, use tempfile::TempDir; pub use zip_handler::{ZipArchiveBuilder, ZipArchiveHandler, ZipDirectoryBuilder, ZipEntryInfo}; -use crate::{ArchiveErrorExt, ArchiveType, Error, Result}; +use crate::{ + ArchiveErrorExt, ArchiveType, ContentKind, ContentMetadata, ContentSource, Error, Result, +}; + +/// Detect content kind from file extension +/// +/// This function maps common file extensions
to their content kind categories. +fn content_kind_from_extension(extension: &str) -> ContentKind { + let ext = extension.to_lowercase(); + match ext.as_str() { + // Text formats + "txt" | "text" | "md" | "markdown" | "rst" | "xml" | "json" | "yaml" | "yml" | "toml" + | "ini" | "cfg" | "conf" | "log" => ContentKind::Text, + + // Document formats + "pdf" | "doc" | "docx" | "rtf" | "odt" | "pages" => ContentKind::Document, + + // Spreadsheet formats + "csv" | "tsv" | "xls" | "xlsx" | "ods" | "numbers" => ContentKind::Spreadsheet, + + // Image formats + "jpg" | "jpeg" | "png" | "gif" | "bmp" | "svg" | "webp" | "ico" | "tiff" | "tif" => { + ContentKind::Image + } + + // Archive formats + "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "tgz" | "tbz2" | "txz" => { + ContentKind::Archive + } + + _ => ContentKind::Unknown, + } +} /// Handler for unpacked archive contents /// @@ -24,6 +56,8 @@ use crate::{ArchiveErrorExt, ArchiveType, Error, Result}; /// and repacking the archive. #[derive(Debug)] pub struct ArchiveHandler { + /// Content source identifier for the original archive + pub content_source: ContentSource, /// Type of the original archive pub archive_type: ArchiveType, /// Original archive file path (if loaded from file) @@ -39,12 +73,14 @@ impl ArchiveHandler { /// /// This is typically called internally by `ArchiveFile::unpack()`. pub fn new( + content_source: ContentSource, archive_type: ArchiveType, original_path: Option<PathBuf>, temp_dir: TempDir, files: Vec<PathBuf>, ) -> Self { Self { + content_source, archive_type, original_path, temp_dir, @@ -87,6 +123,44 @@ impl ArchiveHandler { }) } + /// Find files matching a specific content kind + pub fn find_files_by_kind(&self, kind: ContentKind) -> Vec<&PathBuf> { + self.find_files(|path| self.content_kind_for_path(path) == kind) + } + + /// Get the content kind for a file path based on its extension + pub fn content_kind_for_path(&self, path: &Path) -> ContentKind { + path.extension() + .and_then(|ext| ext.to_str()) + .map(content_kind_from_extension) + .unwrap_or_default() + } + + /// Create content metadata for a file using its relative path within the archive + /// + /// The returned metadata has a freshly generated ContentSource + /// and includes the relative path within the archive. + pub fn content_metadata_for_file(&self, relative_path: impl AsRef<Path>) -> ContentMetadata { + ContentMetadata::with_path(ContentSource::new(), relative_path.as_ref()) + } + + /// Get content metadata for all files in the archive + /// + /// Returns a list of ContentMetadata entries for each extracted file, + /// using relative paths within the archive.
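To make the classification helpers concrete, here is a sketch of filtering unpacked entries by kind; it assumes `handler` came from `ArchiveFile::unpack()` and that the archive actually contains a PDF named `report.pdf`.

```rust
use std::path::Path;

use nvisy_archive::prelude::*;

fn summarize(handler: &ArchiveHandler) {
    // Coarse, extension-based classification of the extracted entries.
    for path in handler.find_files_by_kind(ContentKind::Document) {
        println!("document: {}", path.display());
    }

    // The same heuristic applied to a single path.
    assert_eq!(
        handler.content_kind_for_path(Path::new("report.pdf")),
        ContentKind::Document
    );
}
```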
+ pub fn all_content_metadata(&self) -> Result<Vec<ContentMetadata>> { + let temp_path = self.temp_path(); + self.files + .iter() + .map(|path| { + let relative = path + .strip_prefix(temp_path) + .map_err(|e| Error::invalid_archive(format!("Invalid file path: {}", e)))?; + Ok(ContentMetadata::with_path(ContentSource::new(), relative)) + }) + .collect() + } + /// Get all files recursively in the temporary directory pub fn refresh_file_list(&mut self) -> Result<()> { self.files = Self::scan_files(self.temp_path())?; @@ -281,8 +355,10 @@ mod tests { fn test_archive_handler_creation() { let temp_dir = TempDir::new().unwrap(); let files = vec![PathBuf::from("test.txt")]; + let content_source = ContentSource::new(); let handler = ArchiveHandler::new( + content_source, ArchiveType::Zip, Some(PathBuf::from("test.zip")), temp_dir, @@ -299,7 +375,13 @@ let temp_dir = TempDir::new().unwrap(); let files = vec![]; - let handler = ArchiveHandler::new(ArchiveType::Zip, None, temp_dir, files); + let handler = ArchiveHandler::new( + ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + files, + ); assert_eq!(handler.file_count(), 0); assert!(handler.is_empty()); @@ -314,7 +396,13 @@ PathBuf::from("image.png"), ]; - let handler = ArchiveHandler::new(ArchiveType::Zip, None, temp_dir, files); + let handler = ArchiveHandler::new( + ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + files, + ); let txt_files = handler.find_files_by_extension("txt"); assert_eq!(txt_files.len(), 1); @@ -328,7 +416,13 @@ let temp_dir = TempDir::new().unwrap(); let files = vec![PathBuf::from("file1.txt"), PathBuf::from("file2.txt")]; - let handler = ArchiveHandler::new(ArchiveType::Zip, None, temp_dir, files.clone()); + let handler = ArchiveHandler::new( + ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + files, + ); let collected: Vec<&PathBuf> = (&handler).into_iter().collect(); assert_eq!(collected.len(), 2); @@ -337,7 +431,13 @@ #[tokio::test] async fn test_write_and_read_file() { let temp_dir = TempDir::new().unwrap(); - let mut handler = ArchiveHandler::new(ArchiveType::Zip, None, temp_dir, vec![]); + let mut handler = ArchiveHandler::new( + ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + vec![], + ); let content = b"Hello, World!"; handler.write_file("test.txt", content).await.unwrap(); @@ -346,4 +446,127 @@ let read_content = handler.read_file("test.txt").await.unwrap(); assert_eq!(read_content, content); } + + #[test] + fn test_find_files_by_kind() { + let temp_dir = TempDir::new().unwrap(); + let files = vec![ + PathBuf::from("document.pdf"), + PathBuf::from("data.csv"), + PathBuf::from("image.png"), + PathBuf::from("archive.zip"), + PathBuf::from("notes.txt"), + ]; + + let handler = ArchiveHandler::new( + ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + files, + ); + + let docs = handler.find_files_by_kind(ContentKind::Document); + assert_eq!(docs.len(), 1); + assert!(docs[0].to_string_lossy().contains("document.pdf")); + + let spreadsheets = handler.find_files_by_kind(ContentKind::Spreadsheet); + assert_eq!(spreadsheets.len(), 1); + assert!(spreadsheets[0].to_string_lossy().contains("data.csv")); + + let images = handler.find_files_by_kind(ContentKind::Image); + assert_eq!(images.len(), 1); + + let text = handler.find_files_by_kind(ContentKind::Text); + assert_eq!(text.len(), 1); + } + + #[test] + fn test_content_kind_for_path() { + let temp_dir = TempDir::new().unwrap(); + let handler = ArchiveHandler::new(
ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + vec![], + ); + + assert_eq!( + handler.content_kind_for_path(Path::new("test.pdf")), + ContentKind::Document + ); + assert_eq!( + handler.content_kind_for_path(Path::new("data.csv")), + ContentKind::Spreadsheet + ); + assert_eq!( + handler.content_kind_for_path(Path::new("image.png")), + ContentKind::Image + ); + assert_eq!( + handler.content_kind_for_path(Path::new("notes.txt")), + ContentKind::Text + ); + assert_eq!( + handler.content_kind_for_path(Path::new("archive.zip")), + ContentKind::Archive + ); + assert_eq!( + handler.content_kind_for_path(Path::new("no_extension")), + ContentKind::Unknown + ); + } + + #[test] + fn test_content_metadata_for_file() { + let temp_dir = TempDir::new().unwrap(); + let handler = ArchiveHandler::new( + ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + vec![], + ); + + let metadata = handler.content_metadata_for_file("docs/report.pdf"); + assert_eq!(metadata.filename(), Some("report.pdf")); + assert_eq!(metadata.file_extension(), Some("pdf")); + assert!(!metadata.content_source.as_uuid().is_nil()); + } + + #[tokio::test] + async fn test_all_content_metadata() { + let temp_dir = TempDir::new().unwrap(); + let temp_path = temp_dir.path().to_path_buf(); + + // Create actual files in temp dir + let file1 = temp_path.join("doc.pdf"); + let file2 = temp_path.join("data.csv"); + tokio::fs::write(&file1, b"pdf content").await.unwrap(); + tokio::fs::write(&file2, b"csv content").await.unwrap(); + + let files = vec![file1, file2]; + let handler = ArchiveHandler::new( + ContentSource::new(), + ArchiveType::Zip, + None, + temp_dir, + files, + ); + + let metadata_list = handler.all_content_metadata().unwrap(); + assert_eq!(metadata_list.len(), 2); + + // Check that each metadata has the correct relative path + let filenames: Vec<_> = metadata_list.iter().filter_map(|m| m.filename()).collect(); + assert!(filenames.contains(&"doc.pdf")); + assert!(filenames.contains(&"data.csv")); + + // Each should have a unique content source + assert_ne!( + metadata_list[0].content_source, + metadata_list[1].content_source + ); + } } diff --git a/crates/nvisy-archive/src/lib.rs b/crates/nvisy-archive/src/lib.rs index 8fc23af..8f2d86c 100644 --- a/crates/nvisy-archive/src/lib.rs +++ b/crates/nvisy-archive/src/lib.rs @@ -1,29 +1,20 @@ -//! Archive handling library for nvisy -//! -//! This crate provides functionality for working with various archive formats -//! including ZIP, TAR, 7z, and other compressed archive types. It supports both -//! reading from files and memory, with flexible loading options. -//! -//! # Features -//! -//! - `zip` - ZIP archive support (enabled by default) -//! - `tar` - TAR archive support (enabled by default) -//! - `sevenz` - 7z archive support -//! - `gzip` - GZIP compression support (enabled by default) -//! - `bzip2` - BZIP2 compression support (enabled by default) -//! 
- `xz` - XZ/LZMA compression support (enabled by default) - #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] pub mod file; pub mod handler; +pub mod prelude; // Re-exports for convenience pub use file::{ArchiveFile, ArchiveType}; pub use handler::ArchiveHandler; -// Re-export error types from nvisy-core + +// Re-export core types used in archive operations pub use nvisy_core::error::{Error, ErrorResource, ErrorType, Result}; +pub use nvisy_core::fs::{ContentKind, ContentMetadata}; +pub use nvisy_core::io::ContentData; +pub use nvisy_core::path::ContentSource; /// Extension trait for creating archive-specific errors pub trait ArchiveErrorExt { @@ -48,51 +39,39 @@ pub trait ArchiveErrorExt { impl ArchiveErrorExt for Error { fn unsupported_format(format: impl Into<String>) -> Error { - Error::new( - ErrorType::Runtime, - ErrorResource::Archive, - format!("Unsupported archive format: {}", format.into()), - ) + Error::new(format!("Unsupported archive format: {}", format.into())) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Archive) } fn invalid_archive(message: impl Into<String>) -> Error { - Error::new( - ErrorType::Runtime, - ErrorResource::Archive, - format!("Invalid archive: {}", message.into()), - ) + Error::new(format!("Invalid archive: {}", message.into())) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Archive) } fn entry_not_found(name: impl Into<String>) -> Error { - Error::new( - ErrorType::Runtime, - ErrorResource::Archive, - format!("Entry not found: {}", name.into()), - ) + Error::new(format!("Entry not found: {}", name.into())) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Archive) } fn archive_permission_denied(message: impl Into<String>) -> Error { - Error::new( - ErrorType::Runtime, - ErrorResource::Archive, - format!("Permission denied: {}", message.into()), - ) + Error::new(format!("Permission denied: {}", message.into())) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Archive) } fn corrupted(message: impl Into<String>) -> Error { - Error::new( - ErrorType::Runtime, - ErrorResource::Archive, - format!("Corrupted archive: {}", message.into()), - ) + Error::new(format!("Corrupted archive: {}", message.into())) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Archive) } fn archive_resource_limit(message: impl Into<String>) -> Error { - Error::new( - ErrorType::Runtime, - ErrorResource::Archive, - format!("Resource limit exceeded: {}", message.into()), - ) + Error::new(format!("Resource limit exceeded: {}", message.into())) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Archive) } } @@ -106,12 +85,9 @@ pub trait ZipErrorExt { #[cfg(feature = "zip")] impl ZipErrorExt for zip::result::ZipError { fn into_archive_error(self) -> Error { - Error::from_source( - ErrorType::Runtime, - ErrorResource::Archive, - "ZIP operation failed", - self, - ) + Error::from_source("ZIP operation failed", self) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Archive) } } @@ -135,7 +111,6 @@ mod tests { #[test] fn test_error_creation() { - // Test archive-specific error constructors from ArchiveErrorExt trait let error = <Error as ArchiveErrorExt>::unsupported_format("custom"); assert_eq!(error.resource, ErrorResource::Archive); diff --git a/crates/nvisy-archive/src/prelude.rs b/crates/nvisy-archive/src/prelude.rs new file mode 100644 index 0000000..35ecb8c --- /dev/null +++ b/crates/nvisy-archive/src/prelude.rs @@ -0,0 +1,12 @@ +//! Prelude module for commonly used types. +//! +//!
This module re-exports the most commonly used types from this crate. +//! It is intended to be glob-imported for convenience. + +// Archive types +pub use crate::file::{ArchiveFile, ArchiveType}; +pub use crate::handler::ArchiveHandler; +// Error handling +pub use crate::{ArchiveErrorExt, Error, ErrorResource, ErrorType, Result}; +// Core types re-exported for convenience +pub use crate::{ContentData, ContentKind, ContentMetadata, ContentSource}; diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 13130f3..390a46a 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -36,6 +36,7 @@ serde = { workspace = true, features = ["derive"] } # Utilities strum = { workspace = true, features = ["derive"] } +derive_more = { workspace = true, features = ["as_ref", "deref"] } # Error handling (moved from nvisy-error crate) thiserror = { workspace = true, features = ["std"] } diff --git a/crates/nvisy-core/README.md b/crates/nvisy-core/README.md index 524b07b..68059c0 100644 --- a/crates/nvisy-core/README.md +++ b/crates/nvisy-core/README.md @@ -9,8 +9,20 @@ processing system. ## Overview This crate provides the foundational building blocks for the Nvisy ecosystem, -including data processing primitives, structured error handling, and component -health monitoring. +including data processing primitives, structured error handling, and content +tracking. + +## Core Types + +- [`fs::DataSensitivity`] - Sensitivity levels for risk assessment +- [`fs::ContentFile`] - File operations with content tracking +- [`fs::ContentKind`] - Classification of content types by file extension +- [`fs::ContentMetadata`] - Metadata information for content files +- [`io::Content`] - Content types and data structures +- [`io::ContentData`] - Container for content data with metadata +- [`io::DataReference`] - Data references with source tracking +- [`path::ContentSource`] - UUIDv7-based content source identification +- [`error::Error`] - Structured error handling with source classification ## Features @@ -19,30 +31,23 @@ health monitoring. - **Content Management** - Unified content structures with SHA256 hashing and metadata - **File Operations** - Async file handling with content source tracking -- **Data Classification** - Sensitivity levels and structure type classification +- **Data Classification** - Sensitivity levels for risk assessment - **Format Detection** - Automatic content kind detection from file extensions - **I/O Abstractions** - Modern async traits for content reading and writing - **Zero-Copy Operations** - Efficient data handling using `bytes::Bytes` -### Error Handling & Monitoring +### Error Handling - **Structured Errors** - Rich error types with source classification and context tracking -- **Component Health** - Health status monitoring with operational state - tracking -- **Status Reporting** - Comprehensive status information with severity levels -- **Component Trait** - Standardized interface for component health checks +- **Builder Pattern** - Fluent API with `with_type()`, `with_resource()`, + `with_source()`, and `with_context()` methods - **Result Types** - Ergonomic error handling with custom `Result` type -## Feature Flags - -- `serde` - Enable serialization/deserialization support for all types using - serde. This allows converting structs to/from JSON, YAML, and other formats. -- `jiff` - Enable timestamp support using the jiff datetime library. This adds - timestamp fields to `ComponentStatus` and time-based operations. 
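The builder-pattern error API advertised in this README looks like the following in practice; a minimal sketch using only methods introduced by this patch (`with_type`, `with_resource`, `with_context`), with `load_config` as a hypothetical caller.

```rust
use nvisy_core::error::{Error, ErrorResource, ErrorType, Result};

fn load_config(raw: &str) -> Result<()> {
    if raw.is_empty() {
        // Classification defaults to Other/Unknown; the builder refines it.
        return Err(Error::new("empty configuration")
            .with_type(ErrorType::Config)
            .with_resource(ErrorResource::Core)
            .with_context("while loading workspace defaults"));
    }
    Ok(())
}
```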
- ## Dependencies - `tokio` - Async runtime for I/O operations - `bytes` - Zero-copy byte buffer management -- `uuid` - Unique identifiers with v7 support +- `uuid` - Unique identifiers with UUIDv7 support +- `jiff` - Timestamp support for content source tracking +- `strum` - Derive macros for enums diff --git a/crates/nvisy-core/src/error/error_source.rs b/crates/nvisy-core/src/error/error_source.rs index 8839fa9..71a5719 100644 --- a/crates/nvisy-core/src/error/error_source.rs +++ b/crates/nvisy-core/src/error/error_source.rs @@ -1,15 +1,19 @@ use serde::{Deserialize, Serialize}; -use strum::{AsRefStr, Display}; +use strum::{AsRefStr, Display, EnumString}; /// System component sources where errors can originate. /// /// This enum identifies the subsystem or component that generated an error, /// enabling better error categorization and handling across the nvisy ecosystem. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, AsRefStr, Display)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +#[derive(AsRefStr, Display, EnumString)] #[derive(Serialize, Deserialize)] #[strum(serialize_all = "snake_case")] #[serde(rename_all = "snake_case")] pub enum ErrorResource { + /// Unknown or unspecified component. + #[default] + Unknown, /// Core framework and foundational components. Core, /// Execution engine and processing components. @@ -48,22 +52,31 @@ impl ErrorResource { #[must_use] pub const fn priority_level(&self) -> u8 { match self { - Self::Core => 6, // Highest priority + Self::Core => 6, Self::Engine => 5, Self::Document | Self::Archive => 4, Self::Pattern => 3, Self::Runtime => 2, - Self::Gateway => 1, // Lowest priority + Self::Gateway => 1, + Self::Unknown => 0, } } } #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; + #[test] + fn test_default() { + assert_eq!(ErrorResource::default(), ErrorResource::Unknown); + } + #[test] fn test_string_representations() { + assert_eq!(ErrorResource::Unknown.as_ref(), "unknown"); assert_eq!(ErrorResource::Core.as_ref(), "core"); assert_eq!(ErrorResource::Engine.as_ref(), "engine"); assert_eq!(ErrorResource::Document.as_ref(), "document"); @@ -73,6 +86,19 @@ mod tests { assert_eq!(ErrorResource::Gateway.as_ref(), "gateway"); } + #[test] + fn test_from_str() { + assert_eq!( + ErrorResource::from_str("core").unwrap(), + ErrorResource::Core + ); + assert_eq!( + ErrorResource::from_str("engine").unwrap(), + ErrorResource::Engine + ); + assert!(ErrorResource::from_str("invalid").is_err()); + } + #[test] fn test_priority_levels() { assert_eq!(ErrorResource::Core.priority_level(), 6); @@ -82,6 +108,7 @@ mod tests { assert_eq!(ErrorResource::Pattern.priority_level(), 3); assert_eq!(ErrorResource::Runtime.priority_level(), 2); assert_eq!(ErrorResource::Gateway.priority_level(), 1); + assert_eq!(ErrorResource::Unknown.priority_level(), 0); } #[test] @@ -93,5 +120,7 @@ mod tests { assert!(ErrorResource::Archive.is_internal()); assert!(ErrorResource::Runtime.is_external()); assert!(ErrorResource::Gateway.is_external()); + assert!(!ErrorResource::Unknown.is_internal()); + assert!(!ErrorResource::Unknown.is_external()); } } diff --git a/crates/nvisy-core/src/error/error_type.rs b/crates/nvisy-core/src/error/error_type.rs index a1f6073..f9a740c 100644 --- a/crates/nvisy-core/src/error/error_type.rs +++ b/crates/nvisy-core/src/error/error_type.rs @@ -1,8 +1,9 @@ use serde::{Deserialize, Serialize}; -use strum::{AsRefStr, Display}; +use strum::{AsRefStr, Display, EnumString}; /// Classification of error types by their operational domain. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, AsRefStr, Display)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +#[derive(AsRefStr, Display, EnumString)] #[derive(Serialize, Deserialize)] #[strum(serialize_all = "snake_case")] #[serde(rename_all = "snake_case")] @@ -12,21 +13,37 @@ pub enum ErrorType { /// Execution-time operational failures. Runtime, /// Internal system logic or state failures. + #[default] Other, } impl ErrorType { - /// Check if this error type is typically recoverable + /// Check if this error type is typically recoverable. #[must_use] - pub fn is_recoverable(&self) -> bool { - matches!(self, ErrorType::Runtime) + pub const fn is_recoverable(&self) -> bool { - matches!(self, Self::Runtime) + matches!(self, Self::Runtime) } } #[cfg(test)] mod tests { + use std::str::FromStr; + use super::*; + #[test] + fn test_default() { + assert_eq!(ErrorType::default(), ErrorType::Other); + } + + #[test] + fn test_from_str() { + assert_eq!(ErrorType::from_str("config").unwrap(), ErrorType::Config); + assert_eq!(ErrorType::from_str("runtime").unwrap(), ErrorType::Runtime); + assert_eq!(ErrorType::from_str("other").unwrap(), ErrorType::Other); + assert!(ErrorType::from_str("invalid").is_err()); + } + #[test] fn test_recoverability() { assert!(ErrorType::Runtime.is_recoverable()); diff --git a/crates/nvisy-core/src/error/mod.rs b/crates/nvisy-core/src/error/mod.rs index 62cb82f..26dca6a 100644 --- a/crates/nvisy-core/src/error/mod.rs +++ b/crates/nvisy-core/src/error/mod.rs @@ -1,7 +1,4 @@ //! Structured error handling for the nvisy ecosystem. -//! -//! This module provides structured error handling with source classification and context tracking -//! that can be reused across all nvisy crates. use std::fmt; @@ -20,6 +17,17 @@ pub type BoxError = Box<dyn std::error::Error + Send + Sync>; /// /// This error type is designed to be used across the entire nvisy ecosystem, /// providing consistent error handling with classification and context. +/// +/// # Example +/// +/// ``` +/// use nvisy_core::error::{Error, ErrorType, ErrorResource}; +/// +/// let error = Error::new("Something went wrong") +/// .with_type(ErrorType::Runtime) +/// .with_resource(ErrorResource::Engine) +/// .with_context("during document processing"); +/// ``` #[must_use] #[derive(Debug)] pub struct Error { @@ -29,7 +37,6 @@ pub struct Error { pub resource: ErrorResource, /// Primary error message. pub message: HipStr<'static>, - /// Underlying source error, if any. source: Option<BoxError>, /// Additional context information. @@ -40,34 +47,31 @@ pub struct Error { pub type Result<T> = std::result::Result<T, Error>; impl Error { - /// Creates a new error with the specified type, source, and message. - pub fn new( - etype: ErrorType, - resource: ErrorResource, - message: impl Into<HipStr<'static>>, - ) -> Self { + /// Creates a new error with the given message. + /// + /// The error type defaults to `ErrorType::Other` and + /// resource defaults to `ErrorResource::Unknown`. + pub fn new(message: impl Into<HipStr<'static>>) -> Self { Self { - etype, - resource, + etype: ErrorType::default(), + resource: ErrorResource::default(), + message: message.into(), source: None, context: None, - message: message.into(), } } - /// Creates a new error with the specified type, source, message, and source error. - pub fn from_source( - etype: ErrorType, - resource: ErrorResource, - message: impl Into<HipStr<'static>>, - source: impl Into<BoxError>, - ) -> Self { + /// Creates a new error from a source error. + /// + /// The error type defaults to `ErrorType::Other` and + /// resource defaults to `ErrorResource::Unknown`.
+ pub fn from_source(message: impl Into<HipStr<'static>>, source: impl Into<BoxError>) -> Self { Self { - etype, - resource, + etype: ErrorType::default(), + resource: ErrorResource::default(), + message: message.into(), source: Some(source.into()), context: None, - message: message.into(), } } @@ -106,66 +110,23 @@ pub fn is_recoverable(&self) -> bool { self.etype.is_recoverable() } +} - /// Returns the display message for the error. - fn display_message(&self) -> String { - let mut parts = Vec::new(); - - parts.push(format!( - "[{}:{}]", +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "[{}:{}] {}", self.resource.as_ref(), - self.etype.as_ref() - )); - parts.push(self.message.to_string()); + self.etype.as_ref(), + self.message )?; if let Some(ref context) = self.context { - parts.push(format!("(context: {context})")); + write!(f, " (context: {context})")?; } - parts.join(" ") - } - - // Convenience constructors for common error patterns - - /// Creates a runtime error. - pub fn runtime(resource: ErrorResource, message: impl Into<HipStr<'static>>) -> Self { - Self::new(ErrorType::Runtime, resource, message) - } - - /// Creates a configuration error. - pub fn config(resource: ErrorResource, message: impl Into<HipStr<'static>>) -> Self { - Self::new(ErrorType::Config, resource, message) - } - - /// Creates an unsupported format error. - pub fn unsupported_format(message: impl Into<HipStr<'static>>) -> Self { - Self::new(ErrorType::Runtime, ErrorResource::Core, message) - } - - /// Creates an invalid input error. - pub fn invalid_input(message: impl Into<HipStr<'static>>) -> Self { - Self::new(ErrorType::Runtime, ErrorResource::Core, message) - } - - /// Creates a not found error. - pub fn not_found(message: impl Into<HipStr<'static>>) -> Self { - Self::new(ErrorType::Runtime, ErrorResource::Core, message) - } - - /// Creates a permission denied error. - pub fn permission_denied(message: impl Into<HipStr<'static>>) -> Self { - Self::new(ErrorType::Runtime, ErrorResource::Core, message) - } - - /// Creates a resource limit exceeded error.
- pub fn resource_limit(message: impl Into<HipStr<'static>>) -> Self { - Self::new(ErrorType::Runtime, ErrorResource::Core, message) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.display_message()) + Ok(()) } } @@ -179,34 +140,25 @@ impl std::error::Error for Error { impl From<std::io::Error> for Error { fn from(error: std::io::Error) -> Self { - Self::from_source( - ErrorType::Runtime, - ErrorResource::Core, - "I/O operation failed", - error, - ) + Self::from_source("I/O operation failed", error) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) } } impl From<std::string::FromUtf8Error> for Error { fn from(error: std::string::FromUtf8Error) -> Self { - Self::from_source( - ErrorType::Runtime, - ErrorResource::Core, - "Invalid UTF-8 encoding", - error, - ) + Self::from_source("Invalid UTF-8 encoding", error) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) } } impl From<std::str::Utf8Error> for Error { fn from(error: std::str::Utf8Error) -> Self { - Self::from_source( - ErrorType::Runtime, - ErrorResource::Core, - "Invalid UTF-8 encoding", - error, - ) + Self::from_source("Invalid UTF-8 encoding", error) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) } } @@ -215,25 +167,32 @@ mod tests { use super::*; #[test] - fn test_error_builder() { - let error = Error::new(ErrorType::Config, ErrorResource::Core, "test message"); - assert_eq!(error.etype, ErrorType::Config); - assert_eq!(error.resource, ErrorResource::Core); + fn test_error_new() { + let error = Error::new("test message"); + assert_eq!(error.etype, ErrorType::Other); + assert_eq!(error.resource, ErrorResource::Unknown); assert_eq!(error.message, "test message"); assert!(error.source.is_none()); assert!(error.context.is_none()); } #[test] - fn test_error_with_context() { - let error = Error::new(ErrorType::Other, ErrorResource::Engine, "test") + fn test_error_builder_pattern() { + let error = Error::new("test message") + .with_type(ErrorType::Config) + .with_resource(ErrorResource::Engine) + .with_context("additional context"); + + assert_eq!(error.etype, ErrorType::Config); + assert_eq!(error.resource, ErrorResource::Engine); assert_eq!(error.context.as_deref(), Some("additional context")); } #[test] fn test_error_display() { - let error = Error::new(ErrorType::Runtime, ErrorResource::Core, "test error") + let error = Error::new("test error") + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) + .with_context("additional info"); let display_str = error.to_string(); @@ -255,27 +214,25 @@ } #[test] - fn test_convenience_constructors() { - let runtime_err = Error::runtime(ErrorResource::Engine, "runtime failure"); - assert_eq!(runtime_err.etype, ErrorType::Runtime); - assert_eq!(runtime_err.resource, ErrorResource::Engine); - - let config_err = Error::config(ErrorResource::Core, "config failure"); - assert_eq!(config_err.etype, ErrorType::Config); + fn test_is_recoverable() { + let runtime_err = Error::new("test").with_type(ErrorType::Runtime); + assert!(runtime_err.is_recoverable()); - let unsupported = Error::unsupported_format("unknown format"); - assert_eq!(unsupported.etype, ErrorType::Runtime); + let config_err = Error::new("test").with_type(ErrorType::Config); + assert!(!config_err.is_recoverable()); - let not_found = Error::not_found("file missing"); - assert_eq!(not_found.etype, ErrorType::Runtime); + let other_err = Error::new("test"); + assert!(!other_err.is_recoverable()); } #[test] - fn test_is_recoverable() { - let runtime_err =
Error::runtime(ErrorResource::Core, "test"); - assert!(runtime_err.is_recoverable()); + fn test_from_source() { + let source = std::io::Error::other("underlying error"); + let error = Error::from_source("operation failed", source) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Document); - let config_err = Error::config(ErrorResource::Core, "test"); - assert!(!config_err.is_recoverable()); + assert!(error.source.is_some()); + assert_eq!(error.resource, ErrorResource::Document); } } diff --git a/crates/nvisy-core/src/fs/content_file.rs b/crates/nvisy-core/src/fs/content_file.rs index a9d102d..86bbd8a 100644 --- a/crates/nvisy-core/src/fs/content_file.rs +++ b/crates/nvisy-core/src/fs/content_file.rs @@ -6,11 +6,12 @@ use std::io; use std::path::{Path, PathBuf}; +use bytes::Bytes; use tokio::fs::{File, OpenOptions}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWrite, AsyncWriteExt, SeekFrom}; use crate::error::{Error, ErrorResource, ErrorType, Result}; -use crate::fs::{ContentKind, ContentMetadata}; +use crate::fs::ContentMetadata; use crate::io::{AsyncContentRead, AsyncContentWrite, ContentData}; use crate::path::ContentSource; @@ -187,7 +188,7 @@ impl ContentFile { let mut buffer = Vec::new(); self.file.read_to_end(&mut buffer).await?; - let content_data = ContentData::new(self.content_source, buffer.into()); + let content_data = ContentData::new(self.content_source, Bytes::from(buffer)); Ok(content_data) } @@ -210,18 +211,18 @@ impl ContentFile { } if total_read + bytes_read > max_size { - return Err(Error::new( - ErrorType::Runtime, - ErrorResource::Core, - format!("File size exceeds maximum limit of {max_size} bytes"), - )); + return Err(Error::new(format!( + "File size exceeds maximum limit of {max_size} bytes" + )) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core)); } buffer.extend_from_slice(&temp_buffer[..bytes_read]); total_read += bytes_read; } - let content_data = ContentData::new(self.content_source, buffer.into()); + let content_data = ContentData::new(self.content_source, Bytes::from(buffer)); Ok(content_data) } @@ -357,13 +358,6 @@ impl ContentFile { self.path.extension().and_then(|ext| ext.to_str()) } - /// Detect content kind from file extension - pub fn detect_content_kind(&self) -> ContentKind { - self.extension() - .map(ContentKind::from_file_extension) - .unwrap_or_default() - } - /// Sync all data to disk /// /// # Errors @@ -499,13 +493,12 @@ mod tests { } #[tokio::test] - async fn test_content_kind_detection() { + async fn test_file_extension() { let temp_file = NamedTempFile::new().unwrap(); let mut path = temp_file.path().to_path_buf(); path.set_extension("txt"); let content_file = ContentFile::create(&path).await.unwrap(); - assert_eq!(content_file.detect_content_kind(), ContentKind::Text); assert_eq!(content_file.extension(), Some("txt")); assert_eq!( content_file.filename(), diff --git a/crates/nvisy-core/src/fs/content_kind.rs b/crates/nvisy-core/src/fs/content_kind.rs index 0994bf4..288f488 100644 --- a/crates/nvisy-core/src/fs/content_kind.rs +++ b/crates/nvisy-core/src/fs/content_kind.rs @@ -1,14 +1,19 @@ //! Content type classification for different categories of data //! //! This module provides the [`ContentKind`] enum for classifying content -//! based on file extensions. +//! into broad categories. Extension-to-kind mapping is handled by the +//! engine's format registry. 
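With the extension mapping removed, the remaining `ContentKind` surface is the strum-derived string round-trip plus the category predicates; a sketch based on the derives and tests in this hunk:

```rust
use std::str::FromStr;

use nvisy_core::fs::ContentKind;

fn demo() {
    // Round-trip through the strum derives (serialize_all = "lowercase").
    let kind = ContentKind::from_str("archive").expect("known kind");
    assert!(kind.is_archive());
    assert_eq!(kind.to_string(), "archive");

    // Unrecognized names are an error rather than ContentKind::Unknown.
    assert!(ContentKind::from_str("hologram").is_err());
}
```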
use serde::{Deserialize, Serialize}; -use strum::{Display, EnumIter, EnumString}; +use strum::{AsRefStr, Display, EnumIter, EnumString}; /// Content type classification for different categories of data +/// +/// This enum represents high-level content categories without knowledge +/// of specific file extensions or MIME types. The engine's format registry +/// handles the mapping from extensions/MIME types to content kinds. #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(Display, EnumString, EnumIter)] +#[derive(AsRefStr, Display, EnumString, EnumIter)] #[derive(Serialize, Deserialize)] #[strum(serialize_all = "lowercase")] #[serde(rename_all = "lowercase")] @@ -29,35 +34,6 @@ pub enum ContentKind { } impl ContentKind { - /// Detect content kind from file extension - #[must_use] - pub fn from_file_extension(extension: &str) -> Self { - let ext = extension.to_lowercase(); - match ext.as_str() { - // Text formats - "txt" | "text" | "md" | "markdown" | "rst" | "xml" | "json" | "yaml" | "yml" - | "toml" | "ini" | "cfg" | "conf" | "log" => Self::Text, - - // Document formats - "pdf" | "doc" | "docx" | "rtf" | "odt" | "pages" => Self::Document, - - // Spreadsheet formats - "csv" | "tsv" | "xls" | "xlsx" | "ods" | "numbers" => Self::Spreadsheet, - - // Image formats - "jpg" | "jpeg" | "png" | "gif" | "bmp" | "svg" | "webp" | "ico" | "tiff" | "tif" => { - Self::Image - } - - // Archive formats - "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "tgz" | "tbz2" | "txz" => { - Self::Archive - } - - _ => Self::Unknown, - } - } - /// Check if this content kind represents text-based content #[must_use] pub fn is_text_based(&self) -> bool { @@ -87,54 +63,12 @@ impl ContentKind { pub fn is_archive(&self) -> bool { matches!(self, Self::Archive) } - - /// Get common file extensions for this content kind - #[must_use] - pub fn common_extensions(&self) -> &'static [&'static str] { - match self { - Self::Text => &["txt", "md", "json", "xml", "yaml", "toml"], - Self::Document => &["pdf", "doc", "docx", "rtf", "odt"], - Self::Spreadsheet => &["csv", "xls", "xlsx", "ods"], - Self::Image => &["jpg", "jpeg", "png", "gif", "svg", "webp"], - Self::Archive => &["zip", "tar", "gz", "7z", "rar"], - Self::Unknown => &[], - } - } } #[cfg(test)] mod tests { use super::*; - #[test] - fn test_content_kind_from_extension() { - assert_eq!(ContentKind::from_file_extension("txt"), ContentKind::Text); - assert_eq!(ContentKind::from_file_extension("TXT"), ContentKind::Text); - assert_eq!(ContentKind::from_file_extension("json"), ContentKind::Text); - assert_eq!( - ContentKind::from_file_extension("pdf"), - ContentKind::Document - ); - assert_eq!( - ContentKind::from_file_extension("csv"), - ContentKind::Spreadsheet - ); - assert_eq!( - ContentKind::from_file_extension("xlsx"), - ContentKind::Spreadsheet - ); - assert_eq!(ContentKind::from_file_extension("png"), ContentKind::Image); - assert_eq!( - ContentKind::from_file_extension("zip"), - ContentKind::Archive - ); - assert_eq!(ContentKind::from_file_extension("7z"), ContentKind::Archive); - assert_eq!( - ContentKind::from_file_extension("unknown"), - ContentKind::Unknown - ); - } - #[test] fn test_content_kind_predicates() { assert!(ContentKind::Text.is_text_based()); @@ -164,31 +98,21 @@ mod tests { } #[test] - fn test_common_extensions() { - let text_ext = ContentKind::Text.common_extensions(); - assert!(text_ext.contains(&"txt")); - assert!(text_ext.contains(&"json")); - - let archive_ext = ContentKind::Archive.common_extensions(); - 
assert!(archive_ext.contains(&"zip")); - assert!(archive_ext.contains(&"7z")); - - let unknown_ext = ContentKind::Unknown.common_extensions(); - assert!(unknown_ext.is_empty()); + fn test_content_kind_as_ref() { + assert_eq!(ContentKind::Text.as_ref(), "text"); + assert_eq!(ContentKind::Document.as_ref(), "document"); } #[test] - fn test_case_insensitive_extension_detection() { - assert_eq!(ContentKind::from_file_extension("TXT"), ContentKind::Text); + fn test_content_kind_from_str() { + use std::str::FromStr; + + assert_eq!(ContentKind::from_str("text").unwrap(), ContentKind::Text); assert_eq!( - ContentKind::from_file_extension("PDF"), + ContentKind::from_str("document").unwrap(), ContentKind::Document ); - assert_eq!(ContentKind::from_file_extension("PNG"), ContentKind::Image); - assert_eq!( - ContentKind::from_file_extension("ZIP"), - ContentKind::Archive - ); + assert!(ContentKind::from_str("invalid").is_err()); } #[test] diff --git a/crates/nvisy-core/src/fs/content_metadata.rs b/crates/nvisy-core/src/fs/content_metadata.rs index 401ed4f..23d01da 100644 --- a/crates/nvisy-core/src/fs/content_metadata.rs +++ b/crates/nvisy-core/src/fs/content_metadata.rs @@ -1,21 +1,19 @@ //! Content metadata for filesystem operations //! //! This module provides the [`ContentMetadata`] struct for handling metadata -//! about content files, including paths, content types, and source tracking. +//! about content files, including paths and source tracking. use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; -use super::ContentKind; use crate::path::ContentSource; /// Metadata associated with content files /// -/// This struct stores metadata about content including its source identifier, -/// file path, and detected content kind based on file extension. -#[derive(Debug, Clone, PartialEq, Eq)] -#[derive(Serialize, Deserialize)] +/// This struct stores metadata about content including its source identifier +/// and file path. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ContentMetadata { /// Unique identifier for the content source pub content_source: ContentSource, @@ -70,24 +68,6 @@ impl ContentMetadata { .and_then(|ext| ext.to_str()) } - /// Detect content kind from file extension - /// - /// # Example - /// - /// ``` - /// use nvisy_core::{fs::{ContentMetadata, ContentKind}, path::ContentSource}; - /// use std::path::PathBuf; - /// - /// let source = ContentSource::new(); - /// let metadata = ContentMetadata::with_path(source, PathBuf::from("image.png")); - /// assert_eq!(metadata.content_kind(), ContentKind::Image); - /// ``` - pub fn content_kind(&self) -> ContentKind { - self.file_extension() - .map(ContentKind::from_file_extension) - .unwrap_or_default() - } - /// Get the filename if available #[must_use] pub fn filename(&self) -> Option<&str> { @@ -157,7 +137,6 @@ mod tests { let metadata = ContentMetadata::with_path(source, PathBuf::from("document.pdf")); assert_eq!(metadata.file_extension(), Some("pdf")); - assert_eq!(metadata.content_kind(), ContentKind::Document); } #[test] diff --git a/crates/nvisy-core/src/fs/data_sensitivity.rs b/crates/nvisy-core/src/fs/data_sensitivity.rs index 93f636c..b7e1a3b 100644 --- a/crates/nvisy-core/src/fs/data_sensitivity.rs +++ b/crates/nvisy-core/src/fs/data_sensitivity.rs @@ -1,7 +1,7 @@ //! Data sensitivity level classification //! //! This module provides a systematic way to classify data based on sensitivity -//! and risk levels for proper handling and compliance requirements. +//! 
and risk levels for proper handling. use serde::{Deserialize, Serialize}; use strum::{Display, EnumIter, EnumString}; @@ -25,7 +25,6 @@ use strum::{Display, EnumIter, EnumString}; /// /// assert!(high > medium); /// assert!(medium > low); -/// assert!(high.requires_special_handling()); /// ``` #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(EnumIter, EnumString, Display, Serialize, Deserialize)] @@ -62,58 +61,6 @@ impl DataSensitivity { *self as u8 } - /// Check if this sensitivity level requires special handling - #[must_use] - pub fn requires_special_handling(&self) -> bool { - *self >= DataSensitivity::High - } - - /// Check if this sensitivity level requires encryption - #[must_use] - pub fn requires_encryption(&self) -> bool { - *self >= DataSensitivity::Medium - } - - /// Check if this sensitivity level requires access logging - #[must_use] - pub fn requires_access_logging(&self) -> bool { - *self >= DataSensitivity::High - } - - /// Check if this sensitivity level requires a retention policy - #[must_use] - pub fn requires_retention_policy(&self) -> bool { - *self >= DataSensitivity::Medium - } - - /// Check if this sensitivity level requires compliance oversight - #[must_use] - pub fn requires_compliance_oversight(&self) -> bool { - *self >= DataSensitivity::High - } - - /// Get the recommended maximum retention period in days (None = indefinite) - #[must_use] - pub fn max_retention_days(&self) -> Option<u32> { - match self { - DataSensitivity::None => None, // Indefinite - DataSensitivity::Low => Some(2555), // ~7 years - DataSensitivity::Medium => Some(1095), // 3 years - DataSensitivity::High => Some(90), // 90 days - } - } - - /// Get all sensitivity levels in ascending order - #[must_use] - pub fn all() -> Vec<DataSensitivity> { - vec![ - DataSensitivity::None, - DataSensitivity::Low, - DataSensitivity::Medium, - DataSensitivity::High, - ] - } - /// Create from a numeric level (0-3) #[must_use] pub fn from_level(level: u8) -> Option<Self> { @@ -161,62 +108,21 @@ mod tests { #[test] fn test_from_level() { assert_eq!(DataSensitivity::from_level(0), Some(DataSensitivity::None)); + assert_eq!(DataSensitivity::from_level(1), Some(DataSensitivity::Low)); + assert_eq!( + DataSensitivity::from_level(2), + Some(DataSensitivity::Medium) + ); + assert_eq!(DataSensitivity::from_level(3), Some(DataSensitivity::High)); assert_eq!(DataSensitivity::from_level(4), None); } - #[test] - fn test_requirements() { - let none = DataSensitivity::None; - let low = DataSensitivity::Low; - let medium = DataSensitivity::Medium; - let high = DataSensitivity::High; - // Special handling - assert!(!none.requires_special_handling()); - assert!(!low.requires_special_handling()); - assert!(!medium.requires_special_handling()); - assert!(high.requires_special_handling()); - - // Encryption - assert!(!none.requires_encryption()); - assert!(!low.requires_encryption()); - assert!(medium.requires_encryption()); - assert!(high.requires_encryption()); - - // Access logging - assert!(!none.requires_access_logging()); - assert!(!low.requires_access_logging()); - assert!(!medium.requires_access_logging()); - assert!(high.requires_access_logging()); - - // Compliance oversight - assert!(!none.requires_compliance_oversight()); - assert!(!low.requires_compliance_oversight()); - assert!(!medium.requires_compliance_oversight()); - assert!(high.requires_compliance_oversight()); - } - - #[test] - fn test_retention_periods() { - assert_eq!(DataSensitivity::None.max_retention_days(), None); -
assert_eq!(DataSensitivity::Low.max_retention_days(), Some(2555)); - assert_eq!(DataSensitivity::Medium.max_retention_days(), Some(1095)); - assert_eq!(DataSensitivity::High.max_retention_days(), Some(90)); - } - #[test] fn test_display() { assert_eq!(format!("{}", DataSensitivity::High), "High"); assert_eq!(format!("{}", DataSensitivity::None), "None"); } - #[test] - fn test_all_levels() { - let all = DataSensitivity::all(); - assert_eq!(all.len(), 4); - assert_eq!(all[0], DataSensitivity::None); - assert_eq!(all[3], DataSensitivity::High); - } - #[test] fn test_serialization() { let level = DataSensitivity::High; diff --git a/crates/nvisy-core/src/fs/data_structure_kind.rs b/crates/nvisy-core/src/fs/data_structure_kind.rs deleted file mode 100644 index 81562fa..0000000 --- a/crates/nvisy-core/src/fs/data_structure_kind.rs +++ /dev/null @@ -1,130 +0,0 @@ -//! Data structure type classification -//! -//! This module provides classification for different ways data can be structured, -//! from highly organized formats to completely unstructured content. - -use serde::{Deserialize, Serialize}; -use strum::{EnumIter, EnumString}; - -use crate::fs::DataSensitivity; - -/// Classification of data based on its structural organization -/// -/// This enum distinguishes between different levels of data organization, -/// from highly structured formats with defined schemas to completely -/// unstructured content without predefined organization. -/// -/// # Examples -/// -/// ```rust -/// use nvisy_core::fs::DataStructureKind; -/// -/// let structured = DataStructureKind::HighlyStructured; -/// assert!(structured.has_schema()); -/// -/// let unstructured = DataStructureKind::Unstructured; -/// assert!(!unstructured.has_schema()); -/// ``` -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(Serialize, Deserialize, EnumIter, EnumString)] -pub enum DataStructureKind { - /// Highly Structured Data - /// - /// Data with rigid schema, defined relationships, and strict formatting rules. - /// Examples: Relational database tables, XML with XSD schema, JSON with JSON Schema. - /// - /// **Schema**: Required and enforced - /// **Queryable**: Highly queryable with structured query languages - /// **Parsing**: Predictable parsing with validation - HighlyStructured, - - /// Semi-Structured Data - /// - /// Data with some organizational structure but flexible schema. - /// Examples: JSON without strict schema, XML without XSD, CSV files, log files. - /// - /// **Schema**: Optional or loosely defined - /// **Queryable**: Moderately queryable with specialized tools - /// **Parsing**: Parseable but may require schema inference - SemiStructured, - - /// Unstructured Data - /// - /// Data without predefined format, schema, or organizational structure. - /// Examples: Plain text, images, audio, video, documents, emails. 
- /// - /// **Schema**: No schema - /// **Queryable**: Requires full-text search or content analysis - /// **Parsing**: Content-dependent parsing and analysis - Unstructured, -} - -impl DataStructureKind { - /// Get the base sensitivity level for this structure type - /// - /// Note: Actual sensitivity depends on the content, not just the structure - #[must_use] - pub fn base_sensitivity_level(&self) -> DataSensitivity { - match self { - // Structure type alone doesn't determine sensitivity - // Content analysis is required for actual sensitivity assessment - DataStructureKind::HighlyStructured - | DataStructureKind::SemiStructured - | DataStructureKind::Unstructured => DataSensitivity::Low, - } - } - - /// Check if this structure type has a defined schema - #[must_use] - pub fn has_schema(&self) -> bool { - matches!(self, DataStructureKind::HighlyStructured) - } - - /// Check if this structure type is easily queryable - #[must_use] - pub fn is_queryable(&self) -> bool { - !matches!(self, DataStructureKind::Unstructured) - } - - /// Check if parsing is predictable for this structure type - #[must_use] - pub fn has_predictable_parsing(&self) -> bool { - matches!(self, DataStructureKind::HighlyStructured) - } - - /// Check if this structure type supports relationship queries - #[must_use] - pub fn supports_relationships(&self) -> bool { - matches!(self, DataStructureKind::HighlyStructured) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_structure_characteristics() { - let highly_structured = DataStructureKind::HighlyStructured; - assert!(highly_structured.has_schema()); - assert!(highly_structured.is_queryable()); - assert!(highly_structured.has_predictable_parsing()); - - let unstructured = DataStructureKind::Unstructured; - assert!(!unstructured.has_schema()); - assert!(!unstructured.is_queryable()); - assert!(!unstructured.has_predictable_parsing()); - - let highly_structured = DataStructureKind::HighlyStructured; - assert!(highly_structured.supports_relationships()); - assert!(highly_structured.has_schema()); - } - - #[test] - fn test_serialization() { - let structure_type = DataStructureKind::SemiStructured; - let json = serde_json::to_string(&structure_type).unwrap(); - let deserialized: DataStructureKind = serde_json::from_str(&json).unwrap(); - assert_eq!(structure_type, deserialized); - } -} diff --git a/crates/nvisy-core/src/fs/mod.rs index ab2638f..c6386bd 100644 --- a/crates/nvisy-core/src/fs/mod.rs +++ b/crates/nvisy-core/src/fs/mod.rs @@ -6,7 +6,9 @@ //! # Core Types //! //! - [`ContentFile`]: A file wrapper that combines filesystem operations with content tracking -//! - [`ContentFileMetadata`]: Metadata information for content files +//! - [`ContentMetadata`]: Metadata information for content files +//! - [`ContentKind`]: High-level classification of content types +//! - [`DataSensitivity`]: Sensitivity levels for risk assessment //! //! # Example //!
@@ -31,84 +33,9 @@ mod content_file; mod content_kind; mod content_metadata; mod data_sensitivity; -mod data_structure_kind; - -use std::path::PathBuf; // Re-export main types pub use content_file::ContentFile; pub use content_kind::ContentKind; pub use content_metadata::ContentMetadata; pub use data_sensitivity::DataSensitivity; -pub use data_structure_kind::DataStructureKind; -use serde::{Deserialize, Serialize}; - -use crate::path::ContentSource; - -/// Metadata information for content files -/// -/// TODO: Implement comprehensive file metadata handling including: -/// - File timestamps (created, modified, accessed) -/// - File permissions and ownership -/// - File size and disk usage -/// - Extended attributes -/// - Content type detection beyond extensions -#[derive(Debug, Clone, PartialEq, Eq)] -#[derive(Serialize, Deserialize)] -pub struct ContentFileMetadata { - /// Content source identifier - pub content_source: ContentSource, - /// Path to the file - pub path: PathBuf, - /// Detected content kind - pub content_kind: Option<ContentKind>, - /// File size in bytes - pub size: Option<u64>, -} - -impl ContentFileMetadata { - /// Create new file metadata - #[must_use] - pub fn new(content_source: ContentSource, path: PathBuf) -> Self { - Self { - content_source, - path, - content_kind: None, - size: None, - } - } - - /// Set the content kind - #[must_use] - pub fn with_content_kind(mut self, kind: ContentKind) -> Self { - self.content_kind = Some(kind); - self - } - - /// Set the file size - #[must_use] - pub fn with_size(mut self, size: u64) -> Self { - self.size = Some(size); - self - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_content_file_metadata() { - let source = ContentSource::new(); - let path = PathBuf::from("test.txt"); - - let metadata = ContentFileMetadata::new(source, path.clone()) - .with_content_kind(ContentKind::Text) - .with_size(1024); - - assert_eq!(metadata.content_source, source); - assert_eq!(metadata.path, path); - assert_eq!(metadata.content_kind, Some(ContentKind::Text)); - assert_eq!(metadata.size, Some(1024)); - } -} diff --git a/crates/nvisy-core/src/io/content.rs index cf0af5f..b3870f4 100644 --- a/crates/nvisy-core/src/io/content.rs +++ b/crates/nvisy-core/src/io/content.rs @@ -1,118 +1,151 @@ -//! Content types supported by the Nvisy system +//! Content representation combining data with metadata //! -//! This module provides the Content enum for representing different types -//! of data content within the system. +//! This module provides the [`Content`] struct that combines [`ContentData`] +//! with optional [`ContentMetadata`] for complete content representation. -use bytes::Bytes; +use derive_more::{AsRef, Deref}; use serde::{Deserialize, Serialize}; -/// Content types supported by the Nvisy system +use super::ContentData; +use crate::fs::ContentMetadata; +use crate::path::ContentSource; + +/// Complete content representation with data and metadata /// -/// Simplified content representation for efficient processing. +/// This struct combines [`ContentData`] (the actual content bytes) with +/// optional [`ContentMetadata`] (path, extension info, etc.) to provide +/// a complete content representation.
/// /// # Examples /// /// ```rust -/// use nvisy_core::io::Content; -/// use bytes::Bytes; +/// use nvisy_core::io::{Content, ContentData}; +/// use nvisy_core::fs::ContentMetadata; +/// use nvisy_core::path::ContentSource; +/// +/// // Create content from data +/// let data = ContentData::from("Hello, world!"); +/// let content = Content::new(data); +/// +/// assert_eq!(content.size(), 13); +/// assert!(content.is_text()); /// -/// let text_content = Content::Text("Sample text".to_string()); -/// let binary_content = Content::Binary { -/// data: Bytes::from(vec![0x48, 0x65, 0x6C, 0x6C, 0x6F]), -/// mime_type: "application/octet-stream".to_string(), -/// }; +/// // Create content with metadata +/// let source = ContentSource::new(); +/// let data = ContentData::from_text(source, "Sample text"); +/// let metadata = ContentMetadata::with_path(source, "document.txt"); +/// let content = Content::with_metadata(data, metadata); /// -/// assert!(text_content.is_textual()); -/// assert!(!binary_content.is_textual()); +/// assert_eq!(content.metadata().and_then(|m| m.filename()), Some("document.txt")); /// ``` #[derive(Debug, Clone, PartialEq)] +#[derive(AsRef, Deref)] #[derive(Serialize, Deserialize)] -pub enum Content { - /// Text content stored as UTF-8 string - Text(String), - - /// Generic binary content with MIME type - Binary { - /// Raw binary data - data: Bytes, - /// MIME type describing the content - mime_type: String, - }, - - /// Empty or null content - Empty, +pub struct Content { + /// The actual content data + #[deref] + #[as_ref] + data: ContentData, + /// Optional metadata about the content + metadata: Option<ContentMetadata>, +} + +impl From<ContentData> for Content { + fn from(data: ContentData) -> Self { + Self::new(data) + } } impl Content { - /// Get the type name of this content - pub fn type_name(&self) -> &'static str { - match self { - Content::Text(_) => "text", - Content::Binary { .. } => "binary", - Content::Empty => "empty", + /// Create new content from data without metadata + pub fn new(data: ContentData) -> Self { + Self { + data, + metadata: None, } } - /// Check if this content is textual - pub fn is_textual(&self) -> bool { - matches!(self, Content::Text(_)) + /// Create new content with metadata + pub fn with_metadata(data: ContentData, metadata: ContentMetadata) -> Self { + Self { + data, + metadata: Some(metadata), + } } - /// Check if this content is multimedia (audio, video, image) - pub fn is_multimedia(&self) -> bool { - false // Simplified - no specific multimedia types + /// Get the content data + pub fn data(&self) -> &ContentData { + &self.data } - /// Check if this content has binary data - pub fn has_binary_data(&self) -> bool { - !matches!(self, Content::Text(_) | Content::Empty) + /// Get the content metadata if available + pub fn metadata(&self) -> Option<&ContentMetadata> { + self.metadata.as_ref() } - /// Get the estimated size in bytes - pub fn estimated_size(&self) -> usize { - match self { - Content::Text(text) => text.len(), - Content::Binary { data, .. } => data.len(), - Content::Empty => 0, - } + /// Get the content source + pub fn content_source(&self) -> ContentSource { + self.data.content_source } - /// Get the format/MIME type of this content - pub fn format(&self) -> Option<&str> { - match self { - Content::Text(_) => Some("text/plain"), - Content::Binary { mime_type, ..
} => Some(mime_type), - Content::Empty => None, - } + /// Get the size of the content in bytes + pub fn size(&self) -> usize { + self.data.size() } - /// Extract raw bytes from content, if available - pub fn as_bytes(&self) -> Option<&Bytes> { - match self { - Content::Binary { data, .. } => Some(data), - Content::Text(_) | Content::Empty => None, - } + /// Check if the content is empty + pub fn is_empty(&self) -> bool { + self.data.is_empty() } - /// Extract text from content, if it's textual - pub fn as_text(&self) -> Option<&str> { - match self { - Content::Text(text) => Some(text), - _ => None, - } + /// Check if the content is stored as text + pub fn is_text(&self) -> bool { + self.data.is_text() } - /// Create text content - pub fn text<S: Into<String>>(content: S) -> Self { - Content::Text(content.into()) + /// Check if the content is stored as binary + pub fn is_binary(&self) -> bool { + self.data.is_binary() } - /// Create binary content - pub fn binary<S: Into<String>>(data: Bytes, mime_type: S) -> Self { - Content::Binary { - data, - mime_type: mime_type.into(), - } + /// Get the content as bytes + pub fn as_bytes(&self) -> &[u8] { + self.data.as_bytes() + } + + /// Try to get the content as a string slice + pub fn as_str(&self) -> crate::error::Result<&str> { + self.data.as_str() + } + + /// Get the file extension from metadata if available + pub fn file_extension(&self) -> Option<&str> { + self.metadata.as_ref().and_then(|m| m.file_extension()) + } + + /// Get the filename from metadata if available + pub fn filename(&self) -> Option<&str> { + self.metadata.as_ref().and_then(|m| m.filename()) + } + + /// Set the metadata + pub fn set_metadata(&mut self, metadata: ContentMetadata) { + self.metadata = Some(metadata); + } + + /// Remove the metadata + pub fn clear_metadata(&mut self) { + self.metadata = None; + } + + /// Consume and return the inner ContentData + pub fn into_data(self) -> ContentData { + self.data + } + + /// Consume and return both data and metadata + pub fn into_parts(self) -> (ContentData, Option<ContentMetadata>) { + (self.data, self.metadata) } } @@ -121,54 +154,92 @@ mod tests { use super::*; #[test] - fn test_content_types() { - let text = Content::text("Hello"); - assert!(text.is_textual()); - assert!(!text.is_multimedia()); - assert!(!text.has_binary_data()); - assert_eq!(text.type_name(), "text"); - assert_eq!(text.format(), Some("text/plain")); - - let binary_data = Bytes::from(vec![1, 2, 3, 4]); - let binary = Content::binary(binary_data, "application/octet-stream"); - assert!(!binary.is_textual()); - assert!(!binary.is_multimedia()); - assert!(binary.has_binary_data()); - assert_eq!(binary.type_name(), "binary"); + fn test_content_creation() { + let data = ContentData::from("Hello, world!"); + let content = Content::new(data.clone()); + + assert_eq!(content.size(), 13); + assert!(content.is_text()); + assert!(content.metadata().is_none()); } #[test] - fn test_content_size_estimation() { - let text = Content::text("Hello, world!"); - assert_eq!(text.estimated_size(), 13); + fn test_content_with_metadata() { + let source = ContentSource::new(); + let data = ContentData::from_text(source, "Test content"); + let metadata = ContentMetadata::with_path(source, "test.txt"); + let content = Content::with_metadata(data, metadata); + + assert!(content.metadata().is_some()); + assert_eq!(content.file_extension(), Some("txt")); + assert_eq!(content.filename(), Some("test.txt")); + } - let binary_data = Bytes::from(vec![0; 100]); - let binary = Content::binary(binary_data, "application/octet-stream"); -
assert_eq!(binary.estimated_size(), 100); + #[test] + fn test_content_deref() { + let data = ContentData::from("Hello"); + let content = Content::new(data); - let empty = Content::Empty; - assert_eq!(empty.estimated_size(), 0); + // Test that Deref works - we can call ContentData methods directly + assert_eq!(content.size(), 5); + assert_eq!(content.as_str().unwrap(), "Hello"); } #[test] - fn test_content_data_access() { - let text_content = Content::text("Hello"); - assert_eq!(text_content.as_text(), Some("Hello")); - assert!(text_content.as_bytes().is_none()); + fn test_content_from() { + let data = ContentData::from("Test"); + let content: Content = data.into(); - let binary_data = Bytes::from(vec![1, 2, 3]); - let binary_content = Content::binary(binary_data.clone(), "test"); - assert_eq!(binary_content.as_bytes(), Some(&binary_data)); - assert!(binary_content.as_text().is_none()); + assert_eq!(content.size(), 4); + } + + #[test] + fn test_metadata_operations() { + let data = ContentData::from("Test"); + let mut content = Content::new(data); + + assert!(content.metadata().is_none()); + + let source = content.content_source(); + let metadata = ContentMetadata::with_path(source, "file.pdf"); + content.set_metadata(metadata); + + assert!(content.metadata().is_some()); + assert_eq!(content.file_extension(), Some("pdf")); + + content.clear_metadata(); + assert!(content.metadata().is_none()); + } + + #[test] + fn test_into_parts() { + let source = ContentSource::new(); + let data = ContentData::from_text(source, "Test"); + let metadata = ContentMetadata::with_path(source, "test.txt"); + let content = Content::with_metadata(data.clone(), metadata.clone()); + + let (recovered_data, recovered_metadata) = content.into_parts(); + assert_eq!(recovered_data, data); + assert_eq!(recovered_metadata, Some(metadata)); } #[test] fn test_serialization() { - let content = Content::text("Test content"); + let data = ContentData::from("Test content"); + let content = Content::new(data); let json = serde_json::to_string(&content).unwrap(); let deserialized: Content = serde_json::from_str(&json).unwrap(); assert_eq!(content, deserialized); } + + #[test] + fn test_content_source() { + let source = ContentSource::new(); + let data = ContentData::from_text(source, "Test"); + let content = Content::new(data); + + assert_eq!(content.content_source(), source); + } } diff --git a/crates/nvisy-core/src/io/content_data.rs b/crates/nvisy-core/src/io/content_data.rs index dda1542..1f08bc3 100644 --- a/crates/nvisy-core/src/io/content_data.rs +++ b/crates/nvisy-core/src/io/content_data.rs @@ -7,32 +7,157 @@ use std::fmt; use std::sync::OnceLock; use bytes::Bytes; +use hipstr::HipStr; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use crate::error::{Error, ErrorResource, ErrorType, Result}; use crate::path::ContentSource; +/// The underlying data storage type for content +/// +/// This enum allows content to be stored as either binary data (`Bytes`) +/// or text data (`HipStr`). Both types are cheap to clone as they use +/// reference counting internally. 
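+///
+/// # Example
+///
+/// A minimal sketch of the two variants, using only the conversions defined
+/// below in this file:
+///
+/// ```
+/// use bytes::Bytes;
+/// use nvisy_core::io::ContentBytes;
+///
+/// let text = ContentBytes::from("hello");
+/// let binary = ContentBytes::from(Bytes::from_static(&[0xFF, 0xFE]));
+///
+/// assert!(text.is_text());
+/// assert!(binary.is_binary());
+/// assert_eq!(text.as_str(), Some("hello"));
+/// ```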
+#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ContentBytes { + /// Binary data stored as `bytes::Bytes` + Binary(Bytes), + /// Text data stored as `hipstr::HipStr<'static>` (owned) + Text(HipStr<'static>), +} + +impl ContentBytes { + /// Get the size of the content in bytes + pub fn len(&self) -> usize { + match self { + Self::Binary(bytes) => bytes.len(), + Self::Text(text) => text.len(), + } + } + + /// Check if the content is empty + pub fn is_empty(&self) -> bool { + match self { + Self::Binary(bytes) => bytes.is_empty(), + Self::Text(text) => text.is_empty(), + } + } + + /// Get the content as a byte slice + pub fn as_bytes(&self) -> &[u8] { + match self { + Self::Binary(bytes) => bytes, + Self::Text(text) => text.as_bytes(), + } + } + + /// Check if this is text content + pub fn is_text(&self) -> bool { + matches!(self, Self::Text(_)) + } + + /// Check if this is binary content + pub fn is_binary(&self) -> bool { + matches!(self, Self::Binary(_)) + } + + /// Try to get the content as a string slice + pub fn as_str(&self) -> Option<&str> { + match self { + Self::Binary(bytes) => std::str::from_utf8(bytes).ok(), + Self::Text(text) => Some(text.as_str()), + } + } + + /// Convert to Bytes (clones if text) + pub fn to_bytes(&self) -> Bytes { + match self { + Self::Binary(bytes) => bytes.clone(), + Self::Text(text) => Bytes::copy_from_slice(text.as_bytes()), + } + } + + /// Convert to HipStr if valid UTF-8 + pub fn to_hipstr(&self) -> Result<HipStr<'static>> { + match self { + Self::Binary(bytes) => { + let s = std::str::from_utf8(bytes).map_err(|e| { + Error::new(format!("Invalid UTF-8: {e}")) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) + })?; + Ok(HipStr::from(s)) + } + Self::Text(text) => Ok(text.clone()), + } + } +} + +impl Default for ContentBytes { + fn default() -> Self { + Self::Binary(Bytes::new()) + } +} + +impl From<&str> for ContentBytes { + fn from(s: &str) -> Self { + Self::Text(HipStr::from(s)) + } +} + +impl From<String> for ContentBytes { + fn from(s: String) -> Self { + Self::Text(HipStr::from(s)) + } +} + +impl From<HipStr<'static>> for ContentBytes { + fn from(s: HipStr<'static>) -> Self { + Self::Text(s) + } +} + +impl From<&[u8]> for ContentBytes { + fn from(bytes: &[u8]) -> Self { + Self::Binary(Bytes::copy_from_slice(bytes)) + } +} + +impl From<Vec<u8>> for ContentBytes { + fn from(vec: Vec<u8>) -> Self { + Self::Binary(Bytes::from(vec)) + } +} + +impl From<Bytes> for ContentBytes { + fn from(bytes: Bytes) -> Self { + Self::Binary(bytes) + } +} + /// Content data with metadata and computed hashes /// -/// This struct is a minimal wrapper around `bytes::Bytes` that stores content data +/// This struct wraps [`ContentBytes`] (either `Bytes` or `HipStr`) and stores content data /// along with metadata about its source and optional computed SHA256 hash. -/// It's designed to be cheap to clone using the `bytes::Bytes` type. +/// It's designed to be cheap to clone using reference-counted types. /// The SHA256 hash is lazily computed using `OnceLock` for lock-free access after initialization.
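+///
+/// The hash is computed at most once and then served from the cache
+/// (a minimal sketch):
+///
+/// ```
+/// use nvisy_core::io::ContentData;
+///
+/// let content = ContentData::from("Hello, world!");
+/// let first = content.sha256();  // computed and cached here
+/// let second = content.sha256(); // returned from the cache
+/// assert_eq!(first, second);
+/// ```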
#[derive(Debug)] #[derive(Serialize, Deserialize)] pub struct ContentData { /// Unique identifier for the content source pub content_source: ContentSource, - /// The actual content data - pub content_data: Bytes, + /// The actual content data (binary or text) + data: ContentBytes, /// Lazily computed SHA256 hash of the content #[serde(skip)] - content_sha256: OnceLock<Bytes>, + sha256_cache: OnceLock<Bytes>, } impl ContentData { - /// Create new content data + /// Create new content data from bytes /// /// # Example /// @@ -46,17 +171,47 @@ impl ContentData { /// /// assert_eq!(content.size(), 13); /// ``` - pub fn new(content_source: ContentSource, content_data: Bytes) -> Self { + pub fn new(content_source: ContentSource, data: Bytes) -> Self { Self { content_source, - content_data, - content_sha256: OnceLock::new(), + data: ContentBytes::Binary(data), + sha256_cache: OnceLock::new(), + } + } + + /// Create new content data from text + /// + /// # Example + /// + /// ``` + /// use nvisy_core::{io::ContentData, path::ContentSource}; + /// + /// let source = ContentSource::new(); + /// let content = ContentData::from_text(source, "Hello, world!"); + /// + /// assert!(content.is_text()); + /// assert_eq!(content.as_str().unwrap(), "Hello, world!"); + /// ``` + pub fn from_text(content_source: ContentSource, text: impl Into<HipStr<'static>>) -> Self { + Self { + content_source, + data: ContentBytes::Text(text.into()), + sha256_cache: OnceLock::new(), + } + } + + /// Create content data with explicit content bytes type + pub fn with_content_bytes(content_source: ContentSource, data: ContentBytes) -> Self { + Self { + content_source, + data, + sha256_cache: OnceLock::new(), } } /// Get the size of the content in bytes pub fn size(&self) -> usize { - self.content_data.len() + self.data.len() } /// Get pretty formatted size string @@ -73,19 +228,45 @@ impl ContentData { /// Get the content data as bytes slice pub fn as_bytes(&self) -> &[u8] { - &self.content_data + self.data.as_bytes() + } + + /// Get the underlying content bytes + pub fn content_bytes(&self) -> &ContentBytes { + &self.data + } + + /// Convert the content data to Bytes + pub fn to_bytes(&self) -> Bytes { + self.data.to_bytes() } - /// Get the content data as bytes + /// Consume and convert into Bytes pub fn into_bytes(self) -> Bytes { - self.content_data + match self.data { + ContentBytes::Binary(bytes) => bytes, + ContentBytes::Text(text) => Bytes::copy_from_slice(text.as_bytes()), + } + } + + /// Check if the content is stored as text + pub fn is_text(&self) -> bool { + self.data.is_text() } - /// Check if the content is likely text (basic heuristic) + /// Check if the content is stored as binary + pub fn is_binary(&self) -> bool { + self.data.is_binary() + } + + /// Check if the content is likely text (basic heuristic for binary data) pub fn is_likely_text(&self) -> bool { - self.content_data - .iter() - .all(|&b| b.is_ascii_graphic() || b.is_ascii_whitespace()) + match &self.data { + ContentBytes::Text(_) => true, + ContentBytes::Binary(bytes) => bytes + .iter() + .all(|&b| b.is_ascii_graphic() || b.is_ascii_whitespace()), + } } /// Try to convert the content data to a UTF-8 string @@ -94,13 +275,14 @@ impl ContentData { /// /// Returns an error if the content data contains invalid UTF-8 sequences.
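+    ///
+    /// # Example
+    ///
+    /// A minimal sketch:
+    ///
+    /// ```
+    /// use nvisy_core::io::ContentData;
+    ///
+    /// let content = ContentData::from("Hello, world!");
+    /// assert_eq!(content.as_string().unwrap(), "Hello, world!");
+    /// ```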
pub fn as_string(&self) -> Result<String> { - String::from_utf8(self.content_data.to_vec()).map_err(|e| { - Error::new( - ErrorType::Runtime, - ErrorResource::Core, - format!("Invalid UTF-8: {e}"), - ) - }) + match &self.data { + ContentBytes::Text(text) => Ok(text.to_string()), + ContentBytes::Binary(bytes) => String::from_utf8(bytes.to_vec()).map_err(|e| { + Error::new(format!("Invalid UTF-8: {e}")) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) + }), + } } /// Try to convert the content data to a UTF-8 string slice /// /// # Errors /// /// Returns an error if the content data contains invalid UTF-8 sequences. pub fn as_str(&self) -> Result<&str> { - std::str::from_utf8(&self.content_data).map_err(|e| { - Error::new( - ErrorType::Runtime, - ErrorResource::Core, - format!("Invalid UTF-8: {e}"), - ) - }) + match &self.data { + ContentBytes::Text(text) => Ok(text.as_str()), + ContentBytes::Binary(bytes) => std::str::from_utf8(bytes).map_err(|e| { + Error::new(format!("Invalid UTF-8: {e}")) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) + }), + } } /// Compute SHA256 hash of the content fn compute_sha256_internal(&self) -> Bytes { let mut hasher = Sha256::new(); - hasher.update(&self.content_data); + hasher.update(self.data.as_bytes()); Bytes::from(hasher.finalize().to_vec()) } /// Get the SHA256 hash, computing it if not already done pub fn sha256(&self) -> &Bytes { - self.content_sha256 + self.sha256_cache .get_or_init(|| self.compute_sha256_internal()) } @@ -148,15 +331,13 @@ if actual_hash.as_ref() == expected { Ok(()) } else { - Err(Error::new( - ErrorType::Runtime, - ErrorResource::Core, - format!( - "Hash mismatch: expected {}, got {}", - hex::encode(expected), - hex::encode(actual_hash) - ), + Err(Error::new(format!( + "Hash mismatch: expected {}, got {}", + hex::encode(expected), + hex::encode(actual_hash) )) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core)) } } @@ -166,30 +347,29 @@ /// /// Returns an error if the end index is beyond the content length or if start is greater than end.
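+    ///
+    /// # Example
+    ///
+    /// A minimal sketch:
+    ///
+    /// ```
+    /// use nvisy_core::io::ContentData;
+    ///
+    /// let content = ContentData::from("Hello, world!");
+    /// assert_eq!(content.slice(0, 5).unwrap().as_ref(), b"Hello");
+    /// assert!(content.slice(0, 100).is_err()); // end exceeds content length
+    /// ```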
pub fn slice(&self, start: usize, end: usize) -> Result<Bytes> { - if end > self.content_data.len() { - return Err(Error::new( - ErrorType::Runtime, - ErrorResource::Core, - format!( - "Slice end {} exceeds content length {}", - end, - self.content_data.len() - ), - )); + let bytes = self.data.as_bytes(); + if end > bytes.len() { + return Err(Error::new(format!( + "Slice end {} exceeds content length {}", + end, + bytes.len() + )) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core)); } if start > end { - return Err(Error::new( - ErrorType::Runtime, - ErrorResource::Core, - format!("Slice start {start} is greater than end {end}"), - )); + return Err( + Error::new(format!("Slice start {start} is greater than end {end}")) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core), + ); } - Ok(self.content_data.slice(start..end)) + Ok(Bytes::copy_from_slice(&bytes[start..end])) } /// Check if the content is empty pub fn is_empty(&self) -> bool { - self.content_data.is_empty() + self.data.is_empty() } } @@ -198,14 +378,14 @@ impl Clone for ContentData { fn clone(&self) -> Self { let new_lock = OnceLock::new(); // Copy the computed hash if available - if let Some(hash) = self.content_sha256.get() { + if let Some(hash) = self.sha256_cache.get() { let _ = new_lock.set(hash.clone()); } Self { content_source: self.content_source, - content_data: self.content_data.clone(), - content_sha256: new_lock, + data: self.data.clone(), + sha256_cache: new_lock, } } } @@ -213,7 +393,7 @@ impl Clone for ContentData { // Manual implementation of PartialEq impl PartialEq for ContentData { fn eq(&self, other: &Self) -> bool { - self.content_source == other.content_source && self.content_data == other.content_data + self.content_source == other.content_source && self.data == other.data } } @@ -223,14 +403,14 @@ impl Eq for ContentData {} impl From<&str> for ContentData { fn from(s: &str) -> Self { let source = ContentSource::new(); - Self::new(source, Bytes::from(s.to_string())) + Self::from_text(source, s) } } impl From<String> for ContentData { fn from(s: String) -> Self { let source = ContentSource::new(); - Self::new(source, Bytes::from(s)) + Self::from_text(source, s) } } @@ -255,6 +435,13 @@ impl From<Bytes> for ContentData { } } +impl From<HipStr<'static>> for ContentData { + fn from(text: HipStr<'static>) -> Self { + let source = ContentSource::new(); + Self::from_text(source, text) + } +} + impl fmt::Display for ContentData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Ok(text) = self.as_str() { @@ -278,7 +465,34 @@ mod tests { assert_eq!(content.content_source, source); assert_eq!(content.size(), 13); // Check that hash is not computed yet - assert!(content.content_sha256.get().is_none()); + assert!(content.sha256_cache.get().is_none()); + } + + #[test] + fn test_content_data_from_text() { + let source = ContentSource::new(); + let content = ContentData::from_text(source, "Hello, world!"); + + assert!(content.is_text()); + assert!(!content.is_binary()); + assert_eq!(content.as_str().unwrap(), "Hello, world!"); + } + + #[test] + fn test_content_bytes_text() { + let text = ContentBytes::from("Hello"); + assert!(text.is_text()); + assert!(!text.is_binary()); + assert_eq!(text.as_str(), Some("Hello")); + assert_eq!(text.len(), 5); + } + + #[test] + fn test_content_bytes_binary() { + let binary = ContentBytes::from(vec![0xFF, 0xFE]); + assert!(binary.is_binary()); + assert!(!binary.is_text()); + assert_eq!(binary.len(), 2); } #[test] @@ -295,7 +509,7 @@ mod tests { let content =
ContentData::from("Hello, world!"); let hash = content.sha256(); - assert!(content.content_sha256.get().is_some()); + assert!(content.sha256_cache.get().is_some()); assert_eq!(hash.len(), 32); // SHA256 is 32 bytes // Test getting cached hash @@ -364,6 +578,10 @@ mod tests { assert_eq!(from_bytes.as_str().unwrap(), "test"); assert_eq!(from_vec.as_str().unwrap(), "test"); assert_eq!(from_bytes_type.as_str().unwrap(), "test"); + + // Text types should be stored as text + assert!(from_str.is_text()); + assert!(from_string.is_text()); } #[test] @@ -384,21 +602,18 @@ mod tests { let cloned = original.clone(); // Both should have the hash computed - assert!(original.content_sha256.get().is_some()); - assert!(cloned.content_sha256.get().is_some()); + assert!(original.sha256_cache.get().is_some()); + assert!(cloned.sha256_cache.get().is_some()); assert_eq!(original.sha256(), cloned.sha256()); } #[test] - fn test_cloning_shares_bytes() { + fn test_cloning_is_cheap() { let original = ContentData::from("Hello, world!"); let cloned = original.clone(); // They should be equal assert_eq!(original, cloned); - - // The underlying bytes should share the same memory - assert_eq!(original.content_data.as_ptr(), cloned.content_data.as_ptr()); } #[test] @@ -414,4 +629,23 @@ mod tests { assert!(content.is_empty()); assert_eq!(content.size(), 0); } + + #[test] + fn test_to_bytes() { + let text_content = ContentData::from_text(ContentSource::new(), "Hello"); + let bytes = text_content.to_bytes(); + assert_eq!(bytes.as_ref(), b"Hello"); + + let binary_content = ContentData::new(ContentSource::new(), Bytes::from("World")); + let bytes = binary_content.to_bytes(); + assert_eq!(bytes.as_ref(), b"World"); + } + + #[test] + fn test_from_hipstr() { + let hipstr = HipStr::from("Hello from HipStr"); + let content = ContentData::from(hipstr); + assert!(content.is_text()); + assert_eq!(content.as_str().unwrap(), "Hello from HipStr"); + } } diff --git a/crates/nvisy-core/src/io/content_read.rs b/crates/nvisy-core/src/io/content_read.rs index 3f3b61e..f889aea 100644 --- a/crates/nvisy-core/src/io/content_read.rs +++ b/crates/nvisy-core/src/io/content_read.rs @@ -44,7 +44,7 @@ pub trait AsyncContentRead: AsyncRead + Unpin + Send { let mut buffer = Vec::new(); self.read_to_end(&mut buffer).await?; - let content_data = ContentData::new(ContentSource::new(), buffer.into()); + let content_data = ContentData::new(ContentSource::new(), Bytes::from(buffer)); Ok(content_data) } } @@ -79,7 +79,7 @@ pub trait AsyncContentRead: AsyncRead + Unpin + Send { let mut buffer = Vec::new(); self.read_to_end(&mut buffer).await?; - let content_data = ContentData::new(source, buffer.into()); + let content_data = ContentData::new(source, Bytes::from(buffer)); Ok(content_data) } } @@ -137,7 +137,7 @@ pub trait AsyncContentRead: AsyncRead + Unpin + Send { total_read += bytes_read; } - let content_data = ContentData::new(ContentSource::new(), buffer.into()); + let content_data = ContentData::new(ContentSource::new(), Bytes::from(buffer)); Ok(content_data) } } @@ -225,7 +225,7 @@ pub trait AsyncContentRead: AsyncRead + Unpin + Send { } // Convert to ContentData after verification - let content_data = ContentData::new(ContentSource::new(), buffer.into()); + let content_data = ContentData::new(ContentSource::new(), Bytes::from(buffer)); Ok(content_data) } } diff --git a/crates/nvisy-core/src/io/data_reference.rs b/crates/nvisy-core/src/io/data_reference.rs index 7dc51df..f97eb1a 100644 --- a/crates/nvisy-core/src/io/data_reference.rs +++ 
b/crates/nvisy-core/src/io/data_reference.rs @@ -17,9 +17,9 @@ use crate::path::ContentSource; /// # Examples /// /// ```rust -/// use nvisy_core::io::{DataReference, Content}; +/// use nvisy_core::io::{DataReference, Content, ContentData}; /// -/// let content = Content::Text("Hello, world!".to_string()); +/// let content = Content::new(ContentData::from("Hello, world!")); /// let data_ref = DataReference::new(content) /// .with_mapping_id("line-42"); /// @@ -38,7 +38,7 @@ pub struct DataReference { mapping_id: Option<String>, /// The actual content data - content_type: Content, + content: Content, } impl DataReference { @@ -47,7 +47,7 @@ Self { source: ContentSource::new(), mapping_id: None, - content_type: content, + content, } } @@ -56,7 +56,7 @@ Self { source, mapping_id: None, - content_type: content, + content, } } @@ -79,39 +79,41 @@ /// Get a reference to the content pub fn content(&self) -> &Content { - &self.content_type + &self.content } - /// Get the content type name - pub fn content_type_name(&self) -> &'static str { - self.content_type.type_name() + /// Check if the content is text-based + pub fn is_text(&self) -> bool { + self.content.is_text() } - /// Get the estimated size of the content in bytes - pub fn estimated_size(&self) -> usize { - self.content_type.estimated_size() + /// Get the size of the content in bytes + pub fn size(&self) -> usize { + self.content.size() } } #[cfg(test)] mod tests { + use crate::io::ContentData; + use super::*; #[test] fn test_data_reference_creation() { - let content = Content::text("Hello, world!"); + let content = Content::new(ContentData::from("Hello, world!")); let data_ref = DataReference::new(content); - assert_eq!(data_ref.content_type_name(), "text"); + assert!(data_ref.is_text()); assert!(data_ref.mapping_id().is_none()); - assert_eq!(data_ref.estimated_size(), 13); + assert_eq!(data_ref.size(), 13); // Verify UUIDv7 is used assert_eq!(data_ref.source().as_uuid().get_version_num(), 7); } #[test] fn test_data_reference_with_mapping() { - let content = Content::text("Test content"); + let content = Content::new(ContentData::from("Test content")); let data_ref = DataReference::new(content).with_mapping_id("line-42"); assert_eq!(data_ref.mapping_id(), Some("line-42")); @@ -120,7 +122,7 @@ #[test] fn test_data_reference_with_source() { let source = ContentSource::new(); - let content = Content::text("Test content"); + let content = Content::new(ContentData::from("Test content")); let data_ref = DataReference::with_source(source, content); assert_eq!(data_ref.source(), source); @@ -128,7 +130,7 @@ #[test] fn test_serialization() { - let content = Content::text("Test content"); + let content = Content::new(ContentData::from("Test content")); let data_ref = DataReference::new(content).with_mapping_id("test-mapping"); let json = serde_json::to_string(&data_ref).unwrap(); diff --git a/crates/nvisy-core/src/io/mod.rs index e0f3c44..aa33482 100644 --- a/crates/nvisy-core/src/io/mod.rs +++ b/crates/nvisy-core/src/io/mod.rs @@ -20,7 +20,7 @@ mod data_reference; // Re-export core types and traits pub use content::Content; -pub use content_data::ContentData; +pub use content_data::{ContentBytes, ContentData}; pub use content_read::AsyncContentRead; pub use content_write::AsyncContentWrite; pub use data_reference::DataReference; diff --git a/crates/nvisy-core/src/lib.rs index b166bd9..4d0dd3e 100644 ---
a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -3,24 +3,6 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -//! # Nvisy Core -//! -//! Core types and enums for data categorization in the Nvisy content processing system. -//! -//! This crate provides the fundamental data classification system used throughout -//! the Nvisy ecosystem to identify and categorize different types of sensitive data, -//! as well as structured error handling. -//! -//! ## Core Types -//! -//! - [`fs::DataSensitivity`]: Sensitivity levels for risk assessment -//! - [`io::Content`]: Content types and data structures -//! - [`io::DataReference`]: Data references with source tracking -//! - [`fs::DataStructureKind`]: Classification of data structure types -//! - [`fs::ContentFile`]: File operations with content tracking -//! - [`io::ContentData`]: Container for content data with metadata -//! - [`error::Error`]: Structured error handling with source classification - pub mod error; pub mod fs; pub mod io; diff --git a/crates/nvisy-core/src/prelude.rs b/crates/nvisy-core/src/prelude.rs index f39f7e6..b369d58 100644 --- a/crates/nvisy-core/src/prelude.rs +++ b/crates/nvisy-core/src/prelude.rs @@ -6,9 +6,7 @@ // Error handling pub use crate::error::{BoxError, Error, ErrorResource, ErrorType, Result}; // File system types -pub use crate::fs::{ - ContentFile, ContentKind, ContentMetadata, DataSensitivity, DataStructureKind, -}; +pub use crate::fs::{ContentFile, ContentKind, ContentMetadata, DataSensitivity}; // I/O types pub use crate::io::{AsyncContentRead, AsyncContentWrite, Content, ContentData, DataReference}; // Path types diff --git a/crates/nvisy-document/README.md b/crates/nvisy-document/README.md index 0a793b6..9b1d780 100644 --- a/crates/nvisy-document/README.md +++ b/crates/nvisy-document/README.md @@ -1,18 +1,42 @@ # nvisy-document -Document manipulation traits and types for the Nvisy system. +Document manipulation library for VLM-driven editing workflows. -This crate provides a unified interface for working with different document -formats, enabling semantic editing operations driven by VLM (Vision Language -Model) understanding. +This crate provides a format-agnostic abstraction for document editing, +designed to support Vision Language Model (VLM) function calls for +operations like redaction, text replacement, splitting, and merging. + +## Core Concepts + +- **[`DocumentFormat`]** - A format handler that can load and create documents. + Implementations know about format capabilities and how to parse/serialize + documents. + +- **[`Document`]** - A loaded document instance for reading document content. + +- **[`EditableDocument`]** - Extension trait for documents that support editing. + +- **[`Region`]** - Semantic units within a document (text blocks, images, + tables) with stable IDs that persist across edit sessions. + +- **[`EditOperation`]** - Edit commands that target regions by ID, + supporting undo/redo and batch operations. 
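+
+A minimal sketch of how these concepts compose (the `PdfFormat` handler and
+its `load` method are illustrative placeholders, not a confirmed public API):
+
+```rust,ignore
+let format = PdfFormat::default();
+let doc = format.load(bytes).await?;
+
+// Regions carry stable IDs, so later edit operations can target them by ID.
+for region in doc.regions() {
+    println!("{:?}: {:?}", region.id, region.kind);
+}
+```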
+ +## Extension Traits + +Document implementations can optionally implement these extension traits: + +- [`Conversion`] - Convert documents to other formats +- [`Metadata`] - Extract and modify document metadata +- [`ThumbnailGenerator`] - Generate thumbnail images ## Features -- **Document Format Trait**: Common interface for PDF, DOCX, and other formats -- **Format Registry**: Register and look up formats by MIME type or extension -- **Region-based Editing**: Reference and modify document regions with stable IDs -- **Edit Operations**: Redaction, text replacement, structural changes -- **Streaming Support**: Handle large documents with pagination +- **Document Format Trait** - Common interface for PDF, DOCX, and other formats +- **Format Registry** - Register and look up formats by MIME type or extension +- **Region-based Editing** - Reference and modify document regions with stable IDs +- **Edit Operations** - Redaction, text replacement, structural changes +- **Streaming Support** - Handle large documents with pagination ## Architecture @@ -31,7 +55,7 @@ Model) understanding. ┌─────────────────┼─────────────────┐ ▼ ▼ ▼ ┌──────────┐ ┌──────────┐ ┌──────────┐ - │nvisy-pdf │ │nvisy-docx│ │nvisy-txt │ + │nvisy-pdf │ │nvisy-docx│ │nvisy-text│ └──────────┘ └──────────┘ └──────────┘ ``` diff --git a/crates/nvisy-document/src/lib.rs b/crates/nvisy-document/src/lib.rs index f85af03..34435f8 100644 --- a/crates/nvisy-document/src/lib.rs +++ b/crates/nvisy-document/src/lib.rs @@ -2,39 +2,6 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -//! # nvisy-document -//! -//! Document manipulation library for VLM-driven editing workflows. -//! -//! This crate provides a format-agnostic abstraction for document editing, -//! designed to support Vision Language Model (VLM) function calls for -//! operations like redaction, text replacement, splitting, and merging. -//! -//! ## Core Concepts -//! -//! - **[`DocumentFormat`]**: A format handler (like a class) that can load -//! and create documents. Implementations know about format capabilities -//! and how to parse/serialize documents. -//! -//! - **[`Document`]**: A loaded document instance for reading document content. -//! Think of this as an instance of a DocumentFormat. -//! -//! - **[`EditableDocument`]**: Extension trait for documents that support editing. -//! -//! - **[`Region`]**: Semantic units within a document (text blocks, images, -//! tables) with stable IDs that persist across edit sessions. -//! -//! - **[`EditOperation`]**: Edit commands that target regions by ID, -//! supporting undo/redo and batch operations. -//! -//! ## Extension Traits -//! -//! Document implementations can optionally implement these extension traits: -//! -//! - [`Conversion`]: Convert documents to other formats -//! - [`Metadata`]: Extract and modify document metadata -//! 
- [`ThumbnailGenerator`]: Generate thumbnail images - // Core modules pub mod error; pub mod format; @@ -43,6 +10,8 @@ pub mod operation; // Extension trait modules pub mod conversion; pub mod metadata; +pub mod table; +pub mod text; pub mod thumbnail; // Error re-exports @@ -73,5 +42,9 @@ pub use operation::{ MergeOrder, MetadataOperation, PageOperation, RedactStyle, SplitBoundary, StructuralOperation, TextStyle, }; +// Table re-exports +pub use table::{CellDataType, NormalizedCell, NormalizedRow, NormalizedTable, TableExtractor}; +// Text re-exports +pub use text::{ExtractedText, TextExtractor}; // Thumbnail re-exports pub use thumbnail::{ImageFormat, Thumbnail, ThumbnailGenerator, ThumbnailOptions, ThumbnailSize}; diff --git a/crates/nvisy-document/src/table/mod.rs new file mode 100644 index 0000000..44b24f9 --- /dev/null +++ b/crates/nvisy-document/src/table/mod.rs @@ -0,0 +1,86 @@ +//! Table extraction and normalization traits and types. +//! +//! This module defines the [`TableExtractor`] trait for extracting and +//! normalizing tables from documents. + +mod types; + +use async_trait::async_trait; +pub use types::{CellDataType, NormalizedCell, NormalizedRow, NormalizedTable}; + +use crate::error::Result; +use crate::format::{Document, Region, RegionKind}; + +/// Trait for document table extraction and normalization. +/// +/// This trait is implemented by [`Document`] types that contain tabular data. +/// Tables are extracted as [`NormalizedTable`] structures with consistent +/// representation across formats. +/// +/// # Example +/// +/// ```ignore +/// use nvisy_document::{Document, TableExtractor, NormalizedTable}; +/// +/// async fn process_tables<D>(doc: &D) -> Result<Vec<NormalizedTable>> +/// where +/// D: TableExtractor, +/// { +/// let tables = doc.extract_tables().await?; +/// +/// for table in &tables { +/// println!("Table with {} rows, {} columns", +/// table.row_count(), +/// table.column_count +/// ); +/// +/// if table.has_header { +/// println!("Headers: {:?}", table.column_headers()); +/// } +/// } +/// +/// Ok(tables) +/// } +/// ``` +#[async_trait] +pub trait TableExtractor: Document { + /// Extracts and normalizes all tables from the document. + /// + /// Tables are identified from the document's regions and converted + /// to a normalized format with: + /// - Consistent cell structure + /// - Resolved merged cells + /// - Inferred column types + /// - Detected headers + async fn extract_tables(&self) -> Result<Vec<NormalizedTable>>; + /// Extracts and normalizes a specific table by its region ID. + /// + /// # Arguments + /// + /// * `region_id` - The ID of the table region + /// + /// # Returns + /// + /// The normalized table, or `None` if the region is not a table. + async fn extract_table( + &self, + region_id: crate::format::RegionId, + ) -> Result<Option<NormalizedTable>>; + /// Returns the table regions in the document without normalizing them. + /// + /// This is a quick way to check how many tables exist without + /// performing full extraction. + fn table_regions(&self) -> Vec<&Region> { + self.regions() + .iter() + .filter(|r| r.kind == RegionKind::Table) + .collect() + } + + /// Returns the number of tables in the document. + fn table_count(&self) -> usize { + self.table_regions().len() + } +} diff --git a/crates/nvisy-document/src/table/types.rs new file mode 100644 index 0000000..f5d4594 --- /dev/null +++ b/crates/nvisy-document/src/table/types.rs @@ -0,0 +1,446 @@ +//! Table extraction and normalization types.
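+//!
+//! Building a table by hand, mirroring the unit tests below (a minimal sketch):
+//!
+//! ```ignore
+//! use nvisy_document::{NormalizedRow, NormalizedTable};
+//! use nvisy_document::format::RegionId;
+//!
+//! let mut table = NormalizedTable::new(RegionId::new()).with_column_count(2);
+//! let mut row = NormalizedRow::new();
+//! row.add_text("Alice");
+//! row.add_text("30");
+//! table.add_row(row);
+//! assert_eq!(table.row_count(), 1);
+//! ```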
+ +use serde::{Deserialize, Serialize}; + +use crate::format::RegionId; + +/// A normalized table structure. +/// +/// Tables from different formats (PDF, DOCX, XLSX, HTML) are converted +/// to this common representation for consistent processing. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NormalizedTable { + /// Reference to the table's region in the document. + pub id: RegionId, + + /// Table rows. + pub rows: Vec<NormalizedRow>, + + /// Total number of columns. + pub column_count: usize, + + /// Whether the table has a header row. + pub has_header: bool, + + /// Number of header rows (0 if no header). + pub header_row_count: usize, + + /// Optional table caption or title. + pub caption: Option<String>, +} + +impl NormalizedTable { + /// Creates a new normalized table. + #[must_use] + pub fn new(id: RegionId) -> Self { + Self { + id, + rows: Vec::new(), + column_count: 0, + has_header: false, + header_row_count: 0, + caption: None, + } + } + + /// Sets the column count. + #[must_use] + pub fn with_column_count(mut self, count: usize) -> Self { + self.column_count = count; + self + } + + /// Sets the header row count. + #[must_use] + pub fn with_header_rows(mut self, count: usize) -> Self { + self.header_row_count = count; + self.has_header = count > 0; + self + } + + /// Sets the caption. + #[must_use] + pub fn with_caption(mut self, caption: impl Into<String>) -> Self { + self.caption = Some(caption.into()); + self + } + + /// Adds a row to the table. + pub fn add_row(&mut self, row: NormalizedRow) { + self.rows.push(row); + } + + /// Returns the number of rows. + #[must_use] + pub fn row_count(&self) -> usize { + self.rows.len() + } + + /// Returns the header rows. + #[must_use] + pub fn header_rows(&self) -> &[NormalizedRow] { + &self.rows[..self.header_row_count.min(self.rows.len())] + } + + /// Returns the data rows (non-header). + #[must_use] + pub fn data_rows(&self) -> &[NormalizedRow] { + let start = self.header_row_count.min(self.rows.len()); + &self.rows[start..] + } + + /// Returns a cell at the given position. + #[must_use] + pub fn cell(&self, row: usize, col: usize) -> Option<&NormalizedCell> { + self.rows.get(row).and_then(|r| r.cells.get(col)) + } + + /// Returns the column headers as strings. + #[must_use] + pub fn column_headers(&self) -> Vec<&str> { + if !self.has_header || self.rows.is_empty() { + return Vec::new(); + } + self.rows[0].cells.iter().map(|c| c.text.as_str()).collect() + } +} + +/// A row within a normalized table. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NormalizedRow { + /// Cells in this row. + pub cells: Vec<NormalizedCell>, + + /// Whether this is a header row. + pub is_header: bool, +} + +impl NormalizedRow { + /// Creates a new row. + #[must_use] + pub fn new() -> Self { + Self { + cells: Vec::new(), + is_header: false, + } + } + + /// Creates a new header row. + #[must_use] + pub fn header() -> Self { + Self { + cells: Vec::new(), + is_header: true, + } + } + + /// Adds a cell to the row. + pub fn add_cell(&mut self, cell: NormalizedCell) { + self.cells.push(cell); + } + + /// Adds a text cell to the row. + pub fn add_text(&mut self, text: impl Into<String>) { + self.cells.push(NormalizedCell::text(text)); + } + + /// Returns the number of cells. + #[must_use] + pub fn cell_count(&self) -> usize { + self.cells.len() + } +} + +impl Default for NormalizedRow { + fn default() -> Self { + Self::new() + } +} + +/// A cell within a normalized table. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct NormalizedCell { + /// Text content of the cell.
+ pub text: String, + + /// Number of columns this cell spans. + pub col_span: usize, + + /// Number of rows this cell spans. + pub row_span: usize, + + /// Inferred data type of the cell content. + pub data_type: CellDataType, + + /// Whether this cell is a continuation of a merged cell. + /// + /// True for cells that are covered by a cell with col_span > 1 or row_span > 1. + pub is_merged_continuation: bool, +} + +impl NormalizedCell { + /// Creates a new cell with text content. + #[must_use] + pub fn new(text: impl Into<String>) -> Self { + let text = text.into(); + let data_type = CellDataType::infer(&text); + Self { + text, + col_span: 1, + row_span: 1, + data_type, + is_merged_continuation: false, + } + } + + /// Creates a text cell. + #[must_use] + pub fn text(text: impl Into<String>) -> Self { + Self::new(text) + } + + /// Creates an empty cell. + #[must_use] + pub fn empty() -> Self { + Self { + text: String::new(), + col_span: 1, + row_span: 1, + data_type: CellDataType::Empty, + is_merged_continuation: false, + } + } + + /// Creates a merged continuation cell. + #[must_use] + pub fn merged_continuation() -> Self { + Self { + text: String::new(), + col_span: 1, + row_span: 1, + data_type: CellDataType::Empty, + is_merged_continuation: true, + } + } + + /// Sets the column span. + #[must_use] + pub fn with_col_span(mut self, span: usize) -> Self { + self.col_span = span; + self + } + + /// Sets the row span. + #[must_use] + pub fn with_row_span(mut self, span: usize) -> Self { + self.row_span = span; + self + } + + /// Sets the data type explicitly. + #[must_use] + pub fn with_data_type(mut self, data_type: CellDataType) -> Self { + self.data_type = data_type; + self + } + + /// Returns whether the cell is empty. + #[must_use] + pub fn is_empty(&self) -> bool { + self.text.is_empty() || self.data_type == CellDataType::Empty + } + + /// Returns whether the cell spans multiple columns or rows. + #[must_use] + pub fn is_merged(&self) -> bool { + self.col_span > 1 || self.row_span > 1 + } +} + +/// Inferred data type of cell content. +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Hash, + Default, + Serialize, + Deserialize +)] +#[serde(rename_all = "snake_case")] +pub enum CellDataType { + /// Plain text content. + #[default] + Text, + + /// Numeric value (integer or float). + Number, + + /// Date value. + Date, + + /// Date and time value. + DateTime, + + /// Boolean value. + Boolean, + + /// Formula (spreadsheet). + Formula, + + /// Empty cell. + Empty, +} + +impl CellDataType { + /// Infers the data type from a string value.
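+    ///
+    /// # Example
+    ///
+    /// A few representative inferences, mirroring the unit tests below:
+    ///
+    /// ```
+    /// use nvisy_document::CellDataType;
+    ///
+    /// assert_eq!(CellDataType::infer("123.45"), CellDataType::Number);
+    /// assert_eq!(CellDataType::infer("2024-01-15"), CellDataType::Date);
+    /// assert_eq!(CellDataType::infer("=SUM(A1:A10)"), CellDataType::Formula);
+    /// assert_eq!(CellDataType::infer(""), CellDataType::Empty);
+    /// ```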
+    #[must_use]
+    pub fn infer(text: &str) -> Self {
+        let trimmed = text.trim();
+
+        if trimmed.is_empty() {
+            return Self::Empty;
+        }
+
+        // Check for boolean
+        match trimmed.to_lowercase().as_str() {
+            "true" | "false" | "yes" | "no" => return Self::Boolean,
+            _ => {}
+        }
+
+        // Check for formula (starts with =)
+        if trimmed.starts_with('=') {
+            return Self::Formula;
+        }
+
+        // Check for number
+        if Self::is_number(trimmed) {
+            return Self::Number;
+        }
+
+        // Check for date patterns
+        if Self::is_date(trimmed) {
+            return Self::Date;
+        }
+
+        if Self::is_datetime(trimmed) {
+            return Self::DateTime;
+        }
+
+        Self::Text
+    }
+
+    fn is_number(s: &str) -> bool {
+        // Remove common number formatting
+        let cleaned: String = s
+            .chars()
+            .filter(|c| *c != ',' && *c != ' ' && *c != '$' && *c != '€' && *c != '%')
+            .collect();
+
+        cleaned.parse::<f64>().is_ok()
+    }
+
+    fn is_date(s: &str) -> bool {
+        // Simple date pattern detection without regex
+        // Matches: YYYY-MM-DD, MM/DD/YYYY, DD.MM.YYYY, DD-MM-YYYY
+        let chars: Vec<char> = s.chars().collect();
+
+        // Check for ISO format: YYYY-MM-DD (10 chars)
+        if chars.len() == 10 {
+            let is_iso = chars[0..4].iter().all(|c| c.is_ascii_digit())
+                && chars[4] == '-'
+                && chars[5..7].iter().all(|c| c.is_ascii_digit())
+                && chars[7] == '-'
+                && chars[8..10].iter().all(|c| c.is_ascii_digit());
+
+            if is_iso {
+                return true;
+            }
+
+            // Check for other formats: XX/XX/XXXX, XX.XX.XXXX, XX-XX-XXXX
+            let sep = chars[2];
+            if (sep == '/' || sep == '.' || sep == '-')
+                && chars[5] == sep
+                && chars[0..2].iter().all(|c| c.is_ascii_digit())
+                && chars[3..5].iter().all(|c| c.is_ascii_digit())
+                && chars[6..10].iter().all(|c| c.is_ascii_digit())
+            {
+                return true;
+            }
+        }
+
+        false
+    }
+
+    fn is_datetime(s: &str) -> bool {
+        // Contains date-like pattern and time-like pattern
+        s.contains(':') && (s.contains('-') || s.contains('/'))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_normalized_table() {
+        let id = RegionId::new();
+        let mut table = NormalizedTable::new(id)
+            .with_column_count(3)
+            .with_header_rows(1);
+
+        let mut header = NormalizedRow::header();
+        header.add_text("Name");
+        header.add_text("Age");
+        header.add_text("City");
+        table.add_row(header);
+
+        let mut row1 = NormalizedRow::new();
+        row1.add_text("Alice");
+        row1.add_text("30");
+        row1.add_text("NYC");
+        table.add_row(row1);
+
+        assert_eq!(table.row_count(), 2);
+        assert_eq!(table.column_count, 3);
+        assert!(table.has_header);
+        assert_eq!(table.header_rows().len(), 1);
+        assert_eq!(table.data_rows().len(), 1);
+        assert_eq!(table.column_headers(), vec!["Name", "Age", "City"]);
+    }
+
+    #[test]
+    fn test_cell_data_type_inference() {
+        assert_eq!(CellDataType::infer(""), CellDataType::Empty);
+        assert_eq!(CellDataType::infer("   "), CellDataType::Empty);
+        assert_eq!(CellDataType::infer("Hello"), CellDataType::Text);
+        assert_eq!(CellDataType::infer("123"), CellDataType::Number);
+        assert_eq!(CellDataType::infer("123.45"), CellDataType::Number);
+        assert_eq!(CellDataType::infer("$1,234.56"), CellDataType::Number);
+        assert_eq!(CellDataType::infer("true"), CellDataType::Boolean);
+        assert_eq!(CellDataType::infer("YES"), CellDataType::Boolean);
+        assert_eq!(CellDataType::infer("=SUM(A1:A10)"), CellDataType::Formula);
+        assert_eq!(CellDataType::infer("2024-01-15"), CellDataType::Date);
+        assert_eq!(CellDataType::infer("01/15/2024"), CellDataType::Date);
+    }
+
+    #[test]
+    fn test_merged_cell() {
+        let cell = NormalizedCell::text("Merged")
+            .with_col_span(2)
+            .with_row_span(3);
+
+        assert!(cell.is_merged());
+        assert_eq!(cell.col_span, 2);
+        assert_eq!(cell.row_span, 3);
+    }
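+
+    // Illustrative sketch (hypothetical test, not from the original patch):
+    // exercises the builder flow end-to-end with a merged header cell.
+    #[test]
+    fn test_merged_header_sketch() {
+        let mut table = NormalizedTable::new(RegionId::new()).with_column_count(2);
+
+        let mut header = NormalizedRow::header();
+        header.add_cell(NormalizedCell::text("Q1 / Q2").with_col_span(2));
+        header.add_cell(NormalizedCell::merged_continuation());
+        table.add_row(header);
+
+        assert_eq!(table.rows[0].cell_count(), 2);
+        assert!(table.rows[0].cells[1].is_merged_continuation);
+    }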
+
+    #[test]
+    fn test_merged_continuation() {
+        let cell = NormalizedCell::merged_continuation();
+        assert!(cell.is_merged_continuation);
+        assert!(cell.is_empty());
+    }
+}
diff --git a/crates/nvisy-document/src/text/mod.rs b/crates/nvisy-document/src/text/mod.rs
new file mode 100644
index 0000000..cfcdc25
--- /dev/null
+++ b/crates/nvisy-document/src/text/mod.rs
@@ -0,0 +1,69 @@
+//! Text extraction traits and types.
+//!
+//! This module defines the [`TextExtractor`] trait for extracting text
+//! content from documents.
+
+mod types;
+
+use async_trait::async_trait;
+pub use types::ExtractedText;
+
+use crate::error::Result;
+use crate::format::Document;
+
+/// Trait for document text extraction.
+///
+/// This trait is implemented by [`Document`] types that support extracting
+/// native text content. Documents that are image-based (scanned PDFs, images)
+/// should not implement this trait; they require OCR, which is handled externally.
+///
+/// # Example
+///
+/// ```ignore
+/// use nvisy_document::{Document, TextExtractor, ExtractedText};
+///
+/// async fn extract_document_text<D>(doc: &D) -> Result<ExtractedText>
+/// where
+///     D: TextExtractor,
+/// {
+///     let text = doc.extract_text().await?;
+///
+///     if text.needs_ocr {
+///         println!("Document may need OCR for complete extraction");
+///     }
+///
+///     println!("Extracted {} words", text.word_count());
+///     Ok(text)
+/// }
+/// ```
+#[async_trait]
+pub trait TextExtractor: Document {
+    /// Extracts all text from the document.
+    ///
+    /// Returns [`ExtractedText`] containing:
+    /// - Raw concatenated text
+    /// - Text organized by page
+    /// - Text mapped to regions
+    /// - Whether OCR might be needed for complete extraction
+    async fn extract_text(&self) -> Result<ExtractedText>;
+
+    /// Extracts text from a specific page.
+    ///
+    /// # Arguments
+    ///
+    /// * `page` - The page number (1-indexed)
+    ///
+    /// # Returns
+    ///
+    /// The text content of the page, or `None` if the page doesn't exist.
+    async fn extract_text_for_page(&self, page: u32) -> Result<Option<String>>;
+
+    /// Returns whether this document likely needs OCR for text extraction.
+    ///
+    /// This is a quick heuristic check without performing full extraction.
+    /// Returns `true` if:
+    /// - Document appears to be a scanned image
+    /// - Document has no extractable text layer
+    /// - Document is an image format
+    fn needs_ocr(&self) -> bool;
+}
diff --git a/crates/nvisy-document/src/text/types.rs b/crates/nvisy-document/src/text/types.rs
new file mode 100644
index 0000000..30039e4
--- /dev/null
+++ b/crates/nvisy-document/src/text/types.rs
@@ -0,0 +1,162 @@
+//! Text extraction types.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use crate::format::RegionId;
+
+/// Result of text extraction from a document.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct ExtractedText {
+    /// Full document text concatenated.
+    pub raw: String,
+
+    /// Text grouped by page number (1-indexed).
+    pub by_page: HashMap<u32, String>,
+
+    /// Text mapped to region IDs.
+    pub by_region: HashMap<RegionId, String>,
+
+    /// Whether the document likely needs OCR for full text extraction.
+    ///
+    /// True if:
+    /// - Document appears to be scanned (images with no text layer)
+    /// - Text extraction yielded very little content relative to page count
+    /// - Document contains primarily images
+    pub needs_ocr: bool,
+
+    /// Extraction warnings or issues encountered.
+    pub warnings: Vec<String>,
+}
+
+impl ExtractedText {
+    /// Creates a new empty extracted text result.
+    #[must_use]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Creates an extracted text result indicating OCR is needed.
+    #[must_use]
+    pub fn needs_ocr() -> Self {
+        Self {
+            needs_ocr: true,
+            ..Default::default()
+        }
+    }
+
+    /// Creates an extracted text result from raw text.
+    #[must_use]
+    pub fn from_raw(text: impl Into<String>) -> Self {
+        Self {
+            raw: text.into(),
+            ..Default::default()
+        }
+    }
+
+    /// Sets the raw text.
+    #[must_use]
+    pub fn with_raw(mut self, text: impl Into<String>) -> Self {
+        self.raw = text.into();
+        self
+    }
+
+    /// Adds text for a specific page.
+    #[must_use]
+    pub fn with_page(mut self, page: u32, text: impl Into<String>) -> Self {
+        self.by_page.insert(page, text.into());
+        self
+    }
+
+    /// Adds text for a specific region.
+    #[must_use]
+    pub fn with_region(mut self, region_id: RegionId, text: impl Into<String>) -> Self {
+        self.by_region.insert(region_id, text.into());
+        self
+    }
+
+    /// Adds a warning message.
+    #[must_use]
+    pub fn with_warning(mut self, warning: impl Into<String>) -> Self {
+        self.warnings.push(warning.into());
+        self
+    }
+
+    /// Returns the text for a specific page.
+    #[must_use]
+    pub fn page_text(&self, page: u32) -> Option<&str> {
+        self.by_page.get(&page).map(String::as_str)
+    }
+
+    /// Returns the text for a specific region.
+    #[must_use]
+    pub fn region_text(&self, region_id: RegionId) -> Option<&str> {
+        self.by_region.get(&region_id).map(String::as_str)
+    }
+
+    /// Returns the total character count.
+    #[must_use]
+    pub fn char_count(&self) -> usize {
+        self.raw.chars().count()
+    }
+
+    /// Returns an approximate word count.
+    #[must_use]
+    pub fn word_count(&self) -> usize {
+        self.raw.split_whitespace().count()
+    }
+
+    /// Returns whether any text was extracted.
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.raw.is_empty()
+    }
+
+    /// Returns the number of pages with extracted text.
+    #[must_use]
+    pub fn page_count(&self) -> usize {
+        self.by_page.len()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extracted_text_builder() {
+        let region_id = RegionId::new();
+        let text = ExtractedText::new()
+            .with_raw("Hello world")
+            .with_page(1, "Hello")
+            .with_page(2, "world")
+            .with_region(region_id, "Hello");
+
+        assert_eq!(text.raw, "Hello world");
+        assert_eq!(text.page_text(1), Some("Hello"));
+        assert_eq!(text.page_text(2), Some("world"));
+        assert_eq!(text.region_text(region_id), Some("Hello"));
+        assert!(!text.needs_ocr);
+    }
+
+    #[test]
+    fn test_needs_ocr() {
+        let text = ExtractedText::needs_ocr();
+        assert!(text.needs_ocr);
+        assert!(text.is_empty());
+    }
+
+    #[test]
+    fn test_word_count() {
+        let text = ExtractedText::from_raw("Hello world, this is a test.");
+        assert_eq!(text.word_count(), 6);
+    }
+
+    #[test]
+    fn test_from_raw() {
+        let text = ExtractedText::from_raw("Simple text");
+        assert_eq!(text.raw, "Simple text");
+        assert!(text.by_page.is_empty());
+    }
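+
+    // Illustrative sketch (hypothetical test, not from the original patch):
+    // warnings travel together with the needs_ocr flag on the result.
+    #[test]
+    fn test_needs_ocr_with_warning_sketch() {
+        let text = ExtractedText::needs_ocr()
+            .with_warning("no text layer found on pages 1-3");
+        assert!(text.needs_ocr);
+        assert_eq!(text.warnings.len(), 1);
+    }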
+}
diff --git a/crates/nvisy-docx/src/lib.rs b/crates/nvisy-docx/src/lib.rs
index 40b31c4..0a0c451 100644
--- a/crates/nvisy-docx/src/lib.rs
+++ b/crates/nvisy-docx/src/lib.rs
@@ -1,20 +1,6 @@
-//! DOCX document format support for nvisy.
-//!
-//! This crate provides a `DocumentFormat` implementation for Microsoft Word
-//! DOCX files (.docx).
-//!
-//! # Example
-//!
-//! ```ignore
-//! use nvisy_docx::DocxFormat;
-//! use nvisy_engine::Engine;
-//!
-//! let engine = Engine::new();
-//! let doc = engine.load_docx(data).await?;
-//! ```
-
 #![forbid(unsafe_code)]
 #![cfg_attr(docsrs, feature(doc_cfg))]
+#![doc = include_str!("../README.md")]
 
 mod document;
 mod format;
diff --git a/crates/nvisy-pdf/src/lib.rs b/crates/nvisy-pdf/src/lib.rs
index 5011638..4b72b88 100644
--- a/crates/nvisy-pdf/src/lib.rs
+++ b/crates/nvisy-pdf/src/lib.rs
@@ -1,18 +1,6 @@
-//! PDF document format support for nvisy.
-//!
-//! This crate provides a `DocumentFormat` implementation for PDF files (.pdf).
-//!
-//! # Example
-//!
-//! ```ignore
-//! use nvisy_pdf::PdfFormat;
-//! use nvisy_engine::Engine;
-//!
-//! let engine = Engine::new().with_pdf(PdfFormat::new());
-//! ```
-
 #![forbid(unsafe_code)]
 #![cfg_attr(docsrs, feature(doc_cfg))]
+#![doc = include_str!("../README.md")]
 
 mod document;
 mod format;
diff --git a/crates/nvisy-text/src/lib.rs b/crates/nvisy-text/src/lib.rs
index 5c5f5c4..b8b6981 100644
--- a/crates/nvisy-text/src/lib.rs
+++ b/crates/nvisy-text/src/lib.rs
@@ -1,19 +1,6 @@
-//! Plain text document format support for nvisy.
-//!
-//! This crate provides a `DocumentFormat` implementation for plain text
-//! files (.txt, .md, .rst, etc.).
-//!
-//! # Example
-//!
-//! ```ignore
-//! use nvisy_text::TextFormat;
-//! use nvisy_engine::Engine;
-//!
-//! let engine = Engine::new().with_text(TextFormat::new());
-//! ```
-
 #![forbid(unsafe_code)]
 #![cfg_attr(docsrs, feature(doc_cfg))]
+#![doc = include_str!("../README.md")]
 
 mod document;
 mod format;
diff --git a/docs/DATATYPES.md b/docs/DATATYPES.md
new file mode 100644
index 0000000..f3d5954
--- /dev/null
+++ b/docs/DATATYPES.md
@@ -0,0 +1,445 @@
+# Data Types
+
+This document defines the core data structures used throughout the processing pipeline.
+
+---
+
+## Format Detection
+
+### FormatResult
+
+Result of format detection and validation.
+
+```
+FormatResult {
+    format: Format              # Detected file format
+    mime_type: String           # MIME type (e.g., "application/pdf")
+    extension_matches: Bool     # True if claimed extension matches content
+    is_valid: Bool              # True if file passed integrity check
+    errors: List<String>        # Validation errors, if any
+}
+```
+
+### Format
+
+Enumeration of supported file formats.
+
+```
+Format =
+    # Documents
+    | PDF
+    | DOCX
+    | DOC
+    | RTF
+    | ODT
+
+    # Spreadsheets
+    | XLSX
+    | XLS
+    | CSV
+    | ODS
+
+    # Text
+    | TEXT
+    | MARKDOWN
+    | JSON
+    | XML
+    | HTML
+    | CODE
+
+    # Images
+    | PNG
+    | JPEG
+    | WEBP
+    | GIF
+    | TIFF
+    | BMP
+
+    # Archives
+    | ZIP
+    | TAR
+    | GZIP
+    | SEVENZ
+    | RAR
+
+    | UNKNOWN
+```
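+
+A minimal consumer sketch in Rust (the struct mirrors the pseudocode above;
+none of these names are published nvisy APIs):
+
+```rust
+struct FormatResult {
+    extension_matches: bool,
+    is_valid: bool,
+    errors: Vec<String>,
+}
+
+fn should_quarantine(result: &FormatResult) -> bool {
+    // Reject files that failed the integrity check or whose claimed
+    // extension disagrees with the detected magic bytes.
+    !result.is_valid || !result.extension_matches
+}
+```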
+
+---
+
+## Metadata
+
+### FileMetadata
+
+Document properties and embedded metadata.
+
+```
+FileMetadata {
+    # Basic properties
+    title: String?
+    author: String?
+    subject: String?
+    keywords: List<String>
+
+    # Timestamps
+    created_at: DateTime?
+    modified_at: DateTime?
+
+    # Document-specific
+    page_count: Int?
+    word_count: Int?
+    character_count: Int?
+
+    # Image-specific
+    dimensions: Dimensions?
+    color_space: String?
+    bit_depth: Int?
+
+    # Media-specific
+    duration: Duration?
+
+    # Embedded metadata
+    exif: Map<String, String>?
+    xmp: Map<String, String>?
+
+    # Archive-specific
+    entry_count: Int?
+    total_uncompressed_size: Int?
+}
+```
+
+### Dimensions
+
+Width and height in pixels.
+
+```
+Dimensions {
+    width: Int
+    height: Int
+}
+```
+
+---
+
+## Regions
+
+### Region
+
+A semantic segment of a document with position information.
+
+```
+Region {
+    id: RegionId              # Unique identifier (UUID)
+    page: Int?                # Page number (1-indexed), if applicable
+    bounds: BoundingBox       # Position in normalized coordinates
+    text: String?             # Text content, if extractable
+    kind: RegionKind          # Semantic type
+    status: RegionStatus?     # Processing status
+    source: RegionSource      # How region was identified
+    parent: RegionId?         # Parent region, if nested
+    children: List<RegionId>  # Child regions, if container
+}
+```
+
+### RegionId
+
+Stable unique identifier for a region.
+
+```
+RegionId = UUID
+
+# Display format: "region_" + first 8 chars of UUID
+# Example: "region_a1b2c3d4"
+```
+
+### RegionKind
+
+Classification of region by semantic type.
+
+```
+RegionKind =
+    | Text        # Paragraphs, sentences
+    | Heading     # Titles, section headers
+    | Table       # Tabular data container
+    | TableRow    # Row within table
+    | TableCell   # Cell within row
+    | Image       # Embedded graphics
+    | List        # Bulleted/numbered list
+    | ListItem    # Item within list
+    | Header      # Page header
+    | Footer      # Page footer
+    | Footnote    # Footnotes/endnotes
+    | Code        # Code blocks
+    | Quote       # Block quotes
+    | Formula     # Math equations
+    | Link        # Hyperlinks
+    | FormField   # Interactive form elements
+    | Annotation  # Comments, annotations
+    | Unknown     # Unclassified content
+```
+
+### RegionSource
+
+How the region was identified.
+
+```
+RegionSource =
+    | Parser  # Extracted by document parser
+    | Layout  # Detected by layout analysis
+    | OCR     # Identified by OCR
+    | User    # Manually defined by user
+    | VLM     # Identified by vision-language model
+```
+
+### RegionStatus
+
+Current state of the region in an editing session.
+
+```
+RegionStatus =
+    | Active    # Normal, editable state
+    | Modified  # Content has been changed
+    | Deleted   # Marked for deletion
+    | Locked    # Cannot be modified
+```
+
+### BoundingBox
+
+Position in normalized coordinates (0.0 to 1.0 relative to page/container).
+
+```
+BoundingBox {
+    x: Float       # Left edge (0.0 = left of page)
+    y: Float       # Top edge (0.0 = top of page)
+    width: Float   # Width as fraction of page width
+    height: Float  # Height as fraction of page height
+}
+
+# Coordinate system:
+#   Origin (0,0) is top-left
+#   X increases left to right
+#   Y increases top to bottom
+#   Full page = {x: 0, y: 0, width: 1, height: 1}
+```
+
+### Point
+
+A 2D coordinate.
+
+```
+Point {
+    x: Float
+    y: Float
+}
+```
+
+---
+
+## Tables
+
+### NormalizedTable
+
+Standardized table structure across all formats.
+
+```
+NormalizedTable {
+    id: RegionId      # Reference to table region
+    rows: List<NormalizedRow>
+    column_count: Int
+    has_header: Bool
+    header_row_count: Int
+    caption: String?  # Optional table caption or title
+}
+```
+
+### NormalizedRow
+
+A row within a normalized table.
+
+```
+NormalizedRow {
+    cells: List<NormalizedCell>
+    is_header: Bool
+}
+```
+
+### NormalizedCell
+
+A cell within a normalized table.
+
+```
+NormalizedCell {
+    text: String
+    col_span: Int  # Number of columns this cell spans
+    row_span: Int  # Number of rows this cell spans
+    data_type: CellDataType
+    is_merged_continuation: Bool  # True if this is a continuation of a merged cell
+}
+```
+
+### CellDataType
+
+Inferred data type of cell content.
+
+```
+CellDataType =
+    | Text
+    | Number
+    | Date
+    | DateTime
+    | Boolean
+    | Formula
+    | Empty
+```
+
+---
+
+## Text Extraction
+
+### ExtractedText
+
+Result of native text extraction.
+
+```
+ExtractedText {
+    raw: String                       # All text concatenated
+    by_page: Map<Int, String>         # Text grouped by page number
+    by_region: Map<RegionId, String>  # Text mapped to regions
+    needs_ocr: Bool                   # True if document appears scanned
+    warnings: List<String>            # Extraction warnings, if any
+}
+```
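+
+A sketch of the "very little text relative to page count" signal that drives
+`needs_ocr` (the threshold here is illustrative, not from the codebase):
+
+```rust
+fn likely_needs_ocr(raw: &str, page_count: u32) -> bool {
+    let pages = page_count.max(1) as usize;
+    // Scanned documents typically yield almost no native text per page.
+    raw.chars().count() / pages < 64
+}
+```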
+
+---
+
+## Archives
+
+### ArchiveContents
+
+Contents of an unpacked archive.
+
+```
+ArchiveContents {
+    entries: List<ArchiveEntry>
+    total_size: Int       # Total uncompressed size in bytes
+    compressed_size: Int  # Compressed size in bytes
+}
+```
+
+### ArchiveEntry
+
+A single entry within an archive.
+
+```
+ArchiveEntry {
+    path: String               # Relative path within archive
+    size: Int                  # Uncompressed size in bytes
+    is_directory: Bool
+    content_kind: ContentKind  # Detected content type
+    nested: ArchiveContents?   # If entry is itself an archive
+}
+```
+
+### ContentKind
+
+High-level content classification.
+
+```
+ContentKind =
+    | Text
+    | Document
+    | Spreadsheet
+    | Image
+    | Archive
+    | Unknown
+```
+
+---
+
+## Thumbnails
+
+### Thumbnail
+
+A generated preview image.
+
+```
+Thumbnail {
+    data: Bytes  # Encoded image data
+    width: Int
+    height: Int
+    format: ImageFormat
+    page: Int?   # Page number for multi-page documents
+}
+```
+
+### ImageFormat
+
+Output format for thumbnails.
+
+```
+ImageFormat =
+    | PNG
+    | JPEG
+    | WEBP
+```
+
+### ThumbnailConfig
+
+Configuration for thumbnail generation.
+
+```
+ThumbnailConfig {
+    max_width: Int   # Maximum width in pixels
+    max_height: Int  # Maximum height in pixels
+    format: ImageFormat
+    quality: Int     # 1-100, for lossy formats
+    pages: PageSelection
+}
+```
+
+### PageSelection
+
+Which pages to generate thumbnails for.
+
+```
+PageSelection =
+    | First              # Only first page
+    | Range(start, end)  # Specific page range
+    | All                # All pages
+```
+
+---
+
+## Processing Result
+
+### ProcessingResult
+
+Complete result of the file processing pipeline.
+
+```
+ProcessingResult {
+    format: Result<FormatResult>
+    metadata: Result<FileMetadata>?
+    regions: Result<List<Region>>?
+    text: Result<ExtractedText>?
+    normalized_tables: Result<List<NormalizedTable>>?
+    thumbnails: Result<List<Thumbnail>>?
+    archive_contents: Result<ArchiveContents>?
+}
+```
+
+### ProcessingOptions
+
+Configuration for the processing pipeline.
+
+```
+ProcessingOptions {
+    claimed_extension: String?
+    extract_metadata: Bool
+    extract_regions: Bool
+    extract_text: Bool
+    normalize_tables: Bool
+    generate_thumbnails: Bool
+    unpack_archive: Bool
+    thumbnail_config: ThumbnailConfig?
+    max_archive_depth: Int
+}
+```
diff --git a/docs/PIPELINE.md b/docs/PIPELINE.md
new file mode 100644
index 0000000..e9c9da3
--- /dev/null
+++ b/docs/PIPELINE.md
@@ -0,0 +1,635 @@
+# Processing Pipeline
+
+This document describes the file processing pipeline implemented by the runtime crates.
+
+## Pipeline Overview
+
+```
+                    ┌─────────────────┐
+                    │   Input File    │
+                    └────────┬────────┘
+                             │
+                             ▼
+                    ┌─────────────────┐
+                    │ Format Detection│
+                    │  & Validation   │
+                    └────────┬────────┘
+                             │
+              ┌──────────────┼──────────────┐
+              │              │              │
+              ▼              ▼              ▼
+       ┌────────────┐ ┌────────────┐ ┌────────────┐
+       │  Metadata  │ │   Region   │ │  Archive   │
+       │ Extraction │ │ Extraction │ │ Unpacking  │
+       └────────────┘ └─────┬──────┘ └────────────┘
+                            │
+               ┌────────────┼────────────┐
+               │            │            │
+               ▼            ▼            ▼
+        ┌────────────┐ ┌────────────┐ ┌────────────┐
+        │    Text    │ │   Table    │ │ Thumbnail  │
+        │ Extraction │ │Normalizatn │ │ Generation │
+        └────────────┘ └────────────┘ └────────────┘
+```
+
+---
+
+## 1. Format Detection & Validation
+
+Validates file integrity and identifies the actual format regardless of file extension.
+
+### Algorithm
+
+```
+function detect_format(file_bytes, claimed_extension):
+    # Read magic bytes (first 16-32 bytes)
+    magic = file_bytes[0:32]
+
+    # Match against known signatures
+    detected_format = match magic:
+        [0x25, 0x50, 0x44, 0x46] -> PDF
+        [0x50, 0x4B, 0x03, 0x04] -> ZIP_BASED   # Could be DOCX, XLSX, ODT, etc.
+        [0x89, 0x50, 0x4E, 0x47] -> PNG
+        [0xFF, 0xD8, 0xFF] -> JPEG
+        [0x52, 0x49, 0x46, 0x46] -> RIFF_BASED  # Could be WEBP, AVI, etc.
+ ... + _ -> UNKNOWN + + # For container formats, inspect contents + if detected_format == ZIP_BASED: + detected_format = inspect_zip_contents(file_bytes) + + # Validate extension matches content + is_extension_valid = matches(detected_format, claimed_extension) + + # Attempt to parse to verify integrity + integrity_check = try_parse(file_bytes, detected_format) + + return FormatResult { + format: detected_format, + mime_type: get_mime_type(detected_format), + extension_matches: is_extension_valid, + is_valid: integrity_check.success, + errors: integrity_check.errors + } +``` + +### ZIP-Based Format Detection + +``` +function inspect_zip_contents(zip_bytes): + entries = list_zip_entries(zip_bytes) + + if contains(entries, "[Content_Types].xml"): + if contains(entries, "word/document.xml"): + return DOCX + if contains(entries, "xl/workbook.xml"): + return XLSX + if contains(entries, "ppt/presentation.xml"): + return PPTX + + if contains(entries, "mimetype"): + mimetype = read_entry(zip_bytes, "mimetype") + if mimetype == "application/vnd.oasis.opendocument.text": + return ODT + if mimetype == "application/vnd.oasis.opendocument.spreadsheet": + return ODS + + return ZIP +``` + +--- + +## 2. Metadata Extraction + +Extracts document properties and embedded metadata. + +### Algorithm + +``` +function extract_metadata(file_bytes, format): + metadata = Metadata {} + + match format: + PDF: + info_dict = parse_pdf_info_dictionary(file_bytes) + metadata.title = info_dict["Title"] + metadata.author = info_dict["Author"] + metadata.created = parse_pdf_date(info_dict["CreationDate"]) + metadata.modified = parse_pdf_date(info_dict["ModDate"]) + metadata.page_count = count_pdf_pages(file_bytes) + + DOCX: + core_xml = extract_zip_entry(file_bytes, "docProps/core.xml") + app_xml = extract_zip_entry(file_bytes, "docProps/app.xml") + metadata.title = xpath(core_xml, "//dc:title") + metadata.author = xpath(core_xml, "//dc:creator") + metadata.created = xpath(core_xml, "//dcterms:created") + metadata.page_count = xpath(app_xml, "//Pages") + metadata.word_count = xpath(app_xml, "//Words") + + IMAGE: + exif = parse_exif(file_bytes) + metadata.dimensions = get_image_dimensions(file_bytes) + metadata.created = exif["DateTimeOriginal"] + metadata.camera = exif["Make"] + " " + exif["Model"] + metadata.gps = extract_gps_coordinates(exif) + + ARCHIVE: + entries = list_archive_entries(file_bytes) + metadata.entry_count = length(entries) + metadata.total_uncompressed_size = sum(entry.size for entry in entries) + + return metadata +``` + +--- + +## 3. Region Extraction + +Parses documents into semantic regions with positions. 
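+
+The pseudocode below leans on a `normalize_bounds` helper that converts
+absolute page coordinates into the 0.0-1.0 space defined in DATATYPES.md.
+A minimal Rust sketch (names and signature are illustrative):
+
+```rust
+struct BoundingBox {
+    x: f64,
+    y: f64,
+    width: f64,
+    height: f64,
+}
+
+/// `bbox` is (x, y, width, height) in absolute units with a top-left origin;
+/// `page` is (page_width, page_height) in the same units.
+fn normalize_bounds(bbox: (f64, f64, f64, f64), page: (f64, f64)) -> BoundingBox {
+    let (x, y, w, h) = bbox;
+    let (pw, ph) = page;
+    BoundingBox {
+        x: x / pw,
+        y: y / ph,
+        width: w / pw,
+        height: h / ph,
+    }
+}
+```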
+ +### Algorithm + +``` +function extract_regions(file_bytes, format): + regions = [] + + match format: + PDF: + for page_num, page in enumerate_pages(file_bytes): + page_regions = extract_pdf_page_regions(page, page_num) + regions.extend(page_regions) + + DOCX: + document_xml = extract_zip_entry(file_bytes, "word/document.xml") + body = parse_xml(document_xml).body + regions = extract_docx_regions(body) + + XLSX: + workbook = parse_xlsx(file_bytes) + for sheet in workbook.sheets: + table_region = create_table_region(sheet) + regions.append(table_region) + + # Build hierarchy + regions = build_region_tree(regions) + + return regions +``` + +### PDF Region Extraction + +``` +function extract_pdf_page_regions(page, page_num): + regions = [] + content_stream = page.content_stream + + # Parse text blocks with positions + text_blocks = extract_text_blocks(content_stream) + for block in text_blocks: + region = Region { + id: generate_uuid(), + page: page_num, + bounds: normalize_bounds(block.bbox, page.dimensions), + text: block.text, + kind: classify_text_block(block) # Heading, Text, etc. + } + regions.append(region) + + # Detect tables using layout analysis + tables = detect_tables_from_layout(text_blocks) + for table in tables: + table_region = Region { + id: generate_uuid(), + page: page_num, + bounds: table.bounds, + kind: TABLE, + children: [] + } + + for row in table.rows: + row_region = create_row_region(row, table_region.id) + table_region.children.append(row_region.id) + regions.append(row_region) + + regions.append(table_region) + + # Extract images + images = extract_images(content_stream) + for image in images: + regions.append(Region { + id: generate_uuid(), + page: page_num, + bounds: normalize_bounds(image.bbox, page.dimensions), + kind: IMAGE + }) + + return regions +``` + +### DOCX Region Extraction + +``` +function extract_docx_regions(body): + regions = [] + position = 0 + + for element in body.children: + match element.tag: + "w:p": # Paragraph + style = get_paragraph_style(element) + kind = match style: + "Heading1", "Heading2", ... -> HEADING + "ListParagraph" -> LIST_ITEM + _ -> TEXT + + region = Region { + id: generate_uuid(), + bounds: estimate_bounds(position), + text: extract_paragraph_text(element), + kind: kind + } + regions.append(region) + position += 1 + + "w:tbl": # Table + table_region = extract_docx_table(element, position) + regions.append(table_region) + regions.extend(table_region.all_descendants()) + position += 1 + + return regions +``` + +--- + +## 4. Table Normalization + +Converts tables from various formats into a consistent structure. 
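+
+The heart of the normalization below is the occupancy grid that resolves
+merged cells. A Rust sketch of just that step (types and names are
+illustrative, not the crate's API):
+
+```rust
+/// One grid slot: which source cell covers it, and whether this slot is the
+/// cell's top-left origin (non-origin slots become merged continuations).
+#[derive(Clone, Copy)]
+struct Slot {
+    source: usize,
+    is_origin: bool,
+}
+
+/// `cells` holds `(row, col, row_span, col_span)` tuples, indexed by position.
+fn build_grid(cells: &[(usize, usize, usize, usize)], rows: usize, cols: usize) -> Vec<Vec<Option<Slot>>> {
+    let mut grid = vec![vec![None; cols]; rows];
+    for (i, &(r0, c0, rs, cs)) in cells.iter().enumerate() {
+        for r in r0..r0 + rs {
+            for c in c0..c0 + cs {
+                grid[r][c] = Some(Slot { source: i, is_origin: r == r0 && c == c0 });
+            }
+        }
+    }
+    grid
+}
+```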
+ +### Algorithm + +``` +function normalize_table(table_region, source_format): + raw_cells = extract_raw_cells(table_region, source_format) + + # Step 1: Determine grid dimensions + max_row = max(cell.row for cell in raw_cells) + max_col = max(cell.col + cell.col_span - 1 for cell in raw_cells) + + # Step 2: Build occupancy grid for merged cells + grid = Grid(max_row + 1, max_col + 1) + for cell in raw_cells: + for r in range(cell.row, cell.row + cell.row_span): + for c in range(cell.col, cell.col + cell.col_span): + grid[r][c] = CellRef { + source_cell: cell, + is_origin: (r == cell.row and c == cell.col) + } + + # Step 3: Detect header rows + header_row_count = detect_header_rows(raw_cells, source_format) + + # Step 4: Infer column types + column_types = [] + for col in range(max_col + 1): + col_values = [grid[r][col].source_cell.text for r in range(header_row_count, max_row + 1)] + column_types.append(infer_data_type(col_values)) + + # Step 5: Build normalized structure + normalized = NormalizedTable { + id: table_region.id, + column_count: max_col + 1, + has_header: header_row_count > 0, + header_row_count: header_row_count, + rows: [] + } + + for r in range(max_row + 1): + row = NormalizedRow { + is_header: r < header_row_count, + cells: [] + } + for c in range(max_col + 1): + cell_ref = grid[r][c] + row.cells.append(NormalizedCell { + text: cell_ref.source_cell.text if cell_ref.is_origin else "", + col_span: cell_ref.source_cell.col_span if cell_ref.is_origin else 1, + row_span: cell_ref.source_cell.row_span if cell_ref.is_origin else 1, + data_type: column_types[c], + is_merged_continuation: not cell_ref.is_origin + }) + normalized.rows.append(row) + + return normalized +``` + +### Header Detection + +``` +function detect_header_rows(cells, format): + # Format-specific hints + match format: + XLSX: + # Check for explicit header style + if has_header_style(cells[0]): + return 1 + DOCX: + # Check for tblHeader property + if has_table_header_property(cells): + return count_header_rows() + + # Heuristic detection + first_row_cells = [c for c in cells if c.row == 0] + + # Check if first row is bold + if all(cell.is_bold for cell in first_row_cells): + return 1 + + # Check if first row has different background + if has_distinct_background(first_row_cells, cells): + return 1 + + # Check if first row contains no numeric data + if all(not is_numeric(cell.text) for cell in first_row_cells): + data_rows = [c for c in cells if c.row > 0] + if any(is_numeric(cell.text) for cell in data_rows): + return 1 + + return 0 +``` + +--- + +## 5. Text Extraction + +Extracts native text content without OCR. 
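+
+The reading-order sort described further below reduces to: group text objects
+by baseline, order the groups top-to-bottom, then order each line
+left-to-right. A Rust sketch (the `(x, y, text)` tuples are illustrative):
+
+```rust
+fn sort_by_reading_order(mut objs: Vec<(f64, f64, String)>, tolerance: f64) -> Vec<(f64, f64, String)> {
+    // Sort by vertical position first so line grouping is a single pass.
+    objs.sort_by(|a, b| a.1.total_cmp(&b.1));
+
+    let mut lines: Vec<Vec<(f64, f64, String)>> = Vec::new();
+    for obj in objs {
+        let same_line = lines
+            .last()
+            .is_some_and(|line| (obj.1 - line[0].1).abs() < tolerance);
+        if same_line {
+            lines.last_mut().unwrap().push(obj);
+        } else {
+            lines.push(vec![obj]);
+        }
+    }
+
+    // Within each visual line, order left-to-right, then flatten.
+    for line in &mut lines {
+        line.sort_by(|a, b| a.0.total_cmp(&b.0));
+    }
+    lines.into_iter().flatten().collect()
+}
+```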
+ +### Algorithm + +``` +function extract_text(file_bytes, format, regions): + match format: + PDF: + return extract_pdf_text(file_bytes, regions) + DOCX: + return extract_docx_text(file_bytes) + XLSX: + return extract_xlsx_text(file_bytes) + TEXT, MARKDOWN: + return decode_text(file_bytes) + HTML: + return strip_html_tags(decode_text(file_bytes)) + _: + return null # Requires OCR +``` + +### PDF Text Extraction + +``` +function extract_pdf_text(file_bytes, regions): + result = ExtractedText { + raw: "", + by_page: {}, + by_region: {} + } + + for page_num, page in enumerate_pages(file_bytes): + page_text = "" + + # Extract text following reading order + text_objects = extract_text_objects(page.content_stream) + text_objects = sort_by_reading_order(text_objects) + + for obj in text_objects: + page_text += obj.text + " " + + # Map to region if available + matching_region = find_region_containing(obj.position, regions) + if matching_region: + result.by_region[matching_region.id] += obj.text + " " + + result.by_page[page_num] = page_text.trim() + result.raw += page_text + + # Check if text extraction yielded results + if is_mostly_empty(result.raw): + result.needs_ocr = true + + return result +``` + +### Reading Order Detection + +``` +function sort_by_reading_order(text_objects): + # Group by approximate Y position (same line) + lines = group_by_y_position(text_objects, tolerance=5) + + # Sort lines top to bottom + lines = sort_by_y(lines) + + # Within each line, sort left to right + result = [] + for line in lines: + line = sort_by_x(line) + result.extend(line) + + return result +``` + +--- + +## 6. Archive Unpacking + +Extracts and processes files within archives. + +### Algorithm + +``` +function unpack_archive(file_bytes, format, max_depth=3): + if max_depth <= 0: + return ArchiveResult { error: "Max nesting depth exceeded" } + + entries = [] + + match format: + ZIP, DOCX, XLSX, ODT: + entries = list_zip_entries(file_bytes) + TAR: + entries = list_tar_entries(file_bytes) + GZIP: + decompressed = gunzip(file_bytes) + # Check if it's a tar inside + if starts_with_tar_magic(decompressed): + return unpack_archive(decompressed, TAR, max_depth) + return SingleFileResult { data: decompressed } + SEVENZ: + entries = list_7z_entries(file_bytes) + + result = ArchiveContents { + entries: [], + total_size: 0, + compressed_size: length(file_bytes) + } + + for entry in entries: + entry_info = ArchiveEntry { + path: entry.path, + size: entry.uncompressed_size, + is_directory: entry.is_directory, + content_kind: detect_content_kind(entry.path) + } + + # Check for nested archives + if is_archive_format(entry_info.content_kind): + entry_bytes = extract_entry(file_bytes, entry.path) + entry_info.nested = unpack_archive(entry_bytes, entry_info.content_kind, max_depth - 1) + + result.entries.append(entry_info) + result.total_size += entry.uncompressed_size + + return result +``` + +--- + +## 7. Thumbnail Generation + +Generates preview images for display. 
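+
+All of the renderers below share one resize rule: scale down to fit within
+`max_width` x `max_height` while preserving aspect ratio, and never scale up.
+A Rust sketch of that rule (illustrative, not the crate's API):
+
+```rust
+fn fit_within(width: u32, height: u32, max_w: u32, max_h: u32) -> (u32, u32) {
+    // Pick the tighter of the two constraints; cap at 1.0 so small images
+    // are never upscaled.
+    let scale = (max_w as f64 / width as f64)
+        .min(max_h as f64 / height as f64)
+        .min(1.0);
+    (
+        ((width as f64) * scale).round() as u32,
+        ((height as f64) * scale).round() as u32,
+    )
+}
+```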
+ +### Algorithm + +``` +function generate_thumbnails(file_bytes, format, config): + thumbnails = [] + + match format: + PDF: + thumbnails = generate_pdf_thumbnails(file_bytes, config) + DOCX: + # Convert to PDF first, then render + pdf_bytes = convert_docx_to_pdf(file_bytes) + thumbnails = generate_pdf_thumbnails(pdf_bytes, config) + XLSX: + thumbnails = generate_spreadsheet_thumbnail(file_bytes, config) + IMAGE: + thumbnails = generate_image_thumbnails(file_bytes, config) + TEXT, MARKDOWN, CODE: + thumbnails = generate_text_thumbnail(file_bytes, format, config) + ARCHIVE: + thumbnails = [generate_archive_icon()] + + return thumbnails +``` + +### PDF Thumbnail Generation + +``` +function generate_pdf_thumbnails(pdf_bytes, config): + thumbnails = [] + + pages_to_render = match config.pages: + FIRST -> [0] + RANGE(start, end) -> range(start, end) + ALL -> range(0, count_pages(pdf_bytes)) + + for page_num in pages_to_render: + # Render page to image at appropriate DPI + target_width = config.max_width + page_dims = get_page_dimensions(pdf_bytes, page_num) + dpi = calculate_dpi_for_width(page_dims, target_width) + + image = render_pdf_page(pdf_bytes, page_num, dpi) + + # Resize if needed + if image.width > config.max_width or image.height > config.max_height: + image = resize_preserving_aspect(image, config.max_width, config.max_height) + + # Encode to output format + encoded = encode_image(image, config.format, config.quality) + + thumbnails.append(Thumbnail { + data: encoded, + width: image.width, + height: image.height, + format: config.format, + page: page_num + }) + + return thumbnails +``` + +### Text/Code Thumbnail Generation + +``` +function generate_text_thumbnail(file_bytes, format, config): + text = decode_text(file_bytes) + + # Limit to visible portion + lines = split_lines(text)[:50] + + # Apply syntax highlighting if code + if format == CODE: + language = detect_language(text) + highlighted = apply_syntax_highlighting(lines, language) + else: + highlighted = lines + + # Render to image + image = render_text_to_image(highlighted, { + font: "monospace", + font_size: 12, + padding: 16, + max_width: config.max_width, + background: "#ffffff" + }) + + encoded = encode_image(image, config.format, config.quality) + + return [Thumbnail { + data: encoded, + width: image.width, + height: image.height, + format: config.format + }] +``` + +--- + +## Error Handling + +Each pipeline stage operates independently. Failures are captured but don't block other stages. 
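+
+In Rust terms, the "capture, don't abort" rule in the pseudocode below means
+each optional stage stores its own `Result` rather than propagating failures
+with `?`. A sketch (names are illustrative):
+
+```rust
+fn run_stage<T>(
+    enabled: bool,
+    stage: impl FnOnce() -> Result<T, String>,
+) -> Option<Result<T, String>> {
+    // `None` when the stage is disabled; `Some(Err(..))` when it ran and
+    // failed, without aborting the rest of the pipeline.
+    enabled.then(stage)
+}
+```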
+ +``` +function process_file(file_bytes, options): + result = ProcessingResult {} + + # Format detection is required + result.format = detect_format(file_bytes, options.claimed_extension) + if not result.format.is_valid: + return result # Cannot proceed with invalid file + + # Run remaining stages in parallel where possible + if options.extract_metadata: + result.metadata = try { extract_metadata(file_bytes, result.format) } + + if options.extract_regions: + result.regions = try { extract_regions(file_bytes, result.format) } + + if options.extract_text: + result.text = try { extract_text(file_bytes, result.format, result.regions) } + + if options.normalize_tables and result.regions: + tables = filter(result.regions, r -> r.kind == TABLE) + result.normalized_tables = try { normalize_tables(tables, result.format) } + + if options.generate_thumbnails: + result.thumbnails = try { generate_thumbnails(file_bytes, result.format, options.thumbnail_config) } + + if options.unpack_archive and is_archive(result.format): + result.archive_contents = try { unpack_archive(file_bytes, result.format) } + + return result +``` diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..5f5b8dd --- /dev/null +++ b/docs/README.md @@ -0,0 +1,31 @@ +# Runtime + +The runtime crates provide file processing capabilities for the collaboration platform. These crates handle local file operations before content is passed to external services (OCR, embeddings). + +## Overview + +When a user uploads a file, the runtime processes it through a pipeline that: + +1. Validates format and integrity +2. Extracts metadata +3. Identifies regions (paragraphs, tables, images, etc.) +4. Normalizes tabular data +5. Extracts native text +6. Unpacks archives +7. Generates thumbnails + +## Documentation + +- [Pipeline](./PIPELINE.md) — Processing stages and algorithms +- [Data Types](./DATATYPES.md) — Core data structures + +## Crate Structure + +| Crate | Responsibility | +|-------|----------------| +| `nvisy-core` | ContentData, ContentSource, errors | +| `nvisy-archive` | ZIP, TAR, 7Z handling | +| `nvisy-document` | PDF, DOCX parsing and region extraction | +| `nvisy-spreadsheet` | XLSX, CSV parsing | +| `nvisy-metadata` | Unified metadata extraction | +| `nvisy-thumbnail` | Preview image generation | From 43e655f04c2d90349aa738c6e886ac91796098b8 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Fri, 16 Jan 2026 07:18:41 +0100 Subject: [PATCH 2/5] feat: add nvisy-image crate, dynamic registry, and reformat Cargo.toml files - Add nvisy-image stub crate with ImageDocument and ImageFormat - Add dynamic FormatRegistry with type erasure for runtime format selection - Split nvisy-text structured formats into separate modules (xml, yaml, toml, ini) - Use csv 1.4 crate for CSV parsing - Use markdown 1.0 crate for Markdown parsing - Remove write operations from nvisy-document (operation module) - Reformat all Cargo.toml files with grouped dependencies and comments - Standardize package section format across all crates --- Cargo.lock | 58 ++ Cargo.toml | 35 +- crates/nvisy-archive/Cargo.toml | 42 +- crates/nvisy-core/Cargo.toml | 26 +- crates/nvisy-document/Cargo.toml | 10 +- crates/nvisy-document/src/error.rs | 56 +- .../nvisy-document/src/format/capabilities.rs | 482 +++------------- crates/nvisy-document/src/format/mod.rs | 36 +- crates/nvisy-document/src/format/page.rs | 2 +- .../nvisy-document/src/format/region/core.rs | 77 +-- .../nvisy-document/src/format/region/kind.rs | 34 +- 
.../nvisy-document/src/format/region/mod.rs | 10 +- .../src/format/region/status.rs | 60 +- crates/nvisy-document/src/lib.rs | 29 +- crates/nvisy-document/src/operation/insert.rs | 160 ------ crates/nvisy-document/src/operation/mod.rs | 542 ------------------ crates/nvisy-document/src/operation/redact.rs | 103 ---- crates/nvisy-document/src/operation/result.rs | 136 ----- crates/nvisy-document/src/operation/split.rs | 105 ---- crates/nvisy-docx/Cargo.toml | 6 +- crates/nvisy-docx/src/document.rs | 35 +- crates/nvisy-docx/src/format.rs | 28 +- crates/nvisy-engine/Cargo.toml | 11 +- crates/nvisy-engine/src/engine/mod.rs | 297 +++++----- crates/nvisy-engine/src/lib.rs | 16 +- crates/nvisy-engine/src/registry/mod.rs | 375 ++++++++++++ crates/nvisy-engine/src/session/history.rs | 213 ++----- crates/nvisy-engine/src/session/mod.rs | 238 ++------ crates/nvisy-image/Cargo.toml | 30 + crates/nvisy-image/README.md | 13 + crates/nvisy-image/src/document.rs | 56 ++ crates/nvisy-image/src/format.rs | 82 +++ crates/nvisy-image/src/lib.rs | 9 + crates/nvisy-pdf/Cargo.toml | 6 +- crates/nvisy-pdf/src/document.rs | 33 +- crates/nvisy-pdf/src/format.rs | 20 +- crates/nvisy-text/Cargo.toml | 11 +- crates/nvisy-text/README.md | 100 +++- crates/nvisy-text/src/document.rs | 79 --- crates/nvisy-text/src/documents/csv.rs | 355 ++++++++++++ crates/nvisy-text/src/documents/ini.rs | 229 ++++++++ crates/nvisy-text/src/documents/json.rs | 261 +++++++++ crates/nvisy-text/src/documents/markdown.rs | 343 +++++++++++ crates/nvisy-text/src/documents/mod.rs | 19 + crates/nvisy-text/src/documents/plain.rs | 207 +++++++ crates/nvisy-text/src/documents/toml.rs | 210 +++++++ crates/nvisy-text/src/documents/xml.rs | 174 ++++++ crates/nvisy-text/src/documents/yaml.rs | 189 ++++++ crates/nvisy-text/src/format.rs | 70 --- crates/nvisy-text/src/formats/csv.rs | 114 ++++ crates/nvisy-text/src/formats/ini.rs | 98 ++++ crates/nvisy-text/src/formats/json.rs | 98 ++++ crates/nvisy-text/src/formats/markdown.rs | 99 ++++ crates/nvisy-text/src/formats/mod.rs | 19 + crates/nvisy-text/src/formats/plain.rs | 74 +++ crates/nvisy-text/src/formats/toml.rs | 97 ++++ crates/nvisy-text/src/formats/xml.rs | 99 ++++ crates/nvisy-text/src/formats/yaml.rs | 98 ++++ crates/nvisy-text/src/lib.rs | 26 +- 59 files changed, 4074 insertions(+), 2466 deletions(-) delete mode 100644 crates/nvisy-document/src/operation/insert.rs delete mode 100644 crates/nvisy-document/src/operation/mod.rs delete mode 100644 crates/nvisy-document/src/operation/redact.rs delete mode 100644 crates/nvisy-document/src/operation/result.rs delete mode 100644 crates/nvisy-document/src/operation/split.rs create mode 100644 crates/nvisy-engine/src/registry/mod.rs create mode 100644 crates/nvisy-image/Cargo.toml create mode 100644 crates/nvisy-image/README.md create mode 100644 crates/nvisy-image/src/document.rs create mode 100644 crates/nvisy-image/src/format.rs create mode 100644 crates/nvisy-image/src/lib.rs delete mode 100644 crates/nvisy-text/src/document.rs create mode 100644 crates/nvisy-text/src/documents/csv.rs create mode 100644 crates/nvisy-text/src/documents/ini.rs create mode 100644 crates/nvisy-text/src/documents/json.rs create mode 100644 crates/nvisy-text/src/documents/markdown.rs create mode 100644 crates/nvisy-text/src/documents/mod.rs create mode 100644 crates/nvisy-text/src/documents/plain.rs create mode 100644 crates/nvisy-text/src/documents/toml.rs create mode 100644 crates/nvisy-text/src/documents/xml.rs create mode 100644 crates/nvisy-text/src/documents/yaml.rs 
delete mode 100644 crates/nvisy-text/src/format.rs create mode 100644 crates/nvisy-text/src/formats/csv.rs create mode 100644 crates/nvisy-text/src/formats/ini.rs create mode 100644 crates/nvisy-text/src/formats/json.rs create mode 100644 crates/nvisy-text/src/formats/markdown.rs create mode 100644 crates/nvisy-text/src/formats/mod.rs create mode 100644 crates/nvisy-text/src/formats/plain.rs create mode 100644 crates/nvisy-text/src/formats/toml.rs create mode 100644 crates/nvisy-text/src/formats/xml.rs create mode 100644 crates/nvisy-text/src/formats/yaml.rs diff --git a/Cargo.lock b/Cargo.lock index 0fc610e..ccd2d32 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -206,6 +206,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = "deflate64" version = "0.1.10" @@ -551,6 +572,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "markdown" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5cab8f2cadc416a82d2e783a1946388b31654d391d1c7d92cc1f03e295b1deb" +dependencies = [ + "unicode-id", +] + [[package]] name = "matchers" version = "0.2.0" @@ -687,9 +717,20 @@ dependencies = [ "nvisy-text", "serde", "serde_json", + "tokio", "uuid", ] +[[package]] +name = "nvisy-image" +version = "0.1.0" +dependencies = [ + "async-trait", + "bytes", + "nvisy-document", + "thiserror", +] + [[package]] name = "nvisy-pdf" version = "0.1.0" @@ -706,8 +747,13 @@ version = "0.1.0" dependencies = [ "async-trait", "bytes", + "csv", + "markdown", "nvisy-document", + "serde_json", "thiserror", + "tokio", + "tokio-test", ] [[package]] @@ -843,6 +889,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "ryu" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" + [[package]] name = "scoped-tls" version = "1.0.1" @@ -1183,6 +1235,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +[[package]] +name = "unicode-id" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ba288e709927c043cbe476718d37be306be53fb1fafecd0dbe36d072be2580" + [[package]] name = "unicode-ident" version = "1.0.19" diff --git a/Cargo.toml b/Cargo.toml index c81f9cc..60a8c09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "./crates/nvisy-docx", "./crates/nvisy-document", "./crates/nvisy-engine", + "./crates/nvisy-image", "./crates/nvisy-pdf", "./crates/nvisy-text", ] @@ -24,57 +25,53 @@ repository = "https://github.com/nvisycom/core" homepage = "https://github.com/nvisycom/core" documentation = "https://docs.rs/nvisy" -[workspace.dependencies] # Default features are disabled for certain dependencies to allow # downstream workspaces/crates to selectively enable them as needed. 
# # See for more details: https://github.com/rust-lang/cargo/issues/11329 +[workspace.dependencies] # Internal crates nvisy-archive = { path = "./crates/nvisy-archive", version = "0.1.0", features = [] } nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0", features = [] } nvisy-docx = { path = "./crates/nvisy-docx", version = "0.1.0", features = [] } nvisy-document = { path = "./crates/nvisy-document", version = "0.1.0", features = [] } nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0", features = [] } +nvisy-image = { path = "./crates/nvisy-image", version = "0.1.0", features = [] } nvisy-pdf = { path = "./crates/nvisy-pdf", version = "0.1.0", features = [] } nvisy-text = { path = "./crates/nvisy-text", version = "0.1.0", features = [] } -# Multithreading -rayon = { version = "1.11", default-features = false, features = [] } - -# Async I/O and file handling +# Async runtime and I/O tokio = { version = "1.49", default-features = false, features = [] } tokio-stream = { version = "0.1", default-features = false, features = [] } tokio-util = { version = "0.7", default-features = false, features = [] } futures = { version = "0.3", default-features = false, features = [] } async-trait = { version = "0.1", default-features = false, features = [] } + +# File system utilities walkdir = { version = "2.5", default-features = false, features = [] } memmap2 = { version = "0.9", default-features = false, features = [] } tempfile = { version = "3.24", default-features = false, features = [] } -# Tracing and observability -tracing = { version = "0.1", features = [] } -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } - -# Error handling -thiserror = { version = "2.0", features = [] } -anyhow = { version = "1.0", features = ["backtrace"] } +# Multithreading +rayon = { version = "1.11", default-features = false, features = [] } # Serialization serde = { version = "1.0", features = [] } serde_json = { version = "1.0", features = [] } +csv = { version = "1.4", default-features = false, features = [] } # Data types and utilities uuid = { version = "1.19", features = [] } jiff = { version = "0.2", default-features = false, features = [] } size = { version = "0.5", default-features = false, features = [] } bytes = { version = "1.11", default-features = false, features = [] } - rust_decimal = { version = "1.36", default-features = false, features = [] } semver = { version = "1.0", default-features = false, features = [] } isolang = { version = "2.4", default-features = false, features = ["english_names"] } # Text processing and pattern matching +markdown = { version = "1.0.0", default-features = false, features = [] } regex = { version = "1.11", default-features = false, features = [] } regex-lite = { version = "0.1", default-features = false, features = ["std"] } fancy-regex = { version = "0.16", default-features = false, features = [] } @@ -82,7 +79,7 @@ aho-corasick = { version = "1.1", default-features = false, features = [] } unicode-segmentation = { version = "1.10", default-features = false, features = [] } hipstr = { version = "0.8", default-features = false, features = [] } -# Crypto and hashing +# Cryptography and hashing sha2 = { version = "0.10", default-features = false, features = [] } blake3 = { version = "1.8", default-features = false, features = [] } base64 = { version = "0.22", default-features = false, features = [] } @@ -90,7 +87,15 @@ hex = { version = "0.4", features = [] } zeroize = { version = "1.7", default-features = false, features 
= [] } rand = { version = "0.9", default-features = false, features = [] } -# Macros +# Error handling +thiserror = { version = "2.0", features = [] } +anyhow = { version = "1.0", features = ["backtrace"] } + +# Tracing and observability +tracing = { version = "0.1", features = [] } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } + +# Macros and derive utilities derive_more = { version = "2.0", default-features = false, features = [] } strum = { version = "0.27", default-features = false, features = [] } const_format = { version = "0.2", default-features = false, features = [] } diff --git a/crates/nvisy-archive/Cargo.toml b/crates/nvisy-archive/Cargo.toml index 706468b..2c49250 100644 --- a/crates/nvisy-archive/Cargo.toml +++ b/crates/nvisy-archive/Cargo.toml @@ -1,20 +1,24 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + [package] name = "nvisy-archive" +description = "Archive handling library for nvisy (ZIP, TAR, 7z, etc.)" +readme = "./README.md" + version = { workspace = true } rust-version = { workspace = true } edition = { workspace = true } license = { workspace = true } publish = { workspace = true } -readme = "./README.md" authors = { workspace = true } repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -description = "Archive handling library for Nvisy, supports ZIP, TAR, 7z, and other archive formats" -keywords = ["archive", "zip", "tar", "7z", "compression", "extraction"] -categories = ["compression", "filesystem"] +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] [features] default = ["zip", "tar", "gzip", "bzip2", "xz"] @@ -26,26 +30,28 @@ bzip2 = ["dep:bzip2"] xz = ["dep:xz2"] [dependencies] -# Core dependencies +# Internal crates nvisy-core = { workspace = true } -bytes = { workspace = true } -# Utilities -strum = { workspace = true, features = ["derive"] } +# Data types +bytes = { workspace = true } -# Async and I/O +# Async runtime and I/O tokio = { workspace = true, features = ["fs", "io-util", "rt"] } -tempfile = { workspace = true, features = [] } +tempfile = { workspace = true } + +# Macros +strum = { workspace = true, features = ["derive"] } -# Archive formats -tar = { version = "0.4", optional = true, features = [] } -zip = { version = "7.1", optional = true, features = [] } -sevenz-rust = { version = "0.6", optional = true, features = [] } +# Archive formats (optional) +tar = { version = "0.4", optional = true } +zip = { version = "7.1", optional = true } +sevenz-rust = { version = "0.6", optional = true } -# Compression formats (all optional) -flate2 = { version = "1.1", optional = true, features = [] } -bzip2 = { version = "0.6", optional = true, features = [] } -xz2 = { version = "0.1", optional = true, features = [] } +# Compression formats (optional) +flate2 = { version = "1.1", optional = true } +bzip2 = { version = "0.6", optional = true } +xz2 = { version = "0.1", optional = true } [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 390a46a..46029ed 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -2,12 +2,14 @@ [package] name = "nvisy-core" +description = "Core types and utilities for nvisy" +readme = "./README.md" + version = { workspace = true } rust-version = { workspace = true } edition = { workspace = true } license = { workspace = true } publish = { workspace = true } -readme 
= "./README.md" authors = { workspace = true } repository = { workspace = true } @@ -22,26 +24,26 @@ rustdoc-args = ["--cfg", "docsrs"] # Async runtime and I/O tokio = { workspace = true, features = ["fs", "io-util", "rt", "macros"] } -# Data structures and utilities +# Data types uuid = { workspace = true, features = ["v4", "v7", "serde"] } jiff = { workspace = true, features = ["std", "serde"] } bytes = { workspace = true, features = ["serde"] } -# Cryptography -sha2 = { workspace = true, features = [] } -hex = { workspace = true, features = [] } - -# (De)serialization +# Serialization serde = { workspace = true, features = ["derive"] } -# Utilities -strum = { workspace = true, features = ["derive"] } -derive_more = { workspace = true, features = ["as_ref", "deref"] } +# Cryptography +sha2 = { workspace = true } +hex = { workspace = true } -# Error handling (moved from nvisy-error crate) +# Error handling thiserror = { workspace = true, features = ["std"] } hipstr = { workspace = true, features = ["std", "serde"] } +# Macros +strum = { workspace = true, features = ["derive"] } +derive_more = { workspace = true, features = ["as_ref", "deref"] } + [dev-dependencies] serde_json = { workspace = true, features = ["std"] } -tempfile = { workspace = true, features = [] } +tempfile = { workspace = true } diff --git a/crates/nvisy-document/Cargo.toml b/crates/nvisy-document/Cargo.toml index 3d6ec4b..4351dea 100644 --- a/crates/nvisy-document/Cargo.toml +++ b/crates/nvisy-document/Cargo.toml @@ -2,12 +2,14 @@ [package] name = "nvisy-document" +description = "Document abstraction layer for nvisy" +readme = "./README.md" + version = { workspace = true } rust-version = { workspace = true } edition = { workspace = true } license = { workspace = true } publish = { workspace = true } -readme = "./README.md" authors = { workspace = true } repository = { workspace = true } @@ -19,10 +21,10 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [dependencies] -# Core nvisy types +# Internal crates nvisy-core = { workspace = true } -# Async runtime +# Async runtime and I/O tokio = { workspace = true, features = ["sync", "io-util"] } async-trait = { workspace = true } @@ -39,7 +41,7 @@ base64 = { workspace = true, features = ["std"] } # Error handling thiserror = { workspace = true, features = ["std"] } -# Utilities +# Macros derive_more = { workspace = true, features = ["display", "from", "into", "deref", "deref_mut", "as_ref", "constructor"] } [dev-dependencies] diff --git a/crates/nvisy-document/src/error.rs b/crates/nvisy-document/src/error.rs index b6b5788..d5a2d77 100644 --- a/crates/nvisy-document/src/error.rs +++ b/crates/nvisy-document/src/error.rs @@ -1,4 +1,4 @@ -//! Error types for document operations. +//! Error types for document processing. use std::fmt; @@ -7,17 +7,17 @@ use crate::format::region::RegionId; /// A boxed error type for wrapping source errors. pub type BoxError = Box; -/// Result type for document operations. +/// Result type for document processing. pub type Result = std::result::Result; -/// The error type for document operations. +/// The error type for document processing. #[derive(Debug)] pub struct Error { kind: ErrorKind, source: Option, } -/// The kind of error that occurred during a document operation. +/// The kind of error that occurred during document processing. #[derive(Debug, Clone, PartialEq, Eq)] pub enum ErrorKind { /// The document format is not supported. @@ -26,25 +26,19 @@ pub enum ErrorKind { /// The document could not be parsed. 
Parse { message: String }, - /// The requested operation is not supported by this format. - OperationNotSupported { operation: String }, - /// A referenced region was not found. RegionNotFound { id: RegionId }, /// A referenced page was not found. PageNotFound { page: u32 }, - /// An operation would result in invalid document state. - InvalidOperation { message: String }, - /// An I/O error occurred. Io { message: String }, /// Serialization/deserialization error. Serialization { message: String }, - /// The operation was cancelled. + /// The processing was cancelled. Cancelled, /// A timeout occurred. @@ -53,13 +47,10 @@ pub enum ErrorKind { /// Resource limit exceeded. ResourceLimit { resource: String }, - /// Session error (e.g., invalid session state). - Session { message: String }, - /// Conversion error. Conversion { message: String }, - /// Metadata error. + /// Metadata extraction error. Metadata { message: String }, /// Thumbnail generation error. @@ -108,10 +99,7 @@ impl Error { pub fn is_user_error(&self) -> bool { matches!( self.kind, - ErrorKind::RegionNotFound { .. } - | ErrorKind::PageNotFound { .. } - | ErrorKind::InvalidOperation { .. } - | ErrorKind::OperationNotSupported { .. } + ErrorKind::RegionNotFound { .. } | ErrorKind::PageNotFound { .. } ) } @@ -144,13 +132,6 @@ impl Error { }) } - /// Creates an operation not supported error. - pub fn operation_not_supported(operation: impl Into) -> Self { - Self::new(ErrorKind::OperationNotSupported { - operation: operation.into(), - }) - } - /// Creates a region not found error. pub fn region_not_found(id: RegionId) -> Self { Self::new(ErrorKind::RegionNotFound { id }) @@ -161,13 +142,6 @@ impl Error { Self::new(ErrorKind::PageNotFound { page }) } - /// Creates an invalid operation error. - pub fn invalid_operation(message: impl Into) -> Self { - Self::new(ErrorKind::InvalidOperation { - message: message.into(), - }) - } - /// Creates an I/O error. pub fn io(message: impl Into) -> Self { Self::new(ErrorKind::Io { @@ -195,13 +169,6 @@ impl Error { }) } - /// Creates a session error. - pub fn session(message: impl Into) -> Self { - Self::new(ErrorKind::Session { - message: message.into(), - }) - } - /// Creates a timeout error. 
pub fn timeout(duration_ms: u64) -> Self { Self::new(ErrorKind::Timeout { duration_ms }) @@ -292,22 +259,17 @@ impl fmt::Display for Error { match &self.kind { ErrorKind::UnsupportedFormat { format } => write!(f, "unsupported format: {format}"), ErrorKind::Parse { message } => write!(f, "parse error: {message}"), - ErrorKind::OperationNotSupported { operation } => { - write!(f, "operation not supported: {operation}") - } ErrorKind::RegionNotFound { id } => write!(f, "region not found: {id}"), ErrorKind::PageNotFound { page } => write!(f, "page not found: {page}"), - ErrorKind::InvalidOperation { message } => write!(f, "invalid operation: {message}"), ErrorKind::Io { message } => write!(f, "I/O error: {message}"), ErrorKind::Serialization { message } => write!(f, "serialization error: {message}"), - ErrorKind::Cancelled => write!(f, "operation cancelled"), + ErrorKind::Cancelled => write!(f, "processing cancelled"), ErrorKind::Timeout { duration_ms } => { - write!(f, "operation timed out after {duration_ms}ms") + write!(f, "processing timed out after {duration_ms}ms") } ErrorKind::ResourceLimit { resource } => { write!(f, "resource limit exceeded: {resource}") } - ErrorKind::Session { message } => write!(f, "session error: {message}"), ErrorKind::Conversion { message } => write!(f, "conversion error: {message}"), ErrorKind::Metadata { message } => write!(f, "metadata error: {message}"), ErrorKind::Thumbnail { message } => write!(f, "thumbnail error: {message}"), diff --git a/crates/nvisy-document/src/format/capabilities.rs b/crates/nvisy-document/src/format/capabilities.rs index a8983e9..3d8f248 100644 --- a/crates/nvisy-document/src/format/capabilities.rs +++ b/crates/nvisy-document/src/format/capabilities.rs @@ -1,80 +1,44 @@ //! Document format capabilities. //! -//! Different document formats support different operations. This module -//! defines a capability matrix that allows querying what operations -//! are supported by a given format. +//! Different document formats support different features. This module +//! defines a capability matrix for querying what a format supports. use serde::{Deserialize, Serialize}; -use crate::operation::{ - ContentOperation, DocumentOperation, EditOperation, InsertOperation, MetadataOperation, - PageOperation, RedactStyle, StructuralOperation, -}; - /// Describes the capabilities of a document format. #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct Capabilities { - /// Text editing capabilities. + /// Text extraction capabilities. pub text: TextCapabilities, - /// Image handling capabilities. - pub image: ImageCapabilities, - /// Structural capabilities. pub structure: StructureCapabilities, - /// Page-level capabilities. - pub page: PageCapabilities, - /// Metadata capabilities. pub metadata: MetadataCapabilities, } -/// Text editing capabilities. +/// Text extraction capabilities. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct TextCapabilities { - /// Can read/extract text content. - pub can_read: bool, - - /// Can replace text while preserving formatting. - pub can_replace_preserving_format: bool, - - /// Can replace text (may lose formatting). - pub can_replace: bool, - - /// Can insert new text. - pub can_insert: bool, - - /// Can delete text regions. - pub can_delete: bool, - - /// Supports rich text formatting. - pub supports_rich_text: bool, - - /// Supports font embedding. - pub supports_font_embedding: bool, -} - -/// Image handling capabilities. 
-#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ImageCapabilities { - /// Can extract images. + /// Can extract native text content. pub can_extract: bool, - /// Can replace images. - pub can_replace: bool, - - /// Can insert new images. - pub can_insert: bool, - - /// Can redact images with blur. - pub can_blur: bool, + /// Supports rich text (formatting, styles). + pub has_rich_text: bool, - /// Can redact images with pixelation. - pub can_pixelate: bool, + /// May require OCR for text extraction. + pub may_need_ocr: bool, +} - /// Supported image formats for insertion. - pub supported_formats: Vec, +impl Default for TextCapabilities { + fn default() -> Self { + Self { + can_extract: true, + has_rich_text: false, + may_need_ocr: false, + } + } } /// Structural capabilities. @@ -86,356 +50,102 @@ pub struct StructureCapabilities { /// Can detect tables. pub can_detect_tables: bool, - /// Can modify table structure. - pub can_modify_tables: bool, - - /// Can merge regions. - pub can_merge: bool, - - /// Can split regions. - pub can_split: bool, - - /// Can move regions. - pub can_move: bool, - - /// Can copy regions. - pub can_copy: bool, -} - -/// Page-level capabilities. -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct PageCapabilities { - /// Document has pages (vs. flowing text). + /// Has page-based layout. pub has_pages: bool, - - /// Can delete pages. - pub can_delete: bool, - - /// Can reorder pages. - pub can_reorder: bool, - - /// Can rotate pages. - pub can_rotate: bool, - - /// Can extract pages to new document. - pub can_extract: bool, - - /// Can split document at page boundaries. - pub can_split: bool, - - /// Can merge multiple documents. - pub can_merge: bool, } /// Metadata capabilities. #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct MetadataCapabilities { - /// Can read document metadata. - pub can_read: bool, - - /// Can modify document metadata. - pub can_modify: bool, - - /// Can add annotations/comments. - pub can_annotate: bool, + /// Can extract document metadata. + pub can_extract: bool, - /// Annotations are preserved in output. - pub annotations_preserved: bool, + /// Has embedded annotations/comments. + pub has_annotations: bool, } impl Capabilities { - /// Returns capabilities for a format that supports everything. + /// Returns capabilities for a simple text format. 
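+    ///
+    /// A short sketch of querying the matrix (mirrors the unit tests
+    /// below):
+    ///
+    /// ```
+    /// use nvisy_document::Capabilities;
+    ///
+    /// let caps = Capabilities::text();
+    /// assert!(caps.text.can_extract);
+    /// assert!(!caps.text.has_rich_text);
+    /// assert!(!caps.structure.can_detect_tables);
+    /// ```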
#[must_use] - pub fn full() -> Self { + pub fn text() -> Self { Self { text: TextCapabilities { - can_read: true, - can_replace_preserving_format: true, - can_replace: true, - can_insert: true, - can_delete: true, - supports_rich_text: true, - supports_font_embedding: true, - }, - image: ImageCapabilities { can_extract: true, - can_replace: true, - can_insert: true, - can_blur: true, - can_pixelate: true, - supported_formats: vec![ - "image/png".to_string(), - "image/jpeg".to_string(), - "image/gif".to_string(), - ], + has_rich_text: false, + may_need_ocr: false, }, structure: StructureCapabilities { can_detect_structure: true, - can_detect_tables: true, - can_modify_tables: true, - can_merge: true, - can_split: true, - can_move: true, - can_copy: true, - }, - page: PageCapabilities { - has_pages: true, - can_delete: true, - can_reorder: true, - can_rotate: true, - can_extract: true, - can_split: true, - can_merge: true, + can_detect_tables: false, + has_pages: false, }, metadata: MetadataCapabilities { - can_read: true, - can_modify: true, - can_annotate: true, - annotations_preserved: true, + can_extract: false, + has_annotations: false, }, } } - /// Returns capabilities for a read-only format. + /// Returns capabilities for a rich document format (PDF, DOCX). #[must_use] - pub fn read_only() -> Self { + pub fn rich_document() -> Self { Self { text: TextCapabilities { - can_read: true, - can_replace_preserving_format: false, - can_replace: false, - can_insert: false, - can_delete: false, - supports_rich_text: false, - supports_font_embedding: false, - }, - image: ImageCapabilities { can_extract: true, - ..Default::default() + has_rich_text: true, + may_need_ocr: false, }, structure: StructureCapabilities { can_detect_structure: true, can_detect_tables: true, - ..Default::default() - }, - page: PageCapabilities { has_pages: true, - ..Default::default() }, metadata: MetadataCapabilities { - can_read: true, - ..Default::default() + can_extract: true, + has_annotations: true, }, } } - /// Checks if the format supports a specific operation. + /// Returns capabilities for an image format. #[must_use] - pub fn supports(&self, operation: &EditOperation) -> OperationSupport { - match operation { - EditOperation::Content(op) => self.supports_content(op), - EditOperation::Insert(op) => self.supports_insert(op), - EditOperation::Structural(op) => self.supports_structural(op), - EditOperation::Page(op) => self.supports_page(op), - EditOperation::Document(op) => self.supports_document(op), - EditOperation::Metadata(op) => self.supports_metadata(op), - } - } - - fn supports_content(&self, op: &ContentOperation) -> OperationSupport { - match op { - ContentOperation::Redact { style, .. } => { - if !self.text.can_delete && !self.text.can_replace { - return OperationSupport::NotSupported; - } - match style { - RedactStyle::Blur { .. } if !self.image.can_blur => { - OperationSupport::Degraded("Blur not supported, will use black box") - } - RedactStyle::Pixelate { .. } if !self.image.can_pixelate => { - OperationSupport::Degraded("Pixelate not supported, will use black box") - } - _ => OperationSupport::Full, - } - } - - ContentOperation::ReplaceText { - preserve_formatting, - .. - } => { - if !self.text.can_replace { - OperationSupport::NotSupported - } else if *preserve_formatting && !self.text.can_replace_preserving_format { - OperationSupport::Degraded("Formatting may not be fully preserved") - } else { - OperationSupport::Full - } - } - - ContentOperation::ReplaceSubstring { .. 
} => { - if self.text.can_replace { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - ContentOperation::Delete { .. } => { - if self.text.can_delete { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - } - } - - fn supports_insert(&self, _op: &InsertOperation) -> OperationSupport { - if self.text.can_insert { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - fn supports_structural(&self, op: &StructuralOperation) -> OperationSupport { - match op { - StructuralOperation::Move { .. } => { - if self.structure.can_move { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - StructuralOperation::Copy { .. } => { - if self.structure.can_copy { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - StructuralOperation::Merge { .. } => { - if self.structure.can_merge { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - StructuralOperation::SplitRegion { .. } => { - if self.structure.can_split { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - } - } - - fn supports_page(&self, op: &PageOperation) -> OperationSupport { - match op { - PageOperation::DeletePages { .. } => { - if self.page.has_pages && self.page.can_delete { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - PageOperation::ReorderPages { .. } => { - if self.page.has_pages && self.page.can_reorder { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - PageOperation::RotatePages { .. } => { - if self.page.has_pages && self.page.can_rotate { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - - PageOperation::ExtractPages { .. } => { - if self.page.has_pages && self.page.can_extract { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - } - } - - fn supports_document(&self, op: &DocumentOperation) -> OperationSupport { - match op { - DocumentOperation::Split { .. } => { - if self.page.can_split { - OperationSupport::Full - } else { - OperationSupport::NotSupported - } - } - } - } - - fn supports_metadata(&self, op: &MetadataOperation) -> OperationSupport { - match op { - MetadataOperation::Reclassify { .. } | MetadataOperation::UpdateBounds { .. } => { - OperationSupport::Full - } - - MetadataOperation::Annotate { .. } => { - if self.metadata.can_annotate { - OperationSupport::Full - } else { - OperationSupport::Degraded("Annotations won't be persisted in output") - } - } + pub fn image() -> Self { + Self { + text: TextCapabilities { + can_extract: false, + has_rich_text: false, + may_need_ocr: true, + }, + structure: StructureCapabilities { + can_detect_structure: false, + can_detect_tables: false, + has_pages: false, + }, + metadata: MetadataCapabilities { + can_extract: true, // EXIF + has_annotations: false, + }, } } -} - -/// Result of checking operation support. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum OperationSupport { - /// Operation is fully supported. - Full, - - /// Operation is supported but may not work perfectly. - Degraded(&'static str), - - /// Operation is not supported. - NotSupported, -} - -impl OperationSupport { - /// Returns true if the operation can be attempted. - #[must_use] - pub const fn is_supported(&self) -> bool { - !matches!(self, Self::NotSupported) - } - /// Returns true if the operation is fully supported. + /// Returns capabilities for a spreadsheet format. 
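+    ///
+    /// A short sketch (sheets are modeled as pages, per the field
+    /// comment below):
+    ///
+    /// ```
+    /// use nvisy_document::Capabilities;
+    ///
+    /// let caps = Capabilities::spreadsheet();
+    /// assert!(caps.structure.can_detect_tables);
+    /// assert!(caps.structure.has_pages);
+    /// ```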
#[must_use] - pub const fn is_full(&self) -> bool { - matches!(self, Self::Full) - } -} - -impl Default for TextCapabilities { - fn default() -> Self { + pub fn spreadsheet() -> Self { Self { - can_read: true, - can_replace_preserving_format: false, - can_replace: false, - can_insert: false, - can_delete: false, - supports_rich_text: false, - supports_font_embedding: false, + text: TextCapabilities { + can_extract: true, + has_rich_text: false, + may_need_ocr: false, + }, + structure: StructureCapabilities { + can_detect_structure: true, + can_detect_tables: true, + has_pages: true, // Sheets as pages + }, + metadata: MetadataCapabilities { + can_extract: true, + has_annotations: false, + }, } } } @@ -443,44 +153,36 @@ impl Default for TextCapabilities { #[cfg(test)] mod tests { use super::*; - use crate::format::region::RegionId; #[test] - fn test_full_capabilities() { - let caps = Capabilities::full(); - let region = RegionId::new(); - - assert!(caps.supports(&EditOperation::redact(region)).is_full()); - assert!(caps.supports(&EditOperation::delete(region)).is_full()); + fn test_text_capabilities() { + let caps = Capabilities::text(); + assert!(caps.text.can_extract); + assert!(!caps.text.has_rich_text); + assert!(!caps.structure.can_detect_tables); } #[test] - fn test_read_only_capabilities() { - let caps = Capabilities::read_only(); - let region = RegionId::new(); - - assert!(!caps.supports(&EditOperation::delete(region)).is_supported()); - assert!(!caps - .supports(&EditOperation::replace_text(region, "test")) - .is_supported()); + fn test_rich_document_capabilities() { + let caps = Capabilities::rich_document(); + assert!(caps.text.can_extract); + assert!(caps.text.has_rich_text); + assert!(caps.structure.can_detect_tables); + assert!(caps.structure.has_pages); } #[test] - fn test_degraded_support() { - let mut caps = Capabilities::full(); - caps.text.can_replace_preserving_format = false; - - let region = RegionId::new(); - let op: EditOperation = ContentOperation::ReplaceText { - target: region, - new_text: "test".to_string(), - preserve_formatting: true, - } - .into(); + fn test_image_capabilities() { + let caps = Capabilities::image(); + assert!(!caps.text.can_extract); + assert!(caps.text.may_need_ocr); + assert!(caps.metadata.can_extract); + } - let support = caps.supports(&op); - assert!(support.is_supported()); - assert!(!support.is_full()); - assert!(matches!(support, OperationSupport::Degraded(_))); + #[test] + fn test_spreadsheet_capabilities() { + let caps = Capabilities::spreadsheet(); + assert!(caps.text.can_extract); + assert!(caps.structure.can_detect_tables); } } diff --git a/crates/nvisy-document/src/format/mod.rs b/crates/nvisy-document/src/format/mod.rs index 58d6296..58b1fa1 100644 --- a/crates/nvisy-document/src/format/mod.rs +++ b/crates/nvisy-document/src/format/mod.rs @@ -2,11 +2,8 @@ //! //! This module defines the core traits for document handling: //! -//! - [`DocumentFormat`]: A format handler (class/factory) that can load and create documents +//! - [`DocumentFormat`]: A format handler (class/factory) that can load documents //! - [`Document`]: A loaded document instance for reading document content -//! - [`EditableDocument`]: Extension trait for documents that support editing -//! -//! Think of `DocumentFormat` as a class and `Document` as an instance of that class. 
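+//!
+//! A sketch of the intended flow (hypothetical `bytes: Bytes` input; marked
+//! `ignore` because `DocxFormat::load` is not implemented yet):
+//!
+//! ```ignore
+//! let format = DocxFormat::new();
+//! let doc = format.load(bytes).await?;
+//! let round_trip = doc.to_bytes().await?;
+//! ```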
mod capabilities; mod info; @@ -19,24 +16,22 @@ use std::future::Future; use async_trait::async_trait; use bytes::Bytes; pub use capabilities::{ - Capabilities, ImageCapabilities, MetadataCapabilities, OperationSupport, PageCapabilities, - StructureCapabilities, TextCapabilities, + Capabilities, MetadataCapabilities, StructureCapabilities, TextCapabilities, }; pub use info::DocumentInfo; pub use page::PageOptions; pub use region::{BoundingBox, Point, Region, RegionId, RegionKind, RegionSource, RegionStatus}; use crate::error::Result; -use crate::operation::{EditOperation, EditResult}; /// Trait for document format handlers with an associated Document type. /// -/// A `DocumentFormat` is like a class that knows how to load and create +/// A `DocumentFormat` is like a class that knows how to load /// documents of a specific format. Each format implementation provides /// a concrete `Document` type. pub trait DocumentFormat: Send + Sync { /// The concrete document type produced by this format. - type Document: EditableDocument; + type Document: Document; /// Returns the format name (e.g., "pdf", "docx"). fn name(&self) -> &'static str; @@ -52,15 +47,11 @@ pub trait DocumentFormat: Send + Sync { /// Loads a document from bytes. fn load(&self, data: Bytes) -> impl Future<Output = Result<Self::Document>> + Send; - - /// Creates a new empty document. - fn create_empty(&self) -> impl Future<Output = Result<Self::Document>> + Send; } /// A loaded document instance (read-only access). /// /// Documents provide read access to document content and structure. -/// For editing capabilities, see [`EditableDocument`]. #[async_trait] pub trait Document: Send + Sync { /// Returns document information. @@ -76,22 +67,5 @@ pub trait Document: Send + Sync { fn find_region(&self, id: RegionId) -> Option<&Region>; /// Serializes the document to bytes. - async fn serialize(&self) -> Result<Bytes>; -} - -/// Extension trait for documents that support editing. -/// -/// This trait extends [`Document`] with mutation capabilities. -/// Not all document formats support editing - check the format's -/// [`Capabilities`] to determine what operations are supported. -#[async_trait] -pub trait EditableDocument: Document { - /// Applies an edit operation to the document. - async fn apply(&mut self, operation: &EditOperation) -> Result<EditResult>; - - /// Returns whether the document has unsaved changes. - fn is_modified(&self) -> bool; - - /// Extracts regions for specific pages (for streaming/pagination). - async fn extract_page_regions(&mut self, options: &PageOptions) -> Result<Vec<Region>>; + async fn to_bytes(&self) -> Result<Bytes>; } diff --git a/crates/nvisy-document/src/format/page.rs b/crates/nvisy-document/src/format/page.rs index ac11c74..bd8e129 100644 --- a/crates/nvisy-document/src/format/page.rs +++ b/crates/nvisy-document/src/format/page.rs @@ -1,4 +1,4 @@ -//! Page-related types for document operations. +//! Page-related types for document processing. /// Page extraction options. #[derive(Debug, Clone, Default)] diff --git a/crates/nvisy-document/src/format/region/core.rs b/crates/nvisy-document/src/format/region/core.rs index 2e10c4e..46a26c5 100644 --- a/crates/nvisy-document/src/format/region/core.rs +++ b/crates/nvisy-document/src/format/region/core.rs @@ -6,10 +6,10 @@ use serde::{Deserialize, Serialize}; use super::{BoundingBox, RegionId, RegionKind, RegionSource, RegionStatus}; -/// A region within a document that can be referenced and modified. /// -/// Regions are the fundamental unit for VLM-driven document editing.
-/// Each region has a stable ID, spatial bounds, and optional text content. +/// Regions represent semantically meaningful parts of a document +/// (paragraphs, tables, images, etc.) with stable IDs and spatial bounds. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Region { /// Unique identifier for this region. @@ -27,10 +27,10 @@ pub struct Region { /// Semantic type of this region. pub kind: RegionKind, - /// Current status within the edit session (None means Active). + /// Current status of the region. pub status: Option<RegionStatus>, - /// How this region was identified/created. + /// How this region was identified. pub source: RegionSource, /// Parent region ID, if this is a nested region. @@ -118,8 +118,10 @@ impl Region { } - /// Adds a child region ID. - pub fn add_child(&mut self, child: RegionId) { + /// Adds a child region ID, returning the updated region. + #[must_use] + pub fn with_child(mut self, child: RegionId) -> Self { self.children.push(child); + self } /// Returns the effective status (defaults to Active if None). @@ -128,12 +130,6 @@ impl Region { self.status.unwrap_or(RegionStatus::Active) } - /// Returns true if this region is still valid for operations. - #[must_use] - pub fn is_valid(&self) -> bool { - self.effective_status().is_valid() - } - /// Returns true if this region has text content. #[must_use] pub fn has_text(&self) -> bool { @@ -146,34 +142,10 @@ impl Region { self.kind.is_container() || !self.children.is_empty() } - /// Returns true if this region can have its text edited. + /// Returns true if this region kind typically contains text. #[must_use] - pub fn is_text_editable(&self) -> bool { - self.kind.is_text_editable() && self.is_valid() - } - - /// Marks the region as modified. - pub fn mark_modified(&mut self) { - if self.effective_status() == RegionStatus::Active { - self.status = Some(RegionStatus::Modified); - } - } - - /// Marks the region as deleted. - pub fn mark_deleted(&mut self) { - self.status = Some(RegionStatus::Deleted); - } - - /// Updates the text content and marks as modified. - pub fn update_text(&mut self, new_text: String) { - self.text = Some(new_text); - self.mark_modified(); - } - - /// Updates the bounds and marks as modified.
- pub fn update_bounds(&mut self, new_bounds: BoundingBox) { - self.bounds = new_bounds; - self.mark_modified(); + pub fn has_text_content(&self) -> bool { + self.kind.has_text_content() } } @@ -192,7 +164,6 @@ mod tests { let bounds = BoundingBox::new(0.1, 0.2, 0.3, 0.4); let region = Region::new(bounds); - assert!(region.is_valid()); assert!(!region.has_text()); assert_eq!(region.kind, RegionKind::Unknown); assert_eq!(region.effective_status(), RegionStatus::Active); @@ -212,27 +183,15 @@ mod tests { } #[test] - fn test_region_modification() { - let mut region = Region::new(BoundingBox::default()).with_text("Original"); - - assert!(region.status.is_none()); - assert_eq!(region.effective_status(), RegionStatus::Active); - - region.update_text("Modified".to_string()); - - assert_eq!(region.status, Some(RegionStatus::Modified)); - assert_eq!(region.text.as_deref(), Some("Modified")); - } - - #[test] - fn test_region_deletion() { - let mut region = Region::new(BoundingBox::default()); - assert!(region.is_valid()); + fn test_region_has_text() { + let region = Region::new(BoundingBox::default()).with_text("Some text"); + assert!(region.has_text()); - region.mark_deleted(); + let empty_region = Region::new(BoundingBox::default()); + assert!(!empty_region.has_text()); - assert!(!region.is_valid()); - assert_eq!(region.status, Some(RegionStatus::Deleted)); + let empty_text_region = Region::new(BoundingBox::default()).with_text(""); + assert!(!empty_text_region.has_text()); } #[test] diff --git a/crates/nvisy-document/src/format/region/kind.rs b/crates/nvisy-document/src/format/region/kind.rs index 2d5182d..1cae528 100644 --- a/crates/nvisy-document/src/format/region/kind.rs +++ b/crates/nvisy-document/src/format/region/kind.rs @@ -4,8 +4,7 @@ use serde::{Deserialize, Serialize}; /// Classification of a document region by its semantic type. /// -/// This helps VLMs understand the context of each region and -/// guides appropriate editing operations. +/// Helps understand the context of each region and what type of content it contains. #[derive( Debug, Default, @@ -76,9 +75,9 @@ pub enum RegionKind { } impl RegionKind { - /// Returns true if this region typically contains editable text. + /// Returns true if this region typically contains extractable text. #[must_use] - pub const fn is_text_editable(&self) -> bool { + pub const fn has_text_content(&self) -> bool { matches!( self, Self::Text @@ -99,16 +98,13 @@ impl RegionKind { matches!(self, Self::Table | Self::TableRow | Self::List) } - /// Returns true if this region can be redacted. + /// Returns true if this region represents structural content. #[must_use] - pub const fn is_redactable(&self) -> bool { - !matches!(self, Self::Unknown) - } - - /// Returns true if this region can be deleted. 
- #[must_use] - pub const fn is_deletable(&self) -> bool { - true + pub const fn is_structural(&self) -> bool { + matches!( + self, + Self::Table | Self::TableRow | Self::List | Self::Header | Self::Footer + ) } } @@ -117,12 +113,12 @@ mod tests { use super::*; #[test] - fn test_text_editable() { - assert!(RegionKind::Text.is_text_editable()); - assert!(RegionKind::Heading.is_text_editable()); - assert!(RegionKind::TableCell.is_text_editable()); - assert!(!RegionKind::Image.is_text_editable()); - assert!(!RegionKind::Table.is_text_editable()); + fn test_has_text_content() { + assert!(RegionKind::Text.has_text_content()); + assert!(RegionKind::Heading.has_text_content()); + assert!(RegionKind::TableCell.has_text_content()); + assert!(!RegionKind::Image.has_text_content()); + assert!(!RegionKind::Table.has_text_content()); } #[test] diff --git a/crates/nvisy-document/src/format/region/mod.rs b/crates/nvisy-document/src/format/region/mod.rs index adf896e..e9bde8e 100644 --- a/crates/nvisy-document/src/format/region/mod.rs +++ b/crates/nvisy-document/src/format/region/mod.rs @@ -1,8 +1,7 @@ -//! Region types for document manipulation. +//! Region types for document structure. //! -//! Regions are the fundamental unit for VLM-driven document editing. -//! Each region represents a semantically meaningful part of a document -//! (paragraph, table, image, etc.) that can be referenced and modified. +//! Regions represent semantically meaningful parts of a document +//! (paragraphs, tables, images, etc.) that can be referenced and extracted. mod bounds; mod core; @@ -11,9 +10,8 @@ mod kind; mod source; mod status; -pub use core::Region; - pub use bounds::{BoundingBox, Point}; +pub use core::Region; pub use id::RegionId; pub use kind::RegionKind; pub use source::RegionSource; diff --git a/crates/nvisy-document/src/format/region/status.rs b/crates/nvisy-document/src/format/region/status.rs index 7402926..1a3b0d5 100644 --- a/crates/nvisy-document/src/format/region/status.rs +++ b/crates/nvisy-document/src/format/region/status.rs @@ -2,10 +2,9 @@ use serde::{Deserialize, Serialize}; -/// Status of a region within an edit session. +/// Status of a region within a document. /// -/// Tracks the lifecycle of regions as edits are applied, -/// enabling stable references across multi-turn VLM interactions. +/// Describes how a region was detected or its state in the document. #[derive( Debug, Default, @@ -19,43 +18,28 @@ use serde::{Deserialize, Serialize}; )] #[serde(rename_all = "snake_case")] pub enum RegionStatus { - /// Region is active and unchanged from its original state. + /// Region is active and valid. #[default] Active, - /// Region content has been modified. - Modified, + /// Region is hidden or collapsed. + Hidden, - /// Region has been deleted. - Deleted, - - /// Region was split into multiple regions. - Split, - - /// Region was merged with another region. - Merged, - - /// Region was created during this session (not in original document). - Created, + /// Region content is empty. + Empty, } impl RegionStatus { - /// Returns true if the region is still valid for operations. - #[must_use] - pub const fn is_valid(&self) -> bool { - matches!(self, Self::Active | Self::Modified | Self::Created) - } - - /// Returns true if the region has been removed. + /// Returns true if the region is visible. 
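+    ///
+    /// Mirrors `test_status_visibility` below:
+    ///
+    /// ```
+    /// use nvisy_document::RegionStatus;
+    ///
+    /// assert!(RegionStatus::Active.is_visible());
+    /// assert!(RegionStatus::Empty.is_visible());
+    /// assert!(!RegionStatus::Hidden.is_visible());
+    /// ```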
#[must_use] - pub const fn is_removed(&self) -> bool { - matches!(self, Self::Deleted | Self::Merged) + pub const fn is_visible(&self) -> bool { + matches!(self, Self::Active | Self::Empty) } - /// Returns true if the region was changed from its original state. + /// Returns true if the region has content. #[must_use] - pub const fn is_changed(&self) -> bool { - !matches!(self, Self::Active) + pub const fn has_content(&self) -> bool { + matches!(self, Self::Active) } } @@ -64,19 +48,17 @@ mod tests { use super::*; #[test] - fn test_status_validity() { - assert!(RegionStatus::Active.is_valid()); - assert!(RegionStatus::Modified.is_valid()); - assert!(RegionStatus::Created.is_valid()); - assert!(!RegionStatus::Deleted.is_valid()); - assert!(!RegionStatus::Merged.is_valid()); + fn test_status_visibility() { + assert!(RegionStatus::Active.is_visible()); + assert!(RegionStatus::Empty.is_visible()); + assert!(!RegionStatus::Hidden.is_visible()); } #[test] - fn test_status_removed() { - assert!(!RegionStatus::Active.is_removed()); - assert!(RegionStatus::Deleted.is_removed()); - assert!(RegionStatus::Merged.is_removed()); + fn test_status_has_content() { + assert!(RegionStatus::Active.has_content()); + assert!(!RegionStatus::Empty.has_content()); + assert!(!RegionStatus::Hidden.has_content()); } #[test] diff --git a/crates/nvisy-document/src/lib.rs b/crates/nvisy-document/src/lib.rs index 34435f8..ec53667 100644 --- a/crates/nvisy-document/src/lib.rs +++ b/crates/nvisy-document/src/lib.rs @@ -5,7 +5,6 @@ // Core modules pub mod error; pub mod format; -pub mod operation; // Extension trait modules pub mod conversion; @@ -15,36 +14,36 @@ pub mod text; pub mod thumbnail; // Error re-exports -// Conversion re-exports -pub use conversion::{ - Conversion, ConversionOptions, ConversionPath, ConversionResult, ConversionStep, FormatPair, - HtmlOptions, PageMargins, PageOrientation, PdfOptions, SkippedElement, -}; pub use error::{BoxError, Error, ErrorKind, Result}; + // Region re-exports (from format::region) pub use format::region::{ BoundingBox, Point, Region, RegionId, RegionKind, RegionSource, RegionStatus, }; + // Format re-exports pub use format::{ - Capabilities, Document, DocumentFormat, DocumentInfo, EditableDocument, ImageCapabilities, - MetadataCapabilities, OperationSupport, PageCapabilities, PageOptions, StructureCapabilities, - TextCapabilities, + Capabilities, Document, DocumentFormat, DocumentInfo, MetadataCapabilities, PageOptions, + StructureCapabilities, TextCapabilities, +}; + +// Conversion re-exports +pub use conversion::{ + Conversion, ConversionOptions, ConversionPath, ConversionResult, ConversionStep, FormatPair, + HtmlOptions, PageMargins, PageOrientation, PdfOptions, SkippedElement, }; + // Metadata re-exports pub use metadata::{ CustomProperty, DocumentMetadata, Metadata, MetadataExtractOptions, MetadataField, PropertyValue, }; -// Operation re-exports -pub use operation::{ - ContentOperation, DocumentOperation, EditOperation, EditResult, InsertContent, InsertOperation, - MergeOrder, MetadataOperation, PageOperation, RedactStyle, SplitBoundary, StructuralOperation, - TextStyle, -}; + // Table re-exports pub use table::{CellDataType, NormalizedCell, NormalizedRow, NormalizedTable, TableExtractor}; + // Text re-exports pub use text::{ExtractedText, TextExtractor}; + // Thumbnail re-exports pub use thumbnail::{ImageFormat, Thumbnail, ThumbnailGenerator, ThumbnailOptions, ThumbnailSize}; diff --git a/crates/nvisy-document/src/operation/insert.rs 
b/crates/nvisy-document/src/operation/insert.rs deleted file mode 100644 index 40636b5..0000000 --- a/crates/nvisy-document/src/operation/insert.rs +++ /dev/null @@ -1,160 +0,0 @@ -//! Insert content types. - -use bytes::Bytes; -use serde::{Deserialize, Serialize}; - -use crate::format::region::RegionKind; - -/// Content to insert into a document. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "type")] -pub enum InsertContent { - /// Plain text content. - Text { - /// The text to insert. - content: String, - - /// Optional style hint. - style: Option, - }, - - /// Image content. - Image { - /// Image data. - #[serde(with = "bytes_serde")] - data: Bytes, - - /// MIME type (e.g., "image/png"). - mime_type: String, - - /// Optional alt text. - alt_text: Option, - }, - - /// Page break. - PageBreak, - - /// Section break. - SectionBreak, - - /// Horizontal rule/divider. - HorizontalRule, -} - -/// Text style hints for insertion. -#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum TextStyle { - /// Normal paragraph text. - #[default] - Normal, - - /// Heading level 1-6. - Heading(u8), - - /// Bold text. - Bold, - - /// Italic text. - Italic, - - /// Code/monospace text. - Code, - - /// Block quote. - Quote, -} - -impl InsertContent { - /// Creates a text insert with the given content. - #[must_use] - pub fn text(content: impl Into) -> Self { - Self::Text { - content: content.into(), - style: None, - } - } - - /// Creates a text insert with style. - #[must_use] - pub fn styled_text(content: impl Into, style: TextStyle) -> Self { - Self::Text { - content: content.into(), - style: Some(style), - } - } - - /// Creates an image insert. - #[must_use] - pub fn image(data: Bytes, mime_type: impl Into) -> Self { - Self::Image { - data, - mime_type: mime_type.into(), - alt_text: None, - } - } - - /// Returns the region kind this content would create. - #[must_use] - pub fn region_kind(&self) -> RegionKind { - match self { - Self::Text { style, .. } => match style { - Some(TextStyle::Heading(_)) => RegionKind::Heading, - Some(TextStyle::Code) => RegionKind::Code, - Some(TextStyle::Quote) => RegionKind::Quote, - _ => RegionKind::Text, - }, - Self::Image { .. } => RegionKind::Image, - Self::PageBreak | Self::SectionBreak | Self::HorizontalRule => RegionKind::Unknown, - } - } -} - -/// Serde helper for Bytes. -mod bytes_serde { - use bytes::Bytes; - use serde::{Deserialize, Deserializer, Serialize, Serializer}; - - pub fn serialize(bytes: &Bytes, serializer: S) -> Result - where - S: Serializer, - { - base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes) - .serialize(serializer) - } - - pub fn deserialize<'de, D>(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let s = String::deserialize(deserializer)?; - base64::Engine::decode(&base64::engine::general_purpose::STANDARD, &s) - .map(Bytes::from) - .map_err(serde::de::Error::custom) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_text_insert() { - let content = InsertContent::text("Hello, world!"); - assert!(matches!(content, InsertContent::Text { .. 
})); - assert_eq!(content.region_kind(), RegionKind::Text); - } - - #[test] - fn test_styled_text() { - let content = InsertContent::styled_text("Title", TextStyle::Heading(1)); - assert_eq!(content.region_kind(), RegionKind::Heading); - } - - #[test] - fn test_image_insert() { - let data = Bytes::from(vec![0u8; 10]); - let content = InsertContent::image(data, "image/png"); - assert_eq!(content.region_kind(), RegionKind::Image); - } -} diff --git a/crates/nvisy-document/src/operation/mod.rs b/crates/nvisy-document/src/operation/mod.rs deleted file mode 100644 index b523ecd..0000000 --- a/crates/nvisy-document/src/operation/mod.rs +++ /dev/null @@ -1,542 +0,0 @@ -//! Document edit operations. -//! -//! This module defines all the operations that can be performed on a document. -//! Operations are designed to be: -//! - Reversible (for undo/redo support) -//! - Serializable (for persistence and VLM communication) -//! - Format-agnostic (implementations handle format-specific details) - -mod insert; -mod redact; -mod result; -mod split; - -use derive_more::From; -pub use insert::{InsertContent, TextStyle}; -pub use redact::RedactStyle; -pub use result::EditResult; -use serde::{Deserialize, Serialize}; -pub use split::{MergeOrder, SplitBoundary}; - -use crate::format::region::{BoundingBox, RegionId, RegionKind}; - -/// Content modification operations. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "operation")] -pub enum ContentOperation { - /// Redact content within a region. - Redact { - /// Target region to redact. - target: RegionId, - - /// Redaction style. - #[serde(default)] - style: RedactStyle, - }, - - /// Replace text content in a region. - ReplaceText { - /// Target region. - target: RegionId, - - /// New text content. - new_text: String, - - /// Whether to preserve original formatting. - #[serde(default = "default_true")] - preserve_formatting: bool, - }, - - /// Replace a substring within a region's text. - ReplaceSubstring { - /// Target region. - target: RegionId, - - /// Text to find (first occurrence). - find: String, - - /// Text to replace with. - replace: String, - - /// Replace all occurrences vs just the first. - #[serde(default)] - replace_all: bool, - }, - - /// Delete a region entirely. - Delete { - /// Target region to delete. - target: RegionId, - - /// Whether to collapse space left by deletion. - #[serde(default = "default_true")] - collapse_space: bool, - }, -} - -/// Insertion operations. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "operation")] -pub enum InsertOperation { - /// Insert content before a region. - InsertBefore { - /// Region to insert before. - target: RegionId, - - /// Content to insert. - content: InsertContent, - }, - - /// Insert content after a region. - InsertAfter { - /// Region to insert after. - target: RegionId, - - /// Content to insert. - content: InsertContent, - }, - - /// Insert content at the start of a region (for containers). - InsertStart { - /// Container region. - target: RegionId, - - /// Content to insert. - content: InsertContent, - }, - - /// Insert content at the end of a region (for containers). - InsertEnd { - /// Container region. - target: RegionId, - - /// Content to insert. - content: InsertContent, - }, -} - -/// Structural operations for moving, copying, merging, and splitting. 
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "operation")] -pub enum StructuralOperation { - /// Move a region to a new location. - Move { - /// Region to move. - source: RegionId, - - /// Target location (insert after this region). - target: RegionId, - }, - - /// Copy a region to a new location. - Copy { - /// Region to copy. - source: RegionId, - - /// Target location (insert after this region). - target: RegionId, - }, - - /// Merge multiple regions into one. - Merge { - /// Regions to merge (in order). - regions: Vec, - - /// Separator between merged content. - separator: Option, - }, - - /// Split a region at a specific point. - SplitRegion { - /// Region to split. - target: RegionId, - - /// Character offset to split at. - at_offset: usize, - }, -} - -/// Page-level operations. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "operation")] -pub enum PageOperation { - /// Delete specific pages. - DeletePages { - /// Page numbers to delete (0-indexed). - pages: Vec, - }, - - /// Reorder pages. - ReorderPages { - /// New page order (each value is the old page index). - new_order: Vec, - }, - - /// Rotate pages. - RotatePages { - /// Page numbers to rotate (0-indexed). - pages: Vec, - - /// Rotation in degrees (90, 180, 270). - degrees: i16, - }, - - /// Extract pages to a new document. - ExtractPages { - /// Page numbers to extract (0-indexed). - pages: Vec, - }, -} - -/// Document-level operations. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "operation")] -pub enum DocumentOperation { - /// Split document at specified boundaries. - Split { - /// Split boundary definitions. - boundaries: Vec, - }, -} - -/// Metadata operations for classification, bounds, and annotations. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "operation")] -pub enum MetadataOperation { - /// Change region kind/classification. - Reclassify { - /// Target region. - target: RegionId, - - /// New region kind. - new_kind: RegionKind, - }, - - /// Update region bounds (for layout adjustments). - UpdateBounds { - /// Target region. - target: RegionId, - - /// New bounding box. - new_bounds: BoundingBox, - }, - - /// Add annotation/comment to a region. - Annotate { - /// Target region. - target: RegionId, - - /// Annotation text. - annotation: String, - - /// Annotation author (optional). - author: Option, - }, -} - -/// An edit operation to be applied to a document. -/// -/// Operations target specific regions by their stable IDs, allowing -/// VLM-driven workflows to reference regions across multiple turns. -#[derive(Debug, Clone, PartialEq, From, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", untagged)] -pub enum EditOperation { - /// Content modification operations. - Content(ContentOperation), - - /// Insertion operations. - Insert(InsertOperation), - - /// Structural operations. - Structural(StructuralOperation), - - /// Page-level operations. - Page(PageOperation), - - /// Document-level operations. - Document(DocumentOperation), - - /// Metadata operations. - Metadata(MetadataOperation), -} - -fn default_true() -> bool { - true -} - -impl EditOperation { - /// Returns the primary target region of this operation, if any. 
- #[must_use] - pub fn target(&self) -> Option { - match self { - Self::Content(op) => op.target(), - Self::Insert(op) => op.target(), - Self::Structural(op) => op.target(), - Self::Page(_) => None, - Self::Document(_) => None, - Self::Metadata(op) => op.target(), - } - } - - /// Returns all region IDs referenced by this operation. - #[must_use] - pub fn referenced_regions(&self) -> Vec { - match self { - Self::Content(op) => op.referenced_regions(), - Self::Insert(op) => op.referenced_regions(), - Self::Structural(op) => op.referenced_regions(), - Self::Page(_) => vec![], - Self::Document(op) => op.referenced_regions(), - Self::Metadata(op) => op.referenced_regions(), - } - } - - /// Returns true if this operation modifies content (vs. metadata only). - #[must_use] - pub const fn modifies_content(&self) -> bool { - match self { - Self::Content(_) - | Self::Insert(_) - | Self::Structural(_) - | Self::Page(_) - | Self::Document(_) => true, - Self::Metadata(_) => false, - } - } - - /// Returns true if this operation is reversible. - #[must_use] - pub const fn is_reversible(&self) -> bool { - true - } - - /// Creates a redact operation with default style. - #[must_use] - pub fn redact(target: RegionId) -> Self { - ContentOperation::Redact { - target, - style: RedactStyle::default(), - } - .into() - } - - /// Creates a redact operation with custom style. - #[must_use] - pub fn redact_with_style(target: RegionId, style: RedactStyle) -> Self { - ContentOperation::Redact { target, style }.into() - } - - /// Creates a replace text operation. - #[must_use] - pub fn replace_text(target: RegionId, new_text: impl Into) -> Self { - ContentOperation::ReplaceText { - target, - new_text: new_text.into(), - preserve_formatting: true, - } - .into() - } - - /// Creates a delete operation. - #[must_use] - pub fn delete(target: RegionId) -> Self { - ContentOperation::Delete { - target, - collapse_space: true, - } - .into() - } - - /// Creates an insert after operation. - #[must_use] - pub fn insert_after(target: RegionId, content: InsertContent) -> Self { - InsertOperation::InsertAfter { target, content }.into() - } - - /// Creates an insert before operation. - #[must_use] - pub fn insert_before(target: RegionId, content: InsertContent) -> Self { - InsertOperation::InsertBefore { target, content }.into() - } -} - -impl ContentOperation { - /// Returns the target region of this operation. - #[must_use] - pub fn target(&self) -> Option { - match self { - Self::Redact { target, .. } - | Self::ReplaceText { target, .. } - | Self::ReplaceSubstring { target, .. } - | Self::Delete { target, .. } => Some(*target), - } - } - - /// Returns all region IDs referenced by this operation. - #[must_use] - pub fn referenced_regions(&self) -> Vec { - self.target().into_iter().collect() - } -} - -impl InsertOperation { - /// Returns the target region of this operation. - #[must_use] - pub fn target(&self) -> Option { - match self { - Self::InsertBefore { target, .. } - | Self::InsertAfter { target, .. } - | Self::InsertStart { target, .. } - | Self::InsertEnd { target, .. } => Some(*target), - } - } - - /// Returns all region IDs referenced by this operation. - #[must_use] - pub fn referenced_regions(&self) -> Vec { - self.target().into_iter().collect() - } -} - -impl StructuralOperation { - /// Returns the primary target region of this operation. - #[must_use] - pub fn target(&self) -> Option { - match self { - Self::Move { source, .. } | Self::Copy { source, .. } => Some(*source), - Self::Merge { regions, .. 
} => regions.first().copied(), - Self::SplitRegion { target, .. } => Some(*target), - } - } - - /// Returns all region IDs referenced by this operation. - #[must_use] - pub fn referenced_regions(&self) -> Vec { - match self { - Self::Move { source, target } | Self::Copy { source, target } => vec![*source, *target], - Self::Merge { regions, .. } => regions.clone(), - Self::SplitRegion { target, .. } => vec![*target], - } - } -} - -impl DocumentOperation { - /// Returns all region IDs referenced by this operation. - #[must_use] - pub fn referenced_regions(&self) -> Vec { - match self { - Self::Split { boundaries } => boundaries - .iter() - .filter_map(|b| match b { - SplitBoundary::AfterRegion { region } => Some(*region), - _ => None, - }) - .collect(), - } - } -} - -impl MetadataOperation { - /// Returns the target region of this operation. - #[must_use] - pub fn target(&self) -> Option { - match self { - Self::Reclassify { target, .. } - | Self::UpdateBounds { target, .. } - | Self::Annotate { target, .. } => Some(*target), - } - } - - /// Returns all region IDs referenced by this operation. - #[must_use] - pub fn referenced_regions(&self) -> Vec { - self.target().into_iter().collect() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_target_extraction() { - let region = RegionId::new(); - let op = EditOperation::redact(region); - assert_eq!(op.target(), Some(region)); - } - - #[test] - fn test_referenced_regions() { - let r1 = RegionId::new(); - let r2 = RegionId::new(); - - let op: EditOperation = StructuralOperation::Move { - source: r1, - target: r2, - } - .into(); - let refs = op.referenced_regions(); - assert_eq!(refs.len(), 2); - assert!(refs.contains(&r1)); - assert!(refs.contains(&r2)); - } - - #[test] - fn test_modifies_content() { - let region = RegionId::new(); - - assert!(EditOperation::redact(region).modifies_content()); - assert!(EditOperation::delete(region).modifies_content()); - - let annotate: EditOperation = MetadataOperation::Annotate { - target: region, - annotation: "test".to_string(), - author: None, - } - .into(); - assert!(!annotate.modifies_content()); - } - - #[test] - fn test_from_impls() { - let region = RegionId::new(); - - let _: EditOperation = ContentOperation::Delete { - target: region, - collapse_space: true, - } - .into(); - - let _: EditOperation = InsertOperation::InsertAfter { - target: region, - content: InsertContent::text("test"), - } - .into(); - - let _: EditOperation = StructuralOperation::SplitRegion { - target: region, - at_offset: 10, - } - .into(); - - let _: EditOperation = PageOperation::DeletePages { pages: vec![0] }.into(); - - let _: EditOperation = DocumentOperation::Split { boundaries: vec![] }.into(); - - let _: EditOperation = MetadataOperation::Reclassify { - target: region, - new_kind: RegionKind::Text, - } - .into(); - } - - #[test] - fn test_serde() { - let region = RegionId::new(); - let op = EditOperation::replace_text(region, "Hello, world!"); - - let json = serde_json::to_string_pretty(&op).unwrap(); - let parsed: EditOperation = serde_json::from_str(&json).unwrap(); - assert_eq!(op, parsed); - } -} diff --git a/crates/nvisy-document/src/operation/redact.rs b/crates/nvisy-document/src/operation/redact.rs deleted file mode 100644 index 9776971..0000000 --- a/crates/nvisy-document/src/operation/redact.rs +++ /dev/null @@ -1,103 +0,0 @@ -//! Redaction styles and options. - -use serde::{Deserialize, Serialize}; - -/// Style for redacting content. 
-#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum RedactStyle { - /// Black box overlay (content hidden but space preserved). - #[default] - BlackBox, - - /// White box overlay (content hidden, blends with background). - WhiteBox, - - /// Replace with placeholder text. - Placeholder { - /// The placeholder text to show. - text: String, - }, - - /// Blur effect (for images, if supported). - Blur { - /// Blur intensity (1-10). - intensity: u8, - }, - - /// Pixelate effect (for images, if supported). - Pixelate { - /// Block size in pixels. - block_size: u8, - }, - - /// Complete removal (content and space removed). - Remove, -} - -impl RedactStyle { - /// Creates a placeholder redaction with the given text. - #[must_use] - pub fn placeholder(text: impl Into) -> Self { - Self::Placeholder { text: text.into() } - } - - /// Creates a blur redaction with the given intensity. - #[must_use] - pub fn blur(intensity: u8) -> Self { - Self::Blur { - intensity: intensity.clamp(1, 10), - } - } - - /// Creates a pixelate redaction with the given block size. - #[must_use] - pub fn pixelate(block_size: u8) -> Self { - Self::Pixelate { - block_size: block_size.max(1), - } - } - - /// Returns true if this style preserves the original space. - #[must_use] - pub const fn preserves_space(&self) -> bool { - !matches!(self, Self::Remove) - } - - /// Returns true if this style is suitable for images. - #[must_use] - pub const fn is_image_style(&self) -> bool { - matches!(self, Self::Blur { .. } | Self::Pixelate { .. }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_redact_style_default() { - assert_eq!(RedactStyle::default(), RedactStyle::BlackBox); - } - - #[test] - fn test_placeholder() { - let style = RedactStyle::placeholder("[REDACTED]"); - assert!(matches!(style, RedactStyle::Placeholder { text } if text == "[REDACTED]")); - } - - #[test] - fn test_preserves_space() { - assert!(RedactStyle::BlackBox.preserves_space()); - assert!(RedactStyle::placeholder("X").preserves_space()); - assert!(!RedactStyle::Remove.preserves_space()); - } - - #[test] - fn test_serde() { - let style = RedactStyle::Blur { intensity: 5 }; - let json = serde_json::to_string(&style).unwrap(); - let parsed: RedactStyle = serde_json::from_str(&json).unwrap(); - assert_eq!(style, parsed); - } -} diff --git a/crates/nvisy-document/src/operation/result.rs b/crates/nvisy-document/src/operation/result.rs deleted file mode 100644 index 92d083e..0000000 --- a/crates/nvisy-document/src/operation/result.rs +++ /dev/null @@ -1,136 +0,0 @@ -//! Edit operation result types. - -use super::EditOperation; -use crate::format::region::{Region, RegionId}; - -/// Result of applying an edit operation. -#[derive(Debug, Clone)] -pub struct EditResult { - /// Whether the operation succeeded. - pub success: bool, - - /// New regions created by the operation. - pub created_regions: Vec, - - /// Regions modified by the operation. - pub modified_regions: Vec, - - /// Regions deleted by the operation. - pub deleted_region_ids: Vec, - - /// Reverse operation for undo support. - pub reverse_operation: Option, - - /// Warnings generated during the operation. - pub warnings: Vec, -} - -impl EditResult { - /// Creates a successful edit result with no changes. 
- #[must_use] - pub fn success() -> Self { - Self { - success: true, - created_regions: vec![], - modified_regions: vec![], - deleted_region_ids: vec![], - reverse_operation: None, - warnings: vec![], - } - } - - /// Creates a failed edit result. - #[must_use] - pub fn failed() -> Self { - Self { - success: false, - created_regions: vec![], - modified_regions: vec![], - deleted_region_ids: vec![], - reverse_operation: None, - warnings: vec![], - } - } - - /// Adds a created region. - #[must_use] - pub fn with_created(mut self, region: Region) -> Self { - self.created_regions.push(region); - self - } - - /// Adds a modified region. - #[must_use] - pub fn with_modified(mut self, region: Region) -> Self { - self.modified_regions.push(region); - self - } - - /// Adds a deleted region ID. - #[must_use] - pub fn with_deleted(mut self, id: RegionId) -> Self { - self.deleted_region_ids.push(id); - self - } - - /// Sets the reverse operation. - #[must_use] - pub fn with_reverse(mut self, op: EditOperation) -> Self { - self.reverse_operation = Some(op); - self - } - - /// Adds a warning. - #[must_use] - pub fn with_warning(mut self, warning: impl Into) -> Self { - self.warnings.push(warning.into()); - self - } - - /// Returns true if any regions were affected. - #[must_use] - pub fn has_changes(&self) -> bool { - !self.created_regions.is_empty() - || !self.modified_regions.is_empty() - || !self.deleted_region_ids.is_empty() - } - - /// Returns the total number of affected regions. - #[must_use] - pub fn affected_count(&self) -> usize { - self.created_regions.len() + self.modified_regions.len() + self.deleted_region_ids.len() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_edit_result_success() { - let result = EditResult::success(); - assert!(result.success); - assert!(!result.has_changes()); - assert_eq!(result.affected_count(), 0); - } - - #[test] - fn test_edit_result_failed() { - let result = EditResult::failed(); - assert!(!result.success); - } - - #[test] - fn test_edit_result_builder() { - let region = Region::text("test"); - let result = EditResult::success() - .with_created(region) - .with_warning("Minor issue"); - - assert!(result.success); - assert_eq!(result.created_regions.len(), 1); - assert_eq!(result.warnings.len(), 1); - assert!(result.has_changes()); - assert_eq!(result.affected_count(), 1); - } -} diff --git a/crates/nvisy-document/src/operation/split.rs b/crates/nvisy-document/src/operation/split.rs deleted file mode 100644 index db7eb29..0000000 --- a/crates/nvisy-document/src/operation/split.rs +++ /dev/null @@ -1,105 +0,0 @@ -//! Split operation types. - -use serde::{Deserialize, Serialize}; - -use crate::format::region::RegionId; - -/// Defines where to split a document. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case", tag = "type")] -pub enum SplitBoundary { - /// Split after a specific page. - AfterPage { - /// Page number (0-indexed). - page: u32, - }, - - /// Split after a specific region. - AfterRegion { - /// Region ID to split after. - region: RegionId, - }, - - /// Split at page intervals. - EveryNPages { - /// Number of pages per split. - n: u32, - }, - - /// Split by heading level (each heading starts a new document). - ByHeading { - /// Heading level to split on (1-6). - level: u8, - }, -} - -impl SplitBoundary { - /// Creates a split after a specific page. - #[must_use] - pub fn after_page(page: u32) -> Self { - Self::AfterPage { page } - } - - /// Creates a split after a specific region. 
- #[must_use] - pub fn after_region(region: RegionId) -> Self { - Self::AfterRegion { region } - } - - /// Creates splits every N pages. - #[must_use] - pub fn every_n_pages(n: u32) -> Self { - Self::EveryNPages { n: n.max(1) } - } - - /// Creates splits at heading level. - #[must_use] - pub fn by_heading(level: u8) -> Self { - Self::ByHeading { - level: level.clamp(1, 6), - } - } -} - -/// Order for merging documents. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum MergeOrder { - /// Merge in the order provided. - #[default] - Sequential, - - /// Interleave pages from each document. - Interleaved, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_split_boundary() { - let split = SplitBoundary::after_page(5); - assert!(matches!(split, SplitBoundary::AfterPage { page: 5 })); - } - - #[test] - fn test_every_n_pages_minimum() { - let split = SplitBoundary::every_n_pages(0); - assert!(matches!(split, SplitBoundary::EveryNPages { n: 1 })); - } - - #[test] - fn test_heading_level_clamped() { - let split = SplitBoundary::by_heading(10); - assert!(matches!(split, SplitBoundary::ByHeading { level: 6 })); - } - - #[test] - fn test_serde() { - let split = SplitBoundary::after_page(3); - let json = serde_json::to_string(&split).unwrap(); - let parsed: SplitBoundary = serde_json::from_str(&json).unwrap(); - assert_eq!(split, parsed); - } -} diff --git a/crates/nvisy-docx/Cargo.toml b/crates/nvisy-docx/Cargo.toml index 871f217..f4b66cd 100644 --- a/crates/nvisy-docx/Cargo.toml +++ b/crates/nvisy-docx/Cargo.toml @@ -2,20 +2,20 @@ [package] name = "nvisy-docx" +description = "DOCX document format support for nvisy" +readme = "./README.md" + version = { workspace = true } rust-version = { workspace = true } edition = { workspace = true } license = { workspace = true } publish = { workspace = true } -readme = "./README.md" authors = { workspace = true } repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -description = "DOCX document format support for nvisy" - [package.metadata.docs.rs] all-features = true rustdoc-args = ["--cfg", "docsrs"] diff --git a/crates/nvisy-docx/src/document.rs b/crates/nvisy-docx/src/document.rs index d8ad4a4..cee2059 100644 --- a/crates/nvisy-docx/src/document.rs +++ b/crates/nvisy-docx/src/document.rs @@ -2,28 +2,26 @@ use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ - Document, DocumentInfo, EditOperation, EditResult, EditableDocument, Error, PageOptions, - Region, RegionId, Result, -}; +use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; /// A loaded DOCX document. #[derive(Debug)] pub struct DocxDocument { info: DocumentInfo, regions: Vec, - modified: bool, + #[allow(dead_code)] + data: Bytes, } impl DocxDocument { /// Creates a new DOCX document (internal use). 
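+    ///
+    /// `data` retains the original DOCX bytes; it is currently unused
+    /// (hence the `#[allow(dead_code)]` on the field) pending `to_bytes`
+    /// serialization support.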
#[must_use] #[allow(dead_code)] // Will be used when load() is implemented - pub(crate) fn new(info: DocumentInfo) -> Self { + pub(crate) fn new(info: DocumentInfo, data: Bytes) -> Self { Self { info, regions: Vec::new(), - modified: false, + data, } } } @@ -49,31 +47,10 @@ impl Document for DocxDocument { self.regions.iter().find(|r| r.id == id) } - async fn serialize(&self) -> Result { + async fn to_bytes(&self) -> Result { // TODO: Implement DOCX serialization Err(Error::unsupported_format( "DOCX serialization not yet implemented", )) } } - -#[async_trait] -impl EditableDocument for DocxDocument { - async fn apply(&mut self, _operation: &EditOperation) -> Result { - // TODO: Implement DOCX editing - Err(Error::unsupported_format( - "DOCX editing not yet implemented", - )) - } - - fn is_modified(&self) -> bool { - self.modified - } - - async fn extract_page_regions(&mut self, _options: &PageOptions) -> Result> { - // TODO: Implement page region extraction - Err(Error::unsupported_format( - "DOCX page extraction not yet implemented", - )) - } -} diff --git a/crates/nvisy-docx/src/format.rs b/crates/nvisy-docx/src/format.rs index e378bcd..b4a1287 100644 --- a/crates/nvisy-docx/src/format.rs +++ b/crates/nvisy-docx/src/format.rs @@ -16,7 +16,7 @@ impl DocxFormat { #[must_use] pub fn new() -> Self { Self { - capabilities: Capabilities::read_only(), + capabilities: Capabilities::rich_document(), } } } @@ -46,13 +46,6 @@ impl DocumentFormat for DocxFormat { "DOCX loading not yet implemented", )) } - - async fn create_empty(&self) -> Result { - // TODO: Implement empty DOCX creation - Err(Error::unsupported_format( - "DOCX creation not yet implemented", - )) - } } #[cfg(test)] @@ -63,9 +56,22 @@ mod tests { fn test_format_metadata() { let format = DocxFormat::new(); assert_eq!(format.name(), "docx"); - assert!(format - .mime_types() - .contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + assert!( + format.mime_types().contains( + &"application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + ); assert!(format.extensions().contains(&"docx")); } + + #[test] + fn test_capabilities() { + let format = DocxFormat::new(); + let caps = format.capabilities(); + + assert!(caps.text.can_extract); + assert!(caps.text.has_rich_text); + assert!(caps.structure.can_detect_tables); + assert!(caps.structure.has_pages); + } } diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index 2073535..f22977a 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -2,12 +2,14 @@ [package] name = "nvisy-engine" +description = "Document processing engine for nvisy" +readme = "./README.md" + version = { workspace = true } rust-version = { workspace = true } edition = { workspace = true } license = { workspace = true } publish = { workspace = true } -readme = "./README.md" authors = { workspace = true } repository = { workspace = true } @@ -25,16 +27,21 @@ docx = ["dep:nvisy-docx"] text = ["dep:nvisy-text"] [dependencies] +# Internal crates nvisy-archive = { workspace = true } nvisy-document = { workspace = true } nvisy-docx = { workspace = true, optional = true } nvisy-pdf = { workspace = true, optional = true } nvisy-text = { workspace = true, optional = true } +# Data types bytes = { workspace = true } +uuid = { workspace = true, features = ["v4"] } jiff = { workspace = true, features = ["std"] } + +# Serialization serde = { workspace = true, features = ["std", "derive"] } -uuid = { workspace = true, features = ["v4"] 
} [dev-dependencies] serde_json = { workspace = true, features = ["std"] } +tokio = { workspace = true, features = ["rt", "macros"] } diff --git a/crates/nvisy-engine/src/engine/mod.rs b/crates/nvisy-engine/src/engine/mod.rs index 4f27ddc..f16805d 100644 --- a/crates/nvisy-engine/src/engine/mod.rs +++ b/crates/nvisy-engine/src/engine/mod.rs @@ -9,20 +9,16 @@ use std::path::Path; use bytes::Bytes; pub use config::EngineConfig; -use nvisy_document::{DocumentFormat, Error, Result}; -#[cfg(feature = "docx")] -use nvisy_docx::{DocxDocument, DocxFormat}; -#[cfg(feature = "pdf")] -use nvisy_pdf::{PdfDocument, PdfFormat}; -#[cfg(feature = "text")] -use nvisy_text::{TextDocument, TextFormat}; +use nvisy_document::Result; + +use crate::registry::{BoxDocument, FormatRegistry}; /// The central document processing engine. /// /// `Engine` provides a unified interface for: /// - Loading documents from various formats (PDF, DOCX, plain text, etc.) -/// - Managing format handlers -/// - Processing archives containing documents +/// - Managing format handlers via a dynamic registry +/// - Auto-detecting formats from file extensions or MIME types /// /// # Example /// @@ -30,38 +26,32 @@ use nvisy_text::{TextDocument, TextFormat}; /// use nvisy_engine::Engine; /// /// let engine = Engine::new(); -/// let doc = engine.load_pdf(data).await?; +/// +/// // Load by file path (auto-detect format) +/// let doc = engine.load_file("document.pdf").await?; +/// +/// // Load by extension +/// let doc = engine.load_by_extension("json", data).await?; +/// +/// // Load with specific format (when you need the concrete type) +/// let doc = engine.pdf().load(data).await?; /// ``` -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct Engine { /// Configuration for the engine. config: EngineConfig, - /// PDF format handler. - #[cfg(feature = "pdf")] - pdf: PdfFormat, - - /// DOCX format handler. - #[cfg(feature = "docx")] - docx: DocxFormat, - - /// Plain text format handler. - #[cfg(feature = "text")] - text: TextFormat, + /// Format registry for dynamic loading. + registry: FormatRegistry, } impl Engine { - /// Creates a new engine with default configuration. + /// Creates a new engine with default configuration and all default formats. #[must_use] pub fn new() -> Self { Self { config: EngineConfig::default(), - #[cfg(feature = "pdf")] - pdf: PdfFormat::new(), - #[cfg(feature = "docx")] - docx: DocxFormat::new(), - #[cfg(feature = "text")] - text: TextFormat::new(), + registry: FormatRegistry::with_defaults(), } } @@ -70,165 +60,99 @@ impl Engine { pub fn with_config(config: EngineConfig) -> Self { Self { config, - #[cfg(feature = "pdf")] - pdf: PdfFormat::new(), - #[cfg(feature = "docx")] - docx: DocxFormat::new(), - #[cfg(feature = "text")] - text: TextFormat::new(), + registry: FormatRegistry::with_defaults(), } } - /// Returns a reference to the engine configuration. + /// Creates a new engine with a custom registry. #[must_use] - pub fn config(&self) -> &EngineConfig { - &self.config + pub fn with_registry(registry: FormatRegistry) -> Self { + Self { + config: EngineConfig::default(), + registry, + } } - /// Returns the PDF format handler. - #[cfg(feature = "pdf")] - #[cfg_attr(docsrs, doc(cfg(feature = "pdf")))] + /// Creates a new engine with custom configuration and registry. #[must_use] - pub fn pdf(&self) -> &PdfFormat { - &self.pdf + pub fn with_config_and_registry(config: EngineConfig, registry: FormatRegistry) -> Self { + Self { config, registry } } - /// Returns the DOCX format handler. 
- #[cfg(feature = "docx")] - #[cfg_attr(docsrs, doc(cfg(feature = "docx")))] + /// Returns a reference to the engine configuration. #[must_use] - pub fn docx(&self) -> &DocxFormat { - &self.docx + pub fn config(&self) -> &EngineConfig { + &self.config } - /// Returns the text format handler. - #[cfg(feature = "text")] - #[cfg_attr(docsrs, doc(cfg(feature = "text")))] + /// Returns a reference to the format registry. #[must_use] - pub fn text(&self) -> &TextFormat { - &self.text + pub fn registry(&self) -> &FormatRegistry { + &self.registry } - /// Loads a PDF document from bytes. - #[cfg(feature = "pdf")] - #[cfg_attr(docsrs, doc(cfg(feature = "pdf")))] - pub async fn load_pdf(&self, data: Bytes) -> Result { - self.pdf.load(data).await + /// Returns a mutable reference to the format registry. + /// + /// Use this to register custom formats. + pub fn registry_mut(&mut self) -> &mut FormatRegistry { + &mut self.registry } - /// Loads a DOCX document from bytes. - #[cfg(feature = "docx")] - #[cfg_attr(docsrs, doc(cfg(feature = "docx")))] - pub async fn load_docx(&self, data: Bytes) -> Result { - self.docx.load(data).await + /// Loads a document from a file path. + /// + /// The format is automatically detected from the file extension. + /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be read + /// - The file has no extension + /// - The extension is not supported + /// - The document fails to load + pub async fn load_file>(&self, path: P) -> Result { + self.registry.load_file(path).await } - /// Loads a text document from bytes. - #[cfg(feature = "text")] - #[cfg_attr(docsrs, doc(cfg(feature = "text")))] - pub async fn load_text(&self, data: Bytes) -> Result { - self.text.load(data).await + /// Loads a document by file extension. + /// + /// # Errors + /// + /// Returns an error if the extension is not supported or loading fails. + pub async fn load_by_extension(&self, ext: &str, data: Bytes) -> Result { + self.registry.load_by_extension(ext, data).await } - /// Reads a file and returns its contents along with the file extension. + /// Loads a document by MIME type. /// /// # Errors /// - /// Returns an error if: - /// - The file cannot be read - /// - The file has no extension - pub fn read_file>(&self, path: P) -> Result<(Bytes, String)> { - let path = path.as_ref(); - let data = std::fs::read(path) - .map_err(|e| Error::io(format!("Failed to read file '{}': {}", path.display(), e)))?; - - let ext = path - .extension() - .and_then(|e| e.to_str()) - .ok_or_else(|| Error::unsupported_format("No file extension"))? - .to_owned(); - - Ok((Bytes::from(data), ext)) + /// Returns an error if the MIME type is not supported or loading fails. + pub async fn load_by_mime(&self, mime: &str, data: Bytes) -> Result { + self.registry.load_by_mime(mime, data).await } /// Checks if a file extension is supported. #[must_use] pub fn supports_extension(&self, ext: &str) -> bool { - let ext = ext.trim_start_matches('.').to_lowercase(); - - #[cfg(feature = "pdf")] - if self.pdf.extensions().contains(&ext.as_str()) { - return true; - } - - #[cfg(feature = "docx")] - if self.docx.extensions().contains(&ext.as_str()) { - return true; - } - - #[cfg(feature = "text")] - if self.text.extensions().contains(&ext.as_str()) { - return true; - } - - false + self.registry.supports_extension(ext) } /// Checks if a MIME type is supported. 
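+    ///
+    /// # Example
+    ///
+    /// A usage sketch; which MIME types are accepted depends on the
+    /// format features enabled at build time.
+    ///
+    /// ```ignore
+    /// let engine = Engine::new();
+    /// assert!(engine.supports_mime("application/pdf"));
+    /// ```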
#[must_use] - pub fn supports_mime(&self, mime_type: &str) -> bool { - let mime = mime_type.to_lowercase(); - - #[cfg(feature = "pdf")] - if self.pdf.mime_types().contains(&mime.as_str()) { - return true; - } - - #[cfg(feature = "docx")] - if self.docx.mime_types().contains(&mime.as_str()) { - return true; - } - - #[cfg(feature = "text")] - if self.text.mime_types().contains(&mime.as_str()) { - return true; - } - - false + pub fn supports_mime(&self, mime: &str) -> bool { + self.registry.supports_mime(mime) } /// Returns all supported file extensions. #[must_use] pub fn supported_extensions(&self) -> Vec<&'static str> { - let mut exts = Vec::new(); - - #[cfg(feature = "pdf")] - exts.extend(self.pdf.extensions()); - - #[cfg(feature = "docx")] - exts.extend(self.docx.extensions()); - - #[cfg(feature = "text")] - exts.extend(self.text.extensions()); - - exts + self.registry.supported_extensions() } /// Returns all supported MIME types. #[must_use] pub fn supported_mime_types(&self) -> Vec<&'static str> { - let mut mimes = Vec::new(); - - #[cfg(feature = "pdf")] - mimes.extend(self.pdf.mime_types()); - - #[cfg(feature = "docx")] - mimes.extend(self.docx.mime_types()); - - #[cfg(feature = "text")] - mimes.extend(self.text.mime_types()); - - mimes + self.registry.supported_mime_types() } } @@ -238,6 +162,15 @@ impl Default for Engine { } } +impl Clone for Engine { + fn clone(&self) -> Self { + Self { + config: self.config.clone(), + registry: FormatRegistry::with_defaults(), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -272,6 +205,12 @@ mod tests { { assert!(engine.supports_extension("txt")); assert!(engine.supports_extension("md")); + assert!(engine.supports_extension("json")); + assert!(engine.supports_extension("csv")); + assert!(engine.supports_extension("xml")); + assert!(engine.supports_extension("yaml")); + assert!(engine.supports_extension("toml")); + assert!(engine.supports_extension("ini")); } assert!(!engine.supports_extension("xyz")); @@ -285,8 +224,74 @@ mod tests { assert!(engine.supports_mime("application/pdf")); #[cfg(feature = "text")] - assert!(engine.supports_mime("text/plain")); + { + assert!(engine.supports_mime("text/plain")); + assert!(engine.supports_mime("text/markdown")); + assert!(engine.supports_mime("application/json")); + assert!(engine.supports_mime("text/csv")); + assert!(engine.supports_mime("application/xml")); + assert!(engine.supports_mime("application/x-yaml")); + assert!(engine.supports_mime("application/toml")); + } assert!(!engine.supports_mime("application/unknown")); } + + #[cfg(feature = "text")] + #[tokio::test] + async fn test_load_by_extension() { + let engine = Engine::new(); + + let doc = engine + .load_by_extension("json", Bytes::from(r#"{"key": "value"}"#)) + .await + .unwrap(); + assert!(!doc.regions().is_empty()); + + let doc = engine + .load_by_extension("md", Bytes::from("# Title\n\nParagraph")) + .await + .unwrap(); + assert!(!doc.regions().is_empty()); + } + + #[cfg(feature = "text")] + #[tokio::test] + async fn test_load_by_mime() { + let engine = Engine::new(); + + let doc = engine + .load_by_mime("application/json", Bytes::from(r#"{"key": "value"}"#)) + .await + .unwrap(); + assert!(!doc.regions().is_empty()); + } + + #[test] + fn test_registry_access() { + let engine = Engine::new(); + let registry = engine.registry(); + + #[cfg(feature = "text")] + { + let format = registry.get_by_extension("json").unwrap(); + assert_eq!(format.name(), "json"); + } + } + + #[test] + fn test_custom_registry() { + let mut registry = 
FormatRegistry::new();
+
+        #[cfg(feature = "text")]
+        registry.register(nvisy_text::JsonFormat::new());
+
+        let engine = Engine::with_registry(registry);
+
+        #[cfg(feature = "text")]
+        {
+            assert!(engine.supports_extension("json"));
+            assert!(!engine.supports_extension("xml")); // Not registered
+        }
+    }
 }
diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs
index 1093cf1..c4d83a1 100644
--- a/crates/nvisy-engine/src/lib.rs
+++ b/crates/nvisy-engine/src/lib.rs
@@ -3,21 +3,13 @@
 #![doc = include_str!("../README.md")]
 
 pub mod engine;
+pub mod registry;
 pub mod session;
 
 pub use engine::{Engine, EngineConfig};
 pub use nvisy_document::{
-    self as doc, BoundingBox, Capabilities, DocumentFormat, EditOperation, Point, Region, RegionId,
+    self as doc, BoundingBox, Capabilities, Document, DocumentFormat, Point, Region, RegionId,
     RegionKind,
 };
-// Re-export format types for convenience
-#[cfg(feature = "docx")]
-#[cfg_attr(docsrs, doc(cfg(feature = "docx")))]
-pub use nvisy_docx::{DocxDocument, DocxFormat};
-#[cfg(feature = "pdf")]
-#[cfg_attr(docsrs, doc(cfg(feature = "pdf")))]
-pub use nvisy_pdf::{PdfDocument, PdfFormat};
-#[cfg(feature = "text")]
-#[cfg_attr(docsrs, doc(cfg(feature = "text")))]
-pub use nvisy_text::{TextDocument, TextFormat};
-pub use session::{EditHistory, EditSession, HistoryEntry, SessionConfig, SessionId};
+pub use registry::{AnyFormat, BoxDocument, FormatRegistry};
+pub use session::{AccessEntry, AccessHistory, ReadSession, SessionConfig, SessionId};
diff --git a/crates/nvisy-engine/src/registry/mod.rs b/crates/nvisy-engine/src/registry/mod.rs
new file mode 100644
index 0000000..514a50d
--- /dev/null
+++ b/crates/nvisy-engine/src/registry/mod.rs
@@ -0,0 +1,375 @@
+//! Format registry for dynamic document loading.
+//!
+//! The registry provides type-erased format handling, allowing documents
+//! to be loaded by extension or MIME type without knowing the concrete
+//! format at compile time.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use bytes::Bytes;
+use nvisy_document::{Capabilities, Document, Error, Result};
+
+/// A type-erased document that can be used for common operations.
+pub type BoxDocument = Box<dyn Document + Send + Sync>;
+
+/// A type-erased format handler.
+///
+/// This trait provides a common interface for all format handlers,
+/// enabling dynamic dispatch and runtime format selection.
+pub trait AnyFormat: Send + Sync {
+    /// Returns the format name.
+    fn name(&self) -> &'static str;
+
+    /// Returns supported MIME types.
+    fn mime_types(&self) -> &'static [&'static str];
+
+    /// Returns supported file extensions.
+    fn extensions(&self) -> &'static [&'static str];
+
+    /// Returns the format capabilities.
+    fn capabilities(&self) -> &Capabilities;
+
+    /// Loads a document from bytes, returning a type-erased document.
+    fn load_boxed(
+        &self,
+        data: Bytes,
+    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>>;
+}
+
+/// Wrapper that implements AnyFormat for any DocumentFormat.
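+///
+/// The wrapper forwards each trait method to the wrapped format and boxes
+/// the loaded document, so callers only ever deal with `BoxDocument`.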
+struct FormatWrapper<F> {
+    inner: F,
+}
+
+impl<F> AnyFormat for FormatWrapper<F>
+where
+    F: nvisy_document::DocumentFormat + Send + Sync + 'static,
+    F::Document: Send + Sync + 'static,
+{
+    fn name(&self) -> &'static str {
+        nvisy_document::DocumentFormat::name(&self.inner)
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        nvisy_document::DocumentFormat::mime_types(&self.inner)
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        nvisy_document::DocumentFormat::extensions(&self.inner)
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        nvisy_document::DocumentFormat::capabilities(&self.inner)
+    }
+
+    fn load_boxed(
+        &self,
+        data: Bytes,
+    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>> {
+        Box::pin(async move {
+            let doc = nvisy_document::DocumentFormat::load(&self.inner, data).await?;
+            Ok(Box::new(doc) as BoxDocument)
+        })
+    }
+}
+
+/// Registry entry containing a format handler.
+struct RegistryEntry {
+    format: Arc<dyn AnyFormat>,
+}
+
+/// A registry of document formats.
+///
+/// The registry maintains mappings from file extensions and MIME types
+/// to format handlers, enabling dynamic document loading.
+///
+/// # Example
+///
+/// ```ignore
+/// use nvisy_engine::FormatRegistry;
+///
+/// let registry = FormatRegistry::with_defaults();
+///
+/// // Load by file path
+/// let doc = registry.load_file("document.pdf").await?;
+///
+/// // Load by extension
+/// let doc = registry.load_by_extension("json", data).await?;
+/// ```
+#[derive(Default)]
+pub struct FormatRegistry {
+    /// All registered formats.
+    formats: Vec<RegistryEntry>,
+
+    /// Extension to format index mapping.
+    by_extension: HashMap<&'static str, usize>,
+
+    /// MIME type to format index mapping.
+    by_mime: HashMap<&'static str, usize>,
+}
+
+impl FormatRegistry {
+    /// Creates an empty registry.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            formats: Vec::new(),
+            by_extension: HashMap::new(),
+            by_mime: HashMap::new(),
+        }
+    }
+
+    /// Creates a registry with all default formats registered.
+    #[must_use]
+    pub fn with_defaults() -> Self {
+        let mut registry = Self::new();
+        registry.register_defaults();
+        registry
+    }
+
+    /// Registers all default formats based on enabled features.
+    pub fn register_defaults(&mut self) {
+        #[cfg(feature = "pdf")]
+        self.register(nvisy_pdf::PdfFormat::new());
+
+        #[cfg(feature = "docx")]
+        self.register(nvisy_docx::DocxFormat::new());
+
+        #[cfg(feature = "text")]
+        {
+            self.register(nvisy_text::PlainTextFormat::new());
+            self.register(nvisy_text::MarkdownFormat::new());
+            self.register(nvisy_text::JsonFormat::new());
+            self.register(nvisy_text::CsvFormat::new());
+            self.register(nvisy_text::XmlFormat::new());
+            self.register(nvisy_text::YamlFormat::new());
+            self.register(nvisy_text::TomlFormat::new());
+            self.register(nvisy_text::IniFormat::new());
+        }
+    }
+
+    /// Registers a format handler.
+    ///
+    /// Extensions and MIME types from the format are automatically indexed.
+    /// If an extension or MIME type is already registered, the new format
+    /// takes precedence.
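+    ///
+    /// # Example
+    ///
+    /// A minimal sketch; `MyFormat` is a stand-in for any type implementing
+    /// `DocumentFormat`, and the `"my"` extension is illustrative.
+    ///
+    /// ```ignore
+    /// let mut registry = FormatRegistry::new();
+    /// registry.register(MyFormat::new());
+    /// assert!(registry.supports_extension("my"));
+    /// ```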
+    pub fn register<F>(&mut self, format: F)
+    where
+        F: nvisy_document::DocumentFormat + Send + Sync + 'static,
+        F::Document: Send + Sync + 'static,
+    {
+        let wrapper = FormatWrapper { inner: format };
+        let index = self.formats.len();
+        let format: Arc<dyn AnyFormat> = Arc::new(wrapper);
+
+        // Index by extension
+        for ext in format.extensions() {
+            self.by_extension.insert(ext, index);
+        }
+
+        // Index by MIME type
+        for mime in format.mime_types() {
+            self.by_mime.insert(mime, index);
+        }
+
+        self.formats.push(RegistryEntry { format });
+    }
+
+    /// Returns the format handler for a file extension.
+    #[must_use]
+    pub fn get_by_extension(&self, ext: &str) -> Option<&dyn AnyFormat> {
+        let ext = ext.trim_start_matches('.').to_lowercase();
+        self.by_extension
+            .get(ext.as_str())
+            .and_then(|&idx| self.formats.get(idx))
+            .map(|e| e.format.as_ref())
+    }
+
+    /// Returns the format handler for a MIME type.
+    #[must_use]
+    pub fn get_by_mime(&self, mime: &str) -> Option<&dyn AnyFormat> {
+        let mime = mime.to_lowercase();
+        self.by_mime
+            .get(mime.as_str())
+            .and_then(|&idx| self.formats.get(idx))
+            .map(|e| e.format.as_ref())
+    }
+
+    /// Checks if an extension is supported.
+    #[must_use]
+    pub fn supports_extension(&self, ext: &str) -> bool {
+        let ext = ext.trim_start_matches('.').to_lowercase();
+        self.by_extension.contains_key(ext.as_str())
+    }
+
+    /// Checks if a MIME type is supported.
+    #[must_use]
+    pub fn supports_mime(&self, mime: &str) -> bool {
+        let mime = mime.to_lowercase();
+        self.by_mime.contains_key(mime.as_str())
+    }
+
+    /// Returns all supported file extensions.
+    #[must_use]
+    pub fn supported_extensions(&self) -> Vec<&'static str> {
+        self.by_extension.keys().copied().collect()
+    }
+
+    /// Returns all supported MIME types.
+    #[must_use]
+    pub fn supported_mime_types(&self) -> Vec<&'static str> {
+        self.by_mime.keys().copied().collect()
+    }
+
+    /// Returns all registered formats.
+    #[must_use]
+    pub fn formats(&self) -> Vec<&dyn AnyFormat> {
+        self.formats.iter().map(|e| e.format.as_ref()).collect()
+    }
+
+    /// Loads a document by file extension.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The extension is not supported
+    /// - The document fails to load
+    pub async fn load_by_extension(&self, ext: &str, data: Bytes) -> Result<BoxDocument> {
+        let ext_lower = ext.trim_start_matches('.').to_lowercase();
+
+        let format = self
+            .by_extension
+            .get(ext_lower.as_str())
+            .and_then(|&idx| self.formats.get(idx))
+            .ok_or_else(|| Error::unsupported_format(format!("Unsupported extension: {}", ext)))?;
+
+        format.format.load_boxed(data).await
+    }
+
+    /// Loads a document by MIME type.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The MIME type is not supported
+    /// - The document fails to load
+    pub async fn load_by_mime(&self, mime: &str, data: Bytes) -> Result<BoxDocument> {
+        let mime_lower = mime.to_lowercase();
+
+        let format = self
+            .by_mime
+            .get(mime_lower.as_str())
+            .and_then(|&idx| self.formats.get(idx))
+            .ok_or_else(|| Error::unsupported_format(format!("Unsupported MIME type: {}", mime)))?;
+
+        format.format.load_boxed(data).await
+    }
+
+    /// Loads a document from a file path.
+    ///
+    /// The format is determined by the file extension.
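+    ///
+    /// # Example
+    ///
+    /// A minimal usage sketch; the file name here is illustrative.
+    ///
+    /// ```ignore
+    /// let registry = FormatRegistry::with_defaults();
+    /// let doc = registry.load_file("report.csv").await?;
+    /// assert!(!doc.regions().is_empty());
+    /// ```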
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The file cannot be read
+    /// - The file has no extension
+    /// - The extension is not supported
+    /// - The document fails to load
+    pub async fn load_file<P: AsRef<std::path::Path>>(&self, path: P) -> Result<BoxDocument> {
+        let path = path.as_ref();
+
+        let ext = path
+            .extension()
+            .and_then(|e| e.to_str())
+            .ok_or_else(|| Error::unsupported_format("File has no extension"))?;
+
+        let data = std::fs::read(path)
+            .map_err(|e| Error::io(format!("Failed to read file '{}': {}", path.display(), e)))?;
+
+        self.load_by_extension(ext, Bytes::from(data)).await
+    }
+}
+
+impl std::fmt::Debug for FormatRegistry {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FormatRegistry")
+            .field("formats", &self.formats.len())
+            .field("extensions", &self.by_extension.keys().collect::<Vec<_>>())
+            .field("mime_types", &self.by_mime.keys().collect::<Vec<_>>())
+            .finish()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_registry_creation() {
+        let registry = FormatRegistry::new();
+        assert!(registry.formats().is_empty());
+    }
+
+    #[test]
+    fn test_registry_with_defaults() {
+        let registry = FormatRegistry::with_defaults();
+        assert!(!registry.formats().is_empty());
+
+        #[cfg(feature = "text")]
+        {
+            assert!(registry.supports_extension("txt"));
+            assert!(registry.supports_extension("json"));
+            assert!(registry.supports_extension("md"));
+        }
+    }
+
+    #[test]
+    fn test_get_by_extension() {
+        let registry = FormatRegistry::with_defaults();
+
+        #[cfg(feature = "text")]
+        {
+            let format = registry.get_by_extension("json").unwrap();
+            assert_eq!(format.name(), "json");
+
+            let format = registry.get_by_extension(".JSON").unwrap();
+            assert_eq!(format.name(), "json");
+        }
+
+        assert!(registry.get_by_extension("xyz").is_none());
+    }
+
+    #[cfg(feature = "text")]
+    #[tokio::test]
+    async fn test_load_by_extension() {
+        let registry = FormatRegistry::with_defaults();
+
+        let doc = registry
+            .load_by_extension("json", Bytes::from(r#"{"key": "value"}"#))
+            .await
+            .unwrap();
+
+        assert!(!doc.regions().is_empty());
+    }
+
+    #[cfg(feature = "text")]
+    #[tokio::test]
+    async fn test_load_by_mime() {
+        let registry = FormatRegistry::with_defaults();
+
+        let doc = registry
+            .load_by_mime("application/json", Bytes::from(r#"{"key": "value"}"#))
+            .await
+            .unwrap();
+
+        assert!(!doc.regions().is_empty());
+    }
+
+    #[test]
+    fn test_unsupported_extension() {
+        let registry = FormatRegistry::with_defaults();
+        assert!(!registry.supports_extension("xyz"));
+    }
+}
diff --git a/crates/nvisy-engine/src/session/history.rs b/crates/nvisy-engine/src/session/history.rs
index 63bd15d..5c80bb4 100644
--- a/crates/nvisy-engine/src/session/history.rs
+++ b/crates/nvisy-engine/src/session/history.rs
@@ -1,225 +1,100 @@
-//! Edit history for undo/redo support.
+//! Session history tracking (for audit/logging purposes).
 
 use jiff::Timestamp;
-use nvisy_document::EditOperation;
 
-/// A single entry in the edit history.
+/// A single entry tracking document access.
 #[derive(Debug, Clone)]
-pub struct HistoryEntry {
-    /// The operation that was applied.
-    pub operation: EditOperation,
-
-    /// The reverse operation for undoing.
-    pub reverse: EditOperation,
-
-    /// When the operation was applied.
+pub struct AccessEntry {
+    /// When the access occurred.
     pub timestamp: Timestamp,
 
-    /// Optional description of the operation.
-    pub description: Option<String>,
+    /// Description of the access.
+    pub description: String,
 }
 
-impl HistoryEntry {
-    /// Creates a new history entry.
-    #[must_use]
-    pub fn new(operation: EditOperation, reverse: EditOperation) -> Self {
-        Self {
-            operation,
-            reverse,
-            timestamp: Timestamp::now(),
-            description: None,
-        }
-    }
-
-    /// Creates a new history entry with a description.
+impl AccessEntry {
+    /// Creates a new access entry.
     #[must_use]
-    pub fn with_description(
-        operation: EditOperation,
-        reverse: EditOperation,
-        description: impl Into<String>,
-    ) -> Self {
+    pub fn new(description: impl Into<String>) -> Self {
         Self {
-            operation,
-            reverse,
             timestamp: Timestamp::now(),
-            description: Some(description.into()),
+            description: description.into(),
         }
     }
 }
 
-/// Manages edit history with undo/redo support.
+/// Tracks document access history for audit purposes.
 #[derive(Debug, Default)]
-pub struct EditHistory {
-    /// Stack of operations that can be undone.
-    undo_stack: Vec<HistoryEntry>,
-
-    /// Stack of operations that can be redone.
-    redo_stack: Vec<HistoryEntry>,
+pub struct AccessHistory {
+    /// List of access entries.
+    entries: Vec<AccessEntry>,
 }
 
-impl EditHistory {
+impl AccessHistory {
     /// Creates a new empty history.
     #[must_use]
     pub fn new() -> Self {
         Self::default()
     }
 
-    /// Records a new operation in the history.
-    ///
-    /// This clears the redo stack since we're diverging from the previous future.
-    pub fn record(&mut self, entry: HistoryEntry) {
-        self.redo_stack.clear();
-        self.undo_stack.push(entry);
-    }
-
-    /// Returns true if there are operations that can be undone.
-    #[must_use]
-    pub fn can_undo(&self) -> bool {
-        !self.undo_stack.is_empty()
-    }
-
-    /// Returns true if there are operations that can be redone.
-    #[must_use]
-    pub fn can_redo(&self) -> bool {
-        !self.redo_stack.is_empty()
-    }
-
-    /// Returns the number of operations that can be undone.
-    #[must_use]
-    pub fn undo_count(&self) -> usize {
-        self.undo_stack.len()
+    /// Records a new access entry.
+    pub fn record(&mut self, description: impl Into<String>) {
+        self.entries.push(AccessEntry::new(description));
     }
 
-    /// Returns the number of operations that can be redone.
+    /// Returns the number of entries.
     #[must_use]
-    pub fn redo_count(&self) -> usize {
-        self.redo_stack.len()
+    pub fn len(&self) -> usize {
+        self.entries.len()
     }
 
-    /// Pops the most recent operation for undoing.
-    ///
-    /// Returns the entry that should be reversed.
-    pub fn pop_undo(&mut self) -> Option<HistoryEntry> {
-        self.undo_stack.pop().inspect(|entry| {
-            self.redo_stack.push(entry.clone());
-        })
-    }
-
-    /// Pops the most recently undone operation for redoing.
-    ///
-    /// Returns the entry that should be reapplied.
-    pub fn pop_redo(&mut self) -> Option<HistoryEntry> {
-        self.redo_stack.pop().inspect(|entry| {
-            self.undo_stack.push(entry.clone());
-        })
-    }
-
-    /// Peeks at the most recent undoable operation without removing it.
+    /// Returns true if there are no entries.
     #[must_use]
-    pub fn peek_undo(&self) -> Option<&HistoryEntry> {
-        self.undo_stack.last()
+    pub fn is_empty(&self) -> bool {
+        self.entries.is_empty()
     }
 
-    /// Peeks at the most recent redoable operation without removing it.
+    /// Returns all entries.
     #[must_use]
-    pub fn peek_redo(&self) -> Option<&HistoryEntry> {
-        self.redo_stack.last()
-    }
-
-    /// Returns all entries in the undo stack (oldest first).
-    #[must_use]
-    pub fn undo_entries(&self) -> &[HistoryEntry] {
-        &self.undo_stack
-    }
-
-    /// Returns all entries in the redo stack (oldest first).
-    #[must_use]
-    pub fn redo_entries(&self) -> &[HistoryEntry] {
-        &self.redo_stack
+    pub fn entries(&self) -> &[AccessEntry] {
+        &self.entries
     }
 
     /// Clears all history.
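+    ///
+    /// # Example
+    ///
+    /// A small sketch of the record-then-clear lifecycle.
+    ///
+    /// ```ignore
+    /// let mut history = AccessHistory::new();
+    /// history.record("Loaded document");
+    /// history.clear();
+    /// assert!(history.is_empty());
+    /// ```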
pub fn clear(&mut self) { - self.undo_stack.clear(); - self.redo_stack.clear(); - } - - /// Clears the redo stack only. - pub fn clear_redo(&mut self) { - self.redo_stack.clear(); + self.entries.clear(); } } #[cfg(test)] mod tests { - use nvisy_document::{InsertContent, RegionId}; - use super::*; - fn make_entry() -> HistoryEntry { - let region = RegionId::new(); - HistoryEntry::new( - EditOperation::delete(region), - EditOperation::insert_after(region, InsertContent::text("original")), - ) - } - #[test] fn test_empty_history() { - let history = EditHistory::new(); - assert!(!history.can_undo()); - assert!(!history.can_redo()); + let history = AccessHistory::new(); + assert!(history.is_empty()); + assert_eq!(history.len(), 0); } #[test] - fn test_record_and_undo() { - let mut history = EditHistory::new(); + fn test_record_access() { + let mut history = AccessHistory::new(); - history.record(make_entry()); - assert!(history.can_undo()); - assert!(!history.can_redo()); + history.record("Loaded document"); + history.record("Extracted text"); - let entry = history.pop_undo(); - assert!(entry.is_some()); - assert!(!history.can_undo()); - assert!(history.can_redo()); + assert_eq!(history.len(), 2); + assert!(!history.is_empty()); + assert_eq!(history.entries()[0].description, "Loaded document"); + assert_eq!(history.entries()[1].description, "Extracted text"); } #[test] - fn test_redo() { - let mut history = EditHistory::new(); - - history.record(make_entry()); - history.pop_undo(); - - assert!(history.can_redo()); - - let entry = history.pop_redo(); - assert!(entry.is_some()); - assert!(history.can_undo()); - assert!(!history.can_redo()); - } - - #[test] - fn test_new_record_clears_redo() { - let mut history = EditHistory::new(); - - history.record(make_entry()); - history.pop_undo(); - assert!(history.can_redo()); - - history.record(make_entry()); - assert!(!history.can_redo()); - } - - #[test] - fn test_unlimited_entries() { - let mut history = EditHistory::new(); - - for _ in 0..1000 { - history.record(make_entry()); - } - - assert_eq!(history.undo_count(), 1000); + fn test_clear() { + let mut history = AccessHistory::new(); + history.record("test"); + history.clear(); + assert!(history.is_empty()); } } diff --git a/crates/nvisy-engine/src/session/mod.rs b/crates/nvisy-engine/src/session/mod.rs index ccdb983..2599aec 100644 --- a/crates/nvisy-engine/src/session/mod.rs +++ b/crates/nvisy-engine/src/session/mod.rs @@ -1,9 +1,8 @@ -//! Document editing sessions. +//! Document reading sessions. //! -//! An `EditSession` wraps a document and provides: -//! - Stable region IDs across edits -//! - Undo/redo support -//! - Operation validation +//! A `ReadSession` wraps a document and provides: +//! - Stable region IDs for referencing +//! - Access history tracking //! - Streaming/pagination for large documents mod history; @@ -12,15 +11,12 @@ use std::collections::HashMap; use std::num::NonZeroU32; use bytes::Bytes; -pub use history::{EditHistory, HistoryEntry}; +pub use history::{AccessEntry, AccessHistory}; use jiff::Timestamp; -use nvisy_document::{ - Capabilities, EditOperation, EditResult, EditableDocument, Error, PageOptions, Region, - RegionId, RegionStatus, Result, -}; +use nvisy_document::{Capabilities, Document, PageOptions, Region, RegionId, Result}; use uuid::Uuid; -/// Unique identifier for an edit session. +/// Unique identifier for a read session. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct SessionId(Uuid); @@ -50,45 +46,41 @@ impl std::fmt::Display for SessionId { } } -/// Configuration for an edit session. +/// Configuration for a read session. #[derive(Debug, Clone)] pub struct SessionConfig { - /// Whether to auto-extract regions on load. - pub auto_extract_regions: bool, - /// Page batch size for streaming. pub page_batch_size: u32, - /// Whether to validate operations before applying. - pub validate_operations: bool, + /// Whether to track access history. + pub track_history: bool, } impl Default for SessionConfig { fn default() -> Self { Self { - auto_extract_regions: true, page_batch_size: 10, - validate_operations: true, + track_history: false, } } } -/// An edit session for a document. +/// A read session for a document. /// -/// Sessions provide stable region IDs, undo/redo, and streaming support. +/// Sessions provide stable region IDs and streaming support. /// The session is generic over the document type `D`. -pub struct EditSession { +pub struct ReadSession { /// Unique session identifier. id: SessionId, - /// The underlying document (must support editing). + /// The underlying document. document: D, /// Format capabilities. capabilities: Capabilities, - /// Edit history for undo/redo. - history: EditHistory, + /// Access history for auditing. + history: AccessHistory, /// Session configuration. config: SessionConfig, @@ -99,18 +91,18 @@ pub struct EditSession { /// Region cache for quick lookup. region_cache: HashMap, - /// Pages that have been loaded (for lazy loading). - loaded_pages: Vec, + /// Pages that have been accessed (for streaming). + accessed_pages: Vec, /// Total number of pages in the document. total_pages: Option, } -impl EditSession { - /// Creates a new edit session from a loaded document. +impl ReadSession { + /// Creates a new read session from a loaded document. #[must_use] pub fn new(document: D, capabilities: Capabilities, config: SessionConfig) -> Self { - let history = EditHistory::new(); + let history = AccessHistory::new(); let total_pages = document.info().page_count; let mut region_cache = HashMap::new(); @@ -118,7 +110,7 @@ impl EditSession { region_cache.insert(region.id, region.clone()); } - let loaded_pages = if total_pages.is_some() { + let accessed_pages = if total_pages.is_some() { document .regions() .iter() @@ -138,7 +130,7 @@ impl EditSession { config, created_at: Timestamp::now(), region_cache, - loaded_pages, + accessed_pages, total_pages, } } @@ -155,11 +147,6 @@ impl EditSession { &self.document } - /// Returns a mutable reference to the underlying document. - pub fn document_mut(&mut self) -> &mut D { - &mut self.document - } - /// Returns the format capabilities. #[must_use] pub fn capabilities(&self) -> &Capabilities { @@ -172,22 +159,17 @@ impl EditSession { self.created_at } - /// Returns the edit history. + /// Returns the access history. #[must_use] - pub fn history(&self) -> &EditHistory { + pub fn history(&self) -> &AccessHistory { &self.history } - /// Returns whether there are undoable operations. - #[must_use] - pub fn can_undo(&self) -> bool { - self.history.can_undo() - } - - /// Returns whether there are redoable operations. - #[must_use] - pub fn can_redo(&self) -> bool { - self.history.can_redo() + /// Records an access event. + pub fn record_access(&mut self, description: impl Into) { + if self.config.track_history { + self.history.record(description); + } } /// Returns all regions (from cache). 
@@ -217,162 +199,31 @@ impl EditSession { self.total_pages } - /// Returns which pages have been loaded. + /// Returns which pages have been accessed. #[must_use] - pub fn loaded_pages(&self) -> &[u32] { - &self.loaded_pages + pub fn accessed_pages(&self) -> &[u32] { + &self.accessed_pages } - /// Checks if a page has been loaded. + /// Checks if a page has been accessed. #[must_use] - pub fn is_page_loaded(&self, page: u32) -> bool { - self.loaded_pages.contains(&page) + pub fn is_page_accessed(&self, page: u32) -> bool { + self.accessed_pages.contains(&page) } - /// Validates an operation before applying. - fn validate_operation(&self, operation: &EditOperation) -> Result<()> { - let support = self.capabilities.supports(operation); - if !support.is_supported() { - return Err(Error::operation_not_supported(format!("{operation:?}"))); - } - - for region_id in operation.referenced_regions() { - if !self.region_cache.contains_key(®ion_id) { - return Err(Error::region_not_found(region_id)); - } - } - - for region_id in operation.referenced_regions() { - if let Some(region) = self.region_cache.get(®ion_id) { - if region.effective_status() == RegionStatus::Deleted { - return Err(Error::invalid_operation(format!( - "region {region_id} is deleted" - ))); - } - } - } - - Ok(()) - } - - /// Applies an edit operation. - pub async fn apply(&mut self, operation: EditOperation) -> Result { - if self.config.validate_operations { - self.validate_operation(&operation)?; - } - - let result = self.document.apply(&operation).await?; - - if result.success { - for region in &result.created_regions { - self.region_cache.insert(region.id, region.clone()); - } - - for region in &result.modified_regions { - self.region_cache.insert(region.id, region.clone()); - } - - for id in &result.deleted_region_ids { - if let Some(region) = self.region_cache.get_mut(id) { - region.status = Some(RegionStatus::Deleted); - } - } - - if let Some(reverse) = result.reverse_operation.clone() { - self.history.record(HistoryEntry::new(operation, reverse)); - } - } - - Ok(result) - } - - /// Undoes the most recent operation. - pub async fn undo(&mut self) -> Result> { - let Some(entry) = self.history.pop_undo() else { - return Ok(None); - }; - - let result = self.document.apply(&entry.reverse).await?; - - if result.success { - for region in &result.created_regions { - self.region_cache.insert(region.id, region.clone()); - } - - for region in &result.modified_regions { - self.region_cache.insert(region.id, region.clone()); - } - - for id in &result.deleted_region_ids { - if let Some(region) = self.region_cache.get_mut(id) { - region.status = Some(RegionStatus::Deleted); - } - } - } - - Ok(Some(result)) - } - - /// Redoes the most recently undone operation. - pub async fn redo(&mut self) -> Result> { - let Some(entry) = self.history.pop_redo() else { - return Ok(None); - }; - - let result = self.document.apply(&entry.operation).await?; - - if result.success { - for region in &result.created_regions { - self.region_cache.insert(region.id, region.clone()); - } - - for region in &result.modified_regions { - self.region_cache.insert(region.id, region.clone()); - } - - for id in &result.deleted_region_ids { - if let Some(region) = self.region_cache.get_mut(id) { - region.status = Some(RegionStatus::Deleted); - } - } - } - - Ok(Some(result)) - } - - /// Loads regions for additional pages (streaming support). 
- pub async fn load_pages(&mut self, start_page: u32, count: u32) -> Result<()> { - let options = PageOptions { + /// Gets page options for a range of pages. + #[must_use] + pub fn page_options(&self, start_page: u32, count: u32) -> PageOptions { + PageOptions { start_page, page_count: Some(count), extract_regions: true, - }; - - let regions = self.document.extract_page_regions(&options).await?; - - for region in regions { - if let Some(page) = region.page { - if !self.loaded_pages.contains(&page.get()) { - self.loaded_pages.push(page.get()); - } - } - self.region_cache.insert(region.id, region); } - - self.loaded_pages.sort_unstable(); - - Ok(()) } /// Serializes the document to bytes. - pub async fn serialize(&self) -> Result { - self.document.serialize().await - } - - /// Returns whether the document has unsaved changes. - #[must_use] - pub fn is_modified(&self) -> bool { - self.document.is_modified() + pub async fn to_bytes(&self) -> Result { + self.document.to_bytes().await } /// Consumes the session and returns the underlying document. @@ -398,8 +249,7 @@ mod tests { #[test] fn test_session_config_default() { let config = SessionConfig::default(); - assert!(config.auto_extract_regions); assert_eq!(config.page_batch_size, 10); - assert!(config.validate_operations); + assert!(!config.track_history); } } diff --git a/crates/nvisy-image/Cargo.toml b/crates/nvisy-image/Cargo.toml new file mode 100644 index 0000000..d84ecb1 --- /dev/null +++ b/crates/nvisy-image/Cargo.toml @@ -0,0 +1,30 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-image" +description = "Image format support for nvisy" +readme = "./README.md" + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +nvisy-document = { workspace = true } + +async-trait = { workspace = true } +bytes = { workspace = true } +thiserror = { workspace = true } + +[dev-dependencies] diff --git a/crates/nvisy-image/README.md b/crates/nvisy-image/README.md new file mode 100644 index 0000000..b5e1574 --- /dev/null +++ b/crates/nvisy-image/README.md @@ -0,0 +1,13 @@ +# nvisy-image + +Image format support for nvisy. + +This crate provides a `DocumentFormat` implementation for image files (PNG, JPEG, GIF, WebP, etc.). + +## Status + +This crate is currently a stub. Image parsing and manipulation are not yet implemented. + +## License + +MIT diff --git a/crates/nvisy-image/src/document.rs b/crates/nvisy-image/src/document.rs new file mode 100644 index 0000000..092b45c --- /dev/null +++ b/crates/nvisy-image/src/document.rs @@ -0,0 +1,56 @@ +//! Image document implementation. + +use async_trait::async_trait; +use bytes::Bytes; +use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; + +/// A loaded image document. +#[derive(Debug)] +pub struct ImageDocument { + info: DocumentInfo, + regions: Vec, + #[allow(dead_code)] + data: Bytes, +} + +impl ImageDocument { + /// Creates a new image document (internal use). 
+ #[must_use] + #[allow(dead_code)] // Will be used when load() is implemented + pub(crate) fn new(info: DocumentInfo, data: Bytes) -> Self { + Self { + info, + regions: Vec::new(), + data, + } + } +} + +#[async_trait] +impl Document for ImageDocument { + fn info(&self) -> &DocumentInfo { + &self.info + } + + fn regions(&self) -> &[Region] { + &self.regions + } + + fn regions_for_page(&self, page: u32) -> Vec<&Region> { + self.regions + .iter() + .filter(|r| r.page.map(|p| p.get()) == Some(page)) + .collect() + } + + fn find_region(&self, id: RegionId) -> Option<&Region> { + self.regions.iter().find(|r| r.id == id) + } + + async fn to_bytes(&self) -> Result { + // TODO: Implement image serialization + Err(Error::unsupported_format( + "Image serialization not yet implemented", + )) + } +} diff --git a/crates/nvisy-image/src/format.rs b/crates/nvisy-image/src/format.rs new file mode 100644 index 0000000..e479706 --- /dev/null +++ b/crates/nvisy-image/src/format.rs @@ -0,0 +1,82 @@ +//! Image format handler implementation. + +use bytes::Bytes; +use nvisy_document::{Capabilities, DocumentFormat, Error, Result}; + +use crate::ImageDocument; + +/// Image document format handler. +#[derive(Debug, Clone, Default)] +pub struct ImageFormat { + capabilities: Capabilities, +} + +impl ImageFormat { + /// Creates a new image format handler. + #[must_use] + pub fn new() -> Self { + Self { + capabilities: Capabilities::image(), + } + } +} + +impl DocumentFormat for ImageFormat { + type Document = ImageDocument; + + fn name(&self) -> &'static str { + "image" + } + + fn mime_types(&self) -> &'static [&'static str] { + &[ + "image/png", + "image/jpeg", + "image/gif", + "image/webp", + "image/bmp", + "image/tiff", + ] + } + + fn extensions(&self) -> &'static [&'static str] { + &["png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "tif"] + } + + fn capabilities(&self) -> &Capabilities { + &self.capabilities + } + + async fn load(&self, _data: Bytes) -> Result { + // TODO: Implement image loading + Err(Error::unsupported_format( + "Image loading not yet implemented", + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_metadata() { + let format = ImageFormat::new(); + assert_eq!(format.name(), "image"); + assert!(format.mime_types().contains(&"image/png")); + assert!(format.mime_types().contains(&"image/jpeg")); + assert!(format.extensions().contains(&"png")); + assert!(format.extensions().contains(&"jpg")); + } + + #[test] + fn test_capabilities() { + let format = ImageFormat::new(); + let caps = format.capabilities(); + + assert!(!caps.text.can_extract); + assert!(caps.text.may_need_ocr); + assert!(!caps.structure.has_pages); + assert!(caps.metadata.can_extract); // EXIF support + } +} diff --git a/crates/nvisy-image/src/lib.rs b/crates/nvisy-image/src/lib.rs new file mode 100644 index 0000000..4b9608e --- /dev/null +++ b/crates/nvisy-image/src/lib.rs @@ -0,0 +1,9 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + +mod document; +mod format; + +pub use document::ImageDocument; +pub use format::ImageFormat; diff --git a/crates/nvisy-pdf/Cargo.toml b/crates/nvisy-pdf/Cargo.toml index 77348cd..c3ac1a0 100644 --- a/crates/nvisy-pdf/Cargo.toml +++ b/crates/nvisy-pdf/Cargo.toml @@ -2,20 +2,20 @@ [package] name = "nvisy-pdf" +description = "PDF document format support for nvisy" +readme = "./README.md" + version = { workspace = true } rust-version = { workspace = true } edition = { workspace = true } license = { workspace = 
true } publish = { workspace = true } -readme = "./README.md" authors = { workspace = true } repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -description = "PDF document format support for nvisy" - [package.metadata.docs.rs] all-features = true rustdoc-args = ["--cfg", "docsrs"] diff --git a/crates/nvisy-pdf/src/document.rs b/crates/nvisy-pdf/src/document.rs index dc0638b..71ad404 100644 --- a/crates/nvisy-pdf/src/document.rs +++ b/crates/nvisy-pdf/src/document.rs @@ -2,28 +2,26 @@ use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ - Document, DocumentInfo, EditOperation, EditResult, EditableDocument, Error, PageOptions, - Region, RegionId, Result, -}; +use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; /// A loaded PDF document. #[derive(Debug)] pub struct PdfDocument { info: DocumentInfo, regions: Vec, - modified: bool, + #[allow(dead_code)] + data: Bytes, } impl PdfDocument { /// Creates a new PDF document (internal use). #[must_use] #[allow(dead_code)] // Will be used when load() is implemented - pub(crate) fn new(info: DocumentInfo) -> Self { + pub(crate) fn new(info: DocumentInfo, data: Bytes) -> Self { Self { info, regions: Vec::new(), - modified: false, + data, } } } @@ -49,29 +47,10 @@ impl Document for PdfDocument { self.regions.iter().find(|r| r.id == id) } - async fn serialize(&self) -> Result { + async fn to_bytes(&self) -> Result { // TODO: Implement PDF serialization Err(Error::unsupported_format( "PDF serialization not yet implemented", )) } } - -#[async_trait] -impl EditableDocument for PdfDocument { - async fn apply(&mut self, _operation: &EditOperation) -> Result { - // TODO: Implement PDF editing - Err(Error::unsupported_format("PDF editing not yet implemented")) - } - - fn is_modified(&self) -> bool { - self.modified - } - - async fn extract_page_regions(&mut self, _options: &PageOptions) -> Result> { - // TODO: Implement page region extraction - Err(Error::unsupported_format( - "PDF page extraction not yet implemented", - )) - } -} diff --git a/crates/nvisy-pdf/src/format.rs b/crates/nvisy-pdf/src/format.rs index f48345a..f36167f 100644 --- a/crates/nvisy-pdf/src/format.rs +++ b/crates/nvisy-pdf/src/format.rs @@ -16,7 +16,7 @@ impl PdfFormat { #[must_use] pub fn new() -> Self { Self { - capabilities: Capabilities::read_only(), + capabilities: Capabilities::rich_document(), } } } @@ -44,13 +44,6 @@ impl DocumentFormat for PdfFormat { // TODO: Implement PDF loading Err(Error::unsupported_format("PDF loading not yet implemented")) } - - async fn create_empty(&self) -> Result { - // TODO: Implement empty PDF creation - Err(Error::unsupported_format( - "PDF creation not yet implemented", - )) - } } #[cfg(test)] @@ -64,4 +57,15 @@ mod tests { assert!(format.mime_types().contains(&"application/pdf")); assert!(format.extensions().contains(&"pdf")); } + + #[test] + fn test_capabilities() { + let format = PdfFormat::new(); + let caps = format.capabilities(); + + assert!(caps.text.can_extract); + assert!(caps.text.has_rich_text); + assert!(caps.structure.can_detect_tables); + assert!(caps.structure.has_pages); + } } diff --git a/crates/nvisy-text/Cargo.toml b/crates/nvisy-text/Cargo.toml index d653a15..80ab4ff 100644 --- a/crates/nvisy-text/Cargo.toml +++ b/crates/nvisy-text/Cargo.toml @@ -2,20 +2,20 @@ [package] name = "nvisy-text" +description = "Plain text document format support for nvisy" +readme = "./README.md" + version = { workspace = true } rust-version = { workspace = 
true } edition = { workspace = true } license = { workspace = true } publish = { workspace = true } -readme = "./README.md" authors = { workspace = true } repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -description = "Plain text document format support for nvisy" - [package.metadata.docs.rs] all-features = true rustdoc-args = ["--cfg", "docsrs"] @@ -25,6 +25,11 @@ nvisy-document = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } +csv = { workspace = true } +markdown = { workspace = true } +serde_json = { workspace = true } thiserror = { workspace = true } [dev-dependencies] +tokio = { workspace = true, features = ["rt", "macros"] } +tokio-test = { workspace = true } diff --git a/crates/nvisy-text/README.md b/crates/nvisy-text/README.md index f7b701a..10590d2 100644 --- a/crates/nvisy-text/README.md +++ b/crates/nvisy-text/README.md @@ -1,12 +1,104 @@ # nvisy-text -Plain text document format support for nvisy. +Text-based document format support for nvisy. -This crate provides a `DocumentFormat` implementation for plain text files (.txt, .md, .rst, etc.). +This crate provides support for loading and extracting text from +various text-based file formats: -## Status +- **Plain text** (`.txt`, `.text`) +- **Markdown** (`.md`, `.markdown`, `.mdx`) +- **JSON** (`.json`) +- **CSV/TSV** (`.csv`, `.tsv`) +- **XML** (`.xml`, `.xsd`, `.xsl`, `.xslt`, `.svg`, `.xhtml`, `.plist`) +- **YAML** (`.yaml`, `.yml`) +- **TOML** (`.toml`) +- **INI** (`.ini`, `.cfg`, `.conf`, `.config`) -This crate is currently a stub. Text document handling is not yet fully implemented. +## Usage + +```rust +use nvisy_text::{PlainTextFormat, PlainTextDocument}; +use nvisy_document::{DocumentFormat, Document, TextExtractor}; +use bytes::Bytes; + +# tokio_test::block_on(async { +let format = PlainTextFormat::new(); +let data = Bytes::from("Hello, world!\n\nThis is a paragraph."); + +let doc = format.load(data).await.unwrap(); +assert_eq!(doc.regions().len(), 2); + +let text = doc.extract_text().await.unwrap(); +assert_eq!(text.word_count(), 6); +# }); +``` + +## Formats + +### Plain Text + +Basic plain text with paragraph detection. + +```rust +use nvisy_text::PlainTextFormat; +``` + +### Markdown + +Full Markdown parsing using pulldown-cmark with support for headings, lists, code blocks, blockquotes, and more. + +```rust +use nvisy_text::MarkdownFormat; +``` + +### JSON + +JSON parsing with structure detection using serde_json. + +```rust +use nvisy_text::JsonFormat; +``` + +### CSV/TSV + +CSV and TSV parsing using the csv crate. Implements `TableExtractor` for structured table access. + +```rust +use nvisy_text::CsvFormat; +use nvisy_document::TableExtractor; +``` + +### XML + +XML parsing with hierarchical structure detection. + +```rust +use nvisy_text::XmlFormat; +``` + +### YAML + +YAML parsing with list and key-value detection. + +```rust +use nvisy_text::YamlFormat; +``` + +### TOML + +TOML parsing with section and array table detection. + +```rust +use nvisy_text::TomlFormat; +``` + +### INI + +INI/config file parsing with section grouping. + +```rust +use nvisy_text::IniFormat; +``` ## License diff --git a/crates/nvisy-text/src/document.rs b/crates/nvisy-text/src/document.rs deleted file mode 100644 index baf041b..0000000 --- a/crates/nvisy-text/src/document.rs +++ /dev/null @@ -1,79 +0,0 @@ -//! Plain text document implementation. 
- -use async_trait::async_trait; -use bytes::Bytes; -use nvisy_document::{ - Document, DocumentInfo, EditOperation, EditResult, EditableDocument, Error, PageOptions, - Region, RegionId, Result, -}; - -/// A loaded plain text document. -#[derive(Debug)] -pub struct TextDocument { - info: DocumentInfo, - regions: Vec, - modified: bool, -} - -impl TextDocument { - /// Creates a new text document (internal use). - #[must_use] - #[allow(dead_code)] // Will be used when load() is implemented - pub(crate) fn new(info: DocumentInfo) -> Self { - Self { - info, - regions: Vec::new(), - modified: false, - } - } -} - -#[async_trait] -impl Document for TextDocument { - fn info(&self) -> &DocumentInfo { - &self.info - } - - fn regions(&self) -> &[Region] { - &self.regions - } - - fn regions_for_page(&self, page: u32) -> Vec<&Region> { - self.regions - .iter() - .filter(|r| r.page.map(|p| p.get()) == Some(page)) - .collect() - } - - fn find_region(&self, id: RegionId) -> Option<&Region> { - self.regions.iter().find(|r| r.id == id) - } - - async fn serialize(&self) -> Result { - // TODO: Implement text serialization - Err(Error::unsupported_format( - "Text serialization not yet implemented", - )) - } -} - -#[async_trait] -impl EditableDocument for TextDocument { - async fn apply(&mut self, _operation: &EditOperation) -> Result { - // TODO: Implement text editing - Err(Error::unsupported_format( - "Text editing not yet implemented", - )) - } - - fn is_modified(&self) -> bool { - self.modified - } - - async fn extract_page_regions(&mut self, _options: &PageOptions) -> Result> { - // TODO: Implement page region extraction - Err(Error::unsupported_format( - "Text page extraction not yet implemented", - )) - } -} diff --git a/crates/nvisy-text/src/documents/csv.rs b/crates/nvisy-text/src/documents/csv.rs new file mode 100644 index 0000000..d766a71 --- /dev/null +++ b/crates/nvisy-text/src/documents/csv.rs @@ -0,0 +1,355 @@ +//! CSV/TSV document type. + +use std::num::NonZeroU32; + +use async_trait::async_trait; +use bytes::Bytes; +use csv::{ReaderBuilder, Terminator}; +use nvisy_document::{ + BoundingBox, Document, DocumentInfo, ExtractedText, NormalizedCell, NormalizedRow, + NormalizedTable, Region, RegionId, RegionKind, RegionSource, Result, TableExtractor, + TextExtractor, +}; + +/// A loaded CSV document. +#[derive(Debug, Clone)] +pub struct CsvDocument { + info: DocumentInfo, + content: String, + delimiter: u8, + headers: Vec, + rows: Vec>, + regions: Vec, + table_region_id: RegionId, +} + +impl CsvDocument { + /// Creates a new CSV document from content. + #[must_use] + pub fn new(content: String, delimiter: u8) -> Self { + let (headers, rows) = Self::parse_csv(&content, delimiter); + let (regions, table_region_id) = Self::build_regions(&headers, &rows); + let size = content.len() as u64; + let info = DocumentInfo::new("text/csv", size).with_page_count(1); + + Self { + info, + content, + delimiter, + headers, + rows, + regions, + table_region_id, + } + } + + /// Creates a CSV document (comma-separated). + #[must_use] + pub fn csv(content: String) -> Self { + Self::new(content, b',') + } + + /// Creates a TSV document (tab-separated). + #[must_use] + pub fn tsv(content: String) -> Self { + Self::new(content, b'\t') + } + + /// Returns the raw content. + #[must_use] + pub fn content(&self) -> &str { + &self.content + } + + /// Returns the delimiter byte. + #[must_use] + pub fn delimiter(&self) -> u8 { + self.delimiter + } + + /// Returns the headers. 
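+    ///
+    /// # Example
+    ///
+    /// A small sketch using an inline CSV string.
+    ///
+    /// ```ignore
+    /// let doc = CsvDocument::csv("name,age\nAlice,30".to_string());
+    /// assert_eq!(doc.headers(), &["name", "age"]);
+    /// ```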
+ #[must_use] + pub fn headers(&self) -> &[String] { + &self.headers + } + + /// Returns the data rows (excluding headers). + #[must_use] + pub fn rows(&self) -> &[Vec] { + &self.rows + } + + /// Returns the number of columns. + #[must_use] + pub fn column_count(&self) -> usize { + self.headers.len() + } + + /// Returns the number of data rows (excluding headers). + #[must_use] + pub fn row_count(&self) -> usize { + self.rows.len() + } + + /// Gets a cell value by row and column index. + #[must_use] + pub fn get(&self, row: usize, col: usize) -> Option<&str> { + self.rows + .get(row) + .and_then(|r| r.get(col)) + .map(|s| s.as_str()) + } + + /// Gets a cell value by row index and column name. + #[must_use] + pub fn get_by_name(&self, row: usize, col_name: &str) -> Option<&str> { + let col_idx = self.headers.iter().position(|h| h == col_name)?; + self.get(row, col_idx) + } + + fn parse_csv(content: &str, delimiter: u8) -> (Vec, Vec>) { + let mut reader = ReaderBuilder::new() + .delimiter(delimiter) + .has_headers(true) + .flexible(true) + .trim(csv::Trim::All) + .terminator(Terminator::Any(b'\n')) + .from_reader(content.as_bytes()); + + let headers: Vec = reader + .headers() + .map(|h| h.iter().map(String::from).collect()) + .unwrap_or_default(); + + let rows: Vec> = reader + .records() + .filter_map(|r| r.ok()) + .map(|record| record.iter().map(String::from).collect()) + .collect(); + + (headers, rows) + } + + fn build_regions(headers: &[String], rows: &[Vec]) -> (Vec, RegionId) { + let mut regions = Vec::new(); + let total_rows = rows.len() + 1; + let row_height = 1.0 / total_rows.max(1) as f64; + + // Create table container region + let table_region = Region::on_page( + NonZeroU32::new(1).unwrap(), + BoundingBox::new(0.0, 0.0, 1.0, 1.0), + ) + .with_kind(RegionKind::Table) + .with_source(RegionSource::Parser); + let table_id = table_region.id; + regions.push(table_region); + + // Header row + let header_text = headers.join(" | "); + regions.push( + Region::on_page( + NonZeroU32::new(1).unwrap(), + BoundingBox::new(0.0, 0.0, 1.0, row_height), + ) + .with_text(header_text) + .with_kind(RegionKind::TableRow) + .with_source(RegionSource::Parser) + .with_parent(table_id), + ); + + // Data rows + for (i, row) in rows.iter().enumerate() { + let y = (i + 1) as f64 * row_height; + let row_text = row.join(" | "); + regions.push( + Region::on_page( + NonZeroU32::new(1).unwrap(), + BoundingBox::new(0.0, y, 1.0, row_height), + ) + .with_text(row_text) + .with_kind(RegionKind::TableRow) + .with_source(RegionSource::Parser) + .with_parent(table_id), + ); + } + + (regions, table_id) + } + + /// Builds a normalized table from this CSV document. 
+ fn build_table(&self) -> NormalizedTable { + let mut table = NormalizedTable::new(self.table_region_id) + .with_column_count(self.headers.len()) + .with_header_rows(1); + + // Header row + let mut header_row = NormalizedRow::header(); + for h in &self.headers { + header_row.add_text(h); + } + table.add_row(header_row); + + // Data rows + for row_data in &self.rows { + let mut row = NormalizedRow::new(); + for value in row_data { + row.add_cell(NormalizedCell::text(value)); + } + table.add_row(row); + } + + table + } +} + +#[async_trait] +impl Document for CsvDocument { + fn info(&self) -> &DocumentInfo { + &self.info + } + + fn regions(&self) -> &[Region] { + &self.regions + } + + fn regions_for_page(&self, page: u32) -> Vec<&Region> { + if page == 1 { + self.regions.iter().collect() + } else { + Vec::new() + } + } + + fn find_region(&self, id: RegionId) -> Option<&Region> { + self.regions.iter().find(|r| r.id == id) + } + + async fn to_bytes(&self) -> Result { + Ok(Bytes::from(self.content.clone())) + } +} + +#[async_trait] +impl TextExtractor for CsvDocument { + async fn extract_text(&self) -> Result { + let mut text = String::new(); + + text.push_str(&self.headers.join(" | ")); + text.push('\n'); + text.push_str(&"-".repeat(self.headers.iter().map(|h| h.len() + 3).sum::())); + text.push('\n'); + + for row in &self.rows { + text.push_str(&row.join(" | ")); + text.push('\n'); + } + + let mut extracted = ExtractedText::from_raw(&text).with_page(1, &text); + + for region in &self.regions { + if let Some(t) = ®ion.text { + extracted = extracted.with_region(region.id, t); + } + } + + Ok(extracted) + } + + async fn extract_text_for_page(&self, page: u32) -> Result> { + if page == 1 { + let text = self.extract_text().await?; + Ok(Some(text.raw)) + } else { + Ok(None) + } + } + + fn needs_ocr(&self) -> bool { + false + } +} + +#[async_trait] +impl TableExtractor for CsvDocument { + async fn extract_tables(&self) -> Result> { + Ok(vec![self.build_table()]) + } + + async fn extract_table(&self, region_id: RegionId) -> Result> { + if region_id == self.table_region_id { + Ok(Some(self.build_table())) + } else { + Ok(None) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_simple_csv() { + let csv = "name,age,city\nAlice,30,NYC\nBob,25,LA"; + let doc = CsvDocument::csv(csv.to_string()); + + assert_eq!(doc.headers(), &["name", "age", "city"]); + assert_eq!(doc.row_count(), 2); + assert_eq!(doc.get(0, 0), Some("Alice")); + assert_eq!(doc.get(1, 1), Some("25")); + } + + #[test] + fn test_parse_quoted_fields() { + let csv = r#"name,description +"Smith, John","A ""great"" person" +Bob,Simple value"#; + let doc = CsvDocument::csv(csv.to_string()); + + assert_eq!(doc.get(0, 0), Some("Smith, John")); + assert_eq!(doc.get(0, 1), Some(r#"A "great" person"#)); + } + + #[test] + fn test_get_by_name() { + let csv = "name,age,city\nAlice,30,NYC"; + let doc = CsvDocument::csv(csv.to_string()); + + assert_eq!(doc.get_by_name(0, "name"), Some("Alice")); + assert_eq!(doc.get_by_name(0, "age"), Some("30")); + assert_eq!(doc.get_by_name(0, "unknown"), None); + } + + #[tokio::test] + async fn test_table_extraction() { + let csv = "name,age\nAlice,30\nBob,25"; + let doc = CsvDocument::csv(csv.to_string()); + let tables = doc.extract_tables().await.unwrap(); + + assert_eq!(tables.len(), 1); + assert_eq!(tables[0].column_count, 2); + assert_eq!(tables[0].row_count(), 3); // 1 header + 2 data + } + + #[test] + fn test_tsv_parsing() { + let tsv = 
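+
+    // A small usage sketch (relies only on the `extract_text` behavior
+    // above): header and data cells are joined with " | ", with a dashed
+    // rule after the header line.
+    #[tokio::test]
+    async fn test_text_extraction_layout() {
+        let csv = "name,age\nAlice,30";
+        let doc = CsvDocument::csv(csv.to_string());
+        let text = doc.extract_text().await.unwrap();
+
+        assert!(text.raw.starts_with("name | age\n"));
+        assert!(text.raw.contains("Alice | 30"));
+    }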
"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA"; + let doc = CsvDocument::tsv(tsv.to_string()); + + assert_eq!(doc.headers(), &["name", "age", "city"]); + assert_eq!(doc.row_count(), 2); + assert_eq!(doc.get(0, 0), Some("Alice")); + } + + #[test] + fn test_flexible_columns() { + // csv crate's flexible mode handles rows with different column counts + let csv = "a,b,c\n1,2\n1,2,3,4"; + let doc = CsvDocument::csv(csv.to_string()); + + assert_eq!(doc.headers(), &["a", "b", "c"]); + assert_eq!(doc.rows()[0].len(), 2); + assert_eq!(doc.rows()[1].len(), 4); + } +} diff --git a/crates/nvisy-text/src/documents/ini.rs b/crates/nvisy-text/src/documents/ini.rs new file mode 100644 index 0000000..91c1b3d --- /dev/null +++ b/crates/nvisy-text/src/documents/ini.rs @@ -0,0 +1,229 @@ +//! INI document type. + +use std::num::NonZeroU32; + +use async_trait::async_trait; +use bytes::Bytes; +use nvisy_document::{ + BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, + Result, TextExtractor, +}; + +/// A loaded INI document. +#[derive(Debug, Clone)] +pub struct IniDocument { + info: DocumentInfo, + content: String, + regions: Vec, +} + +impl IniDocument { + /// Creates a new INI document from content. + #[must_use] + pub fn new(content: String) -> Self { + let regions = Self::parse_regions(&content); + let size = content.len() as u64; + let info = DocumentInfo::new("text/plain", size).with_page_count(1); + + Self { + info, + content, + regions, + } + } + + /// Returns the raw content. + #[must_use] + pub fn content(&self) -> &str { + &self.content + } + + fn parse_regions(content: &str) -> Vec { + let mut regions = Vec::new(); + let total_len = content.len().max(1) as f64; + let mut pos = 0_usize; + let mut current_section: Option = None; + + for line in content.lines() { + let line_start = pos; + let line_end = pos + line.len(); + let trimmed = line.trim(); + + if trimmed.is_empty() { + pos = line_end + 1; + continue; + } + + let y_start = line_start as f64 / total_len; + let height = ((line_end - line_start) as f64 / total_len).max(0.02); + + // Section headers: [section] + if trimmed.starts_with('[') && trimmed.ends_with(']') { + let section = Region::on_page( + NonZeroU32::new(1).unwrap(), + BoundingBox::new(0.0, y_start, 1.0, height), + ) + .with_text(trimmed) + .with_kind(RegionKind::Heading) + .with_source(RegionSource::Parser); + current_section = Some(section.id); + regions.push(section); + } else if trimmed.starts_with('#') || trimmed.starts_with(';') { + // Comments (both # and ; style) + regions.push( + Region::on_page( + NonZeroU32::new(1).unwrap(), + BoundingBox::new(0.02, y_start, 0.98, height), + ) + .with_text(trimmed) + .with_kind(RegionKind::Annotation) + .with_source(RegionSource::Parser), + ); + } else { + // Key-value pairs + let mut region = Region::on_page( + NonZeroU32::new(1).unwrap(), + BoundingBox::new(0.02, y_start, 0.98, height), + ) + .with_text(trimmed) + .with_kind(RegionKind::Code) + .with_source(RegionSource::Parser); + + if let Some(parent) = current_section { + region = region.with_parent(parent); + } + regions.push(region); + } + + pos = line_end + 1; + } + + regions + } +} + +#[async_trait] +impl Document for IniDocument { + fn info(&self) -> &DocumentInfo { + &self.info + } + + fn regions(&self) -> &[Region] { + &self.regions + } + + fn regions_for_page(&self, page: u32) -> Vec<&Region> { + if page == 1 { + self.regions.iter().collect() + } else { + Vec::new() + } + } + + fn find_region(&self, id: RegionId) -> Option<&Region> { 
+ self.regions.iter().find(|r| r.id == id) + } + + async fn to_bytes(&self) -> Result { + Ok(Bytes::from(self.content.clone())) + } +} + +#[async_trait] +impl TextExtractor for IniDocument { + async fn extract_text(&self) -> Result { + let mut extracted = ExtractedText::from_raw(&self.content).with_page(1, &self.content); + + for region in &self.regions { + if let Some(text) = ®ion.text { + extracted = extracted.with_region(region.id, text); + } + } + + Ok(extracted) + } + + async fn extract_text_for_page(&self, page: u32) -> Result> { + if page == 1 { + Ok(Some(self.content.clone())) + } else { + Ok(None) + } + } + + fn needs_ocr(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_simple_ini() { + let ini = "[section]\nkey=value\nfoo=bar"; + let doc = IniDocument::new(ini.to_string()); + + let sections: Vec<_> = doc + .regions() + .iter() + .filter(|r| r.kind == RegionKind::Heading) + .collect(); + assert_eq!(sections.len(), 1); + assert_eq!(sections[0].text.as_deref(), Some("[section]")); + } + + #[test] + fn test_multiple_sections() { + let ini = "[section1]\nkey1=value1\n\n[section2]\nkey2=value2"; + let doc = IniDocument::new(ini.to_string()); + + let sections: Vec<_> = doc + .regions() + .iter() + .filter(|r| r.kind == RegionKind::Heading) + .collect(); + assert_eq!(sections.len(), 2); + } + + #[test] + fn test_parse_comments() { + let ini = "; Comment style 1\n# Comment style 2\nkey=value"; + let doc = IniDocument::new(ini.to_string()); + + let comments: Vec<_> = doc + .regions() + .iter() + .filter(|r| r.kind == RegionKind::Annotation) + .collect(); + assert_eq!(comments.len(), 2); + } + + #[test] + fn test_parent_child_relationship() { + let ini = "[section]\nkey=value"; + let doc = IniDocument::new(ini.to_string()); + + let section = doc + .regions() + .iter() + .find(|r| r.kind == RegionKind::Heading) + .unwrap(); + let key_value = doc + .regions() + .iter() + .find(|r| r.kind == RegionKind::Code) + .unwrap(); + + assert_eq!(key_value.parent, Some(section.id)); + } + + #[tokio::test] + async fn test_text_extraction() { + let ini = "[section]\nkey=value"; + let doc = IniDocument::new(ini.to_string()); + let text = doc.extract_text().await.unwrap(); + assert!(text.raw.contains("key=value")); + } +} diff --git a/crates/nvisy-text/src/documents/json.rs b/crates/nvisy-text/src/documents/json.rs new file mode 100644 index 0000000..e7c6603 --- /dev/null +++ b/crates/nvisy-text/src/documents/json.rs @@ -0,0 +1,261 @@ +//! JSON document type. + +use std::num::NonZeroU32; + +use async_trait::async_trait; +use bytes::Bytes; +use nvisy_document::{ + BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, + Result, TextExtractor, +}; +use serde_json::Value; + +/// A loaded JSON document. +#[derive(Debug, Clone)] +pub struct JsonDocument { + info: DocumentInfo, + content: String, + parsed: Value, + regions: Vec, +} + +impl JsonDocument { + /// Creates a new JSON document from content. + pub fn new(content: String) -> Result { + let parsed: Value = serde_json::from_str(&content) + .map_err(|e| nvisy_document::Error::parse(format!("Invalid JSON: {e}")))?; + + let regions = Self::extract_regions(&parsed); + let size = content.len() as u64; + let info = DocumentInfo::new("application/json", size).with_page_count(1); + + Ok(Self { + info, + content, + parsed, + regions, + }) + } + + /// Returns the raw JSON content. 
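+
+    // Illustrative sketch of the default grouping (follows from
+    // `parse_regions` above): keys that appear before any `[section]`
+    // header are left without a parent region.
+    #[test]
+    fn test_key_before_any_section() {
+        let ini = "key=value\n[section]\nother=1";
+        let doc = IniDocument::new(ini.to_string());
+
+        let first = doc
+            .regions()
+            .iter()
+            .find(|r| r.text.as_deref() == Some("key=value"))
+            .unwrap();
+        assert_eq!(first.parent, None);
+    }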
+
+    #[tokio::test]
+    async fn test_text_extraction() {
+        let ini = "[section]\nkey=value";
+        let doc = IniDocument::new(ini.to_string());
+        let text = doc.extract_text().await.unwrap();
+        assert!(text.raw.contains("key=value"));
+    }
+}
diff --git a/crates/nvisy-text/src/documents/json.rs b/crates/nvisy-text/src/documents/json.rs
new file mode 100644
index 0000000..e7c6603
--- /dev/null
+++ b/crates/nvisy-text/src/documents/json.rs
@@ -0,0 +1,261 @@
+//! JSON document type.
+
+use std::num::NonZeroU32;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use nvisy_document::{
+    BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource,
+    Result, TextExtractor,
+};
+use serde_json::Value;
+
+/// A loaded JSON document.
+#[derive(Debug, Clone)]
+pub struct JsonDocument {
+    info: DocumentInfo,
+    content: String,
+    parsed: Value,
+    regions: Vec<Region>,
+}
+
+impl JsonDocument {
+    /// Creates a new JSON document from content.
+    pub fn new(content: String) -> Result<Self> {
+        let parsed: Value = serde_json::from_str(&content)
+            .map_err(|e| nvisy_document::Error::parse(format!("Invalid JSON: {e}")))?;
+
+        let regions = Self::extract_regions(&parsed);
+        let size = content.len() as u64;
+        let info = DocumentInfo::new("application/json", size).with_page_count(1);
+
+        Ok(Self {
+            info,
+            content,
+            parsed,
+            regions,
+        })
+    }
+
+    /// Returns the raw JSON content.
+    #[must_use]
+    pub fn content(&self) -> &str {
+        &self.content
+    }
+
+    /// Returns the parsed JSON value.
+    #[must_use]
+    pub fn value(&self) -> &Value {
+        &self.parsed
+    }
+
+    /// Returns the JSON pretty-printed.
+    #[must_use]
+    pub fn pretty(&self) -> String {
+        serde_json::to_string_pretty(&self.parsed).unwrap_or_else(|_| self.content.clone())
+    }
+
+    fn extract_regions(value: &Value) -> Vec<Region> {
+        let mut regions = Vec::new();
+        Self::extract_regions_recursive(value, "", &mut regions, 0);
+        regions
+    }
+
+    fn extract_regions_recursive(
+        value: &Value,
+        path: &str,
+        regions: &mut Vec<Region>,
+        depth: usize,
+    ) {
+        let y_pos = regions.len() as f64 * 0.05;
+        let indent = depth as f64 * 0.02;
+
+        match value {
+            Value::Object(map) => {
+                let text = if path.is_empty() {
+                    "{...}".to_string()
+                } else {
+                    format!("{path}: {{...}}")
+                };
+
+                regions.push(
+                    Region::on_page(
+                        NonZeroU32::new(1).unwrap(),
+                        BoundingBox::new(indent, y_pos, 1.0 - indent, 0.03),
+                    )
+                    .with_text(text)
+                    .with_kind(RegionKind::Code)
+                    .with_source(RegionSource::Parser),
+                );
+
+                for (key, val) in map {
+                    let new_path = if path.is_empty() {
+                        key.clone()
+                    } else {
+                        format!("{path}.{key}")
+                    };
+                    Self::extract_regions_recursive(val, &new_path, regions, depth + 1);
+                }
+            }
+            Value::Array(arr) => {
+                let text = if path.is_empty() {
+                    format!("[{} items]", arr.len())
+                } else {
+                    format!("{path}: [{} items]", arr.len())
+                };
+
+                regions.push(
+                    Region::on_page(
+                        NonZeroU32::new(1).unwrap(),
+                        BoundingBox::new(indent, y_pos, 1.0 - indent, 0.03),
+                    )
+                    .with_text(text)
+                    .with_kind(RegionKind::Code)
+                    .with_source(RegionSource::Parser),
+                );
+
+                for (i, val) in arr.iter().enumerate() {
+                    let new_path = format!("{path}[{i}]");
+                    Self::extract_regions_recursive(val, &new_path, regions, depth + 1);
+                }
+            }
+            Value::String(s) => {
+                let text = format!("{path}: \"{s}\"");
+                regions.push(
+                    Region::on_page(
+                        NonZeroU32::new(1).unwrap(),
+                        BoundingBox::new(indent, y_pos, 1.0 - indent, 0.03),
+                    )
+                    .with_text(text)
+                    .with_kind(RegionKind::Code)
+                    .with_source(RegionSource::Parser),
+                );
+            }
+            Value::Number(n) => {
+                let text = format!("{path}: {n}");
+                regions.push(
+                    Region::on_page(
+                        NonZeroU32::new(1).unwrap(),
+                        BoundingBox::new(indent, y_pos, 1.0 - indent, 0.03),
+                    )
+                    .with_text(text)
+                    .with_kind(RegionKind::Code)
+                    .with_source(RegionSource::Parser),
+                );
+            }
+            Value::Bool(b) => {
+                let text = format!("{path}: {b}");
+                regions.push(
+                    Region::on_page(
+                        NonZeroU32::new(1).unwrap(),
+                        BoundingBox::new(indent, y_pos, 1.0 - indent, 0.03),
+                    )
+                    .with_text(text)
+                    .with_kind(RegionKind::Code)
+                    .with_source(RegionSource::Parser),
+                );
+            }
+            Value::Null => {
+                let text = format!("{path}: null");
+                regions.push(
+                    Region::on_page(
+                        NonZeroU32::new(1).unwrap(),
+                        BoundingBox::new(indent, y_pos, 1.0 - indent, 0.03),
+                    )
+                    .with_text(text)
+                    .with_kind(RegionKind::Code)
+                    .with_source(RegionSource::Parser),
+                );
+            }
+        }
+    }
+}
+
+#[async_trait]
+impl Document for JsonDocument {
+    fn info(&self) -> &DocumentInfo {
+        &self.info
+    }
+
+    fn regions(&self) -> &[Region] {
+        &self.regions
+    }
+
+    fn regions_for_page(&self, page: u32) -> Vec<&Region> {
+        if page == 1 {
+            self.regions.iter().collect()
+        } else {
+            Vec::new()
+        }
+    }
+
+    fn find_region(&self, id: RegionId) -> Option<&Region> {
+        self.regions.iter().find(|r| r.id == id)
+    }
+
+    async fn to_bytes(&self) -> Result<Bytes> {
+        Ok(Bytes::from(self.content.clone()))
+    }
+}
+
+#[async_trait]
+impl TextExtractor for JsonDocument {
+    async fn extract_text(&self) -> Result<ExtractedText> {
+        let pretty = self.pretty();
+        let mut extracted = ExtractedText::from_raw(&pretty).with_page(1, &pretty);
+
+        for region in &self.regions {
+            if let Some(text) = &region.text {
+                extracted = extracted.with_region(region.id, text);
+            }
+        }
+
+        Ok(extracted)
+    }
+
+    async fn extract_text_for_page(&self, page: u32) -> Result<Option<String>> {
+        if page == 1 {
+            Ok(Some(self.pretty()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn needs_ocr(&self) -> bool {
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_simple_json() {
+        let json = r#"{"name": "test", "value": 42}"#;
+        let doc = JsonDocument::new(json.to_string()).unwrap();
+        assert!(!doc.regions().is_empty());
+    }
+
+    #[test]
+    fn test_parse_nested_json() {
+        let json = r#"{"user": {"name": "Alice", "age": 30}, "active": true}"#;
+        let doc = JsonDocument::new(json.to_string()).unwrap();
+        assert!(doc.regions().len() > 3);
+    }
+
+    #[test]
+    fn test_invalid_json() {
+        let invalid = "not valid json {";
+        let result = JsonDocument::new(invalid.to_string());
+        assert!(result.is_err());
+    }
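+
+    // Sketch of the region text layout produced by
+    // `extract_regions_recursive` above: nested values are flattened into
+    // dotted paths.
+    #[test]
+    fn test_region_paths() {
+        let json = r#"{"user": {"name": "Alice"}}"#;
+        let doc = JsonDocument::new(json.to_string()).unwrap();
+
+        assert!(
+            doc.regions()
+                .iter()
+                .any(|r| r.text.as_deref() == Some(r#"user.name: "Alice""#))
+        );
+    }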
+
+    #[tokio::test]
+    async fn test_text_extraction() {
+        let json = r#"{"message": "Hello, World!"}"#;
+        let doc = JsonDocument::new(json.to_string()).unwrap();
+        let extracted = doc.extract_text().await.unwrap();
+        assert!(extracted.raw.contains("Hello, World!"));
+    }
+}
diff --git a/crates/nvisy-text/src/documents/markdown.rs b/crates/nvisy-text/src/documents/markdown.rs
new file mode 100644
index 0000000..c30720f
--- /dev/null
+++ b/crates/nvisy-text/src/documents/markdown.rs
@@ -0,0 +1,343 @@
+//! Markdown document type.
+
+use std::num::NonZeroU32;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use markdown::{ParseOptions, mdast::Node, to_mdast};
+use nvisy_document::{
+    BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource,
+    Result, TextExtractor,
+};
+
+/// A loaded Markdown document.
+#[derive(Debug, Clone)]
+pub struct MarkdownDocument {
+    info: DocumentInfo,
+    content: String,
+    regions: Vec<Region>,
+    plain_text: String,
+}
+
+impl MarkdownDocument {
+    /// Creates a new Markdown document from content.
+    #[must_use]
+    pub fn new(content: String) -> Self {
+        let (regions, plain_text) = Self::parse_content(&content);
+        let size = content.len() as u64;
+        let info = DocumentInfo::new("text/markdown", size).with_page_count(1);
+
+        Self {
+            info,
+            content,
+            regions,
+            plain_text,
+        }
+    }
+
+    /// Returns the raw Markdown content.
+    #[must_use]
+    pub fn content(&self) -> &str {
+        &self.content
+    }
+
+    /// Returns the extracted plain text.
+    #[must_use]
+    pub fn plain_text(&self) -> &str {
+        &self.plain_text
+    }
+
+    fn parse_content(content: &str) -> (Vec<Region>, String) {
+        let mut regions = Vec::new();
+        let mut plain_text = String::new();
+        let mut region_count = 0;
+
+        let options = ParseOptions::gfm();
+        if let Ok(ast) = to_mdast(content, &options) {
+            Self::process_node(&ast, &mut regions, &mut plain_text, &mut region_count);
+        }
+
+        (regions, plain_text.trim().to_string())
+    }
+
+    fn process_node(
+        node: &Node,
+        regions: &mut Vec<Region>,
+        plain_text: &mut String,
+        region_count: &mut usize,
+    ) {
+        match node {
+            Node::Root(root) => {
+                for child in &root.children {
+                    Self::process_node(child, regions, plain_text, region_count);
+                }
+            }
+            Node::Heading(heading) => {
+                let text = Self::extract_text_from_children(&heading.children);
+                if !text.is_empty() {
+                    Self::add_region(regions, RegionKind::Heading, &text, region_count);
+                    plain_text.push_str(&text);
+                    plain_text.push('\n');
+                }
+            }
+            Node::Paragraph(para) => {
+                let text = Self::extract_text_from_children(&para.children);
+                if !text.is_empty() {
+                    Self::add_region(regions, RegionKind::Text, &text, region_count);
+                    plain_text.push_str(&text);
+                    plain_text.push('\n');
+                }
+            }
+            Node::Code(code) => {
+                if !code.value.is_empty() {
+                    Self::add_region(regions, RegionKind::Code, &code.value, region_count);
+                    plain_text.push_str(&code.value);
+                    plain_text.push('\n');
+                }
+            }
+            Node::Blockquote(bq) => {
+                let text = Self::extract_text_from_children(&bq.children);
+                if !text.is_empty() {
+                    Self::add_region(regions, RegionKind::Quote, &text, region_count);
+                    plain_text.push_str(&text);
+                    plain_text.push('\n');
+                }
+            }
+            Node::List(list) => {
+                for item in &list.children {
+                    Self::process_node(item, regions, plain_text, region_count);
+                }
+            }
+            Node::ListItem(item) => {
+                let text = Self::extract_text_from_children(&item.children);
+                if !text.is_empty() {
+                    Self::add_region(regions, RegionKind::ListItem, &text, region_count);
+                    plain_text.push_str(&text);
+                    plain_text.push('\n');
+                }
+            }
+            Node::Table(table) => {
+                let text = Self::extract_text_from_children(&table.children);
+                if !text.is_empty() {
+                    Self::add_region(regions, RegionKind::Table, &text, region_count);
+                    plain_text.push_str(&text);
+                    plain_text.push('\n');
+                }
+            }
+            Node::Link(link) => {
+                let text = Self::extract_text_from_children(&link.children);
+                if !text.is_empty() {
+                    Self::add_region(regions, RegionKind::Link, &text, region_count);
+                    plain_text.push_str(&text);
+                }
+            }
+            Node::Image(img) => {
+                if !img.alt.is_empty() {
+                    Self::add_region(regions, RegionKind::Image, &img.alt, region_count);
+                    plain_text.push_str(&img.alt);
+                }
+            }
+            Node::ThematicBreak(_) => {
+                plain_text.push_str("\n---\n");
+            }
+            _ => {
+                // Process any children for other node types
+                if let Some(children) = node.children() {
+                    for child in children {
+                        Self::process_node(child, regions, plain_text, region_count);
+                    }
+                }
+            }
+        }
+    }
+
+    fn extract_text_from_children(children: &[Node]) -> String {
+        let mut text = String::new();
+        for child in children {
+            Self::extract_text_recursive(child, &mut text);
+        }
+        text.trim().to_string()
+    }
+
+    fn extract_text_recursive(node: &Node, text: &mut String) {
+        match node {
+            Node::Text(t) => text.push_str(&t.value),
+            Node::InlineCode(c) => text.push_str(&c.value),
+            Node::Code(c) => text.push_str(&c.value),
+            Node::Break(_) => text.push(' '),
+            _ => {
+                if let Some(children) = node.children() {
+                    for child in children {
+                        Self::extract_text_recursive(child, text);
+                    }
+                }
+            }
+        }
+    }
+
+    fn add_region(
+        regions: &mut Vec<Region>,
+        kind: RegionKind,
+        text: &str,
+        region_count: &mut usize,
+    ) {
+        let y_start = (*region_count as f64) * 0.05;
+        let height = 0.04;
+
+        regions.push(
+            Region::on_page(
+                NonZeroU32::new(1).unwrap(),
+                BoundingBox::new(0.0, y_start.min(0.95), 1.0, height),
+            )
+            .with_text(text.to_string())
+            .with_kind(kind)
+            .with_source(RegionSource::Parser),
+        );
+
+        *region_count += 1;
+    }
+}
+
+#[async_trait]
+impl Document for MarkdownDocument {
+    fn info(&self) -> &DocumentInfo {
+        &self.info
+    }
+
+    fn regions(&self) -> &[Region] {
+        &self.regions
+    }
+
+    fn regions_for_page(&self, page: u32) -> Vec<&Region> {
+        if page == 1 {
+            self.regions.iter().collect()
+        } else {
+            Vec::new()
+        }
+    }
+
+    fn find_region(&self, id: RegionId) -> Option<&Region> {
+        self.regions.iter().find(|r| r.id == id)
+    }
+
+    async fn to_bytes(&self) -> Result<Bytes> {
+        Ok(Bytes::from(self.content.clone()))
+    }
+}
+
+#[async_trait]
+impl TextExtractor for MarkdownDocument {
+    async fn extract_text(&self) -> Result<ExtractedText> {
+        let mut extracted =
+            ExtractedText::from_raw(&self.plain_text).with_page(1, &self.plain_text);
+
+        for region in &self.regions {
+            if let Some(text) = &region.text {
+                extracted = extracted.with_region(region.id, text);
+            }
+        }
+
+        Ok(extracted)
+    }
+
+    async fn extract_text_for_page(&self, page: u32) -> Result<Option<String>> {
+        if page == 1 {
+            Ok(Some(self.plain_text.clone()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn needs_ocr(&self) -> bool {
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_headings() {
+        let md = "# Title\n\n## Subtitle\n\nParagraph text.";
+        let doc = MarkdownDocument::new(md.to_string());
+
+        let headings: Vec<_> = doc
+            .regions()
+            .iter()
+            .filter(|r| r.kind == RegionKind::Heading)
+            .collect();
+
+        assert_eq!(headings.len(), 2);
+        assert_eq!(headings[0].text.as_deref(), Some("Title"));
+        assert_eq!(headings[1].text.as_deref(), Some("Subtitle"));
+    }
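+
+    // Sketch of thematic-break handling (follows from `process_node`
+    // above): breaks are preserved as "---" in the plain text.
+    #[test]
+    fn test_thematic_break() {
+        let md = "Before\n\n---\n\nAfter";
+        let doc = MarkdownDocument::new(md.to_string());
+
+        assert!(doc.plain_text().contains("---"));
+    }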
List"; + let doc = MarkdownDocument::new(md.to_string()); + + let list_items: Vec<_> = doc + .regions() + .iter() + .filter(|r| r.kind == RegionKind::ListItem) + .collect(); + + assert_eq!(list_items.len(), 4); + } + + #[test] + fn test_code_block() { + let md = "# Title\n\n```rust\nfn main() {}\n```\n\nText."; + let doc = MarkdownDocument::new(md.to_string()); + + let code: Vec<_> = doc + .regions() + .iter() + .filter(|r| r.kind == RegionKind::Code) + .collect(); + + assert_eq!(code.len(), 1); + assert!(code[0].text.as_deref().unwrap().contains("fn main()")); + } + + #[test] + fn test_blockquote() { + let md = "> This is a quote\n\nNormal text."; + let doc = MarkdownDocument::new(md.to_string()); + + let quotes: Vec<_> = doc + .regions() + .iter() + .filter(|r| r.kind == RegionKind::Quote) + .collect(); + + assert_eq!(quotes.len(), 1); + assert_eq!(quotes[0].text.as_deref(), Some("This is a quote")); + } + + #[tokio::test] + async fn test_text_extraction() { + let md = "# Hello\n\nThis is **bold** text."; + let doc = MarkdownDocument::new(md.to_string()); + let extracted = doc.extract_text().await.unwrap(); + + assert!(extracted.raw.contains("Hello")); + assert!(extracted.raw.contains("bold")); + // Formatting should be stripped + assert!(!extracted.raw.contains("**")); + } + + #[test] + fn test_plain_text() { + let md = "# Title\n\nParagraph with **bold** and *italic*."; + let doc = MarkdownDocument::new(md.to_string()); + + assert!(doc.plain_text().contains("Title")); + assert!(doc.plain_text().contains("bold")); + assert!(doc.plain_text().contains("italic")); + assert!(!doc.plain_text().contains("**")); + assert!(!doc.plain_text().contains("*italic*")); + } +} diff --git a/crates/nvisy-text/src/documents/mod.rs b/crates/nvisy-text/src/documents/mod.rs new file mode 100644 index 0000000..aef064e --- /dev/null +++ b/crates/nvisy-text/src/documents/mod.rs @@ -0,0 +1,19 @@ +//! Text-based document types. + +mod csv; +mod ini; +mod json; +mod markdown; +mod plain; +mod toml; +mod xml; +mod yaml; + +pub use self::csv::CsvDocument; +pub use self::ini::IniDocument; +pub use self::json::JsonDocument; +pub use self::markdown::MarkdownDocument; +pub use self::plain::PlainTextDocument; +pub use self::toml::TomlDocument; +pub use self::xml::XmlDocument; +pub use self::yaml::YamlDocument; diff --git a/crates/nvisy-text/src/documents/plain.rs b/crates/nvisy-text/src/documents/plain.rs new file mode 100644 index 0000000..e11caa1 --- /dev/null +++ b/crates/nvisy-text/src/documents/plain.rs @@ -0,0 +1,207 @@ +//! Plain text document type. + +use std::num::NonZeroU32; + +use async_trait::async_trait; +use bytes::Bytes; +use nvisy_document::{ + BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, + Result, TextExtractor, +}; + +/// A loaded plain text document. +#[derive(Debug, Clone)] +pub struct PlainTextDocument { + info: DocumentInfo, + content: String, + regions: Vec, +} + +impl PlainTextDocument { + /// Creates a new plain text document from content. + #[must_use] + pub fn new(content: String) -> Self { + let regions = Self::parse_regions(&content); + let size = content.len() as u64; + let info = DocumentInfo::new("text/plain", size).with_page_count(1); + + Self { + info, + content, + regions, + } + } + + /// Creates an empty plain text document. + #[must_use] + pub fn empty() -> Self { + Self::new(String::new()) + } + + /// Returns the raw text content. 
+ #[must_use] + pub fn content(&self) -> &str { + &self.content + } + + /// Parses text content into regions (paragraphs). + fn parse_regions(content: &str) -> Vec { + let mut regions = Vec::new(); + let total_len = content.len().max(1) as f64; + + let mut current_pos = 0_usize; + let mut paragraph_start = 0_usize; + let mut in_paragraph = false; + + for (i, c) in content.char_indices() { + if c == '\n' { + let next_char = content[i + 1..].chars().next(); + if next_char == Some('\n') || next_char.is_none() { + if in_paragraph && paragraph_start < i { + let text = content[paragraph_start..i].trim(); + if !text.is_empty() { + let region = + Self::create_paragraph_region(text, paragraph_start, i, total_len); + regions.push(region); + } + } + in_paragraph = false; + } else if !in_paragraph { + paragraph_start = i + 1; + in_paragraph = true; + } + } else if !in_paragraph { + paragraph_start = i; + in_paragraph = true; + } + current_pos = i + c.len_utf8(); + } + + if in_paragraph && paragraph_start < current_pos { + let text = content[paragraph_start..].trim(); + if !text.is_empty() { + let region = + Self::create_paragraph_region(text, paragraph_start, current_pos, total_len); + regions.push(region); + } + } + + if regions.is_empty() && !content.trim().is_empty() { + let region = Self::create_paragraph_region(content.trim(), 0, content.len(), total_len); + regions.push(region); + } + + regions + } + + fn create_paragraph_region( + text: &str, + start_byte: usize, + end_byte: usize, + total_len: f64, + ) -> Region { + let y_start = start_byte as f64 / total_len; + let y_end = end_byte as f64 / total_len; + let height = (y_end - y_start).max(0.01); + + Region::on_page( + NonZeroU32::new(1).unwrap(), + BoundingBox::new(0.0, y_start, 1.0, height), + ) + .with_text(text) + .with_kind(RegionKind::Text) + .with_source(RegionSource::Parser) + } +} + +#[async_trait] +impl Document for PlainTextDocument { + fn info(&self) -> &DocumentInfo { + &self.info + } + + fn regions(&self) -> &[Region] { + &self.regions + } + + fn regions_for_page(&self, page: u32) -> Vec<&Region> { + if page == 1 { + self.regions.iter().collect() + } else { + Vec::new() + } + } + + fn find_region(&self, id: RegionId) -> Option<&Region> { + self.regions.iter().find(|r| r.id == id) + } + + async fn to_bytes(&self) -> Result { + Ok(Bytes::from(self.content.clone())) + } +} + +#[async_trait] +impl TextExtractor for PlainTextDocument { + async fn extract_text(&self) -> Result { + let mut extracted = ExtractedText::from_raw(&self.content).with_page(1, &self.content); + + for region in &self.regions { + if let Some(text) = ®ion.text { + extracted = extracted.with_region(region.id, text); + } + } + + Ok(extracted) + } + + async fn extract_text_for_page(&self, page: u32) -> Result> { + if page == 1 { + Ok(Some(self.content.clone())) + } else { + Ok(None) + } + } + + fn needs_ocr(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_new_document() { + let doc = PlainTextDocument::new("Hello, world!".to_string()); + assert_eq!(doc.content(), "Hello, world!"); + assert_eq!(doc.regions().len(), 1); + } + + #[test] + fn test_empty_document() { + let doc = PlainTextDocument::empty(); + assert!(doc.content().is_empty()); + assert!(doc.regions().is_empty()); + } + + #[test] + fn test_paragraph_parsing() { + let content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."; + let doc = PlainTextDocument::new(content.to_string()); + + assert_eq!(doc.regions().len(), 3); + 
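+
+    // Sketch of the paragraph rule implemented in `parse_regions` above:
+    // a single newline stays inside one paragraph; only blank lines split.
+    #[test]
+    fn test_single_newline_keeps_paragraph() {
+        let content = "First line\nstill the same paragraph.";
+        let doc = PlainTextDocument::new(content.to_string());
+
+        assert_eq!(doc.regions().len(), 1);
+    }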
+
+    #[test]
+    fn test_paragraph_parsing() {
+        let content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
+        let doc = PlainTextDocument::new(content.to_string());
+
+        assert_eq!(doc.regions().len(), 3);
+        assert_eq!(doc.regions()[0].text.as_deref(), Some("First paragraph."));
+        assert_eq!(doc.regions()[1].text.as_deref(), Some("Second paragraph."));
+        assert_eq!(doc.regions()[2].text.as_deref(), Some("Third paragraph."));
+    }
+
+    #[tokio::test]
+    async fn test_to_bytes() {
+        let content = "Hello, world!";
+        let doc = PlainTextDocument::new(content.to_string());
+        let bytes = doc.to_bytes().await.unwrap();
+        assert_eq!(bytes.as_ref(), content.as_bytes());
+    }
+}
diff --git a/crates/nvisy-text/src/documents/toml.rs b/crates/nvisy-text/src/documents/toml.rs
new file mode 100644
index 0000000..f5ae371
--- /dev/null
+++ b/crates/nvisy-text/src/documents/toml.rs
@@ -0,0 +1,210 @@
+//! TOML document type.
+
+use std::num::NonZeroU32;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use nvisy_document::{
+    BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource,
+    Result, TextExtractor,
+};
+
+/// A loaded TOML document.
+#[derive(Debug, Clone)]
+pub struct TomlDocument {
+    info: DocumentInfo,
+    content: String,
+    regions: Vec<Region>,
+}
+
+impl TomlDocument {
+    /// Creates a new TOML document from content.
+    #[must_use]
+    pub fn new(content: String) -> Self {
+        let regions = Self::parse_regions(&content);
+        let size = content.len() as u64;
+        let info = DocumentInfo::new("application/toml", size).with_page_count(1);
+
+        Self {
+            info,
+            content,
+            regions,
+        }
+    }
+
+    /// Returns the raw content.
+    #[must_use]
+    pub fn content(&self) -> &str {
+        &self.content
+    }
+
+    fn parse_regions(content: &str) -> Vec<Region> {
+        let mut regions = Vec::new();
+        let total_len = content.len().max(1) as f64;
+        let mut pos = 0_usize;
+        let mut current_section: Option<RegionId> = None;
+
+        for line in content.lines() {
+            let line_start = pos;
+            let line_end = pos + line.len();
+            let trimmed = line.trim();
+
+            if trimmed.is_empty() {
+                pos = line_end + 1;
+                continue;
+            }
+
+            let y_start = line_start as f64 / total_len;
+            let height = ((line_end - line_start) as f64 / total_len).max(0.02);
+
+            // Table headers: [section] or [[array]]
+            if (trimmed.starts_with('[') && trimmed.ends_with(']'))
+                || (trimmed.starts_with("[[") && trimmed.ends_with("]]"))
+            {
+                let section = Region::on_page(
+                    NonZeroU32::new(1).unwrap(),
+                    BoundingBox::new(0.0, y_start, 1.0, height),
+                )
+                .with_text(trimmed)
+                .with_kind(RegionKind::Heading)
+                .with_source(RegionSource::Parser);
+                current_section = Some(section.id);
+                regions.push(section);
+            } else if trimmed.starts_with('#') {
+                regions.push(
+                    Region::on_page(
+                        NonZeroU32::new(1).unwrap(),
+                        BoundingBox::new(0.02, y_start, 0.98, height),
+                    )
+                    .with_text(trimmed)
+                    .with_kind(RegionKind::Annotation)
+                    .with_source(RegionSource::Parser),
+                );
+            } else {
+                let mut region = Region::on_page(
+                    NonZeroU32::new(1).unwrap(),
+                    BoundingBox::new(0.02, y_start, 0.98, height),
+                )
+                .with_text(trimmed)
+                .with_kind(RegionKind::Code)
+                .with_source(RegionSource::Parser);
+
+                if let Some(parent) = current_section {
+                    region = region.with_parent(parent);
+                }
+                regions.push(region);
+            }
+
+            pos = line_end + 1;
+        }
+
+        regions
+    }
+}
+
+#[async_trait]
+impl Document for TomlDocument {
+    fn info(&self) -> &DocumentInfo {
+        &self.info
+    }
+
+    fn regions(&self) -> &[Region] {
+        &self.regions
+    }
+
+    fn regions_for_page(&self, page: u32) -> Vec<&Region> {
+        if page == 1 {
+            self.regions.iter().collect()
+        } else {
+            Vec::new()
+        }
+    }
+
+    fn find_region(&self, id: RegionId) -> Option<&Region> {
+        self.regions.iter().find(|r| r.id == id)
+    }
+
+    async fn to_bytes(&self) -> Result<Bytes> {
+        Ok(Bytes::from(self.content.clone()))
+    }
+}
+
+#[async_trait]
+impl TextExtractor for TomlDocument {
+    async fn extract_text(&self) -> Result<ExtractedText> {
+        let mut extracted = ExtractedText::from_raw(&self.content).with_page(1, &self.content);
+
+        for region in &self.regions {
+            if let Some(text) = &region.text {
+                extracted = extracted.with_region(region.id, text);
+            }
+        }
+
+        Ok(extracted)
+    }
+
+    async fn extract_text_for_page(&self, page: u32) -> Result<Option<String>> {
+        if page == 1 {
+            Ok(Some(self.content.clone()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn needs_ocr(&self) -> bool {
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_simple_toml() {
+        let toml = "[package]\nname = \"test\"\nversion = \"1.0\"";
+        let doc = TomlDocument::new(toml.to_string());
+
+        let sections: Vec<_> = doc
+            .regions()
+            .iter()
+            .filter(|r| r.kind == RegionKind::Heading)
+            .collect();
+        assert_eq!(sections.len(), 1);
+        assert_eq!(sections[0].text.as_deref(), Some("[package]"));
+    }
+
+    #[test]
+    fn test_parse_array_tables() {
+        let toml = "[[dependencies]]\nname = \"foo\"\n\n[[dependencies]]\nname = \"bar\"";
+        let doc = TomlDocument::new(toml.to_string());
+
+        let sections: Vec<_> = doc
+            .regions()
+            .iter()
+            .filter(|r| r.kind == RegionKind::Heading)
+            .collect();
+        assert_eq!(sections.len(), 2);
+    }
+
+    #[test]
+    fn test_parse_comments() {
+        let toml = "# Comment\nkey = \"value\"";
+        let doc = TomlDocument::new(toml.to_string());
+
+        let comments: Vec<_> = doc
+            .regions()
+            .iter()
+            .filter(|r| r.kind == RegionKind::Annotation)
+            .collect();
+        assert_eq!(comments.len(), 1);
+    }
+
+    #[tokio::test]
+    async fn test_text_extraction() {
+        let toml = "[section]\nkey = \"value\"";
+        let doc = TomlDocument::new(toml.to_string());
+        let text = doc.extract_text().await.unwrap();
+        assert!(text.raw.contains("key = \"value\""));
+    }
+}
diff --git a/crates/nvisy-text/src/documents/xml.rs b/crates/nvisy-text/src/documents/xml.rs
new file mode 100644
index 0000000..87b2448
--- /dev/null
+++ b/crates/nvisy-text/src/documents/xml.rs
@@ -0,0 +1,174 @@
+//! XML document type.
+
+use std::num::NonZeroU32;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use nvisy_document::{
+    BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource,
+    Result, TextExtractor,
+};
+
+/// A loaded XML document.
+#[derive(Debug, Clone)]
+pub struct XmlDocument {
+    info: DocumentInfo,
+    content: String,
+    regions: Vec<Region>,
+}
+
+impl XmlDocument {
+    /// Creates a new XML document from content.
+    #[must_use]
+    pub fn new(content: String) -> Self {
+        let regions = Self::parse_regions(&content);
+        let size = content.len() as u64;
+        let info = DocumentInfo::new("application/xml", size).with_page_count(1);
+
+        Self {
+            info,
+            content,
+            regions,
+        }
+    }
+
+    /// Returns the raw content.
+    #[must_use]
+    pub fn content(&self) -> &str {
+        &self.content
+    }
+
+    fn parse_regions(content: &str) -> Vec<Region> {
+        let mut regions = Vec::new();
+        let total_len = content.len().max(1) as f64;
+        let mut depth = 0_usize;
+        let mut pos = 0_usize;
+
+        for line in content.lines() {
+            let line_start = pos;
+            let line_end = pos + line.len();
+            let trimmed = line.trim();
+
+            if trimmed.is_empty() {
+                pos = line_end + 1;
+                continue;
+            }
+
+            // Closing tags step back out one nesting level.
+            if trimmed.starts_with("</") {
+                depth = depth.saturating_sub(1);
+            }
+
+            let indent = (depth as f64 * 0.02).min(0.2);
+            let y_start = line_start as f64 / total_len;
+            let height = ((line_end - line_start) as f64 / total_len).max(0.02);
+
+            regions.push(
+                Region::on_page(
+                    NonZeroU32::new(1).unwrap(),
+                    BoundingBox::new(indent, y_start, 1.0 - indent, height),
+                )
+                .with_text(trimmed)
+                .with_kind(RegionKind::Code)
+                .with_source(RegionSource::Parser),
+            );
+
+            // Opening tags (not declarations, comments, self-closing, or
+            // one-line elements) nest subsequent lines one level deeper.
+            if trimmed.starts_with('<')
+                && !trimmed.starts_with("</")
+                && !trimmed.starts_with("<?")
+                && !trimmed.starts_with("<!")
+                && !trimmed.ends_with("/>")
+                && !trimmed.contains("</")
+            {
+                depth += 1;
+            }
+
+            pos = line_end + 1;
+        }
+
+        regions
+    }
+}
+
+#[async_trait]
+impl Document for XmlDocument {
+    fn info(&self) -> &DocumentInfo {
+        &self.info
+    }
+
+    fn regions(&self) -> &[Region] {
+        &self.regions
+    }
+
+    fn regions_for_page(&self, page: u32) -> Vec<&Region> {
+        if page == 1 {
+            self.regions.iter().collect()
+        } else {
+            Vec::new()
+        }
+    }
+
+    fn find_region(&self, id: RegionId) -> Option<&Region> {
+        self.regions.iter().find(|r| r.id == id)
+    }
+
+    async fn to_bytes(&self) -> Result<Bytes> {
+        Ok(Bytes::from(self.content.clone()))
+    }
+}
+
+#[async_trait]
+impl TextExtractor for XmlDocument {
+    async fn extract_text(&self) -> Result<ExtractedText> {
+        let mut extracted = ExtractedText::from_raw(&self.content).with_page(1, &self.content);
+
+        for region in &self.regions {
+            if let Some(text) = &region.text {
+                extracted = extracted.with_region(region.id, text);
+            }
+        }
+
+        Ok(extracted)
+    }
+
+    async fn extract_text_for_page(&self, page: u32) -> Result<Option<String>> {
+        if page == 1 {
+            Ok(Some(self.content.clone()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn needs_ocr(&self) -> bool {
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_simple_xml() {
+        let xml = "<root>\n  <item>text</item>\n</root>";
+        let doc = XmlDocument::new(xml.to_string());
+
+        assert_eq!(doc.regions().len(), 3);
+    }
+
+    #[test]
+    fn test_nested_depth() {
+        let xml = "<a>\n<b>\n<c>\n</c>\n</b>\n</a>";
+        let doc = XmlDocument::new(xml.to_string());
+
+        // Each element should create a region with increasing indent
+        assert!(!doc.regions().is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_text_extraction() {
+        let xml = "<root>content</root>";
+        let doc = XmlDocument::new(xml.to_string());
+        let text = doc.extract_text().await.unwrap();
+        assert!(text.raw.contains("content"));
+    }
+}
diff --git a/crates/nvisy-text/src/documents/yaml.rs b/crates/nvisy-text/src/documents/yaml.rs
new file mode 100644
index 0000000..7557513
--- /dev/null
+++ b/crates/nvisy-text/src/documents/yaml.rs
@@ -0,0 +1,189 @@
+//! YAML document type.
+
+use std::num::NonZeroU32;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use nvisy_document::{
+    BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource,
+    Result, TextExtractor,
+};
+
+/// A loaded YAML document.
+#[derive(Debug, Clone)]
+pub struct YamlDocument {
+    info: DocumentInfo,
+    content: String,
+    regions: Vec<Region>,
+}
+
+impl YamlDocument {
+    /// Creates a new YAML document from content.
+    #[must_use]
+    pub fn new(content: String) -> Self {
+        let regions = Self::parse_regions(&content);
+        let size = content.len() as u64;
+        let info = DocumentInfo::new("application/x-yaml", size).with_page_count(1);
+
+        Self {
+            info,
+            content,
+            regions,
+        }
+    }
+
+    /// Returns the raw content.
+    #[must_use]
+    pub fn content(&self) -> &str {
+        &self.content
+    }
+
+    fn parse_regions(content: &str) -> Vec<Region> {
+        let mut regions = Vec::new();
+        let total_len = content.len().max(1) as f64;
+        let mut pos = 0_usize;
+
+        for line in content.lines() {
+            let line_start = pos;
+            let line_end = pos + line.len();
+            let trimmed = line.trim();
+
+            if trimmed.is_empty() {
+                pos = line_end + 1;
+                continue;
+            }
+
+            let indent_chars = line.len() - line.trim_start().len();
+            let indent = (indent_chars as f64 * 0.01).min(0.2);
+
+            let y_start = line_start as f64 / total_len;
+            let height = ((line_end - line_start) as f64 / total_len).max(0.02);
+
+            let kind = if trimmed.starts_with('#') {
+                RegionKind::Annotation
+            } else if trimmed.starts_with('-') {
+                RegionKind::ListItem
+            } else if trimmed.contains(':') {
+                RegionKind::Code
+            } else {
+                RegionKind::Text
+            };
+
+            regions.push(
+                Region::on_page(
+                    NonZeroU32::new(1).unwrap(),
+                    BoundingBox::new(indent, y_start, 1.0 - indent, height),
+                )
+                .with_text(trimmed)
+                .with_kind(kind)
+                .with_source(RegionSource::Parser),
+            );
+
+            pos = line_end + 1;
+        }
+
+        regions
+    }
+}
+
+#[async_trait]
+impl Document for YamlDocument {
+    fn info(&self) -> &DocumentInfo {
+        &self.info
+    }
+
+    fn regions(&self) -> &[Region] {
+        &self.regions
+    }
+
+    fn regions_for_page(&self, page: u32) -> Vec<&Region> {
+        if page == 1 {
+            self.regions.iter().collect()
+        } else {
+            Vec::new()
+        }
+    }
+
+    fn find_region(&self, id: RegionId) -> Option<&Region> {
+        self.regions.iter().find(|r| r.id == id)
+    }
+
+    async fn to_bytes(&self) -> Result<Bytes> {
+        Ok(Bytes::from(self.content.clone()))
+    }
+}
+
+#[async_trait]
+impl TextExtractor for YamlDocument {
+    async fn extract_text(&self) -> Result<ExtractedText> {
+        let mut extracted = ExtractedText::from_raw(&self.content).with_page(1, &self.content);
+
+        for region in &self.regions {
+            if let Some(text) = &region.text {
+                extracted = extracted.with_region(region.id, text);
+            }
+        }
+
+        Ok(extracted)
+    }
+
+    async fn extract_text_for_page(&self, page: u32) -> Result<Option<String>> {
+        if page == 1 {
+            Ok(Some(self.content.clone()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn needs_ocr(&self) -> bool {
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_simple_yaml() {
+        let yaml = "name: test\nvalue: 42";
+        let doc = YamlDocument::new(yaml.to_string());
+
+        assert_eq!(doc.regions().len(), 2);
+        assert!(doc.regions().iter().all(|r| r.kind == RegionKind::Code));
+    }
+
+    #[test]
+    fn test_parse_list() {
+        let yaml = "items:\n  - first\n  - second";
+        let doc = YamlDocument::new(yaml.to_string());
+
+        let list_items: Vec<_> = doc
+            .regions()
+            .iter()
+            .filter(|r| r.kind == RegionKind::ListItem)
+            .collect();
+        assert_eq!(list_items.len(), 2);
+    }
+
+    #[test]
+    fn test_parse_comments() {
+        let yaml = "# This is a comment\nkey: value";
+        let doc = YamlDocument::new(yaml.to_string());
+
+        let comments: Vec<_> = doc
+            .regions()
+            .iter()
+            .filter(|r| r.kind == RegionKind::Annotation)
+            .collect();
+        assert_eq!(comments.len(), 1);
+    }
+
+    #[tokio::test]
+    async fn test_text_extraction() {
+        let yaml = "key: value";
+        let doc = YamlDocument::new(yaml.to_string());
+        let text = doc.extract_text().await.unwrap();
+        assert!(text.raw.contains("key: value"));
+    }
+}
diff --git a/crates/nvisy-text/src/format.rs b/crates/nvisy-text/src/format.rs
deleted file mode 100644
index 5458c74..0000000
--- a/crates/nvisy-text/src/format.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-//! Plain text format handler implementation.
-
-use bytes::Bytes;
-use nvisy_document::{Capabilities, DocumentFormat, Error, Result};
-
-use crate::TextDocument;
-
-/// Plain text document format handler.
-#[derive(Debug, Clone, Default)]
-pub struct TextFormat {
-    capabilities: Capabilities,
-}
-
-impl TextFormat {
-    /// Creates a new plain text format handler.
-    #[must_use]
-    pub fn new() -> Self {
-        Self {
-            capabilities: Capabilities::read_only(),
-        }
-    }
-}
-
-impl DocumentFormat for TextFormat {
-    type Document = TextDocument;
-
-    fn name(&self) -> &'static str {
-        "text"
-    }
-
-    fn mime_types(&self) -> &'static [&'static str] {
-        &["text/plain", "text/markdown", "text/x-rst"]
-    }
-
-    fn extensions(&self) -> &'static [&'static str] {
-        &["txt", "md", "markdown", "rst", "text"]
-    }
-
-    fn capabilities(&self) -> &Capabilities {
-        &self.capabilities
-    }
-
-    async fn load(&self, _data: Bytes) -> Result<Self::Document> {
-        // TODO: Implement text loading
-        Err(Error::unsupported_format(
-            "Text loading not yet implemented",
-        ))
-    }
-
-    async fn create_empty(&self) -> Result<Self::Document> {
-        // TODO: Implement empty text document creation
-        Err(Error::unsupported_format(
-            "Text creation not yet implemented",
-        ))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_format_metadata() {
-        let format = TextFormat::new();
-        assert_eq!(format.name(), "text");
-        assert!(format.mime_types().contains(&"text/plain"));
-        assert!(format.extensions().contains(&"txt"));
-        assert!(format.extensions().contains(&"md"));
-    }
-}
diff --git a/crates/nvisy-text/src/formats/csv.rs b/crates/nvisy-text/src/formats/csv.rs
new file mode 100644
index 0000000..e5533a7
--- /dev/null
+++ b/crates/nvisy-text/src/formats/csv.rs
@@ -0,0 +1,114 @@
+//! CSV format handler.
+
+use bytes::Bytes;
+use nvisy_document::{
+    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+};
+
+use crate::documents::CsvDocument;
+
+/// CSV document format handler.
+#[derive(Debug, Clone)]
+pub struct CsvFormat {
+    capabilities: Capabilities,
+}
+
+impl CsvFormat {
+    /// Creates a new CSV format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities {
+                text: TextCapabilities {
+                    can_extract: true,
+                    has_rich_text: false,
+                    may_need_ocr: false,
+                },
+                structure: StructureCapabilities {
+                    can_detect_structure: true,
+                    can_detect_tables: true,
+                    has_pages: false,
+                },
+                ..Default::default()
+            },
+        }
+    }
+}
+
+impl Default for CsvFormat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DocumentFormat for CsvFormat {
+    type Document = CsvDocument;
+
+    fn name(&self) -> &'static str {
+        "csv"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["text/csv", "text/tab-separated-values"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["csv", "tsv"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, data: Bytes) -> Result<Self::Document> {
+        let content = String::from_utf8_lossy(&data).into_owned();
+        // Auto-detect delimiter
+        let first_line = content.lines().next().unwrap_or("");
+        let delimiter = if first_line.contains('\t') {
+            b'\t'
+        } else {
+            b','
+        };
+        Ok(CsvDocument::new(content, delimiter))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use nvisy_document::Document;
+
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = CsvFormat::new();
+        assert_eq!(format.name(), "csv");
+        assert!(format.mime_types().contains(&"text/csv"));
+        assert!(format.extensions().contains(&"csv"));
+        assert!(format.extensions().contains(&"tsv"));
+    }
+
+    #[test]
+    fn test_capabilities() {
+        let format = CsvFormat::new();
+        let caps = format.capabilities();
+        assert!(caps.structure.can_detect_tables);
+    }
+
+    #[tokio::test]
+    async fn test_load_csv() {
+        let format = CsvFormat::new();
+        let data = Bytes::from("a,b,c\n1,2,3");
+        let doc = format.load(data).await.unwrap();
+        assert_eq!(doc.delimiter(), b',');
+        assert!(!doc.regions().is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_load_tsv() {
+        let format = CsvFormat::new();
+        let data = Bytes::from("a\tb\tc\n1\t2\t3");
+        let doc = format.load(data).await.unwrap();
+        assert_eq!(doc.delimiter(), b'\t');
+    }
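+
+    // Sketch of the auto-detection tie-break (follows from `load` above):
+    // a tab in the first line wins even when commas are also present.
+    #[tokio::test]
+    async fn test_detect_prefers_tab() {
+        let format = CsvFormat::new();
+        let data = Bytes::from("a\tb,c\n1\t2,3");
+        let doc = format.load(data).await.unwrap();
+        assert_eq!(doc.delimiter(), b'\t');
+    }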
+}
diff --git a/crates/nvisy-text/src/formats/ini.rs b/crates/nvisy-text/src/formats/ini.rs
new file mode 100644
index 0000000..bfe22ce
--- /dev/null
+++ b/crates/nvisy-text/src/formats/ini.rs
@@ -0,0 +1,98 @@
+//! INI format handler.
+
+use bytes::Bytes;
+use nvisy_document::{
+    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+};
+
+use crate::documents::IniDocument;
+
+/// INI document format handler.
+#[derive(Debug, Clone)]
+pub struct IniFormat {
+    capabilities: Capabilities,
+}
+
+impl IniFormat {
+    /// Creates a new INI format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities {
+                text: TextCapabilities {
+                    can_extract: true,
+                    has_rich_text: false,
+                    may_need_ocr: false,
+                },
+                structure: StructureCapabilities {
+                    can_detect_structure: true,
+                    can_detect_tables: false,
+                    has_pages: false,
+                },
+                ..Default::default()
+            },
+        }
+    }
+}
+
+impl Default for IniFormat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DocumentFormat for IniFormat {
+    type Document = IniDocument;
+
+    fn name(&self) -> &'static str {
+        "ini"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["text/plain"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["ini", "cfg", "conf", "config"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, data: Bytes) -> Result<Self::Document> {
+        let content = String::from_utf8_lossy(&data).into_owned();
+        Ok(IniDocument::new(content))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use nvisy_document::Document;
+
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = IniFormat::new();
+        assert_eq!(format.name(), "ini");
+        assert!(format.extensions().contains(&"ini"));
+        assert!(format.extensions().contains(&"cfg"));
+        assert!(format.extensions().contains(&"conf"));
+    }
+
+    #[test]
+    fn test_capabilities() {
+        let format = IniFormat::new();
+        let caps = format.capabilities();
+        assert!(caps.structure.can_detect_structure);
+    }
+
+    #[tokio::test]
+    async fn test_load_ini() {
+        let format = IniFormat::new();
+        let data = Bytes::from("[section]\nkey=value\nfoo=bar");
+        let doc = format.load(data).await.unwrap();
+        assert!(!doc.regions().is_empty());
+    }
+}
diff --git a/crates/nvisy-text/src/formats/json.rs b/crates/nvisy-text/src/formats/json.rs
new file mode 100644
index 0000000..163135e
--- /dev/null
+++ b/crates/nvisy-text/src/formats/json.rs
@@ -0,0 +1,98 @@
+//! JSON format handler.
+
+use bytes::Bytes;
+use nvisy_document::{
+    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+};
+
+use crate::documents::JsonDocument;
+
+/// JSON document format handler.
+#[derive(Debug, Clone)]
+pub struct JsonFormat {
+    capabilities: Capabilities,
+}
+
+impl JsonFormat {
+    /// Creates a new JSON format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities {
+                text: TextCapabilities {
+                    can_extract: true,
+                    has_rich_text: false,
+                    may_need_ocr: false,
+                },
+                structure: StructureCapabilities {
+                    can_detect_structure: true,
+                    can_detect_tables: false,
+                    has_pages: false,
+                },
+                ..Default::default()
+            },
+        }
+    }
+}
+
+impl Default for JsonFormat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DocumentFormat for JsonFormat {
+    type Document = JsonDocument;
+
+    fn name(&self) -> &'static str {
+        "json"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["application/json", "text/json"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["json"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, data: Bytes) -> Result<Self::Document> {
+        let content = String::from_utf8_lossy(&data).into_owned();
+        JsonDocument::new(content)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use nvisy_document::Document;
+
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = JsonFormat::new();
+        assert_eq!(format.name(), "json");
+        assert!(format.mime_types().contains(&"application/json"));
+        assert!(format.extensions().contains(&"json"));
+    }
+
+    #[tokio::test]
+    async fn test_load_document() {
+        let format = JsonFormat::new();
+        let data = Bytes::from(r#"{"hello": "world"}"#);
+        let doc = format.load(data).await.unwrap();
+        assert!(!doc.regions().is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_load_invalid_json() {
+        let format = JsonFormat::new();
+        let data = Bytes::from("not valid json {");
+        let result = format.load(data).await;
+        assert!(result.is_err());
+    }
+}
diff --git a/crates/nvisy-text/src/formats/markdown.rs b/crates/nvisy-text/src/formats/markdown.rs
new file mode 100644
index 0000000..d040865
--- /dev/null
+++ b/crates/nvisy-text/src/formats/markdown.rs
@@ -0,0 +1,99 @@
+//! Markdown format handler.
+
+use bytes::Bytes;
+use nvisy_document::{
+    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+};
+
+use crate::documents::MarkdownDocument;
+
+/// Markdown document format handler.
+#[derive(Debug, Clone)]
+pub struct MarkdownFormat {
+    capabilities: Capabilities,
+}
+
+impl MarkdownFormat {
+    /// Creates a new Markdown format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities {
+                text: TextCapabilities {
+                    can_extract: true,
+                    has_rich_text: true,
+                    may_need_ocr: false,
+                },
+                structure: StructureCapabilities {
+                    can_detect_structure: true,
+                    can_detect_tables: false,
+                    has_pages: false,
+                },
+                ..Default::default()
+            },
+        }
+    }
+}
+
+impl Default for MarkdownFormat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DocumentFormat for MarkdownFormat {
+    type Document = MarkdownDocument;
+
+    fn name(&self) -> &'static str {
+        "markdown"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["text/markdown", "text/x-markdown"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["md", "markdown", "mdown", "mkd"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, data: Bytes) -> Result<Self::Document> {
+        let content = String::from_utf8_lossy(&data).into_owned();
+        Ok(MarkdownDocument::new(content))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use nvisy_document::Document;
+
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = MarkdownFormat::new();
+        assert_eq!(format.name(), "markdown");
+        assert!(format.mime_types().contains(&"text/markdown"));
+        assert!(format.extensions().contains(&"md"));
+        assert!(format.extensions().contains(&"markdown"));
+    }
+
+    #[test]
+    fn test_capabilities() {
+        let format = MarkdownFormat::new();
+        let caps = format.capabilities();
+        assert!(caps.text.has_rich_text);
+        assert!(caps.structure.can_detect_structure);
+    }
+
+    #[tokio::test]
+    async fn test_load_document() {
+        let format = MarkdownFormat::new();
+        let data = Bytes::from("# Test\n\nContent here.");
+        let doc = format.load(data).await.unwrap();
+        assert!(!doc.regions().is_empty());
+    }
+}
diff --git a/crates/nvisy-text/src/formats/mod.rs b/crates/nvisy-text/src/formats/mod.rs
new file mode 100644
index 0000000..63010aa
--- /dev/null
+++ b/crates/nvisy-text/src/formats/mod.rs
@@ -0,0 +1,19 @@
+//! Text-based document format handlers.
+
+mod csv;
+mod ini;
+mod json;
+mod markdown;
+mod plain;
+mod toml;
+mod xml;
+mod yaml;
+
+pub use self::csv::CsvFormat;
+pub use self::ini::IniFormat;
+pub use self::json::JsonFormat;
+pub use self::markdown::MarkdownFormat;
+pub use self::plain::PlainTextFormat;
+pub use self::toml::TomlFormat;
+pub use self::xml::XmlFormat;
+pub use self::yaml::YamlFormat;
diff --git a/crates/nvisy-text/src/formats/plain.rs b/crates/nvisy-text/src/formats/plain.rs
new file mode 100644
index 0000000..5a03060
--- /dev/null
+++ b/crates/nvisy-text/src/formats/plain.rs
@@ -0,0 +1,74 @@
+//! Plain text format handler.
+
+use bytes::Bytes;
+use nvisy_document::{Capabilities, DocumentFormat, Result};
+
+use crate::documents::PlainTextDocument;
+
+/// Plain text document format handler.
+#[derive(Debug, Clone)]
+pub struct PlainTextFormat {
+    capabilities: Capabilities,
+}
+
+impl PlainTextFormat {
+    /// Creates a new plain text format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities::text(),
+        }
+    }
+}
+
+impl Default for PlainTextFormat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DocumentFormat for PlainTextFormat {
+    type Document = PlainTextDocument;
+
+    fn name(&self) -> &'static str {
+        "plain-text"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["text/plain"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["txt", "text"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, data: Bytes) -> Result<Self::Document> {
+        let content = String::from_utf8_lossy(&data).into_owned();
+        Ok(PlainTextDocument::new(content))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = PlainTextFormat::new();
+        assert_eq!(format.name(), "plain-text");
+        assert!(format.mime_types().contains(&"text/plain"));
+        assert!(format.extensions().contains(&"txt"));
+    }
+
+    #[tokio::test]
+    async fn test_load_document() {
+        let format = PlainTextFormat::new();
+        let data = Bytes::from("Hello, world!");
+        let doc = format.load(data).await.unwrap();
+        assert_eq!(doc.content(), "Hello, world!");
+    }
+}
diff --git a/crates/nvisy-text/src/formats/toml.rs b/crates/nvisy-text/src/formats/toml.rs
new file mode 100644
index 0000000..b0411a5
--- /dev/null
+++ b/crates/nvisy-text/src/formats/toml.rs
@@ -0,0 +1,97 @@
+//! TOML format handler.
+
+use bytes::Bytes;
+use nvisy_document::{
+    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+};
+
+use crate::documents::TomlDocument;
+
+/// TOML document format handler.
+#[derive(Debug, Clone)]
+pub struct TomlFormat {
+    capabilities: Capabilities,
+}
+
+impl TomlFormat {
+    /// Creates a new TOML format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities {
+                text: TextCapabilities {
+                    can_extract: true,
+                    has_rich_text: false,
+                    may_need_ocr: false,
+                },
+                structure: StructureCapabilities {
+                    can_detect_structure: true,
+                    can_detect_tables: false,
+                    has_pages: false,
+                },
+                ..Default::default()
+            },
+        }
+    }
+}
+
+impl Default for TomlFormat {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DocumentFormat for TomlFormat {
+    type Document = TomlDocument;
+
+    fn name(&self) -> &'static str {
+        "toml"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["application/toml"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["toml"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, data: Bytes) -> Result<Self::Document> {
+        let content = String::from_utf8_lossy(&data).into_owned();
+        Ok(TomlDocument::new(content))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use nvisy_document::Document;
+
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = TomlFormat::new();
+        assert_eq!(format.name(), "toml");
+        assert!(format.mime_types().contains(&"application/toml"));
+        assert!(format.extensions().contains(&"toml"));
+    }
+
+    #[test]
+    fn test_capabilities() {
+        let format = TomlFormat::new();
+        let caps = format.capabilities();
+        assert!(caps.structure.can_detect_structure);
+    }
+
+    #[tokio::test]
+    async fn test_load_toml() {
+        let format = TomlFormat::new();
+        let data = Bytes::from("[package]\nname = \"test\"\nversion = \"1.0\"");
+        let doc = format.load(data).await.unwrap();
+        assert!(!doc.regions().is_empty());
+    }
+}
diff --git a/crates/nvisy-text/src/formats/xml.rs b/crates/nvisy-text/src/formats/xml.rs
new file mode 100644
index 0000000..2113191
--- /dev/null
+++ b/crates/nvisy-text/src/formats/xml.rs
@@ -0,0 +1,99 @@
+//! XML format handler.
+
+use bytes::Bytes;
+use nvisy_document::{
+    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+};
+
+use crate::documents::XmlDocument;
+
+/// XML document format handler.
+#[derive(Debug, Clone)]
+pub struct XmlFormat {
+    capabilities: Capabilities,
+}
+
+impl XmlFormat {
+    /// Creates a new XML format handler.
+ #[must_use] + pub fn new() -> Self { + Self { + capabilities: Capabilities { + text: TextCapabilities { + can_extract: true, + has_rich_text: false, + may_need_ocr: false, + }, + structure: StructureCapabilities { + can_detect_structure: true, + can_detect_tables: false, + has_pages: false, + }, + ..Default::default() + }, + } + } +} + +impl Default for XmlFormat { + fn default() -> Self { + Self::new() + } +} + +impl DocumentFormat for XmlFormat { + type Document = XmlDocument; + + fn name(&self) -> &'static str { + "xml" + } + + fn mime_types(&self) -> &'static [&'static str] { + &["application/xml", "text/xml"] + } + + fn extensions(&self) -> &'static [&'static str] { + &["xml", "xsd", "xsl", "xslt", "svg", "xhtml", "plist"] + } + + fn capabilities(&self) -> &Capabilities { + &self.capabilities + } + + async fn load(&self, data: Bytes) -> Result<Self::Document> { + let content = String::from_utf8_lossy(&data).into_owned(); + Ok(XmlDocument::new(content)) + } +} + +#[cfg(test)] +mod tests { + use nvisy_document::Document; + + use super::*; + + #[test] + fn test_format_metadata() { + let format = XmlFormat::new(); + assert_eq!(format.name(), "xml"); + assert!(format.mime_types().contains(&"application/xml")); + assert!(format.extensions().contains(&"xml")); + assert!(format.extensions().contains(&"svg")); + } + + #[test] + fn test_capabilities() { + let format = XmlFormat::new(); + let caps = format.capabilities(); + assert!(caps.structure.can_detect_structure); + assert!(!caps.structure.can_detect_tables); + } + + #[tokio::test] + async fn test_load_xml() { + let format = XmlFormat::new(); + let data = Bytes::from("<root>content</root>"); + let doc = format.load(data).await.unwrap(); + assert!(!doc.regions().is_empty()); + } +} diff --git a/crates/nvisy-text/src/formats/yaml.rs b/crates/nvisy-text/src/formats/yaml.rs new file mode 100644 index 0000000..63d47a5 --- /dev/null +++ b/crates/nvisy-text/src/formats/yaml.rs @@ -0,0 +1,98 @@ +//! YAML format handler. + +use bytes::Bytes; +use nvisy_document::{ + Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities, +}; + +use crate::documents::YamlDocument; + +/// YAML document format handler. +#[derive(Debug, Clone)] +pub struct YamlFormat { + capabilities: Capabilities, +} + +impl YamlFormat { + /// Creates a new YAML format handler.
+ #[must_use] + pub fn new() -> Self { + Self { + capabilities: Capabilities { + text: TextCapabilities { + can_extract: true, + has_rich_text: false, + may_need_ocr: false, + }, + structure: StructureCapabilities { + can_detect_structure: true, + can_detect_tables: false, + has_pages: false, + }, + ..Default::default() + }, + } + } +} + +impl Default for YamlFormat { + fn default() -> Self { + Self::new() + } +} + +impl DocumentFormat for YamlFormat { + type Document = YamlDocument; + + fn name(&self) -> &'static str { + "yaml" + } + + fn mime_types(&self) -> &'static [&'static str] { + &["application/x-yaml", "text/yaml", "application/yaml"] + } + + fn extensions(&self) -> &'static [&'static str] { + &["yaml", "yml"] + } + + fn capabilities(&self) -> &Capabilities { + &self.capabilities + } + + async fn load(&self, data: Bytes) -> Result { + let content = String::from_utf8_lossy(&data).into_owned(); + Ok(YamlDocument::new(content)) + } +} + +#[cfg(test)] +mod tests { + use nvisy_document::Document; + + use super::*; + + #[test] + fn test_format_metadata() { + let format = YamlFormat::new(); + assert_eq!(format.name(), "yaml"); + assert!(format.mime_types().contains(&"application/x-yaml")); + assert!(format.extensions().contains(&"yaml")); + assert!(format.extensions().contains(&"yml")); + } + + #[test] + fn test_capabilities() { + let format = YamlFormat::new(); + let caps = format.capabilities(); + assert!(caps.structure.can_detect_structure); + } + + #[tokio::test] + async fn test_load_yaml() { + let format = YamlFormat::new(); + let data = Bytes::from("key: value\nlist:\n - item1\n - item2"); + let doc = format.load(data).await.unwrap(); + assert!(!doc.regions().is_empty()); + } +} diff --git a/crates/nvisy-text/src/lib.rs b/crates/nvisy-text/src/lib.rs index b8b6981..a54d6d0 100644 --- a/crates/nvisy-text/src/lib.rs +++ b/crates/nvisy-text/src/lib.rs @@ -2,8 +2,26 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -mod document; -mod format; +pub mod documents; +pub mod formats; -pub use document::TextDocument; -pub use format::TextFormat; +// Re-export document types +pub use documents::{ + CsvDocument, IniDocument, JsonDocument, MarkdownDocument, PlainTextDocument, TomlDocument, + XmlDocument, YamlDocument, +}; + +// Re-export format handlers +pub use formats::{ + CsvFormat, IniFormat, JsonFormat, MarkdownFormat, PlainTextFormat, TomlFormat, XmlFormat, + YamlFormat, +}; + +// Legacy aliases for backwards compatibility +pub use PlainTextDocument as TextDocument; +pub use PlainTextFormat as TextFormat; + +// Re-export commonly used types from nvisy-document +pub use nvisy_document::{ + Document, DocumentFormat, ExtractedText, NormalizedTable, Region, TableExtractor, TextExtractor, +}; From aadcc5bd5e473893c1f28c657c74d97cde004f94 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Fri, 16 Jan 2026 21:28:16 +0100 Subject: [PATCH 3/5] refactor: use nvisy-core error handling, split image formats - Remove nvisy-document error module, re-export from nvisy-core - Add load_file method to DocumentFormat trait - Use data.as_string()? 
instead of String::from_utf8_lossy in nvisy-text - Split ImageFormat into JpegFormat and PngFormat - Register image formats in nvisy-engine --- Cargo.lock | 1 + Cargo.toml | 25 +- crates/nvisy-core/src/io/content.rs | 32 +- crates/nvisy-core/src/io/content_data.rs | 334 ++++++++-------- crates/nvisy-core/src/io/data_reference.rs | 6 +- crates/nvisy-document/Cargo.toml | 2 +- crates/nvisy-document/src/conversion/mod.rs | 2 +- crates/nvisy-document/src/error.rs | 358 ------------------ crates/nvisy-document/src/format/mod.rs | 29 +- crates/nvisy-document/src/lib.rs | 30 +- crates/nvisy-document/src/metadata/mod.rs | 2 +- crates/nvisy-document/src/table/mod.rs | 2 +- crates/nvisy-document/src/text/mod.rs | 2 +- crates/nvisy-document/src/thumbnail/mod.rs | 2 +- crates/nvisy-docx/src/document.rs | 4 +- crates/nvisy-docx/src/format.rs | 9 +- crates/nvisy-engine/Cargo.toml | 4 +- crates/nvisy-engine/src/engine/mod.rs | 13 +- crates/nvisy-engine/src/registry/mod.rs | 36 +- .../src/{document.rs => documents/jpeg.rs} | 20 +- crates/nvisy-image/src/documents/mod.rs | 7 + crates/nvisy-image/src/documents/png.rs | 54 +++ crates/nvisy-image/src/format.rs | 82 ---- crates/nvisy-image/src/formats/jpeg.rs | 71 ++++ crates/nvisy-image/src/formats/mod.rs | 7 + crates/nvisy-image/src/formats/png.rs | 70 ++++ crates/nvisy-image/src/lib.rs | 8 +- crates/nvisy-pdf/src/document.rs | 4 +- crates/nvisy-pdf/src/format.rs | 7 +- crates/nvisy-text/README.md | 5 +- crates/nvisy-text/src/documents/json.rs | 2 +- crates/nvisy-text/src/formats/csv.rs | 11 +- crates/nvisy-text/src/formats/ini.rs | 9 +- crates/nvisy-text/src/formats/json.rs | 11 +- crates/nvisy-text/src/formats/markdown.rs | 9 +- crates/nvisy-text/src/formats/plain.rs | 9 +- crates/nvisy-text/src/formats/toml.rs | 9 +- crates/nvisy-text/src/formats/xml.rs | 9 +- crates/nvisy-text/src/formats/yaml.rs | 9 +- 39 files changed, 524 insertions(+), 782 deletions(-) delete mode 100644 crates/nvisy-document/src/error.rs rename crates/nvisy-image/src/{document.rs => documents/jpeg.rs} (69%) create mode 100644 crates/nvisy-image/src/documents/mod.rs create mode 100644 crates/nvisy-image/src/documents/png.rs delete mode 100644 crates/nvisy-image/src/format.rs create mode 100644 crates/nvisy-image/src/formats/jpeg.rs create mode 100644 crates/nvisy-image/src/formats/mod.rs create mode 100644 crates/nvisy-image/src/formats/png.rs diff --git a/Cargo.lock b/Cargo.lock index ccd2d32..5accc23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -713,6 +713,7 @@ dependencies = [ "nvisy-archive", "nvisy-document", "nvisy-docx", + "nvisy-image", "nvisy-pdf", "nvisy-text", "serde", diff --git a/Cargo.toml b/Cargo.toml index 60a8c09..9b787b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,14 +43,7 @@ nvisy-text = { path = "./crates/nvisy-text", version = "0.1.0", features = [] } # Async runtime and I/O tokio = { version = "1.49", default-features = false, features = [] } -tokio-stream = { version = "0.1", default-features = false, features = [] } -tokio-util = { version = "0.7", default-features = false, features = [] } -futures = { version = "0.3", default-features = false, features = [] } async-trait = { version = "0.1", default-features = false, features = [] } - -# File system utilities -walkdir = { version = "2.5", default-features = false, features = [] } -memmap2 = { version = "0.9", default-features = false, features = [] } tempfile = { version = "3.24", default-features = false, features = [] } # Multithreading @@ -64,44 +57,28 @@ csv = { version = "1.4", default-features = 
false, features = [] } # Data types and utilities uuid = { version = "1.19", features = [] } jiff = { version = "0.2", default-features = false, features = [] } -size = { version = "0.5", default-features = false, features = [] } bytes = { version = "1.11", default-features = false, features = [] } -rust_decimal = { version = "1.36", default-features = false, features = [] } semver = { version = "1.0", default-features = false, features = [] } -isolang = { version = "2.4", default-features = false, features = ["english_names"] } -# Text processing and pattern matching +# Text processing markdown = { version = "1.0.0", default-features = false, features = [] } -regex = { version = "1.11", default-features = false, features = [] } -regex-lite = { version = "0.1", default-features = false, features = ["std"] } -fancy-regex = { version = "0.16", default-features = false, features = [] } -aho-corasick = { version = "1.1", default-features = false, features = [] } -unicode-segmentation = { version = "1.10", default-features = false, features = [] } hipstr = { version = "0.8", default-features = false, features = [] } # Cryptography and hashing sha2 = { version = "0.10", default-features = false, features = [] } -blake3 = { version = "1.8", default-features = false, features = [] } base64 = { version = "0.22", default-features = false, features = [] } hex = { version = "0.4", features = [] } zeroize = { version = "1.7", default-features = false, features = [] } -rand = { version = "0.9", default-features = false, features = [] } # Error handling thiserror = { version = "2.0", features = [] } -anyhow = { version = "1.0", features = ["backtrace"] } # Tracing and observability tracing = { version = "0.1", features = [] } -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } # Macros and derive utilities derive_more = { version = "2.0", default-features = false, features = [] } strum = { version = "0.27", default-features = false, features = [] } -const_format = { version = "0.2", default-features = false, features = [] } # Testing utilities tokio-test = { version = "0.4", default-features = false, features = [] } -proptest = { version = "1.4", default-features = false, features = [] } -criterion = { version = "0.7", default-features = false, features = [] } -rstest = { version = "0.26", default-features = false, features = [] } diff --git a/crates/nvisy-core/src/io/content.rs b/crates/nvisy-core/src/io/content.rs index b3870f4..93de761 100644 --- a/crates/nvisy-core/src/io/content.rs +++ b/crates/nvisy-core/src/io/content.rs @@ -7,6 +7,7 @@ use derive_more::{AsRef, Deref}; use serde::{Deserialize, Serialize}; use super::ContentData; +use crate::error::Result; use crate::fs::ContentMetadata; use crate::path::ContentSource; @@ -28,7 +29,7 @@ use crate::path::ContentSource; /// let content = Content::new(data); /// /// assert_eq!(content.size(), 13); -/// assert!(content.is_text()); +/// assert!(content.is_likely_text()); /// /// // Create content with metadata /// let source = ContentSource::new(); @@ -88,33 +89,18 @@ impl Content { self.data.content_source } - /// Get the size of the content in bytes - pub fn size(&self) -> usize { - self.data.size() - } - - /// Check if the content is empty - pub fn is_empty(&self) -> bool { - self.data.is_empty() - } - - /// Check if the content is stored as text - pub fn is_text(&self) -> bool { - self.data.is_text() - } - - /// Check if the content is stored as binary - pub fn is_binary(&self) -> bool { - self.data.is_binary() - } - /// Get 
the content as bytes pub fn as_bytes(&self) -> &[u8] { self.data.as_bytes() } + /// Returns `true` if the content appears to be text. + pub fn is_likely_text(&self) -> bool { + self.data.is_likely_text() + } + /// Try to get the content as a string slice - pub fn as_str(&self) -> crate::error::Result<&str> { + pub fn as_str(&self) -> Result<&str> { self.data.as_str() } @@ -159,7 +145,7 @@ mod tests { let content = Content::new(data.clone()); assert_eq!(content.size(), 13); - assert!(content.is_text()); + assert!(content.is_likely_text()); assert!(content.metadata().is_none()); } diff --git a/crates/nvisy-core/src/io/content_data.rs b/crates/nvisy-core/src/io/content_data.rs index 1f08bc3..8f41af3 100644 --- a/crates/nvisy-core/src/io/content_data.rs +++ b/crates/nvisy-core/src/io/content_data.rs @@ -4,6 +4,7 @@ //! along with its metadata and source information. use std::fmt; +use std::ops::Deref; use std::sync::OnceLock; use bytes::Bytes; @@ -14,150 +15,158 @@ use sha2::{Digest, Sha256}; use crate::error::{Error, ErrorResource, ErrorType, Result}; use crate::path::ContentSource; -/// The underlying data storage type for content +/// A wrapper around `Bytes` for content storage. /// -/// This enum allows content to be stored as either binary data (`Bytes`) -/// or text data (`HipStr`). Both types are cheap to clone as they use -/// reference counting internally. -#[derive(Debug, Clone, PartialEq, Eq)] +/// This struct wraps `bytes::Bytes` and provides additional methods +/// for text conversion. It's cheap to clone as `Bytes` uses reference +/// counting internally. +#[derive(Debug, Clone, PartialEq, Eq, Default)] #[derive(Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum ContentBytes { - /// Binary data stored as `bytes::Bytes` - Binary(Bytes), - /// Text data stored as `hipstr::HipStr<'static>` (owned) - Text(HipStr<'static>), -} +#[serde(transparent)] +pub struct ContentBytes(Bytes); impl ContentBytes { - /// Get the size of the content in bytes + /// Creates a new `ContentBytes` from raw bytes. + #[must_use] + pub fn new(bytes: Bytes) -> Self { + Self(bytes) + } + + /// Returns the size of the content in bytes. + #[must_use] pub fn len(&self) -> usize { - match self { - Self::Binary(bytes) => bytes.len(), - Self::Text(text) => text.len(), - } + self.0.len() } - /// Check if the content is empty + /// Returns `true` if the content is empty. + #[must_use] pub fn is_empty(&self) -> bool { - match self { - Self::Binary(bytes) => bytes.is_empty(), - Self::Text(text) => text.is_empty(), - } + self.0.is_empty() } - /// Get the content as a byte slice + /// Returns the content as a byte slice. + #[must_use] pub fn as_bytes(&self) -> &[u8] { - match self { - Self::Binary(bytes) => bytes, - Self::Text(text) => text.as_bytes(), - } + &self.0 } - /// Check if this is text content - pub fn is_text(&self) -> bool { - matches!(self, Self::Text(_)) + /// Tries to return the content as a string slice. + /// + /// Returns `None` if the content is not valid UTF-8. + #[must_use] + pub fn as_str(&self) -> Option<&str> { + std::str::from_utf8(&self.0).ok() } - /// Check if this is binary content - pub fn is_binary(&self) -> bool { - matches!(self, Self::Binary(_)) + /// Converts to a `HipStr` if the content is valid UTF-8. + /// + /// # Errors + /// + /// Returns an error if the content is not valid UTF-8. 
+ pub fn as_hipstr(&self) -> Result<HipStr<'static>> { + let s = std::str::from_utf8(&self.0).map_err(|e| { + Error::new(format!("Invalid UTF-8: {e}")) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) + })?; + Ok(HipStr::from(s)) + } + + /// Returns the underlying `Bytes`. + #[must_use] + pub fn to_bytes(&self) -> Bytes { + self.0.clone() + } - /// Try to get the content as a string slice - pub fn as_str(&self) -> Option<&str> { - match self { - Self::Binary(bytes) => std::str::from_utf8(bytes).ok(), - Self::Text(text) => Some(text.as_str()), - } + /// Consumes and returns the underlying `Bytes`. + #[must_use] + pub fn into_bytes(self) -> Bytes { + self.0 } - /// Convert to Bytes (clones if text) - pub fn to_bytes(&self) -> Bytes { - match self { - Self::Binary(bytes) => bytes.clone(), - Self::Text(text) => Bytes::copy_from_slice(text.as_bytes()), - } + /// Returns `true` if the content appears to be text. + /// + /// Uses a simple heuristic: checks if all bytes are ASCII printable + /// or whitespace characters. + #[must_use] + pub fn is_likely_text(&self) -> bool { + self.0 + .iter() + .all(|&b| b.is_ascii_graphic() || b.is_ascii_whitespace()) } +} - /// Convert to HipStr if valid UTF-8 - pub fn to_hipstr(&self) -> Result<HipStr<'static>> { - match self { - Self::Binary(bytes) => { - let s = std::str::from_utf8(bytes).map_err(|e| { - Error::new(format!("Invalid UTF-8: {e}")) - .with_type(ErrorType::Runtime) - .with_resource(ErrorResource::Core) - })?; - Ok(HipStr::from(s)) - } - Self::Text(text) => Ok(text.clone()), - } +impl Deref for ContentBytes { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + &self.0 } } -impl Default for ContentBytes { - fn default() -> Self { - Self::Binary(Bytes::new()) +impl AsRef<[u8]> for ContentBytes { + fn as_ref(&self) -> &[u8] { + &self.0 } } impl From<&str> for ContentBytes { fn from(s: &str) -> Self { - Self::Text(HipStr::from(s)) + Self(Bytes::copy_from_slice(s.as_bytes())) } } impl From<String> for ContentBytes { fn from(s: String) -> Self { - Self::Text(HipStr::from(s)) + Self(Bytes::from(s)) } } impl From<HipStr<'static>> for ContentBytes { fn from(s: HipStr<'static>) -> Self { - Self::Text(s) + Self(Bytes::copy_from_slice(s.as_bytes())) } } impl From<&[u8]> for ContentBytes { fn from(bytes: &[u8]) -> Self { - Self::Binary(Bytes::copy_from_slice(bytes)) + Self(Bytes::copy_from_slice(bytes)) } } impl From<Vec<u8>> for ContentBytes { fn from(vec: Vec<u8>) -> Self { - Self::Binary(Bytes::from(vec)) + Self(Bytes::from(vec)) } } impl From<Bytes> for ContentBytes { fn from(bytes: Bytes) -> Self { - Self::Binary(bytes) + Self(bytes) } } -/// Content data with metadata and computed hashes +/// Content data with metadata and computed hashes. /// -/// This struct wraps [`ContentBytes`] (either `Bytes` or `HipStr`) and stores content data -/// along with metadata about its source and optional computed SHA256 hash. +/// This struct wraps [`ContentBytes`] and stores content data along with +/// metadata about its source and optional computed SHA256 hash. /// It's designed to be cheap to clone using reference-counted types. -/// The SHA256 hash is lazily computed using `OnceLock` for lock-free access after initialization. +/// The SHA256 hash is lazily computed using `OnceLock` for lock-free +/// access after initialization. #[derive(Debug)] #[derive(Serialize, Deserialize)] pub struct ContentData { - /// Unique identifier for the content source + /// Unique identifier for the content source. pub content_source: ContentSource, - /// The actual content data (binary or text) + /// The actual content data.
data: ContentBytes, - /// Lazily computed SHA256 hash of the content + /// Lazily computed SHA256 hash of the content. #[serde(skip)] sha256_cache: OnceLock<Bytes>, } impl ContentData { - /// Create new content data from bytes + /// Creates new content data from bytes. /// /// # Example /// @@ -174,12 +183,12 @@ impl ContentData { pub fn new(content_source: ContentSource, data: Bytes) -> Self { Self { content_source, - data: ContentBytes::Binary(data), + data: ContentBytes::new(data), sha256_cache: OnceLock::new(), } } - /// Create new content data from text + /// Creates new content data from text. /// /// # Example /// @@ -189,18 +198,17 @@ impl ContentData { /// let source = ContentSource::new(); /// let content = ContentData::from_text(source, "Hello, world!"); /// - /// assert!(content.is_text()); /// assert_eq!(content.as_str().unwrap(), "Hello, world!"); /// ``` - pub fn from_text(content_source: ContentSource, text: impl Into<HipStr<'static>>) -> Self { + pub fn from_text(content_source: ContentSource, text: impl Into<String>) -> Self { Self { content_source, - data: ContentBytes::Text(text.into()), + data: ContentBytes::from(text.into()), sha256_cache: OnceLock::new(), } } - /// Create content data with explicit content bytes type + /// Creates content data with explicit `ContentBytes`. pub fn with_content_bytes(content_source: ContentSource, data: ContentBytes) -> Self { Self { content_source, @@ -209,13 +217,15 @@ impl ContentData { } } - /// Get the size of the content in bytes + /// Returns the size of the content in bytes. + #[must_use] pub fn size(&self) -> usize { self.data.len() } - /// Get pretty formatted size string + /// Returns a pretty formatted size string. #[allow(clippy::cast_precision_loss)] + #[must_use] pub fn get_pretty_size(&self) -> String { let bytes = self.size(); match bytes { @@ -226,100 +236,91 @@ impl ContentData { } } - /// Get the content data as bytes slice + /// Returns the content data as a byte slice. + #[must_use] pub fn as_bytes(&self) -> &[u8] { self.data.as_bytes() } - /// Get the underlying content bytes + /// Returns a reference to the underlying `ContentBytes`. + #[must_use] pub fn content_bytes(&self) -> &ContentBytes { &self.data } - /// Convert the content data to Bytes + /// Converts the content data to `Bytes`. + #[must_use] pub fn to_bytes(&self) -> Bytes { self.data.to_bytes() } - /// Consume and convert into Bytes + /// Consumes and converts into `Bytes`. + #[must_use] pub fn into_bytes(self) -> Bytes { - match self.data { - ContentBytes::Binary(bytes) => bytes, - ContentBytes::Text(text) => Bytes::copy_from_slice(text.as_bytes()), - } - } - - /// Check if the content is stored as text - pub fn is_text(&self) -> bool { - self.data.is_text() - } - - /// Check if the content is stored as binary - pub fn is_binary(&self) -> bool { - self.data.is_binary() + self.data.into_bytes() } - /// Check if the content is likely text (basic heuristic for binary data) + /// Returns `true` if the content appears to be text. + /// + /// Uses a simple heuristic: checks if all bytes are ASCII printable + /// or whitespace characters. + #[must_use] pub fn is_likely_text(&self) -> bool { - match &self.data { - ContentBytes::Text(_) => true, - ContentBytes::Binary(bytes) => bytes - .iter() - .all(|&b| b.is_ascii_graphic() || b.is_ascii_whitespace()), - } + self.data.is_likely_text() } - /// Try to convert the content data to a UTF-8 string + /// Tries to convert the content data to a UTF-8 string.
/// /// # Errors /// /// Returns an error if the content data contains invalid UTF-8 sequences. pub fn as_string(&self) -> Result<String> { - match &self.data { - ContentBytes::Text(text) => Ok(text.to_string()), - ContentBytes::Binary(bytes) => String::from_utf8(bytes.to_vec()).map_err(|e| { - Error::new(format!("Invalid UTF-8: {e}")) - .with_type(ErrorType::Runtime) - .with_resource(ErrorResource::Core) - }), - } + self.data.as_hipstr().map(|s| s.to_string()) } - /// Try to convert the content data to a UTF-8 string slice + /// Tries to convert the content data to a UTF-8 string slice. /// /// # Errors /// /// Returns an error if the content data contains invalid UTF-8 sequences. pub fn as_str(&self) -> Result<&str> { - match &self.data { - ContentBytes::Text(text) => Ok(text.as_str()), - ContentBytes::Binary(bytes) => std::str::from_utf8(bytes).map_err(|e| { - Error::new(format!("Invalid UTF-8: {e}")) - .with_type(ErrorType::Runtime) - .with_resource(ErrorResource::Core) - }), - } + std::str::from_utf8(self.data.as_bytes()).map_err(|e| { + Error::new(format!("Invalid UTF-8: {e}")) + .with_type(ErrorType::Runtime) + .with_resource(ErrorResource::Core) + }) } - /// Compute SHA256 hash of the content + /// Converts to a `HipStr` if the content is valid UTF-8. + /// + /// # Errors + /// + /// Returns an error if the content is not valid UTF-8. + pub fn as_hipstr(&self) -> Result<HipStr<'static>> { + self.data.as_hipstr() + } + + /// Computes SHA256 hash of the content. fn compute_sha256_internal(&self) -> Bytes { let mut hasher = Sha256::new(); hasher.update(self.data.as_bytes()); Bytes::from(hasher.finalize().to_vec()) } - /// Get the SHA256 hash, computing it if not already done + /// Returns the SHA256 hash, computing it if not already done. + #[must_use] pub fn sha256(&self) -> &Bytes { self.sha256_cache .get_or_init(|| self.compute_sha256_internal()) } - /// Get the SHA256 hash as hex string + /// Returns the SHA256 hash as a hex string. + #[must_use] pub fn sha256_hex(&self) -> String { hex::encode(self.sha256()) } - /// Verify the content against a provided SHA256 hash + /// Verifies the content against a provided SHA256 hash. /// /// # Errors /// @@ -341,11 +342,12 @@ impl ContentData { } } - /// Get a slice of the content data + /// Returns a slice of the content data. /// /// # Errors /// - /// Returns an error if the end index is beyond the content length or if start is greater than end. + /// Returns an error if the end index is beyond the content length + /// or if start is greater than end. pub fn slice(&self, start: usize, end: usize) -> Result<Bytes> { let bytes = self.data.as_bytes(); if end > bytes.len() { @@ -367,13 +369,13 @@ impl ContentData { Ok(Bytes::copy_from_slice(&bytes[start..end])) } - /// Check if the content is empty + /// Returns `true` if the content is empty.
+ #[must_use] pub fn is_empty(&self) -> bool { self.data.is_empty() } } -// Manual implementation of Clone since OnceLock doesn't propagate the computed value impl Clone for ContentData { fn clone(&self) -> Self { let new_lock = OnceLock::new(); @@ -390,7 +392,6 @@ impl Clone for ContentData { } } -// Manual implementation of PartialEq impl PartialEq for ContentData { fn eq(&self, other: &Self) -> bool { self.content_source == other.content_source && self.data == other.data @@ -399,7 +400,6 @@ impl PartialEq for ContentData { impl Eq for ContentData {} -// Implement From conversions for common types impl From<&str> for ContentData { fn from(s: &str) -> Self { let source = ContentSource::new(); @@ -438,7 +438,7 @@ impl From<Vec<u8>> for ContentData { impl From<HipStr<'static>> for ContentData { fn from(text: HipStr<'static>) -> Self { let source = ContentSource::new(); - Self::from_text(source, text) + Self::from_text(source, text.to_string()) } } @@ -464,7 +464,6 @@ mod tests { assert_eq!(content.content_source, source); assert_eq!(content.size(), 13); - // Check that hash is not computed yet assert!(content.sha256_cache.get().is_none()); } @@ -473,26 +472,34 @@ mod tests { let source = ContentSource::new(); let content = ContentData::from_text(source, "Hello, world!"); - assert!(content.is_text()); - assert!(!content.is_binary()); assert_eq!(content.as_str().unwrap(), "Hello, world!"); } #[test] - fn test_content_bytes_text() { - let text = ContentBytes::from("Hello"); - assert!(text.is_text()); - assert!(!text.is_binary()); - assert_eq!(text.as_str(), Some("Hello")); - assert_eq!(text.len(), 5); + fn test_content_bytes_wrapper() { + let bytes = ContentBytes::from("Hello"); + assert_eq!(bytes.as_str(), Some("Hello")); + assert_eq!(bytes.len(), 5); + assert!(!bytes.is_empty()); + } + + #[test] + fn test_content_bytes_as_hipstr() { + let bytes = ContentBytes::from("Hello, HipStr!"); + let hipstr = bytes.as_hipstr().unwrap(); + assert_eq!(hipstr.as_str(), "Hello, HipStr!"); + + // Test with invalid UTF-8 + let invalid = ContentBytes::from(vec![0xFF, 0xFE]); + assert!(invalid.as_hipstr().is_err()); } #[test] fn test_content_bytes_binary() { let binary = ContentBytes::from(vec![0xFF, 0xFE]); - assert!(binary.is_binary()); - assert!(!binary.is_text()); assert_eq!(binary.len(), 2); + assert!(binary.as_str().is_none()); + assert!(!binary.is_likely_text()); } #[test] @@ -510,9 +517,8 @@ mod tests { let hash = content.sha256(); assert!(content.sha256_cache.get().is_some()); - assert_eq!(hash.len(), 32); // SHA256 is 32 bytes + assert_eq!(hash.len(), 32); - // Test getting cached hash let hash2 = content.sha256(); assert_eq!(hash, hash2); } @@ -522,10 +528,8 @@ mod tests { let content = ContentData::from("Hello, world!"); let hash = content.sha256().clone(); - // Should verify successfully against itself assert!(content.verify_sha256(&hash).is_ok()); - // Should fail against different hash let wrong_hash = vec![0u8; 32]; assert!(content.verify_sha256(&wrong_hash).is_err()); } @@ -541,6 +545,16 @@ mod tests { assert!(binary_content.as_str().is_err()); } + #[test] + fn test_as_hipstr() { + let content = ContentData::from("Hello, HipStr!"); + let hipstr = content.as_hipstr().unwrap(); + assert_eq!(hipstr.as_str(), "Hello, HipStr!"); + + let binary_content = ContentData::from(vec![0xFF, 0xFE]); + assert!(binary_content.as_hipstr().is_err()); + } + #[test] fn test_is_likely_text() { let text_content = ContentData::from("Hello, world!"); @@ -560,7 +574,6 @@ mod tests { let slice = content.slice(7, 12).unwrap(); assert_eq!(slice,
Bytes::from("world")); - // Test bounds checking assert!(content.slice(0, 100).is_err()); assert!(content.slice(10, 5).is_err()); } @@ -578,10 +591,6 @@ mod tests { assert_eq!(from_bytes.as_str().unwrap(), "test"); assert_eq!(from_vec.as_str().unwrap(), "test"); assert_eq!(from_bytes_type.as_str().unwrap(), "test"); - - // Text types should be stored as text - assert!(from_str.is_text()); - assert!(from_string.is_text()); } #[test] @@ -596,12 +605,10 @@ mod tests { #[test] fn test_cloning_preserves_hash() { let original = ContentData::from("Hello, world!"); - // Compute hash first let _ = original.sha256(); let cloned = original.clone(); - // Both should have the hash computed assert!(original.sha256_cache.get().is_some()); assert!(cloned.sha256_cache.get().is_some()); assert_eq!(original.sha256(), cloned.sha256()); @@ -612,7 +619,6 @@ mod tests { let original = ContentData::from("Hello, world!"); let cloned = original.clone(); - // They should be equal assert_eq!(original, cloned); } @@ -645,7 +651,13 @@ mod tests { fn test_from_hipstr() { let hipstr = HipStr::from("Hello from HipStr"); let content = ContentData::from(hipstr); - assert!(content.is_text()); assert_eq!(content.as_str().unwrap(), "Hello from HipStr"); } + + #[test] + fn test_content_bytes_deref() { + let bytes = ContentBytes::from("Hello"); + assert_eq!(&*bytes, b"Hello"); + assert_eq!(bytes.as_ref(), b"Hello"); + } } diff --git a/crates/nvisy-core/src/io/data_reference.rs b/crates/nvisy-core/src/io/data_reference.rs index f97eb1a..cf98854 100644 --- a/crates/nvisy-core/src/io/data_reference.rs +++ b/crates/nvisy-core/src/io/data_reference.rs @@ -83,8 +83,8 @@ impl DataReference { } /// Check if the content is text-based - pub fn is_text(&self) -> bool { - self.content.is_text() + pub fn is_likely_text(&self) -> bool { + self.content.is_likely_text() } /// Get the size of the content in bytes @@ -104,7 +104,7 @@ mod tests { let content = Content::new(ContentData::from("Hello, world!")); let data_ref = DataReference::new(content); - assert!(data_ref.is_text()); + assert!(data_ref.is_likely_text()); assert!(data_ref.mapping_id().is_none()); assert_eq!(data_ref.size(), 13); // Verify UUIDv7 is used diff --git a/crates/nvisy-document/Cargo.toml b/crates/nvisy-document/Cargo.toml index 4351dea..43a9dde 100644 --- a/crates/nvisy-document/Cargo.toml +++ b/crates/nvisy-document/Cargo.toml @@ -25,7 +25,7 @@ rustdoc-args = ["--cfg", "docsrs"] nvisy-core = { workspace = true } # Async runtime and I/O -tokio = { workspace = true, features = ["sync", "io-util"] } +tokio = { workspace = true, features = ["sync", "io-util", "fs"] } async-trait = { workspace = true } # Data types diff --git a/crates/nvisy-document/src/conversion/mod.rs b/crates/nvisy-document/src/conversion/mod.rs index 14d7efd..f63f1a5 100644 --- a/crates/nvisy-document/src/conversion/mod.rs +++ b/crates/nvisy-document/src/conversion/mod.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; pub use options::{ConversionOptions, HtmlOptions, PageMargins, PageOrientation, PdfOptions}; pub use types::{ConversionPath, ConversionResult, ConversionStep, FormatPair, SkippedElement}; -use crate::error::Result; +use crate::Result; use crate::format::Document; /// Trait for document format conversion. diff --git a/crates/nvisy-document/src/error.rs b/crates/nvisy-document/src/error.rs deleted file mode 100644 index d5a2d77..0000000 --- a/crates/nvisy-document/src/error.rs +++ /dev/null @@ -1,358 +0,0 @@ -//! Error types for document processing. 
- -use std::fmt; - -use crate::format::region::RegionId; - -/// A boxed error type for wrapping source errors. -pub type BoxError = Box<dyn std::error::Error + Send + Sync>; - -/// Result type for document processing. -pub type Result<T> = std::result::Result<T, Error>; - -/// The error type for document processing. -#[derive(Debug)] -pub struct Error { - kind: ErrorKind, - source: Option<BoxError>, -} - -/// The kind of error that occurred during document processing. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum ErrorKind { - /// The document format is not supported. - UnsupportedFormat { format: String }, - - /// The document could not be parsed. - Parse { message: String }, - - /// A referenced region was not found. - RegionNotFound { id: RegionId }, - - /// A referenced page was not found. - PageNotFound { page: u32 }, - - /// An I/O error occurred. - Io { message: String }, - - /// Serialization/deserialization error. - Serialization { message: String }, - - /// The processing was cancelled. - Cancelled, - - /// A timeout occurred. - Timeout { duration_ms: u64 }, - - /// Resource limit exceeded. - ResourceLimit { resource: String }, - - /// Conversion error. - Conversion { message: String }, - - /// Metadata extraction error. - Metadata { message: String }, - - /// Thumbnail generation error. - Thumbnail { message: String }, - - /// Protected or encrypted document. - Protected { message: String }, -} - -impl Error { - /// Creates a new error with the given kind. - pub fn new(kind: ErrorKind) -> Self { - Self { kind, source: None } - } - - /// Creates a new error with the given kind and source. - pub fn with_source( - kind: ErrorKind, - source: impl std::error::Error + Send + Sync + 'static, - ) -> Self { - Self { - kind, - source: Some(Box::new(source)), - } - } - - /// Returns the kind of error. - pub fn kind(&self) -> &ErrorKind { - &self.kind - } - - /// Consumes the error and returns the kind. - pub fn into_kind(self) -> ErrorKind { - self.kind - } - - /// Returns true if this error is retriable. - pub fn is_retriable(&self) -> bool { - matches!( - self.kind, - ErrorKind::Timeout { .. } | ErrorKind::Io { .. } | ErrorKind::ResourceLimit { .. } - ) - } - - /// Returns true if this error indicates invalid user input. - pub fn is_user_error(&self) -> bool { - matches!( - self.kind, - ErrorKind::RegionNotFound { .. } | ErrorKind::PageNotFound { .. } - ) - } - - // Convenience constructors - - /// Creates a parse error. - pub fn parse(message: impl Into<String>) -> Self { - Self::new(ErrorKind::Parse { - message: message.into(), - }) - } - - /// Creates a parse error with a source. - pub fn parse_with_source( - message: impl Into<String>, - source: impl std::error::Error + Send + Sync + 'static, - ) -> Self { - Self::with_source( - ErrorKind::Parse { - message: message.into(), - }, - source, - ) - } - - /// Creates an unsupported format error. - pub fn unsupported_format(format: impl Into<String>) -> Self { - Self::new(ErrorKind::UnsupportedFormat { - format: format.into(), - }) - } - - /// Creates a region not found error. - pub fn region_not_found(id: RegionId) -> Self { - Self::new(ErrorKind::RegionNotFound { id }) - } - - /// Creates a page not found error. - pub fn page_not_found(page: u32) -> Self { - Self::new(ErrorKind::PageNotFound { page }) - } - - /// Creates an I/O error. - pub fn io(message: impl Into<String>) -> Self { - Self::new(ErrorKind::Io { - message: message.into(), - }) - } - - /// Creates an I/O error with a source.
- pub fn io_with_source( - message: impl Into<String>, - source: impl std::error::Error + Send + Sync + 'static, - ) -> Self { - Self::with_source( - ErrorKind::Io { - message: message.into(), - }, - source, - ) - } - - /// Creates a serialization error. - pub fn serialization(message: impl Into<String>) -> Self { - Self::new(ErrorKind::Serialization { - message: message.into(), - }) - } - - /// Creates a timeout error. - pub fn timeout(duration_ms: u64) -> Self { - Self::new(ErrorKind::Timeout { duration_ms }) - } - - /// Creates a resource limit error. - pub fn resource_limit(resource: impl Into<String>) -> Self { - Self::new(ErrorKind::ResourceLimit { - resource: resource.into(), - }) - } - - /// Creates a cancelled error. - pub fn cancelled() -> Self { - Self::new(ErrorKind::Cancelled) - } - - /// Creates a conversion error. - pub fn conversion(message: impl Into<String>) -> Self { - Self::new(ErrorKind::Conversion { - message: message.into(), - }) - } - - /// Creates a conversion error with a source. - pub fn conversion_with_source( - message: impl Into<String>, - source: impl std::error::Error + Send + Sync + 'static, - ) -> Self { - Self::with_source( - ErrorKind::Conversion { - message: message.into(), - }, - source, - ) - } - - /// Creates a metadata error. - pub fn metadata(message: impl Into<String>) -> Self { - Self::new(ErrorKind::Metadata { - message: message.into(), - }) - } - - /// Creates a metadata error with a source. - pub fn metadata_with_source( - message: impl Into<String>, - source: impl std::error::Error + Send + Sync + 'static, - ) -> Self { - Self::with_source( - ErrorKind::Metadata { - message: message.into(), - }, - source, - ) - } - - /// Creates a thumbnail error. - pub fn thumbnail(message: impl Into<String>) -> Self { - Self::new(ErrorKind::Thumbnail { - message: message.into(), - }) - } - - /// Creates a thumbnail error with a source. - pub fn thumbnail_with_source( - message: impl Into<String>, - source: impl std::error::Error + Send + Sync + 'static, - ) -> Self { - Self::with_source( - ErrorKind::Thumbnail { - message: message.into(), - }, - source, - ) - } - - /// Creates a protected document error.
- pub fn protected(message: impl Into<String>) -> Self { - Self::new(ErrorKind::Protected { - message: message.into(), - }) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match &self.kind { - ErrorKind::UnsupportedFormat { format } => write!(f, "unsupported format: {format}"), - ErrorKind::Parse { message } => write!(f, "parse error: {message}"), - ErrorKind::RegionNotFound { id } => write!(f, "region not found: {id}"), - ErrorKind::PageNotFound { page } => write!(f, "page not found: {page}"), - ErrorKind::Io { message } => write!(f, "I/O error: {message}"), - ErrorKind::Serialization { message } => write!(f, "serialization error: {message}"), - ErrorKind::Cancelled => write!(f, "processing cancelled"), - ErrorKind::Timeout { duration_ms } => { - write!(f, "processing timed out after {duration_ms}ms") - } - ErrorKind::ResourceLimit { resource } => { - write!(f, "resource limit exceeded: {resource}") - } - ErrorKind::Conversion { message } => write!(f, "conversion error: {message}"), - ErrorKind::Metadata { message } => write!(f, "metadata error: {message}"), - ErrorKind::Thumbnail { message } => write!(f, "thumbnail error: {message}"), - ErrorKind::Protected { message } => write!(f, "protected document: {message}"), - } - } -} - -impl std::error::Error for Error { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - self.source - .as_ref() - .map(|e| e.as_ref() as &(dyn std::error::Error + 'static)) - } -} - -impl From<std::io::Error> for Error { - fn from(error: std::io::Error) -> Self { - Self::io_with_source(error.to_string(), error) - } -} - -impl From<ErrorKind> for Error { - fn from(kind: ErrorKind) -> Self { - Self::new(kind) - } -} - -#[cfg(test)] -mod tests { - use std::error::Error as StdError; - - use super::*; - - #[test] - fn test_error_display() { - let err = Error::region_not_found(RegionId::new()); - let msg = err.to_string(); - assert!(msg.contains("region not found")); - } - - #[test] - fn test_error_kind() { - let err = Error::timeout(1000); - assert!(matches!( - err.kind(), - ErrorKind::Timeout { duration_ms: 1000 } - )); - } - - #[test] - fn test_error_is_retriable() { - assert!(Error::timeout(1000).is_retriable()); - assert!(Error::io("failed").is_retriable()); - assert!(!Error::region_not_found(RegionId::new()).is_retriable()); - } - - #[test] - fn test_error_is_user_error() { - assert!(Error::region_not_found(RegionId::new()).is_user_error()); - assert!(Error::page_not_found(5).is_user_error()); - assert!(!Error::timeout(1000).is_user_error()); - } - - #[test] - fn test_from_io_error() { - let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "file not found"); - let err: Error = io_err.into(); - assert!(matches!(err.kind(), ErrorKind::Io { ..
})); - assert!(StdError::source(&err).is_some()); - } - - #[test] - fn test_error_with_source() { - let source = std::io::Error::other("underlying error"); - let err = Error::parse_with_source("failed to parse", source); - assert!(StdError::source(&err).is_some()); - } - - #[test] - fn test_from_error_kind() { - let kind = ErrorKind::Cancelled; - let err: Error = kind.into(); - assert!(matches!(err.kind(), ErrorKind::Cancelled)); - } -} diff --git a/crates/nvisy-document/src/format/mod.rs b/crates/nvisy-document/src/format/mod.rs index 58b1fa1..aa6915a 100644 --- a/crates/nvisy-document/src/format/mod.rs +++ b/crates/nvisy-document/src/format/mod.rs @@ -12,6 +12,7 @@ mod page; pub mod region; use std::future::Future; +use std::path::Path; use async_trait::async_trait; use bytes::Bytes; @@ -19,10 +20,11 @@ pub use capabilities::{ Capabilities, MetadataCapabilities, StructureCapabilities, TextCapabilities, }; pub use info::DocumentInfo; +pub use nvisy_core::io::ContentData; pub use page::PageOptions; pub use region::{BoundingBox, Point, Region, RegionId, RegionKind, RegionSource, RegionStatus}; -use crate::error::Result; +use crate::{Error, Result}; /// Trait for document format handlers with an associated Document type. /// @@ -45,8 +47,29 @@ pub trait DocumentFormat: Send + Sync { /// Returns the capabilities of this format. fn capabilities(&self) -> &Capabilities; - /// Loads a document from bytes. - fn load(&self, data: Bytes) -> impl Future<Output = Result<Self::Document>> + Send; + /// Loads a document from content data. + /// + /// The `ContentData` provides the raw bytes along with source tracking + /// and content metadata (size, hash, text/binary detection). + fn load(&self, data: ContentData) -> impl Future<Output = Result<Self::Document>> + Send; + + /// Loads a document from a file path. + /// + /// This is a convenience method that reads the file and calls [`Self::load`]. + fn load_file( + &self, + path: impl AsRef<Path> + Send, + ) -> impl Future<Output = Result<Self::Document>> + Send { + async move { + let path = path.as_ref(); + let bytes = tokio::fs::read(path).await.map_err(|e| { + Error::from_source(format!("Failed to read file: {}", path.display()), e) + .with_resource(crate::ErrorResource::Document) + })?; + let data = ContentData::from(bytes); + self.load(data).await + } + } } /// A loaded document instance (read-only access).
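For context, a minimal sketch of how the reworked `load`/`load_file` surface is meant to be consumed. It assumes the `PlainTextFormat` handler from nvisy-text and an ambient async runtime; the `notes.txt` path is a placeholder, not part of this patch:

```rust
use nvisy_document::{ContentData, DocumentFormat, Result};
use nvisy_text::PlainTextFormat;

async fn load_examples() -> Result<()> {
    let format = PlainTextFormat::new();

    // In-memory input goes through ContentData, which carries source
    // tracking and a lazily computed SHA-256 alongside the raw bytes.
    let doc = format.load(ContentData::from("Hello, world!")).await?;
    assert_eq!(doc.content(), "Hello, world!");

    // The default load_file reads the path via tokio::fs and then
    // delegates to load() with the same ContentData wrapper.
    // "notes.txt" is a hypothetical path used only for illustration.
    let _doc = format.load_file("notes.txt").await?;
    Ok(())
}
```

The default body keeps per-format implementations small: a handler only has to implement `load`, and file access lives in one place.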
diff --git a/crates/nvisy-document/src/lib.rs b/crates/nvisy-document/src/lib.rs index ec53667..18560e8 100644 --- a/crates/nvisy-document/src/lib.rs +++ b/crates/nvisy-document/src/lib.rs @@ -3,7 +3,6 @@ #![doc = include_str!("../README.md")] // Core modules -pub mod error; pub mod format; // Extension trait modules @@ -13,37 +12,24 @@ pub mod table; pub mod text; pub mod thumbnail; -// Error re-exports -pub use error::{BoxError, Error, ErrorKind, Result}; +// Error re-exports from nvisy-core +pub use nvisy_core::error::{BoxError, Error, ErrorResource, ErrorType, Result}; -// Region re-exports (from format::region) +pub use conversion::{ + Conversion, ConversionOptions, ConversionPath, ConversionResult, ConversionStep, FormatPair, + HtmlOptions, PageMargins, PageOrientation, PdfOptions, SkippedElement, +}; pub use format::region::{ BoundingBox, Point, Region, RegionId, RegionKind, RegionSource, RegionStatus, }; - -// Format re-exports pub use format::{ - Capabilities, Document, DocumentFormat, DocumentInfo, MetadataCapabilities, PageOptions, - StructureCapabilities, TextCapabilities, -}; - -// Conversion re-exports -pub use conversion::{ - Conversion, ConversionOptions, ConversionPath, ConversionResult, ConversionStep, FormatPair, - HtmlOptions, PageMargins, PageOrientation, PdfOptions, SkippedElement, + Capabilities, ContentData, Document, DocumentFormat, DocumentInfo, MetadataCapabilities, + PageOptions, StructureCapabilities, TextCapabilities, }; - -// Metadata re-exports pub use metadata::{ CustomProperty, DocumentMetadata, Metadata, MetadataExtractOptions, MetadataField, PropertyValue, }; - -// Table re-exports pub use table::{CellDataType, NormalizedCell, NormalizedRow, NormalizedTable, TableExtractor}; - -// Text re-exports pub use text::{ExtractedText, TextExtractor}; - -// Thumbnail re-exports pub use thumbnail::{ImageFormat, Thumbnail, ThumbnailGenerator, ThumbnailOptions, ThumbnailSize}; diff --git a/crates/nvisy-document/src/metadata/mod.rs b/crates/nvisy-document/src/metadata/mod.rs index 80e7568..d2e0c5e 100644 --- a/crates/nvisy-document/src/metadata/mod.rs +++ b/crates/nvisy-document/src/metadata/mod.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; pub use extract::MetadataExtractOptions; pub use types::{CustomProperty, DocumentMetadata, MetadataField, PropertyValue}; -use crate::error::Result; +use crate::Result; use crate::format::Document; /// Trait for document metadata extraction and manipulation. diff --git a/crates/nvisy-document/src/table/mod.rs b/crates/nvisy-document/src/table/mod.rs index 44b24f9..80f82aa 100644 --- a/crates/nvisy-document/src/table/mod.rs +++ b/crates/nvisy-document/src/table/mod.rs @@ -8,7 +8,7 @@ mod types; use async_trait::async_trait; pub use types::{CellDataType, NormalizedCell, NormalizedRow, NormalizedTable}; -use crate::error::Result; +use crate::Result; use crate::format::{Document, Region, RegionKind}; /// Trait for document table extraction and normalization. diff --git a/crates/nvisy-document/src/text/mod.rs b/crates/nvisy-document/src/text/mod.rs index cfcdc25..319380d 100644 --- a/crates/nvisy-document/src/text/mod.rs +++ b/crates/nvisy-document/src/text/mod.rs @@ -8,7 +8,7 @@ mod types; use async_trait::async_trait; pub use types::ExtractedText; -use crate::error::Result; +use crate::Result; use crate::format::Document; /// Trait for document text extraction. 
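With the bespoke error module gone, call sites migrate to nvisy-core's builder-style `Error`. A hedged before/after sketch using only constructors that appear elsewhere in this patch (`new`, `from_source`, `with_resource`); the helper function names are illustrative:

```rust
use nvisy_document::{Error, ErrorResource, Result};

// Before: Error::parse("unbalanced brackets") from the removed module.
// After: a plain message plus a resource tag.
fn parse_failed() -> Result<()> {
    Err(Error::new("parse error: unbalanced brackets")
        .with_resource(ErrorResource::Document))
}

// Before: Error::io_with_source(msg, e).
// After: from_source keeps the underlying error in the source chain.
fn read_failed(path: &std::path::Path, e: std::io::Error) -> Error {
    Error::from_source(format!("Failed to read file: {}", path.display()), e)
        .with_resource(ErrorResource::Document)
}
```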
diff --git a/crates/nvisy-document/src/thumbnail/mod.rs b/crates/nvisy-document/src/thumbnail/mod.rs index 7db8f4a..2099b96 100644 --- a/crates/nvisy-document/src/thumbnail/mod.rs +++ b/crates/nvisy-document/src/thumbnail/mod.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; pub use options::ThumbnailOptions; pub use types::{ImageFormat, Thumbnail, ThumbnailSize}; -use crate::error::Result; +use crate::Result; use crate::format::Document; /// Trait for document thumbnail generation. diff --git a/crates/nvisy-docx/src/document.rs b/crates/nvisy-docx/src/document.rs index cee2059..c82a00c 100644 --- a/crates/nvisy-docx/src/document.rs +++ b/crates/nvisy-docx/src/document.rs @@ -49,8 +49,6 @@ impl Document for DocxDocument { async fn to_bytes(&self) -> Result<Bytes> { // TODO: Implement DOCX serialization - Err(Error::unsupported_format( - "DOCX serialization not yet implemented", - )) + Err(Error::new("DOCX serialization not yet implemented")) } } diff --git a/crates/nvisy-docx/src/format.rs b/crates/nvisy-docx/src/format.rs index b4a1287..ba50278 100644 --- a/crates/nvisy-docx/src/format.rs +++ b/crates/nvisy-docx/src/format.rs @@ -1,7 +1,6 @@ //! DOCX format handler implementation. -use bytes::Bytes; -use nvisy_document::{Capabilities, DocumentFormat, Error, Result}; +use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; use crate::DocxDocument; @@ -40,11 +39,9 @@ impl DocumentFormat for DocxFormat { &self.capabilities } - async fn load(&self, _data: Bytes) -> Result<Self::Document> { + async fn load(&self, _data: ContentData) -> Result<Self::Document> { // TODO: Implement DOCX loading - Err(Error::unsupported_format( - "DOCX loading not yet implemented", - )) + Err(Error::new("DOCX loading not yet implemented")) } } diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index f22977a..5fb8c27 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -21,16 +21,18 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [features] -default = ["pdf", "docx", "text"] +default = ["pdf", "docx", "text", "image"] pdf = ["dep:nvisy-pdf"] docx = ["dep:nvisy-docx"] text = ["dep:nvisy-text"] +image = ["dep:nvisy-image"] [dependencies] # Internal crates nvisy-archive = { workspace = true } nvisy-document = { workspace = true } nvisy-docx = { workspace = true, optional = true } +nvisy-image = { workspace = true, optional = true } nvisy-pdf = { workspace = true, optional = true } nvisy-text = { workspace = true, optional = true } diff --git a/crates/nvisy-engine/src/engine/mod.rs b/crates/nvisy-engine/src/engine/mod.rs index f16805d..5b47267 100644 --- a/crates/nvisy-engine/src/engine/mod.rs +++ b/crates/nvisy-engine/src/engine/mod.rs @@ -7,9 +7,8 @@ mod config; use std::path::Path; -use bytes::Bytes; pub use config::EngineConfig; -use nvisy_document::Result; +use nvisy_document::{ContentData, Result}; use crate::registry::{BoxDocument, FormatRegistry}; @@ -118,7 +117,7 @@ impl Engine { /// # Errors /// /// Returns an error if the extension is not supported or loading fails. - pub async fn load_by_extension(&self, ext: &str, data: Bytes) -> Result<BoxDocument> { + pub async fn load_by_extension(&self, ext: &str, data: ContentData) -> Result<BoxDocument> { self.registry.load_by_extension(ext, data).await } @@ -127,7 +126,7 @@ impl Engine { /// # Errors /// /// Returns an error if the MIME type is not supported or loading fails.
- pub async fn load_by_mime(&self, mime: &str, data: Bytes) -> Result<BoxDocument> { + pub async fn load_by_mime(&self, mime: &str, data: ContentData) -> Result<BoxDocument> { self.registry.load_by_mime(mime, data).await } @@ -243,13 +242,13 @@ mod tests { let engine = Engine::new(); let doc = engine - .load_by_extension("json", Bytes::from(r#"{"key": "value"}"#)) + .load_by_extension("json", ContentData::from(r#"{"key": "value"}"#)) .await .unwrap(); assert!(!doc.regions().is_empty()); let doc = engine - .load_by_extension("md", Bytes::from("# Title\n\nParagraph")) + .load_by_extension("md", ContentData::from("# Title\n\nParagraph")) .await .unwrap(); assert!(!doc.regions().is_empty()); @@ -261,7 +260,7 @@ mod tests { let engine = Engine::new(); let doc = engine - .load_by_mime("application/json", Bytes::from(r#"{"key": "value"}"#)) + .load_by_mime("application/json", ContentData::from(r#"{"key": "value"}"#)) .await .unwrap(); assert!(!doc.regions().is_empty()); diff --git a/crates/nvisy-engine/src/registry/mod.rs b/crates/nvisy-engine/src/registry/mod.rs index 514a50d..fc3d854 100644 --- a/crates/nvisy-engine/src/registry/mod.rs +++ b/crates/nvisy-engine/src/registry/mod.rs @@ -7,8 +7,7 @@ use std::collections::HashMap; use std::sync::Arc; -use bytes::Bytes; -use nvisy_document::{Capabilities, Document, Error, Result}; +use nvisy_document::{Capabilities, ContentData, Document, Error, Result}; /// A type-erased document that can be used for common operations. pub type BoxDocument = Box<dyn Document + Send + Sync>; @@ -30,10 +29,10 @@ pub trait AnyFormat: Send + Sync { /// Returns the format capabilities. fn capabilities(&self) -> &Capabilities; - /// Loads a document from bytes, returning a type-erased document. + /// Loads a document from content data, returning a type-erased document. fn load_boxed( &self, - data: Bytes, + data: ContentData, ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>>; } @@ -65,7 +64,7 @@ where fn load_boxed( &self, - data: Bytes, + data: ContentData, ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>> { Box::pin(async move { let doc = nvisy_document::DocumentFormat::load(&self.inner, data).await?; @@ -147,6 +146,12 @@ impl FormatRegistry { self.register(nvisy_text::TomlFormat::new()); self.register(nvisy_text::IniFormat::new()); } + + #[cfg(feature = "image")] + { + self.register(nvisy_image::JpegFormat::new()); + self.register(nvisy_image::PngFormat::new()); + } } /// Registers a format handler.
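A quick sketch of the dispatch path after this change, mirroring the engine tests above (`Engine::new()` is assumed to register the default formats, as those tests rely on):

```rust
use nvisy_document::{ContentData, Result};
use nvisy_engine::Engine;

async fn dispatch() -> Result<()> {
    // Engine::new() wires up the feature-gated default handlers,
    // now including the JPEG and PNG image formats.
    let engine = Engine::new();

    // Extension lookup resolves "json" to the JSON handler and
    // returns a type-erased BoxDocument.
    let doc = engine
        .load_by_extension("json", ContentData::from(r#"{"key": "value"}"#))
        .await?;
    assert!(!doc.regions().is_empty());
    Ok(())
}
```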
@@ -235,14 +240,14 @@ impl FormatRegistry { /// Returns an error if: /// - The extension is not supported /// - The document fails to load - pub async fn load_by_extension(&self, ext: &str, data: Bytes) -> Result<BoxDocument> { + pub async fn load_by_extension(&self, ext: &str, data: ContentData) -> Result<BoxDocument> { let ext_lower = ext.trim_start_matches('.').to_lowercase(); let format = self .by_extension .get(ext_lower.as_str()) .and_then(|&idx| self.formats.get(idx)) - .ok_or_else(|| Error::unsupported_format(format!("Unsupported extension: {}", ext)))?; + .ok_or_else(|| Error::new(format!("Unsupported extension: {}", ext)))?; format.format.load_boxed(data).await } @@ -254,14 +259,14 @@ impl FormatRegistry { /// Returns an error if: /// - The MIME type is not supported /// - The document fails to load - pub async fn load_by_mime(&self, mime: &str, data: Bytes) -> Result<BoxDocument> { + pub async fn load_by_mime(&self, mime: &str, data: ContentData) -> Result<BoxDocument> { let mime_lower = mime.to_lowercase(); let format = self .by_mime .get(mime_lower.as_str()) .and_then(|&idx| self.formats.get(idx)) - .ok_or_else(|| Error::unsupported_format(format!("Unsupported MIME type: {}", mime)))?; + .ok_or_else(|| Error::new(format!("Unsupported MIME type: {}", mime)))?; format.format.load_boxed(data).await } @@ -283,12 +288,13 @@ impl FormatRegistry { let ext = path .extension() .and_then(|e| e.to_str()) - .ok_or_else(|| Error::unsupported_format("File has no extension"))?; + .ok_or_else(|| Error::new("File has no extension"))?; - let data = std::fs::read(path) - .map_err(|e| Error::io(format!("Failed to read file '{}': {}", path.display(), e)))?; + let data = std::fs::read(path).map_err(|e| { + Error::from_source(format!("Failed to read file '{}'", path.display()), e) + })?; - self.load_by_extension(ext, Bytes::from(data)).await + self.load_by_extension(ext, ContentData::from(data)).await } } @@ -347,7 +353,7 @@ mod tests { let registry = FormatRegistry::with_defaults(); let doc = registry - .load_by_extension("json", Bytes::from(r#"{"key": "value"}"#)) + .load_by_extension("json", ContentData::from(r#"{"key": "value"}"#)) .await .unwrap(); @@ -360,7 +366,7 @@ mod tests { let registry = FormatRegistry::with_defaults(); let doc = registry - .load_by_mime("application/json", Bytes::from(r#"{"key": "value"}"#)) + .load_by_mime("application/json", ContentData::from(r#"{"key": "value"}"#)) .await .unwrap(); diff --git a/crates/nvisy-image/src/document.rs b/crates/nvisy-image/src/documents/jpeg.rs similarity index 69% rename from crates/nvisy-image/src/document.rs rename to crates/nvisy-image/src/documents/jpeg.rs index 092b45c..df75644 100644 --- a/crates/nvisy-image/src/document.rs +++ b/crates/nvisy-image/src/documents/jpeg.rs @@ -1,22 +1,22 @@ -//! Image document implementation. +//! JPEG document implementation. use async_trait::async_trait; use bytes::Bytes; use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; -/// A loaded image document. +/// A loaded JPEG document. #[derive(Debug)] -pub struct ImageDocument { +pub struct JpegDocument { info: DocumentInfo, regions: Vec<Region>, #[allow(dead_code)] data: Bytes, } -impl ImageDocument { - /// Creates a new image document (internal use). +impl JpegDocument { + /// Creates a new JPEG document (internal use).
     #[must_use]
-    #[allow(dead_code)] // Will be used when load() is implemented
+    #[allow(dead_code)]
     pub(crate) fn new(info: DocumentInfo, data: Bytes) -> Self {
         Self {
             info,
@@ -27,7 +27,7 @@
 }
 
 #[async_trait]
-impl Document for ImageDocument {
+impl Document for JpegDocument {
     fn info(&self) -> &DocumentInfo {
         &self.info
     }
@@ -48,9 +48,7 @@
     }
 
     async fn to_bytes(&self) -> Result<Bytes> {
-        // TODO: Implement image serialization
-        Err(Error::unsupported_format(
-            "Image serialization not yet implemented",
-        ))
+        // TODO: Implement JPEG serialization
+        Err(Error::new("JPEG serialization not yet implemented"))
     }
 }
diff --git a/crates/nvisy-image/src/documents/mod.rs b/crates/nvisy-image/src/documents/mod.rs
new file mode 100644
index 0000000..d2ab5e1
--- /dev/null
+++ b/crates/nvisy-image/src/documents/mod.rs
@@ -0,0 +1,7 @@
+//! Image document implementations.
+
+mod jpeg;
+mod png;
+
+pub use jpeg::JpegDocument;
+pub use png::PngDocument;
diff --git a/crates/nvisy-image/src/documents/png.rs b/crates/nvisy-image/src/documents/png.rs
new file mode 100644
index 0000000..b8ca50e
--- /dev/null
+++ b/crates/nvisy-image/src/documents/png.rs
@@ -0,0 +1,54 @@
+//! PNG document implementation.
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result};
+
+/// A loaded PNG document.
+#[derive(Debug)]
+pub struct PngDocument {
+    info: DocumentInfo,
+    regions: Vec<Region>,
+    #[allow(dead_code)]
+    data: Bytes,
+}
+
+impl PngDocument {
+    /// Creates a new PNG document (internal use).
+    #[must_use]
+    #[allow(dead_code)]
+    pub(crate) fn new(info: DocumentInfo, data: Bytes) -> Self {
+        Self {
+            info,
+            regions: Vec::new(),
+            data,
+        }
+    }
+}
+
+#[async_trait]
+impl Document for PngDocument {
+    fn info(&self) -> &DocumentInfo {
+        &self.info
+    }
+
+    fn regions(&self) -> &[Region] {
+        &self.regions
+    }
+
+    fn regions_for_page(&self, page: u32) -> Vec<&Region> {
+        self.regions
+            .iter()
+            .filter(|r| r.page.map(|p| p.get()) == Some(page))
+            .collect()
+    }
+
+    fn find_region(&self, id: RegionId) -> Option<&Region> {
+        self.regions.iter().find(|r| r.id == id)
+    }
+
+    async fn to_bytes(&self) -> Result<Bytes> {
+        // TODO: Implement PNG serialization
+        Err(Error::new("PNG serialization not yet implemented"))
+    }
+}
diff --git a/crates/nvisy-image/src/format.rs b/crates/nvisy-image/src/format.rs
deleted file mode 100644
index e479706..0000000
--- a/crates/nvisy-image/src/format.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-//! Image format handler implementation.
-
-use bytes::Bytes;
-use nvisy_document::{Capabilities, DocumentFormat, Error, Result};
-
-use crate::ImageDocument;
-
-/// Image document format handler.
-#[derive(Debug, Clone, Default)]
-pub struct ImageFormat {
-    capabilities: Capabilities,
-}
-
-impl ImageFormat {
-    /// Creates a new image format handler.
-    #[must_use]
-    pub fn new() -> Self {
-        Self {
-            capabilities: Capabilities::image(),
-        }
-    }
-}
-
-impl DocumentFormat for ImageFormat {
-    type Document = ImageDocument;
-
-    fn name(&self) -> &'static str {
-        "image"
-    }
-
-    fn mime_types(&self) -> &'static [&'static str] {
-        &[
-            "image/png",
-            "image/jpeg",
-            "image/gif",
-            "image/webp",
-            "image/bmp",
-            "image/tiff",
-        ]
-    }
-
-    fn extensions(&self) -> &'static [&'static str] {
-        &["png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "tif"]
-    }
-
-    fn capabilities(&self) -> &Capabilities {
-        &self.capabilities
-    }
-
-    async fn load(&self, _data: Bytes) -> Result<Self::Document> {
-        // TODO: Implement image loading
-        Err(Error::unsupported_format(
-            "Image loading not yet implemented",
-        ))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_format_metadata() {
-        let format = ImageFormat::new();
-        assert_eq!(format.name(), "image");
-        assert!(format.mime_types().contains(&"image/png"));
-        assert!(format.mime_types().contains(&"image/jpeg"));
-        assert!(format.extensions().contains(&"png"));
-        assert!(format.extensions().contains(&"jpg"));
-    }
-
-    #[test]
-    fn test_capabilities() {
-        let format = ImageFormat::new();
-        let caps = format.capabilities();
-
-        assert!(!caps.text.can_extract);
-        assert!(caps.text.may_need_ocr);
-        assert!(!caps.structure.has_pages);
-        assert!(caps.metadata.can_extract); // EXIF support
-    }
-}
diff --git a/crates/nvisy-image/src/formats/jpeg.rs b/crates/nvisy-image/src/formats/jpeg.rs
new file mode 100644
index 0000000..8a0cb81
--- /dev/null
+++ b/crates/nvisy-image/src/formats/jpeg.rs
@@ -0,0 +1,71 @@
+//! JPEG format handler implementation.
+
+use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result};
+
+use crate::documents::JpegDocument;
+
+/// JPEG image format handler.
+#[derive(Debug, Clone, Default)]
+pub struct JpegFormat {
+    capabilities: Capabilities,
+}
+
+impl JpegFormat {
+    /// Creates a new JPEG format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities::image(),
+        }
+    }
+}
+
+impl DocumentFormat for JpegFormat {
+    type Document = JpegDocument;
+
+    fn name(&self) -> &'static str {
+        "jpeg"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["image/jpeg"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["jpg", "jpeg"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, _data: ContentData) -> Result<Self::Document> {
+        // TODO: Implement JPEG loading
+        Err(Error::new("JPEG loading not yet implemented"))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = JpegFormat::new();
+        assert_eq!(format.name(), "jpeg");
+        assert!(format.mime_types().contains(&"image/jpeg"));
+        assert!(format.extensions().contains(&"jpg"));
+        assert!(format.extensions().contains(&"jpeg"));
+    }
+
+    #[test]
+    fn test_capabilities() {
+        let format = JpegFormat::new();
+        let caps = format.capabilities();
+
+        assert!(!caps.text.can_extract);
+        assert!(caps.text.may_need_ocr);
+        assert!(!caps.structure.has_pages);
+        assert!(caps.metadata.can_extract);
+    }
+}
diff --git a/crates/nvisy-image/src/formats/mod.rs b/crates/nvisy-image/src/formats/mod.rs
new file mode 100644
index 0000000..aac0ecf
--- /dev/null
+++ b/crates/nvisy-image/src/formats/mod.rs
@@ -0,0 +1,7 @@
+//! Image format handlers.
+
+mod jpeg;
+mod png;
+
+pub use jpeg::JpegFormat;
+pub use png::PngFormat;
diff --git a/crates/nvisy-image/src/formats/png.rs b/crates/nvisy-image/src/formats/png.rs
new file mode 100644
index 0000000..93572fe
--- /dev/null
+++ b/crates/nvisy-image/src/formats/png.rs
@@ -0,0 +1,70 @@
+//! PNG format handler implementation.
+
+use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result};
+
+use crate::documents::PngDocument;
+
+/// PNG image format handler.
+#[derive(Debug, Clone, Default)]
+pub struct PngFormat {
+    capabilities: Capabilities,
+}
+
+impl PngFormat {
+    /// Creates a new PNG format handler.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            capabilities: Capabilities::image(),
+        }
+    }
+}
+
+impl DocumentFormat for PngFormat {
+    type Document = PngDocument;
+
+    fn name(&self) -> &'static str {
+        "png"
+    }
+
+    fn mime_types(&self) -> &'static [&'static str] {
+        &["image/png"]
+    }
+
+    fn extensions(&self) -> &'static [&'static str] {
+        &["png"]
+    }
+
+    fn capabilities(&self) -> &Capabilities {
+        &self.capabilities
+    }
+
+    async fn load(&self, _data: ContentData) -> Result<Self::Document> {
+        // TODO: Implement PNG loading
+        Err(Error::new("PNG loading not yet implemented"))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_format_metadata() {
+        let format = PngFormat::new();
+        assert_eq!(format.name(), "png");
+        assert!(format.mime_types().contains(&"image/png"));
+        assert!(format.extensions().contains(&"png"));
+    }
+
+    #[test]
+    fn test_capabilities() {
+        let format = PngFormat::new();
+        let caps = format.capabilities();
+
+        assert!(!caps.text.can_extract);
+        assert!(caps.text.may_need_ocr);
+        assert!(!caps.structure.has_pages);
+        assert!(caps.metadata.can_extract);
+    }
+}
diff --git a/crates/nvisy-image/src/lib.rs b/crates/nvisy-image/src/lib.rs
index 4b9608e..f6217f4 100644
--- a/crates/nvisy-image/src/lib.rs
+++ b/crates/nvisy-image/src/lib.rs
@@ -2,8 +2,8 @@
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![doc = include_str!("../README.md")]
 
-mod document;
-mod format;
+pub mod documents;
+pub mod formats;
 
-pub use document::ImageDocument;
-pub use format::ImageFormat;
+pub use documents::{JpegDocument, PngDocument};
+pub use formats::{JpegFormat, PngFormat};
diff --git a/crates/nvisy-pdf/src/document.rs b/crates/nvisy-pdf/src/document.rs
index 71ad404..d74514f 100644
--- a/crates/nvisy-pdf/src/document.rs
+++ b/crates/nvisy-pdf/src/document.rs
@@ -49,8 +49,6 @@ impl Document for PdfDocument {
 
     async fn to_bytes(&self) -> Result<Bytes> {
         // TODO: Implement PDF serialization
-        Err(Error::unsupported_format(
-            "PDF serialization not yet implemented",
-        ))
+        Err(Error::new("PDF serialization not yet implemented"))
     }
 }
diff --git a/crates/nvisy-pdf/src/format.rs b/crates/nvisy-pdf/src/format.rs
index f36167f..7f3904e 100644
--- a/crates/nvisy-pdf/src/format.rs
+++ b/crates/nvisy-pdf/src/format.rs
@@ -1,7 +1,6 @@
 //! PDF format handler implementation.
-use bytes::Bytes;
-use nvisy_document::{Capabilities, DocumentFormat, Error, Result};
+use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result};
 
 use crate::PdfDocument;
 
@@ -40,9 +39,9 @@ impl DocumentFormat for PdfFormat {
         &self.capabilities
     }
 
-    async fn load(&self, _data: Bytes) -> Result<Self::Document> {
+    async fn load(&self, _data: ContentData) -> Result<Self::Document> {
         // TODO: Implement PDF loading
-        Err(Error::unsupported_format("PDF loading not yet implemented"))
+        Err(Error::new("PDF loading not yet implemented"))
     }
 }
diff --git a/crates/nvisy-text/README.md b/crates/nvisy-text/README.md
index 10590d2..7c14a4d 100644
--- a/crates/nvisy-text/README.md
+++ b/crates/nvisy-text/README.md
@@ -18,12 +18,11 @@ various text-based file formats:
 
 ```rust
 use nvisy_text::{PlainTextFormat, PlainTextDocument};
-use nvisy_document::{DocumentFormat, Document, TextExtractor};
-use bytes::Bytes;
+use nvisy_document::{ContentData, DocumentFormat, Document, TextExtractor};
 
 # tokio_test::block_on(async {
 let format = PlainTextFormat::new();
-let data = Bytes::from("Hello, world!\n\nThis is a paragraph.");
+let data = ContentData::from("Hello, world!\n\nThis is a paragraph.");
 let doc = format.load(data).await.unwrap();
 
 assert_eq!(doc.regions().len(), 2);
diff --git a/crates/nvisy-text/src/documents/json.rs b/crates/nvisy-text/src/documents/json.rs
index e7c6603..e82360a 100644
--- a/crates/nvisy-text/src/documents/json.rs
+++ b/crates/nvisy-text/src/documents/json.rs
@@ -23,7 +23,7 @@ impl JsonDocument {
     /// Creates a new JSON document from content.
     pub fn new(content: String) -> Result<Self> {
         let parsed: Value = serde_json::from_str(&content)
-            .map_err(|e| nvisy_document::Error::parse(format!("Invalid JSON: {e}")))?;
+            .map_err(|e| nvisy_document::Error::new(format!("Invalid JSON: {e}")))?;
 
         let regions = Self::extract_regions(&parsed);
         let size = content.len() as u64;
diff --git a/crates/nvisy-text/src/formats/csv.rs b/crates/nvisy-text/src/formats/csv.rs
index e5533a7..66b77b2 100644
--- a/crates/nvisy-text/src/formats/csv.rs
+++ b/crates/nvisy-text/src/formats/csv.rs
@@ -1,8 +1,7 @@
 //! CSV format handler.
-use bytes::Bytes;
 use nvisy_document::{
-    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+    Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
 };
 
 use crate::documents::CsvDocument;
 
@@ -60,8 +59,8 @@ impl DocumentFormat for CsvFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         // Auto-detect delimiter
         let first_line = content.lines().next().unwrap_or("");
         let delimiter = if first_line.contains('\t') {
@@ -98,7 +97,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_csv() {
         let format = CsvFormat::new();
-        let data = Bytes::from("a,b,c\n1,2,3");
+        let data = ContentData::from("a,b,c\n1,2,3");
         let doc = format.load(data).await.unwrap();
         assert_eq!(doc.delimiter(), b',');
         assert!(!doc.regions().is_empty());
@@ -107,7 +106,7 @@
     #[tokio::test]
     async fn test_load_tsv() {
         let format = CsvFormat::new();
-        let data = Bytes::from("a\tb\tc\n1\t2\t3");
+        let data = ContentData::from("a\tb\tc\n1\t2\t3");
         let doc = format.load(data).await.unwrap();
         assert_eq!(doc.delimiter(), b'\t');
     }
diff --git a/crates/nvisy-text/src/formats/ini.rs b/crates/nvisy-text/src/formats/ini.rs
index bfe22ce..b953cd8 100644
--- a/crates/nvisy-text/src/formats/ini.rs
+++ b/crates/nvisy-text/src/formats/ini.rs
@@ -1,8 +1,7 @@
 //! INI format handler.
 
-use bytes::Bytes;
 use nvisy_document::{
-    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+    Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
 };
 
 use crate::documents::IniDocument;
 
@@ -60,8 +59,8 @@ impl DocumentFormat for IniFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         Ok(IniDocument::new(content))
     }
 }
@@ -91,7 +90,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_ini() {
         let format = IniFormat::new();
-        let data = Bytes::from("[section]\nkey=value\nfoo=bar");
+        let data = ContentData::from("[section]\nkey=value\nfoo=bar");
         let doc = format.load(data).await.unwrap();
         assert!(!doc.regions().is_empty());
     }
diff --git a/crates/nvisy-text/src/formats/json.rs b/crates/nvisy-text/src/formats/json.rs
index 163135e..c0e7be8 100644
--- a/crates/nvisy-text/src/formats/json.rs
+++ b/crates/nvisy-text/src/formats/json.rs
@@ -1,8 +1,7 @@
 //! JSON format handler.
-use bytes::Bytes;
 use nvisy_document::{
-    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+    Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
 };
 
 use crate::documents::JsonDocument;
 
@@ -60,8 +59,8 @@ impl DocumentFormat for JsonFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         JsonDocument::new(content)
     }
 }
@@ -83,7 +82,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_document() {
         let format = JsonFormat::new();
-        let data = Bytes::from(r#"{"hello": "world"}"#);
+        let data = ContentData::from(r#"{"hello": "world"}"#);
         let doc = format.load(data).await.unwrap();
         assert!(!doc.regions().is_empty());
     }
@@ -91,7 +90,7 @@
     #[tokio::test]
     async fn test_load_invalid_json() {
         let format = JsonFormat::new();
-        let data = Bytes::from("not valid json {");
+        let data = ContentData::from("not valid json {");
         let result = format.load(data).await;
         assert!(result.is_err());
     }
diff --git a/crates/nvisy-text/src/formats/markdown.rs b/crates/nvisy-text/src/formats/markdown.rs
index d040865..4e10f33 100644
--- a/crates/nvisy-text/src/formats/markdown.rs
+++ b/crates/nvisy-text/src/formats/markdown.rs
@@ -1,8 +1,7 @@
 //! Markdown format handler.
 
-use bytes::Bytes;
 use nvisy_document::{
-    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+    Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
 };
 
 use crate::documents::MarkdownDocument;
 
@@ -60,8 +59,8 @@ impl DocumentFormat for MarkdownFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         Ok(MarkdownDocument::new(content))
     }
 }
@@ -92,7 +91,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_document() {
         let format = MarkdownFormat::new();
-        let data = Bytes::from("# Test\n\nContent here.");
+        let data = ContentData::from("# Test\n\nContent here.");
         let doc = format.load(data).await.unwrap();
         assert!(!doc.regions().is_empty());
     }
diff --git a/crates/nvisy-text/src/formats/plain.rs b/crates/nvisy-text/src/formats/plain.rs
index 5a03060..ee57eb1 100644
--- a/crates/nvisy-text/src/formats/plain.rs
+++ b/crates/nvisy-text/src/formats/plain.rs
@@ -1,7 +1,6 @@
 //! Plain text format handler.
-use bytes::Bytes;
-use nvisy_document::{Capabilities, DocumentFormat, Result};
+use nvisy_document::{Capabilities, ContentData, DocumentFormat, Result};
 
 use crate::documents::PlainTextDocument;
 
@@ -46,8 +45,8 @@ impl DocumentFormat for PlainTextFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         Ok(PlainTextDocument::new(content))
    }
 }
@@ -67,7 +66,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_document() {
         let format = PlainTextFormat::new();
-        let data = Bytes::from("Hello, world!");
+        let data = ContentData::from("Hello, world!");
         let doc = format.load(data).await.unwrap();
         assert_eq!(doc.content(), "Hello, world!");
     }
diff --git a/crates/nvisy-text/src/formats/toml.rs b/crates/nvisy-text/src/formats/toml.rs
index b0411a5..6929395 100644
--- a/crates/nvisy-text/src/formats/toml.rs
+++ b/crates/nvisy-text/src/formats/toml.rs
@@ -1,8 +1,7 @@
 //! TOML format handler.
 
-use bytes::Bytes;
 use nvisy_document::{
-    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+    Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
 };
 
 use crate::documents::TomlDocument;
 
@@ -60,8 +59,8 @@ impl DocumentFormat for TomlFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         Ok(TomlDocument::new(content))
     }
 }
@@ -90,7 +89,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_toml() {
         let format = TomlFormat::new();
-        let data = Bytes::from("[package]\nname = \"test\"\nversion = \"1.0\"");
+        let data = ContentData::from("[package]\nname = \"test\"\nversion = \"1.0\"");
         let doc = format.load(data).await.unwrap();
         assert!(!doc.regions().is_empty());
     }
diff --git a/crates/nvisy-text/src/formats/xml.rs b/crates/nvisy-text/src/formats/xml.rs
index 2113191..c92cd48 100644
--- a/crates/nvisy-text/src/formats/xml.rs
+++ b/crates/nvisy-text/src/formats/xml.rs
@@ -1,8 +1,7 @@
 //! XML format handler.
 
-use bytes::Bytes;
 use nvisy_document::{
-    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+    Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
 };
 
 use crate::documents::XmlDocument;
 
@@ -60,8 +59,8 @@ impl DocumentFormat for XmlFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         Ok(XmlDocument::new(content))
     }
 }
@@ -92,7 +91,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_xml() {
         let format = XmlFormat::new();
-        let data = Bytes::from("<root>content</root>");
+        let data = ContentData::from("<root>content</root>");
         let doc = format.load(data).await.unwrap();
         assert!(!doc.regions().is_empty());
     }
diff --git a/crates/nvisy-text/src/formats/yaml.rs b/crates/nvisy-text/src/formats/yaml.rs
index 63d47a5..4db8660 100644
--- a/crates/nvisy-text/src/formats/yaml.rs
+++ b/crates/nvisy-text/src/formats/yaml.rs
@@ -1,8 +1,7 @@
 //! YAML format handler.
-use bytes::Bytes;
 use nvisy_document::{
-    Capabilities, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
+    Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities,
 };
 
 use crate::documents::YamlDocument;
 
@@ -60,8 +59,8 @@ impl DocumentFormat for YamlFormat {
         &self.capabilities
     }
 
-    async fn load(&self, data: Bytes) -> Result<Self::Document> {
-        let content = String::from_utf8_lossy(&data).into_owned();
+    async fn load(&self, data: ContentData) -> Result<Self::Document> {
+        let content = data.as_string()?;
         Ok(YamlDocument::new(content))
     }
 }
@@ -91,7 +90,7 @@ mod tests {
     #[tokio::test]
     async fn test_load_yaml() {
         let format = YamlFormat::new();
-        let data = Bytes::from("key: value\nlist:\n - item1\n - item2");
+        let data = ContentData::from("key: value\nlist:\n - item1\n - item2");
         let doc = format.load(data).await.unwrap();
         assert!(!doc.regions().is_empty());
     }

From ac30b2630454da75b1a59a83e92b555c56693727 Mon Sep 17 00:00:00 2001
From: Oleh Martsokha
Date: Sat, 17 Jan 2026 08:26:45 +0100
Subject: [PATCH 4/5] refactor: rename crates from nvisy-* to nvisy-rt-*

- Rename all crate packages to avoid conflicts with consumer library
- Update all Cargo.toml dependencies and features
- Update all source file imports (nvisy_* -> nvisy_rt_*)
- Update README doc examples with new crate names
- Fix clippy lints and doc warnings
- Add diff module with Differ trait, Change, ChangeKind, RegionChange, Diff types
---
 Cargo.lock                                    |  54 ++-
 Cargo.toml                                    |  16 +-
 crates/nvisy-archive/Cargo.toml               |   4 +-
 crates/nvisy-archive/src/file/archive_type.rs |   6 +-
 crates/nvisy-archive/src/file/mod.rs          |  14 +-
 crates/nvisy-archive/src/handler/mod.rs       |   4 +-
 .../nvisy-archive/src/handler/tar_handler.rs  |   8 +-
 crates/nvisy-archive/src/lib.rs               |   9 +-
 crates/nvisy-core/Cargo.toml                  |   2 +-
 crates/nvisy-core/README.md                   |   2 +-
 crates/nvisy-core/src/error/mod.rs            |   2 +-
 crates/nvisy-core/src/fs/content_file.rs      |  12 +-
 crates/nvisy-core/src/fs/content_metadata.rs  |   4 +-
 crates/nvisy-core/src/fs/data_sensitivity.rs  |   2 +-
 crates/nvisy-core/src/fs/mod.rs               |   4 +-
 crates/nvisy-core/src/io/content.rs           |  14 +-
 crates/nvisy-core/src/io/content_data.rs      |   4 +-
 crates/nvisy-core/src/io/content_read.rs      |   8 +-
 crates/nvisy-core/src/io/content_write.rs     |  20 +-
 crates/nvisy-core/src/io/data_reference.rs    |   5 +-
 crates/nvisy-core/src/path/source.rs          |  14 +-
 crates/nvisy-document/Cargo.toml              |   5 +-
 crates/nvisy-document/README.md               |   5 -
 crates/nvisy-document/src/diff/change.rs      | 110 +++++
 crates/nvisy-document/src/diff/mod.rs         |  18 +
 .../nvisy-document/src/diff/region_change.rs  | 119 ++++++
 crates/nvisy-document/src/diff/result.rs      |  72 ++++
 crates/nvisy-document/src/format/mod.rs       |   2 +-
 .../nvisy-document/src/format/region/mod.rs   |   3 +-
 crates/nvisy-document/src/lib.rs              |   6 +-
 crates/nvisy-document/src/table/mod.rs        |   2 +-
 crates/nvisy-document/src/text/mod.rs         |   2 +-
 .../nvisy-document/src/thumbnail/options.rs   |   8 +-
 crates/nvisy-docx/Cargo.toml                  |   4 +-
 crates/nvisy-docx/src/document.rs             |   2 +-
 crates/nvisy-docx/src/format.rs               |   2 +-
 crates/nvisy-engine/Cargo.toml                |  25 +-
 crates/nvisy-engine/src/engine/mod.rs         |  21 +-
 crates/nvisy-engine/src/lib.rs                |  16 +-
 .../nvisy-engine/src/registry/format_ref.rs   | 114 ++++++
 .../src/registry/format_registry.rs           | 343 ++++++++++++++++
 .../src/registry/loaded_document.rs           |  41 ++
 crates/nvisy-engine/src/registry/mod.rs       | 380 +-----------------
 crates/nvisy-engine/src/session/mod.rs        |   2 +-
 crates/nvisy-image/Cargo.toml                 |   4 +-
 crates/nvisy-image/src/documents/jpeg.rs      |   2 +-
 crates/nvisy-image/src/documents/png.rs       |   2 +-
crates/nvisy-image/src/formats/jpeg.rs | 2 +- crates/nvisy-image/src/formats/png.rs | 2 +- crates/nvisy-pdf/Cargo.toml | 4 +- crates/nvisy-pdf/src/document.rs | 2 +- crates/nvisy-pdf/src/format.rs | 2 +- crates/nvisy-text/Cargo.toml | 4 +- crates/nvisy-text/README.md | 22 +- crates/nvisy-text/src/documents/csv.rs | 2 +- crates/nvisy-text/src/documents/ini.rs | 2 +- crates/nvisy-text/src/documents/json.rs | 4 +- crates/nvisy-text/src/documents/markdown.rs | 5 +- crates/nvisy-text/src/documents/plain.rs | 2 +- crates/nvisy-text/src/documents/toml.rs | 2 +- crates/nvisy-text/src/documents/xml.rs | 2 +- crates/nvisy-text/src/documents/yaml.rs | 2 +- crates/nvisy-text/src/formats/csv.rs | 4 +- crates/nvisy-text/src/formats/ini.rs | 4 +- crates/nvisy-text/src/formats/json.rs | 4 +- crates/nvisy-text/src/formats/markdown.rs | 4 +- crates/nvisy-text/src/formats/plain.rs | 2 +- crates/nvisy-text/src/formats/toml.rs | 4 +- crates/nvisy-text/src/formats/xml.rs | 4 +- crates/nvisy-text/src/formats/yaml.rs | 4 +- crates/nvisy-text/src/lib.rs | 11 +- 71 files changed, 1050 insertions(+), 568 deletions(-) create mode 100644 crates/nvisy-document/src/diff/change.rs create mode 100644 crates/nvisy-document/src/diff/mod.rs create mode 100644 crates/nvisy-document/src/diff/region_change.rs create mode 100644 crates/nvisy-document/src/diff/result.rs create mode 100644 crates/nvisy-engine/src/registry/format_ref.rs create mode 100644 crates/nvisy-engine/src/registry/format_registry.rs create mode 100644 crates/nvisy-engine/src/registry/loaded_document.rs diff --git a/Cargo.lock b/Cargo.lock index 5accc23..c06b57b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -641,13 +641,13 @@ dependencies = [ ] [[package]] -name = "nvisy-archive" +name = "nvisy-rt-archive" version = "0.1.0" dependencies = [ "bytes", "bzip2", "flate2", - "nvisy-core", + "nvisy-rt-core", "sevenz-rust", "strum", "tar", @@ -659,7 +659,7 @@ dependencies = [ ] [[package]] -name = "nvisy-core" +name = "nvisy-rt-core" version = "0.1.0" dependencies = [ "bytes", @@ -678,7 +678,7 @@ dependencies = [ ] [[package]] -name = "nvisy-document" +name = "nvisy-rt-document" version = "0.1.0" dependencies = [ "async-trait", @@ -686,71 +686,73 @@ dependencies = [ "bytes", "derive_more", "jiff", - "nvisy-core", + "nvisy-rt-core", "serde", "serde_json", + "strum", "thiserror", "tokio", "uuid", ] [[package]] -name = "nvisy-docx" +name = "nvisy-rt-docx" version = "0.1.0" dependencies = [ "async-trait", "bytes", - "nvisy-document", + "nvisy-rt-document", "thiserror", ] [[package]] -name = "nvisy-engine" +name = "nvisy-rt-engine" version = "0.1.0" dependencies = [ "bytes", "jiff", - "nvisy-archive", - "nvisy-document", - "nvisy-docx", - "nvisy-image", - "nvisy-pdf", - "nvisy-text", + "nvisy-rt-archive", + "nvisy-rt-document", + "nvisy-rt-docx", + "nvisy-rt-image", + "nvisy-rt-pdf", + "nvisy-rt-text", "serde", "serde_json", "tokio", + "tracing", "uuid", ] [[package]] -name = "nvisy-image" +name = "nvisy-rt-image" version = "0.1.0" dependencies = [ "async-trait", "bytes", - "nvisy-document", + "nvisy-rt-document", "thiserror", ] [[package]] -name = "nvisy-pdf" +name = "nvisy-rt-pdf" version = "0.1.0" dependencies = [ "async-trait", "bytes", - "nvisy-document", + "nvisy-rt-document", "thiserror", ] [[package]] -name = "nvisy-text" +name = "nvisy-rt-text" version = "0.1.0" dependencies = [ "async-trait", "bytes", "csv", "markdown", - "nvisy-document", + "nvisy-rt-document", "serde_json", "thiserror", "tokio", @@ -1188,9 +1190,21 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tracing-core" version = "0.1.36" diff --git a/Cargo.toml b/Cargo.toml index 9b787b3..f14aa39 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,14 +32,14 @@ documentation = "https://docs.rs/nvisy" [workspace.dependencies] # Internal crates -nvisy-archive = { path = "./crates/nvisy-archive", version = "0.1.0", features = [] } -nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0", features = [] } -nvisy-docx = { path = "./crates/nvisy-docx", version = "0.1.0", features = [] } -nvisy-document = { path = "./crates/nvisy-document", version = "0.1.0", features = [] } -nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0", features = [] } -nvisy-image = { path = "./crates/nvisy-image", version = "0.1.0", features = [] } -nvisy-pdf = { path = "./crates/nvisy-pdf", version = "0.1.0", features = [] } -nvisy-text = { path = "./crates/nvisy-text", version = "0.1.0", features = [] } +nvisy-rt-archive = { path = "./crates/nvisy-archive", version = "0.1.0", features = [] } +nvisy-rt-core = { path = "./crates/nvisy-core", version = "0.1.0", features = [] } +nvisy-rt-docx = { path = "./crates/nvisy-docx", version = "0.1.0", features = [] } +nvisy-rt-document = { path = "./crates/nvisy-document", version = "0.1.0", features = [] } +nvisy-rt-engine = { path = "./crates/nvisy-engine", version = "0.1.0", features = [] } +nvisy-rt-image = { path = "./crates/nvisy-image", version = "0.1.0", features = [] } +nvisy-rt-pdf = { path = "./crates/nvisy-pdf", version = "0.1.0", features = [] } +nvisy-rt-text = { path = "./crates/nvisy-text", version = "0.1.0", features = [] } # Async runtime and I/O tokio = { version = "1.49", default-features = false, features = [] } diff --git a/crates/nvisy-archive/Cargo.toml b/crates/nvisy-archive/Cargo.toml index 2c49250..04815d5 100644 --- a/crates/nvisy-archive/Cargo.toml +++ b/crates/nvisy-archive/Cargo.toml @@ -1,7 +1,7 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-archive" +name = "nvisy-rt-archive" description = "Archive handling library for nvisy (ZIP, TAR, 7z, etc.)" readme = "./README.md" @@ -31,7 +31,7 @@ xz = ["dep:xz2"] [dependencies] # Internal crates -nvisy-core = { workspace = true } +nvisy-rt-core = { workspace = true } # Data types bytes = { workspace = true } diff --git a/crates/nvisy-archive/src/file/archive_type.rs b/crates/nvisy-archive/src/file/archive_type.rs index fdcaa58..ddd3cba 100644 --- a/crates/nvisy-archive/src/file/archive_type.rs +++ b/crates/nvisy-archive/src/file/archive_type.rs @@ -59,7 +59,7 @@ impl ArchiveType { /// /// ``` /// use std::ffi::OsStr; - /// use nvisy_archive::ArchiveType; + /// use nvisy_rt_archive::ArchiveType; /// /// assert_eq!(ArchiveType::from_file_extension(OsStr::new("zip")), Some(ArchiveType::Zip)); /// assert_eq!(ArchiveType::from_file_extension(OsStr::new("tar.gz")), Some(ArchiveType::TarGz)); @@ -89,7 +89,7 @@ impl ArchiveType { /// # Examples /// /// ``` - /// use nvisy_archive::ArchiveType; + /// use nvisy_rt_archive::ArchiveType; /// /// 
assert_eq!(ArchiveType::Zip.file_extensions(), &["zip"]);
     /// assert_eq!(ArchiveType::TarGz.file_extensions(), &["tar.gz", "tgz"]);
@@ -115,7 +115,7 @@ impl ArchiveType {
     /// # Examples
     ///
     /// ```
-    /// use nvisy_archive::ArchiveType;
+    /// use nvisy_rt_archive::ArchiveType;
     ///
     /// assert_eq!(ArchiveType::Zip.primary_extension(), "zip");
     /// assert_eq!(ArchiveType::TarGz.primary_extension(), "tar.gz");
diff --git a/crates/nvisy-archive/src/file/mod.rs b/crates/nvisy-archive/src/file/mod.rs
index c2eeef9..9faa187 100644
--- a/crates/nvisy-archive/src/file/mod.rs
+++ b/crates/nvisy-archive/src/file/mod.rs
@@ -14,9 +14,9 @@
 use bytes::Bytes;
 use tempfile::TempDir;
 use tokio::fs;
 
-use crate::handler::ArchiveHandler;
 #[cfg(feature = "zip")]
 use crate::ZipResultExt;
+use crate::handler::ArchiveHandler;
 use crate::{ArchiveErrorExt, ContentData, ContentSource, Error, Result};
 
 /// Represents an archive file that can be loaded from various sources
@@ -53,11 +53,11 @@ impl ArchiveFile {
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_archive::ArchiveFile;
+    /// use nvisy_rt_archive::ArchiveFile;
     /// use std::path::PathBuf;
     ///
     /// let archive = ArchiveFile::from_path("archive.zip")?;
-    /// # Ok::<(), nvisy_archive::Error>(())
+    /// # Ok::<(), nvisy_rt_archive::Error>(())
     /// ```
     pub fn from_path(path: impl AsRef<Path>) -> Result<Self> {
         let path = path.as_ref();
@@ -100,7 +100,7 @@ impl ArchiveFile {
     /// # Example
     ///
     /// ```
-    /// use nvisy_archive::{ArchiveFile, ArchiveType, ContentData};
+    /// use nvisy_rt_archive::{ArchiveFile, ArchiveType, ContentData};
     ///
     /// let data = ContentData::from(vec![0x50, 0x4B, 0x03, 0x04]); // ZIP signature
     /// let archive = ArchiveFile::from_content_data(ArchiveType::Zip, data);
@@ -120,7 +120,7 @@
     /// # Example
     ///
     /// ```
-    /// use nvisy_archive::{ArchiveFile, ArchiveType};
+    /// use nvisy_rt_archive::{ArchiveFile, ArchiveType};
     ///
     /// let data = vec![0x50, 0x4B, 0x03, 0x04]; // ZIP signature
     /// let archive = ArchiveFile::from_bytes(ArchiveType::Zip, data);
@@ -209,9 +209,9 @@
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_archive::ArchiveFile;
+    /// use nvisy_rt_archive::ArchiveFile;
     ///
-    /// # async fn example() -> nvisy_archive::Result<()> {
+    /// # async fn example() -> nvisy_rt_archive::Result<()> {
     /// let archive = ArchiveFile::from_path("archive.zip")?;
     /// let handler = archive.unpack().await?;
     ///
diff --git a/crates/nvisy-archive/src/handler/mod.rs b/crates/nvisy-archive/src/handler/mod.rs
index ef3415b..a3b6c30 100644
--- a/crates/nvisy-archive/src/handler/mod.rs
+++ b/crates/nvisy-archive/src/handler/mod.rs
@@ -182,9 +182,9 @@ impl ArchiveHandler {
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_archive::{ArchiveFile, ArchiveType};
+    /// use nvisy_rt_archive::{ArchiveFile, ArchiveType};
     ///
-    /// # async fn example() -> nvisy_archive::Result<()> {
+    /// # async fn example() -> nvisy_rt_archive::Result<()> {
     /// let archive = ArchiveFile::from_path("original.zip")?;
     /// let handler = archive.unpack().await?;
     ///
diff --git a/crates/nvisy-archive/src/handler/tar_handler.rs b/crates/nvisy-archive/src/handler/tar_handler.rs
index 26a8e2e..efa030b 100644
--- a/crates/nvisy-archive/src/handler/tar_handler.rs
+++ b/crates/nvisy-archive/src/handler/tar_handler.rs
@@ -363,8 +363,8 @@ impl TarDirectoryBuilder {
             }
             #[cfg(feature = "gzip")]
             ArchiveType::TarGz => {
-                use flate2::write::GzEncoder;
-                use flate2::Compression;
+                use flate2::Compression;
+                use flate2::write::GzEncoder;
 
                 let file = std::fs::File::create(&target_path)?;
                let encoder = GzEncoder::new(file, Compression::default());
@@ -381,8 +381,8 @@
             }
             #[cfg(feature = "bzip2")]
             ArchiveType::TarBz2 => {
-                use bzip2::write::BzEncoder;
-                use bzip2::Compression;
+                use bzip2::Compression;
+                use bzip2::write::BzEncoder;
 
                 let file = std::fs::File::create(&target_path)?;
                 let encoder = BzEncoder::new(file, Compression::default());
@@ -500,8 +500,8 @@ impl TarArchiveBuilder {
                 })
             }
             ArchiveType::TarGz => {
-                use flate2::write::GzEncoder;
-                use flate2::Compression;
+                use flate2::Compression;
+                use flate2::write::GzEncoder;
                 let encoder = GzEncoder::new(writer, Compression::default());
                 let writer: Box<dyn std::io::Write> = Box::new(encoder);
                 Ok(TarArchiveBuilder {
@@ -510,8 +510,8 @@
                 })
             }
             ArchiveType::TarBz2 => {
-                use bzip2::write::BzEncoder;
-                use bzip2::Compression;
+                use bzip2::Compression;
+                use bzip2::write::BzEncoder;
                 let encoder = BzEncoder::new(writer, Compression::default());
                 let writer: Box<dyn std::io::Write> = Box::new(encoder);
                 Ok(TarArchiveBuilder {
diff --git a/crates/nvisy-archive/src/lib.rs b/crates/nvisy-archive/src/lib.rs
index 8f2d86c..17e3c45 100644
--- a/crates/nvisy-archive/src/lib.rs
+++ b/crates/nvisy-archive/src/lib.rs
@@ -9,12 +9,11 @@ pub mod prelude;
 
 // Re-exports for convenience
 pub use file::{ArchiveFile, ArchiveType};
 pub use handler::ArchiveHandler;
-
 // Re-export core types used in archive operations
-pub use nvisy_core::error::{Error, ErrorResource, ErrorType, Result};
-pub use nvisy_core::fs::{ContentKind, ContentMetadata};
-pub use nvisy_core::io::ContentData;
-pub use nvisy_core::path::ContentSource;
+pub use nvisy_rt_core::error::{Error, ErrorResource, ErrorType, Result};
+pub use nvisy_rt_core::fs::{ContentKind, ContentMetadata};
+pub use nvisy_rt_core::io::ContentData;
+pub use nvisy_rt_core::path::ContentSource;
 
 /// Extension trait for creating archive-specific errors
 pub trait ArchiveErrorExt {
diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml
index 46029ed..760fde3 100644
--- a/crates/nvisy-core/Cargo.toml
+++ b/crates/nvisy-core/Cargo.toml
@@ -1,7 +1,7 @@
 # https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [package]
-name = "nvisy-core"
+name = "nvisy-rt-core"
 description = "Core types and utilities for nvisy"
 readme = "./README.md"
diff --git a/crates/nvisy-core/README.md b/crates/nvisy-core/README.md
index 68059c0..30369d4 100644
--- a/crates/nvisy-core/README.md
+++ b/crates/nvisy-core/README.md
@@ -48,6 +48,6 @@ tracking.
 - `tokio` - Async runtime for I/O operations
 - `bytes` - Zero-copy byte buffer management
-- `uuid` - Unique identifiers with UUIDv7 support
+- `uuid` - Unique identifiers with `UUIDv7` support
 - `jiff` - Timestamp support for content source tracking
 - `strum` - Derive macros for enums
diff --git a/crates/nvisy-core/src/error/mod.rs b/crates/nvisy-core/src/error/mod.rs
index 26dca6a..c087aaf 100644
--- a/crates/nvisy-core/src/error/mod.rs
+++ b/crates/nvisy-core/src/error/mod.rs
@@ -21,7 +21,7 @@ pub type BoxError = Box<dyn std::error::Error + Send + Sync>;
 /// # Example
 ///
 /// ```
-/// use nvisy_core::error::{Error, ErrorType, ErrorResource};
+/// use nvisy_rt_core::error::{Error, ErrorType, ErrorResource};
 ///
 /// let error = Error::new("Something went wrong")
 ///     .with_type(ErrorType::Runtime)
diff --git a/crates/nvisy-core/src/fs/content_file.rs b/crates/nvisy-core/src/fs/content_file.rs
index 86bbd8a..e7695b2 100644
--- a/crates/nvisy-core/src/fs/content_file.rs
+++ b/crates/nvisy-core/src/fs/content_file.rs
@@ -40,7 +40,7 @@ impl ContentFile {
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::fs::ContentFile;
+    /// use nvisy_rt_core::fs::ContentFile;
     /// use std::path::Path;
     ///
     /// async fn open_file() -> Result<(), Box<dyn std::error::Error>> {
@@ -89,7 +89,7 @@
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::fs::ContentFile;
+    /// use nvisy_rt_core::fs::ContentFile;
     ///
     /// async fn create_file() -> Result<(), Box<dyn std::error::Error>> {
     ///     let content_file = ContentFile::create("new_file.txt").await?;
@@ -133,7 +133,7 @@
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::fs::ContentFile;
+    /// use nvisy_rt_core::fs::ContentFile;
     /// use tokio::fs::OpenOptions;
     ///
     /// async fn open_with_options() -> Result<(), Box<dyn std::error::Error>> {
@@ -174,7 +174,7 @@
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::fs::ContentFile;
+    /// use nvisy_rt_core::fs::ContentFile;
     ///
     /// async fn read_content() -> Result<(), Box<dyn std::error::Error>> {
     ///     let mut content_file = ContentFile::open("example.txt").await?;
@@ -236,8 +236,8 @@
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::fs::ContentFile;
-    /// use nvisy_core::io::ContentData;
+    /// use nvisy_rt_core::fs::ContentFile;
+    /// use nvisy_rt_core::io::ContentData;
     ///
     /// async fn write_content() -> Result<(), Box<dyn std::error::Error>> {
     ///     let mut content_file = ContentFile::create("output.txt").await?;
diff --git a/crates/nvisy-core/src/fs/content_metadata.rs b/crates/nvisy-core/src/fs/content_metadata.rs
index 23d01da..8ab09c2 100644
--- a/crates/nvisy-core/src/fs/content_metadata.rs
+++ b/crates/nvisy-core/src/fs/content_metadata.rs
@@ -27,7 +27,7 @@ impl ContentMetadata {
     /// # Example
     ///
     /// ```
-    /// use nvisy_core::{fs::ContentMetadata, path::ContentSource};
+    /// use nvisy_rt_core::{fs::ContentMetadata, path::ContentSource};
     ///
     /// let source = ContentSource::new();
     /// let metadata = ContentMetadata::new(source);
@@ -45,7 +45,7 @@
     /// # Example
     ///
     /// ```
-    /// use nvisy_core::{fs::ContentMetadata, path::ContentSource};
+    /// use nvisy_rt_core::{fs::ContentMetadata, path::ContentSource};
     /// use std::path::PathBuf;
     ///
     /// let source = ContentSource::new();
diff --git a/crates/nvisy-core/src/fs/data_sensitivity.rs b/crates/nvisy-core/src/fs/data_sensitivity.rs
index b7e1a3b..a3252d8 100644
--- a/crates/nvisy-core/src/fs/data_sensitivity.rs
+++ b/crates/nvisy-core/src/fs/data_sensitivity.rs
@@ -17,7 +17,7 @@ use strum::{Display, EnumIter, EnumString};
 /// # Examples
 ///
 /// ```rust
-/// use nvisy_core::fs::DataSensitivity;
+/// use nvisy_rt_core::fs::DataSensitivity;
 ///
 /// let high = DataSensitivity::High;
 /// let medium = DataSensitivity::Medium;
diff --git a/crates/nvisy-core/src/fs/mod.rs b/crates/nvisy-core/src/fs/mod.rs
index c6386bd..bdc802c 100644
--- a/crates/nvisy-core/src/fs/mod.rs
+++ b/crates/nvisy-core/src/fs/mod.rs
@@ -13,8 +13,8 @@
 //! # Example
 //!
 //! ```no_run
-//! use nvisy_core::fs::ContentFile;
-//! use nvisy_core::io::ContentData;
+//! use nvisy_rt_core::fs::ContentFile;
+//! use nvisy_rt_core::io::ContentData;
 //!
 //! async fn example() -> Result<(), Box<dyn std::error::Error>> {
 //!     // Create a new file
diff --git a/crates/nvisy-core/src/io/content.rs b/crates/nvisy-core/src/io/content.rs
index 93de761..c0dd1c5 100644
--- a/crates/nvisy-core/src/io/content.rs
+++ b/crates/nvisy-core/src/io/content.rs
@@ -20,9 +20,9 @@ use crate::path::ContentSource;
 /// # Examples
 ///
 /// ```rust
-/// use nvisy_core::io::{Content, ContentData};
-/// use nvisy_core::fs::ContentMetadata;
-/// use nvisy_core::path::ContentSource;
+/// use nvisy_rt_core::io::{Content, ContentData};
+/// use nvisy_rt_core::fs::ContentMetadata;
+/// use nvisy_rt_core::path::ContentSource;
 ///
 /// // Create content from data
 /// let data = ContentData::from("Hello, world!");
@@ -99,7 +99,11 @@ impl Content {
         self.data.is_likely_text()
     }
 
-    /// Try to get the content as a string slice
+    /// Try to get the content as a string slice.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the content is not valid UTF-8.
     pub fn as_str(&self) -> Result<&str> {
         self.data.as_str()
     }
@@ -124,7 +128,7 @@
         self.metadata = None;
     }
 
-    /// Consume and return the inner ContentData
+    /// Consume and return the inner [`ContentData`].
     pub fn into_data(self) -> ContentData {
         self.data
     }
diff --git a/crates/nvisy-core/src/io/content_data.rs b/crates/nvisy-core/src/io/content_data.rs
index 8f41af3..f60a17f 100644
--- a/crates/nvisy-core/src/io/content_data.rs
+++ b/crates/nvisy-core/src/io/content_data.rs
@@ -171,7 +171,7 @@
     /// # Example
     ///
     /// ```
-    /// use nvisy_core::{io::ContentData, path::ContentSource};
+    /// use nvisy_rt_core::{io::ContentData, path::ContentSource};
     /// use bytes::Bytes;
     ///
     /// let source = ContentSource::new();
@@ -193,7 +193,7 @@
     /// # Example
     ///
     /// ```
-    /// use nvisy_core::{io::ContentData, path::ContentSource};
+    /// use nvisy_rt_core::{io::ContentData, path::ContentSource};
     ///
     /// let source = ContentSource::new();
     /// let content = ContentData::from_text(source, "Hello, world!");
diff --git a/crates/nvisy-core/src/io/content_read.rs b/crates/nvisy-core/src/io/content_read.rs
index f889aea..23d4216 100644
--- a/crates/nvisy-core/src/io/content_read.rs
+++ b/crates/nvisy-core/src/io/content_read.rs
@@ -27,7 +27,7 @@ pub trait AsyncContentRead: AsyncRead + Unpin + Send {
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::io::{AsyncContentRead, ContentData};
+    /// use nvisy_rt_core::io::{AsyncContentRead, ContentData};
     /// use tokio::fs::File;
     /// use std::io;
     ///
@@ -58,7 +58,7 @@
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::{io::{AsyncContentRead, ContentData}, path::ContentSource};
+    /// use nvisy_rt_core::{io::{AsyncContentRead, ContentData}, path::ContentSource};
     /// use tokio::fs::File;
     /// use std::io;
     ///
@@ -97,7 +97,7 @@
     /// # Example
     ///
     /// ```no_run
-    /// use nvisy_core::io::{AsyncContentRead, ContentData};
+    /// use nvisy_rt_core::io::{AsyncContentRead,
ContentData}; /// use tokio::fs::File; /// use std::io; /// @@ -155,7 +155,7 @@ pub trait AsyncContentRead: AsyncRead + Unpin + Send { /// # Example /// /// ```no_run - /// use nvisy_core::io::AsyncContentRead; + /// use nvisy_rt_core::io::AsyncContentRead; /// use tokio::fs::File; /// use bytes::Bytes; /// use std::io; diff --git a/crates/nvisy-core/src/io/content_write.rs b/crates/nvisy-core/src/io/content_write.rs index 99e749e..9e84912 100644 --- a/crates/nvisy-core/src/io/content_write.rs +++ b/crates/nvisy-core/src/io/content_write.rs @@ -25,8 +25,8 @@ pub trait AsyncContentWrite: AsyncWrite + Unpin + Send { /// # Example /// /// ```no_run - /// use nvisy_core::io::{AsyncContentWrite, ContentData}; - /// use nvisy_core::fs::ContentMetadata; + /// use nvisy_rt_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_rt_core::fs::ContentMetadata; /// use tokio::fs::File; /// use std::io; /// @@ -61,8 +61,8 @@ pub trait AsyncContentWrite: AsyncWrite + Unpin + Send { /// # Example /// /// ```no_run - /// use nvisy_core::io::{AsyncContentWrite, ContentData}; - /// use nvisy_core::fs::ContentMetadata; + /// use nvisy_rt_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_rt_core::fs::ContentMetadata; /// use tokio::fs::File; /// use std::path::PathBuf; /// use std::io; @@ -103,8 +103,8 @@ pub trait AsyncContentWrite: AsyncWrite + Unpin + Send { /// # Example /// /// ```no_run - /// use nvisy_core::io::{AsyncContentWrite, ContentData}; - /// use nvisy_core::fs::ContentMetadata; + /// use nvisy_rt_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_rt_core::fs::ContentMetadata; /// use tokio::fs::File; /// use std::io; /// @@ -145,8 +145,8 @@ pub trait AsyncContentWrite: AsyncWrite + Unpin + Send { /// # Example /// /// ```no_run - /// use nvisy_core::io::{AsyncContentWrite, ContentData}; - /// use nvisy_core::fs::ContentMetadata; + /// use nvisy_rt_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_rt_core::fs::ContentMetadata; /// use tokio::fs::File; /// use std::io; /// @@ -191,8 +191,8 @@ pub trait AsyncContentWrite: AsyncWrite + Unpin + Send { /// # Example /// /// ```no_run - /// use nvisy_core::io::{AsyncContentWrite, ContentData}; - /// use nvisy_core::fs::ContentMetadata; + /// use nvisy_rt_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_rt_core::fs::ContentMetadata; /// use tokio::fs::OpenOptions; /// use std::io; /// diff --git a/crates/nvisy-core/src/io/data_reference.rs b/crates/nvisy-core/src/io/data_reference.rs index cf98854..ed067b3 100644 --- a/crates/nvisy-core/src/io/data_reference.rs +++ b/crates/nvisy-core/src/io/data_reference.rs @@ -17,7 +17,7 @@ use crate::path::ContentSource; /// # Examples /// /// ```rust -/// use nvisy_core::io::{DataReference, Content, ContentData}; +/// use nvisy_rt_core::io::{DataReference, Content, ContentData}; /// /// let content = Content::new(ContentData::from("Hello, world!")); /// let data_ref = DataReference::new(content) @@ -95,9 +95,8 @@ impl DataReference { #[cfg(test)] mod tests { - use crate::io::ContentData; - use super::*; + use crate::io::ContentData; #[test] fn test_data_reference_creation() { diff --git a/crates/nvisy-core/src/path/source.rs b/crates/nvisy-core/src/path/source.rs index 49b2811..88efd35 100644 --- a/crates/nvisy-core/src/path/source.rs +++ b/crates/nvisy-core/src/path/source.rs @@ -28,7 +28,7 @@ impl ContentSource { /// # Example /// /// ``` - /// use nvisy_core::path::ContentSource; + /// use nvisy_rt_core::path::ContentSource; /// /// let source = ContentSource::new(); 
/// assert!(!source.as_uuid().is_nil()); @@ -52,7 +52,7 @@ impl ContentSource { /// # Example /// /// ``` - /// use nvisy_core::path::ContentSource; + /// use nvisy_rt_core::path::ContentSource; /// use uuid::Uuid; /// /// let source = ContentSource::new(); @@ -70,7 +70,7 @@ impl ContentSource { /// # Example /// /// ``` - /// use nvisy_core::path::ContentSource; + /// use nvisy_rt_core::path::ContentSource; /// /// let source = ContentSource::new(); /// let uuid = source.as_uuid(); @@ -86,7 +86,7 @@ impl ContentSource { /// # Example /// /// ``` - /// use nvisy_core::path::ContentSource; + /// use nvisy_rt_core::path::ContentSource; /// /// let source = ContentSource::new(); /// let id_str = source.to_string(); @@ -102,7 +102,7 @@ impl ContentSource { /// # Example /// /// ``` - /// use nvisy_core::path::ContentSource; + /// use nvisy_rt_core::path::ContentSource; /// /// let source = ContentSource::new(); /// let id_str = source.to_string(); @@ -122,7 +122,7 @@ impl ContentSource { /// # Example /// /// ``` - /// use nvisy_core::path::ContentSource; + /// use nvisy_rt_core::path::ContentSource; /// use std::time::{SystemTime, UNIX_EPOCH}; /// /// let source = ContentSource::new(); @@ -150,7 +150,7 @@ impl ContentSource { /// # Example /// /// ``` - /// use nvisy_core::path::ContentSource; + /// use nvisy_rt_core::path::ContentSource; /// use std::thread; /// use std::time::Duration; /// diff --git a/crates/nvisy-document/Cargo.toml b/crates/nvisy-document/Cargo.toml index 43a9dde..ac62698 100644 --- a/crates/nvisy-document/Cargo.toml +++ b/crates/nvisy-document/Cargo.toml @@ -1,7 +1,7 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-document" +name = "nvisy-rt-document" description = "Document abstraction layer for nvisy" readme = "./README.md" @@ -22,7 +22,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates -nvisy-core = { workspace = true } +nvisy-rt-core = { workspace = true } # Async runtime and I/O tokio = { workspace = true, features = ["sync", "io-util", "fs"] } @@ -43,6 +43,7 @@ thiserror = { workspace = true, features = ["std"] } # Macros derive_more = { workspace = true, features = ["display", "from", "into", "deref", "deref_mut", "as_ref", "constructor"] } +strum = { workspace = true, features = ["derive"] } [dev-dependencies] tokio = { workspace = true, features = ["rt", "macros"] } diff --git a/crates/nvisy-document/README.md b/crates/nvisy-document/README.md index 9b1d780..15024a2 100644 --- a/crates/nvisy-document/README.md +++ b/crates/nvisy-document/README.md @@ -14,14 +14,9 @@ operations like redaction, text replacement, splitting, and merging. - **[`Document`]** - A loaded document instance for reading document content. -- **[`EditableDocument`]** - Extension trait for documents that support editing. - - **[`Region`]** - Semantic units within a document (text blocks, images, tables) with stable IDs that persist across edit sessions. -- **[`EditOperation`]** - Edit commands that target regions by ID, - supporting undo/redo and batch operations. - ## Extension Traits Document implementations can optionally implement these extension traits: diff --git a/crates/nvisy-document/src/diff/change.rs b/crates/nvisy-document/src/diff/change.rs new file mode 100644 index 0000000..b61d11d --- /dev/null +++ b/crates/nvisy-document/src/diff/change.rs @@ -0,0 +1,110 @@ +//! Change types. + +use serde::{Deserialize, Serialize}; +use strum::{Display, EnumIs}; + +/// The kind of change detected between document versions. 
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    PartialEq,
+    Eq,
+    Hash,
+    Serialize,
+    Deserialize,
+    Display,
+    EnumIs
+)]
+#[serde(rename_all = "snake_case")]
+#[strum(serialize_all = "snake_case")]
+pub enum ChangeKind {
+    /// Content was added.
+    Added,
+    /// Content was removed.
+    Removed,
+    /// Content was modified.
+    Modified,
+    /// Content was moved to a different location.
+    Moved,
+}
+
+/// A generic change entry.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Change<T> {
+    /// The kind of change.
+    pub kind: ChangeKind,
+    /// The old value (for removed/modified).
+    pub old: Option<T>,
+    /// The new value (for added/modified).
+    pub new: Option<T>,
+}
+
+impl<T> Change<T> {
+    /// Creates a new addition change.
+    #[must_use]
+    pub fn added(value: T) -> Self {
+        Self {
+            kind: ChangeKind::Added,
+            old: None,
+            new: Some(value),
+        }
+    }
+
+    /// Creates a new removal change.
+    #[must_use]
+    pub fn removed(value: T) -> Self {
+        Self {
+            kind: ChangeKind::Removed,
+            old: Some(value),
+            new: None,
+        }
+    }
+
+    /// Creates a new modification change.
+    #[must_use]
+    pub fn modified(old: T, new: T) -> Self {
+        Self {
+            kind: ChangeKind::Modified,
+            old: Some(old),
+            new: Some(new),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_change_kind_display() {
+        assert_eq!(ChangeKind::Added.to_string(), "added");
+        assert_eq!(ChangeKind::Removed.to_string(), "removed");
+        assert_eq!(ChangeKind::Modified.to_string(), "modified");
+        assert_eq!(ChangeKind::Moved.to_string(), "moved");
+    }
+
+    #[test]
+    fn test_change_kind_predicates() {
+        assert!(ChangeKind::Added.is_added());
+        assert!(ChangeKind::Removed.is_removed());
+        assert!(ChangeKind::Modified.is_modified());
+        assert!(ChangeKind::Moved.is_moved());
+    }
+
+    #[test]
+    fn test_generic_change() {
+        let added: Change<i32> = Change::added(42);
+        assert!(added.kind.is_added());
+        assert_eq!(added.new, Some(42));
+
+        let removed: Change<&str> = Change::removed("gone");
+        assert!(removed.kind.is_removed());
+        assert_eq!(removed.old, Some("gone"));
+
+        let modified: Change<String> = Change::modified("old".into(), "new".into());
+        assert!(modified.kind.is_modified());
+        assert_eq!(modified.old, Some("old".into()));
+        assert_eq!(modified.new, Some("new".into()));
+    }
+}
diff --git a/crates/nvisy-document/src/diff/mod.rs b/crates/nvisy-document/src/diff/mod.rs
new file mode 100644
index 0000000..7ba1c1a
--- /dev/null
+++ b/crates/nvisy-document/src/diff/mod.rs
@@ -0,0 +1,18 @@
+//! Document diffing and comparison.
+//!
+//! This module provides types and utilities for comparing documents
+//! and tracking changes between document versions.
+
+mod change;
+mod region_change;
+mod result;
+
+pub use change::{Change, ChangeKind};
+pub use region_change::RegionChange;
+pub use result::Diff;
+
+/// A trait for computing diffs between documents.
+pub trait Differ {
+    /// Computes the difference between this document and another.
+    fn diff(&self, other: &Self) -> Diff;
+}
diff --git a/crates/nvisy-document/src/diff/region_change.rs b/crates/nvisy-document/src/diff/region_change.rs
new file mode 100644
index 0000000..77e776c
--- /dev/null
+++ b/crates/nvisy-document/src/diff/region_change.rs
@@ -0,0 +1,119 @@
+//! Region-specific change type.
+
+use serde::{Deserialize, Serialize};
+
+use super::ChangeKind;
+use crate::RegionId;
+
+/// A change to a specific region in a document.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct RegionChange {
+    /// The kind of change.
+    pub kind: ChangeKind,
+    /// The region ID in the old document (if applicable).
+    pub old_id: Option<RegionId>,
+    /// The region ID in the new document (if applicable).
+    pub new_id: Option<RegionId>,
+    /// The old text content (for removed/modified).
+    pub old_text: Option<String>,
+    /// The new text content (for added/modified).
+    pub new_text: Option<String>,
+}
+
+impl RegionChange {
+    /// Creates a new addition change.
+    #[must_use]
+    pub fn added(new_id: RegionId, text: impl Into<String>) -> Self {
+        Self {
+            kind: ChangeKind::Added,
+            old_id: None,
+            new_id: Some(new_id),
+            old_text: None,
+            new_text: Some(text.into()),
+        }
+    }
+
+    /// Creates a new removal change.
+    #[must_use]
+    pub fn removed(old_id: RegionId, text: impl Into<String>) -> Self {
+        Self {
+            kind: ChangeKind::Removed,
+            old_id: Some(old_id),
+            new_id: None,
+            old_text: Some(text.into()),
+            new_text: None,
+        }
+    }
+
+    /// Creates a new modification change.
+    #[must_use]
+    pub fn modified(
+        old_id: RegionId,
+        new_id: RegionId,
+        old_text: impl Into<String>,
+        new_text: impl Into<String>,
+    ) -> Self {
+        Self {
+            kind: ChangeKind::Modified,
+            old_id: Some(old_id),
+            new_id: Some(new_id),
+            old_text: Some(old_text.into()),
+            new_text: Some(new_text.into()),
+        }
+    }
+
+    /// Creates a new move change.
+    #[must_use]
+    pub fn moved(old_id: RegionId, new_id: RegionId, text: impl Into<String>) -> Self {
+        let text = text.into();
+        Self {
+            kind: ChangeKind::Moved,
+            old_id: Some(old_id),
+            new_id: Some(new_id),
+            old_text: Some(text.clone()),
+            new_text: Some(text),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_region_change_added() {
+        let id = RegionId::new();
+        let change = RegionChange::added(id, "new content");
+
+        assert_eq!(change.kind, ChangeKind::Added);
+        assert!(change.old_id.is_none());
+        assert_eq!(change.new_id, Some(id));
+        assert!(change.old_text.is_none());
+        assert_eq!(change.new_text, Some("new content".to_string()));
+    }
+
+    #[test]
+    fn test_region_change_removed() {
+        let id = RegionId::new();
+        let change = RegionChange::removed(id, "old content");
+
+        assert_eq!(change.kind, ChangeKind::Removed);
+        assert_eq!(change.old_id, Some(id));
+        assert!(change.new_id.is_none());
+        assert_eq!(change.old_text, Some("old content".to_string()));
+        assert!(change.new_text.is_none());
+    }
+
+    #[test]
+    fn test_region_change_modified() {
+        let old_id = RegionId::new();
+        let new_id = RegionId::new();
+        let change = RegionChange::modified(old_id, new_id, "old", "new");
+
+        assert_eq!(change.kind, ChangeKind::Modified);
+        assert_eq!(change.old_id, Some(old_id));
+        assert_eq!(change.new_id, Some(new_id));
+        assert_eq!(change.old_text, Some("old".to_string()));
+        assert_eq!(change.new_text, Some("new".to_string()));
+    }
+}
diff --git a/crates/nvisy-document/src/diff/result.rs b/crates/nvisy-document/src/diff/result.rs
new file mode 100644
index 0000000..ac933ca
--- /dev/null
+++ b/crates/nvisy-document/src/diff/result.rs
@@ -0,0 +1,72 @@
+//! Diff result type.
+
+use derive_more::{Deref, DerefMut};
+use serde::{Deserialize, Serialize};
+
+use super::RegionChange;
+
+/// The result of comparing two documents.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Deref, DerefMut)]
+pub struct Diff {
+    /// Changes to document regions.
+    #[deref]
+    #[deref_mut]
+    pub regions: Vec<RegionChange>,
+    /// Whether the documents are identical.
+    pub is_identical: bool,
+}
+
+impl Diff {
+    /// Creates a new empty diff indicating identical documents.
+ #[must_use] + pub fn identical() -> Self { + Self { + regions: Vec::new(), + is_identical: true, + } + } + + /// Creates a new diff with the given changes. + #[must_use] + pub fn with_changes(regions: Vec<RegionChange>) -> Self { + let is_identical = regions.is_empty(); + Self { + regions, + is_identical, + } + } + + /// Returns the number of changes. + #[must_use] + pub fn change_count(&self) -> usize { + self.regions.len() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::RegionId; + + #[test] + fn test_diff_identical() { + let diff = Diff::identical(); + + assert!(diff.is_identical); + assert!(diff.regions.is_empty()); + } + + #[test] + fn test_diff_with_changes() { + let changes = vec![ + RegionChange::added(RegionId::new(), "added"), + RegionChange::removed(RegionId::new(), "removed"), + RegionChange::modified(RegionId::new(), RegionId::new(), "old", "new"), + ]; + + let diff = Diff::with_changes(changes); + + assert!(!diff.is_identical); + assert_eq!(diff.change_count(), 3); + } +} diff --git a/crates/nvisy-document/src/format/mod.rs b/crates/nvisy-document/src/format/mod.rs index aa6915a..2e886e4 100644 --- a/crates/nvisy-document/src/format/mod.rs +++ b/crates/nvisy-document/src/format/mod.rs @@ -20,7 +20,7 @@ pub use capabilities::{ Capabilities, MetadataCapabilities, StructureCapabilities, TextCapabilities, }; pub use info::DocumentInfo; -pub use nvisy_core::io::ContentData; +pub use nvisy_rt_core::io::ContentData; pub use page::PageOptions; pub use region::{BoundingBox, Point, Region, RegionId, RegionKind, RegionSource, RegionStatus}; diff --git a/crates/nvisy-document/src/format/region/mod.rs b/crates/nvisy-document/src/format/region/mod.rs index e9bde8e..b334549 100644 --- a/crates/nvisy-document/src/format/region/mod.rs +++ b/crates/nvisy-document/src/format/region/mod.rs @@ -10,8 +10,9 @@ mod kind; mod source; mod status; -pub use bounds::{BoundingBox, Point}; pub use core::Region; + +pub use bounds::{BoundingBox, Point}; pub use id::RegionId; pub use kind::RegionKind; pub use source::RegionSource; diff --git a/crates/nvisy-document/src/lib.rs b/crates/nvisy-document/src/lib.rs index 18560e8..b27f9b5 100644 --- a/crates/nvisy-document/src/lib.rs +++ b/crates/nvisy-document/src/lib.rs @@ -3,6 +3,7 @@ #![doc = include_str!("../README.md")] // Core modules +pub mod diff; pub mod format; // Extension trait modules @@ -12,13 +13,11 @@ pub mod table; pub mod text; pub mod thumbnail; -// Error re-exports from nvisy-core -pub use nvisy_core::error::{BoxError, Error, ErrorResource, ErrorType, Result}; - pub use conversion::{ Conversion, ConversionOptions, ConversionPath, ConversionResult, ConversionStep, FormatPair, HtmlOptions, PageMargins, PageOrientation, PdfOptions, SkippedElement, }; +pub use diff::{Change, ChangeKind, Diff, Differ, RegionChange}; pub use format::region::{ BoundingBox, Point, Region, RegionId, RegionKind, RegionSource, RegionStatus, }; @@ -30,6 +29,7 @@ pub use metadata::{ CustomProperty, DocumentMetadata, Metadata, MetadataExtractOptions, MetadataField, PropertyValue, }; +pub use nvisy_rt_core::error::{BoxError, Error, ErrorResource, ErrorType, Result}; pub use table::{CellDataType, NormalizedCell, NormalizedRow, NormalizedTable, TableExtractor}; pub use text::{ExtractedText, TextExtractor}; pub use thumbnail::{ImageFormat, Thumbnail, ThumbnailGenerator, ThumbnailOptions, ThumbnailSize}; diff --git a/crates/nvisy-document/src/table/mod.rs b/crates/nvisy-document/src/table/mod.rs index 80f82aa..c5bab68 100644 --- a/crates/nvisy-document/src/table/mod.rs +++
b/crates/nvisy-document/src/table/mod.rs @@ -20,7 +20,7 @@ use crate::format::{Document, Region, RegionKind}; /// # Example /// /// ```ignore -/// use nvisy_document::{Document, TableExtractor, NormalizedTable}; +/// use nvisy_rt_document::{Document, TableExtractor, NormalizedTable}; /// /// async fn process_tables<D>(doc: &D) -> Result<Vec<NormalizedTable>> /// where diff --git a/crates/nvisy-document/src/text/mod.rs b/crates/nvisy-document/src/text/mod.rs index 319380d..e53c99c 100644 --- a/crates/nvisy-document/src/text/mod.rs +++ b/crates/nvisy-document/src/text/mod.rs @@ -20,7 +20,7 @@ use crate::format::Document; /// # Example /// /// ```ignore -/// use nvisy_document::{Document, TextExtractor, ExtractedText}; +/// use nvisy_rt_document::{Document, TextExtractor, ExtractedText}; /// /// async fn extract_document_text<D>(doc: &D) -> Result<ExtractedText> /// where diff --git a/crates/nvisy-document/src/thumbnail/options.rs b/crates/nvisy-document/src/thumbnail/options.rs index 5ee44dd..b7d8d13 100644 --- a/crates/nvisy-document/src/thumbnail/options.rs +++ b/crates/nvisy-document/src/thumbnail/options.rs @@ -166,10 +166,10 @@ impl ThumbnailOptions { return Err("render_dpi exceeds maximum of 600".to_string()); } - if let Some(ref bg) = self.background { - if bg.len() != 6 || !bg.chars().all(|c| c.is_ascii_hexdigit()) { - return Err("background must be a 6-character hex RGB value".to_string()); - } + if let Some(ref bg) = self.background + && (bg.len() != 6 || !bg.chars().all(|c| c.is_ascii_hexdigit())) + { + return Err("background must be a 6-character hex RGB value".to_string()); } if self.size.max_width() > 4096 || self.size.max_height() > 4096 { diff --git a/crates/nvisy-docx/Cargo.toml b/crates/nvisy-docx/Cargo.toml index f4b66cd..9e0b55c 100644 --- a/crates/nvisy-docx/Cargo.toml +++ b/crates/nvisy-docx/Cargo.toml @@ -1,7 +1,7 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-docx" +name = "nvisy-rt-docx" description = "DOCX document format support for nvisy" readme = "./README.md" @@ -21,7 +21,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [dependencies] -nvisy-document = { workspace = true } +nvisy-rt-document = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } diff --git a/crates/nvisy-docx/src/document.rs b/crates/nvisy-docx/src/document.rs index c82a00c..7854e89 100644 --- a/crates/nvisy-docx/src/document.rs +++ b/crates/nvisy-docx/src/document.rs @@ -2,7 +2,7 @@ use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; +use nvisy_rt_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; /// A loaded DOCX document. #[derive(Debug)] diff --git a/crates/nvisy-docx/src/format.rs b/crates/nvisy-docx/src/format.rs index ba50278..0f0559b 100644 --- a/crates/nvisy-docx/src/format.rs +++ b/crates/nvisy-docx/src/format.rs @@ -1,6 +1,6 @@ //! DOCX format handler implementation.
-use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; +use nvisy_rt_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; use crate::DocxDocument; diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index 5fb8c27..0dda308 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -1,7 +1,7 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-engine" +name = "nvisy-rt-engine" description = "Document processing engine for nvisy" readme = "./README.md" @@ -22,19 +22,19 @@ rustdoc-args = ["--cfg", "docsrs"] [features] default = ["pdf", "docx", "text", "image"] -pdf = ["dep:nvisy-pdf"] -docx = ["dep:nvisy-docx"] -text = ["dep:nvisy-text"] -image = ["dep:nvisy-image"] +pdf = ["dep:nvisy-rt-pdf"] +docx = ["dep:nvisy-rt-docx"] +text = ["dep:nvisy-rt-text"] +image = ["dep:nvisy-rt-image"] [dependencies] # Internal crates -nvisy-archive = { workspace = true } -nvisy-document = { workspace = true } -nvisy-docx = { workspace = true, optional = true } -nvisy-image = { workspace = true, optional = true } -nvisy-pdf = { workspace = true, optional = true } -nvisy-text = { workspace = true, optional = true } +nvisy-rt-archive = { workspace = true } +nvisy-rt-document = { workspace = true } +nvisy-rt-docx = { workspace = true, optional = true } +nvisy-rt-image = { workspace = true, optional = true } +nvisy-rt-pdf = { workspace = true, optional = true } +nvisy-rt-text = { workspace = true, optional = true } # Data types bytes = { workspace = true } @@ -44,6 +44,9 @@ jiff = { workspace = true, features = ["std"] } # Serialization serde = { workspace = true, features = ["std", "derive"] } +# Observability +tracing = { workspace = true } + [dev-dependencies] serde_json = { workspace = true, features = ["std"] } tokio = { workspace = true, features = ["rt", "macros"] } diff --git a/crates/nvisy-engine/src/engine/mod.rs b/crates/nvisy-engine/src/engine/mod.rs index 5b47267..eb8c258 100644 --- a/crates/nvisy-engine/src/engine/mod.rs +++ b/crates/nvisy-engine/src/engine/mod.rs @@ -8,9 +8,11 @@ mod config; use std::path::Path; pub use config::EngineConfig; -use nvisy_document::{ContentData, Result}; +use nvisy_rt_document::{ContentData, Result}; +use tracing::{debug, info}; -use crate::registry::{BoxDocument, FormatRegistry}; +use crate::TRACING_TARGET_ENGINE; +use crate::registry::{FormatRegistry, LoadedDocument}; /// The central document processing engine. /// @@ -22,7 +24,7 @@ use crate::registry::{BoxDocument, FormatRegistry}; /// # Example /// /// ```ignore -/// use nvisy_engine::Engine; +/// use nvisy_rt_engine::Engine; /// /// let engine = Engine::new(); /// @@ -48,6 +50,7 @@ impl Engine { /// Creates a new engine with default configuration and all default formats. #[must_use] pub fn new() -> Self { + info!(target: TRACING_TARGET_ENGINE, "Creating engine with default configuration"); Self { config: EngineConfig::default(), registry: FormatRegistry::with_defaults(), @@ -57,6 +60,7 @@ impl Engine { /// Creates a new engine with the specified configuration. #[must_use] pub fn with_config(config: EngineConfig) -> Self { + debug!(target: TRACING_TARGET_ENGINE, ?config, "Creating engine with custom configuration"); Self { config, registry: FormatRegistry::with_defaults(), @@ -66,6 +70,7 @@ impl Engine { /// Creates a new engine with a custom registry. 
#[must_use] pub fn with_registry(registry: FormatRegistry) -> Self { + debug!(target: TRACING_TARGET_ENGINE, "Creating engine with custom registry"); Self { config: EngineConfig::default(), registry, @@ -75,6 +80,7 @@ impl Engine { /// Creates a new engine with custom configuration and registry. #[must_use] pub fn with_config_and_registry(config: EngineConfig, registry: FormatRegistry) -> Self { + debug!(target: TRACING_TARGET_ENGINE, ?config, "Creating engine with custom configuration and registry"); Self { config, registry } } @@ -108,7 +114,7 @@ impl Engine { /// - The file has no extension /// - The extension is not supported /// - The document fails to load - pub async fn load_file<P: AsRef<Path>>(&self, path: P) -> Result<BoxDocument> { + pub async fn load_file<P: AsRef<Path>>(&self, path: P) -> Result<LoadedDocument> { self.registry.load_file(path).await } @@ -117,7 +123,7 @@ impl Engine { /// # Errors /// /// Returns an error if the extension is not supported or loading fails. - pub async fn load_by_extension(&self, ext: &str, data: ContentData) -> Result<BoxDocument> { + pub async fn load_by_extension(&self, ext: &str, data: ContentData) -> Result<LoadedDocument> { self.registry.load_by_extension(ext, data).await } @@ -126,7 +132,7 @@ impl Engine { /// # Errors /// /// Returns an error if the MIME type is not supported or loading fails. - pub async fn load_by_mime(&self, mime: &str, data: ContentData) -> Result<BoxDocument> { + pub async fn load_by_mime(&self, mime: &str, data: ContentData) -> Result<LoadedDocument> { self.registry.load_by_mime(mime, data).await } @@ -163,6 +169,7 @@ impl Default for Engine { impl Clone for Engine { fn clone(&self) -> Self { + debug!(target: TRACING_TARGET_ENGINE, "Cloning engine"); Self { config: self.config.clone(), registry: FormatRegistry::with_defaults(), @@ -283,7 +290,7 @@ mod tests { let mut registry = FormatRegistry::new(); #[cfg(feature = "text")] - registry.register(nvisy_text::JsonFormat::new()); + registry.register(nvisy_rt_text::JsonFormat::new()); let engine = Engine::with_registry(registry); diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index c4d83a1..768f829 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -7,9 +7,21 @@ pub mod registry; pub mod session; pub use engine::{Engine, EngineConfig}; -pub use nvisy_document::{ +pub use nvisy_rt_document::{ self as doc, BoundingBox, Capabilities, Document, DocumentFormat, Point, Region, RegionId, RegionKind, }; -pub use registry::{AnyFormat, BoxDocument, FormatRegistry}; +pub use registry::{FormatRef, FormatRegistry, LoadedDocument}; pub use session::{AccessEntry, AccessHistory, ReadSession, SessionConfig, SessionId}; + +/// Tracing target for engine operations. +pub const TRACING_TARGET_ENGINE: &str = "nvisy_rt_engine::engine"; + +/// Tracing target for format registry operations. +pub const TRACING_TARGET_REGISTRY: &str = "nvisy_rt_engine::registry"; + +/// Tracing target for session management. +pub const TRACING_TARGET_SESSION: &str = "nvisy_rt_engine::session"; + +/// Tracing target for document loading. +pub const TRACING_TARGET_LOAD: &str = "nvisy_rt_engine::load"; diff --git a/crates/nvisy-engine/src/registry/format_ref.rs b/crates/nvisy-engine/src/registry/format_ref.rs new file mode 100644 index 0000000..784add6 --- /dev/null +++ b/crates/nvisy-engine/src/registry/format_ref.rs @@ -0,0 +1,114 @@ +//! Format reference types. + +use std::pin::Pin; + +use nvisy_rt_document::{Capabilities, ContentData, Document, DocumentFormat, Result}; + +use super::LoadedDocument; + +/// Internal type alias for boxed documents.
+pub(crate) type BoxDocument = Box<dyn Document + Send + Sync>; + +/// A type-erased format handler. +pub(crate) trait AnyFormat: Send + Sync { + fn name(&self) -> &'static str; + fn mime_types(&self) -> &'static [&'static str]; + fn extensions(&self) -> &'static [&'static str]; + fn capabilities(&self) -> &Capabilities; + fn load_boxed( + &self, + data: ContentData, + ) -> Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>>; +} + +/// Wrapper that implements `AnyFormat` for any `DocumentFormat`. +pub(crate) struct FormatWrapper<F> { + pub(crate) inner: F, +} + +impl<F> AnyFormat for FormatWrapper<F> +where + F: DocumentFormat + Send + Sync + 'static, + F::Document: Send + Sync + 'static, +{ + fn name(&self) -> &'static str { + DocumentFormat::name(&self.inner) + } + + fn mime_types(&self) -> &'static [&'static str] { + DocumentFormat::mime_types(&self.inner) + } + + fn extensions(&self) -> &'static [&'static str] { + DocumentFormat::extensions(&self.inner) + } + + fn capabilities(&self) -> &Capabilities { + DocumentFormat::capabilities(&self.inner) + } + + fn load_boxed( + &self, + data: ContentData, + ) -> Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>> { + Box::pin(async move { + let doc = DocumentFormat::load(&self.inner, data).await?; + Ok(Box::new(doc) as BoxDocument) + }) + } +} + +/// A borrowed reference to a format handler. +/// +/// Provides access to format metadata and document loading. +#[derive(Clone, Copy)] +pub struct FormatRef<'a> { + inner: &'a dyn AnyFormat, +} + +impl std::fmt::Debug for FormatRef<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FormatRef") + .field("name", &self.name()) + .field("extensions", &self.extensions()) + .field("mime_types", &self.mime_types()) + .finish() + } +} + +impl<'a> FormatRef<'a> { + /// Creates a new format reference. + pub(crate) fn new(inner: &'a dyn AnyFormat) -> Self { + Self { inner } + } + + /// Returns the format name. + #[must_use] + pub fn name(&self) -> &'static str { + self.inner.name() + } + + /// Returns supported MIME types. + #[must_use] + pub fn mime_types(&self) -> &'static [&'static str] { + self.inner.mime_types() + } + + /// Returns supported file extensions. + #[must_use] + pub fn extensions(&self) -> &'static [&'static str] { + self.inner.extensions() + } + + /// Returns the format capabilities. + #[must_use] + pub fn capabilities(&self) -> &Capabilities { + self.inner.capabilities() + } + + /// Loads a document from content data. + pub async fn load(&self, data: ContentData) -> Result<LoadedDocument> { + let doc = self.inner.load_boxed(data).await?; + Ok(LoadedDocument::new(doc)) + } +} diff --git a/crates/nvisy-engine/src/registry/format_registry.rs b/crates/nvisy-engine/src/registry/format_registry.rs new file mode 100644 index 0000000..4dc8b08 --- /dev/null +++ b/crates/nvisy-engine/src/registry/format_registry.rs @@ -0,0 +1,343 @@ +//! Format registry implementation. + +use std::collections::HashMap; +use std::sync::Arc; + +use nvisy_rt_document::{ContentData, DocumentFormat, Error, Result}; +use tracing::{debug, instrument, trace, warn}; + +use super::format_ref::{AnyFormat, FormatWrapper}; +use super::{FormatRef, LoadedDocument}; +use crate::{TRACING_TARGET_LOAD, TRACING_TARGET_REGISTRY}; + +/// Registry entry containing a format handler. +struct RegistryEntry { + format: Arc<dyn AnyFormat>, +} + +/// A registry of document formats. +/// +/// The registry maintains mappings from file extensions and MIME types +/// to format handlers, enabling dynamic document loading.
+/// +/// # Example +/// +/// ```ignore +/// use nvisy_rt_engine::FormatRegistry; +/// +/// let registry = FormatRegistry::with_defaults(); +/// +/// // Load by file path +/// let doc = registry.load_file("document.pdf").await?; +/// +/// // Load by extension +/// let doc = registry.load_by_extension("json", data).await?; +/// ``` +#[derive(Default)] +pub struct FormatRegistry { + /// All registered formats. + formats: Vec<RegistryEntry>, + + /// Extension to format index mapping. + by_extension: HashMap<&'static str, usize>, + + /// MIME type to format index mapping. + by_mime: HashMap<&'static str, usize>, +} + +impl FormatRegistry { + /// Creates an empty registry. + #[must_use] + pub fn new() -> Self { + trace!(target: TRACING_TARGET_REGISTRY, "Creating empty format registry"); + Self { + formats: Vec::new(), + by_extension: HashMap::new(), + by_mime: HashMap::new(), + } + } + + /// Creates a registry with all default formats registered. + #[must_use] + pub fn with_defaults() -> Self { + let mut registry = Self::new(); + registry.register_defaults(); + debug!( + target: TRACING_TARGET_REGISTRY, + formats = registry.formats.len(), + extensions = registry.by_extension.len(), + mime_types = registry.by_mime.len(), + "Initialized registry with default formats" + ); + registry + } + + /// Registers all default formats based on enabled features. + pub fn register_defaults(&mut self) { + #[cfg(feature = "pdf")] + self.register(nvisy_rt_pdf::PdfFormat::new()); + + #[cfg(feature = "docx")] + self.register(nvisy_rt_docx::DocxFormat::new()); + + #[cfg(feature = "text")] + { + self.register(nvisy_rt_text::PlainTextFormat::new()); + self.register(nvisy_rt_text::MarkdownFormat::new()); + self.register(nvisy_rt_text::JsonFormat::new()); + self.register(nvisy_rt_text::CsvFormat::new()); + self.register(nvisy_rt_text::XmlFormat::new()); + self.register(nvisy_rt_text::YamlFormat::new()); + self.register(nvisy_rt_text::TomlFormat::new()); + self.register(nvisy_rt_text::IniFormat::new()); + } + + #[cfg(feature = "image")] + { + self.register(nvisy_rt_image::JpegFormat::new()); + self.register(nvisy_rt_image::PngFormat::new()); + } + } + + /// Registers a format handler. + /// + /// Extensions and MIME types from the format are automatically indexed. + /// If an extension or MIME type is already registered, the new format + /// takes precedence. + pub fn register<F>(&mut self, format: F) + where + F: DocumentFormat + Send + Sync + 'static, + F::Document: Send + Sync + 'static, + { + let wrapper = FormatWrapper { inner: format }; + let index = self.formats.len(); + let format: Arc<dyn AnyFormat> = Arc::new(wrapper); + + let name = format.name(); + let extensions = format.extensions(); + let mime_types = format.mime_types(); + + // Index by extension + for ext in extensions { + self.by_extension.insert(ext, index); + } + + // Index by MIME type + for mime in mime_types { + self.by_mime.insert(mime, index); + } + + self.formats.push(RegistryEntry { format }); + + trace!( + target: TRACING_TARGET_REGISTRY, + name, + ?extensions, + ?mime_types, + "Registered format" + ); + } + + /// Returns the format handler for a file extension. + #[must_use] + pub fn get_by_extension(&self, ext: &str) -> Option<FormatRef<'_>> { + let ext = ext.trim_start_matches('.').to_lowercase(); + self.by_extension + .get(ext.as_str()) + .and_then(|&idx| self.formats.get(idx)) + .map(|e| FormatRef::new(e.format.as_ref())) + } + + /// Returns the format handler for a MIME type.
+ #[must_use] + pub fn get_by_mime(&self, mime: &str) -> Option<FormatRef<'_>> { + let mime = mime.to_lowercase(); + self.by_mime + .get(mime.as_str()) + .and_then(|&idx| self.formats.get(idx)) + .map(|e| FormatRef::new(e.format.as_ref())) + } + + /// Checks if an extension is supported. + #[must_use] + pub fn supports_extension(&self, ext: &str) -> bool { + let ext = ext.trim_start_matches('.').to_lowercase(); + self.by_extension.contains_key(ext.as_str()) + } + + /// Checks if a MIME type is supported. + #[must_use] + pub fn supports_mime(&self, mime: &str) -> bool { + let mime = mime.to_lowercase(); + self.by_mime.contains_key(mime.as_str()) + } + + /// Returns all supported file extensions. + #[must_use] + pub fn supported_extensions(&self) -> Vec<&'static str> { + self.by_extension.keys().copied().collect() + } + + /// Returns all supported MIME types. + #[must_use] + pub fn supported_mime_types(&self) -> Vec<&'static str> { + self.by_mime.keys().copied().collect() + } + + /// Returns all registered formats. + #[must_use] + pub fn formats(&self) -> Vec<FormatRef<'_>> { + self.formats + .iter() + .map(|e| FormatRef::new(e.format.as_ref())) + .collect() + } + + /// Loads a document by file extension. + /// + /// # Errors + /// + /// Returns an error if: + /// - The extension is not supported + /// - The document fails to load + #[instrument(target = TRACING_TARGET_LOAD, skip(self, data), fields(size = data.size()))] + pub async fn load_by_extension(&self, ext: &str, data: ContentData) -> Result<LoadedDocument> { + let format = self.get_by_extension(ext).ok_or_else(|| { + warn!(target: TRACING_TARGET_LOAD, ext, "Unsupported extension"); + Error::new(format!("Unsupported extension: {}", ext)) + })?; + + debug!(target: TRACING_TARGET_LOAD, ext, format = format.name(), "Loading document"); + format.load(data).await + } + + /// Loads a document by MIME type. + /// + /// # Errors + /// + /// Returns an error if: + /// - The MIME type is not supported + /// - The document fails to load + #[instrument(target = TRACING_TARGET_LOAD, skip(self, data), fields(size = data.size()))] + pub async fn load_by_mime(&self, mime: &str, data: ContentData) -> Result<LoadedDocument> { + let format = self.get_by_mime(mime).ok_or_else(|| { + warn!(target: TRACING_TARGET_LOAD, mime, "Unsupported MIME type"); + Error::new(format!("Unsupported MIME type: {}", mime)) + })?; + + debug!(target: TRACING_TARGET_LOAD, mime, format = format.name(), "Loading document"); + format.load(data).await + } + + /// Loads a document from a file path. + /// + /// The format is determined by the file extension.
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be read + /// - The file has no extension + /// - The extension is not supported + /// - The document fails to load + #[instrument(target = TRACING_TARGET_LOAD, skip(self), fields(path = %path.as_ref().display()))] + pub async fn load_file<P: AsRef<std::path::Path>>(&self, path: P) -> Result<LoadedDocument> { + let path = path.as_ref(); + + let ext = path.extension().and_then(|e| e.to_str()).ok_or_else(|| { + warn!(target: TRACING_TARGET_LOAD, path = %path.display(), "File has no extension"); + Error::new("File has no extension") + })?; + + debug!(target: TRACING_TARGET_LOAD, path = %path.display(), ext, "Reading file"); + + let data = std::fs::read(path).map_err(|e| { + warn!(target: TRACING_TARGET_LOAD, path = %path.display(), error = %e, "Failed to read file"); + Error::from_source(format!("Failed to read file '{}'", path.display()), e) + })?; + + self.load_by_extension(ext, ContentData::from(data)).await + } +} + +impl std::fmt::Debug for FormatRegistry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FormatRegistry") + .field("formats", &self.formats.len()) + .field("extensions", &self.by_extension.keys().collect::<Vec<_>>()) + .field("mime_types", &self.by_mime.keys().collect::<Vec<_>>()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_registry_creation() { + let registry = FormatRegistry::new(); + assert!(registry.formats().is_empty()); + } + + #[test] + fn test_registry_with_defaults() { + let registry = FormatRegistry::with_defaults(); + assert!(!registry.formats().is_empty()); + + #[cfg(feature = "text")] + { + assert!(registry.supports_extension("txt")); + assert!(registry.supports_extension("json")); + assert!(registry.supports_extension("md")); + } + } + + #[test] + fn test_get_by_extension() { + let registry = FormatRegistry::with_defaults(); + + #[cfg(feature = "text")] + { + let format = registry.get_by_extension("json").unwrap(); + assert_eq!(format.name(), "json"); + + let format = registry.get_by_extension(".JSON").unwrap(); + assert_eq!(format.name(), "json"); + } + + assert!(registry.get_by_extension("xyz").is_none()); + } + + #[cfg(feature = "text")] + #[tokio::test] + async fn test_load_by_extension() { + let registry = FormatRegistry::with_defaults(); + + let doc = registry + .load_by_extension("json", ContentData::from(r#"{"key": "value"}"#)) + .await + .unwrap(); + + assert!(!doc.regions().is_empty()); + } + + #[cfg(feature = "text")] + #[tokio::test] + async fn test_load_by_mime() { + let registry = FormatRegistry::with_defaults(); + + let doc = registry + .load_by_mime("application/json", ContentData::from(r#"{"key": "value"}"#)) + .await + .unwrap(); + + assert!(!doc.regions().is_empty()); + } + + #[test] + fn test_unsupported_extension() { + let registry = FormatRegistry::with_defaults(); + assert!(!registry.supports_extension("xyz")); + } +} diff --git a/crates/nvisy-engine/src/registry/loaded_document.rs b/crates/nvisy-engine/src/registry/loaded_document.rs new file mode 100644 index 0000000..396041d --- /dev/null +++ b/crates/nvisy-engine/src/registry/loaded_document.rs @@ -0,0 +1,41 @@ +//! Loaded document wrapper. + +use nvisy_rt_document::Document; + +use super::format_ref::BoxDocument; + +/// A loaded document from the registry. +/// +/// This struct wraps a type-erased document and provides access +/// to common document operations through `Deref`.
+pub struct LoadedDocument { + inner: BoxDocument, +} + +impl LoadedDocument { + /// Creates a new loaded document. + pub(crate) fn new(inner: BoxDocument) -> Self { + Self { inner } + } + + /// Consumes this wrapper and returns the inner boxed document. + pub fn into_inner(self) -> Box<dyn Document + Send + Sync> { + self.inner + } +} + +impl std::fmt::Debug for LoadedDocument { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LoadedDocument") + .field("info", self.inner.info()) + .finish() + } +} + +impl std::ops::Deref for LoadedDocument { + type Target = dyn Document + Send + Sync; + + fn deref(&self) -> &Self::Target { + self.inner.as_ref() + } +} diff --git a/crates/nvisy-engine/src/registry/mod.rs b/crates/nvisy-engine/src/registry/mod.rs index fc3d854..7c09587 100644 --- a/crates/nvisy-engine/src/registry/mod.rs +++ b/crates/nvisy-engine/src/registry/mod.rs @@ -4,378 +4,10 @@ //! to be loaded by extension or MIME type without knowing the concrete //! format at compile time. -use std::collections::HashMap; -use std::sync::Arc; +mod format_ref; +mod format_registry; +mod loaded_document; -use nvisy_document::{Capabilities, ContentData, Document, Error, Result}; - -/// A type-erased document that can be used for common operations. -pub type BoxDocument = Box<dyn Document + Send + Sync>; - -/// A type-erased format handler. -/// -/// This trait provides a common interface for all format handlers, -/// enabling dynamic dispatch and runtime format selection. -pub trait AnyFormat: Send + Sync { - /// Returns the format name. - fn name(&self) -> &'static str; - - /// Returns supported MIME types. - fn mime_types(&self) -> &'static [&'static str]; - - /// Returns supported file extensions. - fn extensions(&self) -> &'static [&'static str]; - - /// Returns the format capabilities. - fn capabilities(&self) -> &Capabilities; - - /// Loads a document from content data, returning a type-erased document. - fn load_boxed( - &self, - data: ContentData, - ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>>; -} - -/// Wrapper that implements AnyFormat for any DocumentFormat. -struct FormatWrapper<F> { - inner: F, -} - -impl<F> AnyFormat for FormatWrapper<F> -where - F: nvisy_document::DocumentFormat + Send + Sync + 'static, - F::Document: Send + Sync + 'static, -{ - fn name(&self) -> &'static str { - nvisy_document::DocumentFormat::name(&self.inner) - } - - fn mime_types(&self) -> &'static [&'static str] { - nvisy_document::DocumentFormat::mime_types(&self.inner) - } - - fn extensions(&self) -> &'static [&'static str] { - nvisy_document::DocumentFormat::extensions(&self.inner) - } - - fn capabilities(&self) -> &Capabilities { - nvisy_document::DocumentFormat::capabilities(&self.inner) - } - - fn load_boxed( - &self, - data: ContentData, - ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<BoxDocument>> + Send + '_>> { - Box::pin(async move { - let doc = nvisy_document::DocumentFormat::load(&self.inner, data).await?; - Ok(Box::new(doc) as BoxDocument) - }) - } -} - -/// Registry entry containing a format handler. -struct RegistryEntry { - format: Arc<dyn AnyFormat>, -} - -/// A registry of document formats. -/// -/// The registry maintains mappings from file extensions and MIME types -/// to format handlers, enabling dynamic document loading.
-/// -/// # Example -/// -/// ```ignore -/// use nvisy_engine::FormatRegistry; -/// -/// let registry = FormatRegistry::with_defaults(); -/// -/// // Load by file path -/// let doc = registry.load_file("document.pdf").await?; -/// -/// // Load by extension -/// let doc = registry.load_by_extension("json", data).await?; -/// ``` -#[derive(Default)] -pub struct FormatRegistry { - /// All registered formats. - formats: Vec<RegistryEntry>, - - /// Extension to format index mapping. - by_extension: HashMap<&'static str, usize>, - - /// MIME type to format index mapping. - by_mime: HashMap<&'static str, usize>, -} - -impl FormatRegistry { - /// Creates an empty registry. - #[must_use] - pub fn new() -> Self { - Self { - formats: Vec::new(), - by_extension: HashMap::new(), - by_mime: HashMap::new(), - } - } - - /// Creates a registry with all default formats registered. - #[must_use] - pub fn with_defaults() -> Self { - let mut registry = Self::new(); - registry.register_defaults(); - registry - } - - /// Registers all default formats based on enabled features. - pub fn register_defaults(&mut self) { - #[cfg(feature = "pdf")] - self.register(nvisy_pdf::PdfFormat::new()); - - #[cfg(feature = "docx")] - self.register(nvisy_docx::DocxFormat::new()); - - #[cfg(feature = "text")] - { - self.register(nvisy_text::PlainTextFormat::new()); - self.register(nvisy_text::MarkdownFormat::new()); - self.register(nvisy_text::JsonFormat::new()); - self.register(nvisy_text::CsvFormat::new()); - self.register(nvisy_text::XmlFormat::new()); - self.register(nvisy_text::YamlFormat::new()); - self.register(nvisy_text::TomlFormat::new()); - self.register(nvisy_text::IniFormat::new()); - } - - #[cfg(feature = "image")] - { - self.register(nvisy_image::JpegFormat::new()); - self.register(nvisy_image::PngFormat::new()); - } - } - - /// Registers a format handler. - /// - /// Extensions and MIME types from the format are automatically indexed. - /// If an extension or MIME type is already registered, the new format - /// takes precedence. - pub fn register<F>(&mut self, format: F) - where - F: nvisy_document::DocumentFormat + Send + Sync + 'static, - F::Document: Send + Sync + 'static, - { - let wrapper = FormatWrapper { inner: format }; - let index = self.formats.len(); - let format: Arc<dyn AnyFormat> = Arc::new(wrapper); - - // Index by extension - for ext in format.extensions() { - self.by_extension.insert(ext, index); - } - - // Index by MIME type - for mime in format.mime_types() { - self.by_mime.insert(mime, index); - } - - self.formats.push(RegistryEntry { format }); - } - - /// Returns the format handler for a file extension. - #[must_use] - pub fn get_by_extension(&self, ext: &str) -> Option<&dyn AnyFormat> { - let ext = ext.trim_start_matches('.').to_lowercase(); - self.by_extension - .get(ext.as_str()) - .and_then(|&idx| self.formats.get(idx)) - .map(|e| e.format.as_ref()) - } - - /// Returns the format handler for a MIME type. - #[must_use] - pub fn get_by_mime(&self, mime: &str) -> Option<&dyn AnyFormat> { - let mime = mime.to_lowercase(); - self.by_mime - .get(mime.as_str()) - .and_then(|&idx| self.formats.get(idx)) - .map(|e| e.format.as_ref()) - } - - /// Checks if an extension is supported. - #[must_use] - pub fn supports_extension(&self, ext: &str) -> bool { - let ext = ext.trim_start_matches('.').to_lowercase(); - self.by_extension.contains_key(ext.as_str()) - } - - /// Checks if a MIME type is supported.
- #[must_use] - pub fn supports_mime(&self, mime: &str) -> bool { - let mime = mime.to_lowercase(); - self.by_mime.contains_key(mime.as_str()) - } - - /// Returns all supported file extensions. - #[must_use] - pub fn supported_extensions(&self) -> Vec<&'static str> { - self.by_extension.keys().copied().collect() - } - - /// Returns all supported MIME types. - #[must_use] - pub fn supported_mime_types(&self) -> Vec<&'static str> { - self.by_mime.keys().copied().collect() - } - - /// Returns all registered formats. - #[must_use] - pub fn formats(&self) -> Vec<&dyn AnyFormat> { - self.formats.iter().map(|e| e.format.as_ref()).collect() - } - - /// Loads a document by file extension. - /// - /// # Errors - /// - /// Returns an error if: - /// - The extension is not supported - /// - The document fails to load - pub async fn load_by_extension(&self, ext: &str, data: ContentData) -> Result<BoxDocument> { - let ext_lower = ext.trim_start_matches('.').to_lowercase(); - - let format = self - .by_extension - .get(ext_lower.as_str()) - .and_then(|&idx| self.formats.get(idx)) - .ok_or_else(|| Error::new(format!("Unsupported extension: {}", ext)))?; - - format.format.load_boxed(data).await - } - - /// Loads a document by MIME type. - /// - /// # Errors - /// - /// Returns an error if: - /// - The MIME type is not supported - /// - The document fails to load - pub async fn load_by_mime(&self, mime: &str, data: ContentData) -> Result<BoxDocument> { - let mime_lower = mime.to_lowercase(); - - let format = self - .by_mime - .get(mime_lower.as_str()) - .and_then(|&idx| self.formats.get(idx)) - .ok_or_else(|| Error::new(format!("Unsupported MIME type: {}", mime)))?; - - format.format.load_boxed(data).await - } - - /// Loads a document from a file path. - /// - /// The format is determined by the file extension.
- /// - /// # Errors - /// - /// Returns an error if: - /// - The file cannot be read - /// - The file has no extension - /// - The extension is not supported - /// - The document fails to load - pub async fn load_file<P: AsRef<std::path::Path>>(&self, path: P) -> Result<BoxDocument> { - let path = path.as_ref(); - - let ext = path - .extension() - .and_then(|e| e.to_str()) - .ok_or_else(|| Error::new("File has no extension"))?; - - let data = std::fs::read(path).map_err(|e| { - Error::from_source(format!("Failed to read file '{}'", path.display()), e) - })?; - - self.load_by_extension(ext, ContentData::from(data)).await - } -} - -impl std::fmt::Debug for FormatRegistry { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("FormatRegistry") - .field("formats", &self.formats.len()) - .field("extensions", &self.by_extension.keys().collect::<Vec<_>>()) - .field("mime_types", &self.by_mime.keys().collect::<Vec<_>>()) - .finish() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_registry_creation() { - let registry = FormatRegistry::new(); - assert!(registry.formats().is_empty()); - } - - #[test] - fn test_registry_with_defaults() { - let registry = FormatRegistry::with_defaults(); - assert!(!registry.formats().is_empty()); - - #[cfg(feature = "text")] - { - assert!(registry.supports_extension("txt")); - assert!(registry.supports_extension("json")); - assert!(registry.supports_extension("md")); - } - } - - #[test] - fn test_get_by_extension() { - let registry = FormatRegistry::with_defaults(); - - #[cfg(feature = "text")] - { - let format = registry.get_by_extension("json").unwrap(); - assert_eq!(format.name(), "json"); - - let format = registry.get_by_extension(".JSON").unwrap(); - assert_eq!(format.name(), "json"); - } - - assert!(registry.get_by_extension("xyz").is_none()); - } - - #[cfg(feature = "text")] - #[tokio::test] - async fn test_load_by_extension() { - let registry = FormatRegistry::with_defaults(); - - let doc = registry - .load_by_extension("json", ContentData::from(r#"{"key": "value"}"#)) - .await - .unwrap(); - - assert!(!doc.regions().is_empty()); - } - - #[cfg(feature = "text")] - #[tokio::test] - async fn test_load_by_mime() { - let registry = FormatRegistry::with_defaults(); - - let doc = registry - .load_by_mime("application/json", ContentData::from(r#"{"key": "value"}"#)) - .await - .unwrap(); - - assert!(!doc.regions().is_empty()); - } - - #[test] - fn test_unsupported_extension() { - let registry = FormatRegistry::with_defaults(); - assert!(!registry.supports_extension("xyz")); - } -} +pub use format_ref::FormatRef; +pub use format_registry::FormatRegistry; +pub use loaded_document::LoadedDocument; diff --git a/crates/nvisy-engine/src/session/mod.rs b/crates/nvisy-engine/src/session/mod.rs index 2599aec..072831c 100644 --- a/crates/nvisy-engine/src/session/mod.rs +++ b/crates/nvisy-engine/src/session/mod.rs @@ -13,7 +13,7 @@ use std::num::NonZeroU32; use bytes::Bytes; pub use history::{AccessEntry, AccessHistory}; use jiff::Timestamp; -use nvisy_document::{Capabilities, Document, PageOptions, Region, RegionId, Result}; +use nvisy_rt_document::{Capabilities, Document, PageOptions, Region, RegionId, Result}; use uuid::Uuid; /// Unique identifier for a read session.
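The registry hunks above replace the old borrowed `&dyn AnyFormat` handles and public `BoxDocument` results with the `FormatRef`/`LoadedDocument` pair. A minimal sketch of the new call path, using only the names visible in this patch (`with_defaults`, `get_by_extension`, `load_by_extension`, and `Deref` on `LoadedDocument`); the binary harness, and the assumption that the `text` feature plus tokio's `rt`/`macros` features are enabled, are illustrative rather than part of the patch:

```rust
use nvisy_rt_document::ContentData;
use nvisy_rt_engine::FormatRegistry;

#[tokio::main]
async fn main() {
    // Registry pre-populated from the formats enabled by crate features.
    let registry = FormatRegistry::with_defaults();

    // `FormatRef` is a cheap Copy handle: inspect metadata without loading.
    if let Some(format) = registry.get_by_extension("json") {
        println!("handler: {}", format.name());
    }

    // Loading returns a `LoadedDocument`, which derefs to
    // `dyn Document + Send + Sync`, so trait methods apply directly.
    let doc = registry
        .load_by_extension("json", ContentData::from(r#"{"key": "value"}"#))
        .await
        .expect("json handler is registered under the `text` feature");
    assert!(!doc.regions().is_empty());
}
```

Compared with handing out `&dyn AnyFormat`, the wrapper keeps `AnyFormat`, `FormatWrapper`, and `BoxDocument` at `pub(crate)` visibility, so the type-erasure machinery can change without breaking downstream crates.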
diff --git a/crates/nvisy-image/Cargo.toml b/crates/nvisy-image/Cargo.toml index d84ecb1..46f9ce6 100644 --- a/crates/nvisy-image/Cargo.toml +++ b/crates/nvisy-image/Cargo.toml @@ -1,7 +1,7 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-image" +name = "nvisy-rt-image" description = "Image format support for nvisy" readme = "./README.md" @@ -21,7 +21,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [dependencies] -nvisy-document = { workspace = true } +nvisy-rt-document = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } diff --git a/crates/nvisy-image/src/documents/jpeg.rs b/crates/nvisy-image/src/documents/jpeg.rs index df75644..da91868 100644 --- a/crates/nvisy-image/src/documents/jpeg.rs +++ b/crates/nvisy-image/src/documents/jpeg.rs @@ -2,7 +2,7 @@ use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; +use nvisy_rt_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; /// A loaded JPEG document. #[derive(Debug)] diff --git a/crates/nvisy-image/src/documents/png.rs b/crates/nvisy-image/src/documents/png.rs index b8ca50e..8159482 100644 --- a/crates/nvisy-image/src/documents/png.rs +++ b/crates/nvisy-image/src/documents/png.rs @@ -2,7 +2,7 @@ use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; +use nvisy_rt_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; /// A loaded PNG document. #[derive(Debug)] diff --git a/crates/nvisy-image/src/formats/jpeg.rs b/crates/nvisy-image/src/formats/jpeg.rs index 8a0cb81..788dd2c 100644 --- a/crates/nvisy-image/src/formats/jpeg.rs +++ b/crates/nvisy-image/src/formats/jpeg.rs @@ -1,6 +1,6 @@ //! JPEG format handler implementation. -use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; +use nvisy_rt_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; use crate::documents::JpegDocument; diff --git a/crates/nvisy-image/src/formats/png.rs b/crates/nvisy-image/src/formats/png.rs index 93572fe..4c1c02a 100644 --- a/crates/nvisy-image/src/formats/png.rs +++ b/crates/nvisy-image/src/formats/png.rs @@ -1,6 +1,6 @@ //! PNG format handler implementation. 
-use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; +use nvisy_rt_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; use crate::documents::PngDocument; diff --git a/crates/nvisy-pdf/Cargo.toml b/crates/nvisy-pdf/Cargo.toml index c3ac1a0..fac0f72 100644 --- a/crates/nvisy-pdf/Cargo.toml +++ b/crates/nvisy-pdf/Cargo.toml @@ -1,7 +1,7 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-pdf" +name = "nvisy-rt-pdf" description = "PDF document format support for nvisy" readme = "./README.md" @@ -21,7 +21,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [dependencies] -nvisy-document = { workspace = true } +nvisy-rt-document = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } diff --git a/crates/nvisy-pdf/src/document.rs b/crates/nvisy-pdf/src/document.rs index d74514f..f27c265 100644 --- a/crates/nvisy-pdf/src/document.rs +++ b/crates/nvisy-pdf/src/document.rs @@ -2,7 +2,7 @@ use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; +use nvisy_rt_document::{Document, DocumentInfo, Error, Region, RegionId, Result}; /// A loaded PDF document. #[derive(Debug)] diff --git a/crates/nvisy-pdf/src/format.rs b/crates/nvisy-pdf/src/format.rs index 7f3904e..89d2c3c 100644 --- a/crates/nvisy-pdf/src/format.rs +++ b/crates/nvisy-pdf/src/format.rs @@ -1,6 +1,6 @@ //! PDF format handler implementation. -use nvisy_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; +use nvisy_rt_document::{Capabilities, ContentData, DocumentFormat, Error, Result}; use crate::PdfDocument; diff --git a/crates/nvisy-text/Cargo.toml b/crates/nvisy-text/Cargo.toml index 80ab4ff..1737744 100644 --- a/crates/nvisy-text/Cargo.toml +++ b/crates/nvisy-text/Cargo.toml @@ -1,7 +1,7 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-text" +name = "nvisy-rt-text" description = "Plain text document format support for nvisy" readme = "./README.md" @@ -21,7 +21,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [dependencies] -nvisy-document = { workspace = true } +nvisy-rt-document = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } diff --git a/crates/nvisy-text/README.md b/crates/nvisy-text/README.md index 7c14a4d..fb89aad 100644 --- a/crates/nvisy-text/README.md +++ b/crates/nvisy-text/README.md @@ -17,8 +17,8 @@ various text-based file formats: ## Usage ```rust -use nvisy_text::{PlainTextFormat, PlainTextDocument}; -use nvisy_document::{ContentData, DocumentFormat, Document, TextExtractor}; +use nvisy_rt_text::{PlainTextFormat, PlainTextDocument}; +use nvisy_rt_document::{ContentData, DocumentFormat, Document, TextExtractor}; # tokio_test::block_on(async { let format = PlainTextFormat::new(); @@ -39,7 +39,7 @@ assert_eq!(text.word_count(), 6); Basic plain text with paragraph detection. ```rust -use nvisy_text::PlainTextFormat; +use nvisy_rt_text::PlainTextFormat; ``` ### Markdown @@ -47,7 +47,7 @@ use nvisy_text::PlainTextFormat; Full Markdown parsing using pulldown-cmark with support for headings, lists, code blocks, blockquotes, and more. ```rust -use nvisy_text::MarkdownFormat; +use nvisy_rt_text::MarkdownFormat; ``` ### JSON @@ -55,7 +55,7 @@ use nvisy_text::MarkdownFormat; JSON parsing with structure detection using serde_json. 
```rust -use nvisy_text::JsonFormat; +use nvisy_rt_text::JsonFormat; ``` ### CSV/TSV @@ -63,8 +63,8 @@ use nvisy_text::JsonFormat; CSV and TSV parsing using the csv crate. Implements `TableExtractor` for structured table access. ```rust -use nvisy_text::CsvFormat; -use nvisy_document::TableExtractor; +use nvisy_rt_text::CsvFormat; +use nvisy_rt_document::TableExtractor; ``` ### XML @@ -72,7 +72,7 @@ use nvisy_document::TableExtractor; XML parsing with hierarchical structure detection. ```rust -use nvisy_text::XmlFormat; +use nvisy_rt_text::XmlFormat; ``` ### YAML @@ -80,7 +80,7 @@ use nvisy_text::XmlFormat; YAML parsing with list and key-value detection. ```rust -use nvisy_text::YamlFormat; +use nvisy_rt_text::YamlFormat; ``` ### TOML @@ -88,7 +88,7 @@ use nvisy_text::YamlFormat; TOML parsing with section and array table detection. ```rust -use nvisy_text::TomlFormat; +use nvisy_rt_text::TomlFormat; ``` ### INI @@ -96,7 +96,7 @@ use nvisy_text::TomlFormat; INI/config file parsing with section grouping. ```rust -use nvisy_text::IniFormat; +use nvisy_rt_text::IniFormat; ``` ## License diff --git a/crates/nvisy-text/src/documents/csv.rs b/crates/nvisy-text/src/documents/csv.rs index d766a71..b44d6f2 100644 --- a/crates/nvisy-text/src/documents/csv.rs +++ b/crates/nvisy-text/src/documents/csv.rs @@ -5,7 +5,7 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; use csv::{ReaderBuilder, Terminator}; -use nvisy_document::{ +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, NormalizedCell, NormalizedRow, NormalizedTable, Region, RegionId, RegionKind, RegionSource, Result, TableExtractor, TextExtractor, diff --git a/crates/nvisy-text/src/documents/ini.rs b/crates/nvisy-text/src/documents/ini.rs index 91c1b3d..55e5d53 100644 --- a/crates/nvisy-text/src/documents/ini.rs +++ b/crates/nvisy-text/src/documents/ini.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, Result, TextExtractor, }; diff --git a/crates/nvisy-text/src/documents/json.rs b/crates/nvisy-text/src/documents/json.rs index e82360a..ff51c7f 100644 --- a/crates/nvisy-text/src/documents/json.rs +++ b/crates/nvisy-text/src/documents/json.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, Result, TextExtractor, }; @@ -23,7 +23,7 @@ impl JsonDocument { /// Creates a new JSON document from content. 
pub fn new(content: String) -> Result<Self> { let parsed: Value = serde_json::from_str(&content) - .map_err(|e| nvisy_document::Error::new(format!("Invalid JSON: {e}")))?; + .map_err(|e| nvisy_rt_document::Error::new(format!("Invalid JSON: {e}")))?; let regions = Self::extract_regions(&parsed); let size = content.len() as u64; diff --git a/crates/nvisy-text/src/documents/markdown.rs b/crates/nvisy-text/src/documents/markdown.rs index c30720f..4487cba 100644 --- a/crates/nvisy-text/src/documents/markdown.rs +++ b/crates/nvisy-text/src/documents/markdown.rs @@ -4,8 +4,9 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; -use markdown::{ParseOptions, mdast::Node, to_mdast}; -use nvisy_document::{ +use markdown::mdast::Node; +use markdown::{ParseOptions, to_mdast}; +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, Result, TextExtractor, }; diff --git a/crates/nvisy-text/src/documents/plain.rs b/crates/nvisy-text/src/documents/plain.rs index e11caa1..75361e7 100644 --- a/crates/nvisy-text/src/documents/plain.rs +++ b/crates/nvisy-text/src/documents/plain.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, Result, TextExtractor, }; diff --git a/crates/nvisy-text/src/documents/toml.rs b/crates/nvisy-text/src/documents/toml.rs index f5ae371..d485a06 100644 --- a/crates/nvisy-text/src/documents/toml.rs +++ b/crates/nvisy-text/src/documents/toml.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, Result, TextExtractor, }; diff --git a/crates/nvisy-text/src/documents/xml.rs b/crates/nvisy-text/src/documents/xml.rs index 87b2448..c3a775b 100644 --- a/crates/nvisy-text/src/documents/xml.rs +++ b/crates/nvisy-text/src/documents/xml.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, Result, TextExtractor, }; diff --git a/crates/nvisy-text/src/documents/yaml.rs b/crates/nvisy-text/src/documents/yaml.rs index 7557513..e109fb4 100644 --- a/crates/nvisy-text/src/documents/yaml.rs +++ b/crates/nvisy-text/src/documents/yaml.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use async_trait::async_trait; use bytes::Bytes; -use nvisy_document::{ +use nvisy_rt_document::{ BoundingBox, Document, DocumentInfo, ExtractedText, Region, RegionId, RegionKind, RegionSource, Result, TextExtractor, }; diff --git a/crates/nvisy-text/src/formats/csv.rs b/crates/nvisy-text/src/formats/csv.rs index 66b77b2..b140964 100644 --- a/crates/nvisy-text/src/formats/csv.rs +++ b/crates/nvisy-text/src/formats/csv.rs @@ -1,6 +1,6 @@ //! CSV format handler.
-use nvisy_document::{ +use nvisy_rt_document::{ Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities, }; @@ -74,7 +74,7 @@ impl DocumentFormat for CsvFormat { #[cfg(test)] mod tests { - use nvisy_document::Document; + use nvisy_rt_document::Document; use super::*; diff --git a/crates/nvisy-text/src/formats/ini.rs b/crates/nvisy-text/src/formats/ini.rs index b953cd8..d355411 100644 --- a/crates/nvisy-text/src/formats/ini.rs +++ b/crates/nvisy-text/src/formats/ini.rs @@ -1,6 +1,6 @@ //! INI format handler. -use nvisy_document::{ +use nvisy_rt_document::{ Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities, }; @@ -67,7 +67,7 @@ impl DocumentFormat for IniFormat { #[cfg(test)] mod tests { - use nvisy_document::Document; + use nvisy_rt_document::Document; use super::*; diff --git a/crates/nvisy-text/src/formats/json.rs b/crates/nvisy-text/src/formats/json.rs index c0e7be8..a16675f 100644 --- a/crates/nvisy-text/src/formats/json.rs +++ b/crates/nvisy-text/src/formats/json.rs @@ -1,6 +1,6 @@ //! JSON format handler. -use nvisy_document::{ +use nvisy_rt_document::{ Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities, }; @@ -67,7 +67,7 @@ impl DocumentFormat for JsonFormat { #[cfg(test)] mod tests { - use nvisy_document::Document; + use nvisy_rt_document::Document; use super::*; diff --git a/crates/nvisy-text/src/formats/markdown.rs b/crates/nvisy-text/src/formats/markdown.rs index 4e10f33..bfd280d 100644 --- a/crates/nvisy-text/src/formats/markdown.rs +++ b/crates/nvisy-text/src/formats/markdown.rs @@ -1,6 +1,6 @@ //! Markdown format handler. -use nvisy_document::{ +use nvisy_rt_document::{ Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities, }; @@ -67,7 +67,7 @@ impl DocumentFormat for MarkdownFormat { #[cfg(test)] mod tests { - use nvisy_document::Document; + use nvisy_rt_document::Document; use super::*; diff --git a/crates/nvisy-text/src/formats/plain.rs b/crates/nvisy-text/src/formats/plain.rs index ee57eb1..f53e5bf 100644 --- a/crates/nvisy-text/src/formats/plain.rs +++ b/crates/nvisy-text/src/formats/plain.rs @@ -1,6 +1,6 @@ //! Plain text format handler. -use nvisy_document::{Capabilities, ContentData, DocumentFormat, Result}; +use nvisy_rt_document::{Capabilities, ContentData, DocumentFormat, Result}; use crate::documents::PlainTextDocument; diff --git a/crates/nvisy-text/src/formats/toml.rs b/crates/nvisy-text/src/formats/toml.rs index 6929395..d101938 100644 --- a/crates/nvisy-text/src/formats/toml.rs +++ b/crates/nvisy-text/src/formats/toml.rs @@ -1,6 +1,6 @@ //! TOML format handler. -use nvisy_document::{ +use nvisy_rt_document::{ Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities, }; @@ -67,7 +67,7 @@ impl DocumentFormat for TomlFormat { #[cfg(test)] mod tests { - use nvisy_document::Document; + use nvisy_rt_document::Document; use super::*; diff --git a/crates/nvisy-text/src/formats/xml.rs b/crates/nvisy-text/src/formats/xml.rs index c92cd48..d3869db 100644 --- a/crates/nvisy-text/src/formats/xml.rs +++ b/crates/nvisy-text/src/formats/xml.rs @@ -1,6 +1,6 @@ //! XML format handler. 
-use nvisy_document::{ +use nvisy_rt_document::{ Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities, }; @@ -67,7 +67,7 @@ impl DocumentFormat for XmlFormat { #[cfg(test)] mod tests { - use nvisy_document::Document; + use nvisy_rt_document::Document; use super::*; diff --git a/crates/nvisy-text/src/formats/yaml.rs b/crates/nvisy-text/src/formats/yaml.rs index 4db8660..b055eb7 100644 --- a/crates/nvisy-text/src/formats/yaml.rs +++ b/crates/nvisy-text/src/formats/yaml.rs @@ -1,6 +1,6 @@ //! YAML format handler. -use nvisy_document::{ +use nvisy_rt_document::{ Capabilities, ContentData, DocumentFormat, Result, StructureCapabilities, TextCapabilities, }; @@ -67,7 +67,7 @@ impl DocumentFormat for YamlFormat { #[cfg(test)] mod tests { - use nvisy_document::Document; + use nvisy_rt_document::Document; use super::*; diff --git a/crates/nvisy-text/src/lib.rs b/crates/nvisy-text/src/lib.rs index a54d6d0..0302f17 100644 --- a/crates/nvisy-text/src/lib.rs +++ b/crates/nvisy-text/src/lib.rs @@ -6,22 +6,19 @@ pub mod documents; pub mod formats; // Re-export document types +// Legacy aliases for backwards compatibility +pub use PlainTextDocument as TextDocument; +pub use PlainTextFormat as TextFormat; pub use documents::{ CsvDocument, IniDocument, JsonDocument, MarkdownDocument, PlainTextDocument, TomlDocument, XmlDocument, YamlDocument, }; - // Re-export format handlers pub use formats::{ CsvFormat, IniFormat, JsonFormat, MarkdownFormat, PlainTextFormat, TomlFormat, XmlFormat, YamlFormat, }; - -// Legacy aliases for backwards compatibility -pub use PlainTextDocument as TextDocument; -pub use PlainTextFormat as TextFormat; - // Re-export commonly used types from nvisy-document -pub use nvisy_document::{ +pub use nvisy_rt_document::{ Document, DocumentFormat, ExtractedText, NormalizedTable, Region, TableExtractor, TextExtractor, }; From b953007bf1f251a70f023737e05b58606b002e7e Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sat, 17 Jan 2026 08:27:54 +0100 Subject: [PATCH 5/5] style: fix formatting --- crates/nvisy-text/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/nvisy-text/src/lib.rs b/crates/nvisy-text/src/lib.rs index 0302f17..53b6b71 100644 --- a/crates/nvisy-text/src/lib.rs +++ b/crates/nvisy-text/src/lib.rs @@ -7,8 +7,6 @@ pub mod formats; // Re-export document types // Legacy aliases for backwards compatibility -pub use PlainTextDocument as TextDocument; -pub use PlainTextFormat as TextFormat; pub use documents::{ CsvDocument, IniDocument, JsonDocument, MarkdownDocument, PlainTextDocument, TomlDocument, XmlDocument, YamlDocument, @@ -22,3 +20,4 @@ pub use formats::{ pub use nvisy_rt_document::{ Document, DocumentFormat, ExtractedText, NormalizedTable, Region, TableExtractor, TextExtractor, }; +pub use {PlainTextDocument as TextDocument, PlainTextFormat as TextFormat};
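The first patch in this series introduced the `diff` module to `nvisy-document`; after the crate rename its types ship from `nvisy-rt-document`. A standalone sketch of that surface, using only the constructors and accessors defined in the hunks above (the region texts are made-up sample data, not values from the patch):

```rust
use nvisy_rt_document::{Change, Diff, RegionChange, RegionId};

fn main() {
    // Scalar changes carry old/new values plus a ChangeKind tag.
    let renamed: Change<String> = Change::modified("draft.txt".into(), "final.txt".into());
    assert!(renamed.kind.is_modified());

    // Region changes pair RegionIds with the affected text.
    let changes = vec![
        RegionChange::added(RegionId::new(), "inserted paragraph"),
        RegionChange::moved(RegionId::new(), RegionId::new(), "relocated heading"),
    ];

    // `with_changes` derives `is_identical` from the change list.
    let diff = Diff::with_changes(changes);
    assert!(!diff.is_identical);
    assert_eq!(diff.change_count(), 2);

    // `Diff` derefs to `Vec<RegionChange>`, so it iterates directly.
    for change in diff.iter() {
        println!("{}: {:?} -> {:?}", change.kind, change.old_id, change.new_id);
    }
}
```

A format crate implementing the `Differ` trait would produce exactly this shape from its `fn diff(&self, other: &Self) -> Diff`.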