Skip to content
Merged
1 change: 0 additions & 1 deletion .config/mise/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,4 @@ _.python.venv = { path = ".venv", create = false }

[tasks.test]
description = "🐍 Run tests"
depends = ["start_db"]
run = "pytest -s"
125 changes: 119 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# magicparse 🛸

Declarative parser
Declarative parser for structured data files.

## Installation

```bash
poetry add magicparse
```

## Requirements

- Python 3.12+

## Usage

Expand Down Expand Up @@ -96,7 +106,7 @@ schema = {
}


rows, errors= magicparse.parse(data="...", schema=schema)
rows = magicparse.parse(data="...", schema_options=schema)
```


Expand Down Expand Up @@ -124,9 +134,8 @@ schema = {
],
}

rows, errors = magicparse.parse("13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2", schema)
rows = magicparse.parse("13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2", schema)
assert rows == [{"shop-guid": "13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2"}]
assert not errors
```

### Register a custom schema and parse content
Expand All @@ -152,11 +161,51 @@ schema = {
]
}

rows, errors = magicparse.parse("Joe|William|Jack|Averell", schema)
assert not errors
rows = magicparse.parse("Joe|William|Jack|Averell", schema)
assert rows == [{"name": "Joe"}, {"name": "William"}, {"name": "Jack"}, {"name": "Averell"}]
```

### Stream parsing

For large files, you can use streaming to process data incrementally:

```python
import magicparse

schema = {
"file_type": "csv",
"fields": [
{"key": "name", "type": "str", "column-number": 1}
]
}

# Process rows one at a time as they are parsed
for row in magicparse.stream_parse(data="...", schema_options=schema):
match row:
case magicparse.RowParsed(values):
print(f"The values {values}.")
case magicparse.RowFailed(errors):
print(f"The errors {errors}.")
case magicparse.RowSkipped(reason):
print(f"The reason {reason}.")
case _:
print("Unknown type of row.")
```

### Custom encoding

By default, magicparse uses UTF-8 encoding. You can specify a different encoding:

```python
schema = {
"file_type": "csv",
"encoding": "iso8859_5", # or any other encoding
"fields": [
{"key": "name", "type": "str", "column-number": 1}
]
}
```

## API

### File types
Expand Down Expand Up @@ -187,6 +236,7 @@ assert rows == [{"name": "Joe"}, {"name": "William"}, {"name": "Jack"}, {"name":

- regex-matches
- greater-than
- not-null-or-empty

#### Post-processors

Expand All @@ -202,3 +252,66 @@ Types, Pre-processors, Post-processors and validator is same as Field
- concat
- divide
- multiply
- coalesce

## Return Types

The parser returns a list of row objects:

- **`RowParsed`**: Successfully parsed row with `values` dict
- **`RowFailed`**: Failed to parse row with `errors` message
- **`RowSkipped`**: Skipped row with `errors` message

## Error Handling

You can configure error handling for types, validators, and processors:

```python
{
"key": "price",
"type": {
"key": "decimal",
"nullable": True, # Allow null values
"on-error": "skip-row" # Skip on error instead of raising
}
}
```

Error handling options:
- `"raise"` (default): Raise exception on error
- `"skip-row"`: Skip the row and continue processing

## Docker

The project includes Docker support:

```bash
# Build and run with docker-compose
docker-compose up --build

# Or build manually
docker build -t magicparse .
docker run -it magicparse
```

## Development

### Setup

```bash
# Install dependencies
poetry install

# Run tests
poetry run pytest

# Format code
poetry run black .

# Lint code
poetry run flake8
```

## License

This project is licensed under the MIT License.
18 changes: 13 additions & 5 deletions magicparse/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from io import BytesIO

from .schema import ParsedRow, Schema, builtins as builtins_schemas
from .schema import (
RowParsed,
RowFailed,
RowSkipped,
Schema,
builtins as builtins_schemas,
)
from .post_processors import PostProcessor, builtins as builtins_post_processors
from .pre_processors import PreProcessor, builtins as builtins_pre_processors
from .builders import (
Expand All @@ -9,7 +15,7 @@
)
from .transform import Transform
from .type_converters import TypeConverter, builtins as builtins_type_converters
from typing import Any, Dict, Iterable, List, Tuple, Union
from typing import Any, Dict, Iterable, List, Union
from .validators import Validator, builtins as builtins_validators


Expand All @@ -20,21 +26,23 @@
"PostProcessor",
"PreProcessor",
"Schema",
"ParsedRow",
"RowParsed",
"RowSkipped",
"RowFailed",
"Validator",
]


def parse(
data: Union[bytes, BytesIO], schema_options: Dict[str, Any]
) -> Tuple[List[dict], List[dict]]:
) -> List[RowParsed | RowSkipped | RowFailed]:
schema_definition = Schema.build(schema_options)
return schema_definition.parse(data)


def stream_parse(
data: Union[bytes, BytesIO], schema_options: Dict[str, Any]
) -> Iterable[ParsedRow]:
) -> Iterable[RowParsed | RowSkipped | RowFailed]:
schema_definition = Schema.build(schema_options)
return schema_definition.stream_parse(data)

Expand Down
43 changes: 36 additions & 7 deletions magicparse/builders.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from abc import ABC
from decimal import Decimal

from .transform import Transform
from .transform import Transform, OnError


class Builder(Transform, ABC):
Expand All @@ -17,14 +17,16 @@ def build(cls, options: dict) -> "Builder":
except:
raise ValueError(f"invalid builder '{name}'")

on_error = options.get("on-error", OnError.RAISE)
if "parameters" in options:
return builder(**options["parameters"])
return builder(on_error=on_error, **options["parameters"])
else:
return builder()
return builder(on_error=on_error)


class Concat(Builder):
def __init__(self, fields: list[str]) -> None:
def __init__(self, on_error: OnError, fields: list[str]) -> None:
super().__init__(on_error)
if (
not fields
or isinstance(fields, str)
Expand All @@ -48,7 +50,8 @@ def key() -> str:


class Divide(Builder):
def __init__(self, numerator: str, denominator: str) -> None:
def __init__(self, on_error: OnError, numerator: str, denominator: str) -> None:
super().__init__(on_error)
if not numerator or not isinstance(numerator, str):
raise ValueError(
"builder 'divide': " "'numerator' parameter must be a non null str"
Expand All @@ -69,7 +72,8 @@ def key() -> str:


class Multiply(Builder):
def __init__(self, x_factor: str, y_factor: str) -> None:
def __init__(self, on_error: OnError, x_factor: str, y_factor: str) -> None:
super().__init__(on_error)
if not x_factor or not isinstance(x_factor, str):
raise ValueError(
"builder 'multiply': " "'x_factor' parameter must be a non null str"
Expand All @@ -89,4 +93,29 @@ def key() -> str:
return "multiply"


builtins = [Concat, Divide, Multiply]
class Coalesce(Builder):
    """Builder that returns the first truthy value among several row fields.

    The candidate fields are checked in the order they were configured; the
    first one whose value in the row is truthy wins.
    """

    def __init__(self, on_error: OnError, fields: list[str]) -> None:
        """Validate and store the list of field names to coalesce.

        Args:
            on_error: Error-handling mode, forwarded to the base initializer.
            fields: Names of the row keys to inspect, in priority order.

        Raises:
            ValueError: If ``fields`` is empty or missing, is not a list of
                strings, or contains fewer than two entries.
        """
        super().__init__(on_error)
        if not fields:
            raise ValueError("parameters should defined fields to coalesce")
        if (
            not isinstance(fields, list)
            or not all(isinstance(field, str) for field in fields)
            or len(fields) < 2
        ):
            raise ValueError("parameters should have two fields at least")

        self.fields = fields

    def apply(self, row: dict) -> str | None:
        """Return the first truthy value found in ``row`` for the fields.

        Falsy values (``None``, empty string, ``0``...) are skipped. A field
        name absent from ``row`` raises ``KeyError``, as with any dict lookup.

        Returns:
            The first truthy value, or ``None`` when every field is falsy.
        """
        for field in self.fields:
            value = row[field]
            if value:
                return value
        return None

    @staticmethod
    def key() -> str:
        """Return the identifier used to select this builder in a schema."""
        return "coalesce"


builtins = [Concat, Divide, Multiply, Coalesce]
25 changes: 15 additions & 10 deletions magicparse/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .post_processors import PostProcessor
from .pre_processors import PreProcessor
from .validators import Validator
from .transform import Ok, OnError, Result, SkipRow


class Field(ABC):
Expand All @@ -26,24 +27,28 @@ def __init__(self, key: str, options: dict) -> None:
pre_processors + [type_converter] + validators + post_processors
)

def _process_raw_value(self, raw_value: str):
value = raw_value
def _process_raw_value(self, raw_value: str) -> Result:
if not raw_value:
if self.optional:
return None
return Ok(value=None)
else:
raise ValueError(
f"{self.key} field is required but the value was empty"
)
for transform in self.transforms:
value = transform.apply(value)
return value
try:
raw_value = transform.apply(raw_value)
except Exception as exc:
if transform.on_error == OnError.SKIP_ROW.value:
return SkipRow(exception=exc)
raise
return Ok(value=raw_value)

@abstractmethod
def _read_raw_value(self, row) -> str:
def _read_raw_value(self, row: List[str] | dict) -> str:
pass

def read_value(self, row):
def parse(self, row: List[str] | dict) -> Result:
raw_value = self._read_raw_value(row)
return self._process_raw_value(raw_value)

Expand Down Expand Up @@ -75,7 +80,7 @@ def __init__(self, key: str, options: dict) -> None:
super().__init__(key, options)
self.column_number = options["column-number"]

def _read_raw_value(self, row: List[str]) -> str:
def _read_raw_value(self, row: List[str] | dict) -> str:
return row[self.column_number - 1]

def error(self, exception: Exception) -> dict:
Expand All @@ -93,7 +98,7 @@ def __init__(self, key: str, options: dict) -> None:
self.column_length = options["column-length"]
self.column_end = self.column_start + self.column_length

def _read_raw_value(self, row: str) -> str:
def _read_raw_value(self, row: str | dict) -> str:
return row[self.column_start : self.column_end]

def error(self, exception: Exception) -> dict:
Expand All @@ -110,7 +115,7 @@ def __init__(self, key: str, options: dict) -> None:
super().__init__(key, options)
self.builder = Builder.build(options["builder"])

def _read_raw_value(self, row) -> str:
def _read_raw_value(self, row: List[str] | dict) -> str:
return self.builder.apply(row)

def error(self, exception: Exception) -> dict:
Expand Down
Loading