From d3747c995af737e0109a6a06241a7f65b03cdcea Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Mon, 15 Dec 2025 17:06:27 -0600 Subject: [PATCH 1/8] Add a non-ASCII guard with documentation and tests --- README.md | 8 ++ pre_commit_hooks/non_ascii_guard.py | 133 ++++++++++++++++++++ setup.cfg | 1 + testing/resources/non_ascii_sample.txt | 3 + tests/non_ascii_guard_test.py | 166 +++++++++++++++++++++++++ 5 files changed, 311 insertions(+) create mode 100644 pre_commit_hooks/non_ascii_guard.py create mode 100644 testing/resources/non_ascii_sample.txt create mode 100644 tests/non_ascii_guard_test.py diff --git a/README.md b/README.md index 5e23484f..e33247b9 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,14 @@ The following arguments are available: #### `detect-private-key` Checks for the existence of private keys. +#### `detect-non-ascii-characters` +Detects and strips non-printable, non-ASCII bytes (supply-chain safety guard). + - Default allowed range: printable ASCII (`0x20-0x7E`) plus `\n`, `\r`, and `\t`. + - `--include-range RANGE` - override allowed byte ranges (comma-separated, decimal or hex, supports `START-END`). Can be repeated. + - `--allow-chars TEXT` - permit additional characters (adds their UTF-8 bytes to the allowed set). Can be repeated. + - `--files-glob GLOB` - optional fnmatch-style glob to further restrict the provided file list (by default, the hook processes all files handed to it by pre-commit). + - `--check-only` - report disallowed bytes without modifying files. + #### `double-quote-string-fixer` This hook replaces double quoted strings with single quoted strings. diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py new file mode 100644 index 00000000..805c7f39 --- /dev/null +++ b/pre_commit_hooks/non_ascii_guard.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import argparse +import fnmatch +from collections.abc import Sequence + +DEFAULT_INCLUDE_RANGE = '0x09,0x0A,0x0D,0x20-0x7E' + + +def _parse_byte(token: str, parser: argparse.ArgumentParser) -> int: + base = 16 if token.lower().startswith('0x') else 10 + try: + value = int(token, base) + except ValueError: + parser.error(f'invalid byte value {token!r}') + if not 0 <= value <= 0xFF: + parser.error(f'byte value out of range: {token!r}') + return value + + +def _parse_range_spec(spec: str, parser: argparse.ArgumentParser) -> set[int]: + allowed: set[int] = set() + for raw_part in spec.split(','): + part = raw_part.strip() + if not part: + continue + if '-' in part: + start_s, end_s = part.split('-', 1) + start = _parse_byte(start_s, parser) + end = _parse_byte(end_s, parser) + if start > end: + parser.error(f'invalid range {part!r}: start > end') + allowed.update(range(start, end + 1)) + else: + allowed.add(_parse_byte(part, parser)) + return allowed + + +def _build_allowed(args: argparse.Namespace, parser: argparse.ArgumentParser) -> set[int]: + include_specs = args.include_range or [DEFAULT_INCLUDE_RANGE] + allowed: set[int] = set() + for spec in include_specs: + allowed.update(_parse_range_spec(spec, parser)) + for extra in args.allow_chars: + allowed.update(extra.encode()) + return allowed + + +def _filter_filenames(filenames: list[str], globs: list[str]) -> list[str]: + if not globs: + return filenames + return [f for f in filenames if any(fnmatch.fnmatch(f, g) for g in globs)] + + +def _format_offenders(offenders: list[tuple[int, int]]) -> str: + def _label(pos: int, b: int) -> str: + if 0x20 <= b <= 0x7E: + ch = chr(b) + # use repr to surface 
escapes for backslash/quote while staying ASCII + ch_repr = repr(ch)[1:-1] + return f"0x{b:02x}('{ch_repr}')@{pos}" + return f'0x{b:02x}@{pos}' + + preview = ', '.join(_label(i, b) for i, b in offenders[:5]) + if len(offenders) > 5: + preview += ', ...' + return preview + + +def main(argv: Sequence[str] | None = None) -> int: + parser = argparse.ArgumentParser() + parser.add_argument( + '--files-glob', + action='append', + default=[], + metavar='GLOB', + help='Optional fnmatch-style glob to further filter listed files.', + ) + parser.add_argument( + '--include-range', + action='append', + metavar='RANGE', + help=( + 'Comma-separated byte ranges to allow. ' + 'Supports decimal or 0x-prefixed hex values and START-END spans. ' + f'Default: {DEFAULT_INCLUDE_RANGE}' + ), + ) + parser.add_argument( + '--allow-chars', + action='append', + default=[], + metavar='CHARS', + help='Additional characters to permit (UTF-8 bytes of the given text).', + ) + parser.add_argument( + '--check-only', + action='store_true', + help='Detect disallowed bytes but do not modify files.', + ) + parser.add_argument('filenames', nargs='+', help='Files to check') + args = parser.parse_args(argv) + + allowed = _build_allowed(args, parser) + filenames = _filter_filenames(args.filenames, args.files_glob) + + retv = 0 + for filename in filenames: + with open(filename, 'rb') as f: + data = f.read() + + offenders = [(i, b) for i, b in enumerate(data) if b not in allowed] + if not offenders: + continue + + if args.check_only: + print(f'{filename}: disallowed bytes {_format_offenders(offenders)}') + retv = 1 + continue + + new_data = bytes(b for b in data if b in allowed) + if new_data != data: + with open(filename, 'wb') as f: + f.write(new_data) + print( + f'Fixing {filename}: ' f'disallowed bytes {_format_offenders(offenders)}', + ) + retv = 1 + return retv + + +if __name__ == '__main__': + raise SystemExit(main()) \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 65a6b895..9510f8cb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -46,6 +46,7 @@ console_scripts = destroyed-symlinks = pre_commit_hooks.destroyed_symlinks:main detect-aws-credentials = pre_commit_hooks.detect_aws_credentials:main detect-private-key = pre_commit_hooks.detect_private_key:main + detect-non-ascii-characters = pre_commit_hooks.non_ascii_guard:main double-quote-string-fixer = pre_commit_hooks.string_fixer:main end-of-file-fixer = pre_commit_hooks.end_of_file_fixer:main file-contents-sorter = pre_commit_hooks.file_contents_sorter:main diff --git a/testing/resources/non_ascii_sample.txt b/testing/resources/non_ascii_sample.txt new file mode 100644 index 00000000..6c72838e --- /dev/null +++ b/testing/resources/non_ascii_sample.txt @@ -0,0 +1,3 @@ +ASCII ok +Has ctrl: +Unicode: café diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py new file mode 100644 index 00000000..4dcd4ea2 --- /dev/null +++ b/tests/non_ascii_guard_test.py @@ -0,0 +1,166 @@ +from __future__ import annotations +import shutil +from pre_commit_hooks.non_ascii_guard import main +from testing.util import get_resource_path + +def test_no_changes_returns_zero(tmp_path) -> None: + path = tmp_path / 'ok.txt' + path.write_bytes(b'hello\n') + + ret = main([str(path)]) + + assert ret == 0 + assert path.read_bytes() == b'hello\n' + + +def test_strips_disallowed_bytes(tmp_path, capsys) -> None: + path = tmp_path / 'bad.txt' + path.write_bytes(b'abc\x80def\n') + + ret = main([str(path)]) + + assert ret == 1 + assert path.read_bytes() == b'abcdef\n' + out = 
capsys.readouterr().out + assert f'Fixing {path}: disallowed bytes 0x80@3' in out + + +def test_check_only_reports_and_keeps(tmp_path, capsys) -> None: + path = tmp_path / 'bad.txt' + original = b'abc\x00def\n' + path.write_bytes(original) + + ret = main(['--check-only', str(path)]) + + assert ret == 1 + assert path.read_bytes() == original + out = capsys.readouterr().out + assert 'disallowed bytes 0x00@3' in out + + +def test_include_range_allows_bytes(tmp_path) -> None: + path = tmp_path / 'binary.bin' + path.write_bytes(bytes(range(256))) + + ret = main(['--include-range', '0-255', str(path)]) + + assert ret == 0 + assert path.read_bytes() == bytes(range(256)) + + +def test_allow_chars_adds_utf8_bytes(tmp_path) -> None: + path = tmp_path / 'text.txt' + content = 'café\n'.encode('utf-8') + path.write_bytes(content) + + ret = main(['--allow-chars', 'é', str(path)]) + + assert ret == 0 + assert path.read_bytes() == content + + +def test_reports_positions_for_multibyte_chars(tmp_path, capsys) -> None: + path = tmp_path / 'multi.txt' + path.write_bytes( + ( + 'a' # allowed + 'éΩ€𝜋💵🪱' # multibyte non-ASCII + '\u200b\u202e\u2066' # zero-width, bidi controls + '\x01' # control byte + ).encode('utf-8'), + ) + + ret = main(['--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes 0xc3@1, 0xa9@2, 0xce@3, 0xa9@4, 0xe2@5' in out + + +def test_printable_offender_shows_char(tmp_path, capsys) -> None: + path = tmp_path / 'ascii.txt' + path.write_text('A\n') + + ret = main(['--include-range', '0x00-0x1F', '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert f'{path}: disallowed bytes 0x41(\'A\')@0' in out + + +def test_default_allows_whitespace_and_printable(tmp_path) -> None: + path = tmp_path / 'mix.txt' + path.write_bytes(b'abc\t\n\r\x01def') + + ret = main([str(path)]) + + assert ret == 1 + assert path.read_bytes() == b'abc\t\n\rdef' + + +def test_files_glob_filters_targets(tmp_path) -> None: + kept = tmp_path / 'skip.bin' + target = tmp_path / 'take.txt' + kept.write_bytes(b'abc\x01def\n') + target.write_bytes(b'xyz\x01uvw\n') + + ret = main(['--files-glob', '*.txt', str(kept), str(target)]) + + assert ret == 1 + assert kept.read_bytes() == b'abc\x01def\n' + assert target.read_bytes() == b'xyzuvw\n' + + +def test_fixture_file_is_cleaned(tmp_path, capsys) -> None: + fixture = get_resource_path('non_ascii_sample.txt') + path = tmp_path / 'copy.txt' + shutil.copy(fixture, path) + + ret = main([str(path)]) + + assert ret == 1 + assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: caf\n' + out = capsys.readouterr().out + assert f'Fixing {path}: disallowed bytes ' in out + +def test_combined_parameters(tmp_path, capsys): + f1 = tmp_path / 'latin.txt' + f2 = tmp_path / 'emoji.txt' + f3 = tmp_path / 'bidi.txt' + f4 = tmp_path / 'mix.txt' + f1.write_text('café\n') + f2.write_text('smile 🪱\n') + f3.write_text('abc\u202e\n') + f4.write_bytes(b'abc\x01\x80\n') + + ret = main([ + '--files-glob', '*.txt', + '--allow-chars', 'é', + '--include-range', '0x0A,0x20-0x7E', + '--check-only', + str(f1), str(f2), str(f3), str(f4) + ]) + out = capsys.readouterr().out + assert f1.name not in out + assert f2.name in out and 'disallowed bytes' in out + assert f3.name in out and 'disallowed bytes' in out + assert f4.name in out and 'disallowed bytes' in out + assert ret == 1 + + f2.write_text('smile 🪱\n') + f3.write_text('abc\u202e\n') + f4.write_bytes(b'abc\x01\x80\n') + ret2 = main([ + '--files-glob', '*.txt', + '--allow-chars', 'é', 
+ '--include-range', '0x0A,0x20-0x7E', + str(f1), str(f2), str(f3), str(f4) + ]) + out2 = capsys.readouterr().out + assert f1.read_text() == 'café\n' + assert '🪱' not in f2.read_text() + assert '\u202e' not in f3.read_text() and '\u202e'.encode('utf-8') not in f3.read_bytes() + assert f4.read_bytes() == b'abc\n' + # All files except f1 should be mentioned in output + assert f2.name in out2 and f3.name in out2 and f4.name in out2 + assert ret2 == 1 \ No newline at end of file From f2d5d494fc8bad4d3fc41b634a638d591b5f67af Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Mon, 15 Dec 2025 17:15:15 -0600 Subject: [PATCH 2/8] Fix newline at end of files --- pre_commit_hooks/non_ascii_guard.py | 2 +- tests/non_ascii_guard_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 805c7f39..8afee1f2 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -130,4 +130,4 @@ def main(argv: Sequence[str] | None = None) -> int: if __name__ == '__main__': - raise SystemExit(main()) \ No newline at end of file + raise SystemExit(main()) diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 4dcd4ea2..1225fac4 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -163,4 +163,4 @@ def test_combined_parameters(tmp_path, capsys): assert f4.read_bytes() == b'abc\n' # All files except f1 should be mentioned in output assert f2.name in out2 and f3.name in out2 and f4.name in out2 - assert ret2 == 1 \ No newline at end of file + assert ret2 == 1 From 59e022fae46cc44ceb3e13f67280a6dd7a9e220c Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Thu, 18 Dec 2025 14:38:54 -0600 Subject: [PATCH 3/8] use Windows-safe byte literals in non-ASCII guard test --- tests/non_ascii_guard_test.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 1225fac4..4c2a31bd 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -128,9 +128,10 @@ def test_combined_parameters(tmp_path, capsys): f2 = tmp_path / 'emoji.txt' f3 = tmp_path / 'bidi.txt' f4 = tmp_path / 'mix.txt' - f1.write_text('café\n') - f2.write_text('smile 🪱\n') - f3.write_text('abc\u202e\n') + # use explicit UTF-8 to support Windows locales without UTF-8 defaults + f1.write_bytes(b'caf\xc3\xa9\n') + f2.write_bytes(b'smile \xc2\xa3\n') + f3.write_bytes(b'abc\xe2\x80\xae\n') f4.write_bytes(b'abc\x01\x80\n') ret = main([ @@ -147,8 +148,8 @@ def test_combined_parameters(tmp_path, capsys): assert f4.name in out and 'disallowed bytes' in out assert ret == 1 - f2.write_text('smile 🪱\n') - f3.write_text('abc\u202e\n') + f2.write_bytes(b'smile \xc2\xa3\n') + f3.write_bytes(b'abc\xe2\x80\xae\n') f4.write_bytes(b'abc\x01\x80\n') ret2 = main([ '--files-glob', '*.txt', @@ -157,9 +158,9 @@ def test_combined_parameters(tmp_path, capsys): str(f1), str(f2), str(f3), str(f4) ]) out2 = capsys.readouterr().out - assert f1.read_text() == 'café\n' - assert '🪱' not in f2.read_text() - assert '\u202e' not in f3.read_text() and '\u202e'.encode('utf-8') not in f3.read_bytes() + assert f1.read_bytes().decode('utf-8') == 'café\n' + assert '£' not in f2.read_bytes().decode('utf-8') + assert '\u202e' not in f3.read_bytes().decode('utf-8') and b'\xe2\x80\xae' not in f3.read_bytes() assert f4.read_bytes() == b'abc\n' # All files except f1 should be mentioned in output assert f2.name 
in out2 and f3.name in out2 and f4.name in out2 From 4c1ecebbc3fe2198d34d2236b7ccccdb34f08554 Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Thu, 18 Dec 2025 14:58:35 -0600 Subject: [PATCH 4/8] Refactor main function to always write new data and increase coverage to 100% --- pre_commit_hooks/non_ascii_guard.py | 13 +++++----- tests/non_ascii_guard_test.py | 38 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 8afee1f2..69aba4c9 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -119,13 +119,12 @@ def main(argv: Sequence[str] | None = None) -> int: continue new_data = bytes(b for b in data if b in allowed) - if new_data != data: - with open(filename, 'wb') as f: - f.write(new_data) - print( - f'Fixing {filename}: ' f'disallowed bytes {_format_offenders(offenders)}', - ) - retv = 1 + with open(filename, 'wb') as f: + f.write(new_data) + print( + f'Fixing {filename}: ' f'disallowed bytes {_format_offenders(offenders)}', + ) + retv = 1 return retv diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 4c2a31bd..46d75495 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -1,5 +1,9 @@ from __future__ import annotations + import shutil + +import pytest + from pre_commit_hooks.non_ascii_guard import main from testing.util import get_resource_path @@ -165,3 +169,37 @@ def test_combined_parameters(tmp_path, capsys): # All files except f1 should be mentioned in output assert f2.name in out2 and f3.name in out2 and f4.name in out2 assert ret2 == 1 + + +def test_include_range_ignores_empty_parts(tmp_path): + path = tmp_path / 'bytes.bin' + path.write_bytes(b'\x01\x02') + + ret = main(['--include-range', '1,,2', str(path)]) + + assert ret == 0 + assert path.read_bytes() == b'\x01\x02' + + +def test_invalid_include_range_token_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '0xZZ', str(path)]) + + +def test_out_of_range_byte_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '0x1FF', str(path)]) + + +def test_descending_range_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '10-5', str(path)]) From 0988891e529ff2ec598e4a4f36d27232384c91a4 Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 13:39:57 -0600 Subject: [PATCH 5/8] Enhance non-ASCII guard with new modes and file filtering options; update tests accordingly --- README.md | 11 +- pre_commit_hooks/non_ascii_guard.py | 158 ++++++++++++++++++++++++++-- setup.cfg | 6 +- tests/non_ascii_guard_test.py | 89 ++++++++++++++-- 4 files changed, 243 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index e33247b9..862c39e3 100644 --- a/README.md +++ b/README.md @@ -113,10 +113,19 @@ Checks for the existence of private keys. #### `detect-non-ascii-characters` Detects and strips non-printable, non-ASCII bytes (supply-chain safety guard). - - Default allowed range: printable ASCII (`0x20-0x7E`) plus `\n`, `\r`, and `\t`. + - Modes (choose via `--mode`): + - `balanced` (default): allow ASCII + Latin-1 accents/symbols; block controls/null, bidi overrides (U+202A–U+202E, U+2066–U+2069), and zero-width characters (U+200B–U+200D). 
+ - `visible-plus`: allow ASCII + emoji (U+1F600–U+1F64F and modifiers/VS16), still blocking zero-width joiners, bidi, and controls. + - `ascii-only`: allow only tab/lf/cr and `0x20-0x7E`; block everything else. + - Examples: `--mode balanced` (default); `--mode visible-plus` (allow 😀, 🚀, etc. but still block zero-width joiners); `--mode ascii-only` (paranoid mode, blocks all non-ASCII). - `--include-range RANGE` - override allowed byte ranges (comma-separated, decimal or hex, supports `START-END`). Can be repeated. + - Examples: `--include-range 0x09,0x0A,0x0D,0x20-0x7E` (default printable ASCII); `--include-range 0-255` (allow all bytes); `--include-range 0x20-0x7E,0xA0` (allow NBSP too). - `--allow-chars TEXT` - permit additional characters (adds their UTF-8 bytes to the allowed set). Can be repeated. + - Examples: `--allow-chars "é"` (allow a single accent); `--allow-chars "😀"` (allow an emoji); `--allow-chars "👨‍👩‍👧‍👦"` (allow a grapheme cluster with ZWJ). - `--files-glob GLOB` - optional fnmatch-style glob to further restrict the provided file list (by default, the hook processes all files handed to it by pre-commit). + - Example: `--files-glob "*.py"` (only consider .py files from the passed list). + - `--files-include GLOB` / `--files-exclude GLOB` - additional fnmatch-style filters applied after `--files-glob`. + - Examples: `--files-include "*.md"` (only Markdown); `--files-exclude "vendor/*"` (skip vendored files). - `--check-only` - report disallowed bytes without modifying files. #### `double-quote-string-fixer` diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 69aba4c9..258946a9 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -4,8 +4,25 @@ import fnmatch from collections.abc import Sequence +import grapheme + +MODE_BALANCED = 'balanced' +MODE_VISIBLE_PLUS = 'visible-plus' +MODE_ASCII_ONLY = 'ascii-only' +MODE_CHOICES = (MODE_BALANCED, MODE_VISIBLE_PLUS, MODE_ASCII_ONLY) + DEFAULT_INCLUDE_RANGE = '0x09,0x0A,0x0D,0x20-0x7E' +ASCII_BASE = {0x09, 0x0A, 0x0D} | set(range(0x20, 0x7F)) +LATIN1_VISIBLE = set(range(0xA0, 0x100)) +CONTROL_C0 = set(range(0x00, 0x20)) +CONTROL_C1 = set(range(0x80, 0xA0)) +BIDI_OVERRIDES = set(range(0x202A, 0x202F)) | set(range(0x2066, 0x206A)) +ZERO_WIDTHS = {0x200B, 0x200C, 0x200D} +EMOJI_BASE = set(range(0x1F600, 0x1F650)) +EMOJI_MODIFIERS = set(range(0x1F3FB, 0x1F400)) +VARIATION_SELECTORS = {0xFE0F} + def _parse_byte(token: str, parser: argparse.ArgumentParser) -> int: base = 16 if token.lower().startswith('0x') else 10 @@ -36,20 +53,96 @@ def _parse_range_spec(spec: str, parser: argparse.ArgumentParser) -> set[int]: return allowed -def _build_allowed(args: argparse.Namespace, parser: argparse.ArgumentParser) -> set[int]: +def _build_allowed( + args: argparse.Namespace, parser: argparse.ArgumentParser, +) -> tuple[set[int], bool]: include_specs = args.include_range or [DEFAULT_INCLUDE_RANGE] + restrict_to_includes = bool(args.include_range) allowed: set[int] = set() for spec in include_specs: allowed.update(_parse_range_spec(spec, parser)) for extra in args.allow_chars: allowed.update(extra.encode()) - return allowed + return allowed, restrict_to_includes + + +def _match_any(path: str, patterns: list[str]) -> bool: + return any(fnmatch.fnmatch(path, pattern) for pattern in patterns) + + +def _filter_filenames( + filenames: list[str], + globs: list[str], + include: list[str], + exclude: list[str], +) -> list[str]: + selected = filenames + if globs: + selected = [f 
for f in selected if _match_any(f, globs)] + if include: + selected = [f for f in selected if _match_any(f, include)] + if exclude: + selected = [f for f in selected if not _match_any(f, exclude)] + return selected + + +def _is_control_or_null(cp: int) -> bool: + return cp in CONTROL_C0 or cp in CONTROL_C1 or cp == 0x7F + + +def _cluster_allowed_balanced(cluster_cps: list[int]) -> bool: + if any(cp in BIDI_OVERRIDES for cp in cluster_cps): + return False + if any(cp in ZERO_WIDTHS for cp in cluster_cps): + return False + if any(_is_control_or_null(cp) and cp not in {0x09, 0x0A, 0x0D} for cp in cluster_cps): + return False + return all(cp in ASCII_BASE or cp in LATIN1_VISIBLE for cp in cluster_cps) + + +def _cluster_allowed_visible_plus(cluster_cps: list[int]) -> bool: + if any(cp in ZERO_WIDTHS for cp in cluster_cps): + return False + if any(cp in BIDI_OVERRIDES for cp in cluster_cps): + return False + if any(_is_control_or_null(cp) and cp not in {0x09, 0x0A, 0x0D} for cp in cluster_cps): + return False + + if all(cp in ASCII_BASE for cp in cluster_cps): + return True + + emoji_ok = all( + cp in EMOJI_BASE + or cp in EMOJI_MODIFIERS + or cp in VARIATION_SELECTORS + for cp in cluster_cps + ) + return emoji_ok + + +def _cluster_allowed_ascii_only(cluster_cps: list[int]) -> bool: + return all(cp in ASCII_BASE for cp in cluster_cps) + +def _cluster_allowed( + cluster_bytes: bytes, + cluster_text: str, + allowed_bytes: set[int], + mode: str, + restrict_to_allowed_bytes: bool, +) -> bool: + if cluster_bytes and all(b in allowed_bytes for b in cluster_bytes): + return True -def _filter_filenames(filenames: list[str], globs: list[str]) -> list[str]: - if not globs: - return filenames - return [f for f in filenames if any(fnmatch.fnmatch(f, g) for g in globs)] + if restrict_to_allowed_bytes: + return False + + cps = [ord(ch) for ch in cluster_text] + if mode == MODE_BALANCED: + return _cluster_allowed_balanced(cps) + if mode == MODE_VISIBLE_PLUS: + return _cluster_allowed_visible_plus(cps) + return _cluster_allowed_ascii_only(cps) def _format_offenders(offenders: list[tuple[int, int]]) -> str: @@ -76,6 +169,20 @@ def main(argv: Sequence[str] | None = None) -> int: metavar='GLOB', help='Optional fnmatch-style glob to further filter listed files.', ) + parser.add_argument( + '--files-include', + action='append', + default=[], + metavar='GLOB', + help='Additional fnmatch-style patterns to include (after files-glob).', + ) + parser.add_argument( + '--files-exclude', + action='append', + default=[], + metavar='GLOB', + help='Fnmatch-style patterns to exclude (applied last).', + ) parser.add_argument( '--include-range', action='append', @@ -93,6 +200,15 @@ def main(argv: Sequence[str] | None = None) -> int: metavar='CHARS', help='Additional characters to permit (UTF-8 bytes of the given text).', ) + parser.add_argument( + '--mode', + choices=MODE_CHOICES, + default=MODE_BALANCED, + help=( + 'Character policy: balanced (default, allow ASCII + Latin-1, block bidi/zero-width/control),' # noqa: E501 + ' visible-plus (ASCII + emoji, block others), ascii-only (strict).' 
# noqa: E501 + ), + ) parser.add_argument( '--check-only', action='store_true', @@ -101,15 +217,37 @@ def main(argv: Sequence[str] | None = None) -> int: parser.add_argument('filenames', nargs='+', help='Files to check') args = parser.parse_args(argv) - allowed = _build_allowed(args, parser) - filenames = _filter_filenames(args.filenames, args.files_glob) + allowed, restrict_to_includes = _build_allowed(args, parser) + filenames = _filter_filenames( + args.filenames, args.files_glob, args.files_include, args.files_exclude, + ) retv = 0 for filename in filenames: with open(filename, 'rb') as f: data = f.read() - offenders = [(i, b) for i, b in enumerate(data) if b not in allowed] + try: + text = data.decode('utf-8', errors='surrogateescape') + except UnicodeDecodeError: + text = data.decode('utf-8', errors='ignore') + + offenders: list[tuple[int, int]] = [] + new_chunks: list[bytes] = [] + + byte_pos = 0 + for cluster in grapheme.graphemes(text): + cluster_bytes = cluster.encode('utf-8', errors='surrogateescape') + if _cluster_allowed( + cluster_bytes, cluster, allowed, args.mode, restrict_to_includes, + ): + new_chunks.append(cluster_bytes) + else: + offenders.extend( + (byte_pos + i, cluster_bytes[i]) for i in range(len(cluster_bytes)) + ) + byte_pos += len(cluster_bytes) + if not offenders: continue @@ -118,7 +256,7 @@ def main(argv: Sequence[str] | None = None) -> int: retv = 1 continue - new_data = bytes(b for b in data if b in allowed) + new_data = b''.join(new_chunks) with open(filename, 'wb') as f: f.write(new_data) print( diff --git a/setup.cfg b/setup.cfg index 9510f8cb..8f3d3ff4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,11 @@ classifiers = [options] packages = find: -python_requires = >=3.8 +install_requires = + ruamel.yaml>=0.15 + grapheme>=0.6.0 + tomli>=1.1.0;python_version<"3.11" +python_requires = >=3.9 [options.packages.find] exclude = diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 46d75495..1e345562 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -4,6 +4,9 @@ import pytest +from pre_commit_hooks.non_ascii_guard import MODE_ASCII_ONLY +from pre_commit_hooks.non_ascii_guard import MODE_BALANCED +from pre_commit_hooks.non_ascii_guard import MODE_VISIBLE_PLUS from pre_commit_hooks.non_ascii_guard import main from testing.util import get_resource_path @@ -54,10 +57,10 @@ def test_include_range_allows_bytes(tmp_path) -> None: def test_allow_chars_adds_utf8_bytes(tmp_path) -> None: path = tmp_path / 'text.txt' - content = 'café\n'.encode('utf-8') + content = 'Ωmega\n'.encode('utf-8') path.write_bytes(content) - ret = main(['--allow-chars', 'é', str(path)]) + ret = main(['--allow-chars', 'Ω', str(path)]) assert ret == 0 assert path.read_bytes() == content @@ -78,7 +81,8 @@ def test_reports_positions_for_multibyte_chars(tmp_path, capsys) -> None: assert ret == 1 out = capsys.readouterr().out - assert 'disallowed bytes 0xc3@1, 0xa9@2, 0xce@3, 0xa9@4, 0xe2@5' in out + # Balanced allows é, so first offenders start at Ω (0xce@3) + assert 'disallowed bytes 0xce@3, 0xa9@4, 0xe2@5' in out def test_printable_offender_shows_char(tmp_path, capsys) -> None: @@ -123,7 +127,7 @@ def test_fixture_file_is_cleaned(tmp_path, capsys) -> None: ret = main([str(path)]) assert ret == 1 - assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: caf\n' + assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: café\n' out = capsys.readouterr().out assert f'Fixing {path}: disallowed bytes ' in out @@ -139,6 +143,7 @@ def 
test_combined_parameters(tmp_path, capsys): f4.write_bytes(b'abc\x01\x80\n') ret = main([ + '--mode', MODE_BALANCED, '--files-glob', '*.txt', '--allow-chars', 'é', '--include-range', '0x0A,0x20-0x7E', @@ -146,16 +151,17 @@ def test_combined_parameters(tmp_path, capsys): str(f1), str(f2), str(f3), str(f4) ]) out = capsys.readouterr().out - assert f1.name not in out - assert f2.name in out and 'disallowed bytes' in out - assert f3.name in out and 'disallowed bytes' in out - assert f4.name in out and 'disallowed bytes' in out + assert f1.name not in out # café allowed in balanced + assert f2.name in out and 'disallowed bytes' in out # £ not allowed + assert f3.name in out and 'disallowed bytes' in out # bidi + assert f4.name in out and 'disallowed bytes' in out # controls assert ret == 1 f2.write_bytes(b'smile \xc2\xa3\n') f3.write_bytes(b'abc\xe2\x80\xae\n') f4.write_bytes(b'abc\x01\x80\n') ret2 = main([ + '--mode', MODE_BALANCED, '--files-glob', '*.txt', '--allow-chars', 'é', '--include-range', '0x0A,0x20-0x7E', @@ -166,11 +172,76 @@ def test_combined_parameters(tmp_path, capsys): assert '£' not in f2.read_bytes().decode('utf-8') assert '\u202e' not in f3.read_bytes().decode('utf-8') and b'\xe2\x80\xae' not in f3.read_bytes() assert f4.read_bytes() == b'abc\n' - # All files except f1 should be mentioned in output assert f2.name in out2 and f3.name in out2 and f4.name in out2 assert ret2 == 1 +def test_mode_visible_plus_allows_emoji_blocks_accents(tmp_path, capsys): + path = tmp_path / 'emoji.txt' + path.write_text('hi 😀 é') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert path.name in out + assert 'disallowed bytes' in out # accent is blocked + + +def test_mode_ascii_only_is_strict(tmp_path, capsys): + path = tmp_path / 'strict.txt' + path.write_text('hi café 😀') + + ret = main(['--mode', MODE_ASCII_ONLY, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_mode_balanced_allows_latin1_blocks_bidi(tmp_path, capsys): + path = tmp_path / 'latin1.txt' + path.write_bytes('café \u202e'.encode('utf-8')) + + ret = main(['--mode', MODE_BALANCED, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + assert 'latin1.txt' in out + + +def test_zwj_emoji_blocked_as_cluster(tmp_path, capsys): + path = tmp_path / 'family.txt' + path.write_text('family: 👨\u200d👩\u200d👧\u200d👦 end') + + ret = main(['--mode', MODE_VISIBLE_PLUS, str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'Fixing' in out + assert 'family.txt' in out + assert '\u200d' not in path.read_text() + + +def test_files_include_and_exclude(tmp_path): + keep = tmp_path / 'skip.md' + take = tmp_path / 'scan.py' + keep.write_text('ok café') + take.write_text('hi café') + + ret = main([ + '--mode', MODE_VISIBLE_PLUS, + '--files-include', '*.py', + '--files-exclude', '*.md', + str(keep), str(take), + ]) + + assert ret == 1 + assert 'café' not in take.read_text() + assert keep.read_text() == 'ok café' + + def test_include_range_ignores_empty_parts(tmp_path): path = tmp_path / 'bytes.bin' path.write_bytes(b'\x01\x02') From a84a04e6b788b442ef4efb773e441a4e1d9cc31b Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 15:00:20 -0600 Subject: [PATCH 6/8] set encoding for tests --- tests/non_ascii_guard_test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 1e345562..d449bbb4 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -127,7 +127,7 @@ def test_fixture_file_is_cleaned(tmp_path, capsys) -> None: ret = main([str(path)]) assert ret == 1 - assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: café\n' + assert path.read_text(encoding='utf-8') == 'ASCII ok\nHas ctrl:\nUnicode: café\n' out = capsys.readouterr().out assert f'Fixing {path}: disallowed bytes ' in out @@ -178,7 +178,7 @@ def test_combined_parameters(tmp_path, capsys): def test_mode_visible_plus_allows_emoji_blocks_accents(tmp_path, capsys): path = tmp_path / 'emoji.txt' - path.write_text('hi 😀 é') + path.write_text('hi 😀 é', encoding='utf-8') ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) @@ -190,7 +190,7 @@ def test_mode_visible_plus_allows_emoji_blocks_accents(tmp_path, capsys): def test_mode_ascii_only_is_strict(tmp_path, capsys): path = tmp_path / 'strict.txt' - path.write_text('hi café 😀') + path.write_text('hi café 😀', encoding='utf-8') ret = main(['--mode', MODE_ASCII_ONLY, '--check-only', str(path)]) @@ -213,7 +213,7 @@ def test_mode_balanced_allows_latin1_blocks_bidi(tmp_path, capsys): def test_zwj_emoji_blocked_as_cluster(tmp_path, capsys): path = tmp_path / 'family.txt' - path.write_text('family: 👨\u200d👩\u200d👧\u200d👦 end') + path.write_text('family: 👨\u200d👩\u200d👧\u200d👦 end', encoding='utf-8') ret = main(['--mode', MODE_VISIBLE_PLUS, str(path)]) @@ -221,14 +221,14 @@ def test_zwj_emoji_blocked_as_cluster(tmp_path, capsys): out = capsys.readouterr().out assert 'Fixing' in out assert 'family.txt' in out - assert '\u200d' not in path.read_text() + assert '\u200d' not in path.read_text(encoding='utf-8') def test_files_include_and_exclude(tmp_path): keep = tmp_path / 'skip.md' take = tmp_path / 'scan.py' - keep.write_text('ok café') - take.write_text('hi café') + keep.write_text('ok café', encoding='utf-8') + take.write_text('hi café', encoding='utf-8') ret = main([ '--mode', MODE_VISIBLE_PLUS, @@ -238,8 +238,8 @@ def test_files_include_and_exclude(tmp_path): ]) assert ret == 1 - assert 'café' not in take.read_text() - assert keep.read_text() == 'ok café' + assert 'café' not in take.read_text(encoding='utf-8') + assert keep.read_text(encoding='utf-8') == 'ok café' def test_include_range_ignores_empty_parts(tmp_path): From 94568472912439aeb68ea68700157c0850e8e909 Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 15:06:54 -0600 Subject: [PATCH 7/8] increase coverage --- pre_commit_hooks/non_ascii_guard.py | 5 +---- tests/non_ascii_guard_test.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 258946a9..284876f8 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -227,10 +227,7 @@ def main(argv: Sequence[str] | None = None) -> int: with open(filename, 'rb') as f: data = f.read() - try: - text = data.decode('utf-8', errors='surrogateescape') - except UnicodeDecodeError: - text = data.decode('utf-8', errors='ignore') + text = data.decode('utf-8', errors='surrogateescape') offenders: list[tuple[int, int]] = [] new_chunks: list[bytes] = [] diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index d449bbb4..0d25c9a5 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -242,6 +242,22 @@ def 
test_files_include_and_exclude(tmp_path): assert keep.read_text(encoding='utf-8') == 'ok café' +def test_include_range_restricts_even_if_mode_allows(tmp_path, capsys): + path = tmp_path / 'range.txt' + path.write_text('hello 😀', encoding='utf-8') + + ret = main([ + '--mode', MODE_VISIBLE_PLUS, + '--include-range', '0x20-0x7E', # ASCII only + str(path), + ]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + assert 'range.txt' in out + + def test_include_range_ignores_empty_parts(tmp_path): path = tmp_path / 'bytes.bin' path.write_bytes(b'\x01\x02') From 490d43f4d0e6e4d0b191917fe2b8e681ca26b5cf Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 15:41:39 -0600 Subject: [PATCH 8/8] add more tests for coverage --- tests/non_ascii_guard_test.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 0d25c9a5..1a0dadef 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -4,6 +4,7 @@ import pytest +from pre_commit_hooks.non_ascii_guard import _cluster_allowed_visible_plus from pre_commit_hooks.non_ascii_guard import MODE_ASCII_ONLY from pre_commit_hooks.non_ascii_guard import MODE_BALANCED from pre_commit_hooks.non_ascii_guard import MODE_VISIBLE_PLUS @@ -290,3 +291,36 @@ def test_descending_range_exits(tmp_path): with pytest.raises(SystemExit): main(['--include-range', '10-5', str(path)]) + + +def test_visible_plus_blocks_bidi_in_cluster(tmp_path, capsys): + """Test line 107: bidi override check in _cluster_allowed_visible_plus""" + path = tmp_path / 'bidi.txt' + # Emoji followed by bidi override U+202E (RIGHT-TO-LEFT OVERRIDE) + path.write_text('test😀\u202Eword', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_visible_plus_blocks_control_in_cluster(tmp_path, capsys): + """Test line 109: control character check in _cluster_allowed_visible_plus""" + path = tmp_path / 'ctrl.txt' + # Emoji followed by control char U+0001 (not tab/LF/CR) + path.write_text('test😀\x01word', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_visible_plus_allows_pure_ascii(): + """Test line 112: early return for ASCII-only clusters in visible-plus""" + # Direct unit test of _cluster_allowed_visible_plus with ASCII-only input + ascii_cps = [ord(c) for c in 'hello'] + assert _cluster_allowed_visible_plus(ascii_cps) is True
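
As a quick illustration of the finished behaviour (not part of the patches above), here is a minimal sketch that drives the hook's entry point directly. It assumes the patched package and its new grapheme dependency are installed; the temporary file and its contents are made up for the example.

import os
import tempfile

from pre_commit_hooks.non_ascii_guard import main

# Illustrative file containing a C0 control byte and a bidi override,
# both of which every mode rejects.
path = os.path.join(tempfile.mkdtemp(), 'sample.txt')
with open(path, 'wb') as f:
    f.write('abc\x01\u202edef\n'.encode('utf-8'))

# Report only: prints the offending byte offsets and leaves the file untouched.
assert main(['--check-only', path]) == 1

# Default (fixing) run under the balanced policy: strips the disallowed bytes in place.
assert main([path]) == 1
with open(path, 'rb') as f:
    assert f.read() == b'abcdef\n'

# A second pass finds nothing left to fix.
assert main([path]) == 0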