From d3747c995af737e0109a6a06241a7f65b03cdcea Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Mon, 15 Dec 2025 17:06:27 -0600 Subject: [PATCH 1/8] Add a non-ASCII guard with documentation and tests --- README.md | 8 ++ pre_commit_hooks/non_ascii_guard.py | 133 ++++++++++++++++++++ setup.cfg | 1 + testing/resources/non_ascii_sample.txt | 3 + tests/non_ascii_guard_test.py | 166 +++++++++++++++++++++++++ 5 files changed, 311 insertions(+) create mode 100644 pre_commit_hooks/non_ascii_guard.py create mode 100644 testing/resources/non_ascii_sample.txt create mode 100644 tests/non_ascii_guard_test.py diff --git a/README.md b/README.md index 5e23484f..e33247b9 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,14 @@ The following arguments are available: #### `detect-private-key` Checks for the existence of private keys. +#### `detect-non-ascii-characters` +Detects and strips non-printable, non-ASCII bytes (supply-chain safety guard). + - Default allowed range: printable ASCII (`0x20-0x7E`) plus `\n`, `\r`, and `\t`. + - `--include-range RANGE` - override allowed byte ranges (comma-separated, decimal or hex, supports `START-END`). Can be repeated. + - `--allow-chars TEXT` - permit additional characters (adds their UTF-8 bytes to the allowed set). Can be repeated. + - `--files-glob GLOB` - optional fnmatch-style glob to further restrict the provided file list (by default, the hook processes all files handed to it by pre-commit). + - `--check-only` - report disallowed bytes without modifying files. + #### `double-quote-string-fixer` This hook replaces double quoted strings with single quoted strings. diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py new file mode 100644 index 00000000..805c7f39 --- /dev/null +++ b/pre_commit_hooks/non_ascii_guard.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import argparse +import fnmatch +from collections.abc import Sequence + +DEFAULT_INCLUDE_RANGE = '0x09,0x0A,0x0D,0x20-0x7E' + + +def _parse_byte(token: str, parser: argparse.ArgumentParser) -> int: + base = 16 if token.lower().startswith('0x') else 10 + try: + value = int(token, base) + except ValueError: + parser.error(f'invalid byte value {token!r}') + if not 0 <= value <= 0xFF: + parser.error(f'byte value out of range: {token!r}') + return value + + +def _parse_range_spec(spec: str, parser: argparse.ArgumentParser) -> set[int]: + allowed: set[int] = set() + for raw_part in spec.split(','): + part = raw_part.strip() + if not part: + continue + if '-' in part: + start_s, end_s = part.split('-', 1) + start = _parse_byte(start_s, parser) + end = _parse_byte(end_s, parser) + if start > end: + parser.error(f'invalid range {part!r}: start > end') + allowed.update(range(start, end + 1)) + else: + allowed.add(_parse_byte(part, parser)) + return allowed + + +def _build_allowed(args: argparse.Namespace, parser: argparse.ArgumentParser) -> set[int]: + include_specs = args.include_range or [DEFAULT_INCLUDE_RANGE] + allowed: set[int] = set() + for spec in include_specs: + allowed.update(_parse_range_spec(spec, parser)) + for extra in args.allow_chars: + allowed.update(extra.encode()) + return allowed + + +def _filter_filenames(filenames: list[str], globs: list[str]) -> list[str]: + if not globs: + return filenames + return [f for f in filenames if any(fnmatch.fnmatch(f, g) for g in globs)] + + +def _format_offenders(offenders: list[tuple[int, int]]) -> str: + def _label(pos: int, b: int) -> str: + if 0x20 <= b <= 0x7E: + ch = chr(b) + # use repr to surface 
escapes for backslash/quote while staying ASCII + ch_repr = repr(ch)[1:-1] + return f"0x{b:02x}('{ch_repr}')@{pos}" + return f'0x{b:02x}@{pos}' + + preview = ', '.join(_label(i, b) for i, b in offenders[:5]) + if len(offenders) > 5: + preview += ', ...' + return preview + + +def main(argv: Sequence[str] | None = None) -> int: + parser = argparse.ArgumentParser() + parser.add_argument( + '--files-glob', + action='append', + default=[], + metavar='GLOB', + help='Optional fnmatch-style glob to further filter listed files.', + ) + parser.add_argument( + '--include-range', + action='append', + metavar='RANGE', + help=( + 'Comma-separated byte ranges to allow. ' + 'Supports decimal or 0x-prefixed hex values and START-END spans. ' + f'Default: {DEFAULT_INCLUDE_RANGE}' + ), + ) + parser.add_argument( + '--allow-chars', + action='append', + default=[], + metavar='CHARS', + help='Additional characters to permit (UTF-8 bytes of the given text).', + ) + parser.add_argument( + '--check-only', + action='store_true', + help='Detect disallowed bytes but do not modify files.', + ) + parser.add_argument('filenames', nargs='+', help='Files to check') + args = parser.parse_args(argv) + + allowed = _build_allowed(args, parser) + filenames = _filter_filenames(args.filenames, args.files_glob) + + retv = 0 + for filename in filenames: + with open(filename, 'rb') as f: + data = f.read() + + offenders = [(i, b) for i, b in enumerate(data) if b not in allowed] + if not offenders: + continue + + if args.check_only: + print(f'{filename}: disallowed bytes {_format_offenders(offenders)}') + retv = 1 + continue + + new_data = bytes(b for b in data if b in allowed) + if new_data != data: + with open(filename, 'wb') as f: + f.write(new_data) + print( + f'Fixing {filename}: ' f'disallowed bytes {_format_offenders(offenders)}', + ) + retv = 1 + return retv + + +if __name__ == '__main__': + raise SystemExit(main()) \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 65a6b895..9510f8cb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -46,6 +46,7 @@ console_scripts = destroyed-symlinks = pre_commit_hooks.destroyed_symlinks:main detect-aws-credentials = pre_commit_hooks.detect_aws_credentials:main detect-private-key = pre_commit_hooks.detect_private_key:main + detect-non-ascii-characters = pre_commit_hooks.non_ascii_guard:main double-quote-string-fixer = pre_commit_hooks.string_fixer:main end-of-file-fixer = pre_commit_hooks.end_of_file_fixer:main file-contents-sorter = pre_commit_hooks.file_contents_sorter:main diff --git a/testing/resources/non_ascii_sample.txt b/testing/resources/non_ascii_sample.txt new file mode 100644 index 00000000..6c72838e --- /dev/null +++ b/testing/resources/non_ascii_sample.txt @@ -0,0 +1,3 @@ +ASCII ok +Has ctrl: +Unicode: café diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py new file mode 100644 index 00000000..4dcd4ea2 --- /dev/null +++ b/tests/non_ascii_guard_test.py @@ -0,0 +1,166 @@ +from __future__ import annotations +import shutil +from pre_commit_hooks.non_ascii_guard import main +from testing.util import get_resource_path + +def test_no_changes_returns_zero(tmp_path) -> None: + path = tmp_path / 'ok.txt' + path.write_bytes(b'hello\n') + + ret = main([str(path)]) + + assert ret == 0 + assert path.read_bytes() == b'hello\n' + + +def test_strips_disallowed_bytes(tmp_path, capsys) -> None: + path = tmp_path / 'bad.txt' + path.write_bytes(b'abc\x80def\n') + + ret = main([str(path)]) + + assert ret == 1 + assert path.read_bytes() == b'abcdef\n' + out = 
capsys.readouterr().out + assert f'Fixing {path}: disallowed bytes 0x80@3' in out + + +def test_check_only_reports_and_keeps(tmp_path, capsys) -> None: + path = tmp_path / 'bad.txt' + original = b'abc\x00def\n' + path.write_bytes(original) + + ret = main(['--check-only', str(path)]) + + assert ret == 1 + assert path.read_bytes() == original + out = capsys.readouterr().out + assert 'disallowed bytes 0x00@3' in out + + +def test_include_range_allows_bytes(tmp_path) -> None: + path = tmp_path / 'binary.bin' + path.write_bytes(bytes(range(256))) + + ret = main(['--include-range', '0-255', str(path)]) + + assert ret == 0 + assert path.read_bytes() == bytes(range(256)) + + +def test_allow_chars_adds_utf8_bytes(tmp_path) -> None: + path = tmp_path / 'text.txt' + content = 'café\n'.encode('utf-8') + path.write_bytes(content) + + ret = main(['--allow-chars', 'é', str(path)]) + + assert ret == 0 + assert path.read_bytes() == content + + +def test_reports_positions_for_multibyte_chars(tmp_path, capsys) -> None: + path = tmp_path / 'multi.txt' + path.write_bytes( + ( + 'a' # allowed + 'éΩ€𝜋💵🪱' # multibyte non-ASCII + '\u200b\u202e\u2066' # zero-width, bidi controls + '\x01' # control byte + ).encode('utf-8'), + ) + + ret = main(['--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes 0xc3@1, 0xa9@2, 0xce@3, 0xa9@4, 0xe2@5' in out + + +def test_printable_offender_shows_char(tmp_path, capsys) -> None: + path = tmp_path / 'ascii.txt' + path.write_text('A\n') + + ret = main(['--include-range', '0x00-0x1F', '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert f'{path}: disallowed bytes 0x41(\'A\')@0' in out + + +def test_default_allows_whitespace_and_printable(tmp_path) -> None: + path = tmp_path / 'mix.txt' + path.write_bytes(b'abc\t\n\r\x01def') + + ret = main([str(path)]) + + assert ret == 1 + assert path.read_bytes() == b'abc\t\n\rdef' + + +def test_files_glob_filters_targets(tmp_path) -> None: + kept = tmp_path / 'skip.bin' + target = tmp_path / 'take.txt' + kept.write_bytes(b'abc\x01def\n') + target.write_bytes(b'xyz\x01uvw\n') + + ret = main(['--files-glob', '*.txt', str(kept), str(target)]) + + assert ret == 1 + assert kept.read_bytes() == b'abc\x01def\n' + assert target.read_bytes() == b'xyzuvw\n' + + +def test_fixture_file_is_cleaned(tmp_path, capsys) -> None: + fixture = get_resource_path('non_ascii_sample.txt') + path = tmp_path / 'copy.txt' + shutil.copy(fixture, path) + + ret = main([str(path)]) + + assert ret == 1 + assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: caf\n' + out = capsys.readouterr().out + assert f'Fixing {path}: disallowed bytes ' in out + +def test_combined_parameters(tmp_path, capsys): + f1 = tmp_path / 'latin.txt' + f2 = tmp_path / 'emoji.txt' + f3 = tmp_path / 'bidi.txt' + f4 = tmp_path / 'mix.txt' + f1.write_text('café\n') + f2.write_text('smile 🪱\n') + f3.write_text('abc\u202e\n') + f4.write_bytes(b'abc\x01\x80\n') + + ret = main([ + '--files-glob', '*.txt', + '--allow-chars', 'é', + '--include-range', '0x0A,0x20-0x7E', + '--check-only', + str(f1), str(f2), str(f3), str(f4) + ]) + out = capsys.readouterr().out + assert f1.name not in out + assert f2.name in out and 'disallowed bytes' in out + assert f3.name in out and 'disallowed bytes' in out + assert f4.name in out and 'disallowed bytes' in out + assert ret == 1 + + f2.write_text('smile 🪱\n') + f3.write_text('abc\u202e\n') + f4.write_bytes(b'abc\x01\x80\n') + ret2 = main([ + '--files-glob', '*.txt', + '--allow-chars', 'é', 
+ '--include-range', '0x0A,0x20-0x7E', + str(f1), str(f2), str(f3), str(f4) + ]) + out2 = capsys.readouterr().out + assert f1.read_text() == 'café\n' + assert '🪱' not in f2.read_text() + assert '\u202e' not in f3.read_text() and '\u202e'.encode('utf-8') not in f3.read_bytes() + assert f4.read_bytes() == b'abc\n' + # All files except f1 should be mentioned in output + assert f2.name in out2 and f3.name in out2 and f4.name in out2 + assert ret2 == 1 \ No newline at end of file From f2d5d494fc8bad4d3fc41b634a638d591b5f67af Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Mon, 15 Dec 2025 17:15:15 -0600 Subject: [PATCH 2/8] Fix newline at end of files --- pre_commit_hooks/non_ascii_guard.py | 2 +- tests/non_ascii_guard_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 805c7f39..8afee1f2 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -130,4 +130,4 @@ def main(argv: Sequence[str] | None = None) -> int: if __name__ == '__main__': - raise SystemExit(main()) \ No newline at end of file + raise SystemExit(main()) diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 4dcd4ea2..1225fac4 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -163,4 +163,4 @@ def test_combined_parameters(tmp_path, capsys): assert f4.read_bytes() == b'abc\n' # All files except f1 should be mentioned in output assert f2.name in out2 and f3.name in out2 and f4.name in out2 - assert ret2 == 1 \ No newline at end of file + assert ret2 == 1 From 59e022fae46cc44ceb3e13f67280a6dd7a9e220c Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Thu, 18 Dec 2025 14:38:54 -0600 Subject: [PATCH 3/8] use Windows-safe byte literals in non-ASCII guard test --- tests/non_ascii_guard_test.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 1225fac4..4c2a31bd 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -128,9 +128,10 @@ def test_combined_parameters(tmp_path, capsys): f2 = tmp_path / 'emoji.txt' f3 = tmp_path / 'bidi.txt' f4 = tmp_path / 'mix.txt' - f1.write_text('café\n') - f2.write_text('smile 🪱\n') - f3.write_text('abc\u202e\n') + # use explicit UTF-8 to support Windows locales without UTF-8 defaults + f1.write_bytes(b'caf\xc3\xa9\n') + f2.write_bytes(b'smile \xc2\xa3\n') + f3.write_bytes(b'abc\xe2\x80\xae\n') f4.write_bytes(b'abc\x01\x80\n') ret = main([ @@ -147,8 +148,8 @@ def test_combined_parameters(tmp_path, capsys): assert f4.name in out and 'disallowed bytes' in out assert ret == 1 - f2.write_text('smile 🪱\n') - f3.write_text('abc\u202e\n') + f2.write_bytes(b'smile \xc2\xa3\n') + f3.write_bytes(b'abc\xe2\x80\xae\n') f4.write_bytes(b'abc\x01\x80\n') ret2 = main([ '--files-glob', '*.txt', @@ -157,9 +158,9 @@ def test_combined_parameters(tmp_path, capsys): str(f1), str(f2), str(f3), str(f4) ]) out2 = capsys.readouterr().out - assert f1.read_text() == 'café\n' - assert '🪱' not in f2.read_text() - assert '\u202e' not in f3.read_text() and '\u202e'.encode('utf-8') not in f3.read_bytes() + assert f1.read_bytes().decode('utf-8') == 'café\n' + assert '£' not in f2.read_bytes().decode('utf-8') + assert '\u202e' not in f3.read_bytes().decode('utf-8') and b'\xe2\x80\xae' not in f3.read_bytes() assert f4.read_bytes() == b'abc\n' # All files except f1 should be mentioned in output assert f2.name 
in out2 and f3.name in out2 and f4.name in out2 From 4c1ecebbc3fe2198d34d2236b7ccccdb34f08554 Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Thu, 18 Dec 2025 14:58:35 -0600 Subject: [PATCH 4/8] Refactor main function to always write new data and increase coverage to 100% --- pre_commit_hooks/non_ascii_guard.py | 13 +++++----- tests/non_ascii_guard_test.py | 38 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 8afee1f2..69aba4c9 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -119,13 +119,12 @@ def main(argv: Sequence[str] | None = None) -> int: continue new_data = bytes(b for b in data if b in allowed) - if new_data != data: - with open(filename, 'wb') as f: - f.write(new_data) - print( - f'Fixing {filename}: ' f'disallowed bytes {_format_offenders(offenders)}', - ) - retv = 1 + with open(filename, 'wb') as f: + f.write(new_data) + print( + f'Fixing {filename}: ' f'disallowed bytes {_format_offenders(offenders)}', + ) + retv = 1 return retv diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 4c2a31bd..46d75495 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -1,5 +1,9 @@ from __future__ import annotations + import shutil + +import pytest + from pre_commit_hooks.non_ascii_guard import main from testing.util import get_resource_path @@ -165,3 +169,37 @@ def test_combined_parameters(tmp_path, capsys): # All files except f1 should be mentioned in output assert f2.name in out2 and f3.name in out2 and f4.name in out2 assert ret2 == 1 + + +def test_include_range_ignores_empty_parts(tmp_path): + path = tmp_path / 'bytes.bin' + path.write_bytes(b'\x01\x02') + + ret = main(['--include-range', '1,,2', str(path)]) + + assert ret == 0 + assert path.read_bytes() == b'\x01\x02' + + +def test_invalid_include_range_token_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '0xZZ', str(path)]) + + +def test_out_of_range_byte_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '0x1FF', str(path)]) + + +def test_descending_range_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '10-5', str(path)]) From 0988891e529ff2ec598e4a4f36d27232384c91a4 Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 13:39:57 -0600 Subject: [PATCH 5/8] Enhance non-ASCII guard with new modes and file filtering options; update tests accordingly --- README.md | 11 +- pre_commit_hooks/non_ascii_guard.py | 158 ++++++++++++++++++++++++++-- setup.cfg | 6 +- tests/non_ascii_guard_test.py | 89 ++++++++++++++-- 4 files changed, 243 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index e33247b9..862c39e3 100644 --- a/README.md +++ b/README.md @@ -113,10 +113,19 @@ Checks for the existence of private keys. #### `detect-non-ascii-characters` Detects and strips non-printable, non-ASCII bytes (supply-chain safety guard). - - Default allowed range: printable ASCII (`0x20-0x7E`) plus `\n`, `\r`, and `\t`. + - Modes (choose via `--mode`): + - `balanced` (default): allow ASCII + Latin-1 accents/symbols; block controls/null, bidi overrides (U+202A–U+202E, U+2066–U+2069), and zero-width characters (U+200B–U+200D). 
+ - `visible-plus`: allow ASCII + emoji (U+1F600–U+1F64F and modifiers/VS16), still blocking zero-width joiners, bidi, and controls. + - `ascii-only`: allow only tab/lf/cr and `0x20-0x7E`; block everything else. + - Examples: `--mode balanced` (default); `--mode visible-plus` (allow 😀, 🚀, etc. but still block zero-width joiners); `--mode ascii-only` (paranoid mode, blocks all non-ASCII). - `--include-range RANGE` - override allowed byte ranges (comma-separated, decimal or hex, supports `START-END`). Can be repeated. + - Examples: `--include-range 0x09,0x0A,0x0D,0x20-0x7E` (default printable ASCII); `--include-range 0-255` (allow all bytes); `--include-range 0x20-0x7E,0xA0` (allow NBSP too). - `--allow-chars TEXT` - permit additional characters (adds their UTF-8 bytes to the allowed set). Can be repeated. + - Examples: `--allow-chars "é"` (allow a single accent); `--allow-chars "😀"` (allow an emoji); `--allow-chars "👨‍👩‍👧‍👦"` (allow a grapheme cluster with ZWJ). - `--files-glob GLOB` - optional fnmatch-style glob to further restrict the provided file list (by default, the hook processes all files handed to it by pre-commit). + - Example: `--files-glob "*.py"` (only consider .py files from the passed list). + - `--files-include GLOB` / `--files-exclude GLOB` - additional fnmatch-style filters applied after `--files-glob`. + - Examples: `--files-include "*.md"` (only Markdown); `--files-exclude "vendor/*"` (skip vendored files). - `--check-only` - report disallowed bytes without modifying files. #### `double-quote-string-fixer` diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 69aba4c9..258946a9 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -4,8 +4,25 @@ import fnmatch from collections.abc import Sequence +import grapheme + +MODE_BALANCED = 'balanced' +MODE_VISIBLE_PLUS = 'visible-plus' +MODE_ASCII_ONLY = 'ascii-only' +MODE_CHOICES = (MODE_BALANCED, MODE_VISIBLE_PLUS, MODE_ASCII_ONLY) + DEFAULT_INCLUDE_RANGE = '0x09,0x0A,0x0D,0x20-0x7E' +ASCII_BASE = {0x09, 0x0A, 0x0D} | set(range(0x20, 0x7F)) +LATIN1_VISIBLE = set(range(0xA0, 0x100)) +CONTROL_C0 = set(range(0x00, 0x20)) +CONTROL_C1 = set(range(0x80, 0xA0)) +BIDI_OVERRIDES = set(range(0x202A, 0x202F)) | set(range(0x2066, 0x206A)) +ZERO_WIDTHS = {0x200B, 0x200C, 0x200D} +EMOJI_BASE = set(range(0x1F600, 0x1F650)) +EMOJI_MODIFIERS = set(range(0x1F3FB, 0x1F400)) +VARIATION_SELECTORS = {0xFE0F} + def _parse_byte(token: str, parser: argparse.ArgumentParser) -> int: base = 16 if token.lower().startswith('0x') else 10 @@ -36,20 +53,96 @@ def _parse_range_spec(spec: str, parser: argparse.ArgumentParser) -> set[int]: return allowed -def _build_allowed(args: argparse.Namespace, parser: argparse.ArgumentParser) -> set[int]: +def _build_allowed( + args: argparse.Namespace, parser: argparse.ArgumentParser, +) -> tuple[set[int], bool]: include_specs = args.include_range or [DEFAULT_INCLUDE_RANGE] + restrict_to_includes = bool(args.include_range) allowed: set[int] = set() for spec in include_specs: allowed.update(_parse_range_spec(spec, parser)) for extra in args.allow_chars: allowed.update(extra.encode()) - return allowed + return allowed, restrict_to_includes + + +def _match_any(path: str, patterns: list[str]) -> bool: + return any(fnmatch.fnmatch(path, pattern) for pattern in patterns) + + +def _filter_filenames( + filenames: list[str], + globs: list[str], + include: list[str], + exclude: list[str], +) -> list[str]: + selected = filenames + if globs: + selected = [f 
for f in selected if _match_any(f, globs)] + if include: + selected = [f for f in selected if _match_any(f, include)] + if exclude: + selected = [f for f in selected if not _match_any(f, exclude)] + return selected + + +def _is_control_or_null(cp: int) -> bool: + return cp in CONTROL_C0 or cp in CONTROL_C1 or cp == 0x7F + + +def _cluster_allowed_balanced(cluster_cps: list[int]) -> bool: + if any(cp in BIDI_OVERRIDES for cp in cluster_cps): + return False + if any(cp in ZERO_WIDTHS for cp in cluster_cps): + return False + if any(_is_control_or_null(cp) and cp not in {0x09, 0x0A, 0x0D} for cp in cluster_cps): + return False + return all(cp in ASCII_BASE or cp in LATIN1_VISIBLE for cp in cluster_cps) + + +def _cluster_allowed_visible_plus(cluster_cps: list[int]) -> bool: + if any(cp in ZERO_WIDTHS for cp in cluster_cps): + return False + if any(cp in BIDI_OVERRIDES for cp in cluster_cps): + return False + if any(_is_control_or_null(cp) and cp not in {0x09, 0x0A, 0x0D} for cp in cluster_cps): + return False + + if all(cp in ASCII_BASE for cp in cluster_cps): + return True + + emoji_ok = all( + cp in EMOJI_BASE + or cp in EMOJI_MODIFIERS + or cp in VARIATION_SELECTORS + for cp in cluster_cps + ) + return emoji_ok + + +def _cluster_allowed_ascii_only(cluster_cps: list[int]) -> bool: + return all(cp in ASCII_BASE for cp in cluster_cps) + +def _cluster_allowed( + cluster_bytes: bytes, + cluster_text: str, + allowed_bytes: set[int], + mode: str, + restrict_to_allowed_bytes: bool, +) -> bool: + if cluster_bytes and all(b in allowed_bytes for b in cluster_bytes): + return True -def _filter_filenames(filenames: list[str], globs: list[str]) -> list[str]: - if not globs: - return filenames - return [f for f in filenames if any(fnmatch.fnmatch(f, g) for g in globs)] + if restrict_to_allowed_bytes: + return False + + cps = [ord(ch) for ch in cluster_text] + if mode == MODE_BALANCED: + return _cluster_allowed_balanced(cps) + if mode == MODE_VISIBLE_PLUS: + return _cluster_allowed_visible_plus(cps) + return _cluster_allowed_ascii_only(cps) def _format_offenders(offenders: list[tuple[int, int]]) -> str: @@ -76,6 +169,20 @@ def main(argv: Sequence[str] | None = None) -> int: metavar='GLOB', help='Optional fnmatch-style glob to further filter listed files.', ) + parser.add_argument( + '--files-include', + action='append', + default=[], + metavar='GLOB', + help='Additional fnmatch-style patterns to include (after files-glob).', + ) + parser.add_argument( + '--files-exclude', + action='append', + default=[], + metavar='GLOB', + help='Fnmatch-style patterns to exclude (applied last).', + ) parser.add_argument( '--include-range', action='append', @@ -93,6 +200,15 @@ def main(argv: Sequence[str] | None = None) -> int: metavar='CHARS', help='Additional characters to permit (UTF-8 bytes of the given text).', ) + parser.add_argument( + '--mode', + choices=MODE_CHOICES, + default=MODE_BALANCED, + help=( + 'Character policy: balanced (default, allow ASCII + Latin-1, block bidi/zero-width/control),' # noqa: E501 + ' visible-plus (ASCII + emoji, block others), ascii-only (strict).' 
# noqa: E501 + ), + ) parser.add_argument( '--check-only', action='store_true', @@ -101,15 +217,37 @@ def main(argv: Sequence[str] | None = None) -> int: parser.add_argument('filenames', nargs='+', help='Files to check') args = parser.parse_args(argv) - allowed = _build_allowed(args, parser) - filenames = _filter_filenames(args.filenames, args.files_glob) + allowed, restrict_to_includes = _build_allowed(args, parser) + filenames = _filter_filenames( + args.filenames, args.files_glob, args.files_include, args.files_exclude, + ) retv = 0 for filename in filenames: with open(filename, 'rb') as f: data = f.read() - offenders = [(i, b) for i, b in enumerate(data) if b not in allowed] + try: + text = data.decode('utf-8', errors='surrogateescape') + except UnicodeDecodeError: + text = data.decode('utf-8', errors='ignore') + + offenders: list[tuple[int, int]] = [] + new_chunks: list[bytes] = [] + + byte_pos = 0 + for cluster in grapheme.graphemes(text): + cluster_bytes = cluster.encode('utf-8', errors='surrogateescape') + if _cluster_allowed( + cluster_bytes, cluster, allowed, args.mode, restrict_to_includes, + ): + new_chunks.append(cluster_bytes) + else: + offenders.extend( + (byte_pos + i, cluster_bytes[i]) for i in range(len(cluster_bytes)) + ) + byte_pos += len(cluster_bytes) + if not offenders: continue @@ -118,7 +256,7 @@ def main(argv: Sequence[str] | None = None) -> int: retv = 1 continue - new_data = bytes(b for b in data if b in allowed) + new_data = b''.join(new_chunks) with open(filename, 'wb') as f: f.write(new_data) print( diff --git a/setup.cfg b/setup.cfg index 9510f8cb..8f3d3ff4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,11 @@ classifiers = [options] packages = find: -python_requires = >=3.8 +install_requires = + ruamel.yaml>=0.15 + grapheme>=0.6.0 + tomli>=1.1.0;python_version<"3.11" +python_requires = >=3.9 [options.packages.find] exclude = diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 46d75495..1e345562 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -4,6 +4,9 @@ import pytest +from pre_commit_hooks.non_ascii_guard import MODE_ASCII_ONLY +from pre_commit_hooks.non_ascii_guard import MODE_BALANCED +from pre_commit_hooks.non_ascii_guard import MODE_VISIBLE_PLUS from pre_commit_hooks.non_ascii_guard import main from testing.util import get_resource_path @@ -54,10 +57,10 @@ def test_include_range_allows_bytes(tmp_path) -> None: def test_allow_chars_adds_utf8_bytes(tmp_path) -> None: path = tmp_path / 'text.txt' - content = 'café\n'.encode('utf-8') + content = 'Ωmega\n'.encode('utf-8') path.write_bytes(content) - ret = main(['--allow-chars', 'é', str(path)]) + ret = main(['--allow-chars', 'Ω', str(path)]) assert ret == 0 assert path.read_bytes() == content @@ -78,7 +81,8 @@ def test_reports_positions_for_multibyte_chars(tmp_path, capsys) -> None: assert ret == 1 out = capsys.readouterr().out - assert 'disallowed bytes 0xc3@1, 0xa9@2, 0xce@3, 0xa9@4, 0xe2@5' in out + # Balanced allows é, so first offenders start at Ω (0xce@3) + assert 'disallowed bytes 0xce@3, 0xa9@4, 0xe2@5' in out def test_printable_offender_shows_char(tmp_path, capsys) -> None: @@ -123,7 +127,7 @@ def test_fixture_file_is_cleaned(tmp_path, capsys) -> None: ret = main([str(path)]) assert ret == 1 - assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: caf\n' + assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: café\n' out = capsys.readouterr().out assert f'Fixing {path}: disallowed bytes ' in out @@ -139,6 +143,7 @@ def 
test_combined_parameters(tmp_path, capsys): f4.write_bytes(b'abc\x01\x80\n') ret = main([ + '--mode', MODE_BALANCED, '--files-glob', '*.txt', '--allow-chars', 'é', '--include-range', '0x0A,0x20-0x7E', @@ -146,16 +151,17 @@ def test_combined_parameters(tmp_path, capsys): str(f1), str(f2), str(f3), str(f4) ]) out = capsys.readouterr().out - assert f1.name not in out - assert f2.name in out and 'disallowed bytes' in out - assert f3.name in out and 'disallowed bytes' in out - assert f4.name in out and 'disallowed bytes' in out + assert f1.name not in out # café allowed in balanced + assert f2.name in out and 'disallowed bytes' in out # £ not allowed + assert f3.name in out and 'disallowed bytes' in out # bidi + assert f4.name in out and 'disallowed bytes' in out # controls assert ret == 1 f2.write_bytes(b'smile \xc2\xa3\n') f3.write_bytes(b'abc\xe2\x80\xae\n') f4.write_bytes(b'abc\x01\x80\n') ret2 = main([ + '--mode', MODE_BALANCED, '--files-glob', '*.txt', '--allow-chars', 'é', '--include-range', '0x0A,0x20-0x7E', @@ -166,11 +172,76 @@ def test_combined_parameters(tmp_path, capsys): assert '£' not in f2.read_bytes().decode('utf-8') assert '\u202e' not in f3.read_bytes().decode('utf-8') and b'\xe2\x80\xae' not in f3.read_bytes() assert f4.read_bytes() == b'abc\n' - # All files except f1 should be mentioned in output assert f2.name in out2 and f3.name in out2 and f4.name in out2 assert ret2 == 1 +def test_mode_visible_plus_allows_emoji_blocks_accents(tmp_path, capsys): + path = tmp_path / 'emoji.txt' + path.write_text('hi 😀 é') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert path.name in out + assert 'disallowed bytes' in out # accent is blocked + + +def test_mode_ascii_only_is_strict(tmp_path, capsys): + path = tmp_path / 'strict.txt' + path.write_text('hi café 😀') + + ret = main(['--mode', MODE_ASCII_ONLY, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_mode_balanced_allows_latin1_blocks_bidi(tmp_path, capsys): + path = tmp_path / 'latin1.txt' + path.write_bytes('café \u202e'.encode('utf-8')) + + ret = main(['--mode', MODE_BALANCED, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + assert 'latin1.txt' in out + + +def test_zwj_emoji_blocked_as_cluster(tmp_path, capsys): + path = tmp_path / 'family.txt' + path.write_text('family: 👨\u200d👩\u200d👧\u200d👦 end') + + ret = main(['--mode', MODE_VISIBLE_PLUS, str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'Fixing' in out + assert 'family.txt' in out + assert '\u200d' not in path.read_text() + + +def test_files_include_and_exclude(tmp_path): + keep = tmp_path / 'skip.md' + take = tmp_path / 'scan.py' + keep.write_text('ok café') + take.write_text('hi café') + + ret = main([ + '--mode', MODE_VISIBLE_PLUS, + '--files-include', '*.py', + '--files-exclude', '*.md', + str(keep), str(take), + ]) + + assert ret == 1 + assert 'café' not in take.read_text() + assert keep.read_text() == 'ok café' + + def test_include_range_ignores_empty_parts(tmp_path): path = tmp_path / 'bytes.bin' path.write_bytes(b'\x01\x02') From a84a04e6b788b442ef4efb773e441a4e1d9cc31b Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 15:00:20 -0600 Subject: [PATCH 6/8] set encoding for tests --- tests/non_ascii_guard_test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 1e345562..d449bbb4 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -127,7 +127,7 @@ def test_fixture_file_is_cleaned(tmp_path, capsys) -> None: ret = main([str(path)]) assert ret == 1 - assert path.read_text() == 'ASCII ok\nHas ctrl:\nUnicode: café\n' + assert path.read_text(encoding='utf-8') == 'ASCII ok\nHas ctrl:\nUnicode: café\n' out = capsys.readouterr().out assert f'Fixing {path}: disallowed bytes ' in out @@ -178,7 +178,7 @@ def test_combined_parameters(tmp_path, capsys): def test_mode_visible_plus_allows_emoji_blocks_accents(tmp_path, capsys): path = tmp_path / 'emoji.txt' - path.write_text('hi 😀 é') + path.write_text('hi 😀 é', encoding='utf-8') ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) @@ -190,7 +190,7 @@ def test_mode_visible_plus_allows_emoji_blocks_accents(tmp_path, capsys): def test_mode_ascii_only_is_strict(tmp_path, capsys): path = tmp_path / 'strict.txt' - path.write_text('hi café 😀') + path.write_text('hi café 😀', encoding='utf-8') ret = main(['--mode', MODE_ASCII_ONLY, '--check-only', str(path)]) @@ -213,7 +213,7 @@ def test_mode_balanced_allows_latin1_blocks_bidi(tmp_path, capsys): def test_zwj_emoji_blocked_as_cluster(tmp_path, capsys): path = tmp_path / 'family.txt' - path.write_text('family: 👨\u200d👩\u200d👧\u200d👦 end') + path.write_text('family: 👨\u200d👩\u200d👧\u200d👦 end', encoding='utf-8') ret = main(['--mode', MODE_VISIBLE_PLUS, str(path)]) @@ -221,14 +221,14 @@ def test_zwj_emoji_blocked_as_cluster(tmp_path, capsys): out = capsys.readouterr().out assert 'Fixing' in out assert 'family.txt' in out - assert '\u200d' not in path.read_text() + assert '\u200d' not in path.read_text(encoding='utf-8') def test_files_include_and_exclude(tmp_path): keep = tmp_path / 'skip.md' take = tmp_path / 'scan.py' - keep.write_text('ok café') - take.write_text('hi café') + keep.write_text('ok café', encoding='utf-8') + take.write_text('hi café', encoding='utf-8') ret = main([ '--mode', MODE_VISIBLE_PLUS, @@ -238,8 +238,8 @@ def test_files_include_and_exclude(tmp_path): ]) assert ret == 1 - assert 'café' not in take.read_text() - assert keep.read_text() == 'ok café' + assert 'café' not in take.read_text(encoding='utf-8') + assert keep.read_text(encoding='utf-8') == 'ok café' def test_include_range_ignores_empty_parts(tmp_path): From 94568472912439aeb68ea68700157c0850e8e909 Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 15:06:54 -0600 Subject: [PATCH 7/8] increase coverage --- pre_commit_hooks/non_ascii_guard.py | 5 +---- tests/non_ascii_guard_test.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py index 258946a9..284876f8 100644 --- a/pre_commit_hooks/non_ascii_guard.py +++ b/pre_commit_hooks/non_ascii_guard.py @@ -227,10 +227,7 @@ def main(argv: Sequence[str] | None = None) -> int: with open(filename, 'rb') as f: data = f.read() - try: - text = data.decode('utf-8', errors='surrogateescape') - except UnicodeDecodeError: - text = data.decode('utf-8', errors='ignore') + text = data.decode('utf-8', errors='surrogateescape') offenders: list[tuple[int, int]] = [] new_chunks: list[bytes] = [] diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index d449bbb4..0d25c9a5 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -242,6 +242,22 @@ def 
test_files_include_and_exclude(tmp_path): assert keep.read_text(encoding='utf-8') == 'ok café' +def test_include_range_restricts_even_if_mode_allows(tmp_path, capsys): + path = tmp_path / 'range.txt' + path.write_text('hello 😀', encoding='utf-8') + + ret = main([ + '--mode', MODE_VISIBLE_PLUS, + '--include-range', '0x20-0x7E', # ASCII only + str(path), + ]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + assert 'range.txt' in out + + def test_include_range_ignores_empty_parts(tmp_path): path = tmp_path / 'bytes.bin' path.write_bytes(b'\x01\x02') From 490d43f4d0e6e4d0b191917fe2b8e681ca26b5cf Mon Sep 17 00:00:00 2001 From: Ahmed Zeinelabdin Date: Wed, 21 Jan 2026 15:41:39 -0600 Subject: [PATCH 8/8] add more tests for coverage --- tests/non_ascii_guard_test.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py index 0d25c9a5..1a0dadef 100644 --- a/tests/non_ascii_guard_test.py +++ b/tests/non_ascii_guard_test.py @@ -4,6 +4,7 @@ import pytest +from pre_commit_hooks.non_ascii_guard import _cluster_allowed_visible_plus from pre_commit_hooks.non_ascii_guard import MODE_ASCII_ONLY from pre_commit_hooks.non_ascii_guard import MODE_BALANCED from pre_commit_hooks.non_ascii_guard import MODE_VISIBLE_PLUS @@ -290,3 +291,36 @@ def test_descending_range_exits(tmp_path): with pytest.raises(SystemExit): main(['--include-range', '10-5', str(path)]) + + +def test_visible_plus_blocks_bidi_in_cluster(tmp_path, capsys): + """Test line 107: bidi override check in _cluster_allowed_visible_plus""" + path = tmp_path / 'bidi.txt' + # Emoji followed by bidi override U+202E (RIGHT-TO-LEFT OVERRIDE) + path.write_text('test😀\u202Eword', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_visible_plus_blocks_control_in_cluster(tmp_path, capsys): + """Test line 109: control character check in _cluster_allowed_visible_plus""" + path = tmp_path / 'ctrl.txt' + # Emoji followed by control char U+0001 (not tab/LF/CR) + path.write_text('test😀\x01word', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_visible_plus_allows_pure_ascii(): + """Test line 112: early return for ASCII-only clusters in visible-plus""" + # Direct unit test of _cluster_allowed_visible_plus with ASCII-only input + ascii_cps = [ord(c) for c in 'hello'] + assert _cluster_allowed_visible_plus(ascii_cps) is True
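
As a quick illustration of the finished behaviour (not part of the patches above), here is a minimal sketch that drives the hook's entry point directly. It assumes the patched package and its new grapheme dependency are installed; the temporary file and its contents are made up for the example.

import os
import tempfile

from pre_commit_hooks.non_ascii_guard import main

# Illustrative file containing a C0 control byte and a bidi override,
# both of which every mode rejects.
path = os.path.join(tempfile.mkdtemp(), 'sample.txt')
with open(path, 'wb') as f:
    f.write('abc\x01\u202edef\n'.encode('utf-8'))

# Report only: prints the offending byte offsets and leaves the file untouched.
assert main(['--check-only', path]) == 1

# Default (fixing) run under the balanced policy: strips the disallowed bytes in place.
assert main([path]) == 1
with open(path, 'rb') as f:
    assert f.read() == b'abcdef\n'

# A second pass finds nothing left to fix.
assert main([path]) == 0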