diff --git a/README.md b/README.md index 5e23484f..862c39e3 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,23 @@ The following arguments are available: #### `detect-private-key` Checks for the existence of private keys. +#### `detect-non-ascii-characters` +Detects and strips disallowed bytes such as control characters, bidi overrides, zero-width characters and other non-permitted non-ASCII text (supply-chain safety guard). + - Modes (choose via `--mode`): + - `balanced` (default): allow ASCII + Latin-1 accents/symbols; block controls/null, bidi overrides (U+202A–U+202E, U+2066–U+2069), and zero-width characters (U+200B–U+200D). + - `visible-plus`: allow ASCII + emoji (U+1F600–U+1F64F and modifiers/VS16), still blocking zero-width joiners, bidi, and controls. + - `ascii-only`: allow only tab/lf/cr and `0x20-0x7E`; block everything else. + - Examples: `--mode balanced` (default); `--mode visible-plus` (allow 😀, 🙏, etc. but still block zero-width joiners); `--mode ascii-only` (paranoid mode, blocks all non-ASCII). + - `--include-range RANGE` - override allowed byte ranges (comma-separated, decimal or hex, supports `START-END`). Can be repeated. When given, `--mode` is ignored: only the listed bytes (plus `--allow-chars`) are kept. + - Examples: `--include-range 0x09,0x0A,0x0D,0x20-0x7E` (default printable ASCII); `--include-range 0-255` (allow all bytes); `--include-range 0x20-0x7E,0xC2,0xA0` (allow UTF-8 encoded NBSP too). + - `--allow-chars TEXT` - permit additional characters (adds their UTF-8 bytes to the allowed set). Can be repeated. + - Examples: `--allow-chars "é"` (allow a single accent); `--allow-chars "😀"` (allow an emoji); `--allow-chars "👨‍👩‍👧‍👦"` (allow a grapheme cluster with ZWJ). + - `--files-glob GLOB` - optional fnmatch-style glob to further restrict the provided file list (by default, the hook processes all files handed to it by pre-commit). + - Example: `--files-glob "*.py"` (only consider .py files from the passed list). + - `--files-include GLOB` / `--files-exclude GLOB` - additional fnmatch-style filters applied after `--files-glob`. 
+    - Examples: `--files-include "*.md"` (only Markdown); `--files-exclude "vendor/*"` (skip vendored files).
+  - `--check-only` - report disallowed bytes without modifying files.
+
 #### `double-quote-string-fixer`
 This hook replaces double quoted strings with single quoted strings.
 
diff --git a/pre_commit_hooks/non_ascii_guard.py b/pre_commit_hooks/non_ascii_guard.py
new file mode 100644
index 00000000..284876f8
--- /dev/null
+++ b/pre_commit_hooks/non_ascii_guard.py
@@ -0,0 +1,328 @@
+from __future__ import annotations
+
+import argparse
+import fnmatch
+from collections.abc import Sequence
+
+import grapheme
+
+MODE_BALANCED = 'balanced'
+MODE_VISIBLE_PLUS = 'visible-plus'
+MODE_ASCII_ONLY = 'ascii-only'
+MODE_CHOICES = (MODE_BALANCED, MODE_VISIBLE_PLUS, MODE_ASCII_ONLY)
+
+# Bytes allowed by default: tab, LF, CR and printable ASCII (0x20-0x7E).
+DEFAULT_INCLUDE_RANGE = '0x09,0x0A,0x0D,0x20-0x7E'
+
+# Code point classes consulted by the per-mode policies below.
+ASCII_BASE = {0x09, 0x0A, 0x0D} | set(range(0x20, 0x7F))
+# NOTE(review): this range also holds NBSP (U+00A0) and SOFT HYPHEN (U+00AD),
+# which have no visible glyph - confirm that allowing them is intended.
+LATIN1_VISIBLE = set(range(0xA0, 0x100))
+CONTROL_C0 = set(range(0x00, 0x20))
+CONTROL_C1 = set(range(0x80, 0xA0))
+BIDI_OVERRIDES = set(range(0x202A, 0x202F)) | set(range(0x2066, 0x206A))
+ZERO_WIDTHS = {0x200B, 0x200C, 0x200D}
+EMOJI_BASE = set(range(0x1F600, 0x1F650))  # U+1F600-U+1F64F
+EMOJI_MODIFIERS = set(range(0x1F3FB, 0x1F400))  # U+1F3FB-U+1F3FF
+VARIATION_SELECTORS = {0xFE0F}  # VS16
+
+
+def _parse_byte(token: str, parser: argparse.ArgumentParser) -> int:
+    """Parse one byte value given in decimal or ``0x``-prefixed hex.
+
+    Whitespace around the token is ignored, so ``0x20 - 0x7E`` parses the
+    same as ``0x20-0x7E``.  A malformed or out-of-range (not ``0..255``)
+    token is reported through ``parser.error``, which exits.
+    """
+    text = token.strip()
+    base = 16 if text.lower().startswith('0x') else 10
+    try:
+        value = int(text, base)
+    except ValueError:
+        parser.error(f'invalid byte value {token!r}')
+    if not 0 <= value <= 0xFF:
+        parser.error(f'byte value out of range: {token!r}')
+    return value
+
+
+def _parse_range_spec(spec: str, parser: argparse.ArgumentParser) -> set[int]:
+    """Expand a comma-separated spec such as ``0x09,0x20-0x7E`` into bytes.
+
+    Empty parts are skipped; ``START-END`` spans are inclusive and must not
+    be descending.
+    """
+    allowed: set[int] = set()
+    for raw_part in spec.split(','):
+        part = raw_part.strip()
+        if not part:
+            continue
+        if '-' in part:
+            start_s, end_s = part.split('-', 1)
+            start = _parse_byte(start_s, parser)
+            end = _parse_byte(end_s, parser)
+            if start > end:
+                parser.error(f'invalid range {part!r}: start > end')
+            allowed.update(range(start, end + 1))
+        else:
+            allowed.add(_parse_byte(part, parser))
+    return allowed
+
+
+def _build_allowed(
+    args: argparse.Namespace, parser: argparse.ArgumentParser,
+) -> tuple[set[int], bool]:
+    """Return ``(allowed_bytes, restrict_to_includes)`` for the parsed args.
+
+    ``restrict_to_includes`` is true when ``--include-range`` was passed; the
+    caller then ignores ``--mode`` and keeps only clusters made of allowed
+    bytes.
+
+    NOTE(review): ``--allow-chars`` contributes individual UTF-8 bytes, so
+    any other sequence built from those same bytes is accepted as well
+    (e.g. allowing a ZWJ emoji sequence also admits a bare U+200D).
+    """
+    include_specs = args.include_range or [DEFAULT_INCLUDE_RANGE]
+    restrict_to_includes = bool(args.include_range)
+    allowed: set[int] = set()
+    for spec in include_specs:
+        allowed.update(_parse_range_spec(spec, parser))
+    for extra in args.allow_chars:
+        allowed.update(extra.encode())
+    return allowed, restrict_to_includes
+
+
+def _match_any(path: str, patterns: list[str]) -> bool:
+    """Return whether ``path`` matches at least one fnmatch pattern."""
+    return any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
+
+
+def _filter_filenames(
+    filenames: list[str],
+    globs: list[str],
+    include: list[str],
+    exclude: list[str],
+) -> list[str]:
+    """Narrow ``filenames`` by the glob, include and exclude pattern lists.
+
+    An empty pattern list leaves the selection untouched; excludes are
+    applied last.
+    """
+    selected = filenames
+    if globs:
+        selected = [f for f in selected if _match_any(f, globs)]
+    if include:
+        selected = [f for f in selected if _match_any(f, include)]
+    if exclude:
+        selected = [f for f in selected if not _match_any(f, exclude)]
+    return selected
+
+
+def _is_control_or_null(cp: int) -> bool:
+    """Return whether ``cp`` is a C0/C1 control code point or DEL (0x7F)."""
+    return cp in CONTROL_C0 or cp in CONTROL_C1 or cp == 0x7F
+
+
+def _cluster_allowed_balanced(cluster_cps: list[int]) -> bool:
+    """Balanced policy: only ASCII_BASE / LATIN1_VISIBLE code points pass."""
+    if any(cp in BIDI_OVERRIDES for cp in cluster_cps):
+        return False
+    if any(cp in ZERO_WIDTHS for cp in cluster_cps):
+        return False
+    if any(_is_control_or_null(cp) and cp not in {0x09, 0x0A, 0x0D} for cp in cluster_cps):
+        return False
+    return all(cp in ASCII_BASE or cp in LATIN1_VISIBLE for cp in cluster_cps)
+
+
+def _cluster_allowed_visible_plus(cluster_cps: list[int]) -> bool:
+    """Visible-plus policy: pure ASCII clusters or pure emoji clusters.
+
+    Emoji clusters may only contain U+1F600-U+1F64F, the skin-tone modifiers
+    and VS16; zero-width, bidi and control code points are always rejected.
+    """
+    if any(cp in ZERO_WIDTHS for cp in cluster_cps):
+        return False
+    if any(cp in BIDI_OVERRIDES for cp in cluster_cps):
+        return False
+    if any(_is_control_or_null(cp) and cp not in {0x09, 0x0A, 0x0D} for cp in cluster_cps):
+        return False
+
+    if all(cp in ASCII_BASE for cp in cluster_cps):
+        return True
+
+    return all(
+        cp in EMOJI_BASE
+        or cp in EMOJI_MODIFIERS
+        or cp in VARIATION_SELECTORS
+        for cp in cluster_cps
+    )
+
+
+def _cluster_allowed_ascii_only(cluster_cps: list[int]) -> bool:
+    """ASCII-only policy: tab, LF, CR and 0x20-0x7E."""
+    return all(cp in ASCII_BASE for cp in cluster_cps)
+
+
+def _cluster_allowed(
+    cluster_bytes: bytes,
+    cluster_text: str,
+    allowed_bytes: set[int],
+    mode: str,
+    restrict_to_allowed_bytes: bool,
+) -> bool:
+    """Decide whether one grapheme cluster may stay in the file.
+
+    A cluster made only of allowed bytes is always kept.  Otherwise it is
+    rejected outright when ``restrict_to_allowed_bytes`` is set, and judged
+    by the policy for ``mode`` when it is not.
+    """
+    if cluster_bytes and all(b in allowed_bytes for b in cluster_bytes):
+        return True
+
+    if restrict_to_allowed_bytes:
+        return False
+
+    cps = [ord(ch) for ch in cluster_text]
+    if mode == MODE_BALANCED:
+        return _cluster_allowed_balanced(cps)
+    if mode == MODE_VISIBLE_PLUS:
+        return _cluster_allowed_visible_plus(cps)
+    return _cluster_allowed_ascii_only(cps)
+
+
+def _format_offenders(offenders: list[tuple[int, int]]) -> str:
+    """Render up to five ``(offset, byte)`` pairs, e.g. ``0x80@3``.
+
+    Printable ASCII bytes also show the character; ``, ...`` marks a
+    truncated list.
+    """
+    def _label(pos: int, b: int) -> str:
+        if 0x20 <= b <= 0x7E:
+            ch = chr(b)
+            # use repr to surface escapes for backslash/quote while staying ASCII
+            ch_repr = repr(ch)[1:-1]
+            return f"0x{b:02x}('{ch_repr}')@{pos}"
+        return f'0x{b:02x}@{pos}'
+
+    preview = ', '.join(_label(i, b) for i, b in offenders[:5])
+    if len(offenders) > 5:
+        preview += ', ...'
+    return preview
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Check the given files and strip (or only report) disallowed bytes.
+
+    Returns 1 when any file contained disallowed bytes, else 0.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--files-glob',
+        action='append',
+        default=[],
+        metavar='GLOB',
+        help='Optional fnmatch-style glob to further filter listed files.',
+    )
+    parser.add_argument(
+        '--files-include',
+        action='append',
+        default=[],
+        metavar='GLOB',
+        help='Additional fnmatch-style patterns to include (after files-glob).',
+    )
+    parser.add_argument(
+        '--files-exclude',
+        action='append',
+        default=[],
+        metavar='GLOB',
+        help='Fnmatch-style patterns to exclude (applied last).',
+    )
+    parser.add_argument(
+        '--include-range',
+        action='append',
+        metavar='RANGE',
+        help=(
+            'Comma-separated byte ranges to allow. '
+            'Supports decimal or 0x-prefixed hex values and START-END spans. '
+            f'Default: {DEFAULT_INCLUDE_RANGE}'
+        ),
+    )
+    parser.add_argument(
+        '--allow-chars',
+        action='append',
+        default=[],
+        metavar='CHARS',
+        help='Additional characters to permit (UTF-8 bytes of the given text).',
+    )
+    parser.add_argument(
+        '--mode',
+        choices=MODE_CHOICES,
+        default=MODE_BALANCED,
+        help=(
+            'Character policy: balanced (default, allow ASCII + Latin-1, block bidi/zero-width/control),'  # noqa: E501
+            ' visible-plus (ASCII + emoji, block others), ascii-only (strict).'  # noqa: E501
+        ),
+    )
+    parser.add_argument(
+        '--check-only',
+        action='store_true',
+        help='Detect disallowed bytes but do not modify files.',
+    )
+    parser.add_argument('filenames', nargs='+', help='Files to check')
+    args = parser.parse_args(argv)
+
+    allowed, restrict_to_includes = _build_allowed(args, parser)
+    filenames = _filter_filenames(
+        args.filenames, args.files_glob, args.files_include, args.files_exclude,
+    )
+
+    retv = 0
+    for filename in filenames:
+        with open(filename, 'rb') as f:
+            data = f.read()
+
+        # Fast path: when every byte is individually allowed, each cluster
+        # passes the byte check in _cluster_allowed, so there is nothing to
+        # report and grapheme segmentation can be skipped.
+        if allowed.issuperset(data):
+            continue
+
+        # surrogateescape keeps undecodable bytes round-trippable.
+        text = data.decode('utf-8', errors='surrogateescape')
+
+        offenders: list[tuple[int, int]] = []
+        new_chunks: list[bytes] = []
+
+        byte_pos = 0
+        for cluster in grapheme.graphemes(text):
+            cluster_bytes = cluster.encode('utf-8', errors='surrogateescape')
+            if _cluster_allowed(
+                cluster_bytes, cluster, allowed, args.mode, restrict_to_includes,
+            ):
+                new_chunks.append(cluster_bytes)
+            else:
+                offenders.extend(
+                    (byte_pos + i, b) for i, b in enumerate(cluster_bytes)
+                )
+            byte_pos += len(cluster_bytes)
+
+        if not offenders:
+            continue
+
+        if args.check_only:
+            print(f'{filename}: disallowed bytes {_format_offenders(offenders)}')
+            retv = 1
+            continue
+
+        new_data = b''.join(new_chunks)
+        with open(filename, 'wb') as f:
+            f.write(new_data)
+        print(
+            f'Fixing {filename}: ' f'disallowed bytes {_format_offenders(offenders)}',
+        )
+        retv = 1
+    return retv
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/setup.cfg b/setup.cfg index
65a6b895..8f3d3ff4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,11 @@ classifiers = [options] packages = find: -python_requires = >=3.8 +install_requires = + ruamel.yaml>=0.15 + grapheme>=0.6.0 + tomli>=1.1.0;python_version<"3.11" +python_requires = >=3.9 [options.packages.find] exclude = @@ -46,6 +50,7 @@ console_scripts = destroyed-symlinks = pre_commit_hooks.destroyed_symlinks:main detect-aws-credentials = pre_commit_hooks.detect_aws_credentials:main detect-private-key = pre_commit_hooks.detect_private_key:main + detect-non-ascii-characters = pre_commit_hooks.non_ascii_guard:main double-quote-string-fixer = pre_commit_hooks.string_fixer:main end-of-file-fixer = pre_commit_hooks.end_of_file_fixer:main file-contents-sorter = pre_commit_hooks.file_contents_sorter:main diff --git a/testing/resources/non_ascii_sample.txt b/testing/resources/non_ascii_sample.txt new file mode 100644 index 00000000..6c72838e --- /dev/null +++ b/testing/resources/non_ascii_sample.txt @@ -0,0 +1,3 @@ +ASCII ok +Has ctrl: +Unicode: café diff --git a/tests/non_ascii_guard_test.py b/tests/non_ascii_guard_test.py new file mode 100644 index 00000000..1a0dadef --- /dev/null +++ b/tests/non_ascii_guard_test.py @@ -0,0 +1,326 @@ +from __future__ import annotations + +import shutil + +import pytest + +from pre_commit_hooks.non_ascii_guard import _cluster_allowed_visible_plus +from pre_commit_hooks.non_ascii_guard import MODE_ASCII_ONLY +from pre_commit_hooks.non_ascii_guard import MODE_BALANCED +from pre_commit_hooks.non_ascii_guard import MODE_VISIBLE_PLUS +from pre_commit_hooks.non_ascii_guard import main +from testing.util import get_resource_path + +def test_no_changes_returns_zero(tmp_path) -> None: + path = tmp_path / 'ok.txt' + path.write_bytes(b'hello\n') + + ret = main([str(path)]) + + assert ret == 0 + assert path.read_bytes() == b'hello\n' + + +def test_strips_disallowed_bytes(tmp_path, capsys) -> None: + path = tmp_path / 'bad.txt' + path.write_bytes(b'abc\x80def\n') + + ret = 
main([str(path)]) + + assert ret == 1 + assert path.read_bytes() == b'abcdef\n' + out = capsys.readouterr().out + assert f'Fixing {path}: disallowed bytes 0x80@3' in out + + +def test_check_only_reports_and_keeps(tmp_path, capsys) -> None: + path = tmp_path / 'bad.txt' + original = b'abc\x00def\n' + path.write_bytes(original) + + ret = main(['--check-only', str(path)]) + + assert ret == 1 + assert path.read_bytes() == original + out = capsys.readouterr().out + assert 'disallowed bytes 0x00@3' in out + + +def test_include_range_allows_bytes(tmp_path) -> None: + path = tmp_path / 'binary.bin' + path.write_bytes(bytes(range(256))) + + ret = main(['--include-range', '0-255', str(path)]) + + assert ret == 0 + assert path.read_bytes() == bytes(range(256)) + + +def test_allow_chars_adds_utf8_bytes(tmp_path) -> None: + path = tmp_path / 'text.txt' + content = 'Ωmega\n'.encode('utf-8') + path.write_bytes(content) + + ret = main(['--allow-chars', 'Ω', str(path)]) + + assert ret == 0 + assert path.read_bytes() == content + + +def test_reports_positions_for_multibyte_chars(tmp_path, capsys) -> None: + path = tmp_path / 'multi.txt' + path.write_bytes( + ( + 'a' # allowed + 'éΩ€𝜋💵🪱' # multibyte non-ASCII + '\u200b\u202e\u2066' # zero-width, bidi controls + '\x01' # control byte + ).encode('utf-8'), + ) + + ret = main(['--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + # Balanced allows é, so first offenders start at Ω (0xce@3) + assert 'disallowed bytes 0xce@3, 0xa9@4, 0xe2@5' in out + + +def test_printable_offender_shows_char(tmp_path, capsys) -> None: + path = tmp_path / 'ascii.txt' + path.write_text('A\n') + + ret = main(['--include-range', '0x00-0x1F', '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert f'{path}: disallowed bytes 0x41(\'A\')@0' in out + + +def test_default_allows_whitespace_and_printable(tmp_path) -> None: + path = tmp_path / 'mix.txt' + path.write_bytes(b'abc\t\n\r\x01def') + + ret = 
main([str(path)]) + + assert ret == 1 + assert path.read_bytes() == b'abc\t\n\rdef' + + +def test_files_glob_filters_targets(tmp_path) -> None: + kept = tmp_path / 'skip.bin' + target = tmp_path / 'take.txt' + kept.write_bytes(b'abc\x01def\n') + target.write_bytes(b'xyz\x01uvw\n') + + ret = main(['--files-glob', '*.txt', str(kept), str(target)]) + + assert ret == 1 + assert kept.read_bytes() == b'abc\x01def\n' + assert target.read_bytes() == b'xyzuvw\n' + + +def test_fixture_file_is_cleaned(tmp_path, capsys) -> None: + fixture = get_resource_path('non_ascii_sample.txt') + path = tmp_path / 'copy.txt' + shutil.copy(fixture, path) + + ret = main([str(path)]) + + assert ret == 1 + assert path.read_text(encoding='utf-8') == 'ASCII ok\nHas ctrl:\nUnicode: café\n' + out = capsys.readouterr().out + assert f'Fixing {path}: disallowed bytes ' in out + +def test_combined_parameters(tmp_path, capsys): + f1 = tmp_path / 'latin.txt' + f2 = tmp_path / 'emoji.txt' + f3 = tmp_path / 'bidi.txt' + f4 = tmp_path / 'mix.txt' + # use explicit UTF-8 to support Windows locales without UTF-8 defaults + f1.write_bytes(b'caf\xc3\xa9\n') + f2.write_bytes(b'smile \xc2\xa3\n') + f3.write_bytes(b'abc\xe2\x80\xae\n') + f4.write_bytes(b'abc\x01\x80\n') + + ret = main([ + '--mode', MODE_BALANCED, + '--files-glob', '*.txt', + '--allow-chars', 'é', + '--include-range', '0x0A,0x20-0x7E', + '--check-only', + str(f1), str(f2), str(f3), str(f4) + ]) + out = capsys.readouterr().out + assert f1.name not in out # é passes via --allow-chars; --include-range bypasses the mode + assert f2.name in out and 'disallowed bytes' in out # £ not allowed + assert f3.name in out and 'disallowed bytes' in out # bidi + assert f4.name in out and 'disallowed bytes' in out # controls + assert ret == 1 + + f2.write_bytes(b'smile \xc2\xa3\n') + f3.write_bytes(b'abc\xe2\x80\xae\n') + f4.write_bytes(b'abc\x01\x80\n') + ret2 = main([ + '--mode', MODE_BALANCED, + '--files-glob', '*.txt', + '--allow-chars', 'é', + '--include-range', '0x0A,0x20-0x7E', + str(f1), 
str(f2), str(f3), str(f4) + ]) + out2 = capsys.readouterr().out + assert f1.read_bytes().decode('utf-8') == 'café\n' + assert '£' not in f2.read_bytes().decode('utf-8') + assert '\u202e' not in f3.read_bytes().decode('utf-8') and b'\xe2\x80\xae' not in f3.read_bytes() + assert f4.read_bytes() == b'abc\n' + assert f2.name in out2 and f3.name in out2 and f4.name in out2 + assert ret2 == 1 + + +def test_mode_visible_plus_allows_emoji_blocks_accents(tmp_path, capsys): + path = tmp_path / 'emoji.txt' + path.write_text('hi 😀 é', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert path.name in out + assert 'disallowed bytes' in out # accent is blocked + + +def test_mode_ascii_only_is_strict(tmp_path, capsys): + path = tmp_path / 'strict.txt' + path.write_text('hi café 😀', encoding='utf-8') + + ret = main(['--mode', MODE_ASCII_ONLY, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_mode_balanced_allows_latin1_blocks_bidi(tmp_path, capsys): + path = tmp_path / 'latin1.txt' + path.write_bytes('café \u202e'.encode('utf-8')) + + ret = main(['--mode', MODE_BALANCED, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + assert 'latin1.txt' in out + + +def test_zwj_emoji_blocked_as_cluster(tmp_path, capsys): + path = tmp_path / 'family.txt' + path.write_text('family: 👨\u200d👩\u200d👧\u200d👦 end', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'Fixing' in out + assert 'family.txt' in out + assert '\u200d' not in path.read_text(encoding='utf-8') + + +def test_files_include_and_exclude(tmp_path): + keep = tmp_path / 'skip.md' + take = tmp_path / 'scan.py' + keep.write_text('ok café', encoding='utf-8') + take.write_text('hi café', encoding='utf-8') + + ret = 
main([ + '--mode', MODE_VISIBLE_PLUS, + '--files-include', '*.py', + '--files-exclude', '*.md', + str(keep), str(take), + ]) + + assert ret == 1 + assert 'café' not in take.read_text(encoding='utf-8') + assert keep.read_text(encoding='utf-8') == 'ok café' + + +def test_include_range_restricts_even_if_mode_allows(tmp_path, capsys): + path = tmp_path / 'range.txt' + path.write_text('hello 😀', encoding='utf-8') + + ret = main([ + '--mode', MODE_VISIBLE_PLUS, + '--include-range', '0x20-0x7E', # ASCII only + str(path), + ]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + assert 'range.txt' in out + + +def test_include_range_ignores_empty_parts(tmp_path): + path = tmp_path / 'bytes.bin' + path.write_bytes(b'\x01\x02') + + ret = main(['--include-range', '1,,2', str(path)]) + + assert ret == 0 + assert path.read_bytes() == b'\x01\x02' + + +def test_invalid_include_range_token_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '0xZZ', str(path)]) + + +def test_out_of_range_byte_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '0x1FF', str(path)]) + + +def test_descending_range_exits(tmp_path): + path = tmp_path / 'file.txt' + path.write_text('ok') + + with pytest.raises(SystemExit): + main(['--include-range', '10-5', str(path)]) + + +def test_visible_plus_blocks_bidi_in_cluster(tmp_path, capsys): + """visible-plus must flag a bidi override placed right after an emoji.""" + path = tmp_path / 'bidi.txt' + # Emoji followed by bidi override U+202E (RIGHT-TO-LEFT OVERRIDE) + path.write_text('test😀\u202Eword', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_visible_plus_blocks_control_in_cluster(tmp_path, capsys): + """visible-plus must flag 
a control character placed right after an emoji.""" + path = tmp_path / 'ctrl.txt' + # Emoji followed by control char U+0001 (not tab/LF/CR) + path.write_text('test😀\x01word', encoding='utf-8') + + ret = main(['--mode', MODE_VISIBLE_PLUS, '--check-only', str(path)]) + + assert ret == 1 + out = capsys.readouterr().out + assert 'disallowed bytes' in out + + +def test_visible_plus_allows_pure_ascii(): + """visible-plus accepts a cluster made only of ASCII code points.""" + # Direct unit test of _cluster_allowed_visible_plus with ASCII-only input + ascii_cps = [ord(c) for c in 'hello'] + assert _cluster_allowed_visible_plus(ascii_cps) is True