diff --git a/.gitignore b/.gitignore
index 6add75b..4d9d4a0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 generated_markdown/**
+generated_indices/**
 *.pyc
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
index aa1b22d..9965909 100644
--- a/Readme.md
+++ b/Readme.md
@@ -6,6 +6,7 @@ These were created by using OCR to extract the text from the book, then manually
 The following tasks are things I would consider useful for others, and would love help with.
 * [x] Convert word lists into a machine friendly format, probably JSON.
 * [x] Apply unicode normalization to NFD to both markdown and JSON formats.
+* [x] Create a master word index showing first occurrence by chapter.
 * [ ] Add line number and word index information for the location of the word in the book.
 * [ ] Macronize vocab list.
@@ -14,5 +15,38 @@ To use the lists effectively, I recommend finding a tool that lets you perform d
 At the top of every file is the page numbers for the exercises of that chapter.
 
+## Scripts
+
+### create_word_index.py
+Creates master alphabetical indexes of all vocabulary words across all chapters, showing the chapter number where each word first appears. Sorting is case-insensitive, with Unicode NFD normalization applied before comparison.
+
+**Usage:**
+```bash
+# Create the regular alphabetical index (all formats: JSON, Markdown, HTML)
+python3 create_word_index.py
+
+# Include both the regular and the sectioned indexes
+python3 create_word_index.py --include-sectioned
+
+# Create only the sectioned index (organized by grammatical sections)
+python3 create_word_index.py --sectioned-only
+
+# HTML only, with a custom page density
+python3 create_word_index.py --format html --entries-per-page 90
+
+# All formats, with custom output filenames
+python3 create_word_index.py --include-sectioned \
+    --json-output my_index.json \
+    --section-json-output my_index_by_section.json
+```
+
+**Output:**
+- `word_index.json` — Flat alphabetical index
+- `word_index.md` — Flat alphabetical markdown table
+- `word_index.html` — Print-optimized HTML with 3-column pagination (102 entries/page)
+- `word_index_by_section.json` — Index organized by grammatical sections (14 total)
+- `word_index_by_section.md` — Sectioned markdown with sections as headers
+- `word_index_by_section.html` — Sectioned HTML with responsive column layout (adapts to browser width, up to 4 columns), table of contents with anchor links, and adaptive font sizing for readability
+
 
 # Copyright & License
 The copyright of the word lists remain with the original authors, and if they dislike my public reproduction of their lists then I am fully willing to take this repo down. All code and other novel material in this repository is licensed under the terms of the MIT license.
\ No newline at end of file
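For concreteness, this is roughly the shape of the flat index the script emits; the entries below are invented for illustration, and the real keys are vocabulary words taken verbatim from the chapter files:

```python
# Hypothetical excerpt of word_index.json: each key is a vocabulary word,
# each value is the chapter in which it first appears.
word_index = {
    "ancilla": 1,
    "canis": 2,
    "_sum_": 1,  # surrounding underscores mark entries italicized in the book
}
```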
diff --git a/create_word_index.py b/create_word_index.py
new file mode 100644
index 0000000..3af3acd
--- /dev/null
+++ b/create_word_index.py
@@ -0,0 +1,748 @@
+#!/usr/bin/env python3
+"""
+Create a master word index from JSON chapter files.
+Produces an alphabetically sorted list of all unique words with their first chapter appearance.
+"""
+
+import json
+import unicodedata
+from pathlib import Path
+
+
+def extract_chapter_number(filename):
+    """Extract chapter number from filename (e.g., 'Ch 1.json' -> 1)"""
+    name = Path(filename).stem  # Remove .json extension
+    parts = name.split()
+    if parts[0].lower() == "ch" and len(parts) > 1:
+        try:
+            return int(parts[1])
+        except ValueError:
+            return None
+    return None
+
+
+def extract_words_from_chapter(json_data):
+    """Extract all words from a chapter JSON structure"""
+    words = []
+
+    if "sections" in json_data:
+        for section in json_data["sections"]:
+            if "words" in section:
+                for word_entry in section["words"]:
+                    # Pull the raw book entry for this word
+                    book_entry = word_entry.get("book_entry", "").strip()
+                    if book_entry:
+                        # Get just the first word form if there are
+                        # alternatives (alternatives are separated by "|")
+                        primary_word = book_entry.split("|")[0].strip()
+                        words.append(primary_word)
+
+    return words
+
+
+def extract_words_with_sections(json_data):
+    """Extract all words from a chapter JSON structure with their sections"""
+    words_with_sections = []
+
+    if "sections" in json_data:
+        for section in json_data["sections"]:
+            section_name = section.get("section", "Unknown")
+            if "words" in section:
+                for word_entry in section["words"]:
+                    book_entry = word_entry.get("book_entry", "").strip()
+                    if book_entry:
+                        primary_word = book_entry.split("|")[0].strip()
+                        words_with_sections.append((primary_word, section_name))
+
+    return words_with_sections
+
+
+def create_word_index(json_dir="json"):
+    """
+    Create a master word index from all JSON chapter files.
+
+    Args:
+        json_dir: Directory containing JSON chapter files
+
+    Returns:
+        A dictionary mapping words to their first chapter number
+    """
+    word_to_chapter = {}
+
+    json_path = Path(json_dir)
+    if not json_path.exists():
+        print(f"Error: Directory '{json_dir}' not found")
+        return word_to_chapter
+
+    # Get all JSON files and sort them by chapter number
+    json_files = sorted(
+        json_path.glob("*.json"), key=lambda f: extract_chapter_number(f.name) or 0
+    )
+
+    if not json_files:
+        print(f"No JSON files found in '{json_dir}'")
+        return word_to_chapter
+
+    print(f"Processing {len(json_files)} chapter files...")
+
+    for json_file in json_files:
+        chapter_num = extract_chapter_number(json_file.name)
+
+        try:
+            with open(json_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            words = extract_words_from_chapter(data)
+
+            for word in words:
+                # Record only the first occurrence of each word
+                if word not in word_to_chapter:
+                    word_to_chapter[word] = chapter_num
+
+            print(f"  Chapter {chapter_num}: {len(words)} words extracted")
+
+        except json.JSONDecodeError:
+            print(f"  Error: Could not parse {json_file.name}")
+        except Exception as e:
+            print(f"  Error processing {json_file.name}: {e}")
+
+    return word_to_chapter
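The extractors above assume a specific chapter layout; a minimal sketch of that shape, with an invented section name and `book_entry` value:

```python
# Invented chapter data matching the fields read above ("sections",
# "section", "words", "book_entry"); "|" separates alternative forms.
chapter = {
    "sections": [
        {"section": "Verbs", "words": [{"book_entry": "amo | amare"}]}
    ]
}
assert extract_words_from_chapter(chapter) == ["amo"]
assert extract_words_with_sections(chapter) == [("amo", "Verbs")]
```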
+
+
+def create_sectioned_word_index(json_dir="json"):
+    """
+    Create a word index organized by grammatical sections.
+
+    Args:
+        json_dir: Directory containing JSON chapter files
+
+    Returns:
+        A dictionary mapping sections to words to their first chapter number
+    """
+    section_words = {}
+
+    json_path = Path(json_dir)
+    if not json_path.exists():
+        print(f"Error: Directory '{json_dir}' not found")
+        return section_words
+
+    # Get all JSON files and sort them by chapter number
+    json_files = sorted(
+        json_path.glob("*.json"), key=lambda f: extract_chapter_number(f.name) or 0
+    )
+
+    if not json_files:
+        print(f"No JSON files found in '{json_dir}'")
+        return section_words
+
+    print(f"Processing {len(json_files)} chapter files for sectioned index...")
+
+    for json_file in json_files:
+        chapter_num = extract_chapter_number(json_file.name)
+
+        try:
+            with open(json_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            words_with_sections = extract_words_with_sections(data)
+
+            for word, section_name in words_with_sections:
+                # Initialize the section dict on first sight
+                if section_name not in section_words:
+                    section_words[section_name] = {}
+
+                # Record only the first occurrence of each word in this section
+                if word not in section_words[section_name]:
+                    section_words[section_name][word] = chapter_num
+
+            print(f"  Chapter {chapter_num}: processed")
+
+        except json.JSONDecodeError:
+            print(f"  Error: Could not parse {json_file.name}")
+        except Exception as e:
+            print(f"  Error processing {json_file.name}: {e}")
+
+    return section_words
+
+
+def save_index_as_json(word_index, output_file="word_index.json"):
+    """Save the word index as a JSON file"""
+    # Sort alphabetically with the custom sort key
+    sorted_index = dict(sorted(word_index.items(), key=lambda x: get_sort_key(x[0])))
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(sorted_index, f, ensure_ascii=False, indent=2)
+
+    print(f"\nIndex saved to {output_file}")
+    print(f"Total unique words: {len(sorted_index)}")
+
+
+def save_index_as_markdown(word_index, output_file="word_index.md"):
+    """Save the word index as a markdown file"""
+    sorted_words = sorted(word_index.items(), key=lambda x: get_sort_key(x[0]))
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write("# Master Word Index\n\n")
+        f.write(f"Total unique words: {len(sorted_words)}\n\n")
+        f.write("| Word | Chapter |\n")
+        f.write("|------|---------|\n")
+
+        for word, chapter in sorted_words:
+            f.write(f"| {word} | {chapter} |\n")
+
+    print(f"Index saved to {output_file}")
+    print(f"Total unique words: {len(sorted_words)}")
+
+
+def get_sort_key(word):
+    """Generate a sort key that ignores case and surrounding underscores"""
+    # Strip the underscores used to mark italics before sorting
+    clean_word = word.strip("_")
+    # Normalize to NFD (consistent with orig_md_to_json.py) and lowercase
+    # for case-insensitive sorting
+    normalized = unicodedata.normalize("NFD", clean_word)
+    return normalized.lower()
+
+
+def format_word_for_html(word):
+    """Format word for HTML, italicizing it if surrounded by underscores"""
+    if word.startswith("_") and word.endswith("_"):
+        # Remove the underscores and wrap the word in <em> tags
+        clean_word = word[1:-1]
+        return f"<em>{clean_word}</em>"
+    return word
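To make the two helpers above concrete, their expected behavior on invented inputs:

```python
# get_sort_key ignores the italic-marking underscores and case;
# format_word_for_html turns the same marker into <em> tags.
assert get_sort_key("_Arma_") == "arma"
assert format_word_for_html("_arma_") == "<em>arma</em>"
assert format_word_for_html("arma") == "arma"  # unmarked words pass through
```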
+
+
+def save_sectioned_index_as_json(
+    section_words, output_file="word_index_by_section.json"
+):
+    """Save the sectioned word index as a JSON file"""
+    # Sort the sections, and the words within each section, alphabetically
+    sorted_sections = {}
+    for section in sorted(section_words.keys()):
+        sorted_words = dict(
+            sorted(section_words[section].items(), key=lambda x: get_sort_key(x[0]))
+        )
+        sorted_sections[section] = sorted_words
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(sorted_sections, f, ensure_ascii=False, indent=2)
+
+    total_words = sum(len(words) for words in section_words.values())
+    print(f"\nSectioned index saved to {output_file}")
+    print(f"Total unique words: {total_words}")
+    print(f"Total sections: {len(section_words)}")
+
+
+def save_sectioned_index_as_markdown(
+    section_words, output_file="word_index_by_section.md"
+):
+    """Save the sectioned word index as a markdown file"""
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write("# Master Word Index by Section\n\n")
+        total_words = sum(len(words) for words in section_words.values())
+        f.write(f"Total unique words: {total_words}\n")
+        f.write(f"Total sections: {len(section_words)}\n\n")
+
+        for section in sorted(section_words.keys()):
+            f.write(f"## {section}\n\n")
+            f.write("| Word | Chapter |\n")
+            f.write("|------|---------|\n")
+
+            sorted_words = sorted(
+                section_words[section].items(), key=lambda x: get_sort_key(x[0])
+            )
+            for word, chapter in sorted_words:
+                f.write(f"| {word} | {chapter} |\n")
+            f.write("\n")
+
+    print(f"\nSectioned index saved to {output_file}")
+    print(f"Total unique words: {total_words}")
+    print(f"Total sections: {len(section_words)}")
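A minimal end-to-end sketch of the sectioned pipeline above, assuming the `json/` directory of `Ch N.json` files described earlier:

```python
# Build the section-organized index and write both formats shown above.
sectioned = create_sectioned_word_index("json")
save_sectioned_index_as_json(sectioned)      # -> word_index_by_section.json
save_sectioned_index_as_markdown(sectioned)  # -> word_index_by_section.md
```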
+
+
+def save_sectioned_index_as_html(
+    section_words, output_file="word_index_by_section.html"
+):
+    """Save the sectioned word index as an HTML file with column-ordered layout"""
+    html_content = """
+<!DOCTYPE html>
+<html>
+<head>
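The diff is truncated here, inside the HTML template and before the script's entry point. Based only on the flags documented in the README above, a plausible argparse skeleton might look like the following; the flag wiring is an assumption, not the author's actual `main()`:

```python
# Hypothetical reconstruction of the CLI described in the README; the real
# main() is not visible in this diff. Flag names come from the usage
# examples above; everything else below is an assumption.
import argparse


def main():
    parser = argparse.ArgumentParser(description="Build master word indexes")
    parser.add_argument("--include-sectioned", action="store_true",
                        help="also build the section-organized index")
    parser.add_argument("--sectioned-only", action="store_true",
                        help="build only the section-organized index")
    parser.add_argument("--format", choices=["json", "markdown", "html"],
                        help="emit one format instead of all three")
    parser.add_argument("--entries-per-page", type=int, default=102,
                        help="entries per printed page in the HTML output")
    parser.add_argument("--json-output", default="word_index.json")
    parser.add_argument("--section-json-output",
                        default="word_index_by_section.json")
    args = parser.parse_args()

    # (--format and --entries-per-page handling omitted in this sketch)
    if not args.sectioned_only:
        index = create_word_index()
        save_index_as_json(index, args.json_output)
        save_index_as_markdown(index)
    if args.include_sectioned or args.sectioned_only:
        sectioned = create_sectioned_word_index()
        save_sectioned_index_as_json(sectioned, args.section_json_output)
        save_sectioned_index_as_markdown(sectioned)


if __name__ == "__main__":
    main()
```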