From 835b5ce92937df41df0a0caec1fec3c93b71e2bd Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:09:11 +0300
Subject: [PATCH 01/17] initial commit

---
 .gitignore                               |   4 +-
 app/questions_generator/Dockerfile       |  28 ++
 app/questions_generator/README.md        |  12 +
 app/questions_generator/generator.py     | 150 +++++++++
 app/questions_generator/requirements.txt |   5 +
 app/questions_generator/run.py           |  51 +++
 app/questions_generator/validator.py     | 405 +++++++++++++++++++++++
 7 files changed, 654 insertions(+), 1 deletion(-)
 create mode 100644 app/questions_generator/Dockerfile
 create mode 100644 app/questions_generator/README.md
 create mode 100644 app/questions_generator/generator.py
 create mode 100644 app/questions_generator/requirements.txt
 create mode 100644 app/questions_generator/run.py
 create mode 100644 app/questions_generator/validator.py

diff --git a/.gitignore b/.gitignore
index 1ed6022..06067be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,6 @@ ssl
 __pycache__
 /VERSION.json
 .env
-/whisper_asr_model_cache
\ No newline at end of file
+/whisper_asr_model_cache
+/app/questions_generator/vkr_examples/
+/app/questions_generator/rut5-base/
diff --git a/app/questions_generator/Dockerfile b/app/questions_generator/Dockerfile
new file mode 100644
index 0000000..766e297
--- /dev/null
+++ b/app/questions_generator/Dockerfile
@@ -0,0 +1,28 @@
+FROM python:3.10-slim
+
+# 1. System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        git wget gcc g++ \
+        libprotobuf-dev protobuf-compiler \
+    && rm -rf /var/lib/apt/lists/*
+
+# 2. Workdir
+WORKDIR /app
+
+# 3. Python deps
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
+    && pip install --no-cache-dir -r requirements.txt
+
+# 4. NLTK
+RUN python -m nltk.downloader punkt stopwords
+
+# 5. Copy local model
+COPY rut5-base/ /app/rut5-base/
+
+# 6. Copy project
+COPY . .
+
+# 7. Run
+CMD ["python", "run.py"]
diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
new file mode 100644
index 0000000..ba6c080
--- /dev/null
+++ b/app/questions_generator/README.md
@@ -0,0 +1,12 @@
+# Запуск
+
+## Загрузка модели локально (единоразово)
+- `powershell -ExecutionPolicy ByPass -c "irm https://hf.co/cli/install.ps1 | iex"` (windows)
+- `curl -LsSf https://hf.co/cli/install.sh | bash` (linux/macos)
+- `cd app\questions_generator`
+- `hf download cointegrated/rut5-base-multitask --local-dir rut5-base`
+## Выбор файла ВКР
+- заменить в `run.py` в функции `main` путь для файла ВКР
+## Запуск
+- `docker build -t vkr-generator .`
+- `docker run -it --rm vkr-generator`
diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
new file mode 100644
index 0000000..4462f69
--- /dev/null
+++ b/app/questions_generator/generator.py
@@ -0,0 +1,150 @@
+import re
+from typing import List, Dict
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+
+class VkrQuestionGenerator:
+    """
+    Генератор вопросов по тексту ВКР.
+    Основан на гибридном подходе: NLTK + rut5-base-multitask.
+    """
+    def __init__(self, vkr_text: str, model_path: str = "./rut5-base"):
+        self.vkr_text = vkr_text
+        self.sentences = sent_tokenize(vkr_text)
+        self.stopwords = set(stopwords.words("russian"))
+
+        # ---- Модель rut5 ----
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
+
+    # ---------------------------------------------------------
+    # --- 1. ЭВРИСТИКА: Извлечение ключевых частей ВКР ---
+    # ---------------------------------------------------------
+
+    def extract_section(self, title: str) -> str:
+        """
+        Return the text from the first literal, case-insensitive match of `title` up to the next full line starting with an A-Z/А-Я letter (either case), or to the end of the text; "" if `title` is absent.
+        """
+        pattern = rf"{re.escape(title)}.*?(?=\n[A-ZА-Я][^\n]*\n|\Z)"
+        m = re.search(pattern, self.vkr_text, re.DOTALL | re.IGNORECASE)
+        return m.group(0) if m else ""
+
+    def extract_intro(self) -> str:
+        return self.extract_section("Введение")
+
+    def extract_conclusion(self) -> str:
+        return self.extract_section("Заключение")
+
+    def extract_methodology(self) -> str:
+        return self.extract_section("Методолог")
+
+    # ---------------------------------------------------------
+    # --- 2. ЭВРИСТИКА: Поиск ключевых концепций ---
+    # ---------------------------------------------------------
+
+    def extract_keywords(self, text: str) -> List[str]:
+        tokens = word_tokenize(text.lower())
+        return [
+            t for t in tokens
+            if t.isalnum() and t not in self.stopwords and len(t) > 4
+        ]
+
+    # ---------------------------------------------------------
+    # --- 3. Генерация вопросов через rut5 (режим ask) ---
+    # ---------------------------------------------------------
+
+    def llm_generate_question(self, text_fragment: str) -> str:
+        """
+        Генерация вопроса по фрагменту текста через rut5 ask
+        """
+        prompt = f"ask: {text_fragment}"
+        enc = self.tokenizer(prompt, return_tensors="pt", truncation=True)
+        out = self.model.generate(
+            **enc,
+            max_length=64,
+            num_beams=5,
+            early_stopping=True
+        )
+        return self.tokenizer.decode(out[0], skip_special_tokens=True)
+
+    # ---------------------------------------------------------
+    # --- 4. ЭВРИСТИЧЕСКИЕ ШАБЛОНЫ (из документа) ---
+    # ---------------------------------------------------------
+
+    def heuristic_questions(self) -> List[str]:
+        """
+        Генерация вопросов по эвристикам из загруженных PDF.
+        """
+        intro = self.extract_intro()
+        conc = self.extract_conclusion()
+        meth = self.extract_methodology()
+        keywords = self.extract_keywords(self.vkr_text)
+
+        q = []
+
+        # --- По связям между разделами ---
+        if intro and conc:
+            q.append("Как сформулированные во введении задачи связаны с выводами работы?")
+
+        # --- По методологии ---
+        if meth:
+            for kw in keywords[:3]:
+                q.append(f"Почему был выбран метод {kw} и где он применён в работе?")
+
+        # --- По выводам ---
+        if conc:
+            q.append("На основании каких данных был сделан ключевой вывод в заключении?")
+
+        # --- Общие вопросы (из документа) ---
+        q.extend([
+            "Есть ли опенсорс аналоги упомянутых решений?",
+            "В чем практическая значимость представленного метода?",
+            "Какие ограничения имеет разработанный подход?",
+            "Для каких дополнительных задач можно применить полученные результаты?",
+        ])
+
+        return q
+
+    # ---------------------------------------------------------
+    # --- 5. Гибридная генерация: LLM + эвристики ---
+    # ---------------------------------------------------------
+
+    def generate_llm_questions(self, count=5) -> List[str]:
+        """
+        Generate up to `count` questions with rut5 from evenly spaced sentences among the first 40.
+        """
+        q = []
+        fragments = self.sentences[:40]  # only the leading sentences are sampled
+
+        step = max(1, len(fragments) // count)
+
+        for i in range(0, len(fragments), step):
+            frag = fragments[i]
+            try:
+                llm_q = self.llm_generate_question(frag)
+                if len(llm_q) > 10:
+                    q.append(llm_q)
+            except Exception:  # best effort: skip a failing fragment, but let Ctrl+C / SystemExit through
+                continue
+
+            if len(q) >= count:
+                break
+
+        return q
+
+    # ---------------------------------------------------------
+    # --- 6. Главный метод ---
+    # ---------------------------------------------------------
+
+    def generate_all(self) -> List[str]:
+        """
+        Генерирует полный набор вопросов:
+        - эвристические
+        - модельные (LLM)
+        """
+        result = []
+        result.extend(self.heuristic_questions())
+        result.extend(self.generate_llm_questions(count=7))
+        return list(dict.fromkeys(result))  # убрать дубли
diff --git a/app/questions_generator/requirements.txt b/app/questions_generator/requirements.txt
new file mode 100644
index 0000000..fce2882
--- /dev/null
+++ b/app/questions_generator/requirements.txt
@@ -0,0 +1,5 @@
+transformers
+sentencepiece
+nltk
+huggingface_hub
+python-docx
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
new file mode 100644
index 0000000..c8940db
--- /dev/null
+++ b/app/questions_generator/run.py
@@ -0,0 +1,51 @@
+from generator import VkrQuestionGenerator
+from validator import VkrQuestionValidator
+import sys
+import os
+from docx import Document
+
+
+def load_vkr_text(path: str) -> str:
+    if not os.path.exists(path):
+        print(f"[ERROR] Файл '{path}' не найден.")
+        sys.exit(1)
+
+    document = Document(path)
+    text = []
+    for paragraph in document.paragraphs:
+        text.append(paragraph.text)
+
+    return '\n'.join(text)
+
+
+def main():
+    print("=== Загрузка текста ВКР ===")
+    text = load_vkr_text("vkr_examples/VKR1.docx")
+
+    print("=== Инициализация генератора ===")
+    gen = VkrQuestionGenerator(text, model_path="/app/rut5-base")
+
+    print("=== Инициализация валидатора ===")
+    validator = VkrQuestionValidator(text)
+
+    print("=== Генерация вопросов ===")
+    questions = gen.generate_all()
+
+    print("\n=== Результаты ===")
+    for q in questions:
+        rel = validator.check_relevance(q)
+        clr = validator.check_clarity(q)
+        diff = validator.check_difficulty(q)
+
+        status = "✔ OK" if (rel and clr and diff) else "✖ FAIL"
+
+        print(f"\n[{status}] {q}")
+        print(f"  - relevance: {rel}")
+        print(f"  - clarity:   {clr}")
+        print(f"  - difficulty:{diff}")
+
+    print("\n=== Готово ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/app/questions_generator/validator.py b/app/questions_generator/validator.py
new file mode 100644
index 0000000..8441006
--- /dev/null
+++ b/app/questions_generator/validator.py
@@ -0,0 +1,405 @@
+import re
+from typing import List, Dict, Set
+from collections import Counter
+import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+import string
+from datetime import datetime
+
+
+class VkrQuestionValidator:
+    def __init__(self, vkr_text: str):
+        """
+        Инициализация валидатора с текстом ВКР
+
+        Args:
+            vkr_text: Полный текст ВКР
+        """
+        self.vkr_text = vkr_text.lower()
+        self.keywords = self._extract_keywords()
+        self.stopwords = set(stopwords.words('russian'))
+
+    def _extract_keywords(self) -> Dict[str, Set[str]]:
+        """
+        Извлечение ключевых слов из текста ВКР
+
+        Returns:
+            Словарь с категориями ключевых слов
+        """
+        keywords = {
+            'theme': set(),  # Тематические слова
+            'goals': set(),  # Слова, связанные с целями
+            'methodology': set()  # Методологические термины
+        }
+
+        # Извлечение ключевых слов из введения
+        intro_section = self._extract_introduction()
+        keywords['theme'] = self._tokenize_and_filter(intro_section)
+
+        # Извлечение целей из соответствующего раздела
+        goals_section = self._extract_goals_section()
+        keywords['goals'] = self._tokenize_and_filter(goals_section)
+
+        # Извлечение методологических терминов
+        meth_section = self._extract_methodology_section()
+        keywords['methodology'] = self._tokenize_and_filter(meth_section)
+
+        return keywords
+
+    def _tokenize_and_filter(self, text: str) -> Set[str]:
+        """
+        Токенизация и фильтрация текста для получения ключевых слов
+
+        Args:
+            text: Исходный текст для обработки
+
+        Returns:
+            Множество отфильтрованных токенов
+        """
+        tokens = word_tokenize(text.lower())
+        filtered_tokens = [
+            token for token in tokens
+            if token.isalnum() and
+               token not in self.stopwords and
+               len(token) > 3
+        ]
+        return set(filtered_tokens)
+
+    def _extract_introduction(self) -> str:
+        """
+        Извлечение введения из текста ВКР
+
+        Returns:
+            Текст введения
+        """
+        intro_pattern = r'введение.*?(?=глава|раздел)'
+        match = re.search(intro_pattern, self.vkr_text, re.DOTALL)
+        return match.group(0) if match else ""
+
+    def _extract_goals_section(self) -> str:
+        """
+        Извлечение раздела с целями и задачами
+
+        Returns:
+            Текст раздела с целями
+        """
+        goals_pattern = r'(цель|задачи).*?(?=глава|раздел)'
+        match = re.search(goals_pattern, self.vkr_text, re.DOTALL)
+        return match.group(0) if match else ""
+
+    def _extract_methodology_section(self) -> str:
+        """
+        Извлечение методологического раздела
+
+        Returns:
+            Текст методологического раздела
+        """
+        meth_pattern = r'(методология|методы).*?(?=глава|раздел)'
+        match = re.search(meth_pattern, self.vkr_text, re.DOTALL)
+        return match.group(0) if match else ""
+
+    def check_relevance(self, question: str) -> bool:
+        """
+        Проверка релевантности вопроса
+
+        Args:
+            question: Проверяемый вопрос
+
+        Returns:
+            True если вопрос релевантен, False если нет
+        """
+        score = 0
+
+        # Проверка соответствия теме
+        theme_match = len(set(question.lower().split()) &
+                          set(self.keywords['theme']))
+        if theme_match > 0:
+            score += 1
+
+        # Проверка актуальности
+        actuality_score = self._calculate_actuality_score(question)
+        score += actuality_score
+
+        # Проверка связи с целями
+        goal_match = len(set(question.lower().split()) &
+                         set(self.keywords['goals']))
+        if goal_match > 0:
+            score += 1
+
+        return score >= 2
+
+    def _calculate_actuality_score(self, question: str) -> int:
+        """
+        Score whether the question mentions a year between 1900 and the current year.
+
+        Args:
+            question: Question to analyse
+
+        Returns:
+            1 if at least one such four-digit number (not adjacent to other digits) occurs, else 0
+        """
+        current_year = datetime.now().year
+        year_mentions = [int(word) for word in re.findall(r"(?<!\d)\d{4}(?!\d)", question)
+                         if 1900 <= int(word) <= current_year]
+        return max(0, min(1, len(year_mentions)))
+
+    def check_completeness(self, questions_list: List[str]) -> bool:
+        """
+        Проверка полноты набора вопросов
+
+        Args:
+            questions_list: Список проверяемых вопросов
+
+        Returns:
+            True если набор полный, False если нет
+        """
+        coverage = {
+            'theoretical': self._check_theory_coverage(questions_list),
+            'practical': self._check_practice_coverage(questions_list),
+            'analysis_levels': self._check_analysis_depth(questions_list)
+        }
+        return all(value >= 0.7 for value in coverage.values())
+
+    def _check_theory_coverage(self, questions: List[str]) -> float:
+        """
+        Проверка теоретического охвата вопросами
+
+        Args:
+            questions: Список вопросов для анализа
+
+        Returns:
+            Значение от 0 до 1, показывающее степень покрытия
+        """
+        theoretical_terms = {'теория', 'модель', 'концепция', 'принцип'}
+        total_questions = len(questions)
+        theory_questions = sum(
+            1 for q in questions
+            if any(term in q.lower() for term in theoretical_terms)
+        )
+        return theory_questions / total_questions if total_questions > 0 else 0
+
+    def _check_practice_coverage(self, questions: List[str]) -> float:
+        """
+        Проверка практического охвата вопросами
+
+        Args:
+            questions: Список вопросов для анализа
+
+        Returns:
+            Значение от 0 до 1, показывающее степень покрытия
+        """
+        practical_terms = {'применение', 'реализация', 'использование', 'результаты'}
+        total_questions = len(questions)
+        practice_questions = sum(
+            1 for q in questions
+            if any(term in q.lower() for term in practical_terms)
+        )
+        return practice_questions / total_questions if total_questions > 0 else 0
+
+    def _check_analysis_depth(self, questions: List[str]) -> float:
+        """
+        Проверка глубины анализа в вопросах
+
+        Args:
+            questions: Список вопросов для анализа
+
+        Returns:
+            Значение от 0 до 1, показывающее глубину анализа
+        """
+        depth_indicators = {
+            'поверхностный': {'что', 'какой'},
+            'средний': {'почему', 'как'},
+            'глубокий': {'анализ', 'оценка', 'сравнение'}
+        }
+
+        depths = []
+        for q in questions:
+            q_lower = q.lower()
+            depth = 0
+            if any(ind in q_lower for ind in depth_indicators['глубокий']):
+                depth = 2
+            elif any(ind in q_lower for ind in depth_indicators['средний']):
+                depth = 1
+            elif any(ind in q_lower for ind in depth_indicators['поверхностный']):
+                depth = 0
+            depths.append(depth)
+
+        return sum(depths) / (len(depths) * 2) if depths else 0
+
+    def check_clarity(self, question: str) -> bool:
+        """
+        Проверка ясности формулировки вопроса
+
+        Args:
+            question: Проверяемый вопрос
+
+        Returns:
+            True если формулировка ясная, False если нет
+        """
+        metrics = {
+            'length': self._check_length(question),
+            'complexity': self._calculate_complexity(question),
+            'ambiguity': self._check_ambiguity(question)
+        }
+        return all(value >= 0.7 for value in metrics.values())
+
+    def _check_length(self, question: str) -> float:
+        """
+        Проверка длины вопроса
+
+        Args:
+            question: Проверяемый вопрос
+
+        Returns:
+            Нормализованное значение от 0 до 1
+        """
+        words = len(question.split())
+        # Оптимальная длина вопроса считается 7-15 слов
+        if words < 7:
+            return 0.5 * (words / 7)
+        elif words > 15:
+            return 1 - 0.5 * ((words - 15) / 15)
+        return 1.0
+
+    def _calculate_complexity(self, question: str) -> float:
+        """
+        Compute the share of distinct whitespace-separated words in the question.
+
+        Args:
+            question: Question to analyse
+
+        Returns:
+            Value from 0 to 1 (1 means no repeated words); 0.0 for an empty or blank question
+        """
+        words = question.split()
+        unique_words = set(words)
+        return min(1.0, len(unique_words) / len(words)) if words else 0.0
+
+    def _check_ambiguity(self, question: str) -> float:
+        """
+        Score how free the question is of ambiguity markers, matched as whole words or phrases.
+
+        Args:
+            question: Question to check
+
+        Returns:
+            Value from 0 to 1 where 1 means no markers; each distinct marker found costs 0.2
+        """
+        ambiguous_terms = {
+            'или', 'и', 'при этом', 'однако', 'тем не менее',
+            'с другой стороны', 'в то же время'
+        }
+        ambiguity_score = 1.0
+        lowered = question.lower()  # word-boundary search below: a plain `in` would hit 'и' inside almost any word
+        for term in ambiguous_terms:
+            if re.search(rf"\b{re.escape(term)}\b", lowered):
+                ambiguity_score -= 0.2
+
+        return max(0.0, ambiguity_score)
+
+    def check_difficulty(self, question: str) -> bool:
+        """
+        Проверка уровня сложности вопроса
+
+        Args:
+            question: Проверяемый вопрос
+
+        Returns:
+            True если уровень сложности оптимальный, False если нет
+        """
+        difficulty_metrics = {
+            'abstraction_level': self._assess_abstraction(question),
+            'question_type': self._identify_question_type(question),
+            'student_level_match': self._match_student_level(question)
+        }
+        return all(value == 'optimal' for value in difficulty_metrics.values())
+
+    def _assess_abstraction(self, question: str) -> str:
+        """
+        Оценка уровня абстракции вопроса
+
+        Args:
+            question: Анализируемый вопрос
+
+        Returns:
+            'optimal', 'too_high', 'too_low'
+        """
+        abstract_terms = {
+            'концепция', 'модель', 'теория', 'абстракция',
+            'парадигма', 'методология'
+        }
+        concrete_terms = {
+            'пример', 'факт', 'данные', 'результат',
+            'показатель', 'число'
+        }
+
+        abstract_count = sum(1 for term in abstract_terms
+                             if term in question.lower())
+        concrete_count = sum(1 for term in concrete_terms
+                             if term in question.lower())
+
+        if abstract_count > 2 and concrete_count == 0:
+            return 'too_high'
+        elif abstract_count == 0 and concrete_count > 2:
+            return 'too_low'
+        return 'optimal'
+
+    def _identify_question_type(self, question: str) -> str:
+        """
+        Определение типа вопроса
+
+        Args:
+            question: Анализируемый вопрос
+
+        Returns:
+            'optimal', 'too_simple', 'too_complex'
+        """
+        question_types = {
+            'descriptive': {'описать', 'рассказать', 'характеризовать'},
+            'analytical': {'анализировать', 'сравнить', 'оценить'},
+            'practical': {'применить', 'использовать', 'реализовать'}
+        }
+
+        type_count = Counter()
+        for q_type, keywords in question_types.items():
+            count = sum(1 for keyword in keywords
+                        if keyword in question.lower())
+            if count > 0:
+                type_count[q_type] = count
+
+        if len(type_count) >= 2:
+            return 'optimal'
+        elif len(type_count) == 0:
+            return 'too_simple'
+        return 'too_complex'
+
+    def _match_student_level(self, question: str) -> str:
+        """
+        Проверка соответствия вопроса уровню студента
+
+        Args:
+            question: Анализируемый вопрос
+
+        Returns:
+            'optimal', 'too_hard', 'too_easy'
+        """
+        advanced_terms = {
+            'методология', 'парадигма', 'теоретическая модель',
+            'эмпирический анализ', 'статистическая обработка'
+        }
+        basic_terms = {
+            'пример', 'факт', 'данные', 'результат',
+            'показатель', 'число'
+        }
+
+        advanced_count = sum(1 for term in advanced_terms
+                             if term in question.lower())
+        basic_count = sum(1 for term in basic_terms
+                          if term in question.lower())
+
+        if advanced_count > 3:
+            return 'too_hard'
+        elif basic_count > 3:
+            return 'too_easy'
+        return 'optimal'

From 39d54cfdceb0b7e6445742e2cfc5e4cc37ec205e Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 26 Nov 2025 16:54:58 +0300
Subject: [PATCH 02/17] first prototype

---
 app/questions_generator/Dockerfile   | 2 ++
 app/questions_generator/README.md    | 2 +-
 app/questions_generator/run.py       | 7 +++++++
 app/questions_generator/validator.py | 2 +-
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/app/questions_generator/Dockerfile b/app/questions_generator/Dockerfile
index 766e297..b6e8dda 100644
--- a/app/questions_generator/Dockerfile
+++ b/app/questions_generator/Dockerfile
@@ -18,6 +18,8 @@ RUN pip install --no-cache-dir --upgrade pip \
 # 4. NLTK
 RUN python -m nltk.downloader punkt stopwords
 
+RUN python -m nltk.downloader punkt
+
 # 5. Copy local model
 COPY rut5-base/ /app/rut5-base/
 
diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index ba6c080..2b22193 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -7,6 +7,6 @@
 - `hf download cointegrated/rut5-base-multitask --local-dir rut5-base`
 ## Выбор файла ВКР
 - заменить в `run.py` в функции `main` путь для файла ВКР
-## Запуск
+## Запуск (после любых изменений)
 - `docker build -t vkr-generator .`
 - `docker run -it --rm vkr-generator`
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index c8940db..7008cad 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -3,6 +3,7 @@
 import sys
 import os
 from docx import Document
+import nltk
 
 
 def load_vkr_text(path: str) -> str:
@@ -19,6 +20,12 @@ def load_vkr_text(path: str) -> str:
 
 
 def main():
+    try:
+        nltk.data.find('tokenizers/punkt_tab/english')
+    except LookupError:
+        print("Загрузка необходимых данных NLTK...")
+        nltk.download('punkt_tab')
+
     print("=== Загрузка текста ВКР ===")
     text = load_vkr_text("vkr_examples/VKR1.docx")
 
diff --git a/app/questions_generator/validator.py b/app/questions_generator/validator.py
index 8441006..c4b8900 100644
--- a/app/questions_generator/validator.py
+++ b/app/questions_generator/validator.py
@@ -17,8 +17,8 @@ def __init__(self, vkr_text: str):
             vkr_text: Полный текст ВКР
         """
         self.vkr_text = vkr_text.lower()
-        self.keywords = self._extract_keywords()
         self.stopwords = set(stopwords.words('russian'))
+        self.keywords = self._extract_keywords()
 
     def _extract_keywords(self) -> Dict[str, Set[str]]:
         """

From d813c886cfadb991b539f85c1b3136687a95d8b2 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 26 Nov 2025 17:17:52 +0300
Subject: [PATCH 03/17] added LLM questions marker

---
 app/questions_generator/generator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index 4462f69..77efe22 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -146,5 +146,6 @@ def generate_all(self) -> List[str]:
         """
         result = []
         result.extend(self.heuristic_questions())
-        result.extend(self.generate_llm_questions(count=7))
+        result.extend(["Начало rut5-base-multitask вопросов"])
+        result.extend(self.generate_llm_questions(count=10))
         return list(dict.fromkeys(result))  # убрать дубли

From 8e42c8e542755f8a148367bfec28bb6e3923fabe Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 10 Dec 2025 13:06:11 +0300
Subject: [PATCH 04/17] removed methodology

---
 app/questions_generator/generator.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index 77efe22..292c0e9 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -37,9 +37,6 @@ def extract_intro(self) -> str:
     def extract_conclusion(self) -> str:
         return self.extract_section("Заключение")
 
-    def extract_methodology(self) -> str:
-        return self.extract_section("Методолог")
-
     # ---------------------------------------------------------
     # --- 2. ЭВРИСТИКА: Поиск ключевых концепций ---
     # ---------------------------------------------------------
@@ -79,7 +76,6 @@ def heuristic_questions(self) -> List[str]:
         """
         intro = self.extract_intro()
         conc = self.extract_conclusion()
-        meth = self.extract_methodology()
         keywords = self.extract_keywords(self.vkr_text)
 
         q = []
@@ -88,11 +84,6 @@ def heuristic_questions(self) -> List[str]:
         if intro and conc:
             q.append("Как сформулированные во введении задачи связаны с выводами работы?")
 
-        # --- По методологии ---
-        if meth:
-            for kw in keywords[:3]:
-                q.append(f"Почему был выбран метод {kw} и где он применён в работе?")
-
         # --- По выводам ---
         if conc:
             q.append("На основании каких данных был сделан ключевой вывод в заключении?")

From 48ed43cbfd38d5e28750325a019269bfa4c90f5d Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 10 Dec 2025 13:17:09 +0300
Subject: [PATCH 05/17] requirements.txt added versions

---
 app/questions_generator/requirements.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/app/questions_generator/requirements.txt b/app/questions_generator/requirements.txt
index fce2882..c833faf 100644
--- a/app/questions_generator/requirements.txt
+++ b/app/questions_generator/requirements.txt
@@ -1,5 +1,5 @@
-transformers
-sentencepiece
-nltk
-huggingface_hub
-python-docx
+transformers==4.57.3
+sentencepiece==0.2.1
+nltk==3.9.2
+huggingface_hub==1.2.1
+python-docx==1.2.0

From 1bcf046661bdef9f0b2aeb045ac6263b649f554f Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 10 Dec 2025 14:39:58 +0300
Subject: [PATCH 06/17] simplified docker

---
 app/questions_generator/Dockerfile           | 24 ++++++----
 app/questions_generator/README.md            | 15 ++-----
 app/questions_generator/docker-entrypoint.sh | 47 ++++++++++++++++++++
 app/questions_generator/generator.py         |  2 +-
 app/questions_generator/requirements.txt     |  2 +-
 app/questions_generator/run.py               | 35 +++++++++++----
 app/questions_generator/run_docker.py        | 45 +++++++++++++++++++
 7 files changed, 140 insertions(+), 30 deletions(-)
 create mode 100644 app/questions_generator/docker-entrypoint.sh
 create mode 100644 app/questions_generator/run_docker.py

diff --git a/app/questions_generator/Dockerfile b/app/questions_generator/Dockerfile
index b6e8dda..4079bea 100644
--- a/app/questions_generator/Dockerfile
+++ b/app/questions_generator/Dockerfile
@@ -13,18 +13,24 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir --upgrade pip \
     && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
-    && pip install --no-cache-dir -r requirements.txt
+    && pip install --no-cache-dir -r requirements.txt \
+    && pip install --no-cache-dir "huggingface_hub[cli]"
 
-# 4. NLTK
-RUN python -m nltk.downloader punkt stopwords
+# NLTK будет качаться в отдельный каталог (volume)
+ENV NLTK_DATA=/nltk_data
 
-RUN python -m nltk.downloader punkt
+# 4. Volume'ы:
+#   - /app/question_generator/rut5-base  — модель ruT5
+#   - /nltk_data                          — данные NLTK
+VOLUME ["/app/question_generator/rut5-base", "/nltk_data"]
 
-# 5. Copy local model
-COPY rut5-base/ /app/rut5-base/
-
-# 6. Copy project
+# 5. Копируем проект
 COPY . .
 
-# 7. Run
+# 6. Копируем entrypoint-скрипт
+COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+
+# 7. Точка входа: сначала — скрипт, затем основной CMD
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
 CMD ["python", "run.py"]
diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index 2b22193..c57af3c 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -1,12 +1,5 @@
-# Запуск
+## Сборка
+`docker build -t vkr-generator .`
 
-## Загрузка модели локально (единоразово)
-- `powershell -ExecutionPolicy ByPass -c "irm https://hf.co/cli/install.ps1 | iex"` (windows)
-- `curl -LsSf https://hf.co/cli/install.sh | bash` (linux/macos)
-- `cd app\questions_generator`
-- `hf download cointegrated/rut5-base-multitask --local-dir rut5-base`
-## Выбор файла ВКР
-- заменить в `run.py` в функции `main` путь для файла ВКР
-## Запуск (после любых изменений)
-- `docker build -t vkr-generator .`
-- `docker run -it --rm vkr-generator`
+## Запуск
+`python run_docker.py <путь к файлу с текстом ВКР>`
\ No newline at end of file
diff --git a/app/questions_generator/docker-entrypoint.sh b/app/questions_generator/docker-entrypoint.sh
new file mode 100644
index 0000000..c4a5243
--- /dev/null
+++ b/app/questions_generator/docker-entrypoint.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+set -e
+
+MODEL_DIR="/app/question_generator/rut5-base"
+NLTK_DIR="${NLTK_DATA:-/nltk_data}"
+
+echo "MODEL_DIR=${MODEL_DIR}"
+echo "NLTK_DIR=${NLTK_DIR}"
+
+# Гарантируем, что каталоги существуют
+mkdir -p "$MODEL_DIR" "$NLTK_DIR"
+
+########################################
+# 1. Загрузка модели в volume (один раз)
+########################################
+if [ -z "$(ls -A "$MODEL_DIR" 2>/dev/null)" ]; then
+  echo "Model directory is empty. Downloading model to $MODEL_DIR..."
+  huggingface-cli download \
+    cointegrated/rut5-base-multitask \
+    --local-dir "$MODEL_DIR" \
+    --local-dir-use-symlinks False
+  echo "Model downloaded."
+else
+  echo "Model directory is not empty, skipping download."
+fi
+
+########################################
+# 2. Загрузка данных NLTK в volume
+########################################
+if [ -z "$(ls -A "$NLTK_DIR" 2>/dev/null)" ]; then
+  echo "NLTK data directory is empty. Downloading 'punkt' and 'stopwords' to $NLTK_DIR..."
+  python - <<EOF
+import nltk
+nltk.data.path = ["$NLTK_DIR"] + nltk.data.path
+nltk.download("punkt", download_dir="$NLTK_DIR")
+nltk.download("stopwords", download_dir="$NLTK_DIR")
+EOF
+  echo "NLTK data downloaded."
+else
+  echo "NLTK data directory is not empty, skipping download."
+fi
+
+# Экспортируем путь для NLTK
+export NLTK_DATA="$NLTK_DIR"
+
+echo "Starting application..."
+exec "$@"
diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index 292c0e9..8056af5 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -10,7 +10,7 @@ class VkrQuestionGenerator:
     Генератор вопросов по тексту ВКР.
     Основан на гибридном подходе: NLTK + rut5-base-multitask.
     """
-    def __init__(self, vkr_text: str, model_path: str = "./rut5-base"):
+    def __init__(self, vkr_text: str, model_path: str):
         self.vkr_text = vkr_text
         self.sentences = sent_tokenize(vkr_text)
         self.stopwords = set(stopwords.words("russian"))
diff --git a/app/questions_generator/requirements.txt b/app/questions_generator/requirements.txt
index c833faf..191cdc8 100644
--- a/app/questions_generator/requirements.txt
+++ b/app/questions_generator/requirements.txt
@@ -1,5 +1,5 @@
 transformers==4.57.3
 sentencepiece==0.2.1
 nltk==3.9.2
-huggingface_hub==1.2.1
+huggingface_hub==0.36.0
 python-docx==1.2.0
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index 7008cad..a8ba337 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -1,10 +1,13 @@
-from generator import VkrQuestionGenerator
-from validator import VkrQuestionValidator
 import sys
 import os
+import argparse
+
 from docx import Document
 import nltk
 
+from generator import VkrQuestionGenerator
+from validator import VkrQuestionValidator
+
 
 def load_vkr_text(path: str) -> str:
     if not os.path.exists(path):
@@ -16,21 +19,37 @@ def load_vkr_text(path: str) -> str:
     for paragraph in document.paragraphs:
         text.append(paragraph.text)
 
-    return '\n'.join(text)
+    return "\n".join(text)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Генерация экзаменационных вопросов по тексту ВКР"
+    )
+    parser.add_argument(
+        "vkr_path",
+        nargs="?",
+        default="vkr_examples/VKR1.docx",
+        help="Путь к .docx файлу с текстом ВКР (по умолчанию: vkr_examples/VKR1.docx)",
+    )
+    return parser.parse_args()
 
 
 def main():
+    args = parse_args()
+    vkr_path = args.vkr_path
+
     try:
-        nltk.data.find('tokenizers/punkt_tab/english')
+        nltk.data.find("tokenizers/punkt_tab/english")
     except LookupError:
         print("Загрузка необходимых данных NLTK...")
-        nltk.download('punkt_tab')
+        nltk.download("punkt_tab")
 
-    print("=== Загрузка текста ВКР ===")
-    text = load_vkr_text("vkr_examples/VKR1.docx")
+    print(f"=== Загрузка текста ВКР из '{vkr_path}' ===")
+    text = load_vkr_text(vkr_path)
 
     print("=== Инициализация генератора ===")
-    gen = VkrQuestionGenerator(text, model_path="/app/rut5-base")
+    gen = VkrQuestionGenerator(text, model_path="/app/question_generator/rut5-base")
 
     print("=== Инициализация валидатора ===")
     validator = VkrQuestionValidator(text)
diff --git a/app/questions_generator/run_docker.py b/app/questions_generator/run_docker.py
new file mode 100644
index 0000000..30c986e
--- /dev/null
+++ b/app/questions_generator/run_docker.py
@@ -0,0 +1,45 @@
+import os
+import sys
+import argparse
+import subprocess
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Запуск генератора вопросов по ВКР внутри Docker"
+    )
+    parser.add_argument(
+        "vkr_path",
+        help="Путь к .docx файлу с текстом ВКР (на хосте)",
+    )
+    args = parser.parse_args()
+
+    host_path = os.path.abspath(args.vkr_path)
+
+    if not os.path.exists(host_path):
+        print(f"[ERROR] Файл не найден: {host_path}")
+        sys.exit(1)
+
+    # Путь внутри контейнера — фиксированный, один и тот же для всех ОС
+    container_path = "/app/questions_generator/vkr_examples/vkr.docx"
+
+    cmd = [
+        "docker", "run", "-it", "--rm",
+        "-v", "rut5-model:/app/question_generator/rut5-base",
+        "-v", "rut5-nltk:/nltk_data",
+        "-v", f"{host_path}:{container_path}:ro",
+        "vkr-generator",
+        "python", "run.py", container_path,
+    ]
+
+    print(">> Запускаю команду:")
+    print(" ".join(cmd))
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"[ERROR] docker run завершился с ошибкой: {e.returncode}")
+        sys.exit(e.returncode)
+
+
+if __name__ == "__main__":
+    main()

From 8a54af15e1df53b7af59a090d216ee1c475941aa Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 10 Dec 2025 15:00:22 +0300
Subject: [PATCH 07/17] heuristic patterns update

---
 app/questions_generator/README.md    |   2 +-
 app/questions_generator/generator.py | 144 ++++++++++++++-------------
 2 files changed, 75 insertions(+), 71 deletions(-)

diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index c57af3c..ade6c3f 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -2,4 +2,4 @@
 `docker build -t vkr-generator .`
 
 ## Запуск
-`python run_docker.py <путь к файлу с текстом ВКР>`
\ No newline at end of file
+`python run_docker.py <путь к файлу с текстом ВКР>`
diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index 8056af5..58f48ae 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -1,33 +1,36 @@
 import re
 from typing import List, Dict
+
 from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 
 class VkrQuestionGenerator:
-    """
-    Генератор вопросов по тексту ВКР.
-    Основан на гибридном подходе: NLTK + rut5-base-multitask.
-    """
-    def __init__(self, vkr_text: str, model_path: str):
+    """Гибридный генератор вопросов по ВКР: NLTK + rut5-base-multitask."""
+
+    SECTION_PATTERNS: Dict[str, str] = {
+        "Введение": r"Введение.*?(?=\n[A-ZА-Я][^\n]*\n)",
+        "Обзор предметной области": r"Обзор предметной области.*?(?=\n[A-ZА-Я][^\n]*\n)",
+        "Постановка задачи": r"Постановка задачи.*?(?=\n[A-ZА-Я][^\n]*\n)",
+        "Метод решения": r"Метод решения.*?(?=\n[A-ZА-Я][^\n]*\n)",
+        "Исследования": r"Исследования.*?(?=\n[A-ZА-Я][^\n]*\n)",
+        "Заключение": r"Заключение.*?(?=\n[A-ZА-Я][^\n]*\n)",
+        "Приложения": r"Приложения.*?(?=\n[A-ZА-Я][^\n]*\n)",
+    }
+
+    def __init__(self, vkr_text: str, model_path: str = "ai-forever/rut5-base-multitask"):
         self.vkr_text = vkr_text
         self.sentences = sent_tokenize(vkr_text)
         self.stopwords = set(stopwords.words("russian"))
 
-        # ---- Модель rut5 ----
+        # Модель rut5-base-multitask для языкового оформления вопросов
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
 
-    # ---------------------------------------------------------
-    # --- 1. ЭВРИСТИКА: Извлечение ключевых частей ВКР ---
-    # ---------------------------------------------------------
-
     def extract_section(self, title: str) -> str:
-        """
-        Универсальный метод извлечения раздела по заголовку.
-        """
-        pattern = rf"{title}.*?(?=\n[A-ZА-Я][^\n]*\n)"
+        """Извлекает раздел по шаблону заголовка."""
+        pattern = self.SECTION_PATTERNS.get(title, rf"{title}.*?(?=\n[A-ZА-Я][^\n]*\n)")
         m = re.search(pattern, self.vkr_text, re.DOTALL | re.IGNORECASE)
         return m.group(0) if m else ""
 
@@ -37,77 +40,86 @@ def extract_intro(self) -> str:
     def extract_conclusion(self) -> str:
         return self.extract_section("Заключение")
 
-    # ---------------------------------------------------------
-    # --- 2. ЭВРИСТИКА: Поиск ключевых концепций ---
-    # ---------------------------------------------------------
-
     def extract_keywords(self, text: str) -> List[str]:
+        """Извлекает ключевые слова из текста."""
         tokens = word_tokenize(text.lower())
         return [
             t for t in tokens
             if t.isalnum() and t not in self.stopwords and len(t) > 4
         ]
 
-    # ---------------------------------------------------------
-    # --- 3. Генерация вопросов через rut5 (режим ask) ---
-    # ---------------------------------------------------------
-
     def llm_generate_question(self, text_fragment: str) -> str:
-        """
-        Генерация вопроса по фрагменту текста через rut5 ask
-        """
+        """Генерирует формулировку вопроса через rut5 ask."""
         prompt = f"ask: {text_fragment}"
         enc = self.tokenizer(prompt, return_tensors="pt", truncation=True)
         out = self.model.generate(
             **enc,
             max_length=64,
             num_beams=5,
-            early_stopping=True
+            early_stopping=True,
         )
         return self.tokenizer.decode(out[0], skip_special_tokens=True)
 
-    # ---------------------------------------------------------
-    # --- 4. ЭВРИСТИЧЕСКИЕ ШАБЛОНЫ (из документа) ---
-    # ---------------------------------------------------------
-
     def heuristic_questions(self) -> List[str]:
-        """
-        Генерация вопросов по эвристикам из загруженных PDF.
-        """
+        """Эвристики, завязанные на структуру ВКР."""
         intro = self.extract_intro()
+        overview = self.extract_section("Обзор предметной области")
+        objectives = self.extract_section("Постановка задачи")
+        method = self.extract_section("Метод решения")
+        research = self.extract_section("Исследования")
         conc = self.extract_conclusion()
-        keywords = self.extract_keywords(self.vkr_text)
+        apps = self.extract_section("Приложения")
 
-        q = []
+        q: List[str] = []
 
-        # --- По связям между разделами ---
+        # Введение ↔ Заключение
         if intro and conc:
-            q.append("Как сформулированные во введении задачи связаны с выводами работы?")
-
-        # --- По выводам ---
-        if conc:
-            q.append("На основании каких данных был сделан ключевой вывод в заключении?")
-
-        # --- Общие вопросы (из документа) ---
+            q.append(
+                "Как цель и задачи, сформулированные во введении, отражены в итоговых выводах заключения?"
+            )
+
+        # Обзор предметной области
+        if overview:
+            q.append(
+                "Какие термины и подходы из обзора предметной области легли в основу формальной постановки задачи?"
+            )
+
+        # Постановка задачи
+        if objectives:
+            q.append(
+                "В каких требованиях к решению, указанных в постановке задачи, находят отражение цели работы?"
+            )
+
+        # Метод решения
+        if method:
+            q.append(
+                "Как архитектура и алгоритмы, описанные в разделе «Метод решения», обеспечивают достижение поставленных требований?"
+            )
+
+        # Исследования
+        if research:
+            q.append(
+                "Какие количественные или качественные свойства решения подтверждены в разделе «Исследования» и как они связаны с задачами введения?"
+            )
+
+        # Приложения
+        if apps:
+            q.append(
+                "Какие дополнительные материалы из приложений необходимы для проверки воспроизводимости результатов?"
+            )
+
+        # Обязательные общие вопросы
         q.extend([
-            "Есть ли опенсорс аналоги упомянутых решений?",
-            "В чем практическая значимость представленного метода?",
-            "Какие ограничения имеет разработанный подход?",
-            "Для каких дополнительных задач можно применить полученные результаты?",
+            "Как практическая значимость работы следует из задач и результатов исследования?",
+            "Какие ограничения метода решения указаны в тексте и как они влияют на достижение цели?",
         ])
 
         return q
 
-    # ---------------------------------------------------------
-    # --- 5. Гибридная генерация: LLM + эвристики ---
-    # ---------------------------------------------------------
-
-    def generate_llm_questions(self, count=5) -> List[str]:
-        """
-        Генерация N вопросов через rut5 по ключевым фрагментам документа.
-        """
-        q = []
-        fragments = self.sentences[:40]  # первые ~40 предложений для контекста
+    def generate_llm_questions(self, count: int = 5) -> List[str]:
+        """Генерирует N вопросов через rut5 по ключевым фрагментам документа."""
+        q: List[str] = []
+        fragments = self.sentences[:40]
 
         step = max(1, len(fragments) // count)
 
@@ -117,7 +129,7 @@ def generate_llm_questions(self, count=5) -> List[str]:
                 llm_q = self.llm_generate_question(frag)
                 if len(llm_q) > 10:
                     q.append(llm_q)
-            except:
+            except Exception:  # noqa: BLE001
                 continue
 
             if len(q) >= count:
@@ -125,18 +137,10 @@ def generate_llm_questions(self, count=5) -> List[str]:
 
         return q
 
-    # ---------------------------------------------------------
-    # --- 6. Главный метод ---
-    # ---------------------------------------------------------
-
     def generate_all(self) -> List[str]:
-        """
-        Генерирует полный набор вопросов:
-        - эвристические
-        - модельные (LLM)
-        """
-        result = []
+        """Генерирует полный набор вопросов: эвристики + LLM."""
+        result: List[str] = []
         result.extend(self.heuristic_questions())
-        result.extend(["Начало rut5-base-multitask вопросов"])
+        result.extend(["--- rut5-base-multitask вопросы ---"])
         result.extend(self.generate_llm_questions(count=10))
-        return list(dict.fromkeys(result))  # убрать дубли
+        return list(dict.fromkeys(result))

From d7a57d7c347dae93638fee1f5aba5e2730c60c72 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 10 Dec 2025 15:03:49 +0300
Subject: [PATCH 08/17] updated questions ranking and added examples

---
 app/questions_generator/README.md | 92 +++++++++++++++++++++++++++++++
 app/questions_generator/run.py    |  2 +-
 2 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index ade6c3f..a0fb72a 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -3,3 +3,95 @@
 
 ## Запуск
 `python run_docker.py <путь к файлу с текстом ВКР>`
+
+## Пример сгенерированных вопросов по тексту ВКР
+
+[✔ OK] Как цель и задачи, сформулированные во введении, отражены в итоговых выводах заключения?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✔ OK] Какие термины и подходы из обзора предметной области легли в основу формальной постановки задачи?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✖ FAIL] В каких требованиях к решению, указанных в постановке задачи, находят отражение цели работы?
+  - relevance: False
+  - clarity:   True
+  - difficulty:False
+
+[✖ FAIL] Какие количественные или качественные свойства решения подтверждены в разделе «Исследования» и как они связаны с задачами введения?
+  - relevance: True
+  - clarity:   False
+  - difficulty:False
+
+[✔ OK] Какие дополнительные материалы из приложений необходимы для проверки воспроизводимости результатов?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✔ OK] Как практическая значимость работы следует из задач и результатов исследования?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✔ OK] Какие ограничения метода решения указаны в тексте и как они влияют на достижение цели?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✖ FAIL] --- rut5-base-multitask вопросы ---
+  - relevance: False
+  - clarity:   False
+  - difficulty:False
+
+[✖ FAIL] Что такое ЛЭТИ?
+  - relevance: False
+  - clarity:   False
+  - difficulty:False
+
+[✖ FAIL] Что является целью работы в веб-приложении?
+  - relevance: True
+  - clarity:   False
+  - difficulty:False
+
+[✖ FAIL] Что было проведено в конце работы?
+  - relevance: False
+  - clarity:   False
+  - difficulty:False
+
+[✔ OK] Что могут изменять объекты, располагаемые на карте?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✔ OK] Что представляет собой создание набора программных средств для отображения объектов на карте?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✖ FAIL] Сформировать требования к набору программных средств?
+  - relevance: True
+  - clarity:   False
+  - difficulty:False
+
+[✖ FAIL] Что является объектом исследования?
+  - relevance: True
+  - clarity:   False
+  - difficulty:False
+
+[✖ FAIL] Что существует уже давно?
+  - relevance: True
+  - clarity:   False
+  - difficulty:False
+
+[✔ OK] Что можно дать в контексте набора программных средств?
+  - relevance: True
+  - clarity:   True
+  - difficulty:False
+
+[✖ FAIL] ГИС является интегрированной информационной системой?
+  - relevance: True
+  - clarity:   False
+  - difficulty:False
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index a8ba337..36342d5 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -63,7 +63,7 @@ def main():
         clr = validator.check_clarity(q)
         diff = validator.check_difficulty(q)
 
-        status = "✔ OK" if (rel and clr and diff) else "✖ FAIL"
+        status = "✔ OK" if (int(rel) + int(clr) + int(diff) >= 2) else "✖ FAIL"  # хотя бы 2 условия выполнены
 
         print(f"\n[{status}] {q}")
         print(f"  - relevance: {rel}")

From e20a3e0c64d7e42d64a4deaa6951e9a6abc7bf05 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 24 Dec 2025 18:29:24 +0300
Subject: [PATCH 09/17] docker-compose finally done

---
 app/questions_generator/Dockerfile            | 38 +++++++++----------
 app/questions_generator/docker-compose.yml    | 24 ++++++++++++
 .../init-volumes.sh}                          | 29 ++++++--------
 3 files changed, 52 insertions(+), 39 deletions(-)
 create mode 100644 app/questions_generator/docker-compose.yml
 rename app/questions_generator/{docker-entrypoint.sh => docker/init-volumes.sh} (56%)

diff --git a/app/questions_generator/Dockerfile b/app/questions_generator/Dockerfile
index 4079bea..3b40b24 100644
--- a/app/questions_generator/Dockerfile
+++ b/app/questions_generator/Dockerfile
@@ -1,36 +1,32 @@
-FROM python:3.10-slim
+FROM python:3.10-slim AS base
 
-# 1. System deps
 RUN apt-get update && apt-get install -y --no-install-recommends \
         git wget gcc g++ \
         libprotobuf-dev protobuf-compiler \
     && rm -rf /var/lib/apt/lists/*
 
-# 2. Workdir
 WORKDIR /app
 
-# 3. Python deps
 COPY requirements.txt .
-RUN pip install --no-cache-dir --upgrade pip \
-    && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
-    && pip install --no-cache-dir -r requirements.txt \
-    && pip install --no-cache-dir "huggingface_hub[cli]"
 
-# NLTK будет качаться в отдельный каталог (volume)
-ENV NLTK_DATA=/nltk_data
+# можно (и полезно) задать глобально:
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_DEFAULT_TIMEOUT=120
 
-# 4. Volume'ы:
-#   - /app/question_generator/rut5-base  — модель ruT5
-#   - /nltk_data                          — данные NLTK
-VOLUME ["/app/question_generator/rut5-base", "/nltk_data"]
+RUN pip install --no-cache-dir torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir "huggingface_hub[cli]"
 
-# 5. Копируем проект
-COPY . .
+ENV NLTK_DATA=/nltk_data
 
-# 6. Копируем entrypoint-скрипт
-COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
-RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+COPY . .
 
-# 7. Точка входа: сначала — скрипт, затем основной CMD
-ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+# ====== runtime image ======
+FROM base AS app
 CMD ["python", "run.py"]
+
+# ====== init image ======
+FROM base AS init
+COPY docker/init-volumes.sh /usr/local/bin/init-volumes.sh
+RUN chmod +x /usr/local/bin/init-volumes.sh
+ENTRYPOINT ["/usr/local/bin/init-volumes.sh"]
diff --git a/app/questions_generator/docker-compose.yml b/app/questions_generator/docker-compose.yml
new file mode 100644
index 0000000..33f7a0a
--- /dev/null
+++ b/app/questions_generator/docker-compose.yml
@@ -0,0 +1,24 @@
+services:
+  init:
+    build:
+      context: .
+      target: init
+    volumes:
+      - rut5_model:/app/question_generator/rut5-base
+      - nltk_data:/nltk_data
+    restart: "no"
+
+  app:
+    build:
+      context: .
+      target: app
+    depends_on:
+      init:
+        condition: service_completed_successfully
+    volumes:
+      - rut5_model:/app/question_generator/rut5-base
+      - nltk_data:/nltk_data
+
+volumes:
+  rut5_model:
+  nltk_data:
diff --git a/app/questions_generator/docker-entrypoint.sh b/app/questions_generator/docker/init-volumes.sh
similarity index 56%
rename from app/questions_generator/docker-entrypoint.sh
rename to app/questions_generator/docker/init-volumes.sh
index c4a5243..2cf47f8 100644
--- a/app/questions_generator/docker-entrypoint.sh
+++ b/app/questions_generator/docker/init-volumes.sh
@@ -7,12 +7,9 @@ NLTK_DIR="${NLTK_DATA:-/nltk_data}"
 echo "MODEL_DIR=${MODEL_DIR}"
 echo "NLTK_DIR=${NLTK_DIR}"
 
-# Гарантируем, что каталоги существуют
 mkdir -p "$MODEL_DIR" "$NLTK_DIR"
 
-########################################
-# 1. Загрузка модели в volume (один раз)
-########################################
+# 1) ruT5 model (один раз)
 if [ -z "$(ls -A "$MODEL_DIR" 2>/dev/null)" ]; then
   echo "Model directory is empty. Downloading model to $MODEL_DIR..."
   huggingface-cli download \
@@ -24,24 +21,20 @@ else
   echo "Model directory is not empty, skipping download."
 fi
 
-########################################
-# 2. Загрузка данных NLTK в volume
-########################################
+# 2) NLTK data (один раз)
 if [ -z "$(ls -A "$NLTK_DIR" 2>/dev/null)" ]; then
   echo "NLTK data directory is empty. Downloading 'punkt' and 'stopwords' to $NLTK_DIR..."
-  python - <<EOF
+  python - <<'PY'
+import os
 import nltk
-nltk.data.path = ["$NLTK_DIR"] + nltk.data.path
-nltk.download("punkt", download_dir="$NLTK_DIR")
-nltk.download("stopwords", download_dir="$NLTK_DIR")
-EOF
+
+nltk_dir = os.environ.get("NLTK_DATA", "/nltk_data")
+nltk.data.path = [nltk_dir] + nltk.data.path
+
+nltk.download("punkt", download_dir=nltk_dir)
+nltk.download("stopwords", download_dir=nltk_dir)
+PY
   echo "NLTK data downloaded."
 else
   echo "NLTK data directory is not empty, skipping download."
 fi
-
-# Экспортируем путь для NLTK
-export NLTK_DATA="$NLTK_DIR"
-
-echo "Starting application..."
-exec "$@"

From 5694ae7cf95efb85a1a137630798c7b8283e4f17 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 24 Dec 2025 19:00:31 +0300
Subject: [PATCH 10/17] interactive mode

---
 app/questions_generator/Dockerfile         | 2 +-
 app/questions_generator/README.md          | 8 ++++----
 app/questions_generator/docker-compose.yml | 3 +++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/app/questions_generator/Dockerfile b/app/questions_generator/Dockerfile
index 3b40b24..3c53a21 100644
--- a/app/questions_generator/Dockerfile
+++ b/app/questions_generator/Dockerfile
@@ -23,7 +23,7 @@ COPY . .
 
 # ====== runtime image ======
 FROM base AS app
-CMD ["python", "run.py"]
+CMD ["bash", "-lc", "sleep infinity"]
 
 # ====== init image ======
 FROM base AS init
diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index a0fb72a..34bf44e 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -1,8 +1,8 @@
-## Сборка
-`docker build -t vkr-generator .`
+## Запуск (контейнер остаётся запущенным)
+`docker compose up` — ВАЖНО: первая сборка образа занимает очень много времени (30–40 минут).
 
-## Запуск
-`python run_docker.py <путь к файлу с текстом ВКР>`
+## Использование (интерактивное)
+`docker compose exec app python run.py /app/vkr_examples/VKR1.docx` — локальная папка `vkr_examples` должна лежать рядом с `docker-compose.yml`
 
 ## Пример сгенерированных вопросов по тексту ВКР
 
diff --git a/app/questions_generator/docker-compose.yml b/app/questions_generator/docker-compose.yml
index 33f7a0a..27f353b 100644
--- a/app/questions_generator/docker-compose.yml
+++ b/app/questions_generator/docker-compose.yml
@@ -15,9 +15,12 @@ services:
     depends_on:
       init:
         condition: service_completed_successfully
+    stdin_open: true
+    tty: true
     volumes:
       - rut5_model:/app/question_generator/rut5-base
       - nltk_data:/nltk_data
+      - ./vkr_examples:/app/vkr_examples  # монтируется для интерактивного запуска с файлами из этой папки (папка рядом с композом)
 
 volumes:
   rut5_model:

From 6ec48774238ff279728741d40a17c2aa6145e7a6 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Wed, 24 Dec 2025 19:31:17 +0300
Subject: [PATCH 11/17] logging added

---
 app/questions_generator/generator.py | 231 +++++++++++++--------
 app/questions_generator/run.py       | 138 ++++++++++---
 app/questions_generator/validator.py | 292 +++++++++------------------
 3 files changed, 351 insertions(+), 310 deletions(-)

diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index 58f48ae..386c5d3 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -1,4 +1,7 @@
 import re
+import logging
+import time
+from contextlib import contextmanager
 from typing import List, Dict
 
 from nltk.tokenize import sent_tokenize, word_tokenize
@@ -6,6 +9,17 @@
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 
+@contextmanager
+def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **extra):
+    start = time.perf_counter()
+    logger.log(level, "START %s %s", operation, (extra if extra else ""))
+    try:
+        yield
+    finally:
+        elapsed_ms = (time.perf_counter() - start) * 1000.0
+        logger.log(level, "END   %s | %.2f ms %s", operation, elapsed_ms, (extra if extra else ""))
+
+
 class VkrQuestionGenerator:
     """Гибридный генератор вопросов по ВКР: NLTK + rut5-base-multitask."""
 
@@ -20,13 +34,28 @@ class VkrQuestionGenerator:
     }
 
     def __init__(self, vkr_text: str, model_path: str = "ai-forever/rut5-base-multitask"):
-        self.vkr_text = vkr_text
-        self.sentences = sent_tokenize(vkr_text)
-        self.stopwords = set(stopwords.words("russian"))
+        self.logger = logging.getLogger(__name__)
+
+        with timed(self.logger, "generator_init"):
+            self.vkr_text = vkr_text
+
+            with timed(self.logger, "sent_tokenize"):
+                self.sentences = sent_tokenize(vkr_text)
+
+            with timed(self.logger, "load_stopwords"):
+                self.stopwords = set(stopwords.words("russian"))
 
-        # Модель rut5-base-multitask для языкового оформления вопросов
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
+            # Модель rut5-base-multitask для языкового оформления вопросов
+            with timed(self.logger, "load_tokenizer", model_path=model_path):
+                self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+
+            with timed(self.logger, "load_model", model_path=model_path):
+                self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
+
+        self.logger.info(
+            "Generator ready: sentences=%d stopwords=%d model_path=%s",
+            len(self.sentences), len(self.stopwords), model_path
+        )
 
     def extract_section(self, title: str) -> str:
         """Извлекает раздел по шаблону заголовка."""
@@ -42,105 +71,133 @@ def extract_conclusion(self) -> str:
 
     def extract_keywords(self, text: str) -> List[str]:
         """Извлекает ключевые слова из текста."""
-        tokens = word_tokenize(text.lower())
-        return [
-            t for t in tokens
-            if t.isalnum() and t not in self.stopwords and len(t) > 4
-        ]
+        with timed(self.logger, "extract_keywords", text_len=len(text)):
+            tokens = word_tokenize(text.lower())
+            result = [
+                t for t in tokens
+                if t.isalnum() and t not in self.stopwords and len(t) > 4
+            ]
+        self.logger.info("Keywords extracted: %d", len(result))
+        return result
 
     def llm_generate_question(self, text_fragment: str) -> str:
         """Генерирует формулировку вопроса через rut5 ask."""
         prompt = f"ask: {text_fragment}"
-        enc = self.tokenizer(prompt, return_tensors="pt", truncation=True)
-        out = self.model.generate(
-            **enc,
-            max_length=64,
-            num_beams=5,
-            early_stopping=True,
-        )
-        return self.tokenizer.decode(out[0], skip_special_tokens=True)
+        with timed(self.logger, "llm_generate_question", fragment_len=len(text_fragment)):
+            enc = self.tokenizer(prompt, return_tensors="pt", truncation=True)
+            out = self.model.generate(
+                **enc,
+                max_length=64,
+                num_beams=5,
+                early_stopping=True,
+            )
+            decoded = self.tokenizer.decode(out[0], skip_special_tokens=True)
+        return decoded
 
     def heuristic_questions(self) -> List[str]:
         """Эвристики, завязанные на структуру ВКР."""
-        intro = self.extract_intro()
-        overview = self.extract_section("Обзор предметной области")
-        objectives = self.extract_section("Постановка задачи")
-        method = self.extract_section("Метод решения")
-        research = self.extract_section("Исследования")
-        conc = self.extract_conclusion()
-        apps = self.extract_section("Приложения")
-
-        q: List[str] = []
-
-        # Введение ↔ Заключение
-        if intro and conc:
-            q.append(
-                "Как цель и задачи, сформулированные во введении, отражены в итоговых выводах заключения?"
-            )
-
-        # Обзор предметной области
-        if overview:
-            q.append(
-                "Какие термины и подходы из обзора предметной области легли в основу формальной постановки задачи?"
-            )
-
-        # Постановка задачи
-        if objectives:
-            q.append(
-                "В каких требованиях к решению, указанных в постановке задачи, находят отражение цели работы?"
-            )
-
-        # Метод решения
-        if method:
-            q.append(
-                "Как архитектура и алгоритмы, описанные в разделе «Метод решения», обеспечивают достижение поставленных требований?"
-            )
-
-        # Исследования
-        if research:
-            q.append(
-                "Какие количественные или качественные свойства решения подтверждены в разделе «Исследования» и как они связаны с задачами введения?"
-            )
-
-        # Приложения
-        if apps:
-            q.append(
-                "Какие дополнительные материалы из приложений необходимы для проверки воспроизводимости результатов?"
-            )
-
-        # Обязательные общие вопросы
-        q.extend([
-            "Как практическая значимость работы следует из задач и результатов исследования?",
-            "Какие ограничения метода решения указаны в тексте и как они влияют на достижение цели?",
-        ])
-
+        with timed(self.logger, "heuristic_questions_total"):
+            intro = self.extract_intro()
+            overview = self.extract_section("Обзор предметной области")
+            objectives = self.extract_section("Постановка задачи")
+            method = self.extract_section("Метод решения")
+            research = self.extract_section("Исследования")
+            conc = self.extract_conclusion()
+            apps = self.extract_section("Приложения")
+
+            q: List[str] = []
+
+            # Введение ↔ Заключение
+            if intro and conc:
+                q.append(
+                    "Как цель и задачи, сформулированные во введении, отражены в итоговых выводах заключения?"
+                )
+
+            # Обзор предметной области
+            if overview:
+                q.append(
+                    "Какие термины и подходы из обзора предметной области легли в основу формальной постановки задачи?"
+                )
+
+            # Постановка задачи
+            if objectives:
+                q.append(
+                    "В каких требованиях к решению, указанных в постановке задачи, находят отражение цели работы?"
+                )
+
+            # Метод решения
+            if method:
+                q.append(
+                    "Как архитектура и алгоритмы, описанные в разделе «Метод решения», обеспечивают достижение поставленных требований?"
+                )
+
+            # Исследования
+            if research:
+                q.append(
+                    "Какие количественные или качественные свойства решения подтверждены в разделе «Исследования» и как они связаны с задачами введения?"
+                )
+
+            # Приложения
+            if apps:
+                q.append(
+                    "Какие дополнительные материалы из приложений необходимы для проверки воспроизводимости результатов?"
+                )
+
+            # Обязательные общие вопросы
+            q.extend([
+                "Как практическая значимость работы следует из задач и результатов исследования?",
+                "Какие ограничения метода решения указаны в тексте и как они влияют на достижение цели?",
+            ])
+
+        self.logger.info("Heuristic questions created: %d", len(q))
         return q
 
     def generate_llm_questions(self, count: int = 5) -> List[str]:
         """Генерирует N вопросов через rut5 по ключевым фрагментам документа."""
         q: List[str] = []
         fragments = self.sentences[:40]
-
         step = max(1, len(fragments) // count)
 
-        for i in range(0, len(fragments), step):
-            frag = fragments[i]
-            try:
-                llm_q = self.llm_generate_question(frag)
-                if len(llm_q) > 10:
-                    q.append(llm_q)
-            except Exception:  # noqa: BLE001
-                continue
+        self.logger.info("LLM generation setup: count=%d fragments=%d step=%d", count, len(fragments), step)
+
+        with timed(self.logger, "generate_llm_questions_total", count=count):
+            for i in range(0, len(fragments), step):
+                frag = fragments[i]
+                try:
+                    # Требование: для ИИ — логгировать время каждого вопроса
+                    with timed(self.logger, "llm_question_item", fragment_index=i):
+                        llm_q = self.llm_generate_question(frag)
+
+                    if len(llm_q) > 10:
+                        q.append(llm_q)
+                        self.logger.info("LLM question accepted: idx=%d len=%d", len(q), len(llm_q))
+                    else:
+                        self.logger.info("LLM question rejected (too short): len=%d", len(llm_q))
+
+                except Exception as e:  # noqa: BLE001
+                    self.logger.exception("LLM generation failed at fragment_index=%d: %s", i, e)
+                    continue
 
-            if len(q) >= count:
-                break
+                if len(q) >= count:
+                    break
 
+        self.logger.info("LLM questions created: %d", len(q))
         return q
 
     def generate_all(self) -> List[str]:
         """Генерирует полный набор вопросов: эвристики + LLM."""
-        result: List[str] = []
-        result.extend(self.heuristic_questions())
-        result.extend(["--- rut5-base-multitask вопросы ---"])
-        result.extend(self.generate_llm_questions(count=10))
-        return list(dict.fromkeys(result))
+        with timed(self.logger, "generate_all_total"):
+            result: List[str] = []
+            # Требование: для эвристической генерации можно время создания всех вопросов
+            with timed(self.logger, "generate_heuristic_block"):
+                result.extend(self.heuristic_questions())
+
+            result.extend(["--- rut5-base-multitask вопросы ---"])
+
+            with timed(self.logger, "generate_llm_block"):
+                result.extend(self.generate_llm_questions(count=10))
+
+            deduped = list(dict.fromkeys(result))
+
+        self.logger.info("generate_all done: raw=%d deduped=%d", len(result), len(deduped))
+        return deduped
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index 36342d5..418d0af 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -1,6 +1,9 @@
 import sys
 import os
 import argparse
+import logging
+import time
+from contextlib import contextmanager
 
 from docx import Document
 import nltk
@@ -9,17 +12,64 @@
 from validator import VkrQuestionValidator
 
 
+LOG_PATH = os.environ.get("VKR_LOG_PATH", "logs/vkr_question_generator.log")
+
+
+def setup_logging() -> None:
+    """Route INFO+ root-logger records to the LOG_PATH file and to stdout."""
+    # dirname() is "" for a bare file name and os.makedirs("") raises, so use the cwd.
+    os.makedirs(os.path.dirname(LOG_PATH) or ".", exist_ok=True)
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+
+    # Skip handler setup if a file handler is already attached (repeated call in one process).
+    if any(isinstance(h, logging.FileHandler) for h in logger.handlers):
+        return
+
+    fmt = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)s | %(name)s:%(lineno)d | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    fh = logging.FileHandler(LOG_PATH, encoding="utf-8")
+    fh.setLevel(logging.INFO)
+    fh.setFormatter(fmt)
+
+    sh = logging.StreamHandler(sys.stdout)
+    sh.setLevel(logging.INFO)
+    sh.setFormatter(fmt)
+    logger.addHandler(fh)
+    logger.addHandler(sh)
+
+
+@contextmanager
+def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **extra):
+    start = time.perf_counter()
+    logger.log(level, "START %s %s", operation, (extra if extra else ""))
+    try:
+        yield
+    finally:
+        elapsed_ms = (time.perf_counter() - start) * 1000.0
+        logger.log(level, "END   %s | %.2f ms %s", operation, elapsed_ms, (extra if extra else ""))
+
+
 def load_vkr_text(path: str) -> str:
+    logger = logging.getLogger(__name__)
+
     if not os.path.exists(path):
+        logger.error("Файл '%s' не найден.", path)
         print(f"[ERROR] Файл '{path}' не найден.")
         sys.exit(1)
 
-    document = Document(path)
-    text = []
-    for paragraph in document.paragraphs:
-        text.append(paragraph.text)
+    with timed(logger, "parse_docx", path=path):
+        document = Document(path)
+        text = []
+        for paragraph in document.paragraphs:
+            text.append(paragraph.text)
+        result = "\n".join(text)
 
-    return "\n".join(text)
+    logger.info("DOCX parsed: chars=%d, paragraphs=%d", len(result), len(document.paragraphs))
+    return result
 
 
 def parse_args() -> argparse.Namespace:
@@ -36,39 +86,79 @@ def parse_args() -> argparse.Namespace:
 
 
 def main():
+    setup_logging()
+    logger = logging.getLogger(__name__)
+
     args = parse_args()
     vkr_path = args.vkr_path
 
-    try:
-        nltk.data.find("tokenizers/punkt_tab/english")
-    except LookupError:
-        print("Загрузка необходимых данных NLTK...")
-        nltk.download("punkt_tab")
+    logger.info("=== RUN START === vkr_path=%s log_path=%s", vkr_path, LOG_PATH)
+
+    with timed(logger, "nltk_check_download"):
+        try:
+            nltk.data.find("tokenizers/punkt_tab/english")
+        except LookupError:
+            logger.info("NLTK punkt_tab not found. Downloading...")
+            print("Загрузка необходимых данных NLTK...")
+            nltk.download("punkt_tab")
 
     print(f"=== Загрузка текста ВКР из '{vkr_path}' ===")
     text = load_vkr_text(vkr_path)
 
     print("=== Инициализация генератора ===")
-    gen = VkrQuestionGenerator(text, model_path="/app/question_generator/rut5-base")
+    with timed(logger, "init_generator"):
+        gen = VkrQuestionGenerator(text, model_path="/app/question_generator/rut5-base")
 
     print("=== Инициализация валидатора ===")
-    validator = VkrQuestionValidator(text)
+    with timed(logger, "init_validator"):
+        validator = VkrQuestionValidator(text)
 
     print("=== Генерация вопросов ===")
-    questions = gen.generate_all()
+    with timed(logger, "generate_all_questions"):
+        questions = gen.generate_all()
 
-    print("\n=== Результаты ===")
-    for q in questions:
-        rel = validator.check_relevance(q)
-        clr = validator.check_clarity(q)
-        diff = validator.check_difficulty(q)
+    logger.info("Questions generated: total=%d", len(questions))
 
-        status = "✔ OK" if (int(rel) + int(clr) + int(diff) >= 2) else "✖ FAIL"  # хотя бы 2 условия выполнены
-
-        print(f"\n[{status}] {q}")
-        print(f"  - relevance: {rel}")
-        print(f"  - clarity:   {clr}")
-        print(f"  - difficulty:{diff}")
+    print("\n=== Результаты ===")
+    ok_count = 0
+    fail_count = 0
+
+    with timed(logger, "validate_all_questions", total=len(questions)):
+        for idx, q in enumerate(questions, start=1):
+            # маркер-разделитель (ваш текстовый разделитель)
+            if q.strip().startswith("---"):
+                logger.info("Separator encountered at %d: %s", idx, q.strip())
+                print(f"\n{q}")
+                continue
+
+            with timed(logger, "validate_question", index=idx):
+                with timed(logger, "check_relevance", index=idx):
+                    rel = validator.check_relevance(q)
+                with timed(logger, "check_clarity", index=idx):
+                    clr = validator.check_clarity(q)
+                with timed(logger, "check_difficulty", index=idx):
+                    diff = validator.check_difficulty(q)
+
+            passed = (int(rel) + int(clr) + int(diff) >= 2)
+            status = "✔ OK" if passed else "✖ FAIL"
+
+            if passed:
+                ok_count += 1
+            else:
+                fail_count += 1
+
+            logger.info(
+                "Question %d status=%s rel=%s clr=%s diff=%s text=%r",
+                idx, ("OK" if passed else "FAIL"), rel, clr, diff, q
+            )
+
+            print(f"\n[{status}] {q}")
+            print(f"  - relevance: {rel}")
+            print(f"  - clarity:   {clr}")
+            print(f"  - difficulty:{diff}")
+
+    logger.info("Validation summary: ok=%d fail=%d total=%d", ok_count, fail_count, len(questions))
+    logger.info("=== RUN END ===")
 
     print("\n=== Готово ===")
 
diff --git a/app/questions_generator/validator.py b/app/questions_generator/validator.py
index c4b8900..463f177 100644
--- a/app/questions_generator/validator.py
+++ b/app/questions_generator/validator.py
@@ -1,13 +1,25 @@
 import re
+import logging
+import time
+from contextlib import contextmanager
 from typing import List, Dict, Set
 from collections import Counter
-import nltk
-from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
-import string
 from datetime import datetime
 
 
+@contextmanager
+def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **extra):
+    start = time.perf_counter()
+    logger.log(level, "START %s %s", operation, (extra if extra else ""))
+    try:
+        yield
+    finally:
+        elapsed_ms = (time.perf_counter() - start) * 1000.0
+        logger.log(level, "END   %s | %.2f ms %s", operation, elapsed_ms, (extra if extra else ""))
+
+
 class VkrQuestionValidator:
     def __init__(self, vkr_text: str):
         """
@@ -16,9 +28,24 @@ def __init__(self, vkr_text: str):
         Args:
             vkr_text: Полный текст ВКР
         """
-        self.vkr_text = vkr_text.lower()
-        self.stopwords = set(stopwords.words('russian'))
-        self.keywords = self._extract_keywords()
+        self.logger = logging.getLogger(__name__)
+
+        with timed(self.logger, "validator_init"):
+            self.vkr_text = vkr_text.lower()
+
+            with timed(self.logger, "validator_load_stopwords"):
+                self.stopwords = set(stopwords.words('russian'))
+
+            with timed(self.logger, "validator_extract_keywords_total"):
+                self.keywords = self._extract_keywords()
+
+        self.logger.info(
+            "Validator ready: stopwords=%d theme=%d goals=%d methodology=%d",
+            len(self.stopwords),
+            len(self.keywords.get("theme", set())),
+            len(self.keywords.get("goals", set())),
+            len(self.keywords.get("methodology", set())),
+        )
 
     def _extract_keywords(self) -> Dict[str, Set[str]]:
         """
@@ -28,34 +55,35 @@ def _extract_keywords(self) -> Dict[str, Set[str]]:
             Словарь с категориями ключевых слов
         """
         keywords = {
-            'theme': set(),  # Тематические слова
-            'goals': set(),  # Слова, связанные с целями
+            'theme': set(),       # Тематические слова
+            'goals': set(),       # Слова, связанные с целями
             'methodology': set()  # Методологические термины
         }
 
-        # Извлечение ключевых слов из введения
-        intro_section = self._extract_introduction()
-        keywords['theme'] = self._tokenize_and_filter(intro_section)
+        with timed(self.logger, "extract_introduction"):
+            intro_section = self._extract_introduction()
+        with timed(self.logger, "tokenize_filter_intro", intro_len=len(intro_section)):
+            keywords['theme'] = self._tokenize_and_filter(intro_section)
 
-        # Извлечение целей из соответствующего раздела
-        goals_section = self._extract_goals_section()
-        keywords['goals'] = self._tokenize_and_filter(goals_section)
+        with timed(self.logger, "extract_goals_section"):
+            goals_section = self._extract_goals_section()
+        with timed(self.logger, "tokenize_filter_goals", goals_len=len(goals_section)):
+            keywords['goals'] = self._tokenize_and_filter(goals_section)
 
-        # Извлечение методологических терминов
-        meth_section = self._extract_methodology_section()
-        keywords['methodology'] = self._tokenize_and_filter(meth_section)
+        with timed(self.logger, "extract_methodology_section"):
+            meth_section = self._extract_methodology_section()
+        with timed(self.logger, "tokenize_filter_methodology", meth_len=len(meth_section)):
+            keywords['methodology'] = self._tokenize_and_filter(meth_section)
 
+        self.logger.info(
+            "Keywords extracted: theme=%d goals=%d methodology=%d",
+            len(keywords["theme"]), len(keywords["goals"]), len(keywords["methodology"])
+        )
         return keywords
 
     def _tokenize_and_filter(self, text: str) -> Set[str]:
         """
         Токенизация и фильтрация текста для получения ключевых слов
-
-        Args:
-            text: Исходный текст для обработки
-
-        Returns:
-            Множество отфильтрованных токенов
         """
         tokens = word_tokenize(text.lower())
         filtered_tokens = [
@@ -67,110 +95,61 @@ def _tokenize_and_filter(self, text: str) -> Set[str]:
         return set(filtered_tokens)
 
     def _extract_introduction(self) -> str:
-        """
-        Извлечение введения из текста ВКР
-
-        Returns:
-            Текст введения
-        """
         intro_pattern = r'введение.*?(?=глава|раздел)'
         match = re.search(intro_pattern, self.vkr_text, re.DOTALL)
         return match.group(0) if match else ""
 
     def _extract_goals_section(self) -> str:
-        """
-        Извлечение раздела с целями и задачами
-
-        Returns:
-            Текст раздела с целями
-        """
         goals_pattern = r'(цель|задачи).*?(?=глава|раздел)'
         match = re.search(goals_pattern, self.vkr_text, re.DOTALL)
         return match.group(0) if match else ""
 
     def _extract_methodology_section(self) -> str:
-        """
-        Извлечение методологического раздела
-
-        Returns:
-            Текст методологического раздела
-        """
         meth_pattern = r'(методология|методы).*?(?=глава|раздел)'
         match = re.search(meth_pattern, self.vkr_text, re.DOTALL)
         return match.group(0) if match else ""
 
     def check_relevance(self, question: str) -> bool:
-        """
-        Проверка релевантности вопроса
+        with timed(self.logger, "validator_check_relevance", q_len=len(question)):
+            score = 0
 
-        Args:
-            question: Проверяемый вопрос
+            theme_match = len(set(question.lower().split()) &
+                              set(self.keywords['theme']))
+            if theme_match > 0:
+                score += 1
 
-        Returns:
-            True если вопрос релевантен, False если нет
-        """
-        score = 0
+            actuality_score = self._calculate_actuality_score(question)
+            score += actuality_score
 
-        # Проверка соответствия теме
-        theme_match = len(set(question.lower().split()) &
-                          set(self.keywords['theme']))
-        if theme_match > 0:
-            score += 1
+            goal_match = len(set(question.lower().split()) &
+                             set(self.keywords['goals']))
+            if goal_match > 0:
+                score += 1
 
-        # Проверка актуальности
-        actuality_score = self._calculate_actuality_score(question)
-        score += actuality_score
+            result = score >= 2
 
-        # Проверка связи с целями
-        goal_match = len(set(question.lower().split()) &
-                         set(self.keywords['goals']))
-        if goal_match > 0:
-            score += 1
-
-        return score >= 2
+        self.logger.info("relevance=%s score=%d q=%r", result, score, question)
+        return result
 
     def _calculate_actuality_score(self, question: str) -> int:
-        """
-        Расчёт актуальности вопроса
-
-        Args:
-            question: Анализируемый вопрос
-
-        Returns:
-            Оценка актуальности (0 или 1)
-        """
         current_year = datetime.now().year
         year_mentions = [int(word) for word in question.split()
                          if word.isdigit() and 1900 <= int(word) <= current_year]
         return max(0, min(1, len(year_mentions)))
 
     def check_completeness(self, questions_list: List[str]) -> bool:
-        """
-        Проверка полноты набора вопросов
+        with timed(self.logger, "validator_check_completeness", total=len(questions_list)):
+            coverage = {
+                'theoretical': self._check_theory_coverage(questions_list),
+                'practical': self._check_practice_coverage(questions_list),
+                'analysis_levels': self._check_analysis_depth(questions_list)
+            }
+            result = all(value >= 0.7 for value in coverage.values())
 
-        Args:
-            questions_list: Список проверяемых вопросов
-
-        Returns:
-            True если набор полный, False если нет
-        """
-        coverage = {
-            'theoretical': self._check_theory_coverage(questions_list),
-            'practical': self._check_practice_coverage(questions_list),
-            'analysis_levels': self._check_analysis_depth(questions_list)
-        }
-        return all(value >= 0.7 for value in coverage.values())
+        self.logger.info("completeness=%s coverage=%s", result, coverage)
+        return result
 
     def _check_theory_coverage(self, questions: List[str]) -> float:
-        """
-        Проверка теоретического охвата вопросами
-
-        Args:
-            questions: Список вопросов для анализа
-
-        Returns:
-            Значение от 0 до 1, показывающее степень покрытия
-        """
         theoretical_terms = {'теория', 'модель', 'концепция', 'принцип'}
         total_questions = len(questions)
         theory_questions = sum(
@@ -180,15 +159,6 @@ def _check_theory_coverage(self, questions: List[str]) -> float:
         return theory_questions / total_questions if total_questions > 0 else 0
 
     def _check_practice_coverage(self, questions: List[str]) -> float:
-        """
-        Проверка практического охвата вопросами
-
-        Args:
-            questions: Список вопросов для анализа
-
-        Returns:
-            Значение от 0 до 1, показывающее степень покрытия
-        """
         practical_terms = {'применение', 'реализация', 'использование', 'результаты'}
         total_questions = len(questions)
         practice_questions = sum(
@@ -198,15 +168,6 @@ def _check_practice_coverage(self, questions: List[str]) -> float:
         return practice_questions / total_questions if total_questions > 0 else 0
 
     def _check_analysis_depth(self, questions: List[str]) -> float:
-        """
-        Проверка глубины анализа в вопросах
-
-        Args:
-            questions: Список вопросов для анализа
-
-        Returns:
-            Значение от 0 до 1, показывающее глубину анализа
-        """
         depth_indicators = {
             'поверхностный': {'что', 'какой'},
             'средний': {'почему', 'как'},
@@ -228,34 +189,19 @@ def _check_analysis_depth(self, questions: List[str]) -> float:
         return sum(depths) / (len(depths) * 2) if depths else 0
 
     def check_clarity(self, question: str) -> bool:
-        """
-        Проверка ясности формулировки вопроса
-
-        Args:
-            question: Проверяемый вопрос
+        with timed(self.logger, "validator_check_clarity", q_len=len(question)):
+            metrics = {
+                'length': self._check_length(question),
+                'complexity': self._calculate_complexity(question),
+                'ambiguity': self._check_ambiguity(question)
+            }
+            result = all(value >= 0.7 for value in metrics.values())
 
-        Returns:
-            True если формулировка ясная, False если нет
-        """
-        metrics = {
-            'length': self._check_length(question),
-            'complexity': self._calculate_complexity(question),
-            'ambiguity': self._check_ambiguity(question)
-        }
-        return all(value >= 0.7 for value in metrics.values())
+        self.logger.info("clarity=%s metrics=%s q=%r", result, metrics, question)
+        return result
 
     def _check_length(self, question: str) -> float:
-        """
-        Проверка длины вопроса
-
-        Args:
-            question: Проверяемый вопрос
-
-        Returns:
-            Нормализованное значение от 0 до 1
-        """
         words = len(question.split())
-        # Оптимальная длина вопроса считается 7-15 слов
         if words < 7:
             return 0.5 * (words / 7)
         elif words > 15:
@@ -263,68 +209,34 @@ def _check_length(self, question: str) -> float:
         return 1.0
 
     def _calculate_complexity(self, question: str) -> float:
-        """
-        Оценка сложности вопроса
-
-        Args:
-            question: Анализируемый вопрос
-
-        Returns:
-            Значение от 0 до 1, показывающее сложность
-        """
         words = question.split()
         unique_words = set(words)
         return min(1.0, len(unique_words) / len(words))
 
     def _check_ambiguity(self, question: str) -> float:
-        """
-        Проверка наличия двусмысленностей в вопросе
-
-        Args:
-            question: Проверяемый вопрос
-
-        Returns:
-            Значение от 0 до 1, где 1 - нет двусмысленностей
-        """
         ambiguous_terms = {
             'или', 'и', 'при этом', 'однако', 'тем не менее',
             'с другой стороны', 'в то же время'
         }
         ambiguity_score = 1.0
-
         for term in ambiguous_terms:
             if term in question.lower():
                 ambiguity_score -= 0.2
-
         return max(0.0, ambiguity_score)
 
     def check_difficulty(self, question: str) -> bool:
-        """
-        Проверка уровня сложности вопроса
+        with timed(self.logger, "validator_check_difficulty", q_len=len(question)):
+            difficulty_metrics = {
+                'abstraction_level': self._assess_abstraction(question),
+                'question_type': self._identify_question_type(question),
+                'student_level_match': self._match_student_level(question)
+            }
+            result = all(value == 'optimal' for value in difficulty_metrics.values())
 
-        Args:
-            question: Проверяемый вопрос
-
-        Returns:
-            True если уровень сложности оптимальный, False если нет
-        """
-        difficulty_metrics = {
-            'abstraction_level': self._assess_abstraction(question),
-            'question_type': self._identify_question_type(question),
-            'student_level_match': self._match_student_level(question)
-        }
-        return all(value == 'optimal' for value in difficulty_metrics.values())
+        self.logger.info("difficulty=%s metrics=%s q=%r", result, difficulty_metrics, question)
+        return result
 
     def _assess_abstraction(self, question: str) -> str:
-        """
-        Оценка уровня абстракции вопроса
-
-        Args:
-            question: Анализируемый вопрос
-
-        Returns:
-            'optimal', 'too_high', 'too_low'
-        """
         abstract_terms = {
             'концепция', 'модель', 'теория', 'абстракция',
             'парадигма', 'методология'
@@ -346,15 +258,6 @@ def _assess_abstraction(self, question: str) -> str:
         return 'optimal'
 
     def _identify_question_type(self, question: str) -> str:
-        """
-        Определение типа вопроса
-
-        Args:
-            question: Анализируемый вопрос
-
-        Returns:
-            'optimal', 'too_simple', 'too_complex'
-        """
         question_types = {
             'descriptive': {'описать', 'рассказать', 'характеризовать'},
             'analytical': {'анализировать', 'сравнить', 'оценить'},
@@ -375,15 +278,6 @@ def _identify_question_type(self, question: str) -> str:
         return 'too_complex'
 
     def _match_student_level(self, question: str) -> str:
-        """
-        Проверка соответствия вопроса уровню студента
-
-        Args:
-            question: Анализируемый вопрос
-
-        Returns:
-            'optimal', 'too_hard', 'too_easy'
-        """
         advanced_terms = {
             'методология', 'парадигма', 'теоретическая модель',
             'эмпирический анализ', 'статистическая обработка'

From 0b28da7b4612b2dde78079bcb5413b5e3d5d8554 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Sun, 28 Dec 2025 17:20:46 +0300
Subject: [PATCH 12/17] logging update

---
 app/questions_generator/README.md |  2 +-
 app/questions_generator/run.py    | 97 ++++++++++++++++++++-----------
 2 files changed, 64 insertions(+), 35 deletions(-)

diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index 34bf44e..b07802c 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -2,7 +2,7 @@
 `docker-compose up` - ВАЖНО: Первый раз ОЧЕНЬ ДОЛГО билдится (30-40 минут)!!!
 
 ## Использование (интерактивное)
-`docker compose exec app python run.py /app/vkr_examples/VKR1.docx` - папка `vkr_examples` локальная, лежит рядом с композом
+`docker compose exec app python run.py /app/vkr_examples/VKR1.docx --no-overflow-logs` - папка `vkr_examples` локальная, лежит рядом с композом
 
 ## Пример сгенерированных вопросов по тексту ВКР
 
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index 418d0af..d52d054 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -3,7 +3,7 @@
 import argparse
 import logging
 import time
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 
 from docx import Document
 import nltk
@@ -53,6 +53,27 @@ def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **e
         logger.log(level, "END   %s | %.2f ms %s", operation, elapsed_ms, (extra if extra else ""))
 
 
+@contextmanager
+def suppress_console_logs():
+    """
+    Временно отключает вывод логов в консоль (StreamHandler на stdout/stderr),
+    при этом FileHandler продолжает писать в файл.
+    """
+    root = logging.getLogger()
+    saved_levels = []
+
+    for h in root.handlers:
+        if isinstance(h, logging.StreamHandler) and getattr(h, "stream", None) in (sys.stdout, sys.stderr):
+            saved_levels.append((h, h.level))
+            h.setLevel(logging.CRITICAL + 1)  # выше CRITICAL, чтобы ничего не проходило
+
+    try:
+        yield
+    finally:
+        for h, lvl in saved_levels:
+            h.setLevel(lvl)
+
+
 def load_vkr_text(path: str) -> str:
     logger = logging.getLogger(__name__)
 
@@ -82,6 +103,11 @@ def parse_args() -> argparse.Namespace:
         default="vkr_examples/VKR1.docx",
         help="Путь к .docx файлу с текстом ВКР (по умолчанию: vkr_examples/VKR1.docx)",
     )
+    parser.add_argument(
+        "--no-overflow-logs",
+        action="store_true",
+        help="Отключить вывод логов в консоль во время печати вопросов/результатов (логи в файл сохраняются).",
+    )
     return parser.parse_args()
 
 
@@ -123,39 +149,42 @@ def main():
     ok_count = 0
     fail_count = 0
 
-    with timed(logger, "validate_all_questions", total=len(questions)):
-        for idx, q in enumerate(questions, start=1):
-            # маркер-разделитель (ваш текстовый разделитель)
-            if q.strip().startswith("---"):
-                logger.info("Separator encountered at %d: %s", idx, q.strip())
-                print(f"\n{q}")
-                continue
-
-            with timed(logger, "validate_question", index=idx):
-                with timed(logger, "check_relevance", index=idx):
-                    rel = validator.check_relevance(q)
-                with timed(logger, "check_clarity", index=idx):
-                    clr = validator.check_clarity(q)
-                with timed(logger, "check_difficulty", index=idx):
-                    diff = validator.check_difficulty(q)
-
-            passed = (int(rel) + int(clr) + int(diff) >= 2)
-            status = "✔ OK" if passed else "✖ FAIL"
-
-            if passed:
-                ok_count += 1
-            else:
-                fail_count += 1
-
-            logger.info(
-                "Question %d status=%s rel=%s clr=%s diff=%s text=%r",
-                idx, ("OK" if passed else "FAIL"), rel, clr, diff, q
-            )
-
-            print(f"\n[{status}] {q}")
-            print(f"  - relevance: {rel}")
-            print(f"  - clarity:   {clr}")
-            print(f"  - difficulty:{diff}")
+    quiet_ctx = suppress_console_logs() if args.no_overflow_logs else nullcontext()
+
+    with quiet_ctx:
+        with timed(logger, "validate_all_questions", total=len(questions)):
+            for idx, q in enumerate(questions, start=1):
+                # маркер-разделитель (ваш текстовый разделитель)
+                if q.strip().startswith("---"):
+                    logger.info("Separator encountered at %d: %s", idx, q.strip())
+                    print(f"\n{q}")
+                    continue
+
+                with timed(logger, "validate_question", index=idx):
+                    with timed(logger, "check_relevance", index=idx):
+                        rel = validator.check_relevance(q)
+                    with timed(logger, "check_clarity", index=idx):
+                        clr = validator.check_clarity(q)
+                    with timed(logger, "check_difficulty", index=idx):
+                        diff = validator.check_difficulty(q)
+
+                passed = (int(rel) + int(clr) + int(diff) >= 2)
+                status = "✔ OK" if passed else "✖ FAIL"
+
+                if passed:
+                    ok_count += 1
+                else:
+                    fail_count += 1
+
+                logger.info(
+                    "Question %d status=%s rel=%s clr=%s diff=%s text=%r",
+                    idx, ("OK" if passed else "FAIL"), rel, clr, diff, q
+                )
+
+                print(f"\n[{status}] {q}")
+                print(f"  - relevance: {rel}")
+                print(f"  - clarity:   {clr}")
+                print(f"  - difficulty:{diff}")
 
     logger.info("Validation summary: ok=%d fail=%d total=%d", ok_count, fail_count, len(questions))
     logger.info("=== RUN END ===")

From 39f7626ba0ec971890225f39749133f5a263d812 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Mon, 5 Jan 2026 23:54:53 +0300
Subject: [PATCH 13/17] docker fix (builds approx. 40 mins)

---
 app/questions_generator/Dockerfile            | 23 +++++--------------
 app/questions_generator/docker-compose.yml    | 12 ++++------
 .../docker/init-volumes.sh                    | 23 +------------------
 app/questions_generator/requirements.txt      |  1 +
 app/questions_generator/run.py                |  1 +
 5 files changed, 14 insertions(+), 46 deletions(-)

diff --git a/app/questions_generator/Dockerfile b/app/questions_generator/Dockerfile
index 3c53a21..a7b8201 100644
--- a/app/questions_generator/Dockerfile
+++ b/app/questions_generator/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10-slim AS base
+FROM python:3.10-slim
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         git wget gcc g++ \
@@ -7,26 +7,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 WORKDIR /app
 
-COPY requirements.txt .
-
-# можно (и полезно) задать глобально:
 ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
     PIP_DEFAULT_TIMEOUT=120
 
-RUN pip install --no-cache-dir torch==2.5.1
-RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install --no-cache-dir "huggingface_hub[cli]"
-
-ENV NLTK_DATA=/nltk_data
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt \
+        "huggingface_hub[cli]"
 
 COPY . .
 
-# ====== runtime image ======
-FROM base AS app
-CMD ["bash", "-lc", "sleep infinity"]
+COPY --chmod=755 docker/init-volumes.sh /usr/local/bin/init-volumes.sh
 
-# ====== init image ======
-FROM base AS init
-COPY docker/init-volumes.sh /usr/local/bin/init-volumes.sh
-RUN chmod +x /usr/local/bin/init-volumes.sh
-ENTRYPOINT ["/usr/local/bin/init-volumes.sh"]
+CMD ["bash"]
diff --git a/app/questions_generator/docker-compose.yml b/app/questions_generator/docker-compose.yml
index 27f353b..276d88d 100644
--- a/app/questions_generator/docker-compose.yml
+++ b/app/questions_generator/docker-compose.yml
@@ -1,17 +1,14 @@
 services:
   init:
-    build:
-      context: .
-      target: init
+    build: .
+    entrypoint: ["/usr/local/bin/init-volumes.sh"]
     volumes:
       - rut5_model:/app/question_generator/rut5-base
       - nltk_data:/nltk_data
     restart: "no"
 
   app:
-    build:
-      context: .
-      target: app
+    build: .
     depends_on:
       init:
         condition: service_completed_successfully
@@ -20,7 +17,8 @@ services:
     volumes:
       - rut5_model:/app/question_generator/rut5-base
       - nltk_data:/nltk_data
-      - ./vkr_examples:/app/vkr_examples  # монтируется для интерактивного запуска с файлами из этой папки (папка рядом с композом)
+      - ./vkr_examples:/app/vkr_examples
+    command: ["bash", "-lc", "sleep infinity"]
 
 volumes:
   rut5_model:
diff --git a/app/questions_generator/docker/init-volumes.sh b/app/questions_generator/docker/init-volumes.sh
index 2cf47f8..e528d44 100644
--- a/app/questions_generator/docker/init-volumes.sh
+++ b/app/questions_generator/docker/init-volumes.sh
@@ -2,14 +2,11 @@
 set -e
 
 MODEL_DIR="/app/question_generator/rut5-base"
-NLTK_DIR="${NLTK_DATA:-/nltk_data}"
 
 echo "MODEL_DIR=${MODEL_DIR}"
-echo "NLTK_DIR=${NLTK_DIR}"
 
-mkdir -p "$MODEL_DIR" "$NLTK_DIR"
+mkdir -p "$MODEL_DIR"
 
-# 1) ruT5 model (один раз)
 if [ -z "$(ls -A "$MODEL_DIR" 2>/dev/null)" ]; then
   echo "Model directory is empty. Downloading model to $MODEL_DIR..."
   huggingface-cli download \
@@ -20,21 +17,3 @@ if [ -z "$(ls -A "$MODEL_DIR" 2>/dev/null)" ]; then
 else
   echo "Model directory is not empty, skipping download."
 fi
-
-# 2) NLTK data (один раз)
-if [ -z "$(ls -A "$NLTK_DIR" 2>/dev/null)" ]; then
-  echo "NLTK data directory is empty. Downloading 'punkt' and 'stopwords' to $NLTK_DIR..."
-  python - <<'PY'
-import os
-import nltk
-
-nltk_dir = os.environ.get("NLTK_DATA", "/nltk_data")
-nltk.data.path = [nltk_dir] + nltk.data.path
-
-nltk.download("punkt", download_dir=nltk_dir)
-nltk.download("stopwords", download_dir=nltk_dir)
-PY
-  echo "NLTK data downloaded."
-else
-  echo "NLTK data directory is not empty, skipping download."
-fi
diff --git a/app/questions_generator/requirements.txt b/app/questions_generator/requirements.txt
index 191cdc8..0c59711 100644
--- a/app/questions_generator/requirements.txt
+++ b/app/questions_generator/requirements.txt
@@ -3,3 +3,4 @@ sentencepiece==0.2.1
 nltk==3.9.2
 huggingface_hub==0.36.0
 python-docx==1.2.0
+torch==2.5.1
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index d52d054..e0a5b83 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -127,6 +127,7 @@ def main():
             logger.info("NLTK punkt_tab not found. Downloading...")
             print("Загрузка необходимых данных NLTK...")
             nltk.download("punkt_tab")
+            nltk.download("stopwords")
 
     print(f"=== Загрузка текста ВКР из '{vkr_path}' ===")
     text = load_vkr_text(vkr_path)

From bee9a7a3178ccc0153180ab5c0e0b7eba22e3c85 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Fri, 9 Jan 2026 21:39:24 +0300
Subject: [PATCH 14/17] fixed heuristic questions generation

---
 app/questions_generator/generator.py          | 90 ++++++++-----------
 .../heuristic_questions.csv                   |  8 ++
 2 files changed, 44 insertions(+), 54 deletions(-)
 create mode 100644 app/questions_generator/heuristic_questions.csv

diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index 386c5d3..c4fae49 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -3,6 +3,8 @@
 import time
 from contextlib import contextmanager
 from typing import List, Dict
+import csv
+from pathlib import Path
 
 from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
@@ -33,7 +35,11 @@ class VkrQuestionGenerator:
         "Приложения": r"Приложения.*?(?=\n[A-ZА-Я][^\n]*\n)",
     }
 
-    def __init__(self, vkr_text: str, model_path: str = "ai-forever/rut5-base-multitask"):
+    def __init__(self,
+                 vkr_text: str,
+                 model_path: str = "ai-forever/rut5-base-multitask",
+                 heuristic_csv_path: str = "heuristic_questions.csv"):
+
         self.logger = logging.getLogger(__name__)
 
         with timed(self.logger, "generator_init"):
@@ -52,15 +58,27 @@ def __init__(self, vkr_text: str, model_path: str = "ai-forever/rut5-base-multit
             with timed(self.logger, "load_model", model_path=model_path):
                 self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
 
+            with timed(self.logger, "load_heuristic_questions", path=heuristic_csv_path):
+                self.heuristic_templates: List[Dict[str, str]] = []
+                with Path(heuristic_csv_path).open(encoding="utf-8") as f:
+                    reader = csv.DictReader(f, delimiter="|")
+                    for row in reader:
+                        self.heuristic_templates.append(row)
+
         self.logger.info(
             "Generator ready: sentences=%d stopwords=%d model_path=%s",
             len(self.sentences), len(self.stopwords), model_path
         )
 
     def extract_section(self, title: str) -> str:
-        """Извлекает раздел по шаблону заголовка."""
-        pattern = self.SECTION_PATTERNS.get(title, rf"{title}.*?(?=\n[A-ZА-Я][^\n]*\n)")
-        m = re.search(pattern, self.vkr_text, re.DOTALL | re.IGNORECASE)
+        """Извлекает раздел по заголовку (устойчиво к нумерации и регистру)."""
+        pattern = rf"""
+            (?im)
+            ^\s*(\d+(\.\d+)*\.?\s*)?{re.escape(title)}\s*$
+            (.*?)
+            (?=^\s*(\d+(\.\d+)*\.?\s*[А-ЯA-Z]|$\Z))
+        """
+        m = re.search(pattern, self.vkr_text, re.DOTALL | re.VERBOSE)
         return m.group(0) if m else ""
 
     def extract_intro(self) -> str:
@@ -95,59 +113,23 @@ def llm_generate_question(self, text_fragment: str) -> str:
         return decoded
 
     def heuristic_questions(self) -> List[str]:
-        """Эвристики, завязанные на структуру ВКР."""
+        """Эвристики, завязанные на структуру ВКР (загружаются из CSV)."""
         with timed(self.logger, "heuristic_questions_total"):
-            intro = self.extract_intro()
-            overview = self.extract_section("Обзор предметной области")
-            objectives = self.extract_section("Постановка задачи")
-            method = self.extract_section("Метод решения")
-            research = self.extract_section("Исследования")
-            conc = self.extract_conclusion()
-            apps = self.extract_section("Приложения")
-
             q: List[str] = []
 
-            # Введение ↔ Заключение
-            if intro and conc:
-                q.append(
-                    "Как цель и задачи, сформулированные во введении, отражены в итоговых выводах заключения?"
-                )
-
-            # Обзор предметной области
-            if overview:
-                q.append(
-                    "Какие термины и подходы из обзора предметной области легли в основу формальной постановки задачи?"
-                )
-
-            # Постановка задачи
-            if objectives:
-                q.append(
-                    "В каких требованиях к решению, указанных в постановке задачи, находят отражение цели работы?"
-                )
-
-            # Метод решения
-            if method:
-                q.append(
-                    "Как архитектура и алгоритмы, описанные в разделе «Метод решения», обеспечивают достижение поставленных требований?"
-                )
-
-            # Исследования
-            if research:
-                q.append(
-                    "Какие количественные или качественные свойства решения подтверждены в разделе «Исследования» и как они связаны с задачами введения?"
-                )
-
-            # Приложения
-            if apps:
-                q.append(
-                    "Какие дополнительные материалы из приложений необходимы для проверки воспроизводимости результатов?"
-                )
-
-            # Обязательные общие вопросы
-            q.extend([
-                "Как практическая значимость работы следует из задач и результатов исследования?",
-                "Какие ограничения метода решения указаны в тексте и как они влияют на достижение цели?",
-            ])
+            for item in self.heuristic_templates:
+                sections = item["section"]
+                question = item["question"]
+
+                # пустой sections == обязательный общий вопрос
+                if not sections:
+                    q.append(question)
+                    continue
+                # for x in sections.split(','):
+                #     a = self.extract_section(x)
+                #     self.logger.info(x, a, question, sections)
+                if all([self.extract_section(x) for x in sections.split(",")]):  # если нет всех нужных секций для вопроса, то не добавляем его
+                    q.append(question)
 
         self.logger.info("Heuristic questions created: %d", len(q))
         return q
diff --git a/app/questions_generator/heuristic_questions.csv b/app/questions_generator/heuristic_questions.csv
new file mode 100644
index 0000000..ad180a0
--- /dev/null
+++ b/app/questions_generator/heuristic_questions.csv
@@ -0,0 +1,8 @@
+section|question
+Введение,Заключение|Как цель и задачи, сформулированные во введении, отражены в итоговых выводах заключения?
+Обзор предметной области|Какие термины и подходы из обзора предметной области легли в основу формальной постановки задачи?
+Постановка задачи|В каких требованиях к решению, указанных в постановке задачи, находят отражение цели работы?
+Метод решения|Как архитектура и алгоритмы, описанные в разделе «Метод решения», обеспечивают достижение поставленных требований?
+Исследования|Какие количественные или качественные свойства решения подтверждены в разделе «Исследования» и как они связаны с задачами введения?
+|Как практическая значимость работы следует из задач и результатов исследования?
+|Какие ограничения метода решения указаны в тексте и как они влияют на достижение цели?

From a16784bae5e984fb25a104852eb66c1cb287a2c5 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Sat, 10 Jan 2026 17:58:29 +0300
Subject: [PATCH 15/17] clearing

---
 app/questions_generator/Dockerfile            |  2 +-
 app/questions_generator/README.md             | 17 ++--------------
 app/questions_generator/generator.py          | 20 +------------------
 .../{docker => }/init-volumes.sh              |  0
 4 files changed, 4 insertions(+), 35 deletions(-)
 rename app/questions_generator/{docker => }/init-volumes.sh (100%)

diff --git a/app/questions_generator/Dockerfile b/app/questions_generator/Dockerfile
index a7b8201..8a21aeb 100644
--- a/app/questions_generator/Dockerfile
+++ b/app/questions_generator/Dockerfile
@@ -16,6 +16,6 @@ RUN pip install --no-cache-dir -r requirements.txt \
 
 COPY . .
 
-COPY --chmod=755 docker/init-volumes.sh /usr/local/bin/init-volumes.sh
+COPY --chmod=755 init-volumes.sh /usr/local/bin/init-volumes.sh
 
 CMD ["bash"]
diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index b07802c..cfb3767 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -1,5 +1,5 @@
 ## Запуск (контейнер вечно крутится)
-`docker-compose up` - ВАЖНО: Первый раз ОЧЕНЬ ДОЛГО билдится (30-40 минут)!!!
+`docker-compose up` - ВАЖНО: Первый раз ОЧЕНЬ ДОЛГО билдится (30-40 минут)
 
 ## Использование (интерактивное)
 `docker compose exec app python run.py /app/vkr_examples/VKR1.docx --no-overflow-logs` - папка `vkr_examples` локальная, лежит рядом с композом
@@ -21,16 +21,6 @@
   - clarity:   True
   - difficulty:False
 
-[✖ FAIL] Какие количественные или качественные свойства решения подтверждены в разделе «Исследования» и как они связаны с задачами введения?
-  - relevance: True
-  - clarity:   False
-  - difficulty:False
-
-[✔ OK] Какие дополнительные материалы из приложений необходимы для проверки воспроизводимости результатов?
-  - relevance: True
-  - clarity:   True
-  - difficulty:False
-
 [✔ OK] Как практическая значимость работы следует из задач и результатов исследования?
   - relevance: True
   - clarity:   True
@@ -41,10 +31,7 @@
   - clarity:   True
   - difficulty:False
 
-[✖ FAIL] --- rut5-base-multitask вопросы ---
-  - relevance: False
-  - clarity:   False
-  - difficulty:False
+--- rut5-base-multitask вопросы ---
 
 [✖ FAIL] Что такое ЛЭТИ?
   - relevance: False
diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index c4fae49..cc85bf8 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -25,16 +25,6 @@ def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **e
 class VkrQuestionGenerator:
     """Гибридный генератор вопросов по ВКР: NLTK + rut5-base-multitask."""
 
-    SECTION_PATTERNS: Dict[str, str] = {
-        "Введение": r"Введение.*?(?=\n[A-ZА-Я][^\n]*\n)",
-        "Обзор предметной области": r"Обзор предметной области.*?(?=\n[A-ZА-Я][^\n]*\n)",
-        "Постановка задачи": r"Постановка задачи.*?(?=\n[A-ZА-Я][^\n]*\n)",
-        "Метод решения": r"Метод решения.*?(?=\n[A-ZА-Я][^\n]*\n)",
-        "Исследования": r"Исследования.*?(?=\n[A-ZА-Я][^\n]*\n)",
-        "Заключение": r"Заключение.*?(?=\n[A-ZА-Я][^\n]*\n)",
-        "Приложения": r"Приложения.*?(?=\n[A-ZА-Я][^\n]*\n)",
-    }
-
     def __init__(self,
                  vkr_text: str,
                  model_path: str = "ai-forever/rut5-base-multitask",
@@ -81,12 +71,6 @@ def extract_section(self, title: str) -> str:
         m = re.search(pattern, self.vkr_text, re.DOTALL | re.VERBOSE)
         return m.group(0) if m else ""
 
-    def extract_intro(self) -> str:
-        return self.extract_section("Введение")
-
-    def extract_conclusion(self) -> str:
-        return self.extract_section("Заключение")
-
     def extract_keywords(self, text: str) -> List[str]:
         """Извлекает ключевые слова из текста."""
         with timed(self.logger, "extract_keywords", text_len=len(text)):
@@ -125,9 +109,7 @@ def heuristic_questions(self) -> List[str]:
                 if not sections:
                     q.append(question)
                     continue
-                # for x in sections.split(','):
-                #     a = self.extract_section(x)
-                #     self.logger.info(x, a, question, sections)
+
                 if all([self.extract_section(x) for x in sections.split(",")]):  # если нет всех нужных секций для вопроса, то не добавляем его
                     q.append(question)
 
diff --git a/app/questions_generator/docker/init-volumes.sh b/app/questions_generator/init-volumes.sh
similarity index 100%
rename from app/questions_generator/docker/init-volumes.sh
rename to app/questions_generator/init-volumes.sh

From c2df6e4c74c60301a9d281fc6d40a83070928675 Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Sat, 10 Jan 2026 18:08:08 +0300
Subject: [PATCH 16/17] created static folder

---
 .gitignore                                                   | 2 +-
 app/questions_generator/README.md                            | 2 +-
 app/questions_generator/docker-compose.yml                   | 2 +-
 app/questions_generator/generator.py                         | 4 ++--
 app/questions_generator/run.py                               | 2 +-
 app/questions_generator/run_docker.py                        | 2 +-
 app/questions_generator/{ => static}/heuristic_questions.csv | 0
 7 files changed, 7 insertions(+), 7 deletions(-)
 rename app/questions_generator/{ => static}/heuristic_questions.csv (100%)

diff --git a/.gitignore b/.gitignore
index 06067be..d15ce74 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,5 +6,5 @@ __pycache__
 /VERSION.json
 .env
 /whisper_asr_model_cache
-/app/questions_generator/vkr_examples/
+/app/questions_generator/static/vkr_examples/
 /app/questions_generator/rut5-base/
diff --git a/app/questions_generator/README.md b/app/questions_generator/README.md
index cfb3767..ad5d787 100644
--- a/app/questions_generator/README.md
+++ b/app/questions_generator/README.md
@@ -2,7 +2,7 @@
 `docker-compose up` - ВАЖНО: Первый раз ОЧЕНЬ ДОЛГО билдится (30-40 минут)
 
 ## Использование (интерактивное)
-`docker compose exec app python run.py /app/vkr_examples/VKR1.docx --no-overflow-logs` - папка `vkr_examples` локальная, лежит рядом с композом
+`docker compose exec app python run.py /app/static/vkr_examples/VKR1.docx --no-overflow-logs` - папка `vkr_examples` локальная
 
 ## Пример сгенерированных вопросов по тексту ВКР
 
diff --git a/app/questions_generator/docker-compose.yml b/app/questions_generator/docker-compose.yml
index 276d88d..3705ce4 100644
--- a/app/questions_generator/docker-compose.yml
+++ b/app/questions_generator/docker-compose.yml
@@ -17,7 +17,7 @@ services:
     volumes:
       - rut5_model:/app/question_generator/rut5-base
       - nltk_data:/nltk_data
-      - ./vkr_examples:/app/vkr_examples
+      - ./static/vkr_examples:/app/static/vkr_examples
     command: ["bash", "-lc", "sleep infinity"]
 
 volumes:
diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index cc85bf8..336495f 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -27,8 +27,8 @@ class VkrQuestionGenerator:
 
     def __init__(self,
                  vkr_text: str,
-                 model_path: str = "ai-forever/rut5-base-multitask",
-                 heuristic_csv_path: str = "heuristic_questions.csv"):
+                 model_path: str = "/app/question_generator/rut5-base",
+                 heuristic_csv_path: str = "static/heuristic_questions.csv"):
 
         self.logger = logging.getLogger(__name__)
 
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index e0a5b83..3228750 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -134,7 +134,7 @@ def main():
 
     print("=== Инициализация генератора ===")
     with timed(logger, "init_generator"):
-        gen = VkrQuestionGenerator(text, model_path="/app/question_generator/rut5-base")
+        gen = VkrQuestionGenerator(text, model_path="/app/question_generator/rut5-base", heuristic_csv_path="static/heuristic_questions.csv")
 
     print("=== Инициализация валидатора ===")
     with timed(logger, "init_validator"):
diff --git a/app/questions_generator/run_docker.py b/app/questions_generator/run_docker.py
index 30c986e..1b066be 100644
--- a/app/questions_generator/run_docker.py
+++ b/app/questions_generator/run_docker.py
@@ -21,7 +21,7 @@ def main():
         sys.exit(1)
 
     # Путь внутри контейнера — фиксированный, один и тот же для всех ОС
-    container_path = "/app/questions_generator/vkr_examples/vkr.docx"
+    container_path = "/app/questions_generator/static/vkr_examples/vkr.docx"
 
     cmd = [
         "docker", "run", "-it", "--rm",
diff --git a/app/questions_generator/heuristic_questions.csv b/app/questions_generator/static/heuristic_questions.csv
similarity index 100%
rename from app/questions_generator/heuristic_questions.csv
rename to app/questions_generator/static/heuristic_questions.csv

From 666535d860aa8a69f850ae7422bd8cbb28f29c5c Mon Sep 17 00:00:00 2001
From: kiyro7 <92889789+kiyro7@users.noreply.github.com>
Date: Sat, 10 Jan 2026 18:50:54 +0300
Subject: [PATCH 17/17] full logs refactor and translation to russian

---
 app/questions_generator/generator.py     | 170 ++++++++++--------
 app/questions_generator/init-volumes.sh  |   6 +-
 app/questions_generator/logging_utils.py |  82 +++++++++
 app/questions_generator/run.py           | 216 +++++++----------------
 app/questions_generator/run_docker.py    |  35 ++--
 app/questions_generator/validator.py     | 162 ++++++++---------
 6 files changed, 338 insertions(+), 333 deletions(-)
 create mode 100644 app/questions_generator/logging_utils.py

diff --git a/app/questions_generator/generator.py b/app/questions_generator/generator.py
index 336495f..3c9fb7a 100644
--- a/app/questions_generator/generator.py
+++ b/app/questions_generator/generator.py
@@ -1,91 +1,90 @@
 import re
 import logging
-import time
-from contextlib import contextmanager
-from typing import List, Dict
 import csv
 from pathlib import Path
+from typing import List, Dict
 
 from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
-
-@contextmanager
-def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **extra):
-    start = time.perf_counter()
-    logger.log(level, "START %s %s", operation, (extra if extra else ""))
-    try:
-        yield
-    finally:
-        elapsed_ms = (time.perf_counter() - start) * 1000.0
-        logger.log(level, "END   %s | %.2f ms %s", operation, elapsed_ms, (extra if extra else ""))
+from logging_utils import log_timed
 
 
 class VkrQuestionGenerator:
     """Гибридный генератор вопросов по ВКР: NLTK + rut5-base-multitask."""
 
-    def __init__(self,
-                 vkr_text: str,
-                 model_path: str = "/app/question_generator/rut5-base",
-                 heuristic_csv_path: str = "static/heuristic_questions.csv"):
-
+    def __init__(
+        self,
+        vkr_text: str,
+        model_path: str = "/app/question_generator/rut5-base",
+        heuristic_csv_path: str = "static/heuristic_questions.csv",
+    ):
         self.logger = logging.getLogger(__name__)
 
-        with timed(self.logger, "generator_init"):
+        with log_timed(self.logger, "инициализация генератора"):
             self.vkr_text = vkr_text
 
-            with timed(self.logger, "sent_tokenize"):
+            with log_timed(self.logger, "токенизация предложений"):
                 self.sentences = sent_tokenize(vkr_text)
 
-            with timed(self.logger, "load_stopwords"):
+            with log_timed(self.logger, "загрузка стоп-слов"):
                 self.stopwords = set(stopwords.words("russian"))
 
-            # Модель rut5-base-multitask для языкового оформления вопросов
-            with timed(self.logger, "load_tokenizer", model_path=model_path):
-                self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+            with log_timed(self.logger, "загрузка токенизатора", путь=model_path):
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    model_path, use_fast=False
+                )
 
-            with timed(self.logger, "load_model", model_path=model_path):
+            with log_timed(self.logger, "загрузка модели", путь=model_path):
                 self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
 
-            with timed(self.logger, "load_heuristic_questions", path=heuristic_csv_path):
+            with log_timed(
+                self.logger,
+                "загрузка эвристических вопросов",
+                путь=heuristic_csv_path,
+            ):
                 self.heuristic_templates: List[Dict[str, str]] = []
                 with Path(heuristic_csv_path).open(encoding="utf-8") as f:
                     reader = csv.DictReader(f, delimiter="|")
-                    for row in reader:
-                        self.heuristic_templates.append(row)
+                    self.heuristic_templates.extend(reader)
 
         self.logger.info(
-            "Generator ready: sentences=%d stopwords=%d model_path=%s",
-            len(self.sentences), len(self.stopwords), model_path
+            "Генератор готов: предложений=%d стоп-слов=%d модель=%s",
+            len(self.sentences),
+            len(self.stopwords),
+            model_path,
         )
 
     def extract_section(self, title: str) -> str:
-        """Извлекает раздел по заголовку (устойчиво к нумерации и регистру)."""
         pattern = rf"""
             (?im)
             ^\s*(\d+(\.\d+)*\.?\s*)?{re.escape(title)}\s*$
             (.*?)
             (?=^\s*(\d+(\.\d+)*\.?\s*[А-ЯA-Z]|$\Z))
         """
-        m = re.search(pattern, self.vkr_text, re.DOTALL | re.VERBOSE)
-        return m.group(0) if m else ""
+        match = re.search(pattern, self.vkr_text, re.DOTALL | re.VERBOSE)
+        return match.group(0) if match else ""
 
     def extract_keywords(self, text: str) -> List[str]:
-        """Извлекает ключевые слова из текста."""
-        with timed(self.logger, "extract_keywords", text_len=len(text)):
+        with log_timed(self.logger, "извлечение ключевых слов", длина=len(text)):
             tokens = word_tokenize(text.lower())
             result = [
                 t for t in tokens
                 if t.isalnum() and t not in self.stopwords and len(t) > 4
             ]
-        self.logger.info("Keywords extracted: %d", len(result))
+
+        self.logger.info("Ключевые слова извлечены: %d", len(result))
         return result
 
     def llm_generate_question(self, text_fragment: str) -> str:
-        """Генерирует формулировку вопроса через rut5 ask."""
         prompt = f"ask: {text_fragment}"
-        with timed(self.logger, "llm_generate_question", fragment_len=len(text_fragment)):
+
+        with log_timed(
+            self.logger,
+            "генерация вопроса LLM",
+            длина_фрагмента=len(text_fragment),
+        ):
             enc = self.tokenizer(prompt, return_tensors="pt", truncation=True)
             out = self.model.generate(
                 **enc,
@@ -94,74 +93,99 @@ def llm_generate_question(self, text_fragment: str) -> str:
                 early_stopping=True,
             )
             decoded = self.tokenizer.decode(out[0], skip_special_tokens=True)
+
         return decoded
 
     def heuristic_questions(self) -> List[str]:
-        """Эвристики, завязанные на структуру ВКР (загружаются из CSV)."""
-        with timed(self.logger, "heuristic_questions_total"):
-            q: List[str] = []
+        with log_timed(self.logger, "эвристическая генерация вопросов"):
+            questions: List[str] = []
 
             for item in self.heuristic_templates:
                 sections = item["section"]
                 question = item["question"]
 
-                # пустой sections == обязательный общий вопрос
                 if not sections:
-                    q.append(question)
+                    questions.append(question)
                     continue
 
-                if all([self.extract_section(x) for x in sections.split(",")]):  # если нет всех нужных секций для вопроса, то не добавляем его
-                    q.append(question)
+                if all(self.extract_section(x) for x in sections.split(",")):
+                    questions.append(question)
 
-        self.logger.info("Heuristic questions created: %d", len(q))
-        return q
+        self.logger.info(
+            "Эвристические вопросы сформированы: %d",
+            len(questions),
+        )
+        return questions
 
     def generate_llm_questions(self, count: int = 5) -> List[str]:
-        """Генерирует N вопросов через rut5 по ключевым фрагментам документа."""
-        q: List[str] = []
+        questions: List[str] = []
         fragments = self.sentences[:40]
         step = max(1, len(fragments) // count)
 
-        self.logger.info("LLM generation setup: count=%d fragments=%d step=%d", count, len(fragments), step)
+        self.logger.info(
+            "Настройка LLM: требуется=%d фрагментов=%d шаг=%d",
+            count,
+            len(fragments),
+            step,
+        )
 
-        with timed(self.logger, "generate_llm_questions_total", count=count):
+        with log_timed(self.logger, "LLM генерация всех вопросов", количество=count):
             for i in range(0, len(fragments), step):
-                frag = fragments[i]
+                fragment = fragments[i]
                 try:
-                    # Требование: для ИИ — логгировать время каждого вопроса
-                    with timed(self.logger, "llm_question_item", fragment_index=i):
-                        llm_q = self.llm_generate_question(frag)
+                    with log_timed(
+                        self.logger,
+                        "LLM вопрос",
+                        индекс=i,
+                    ):
+                        llm_q = self.llm_generate_question(fragment)
 
                     if len(llm_q) > 10:
-                        q.append(llm_q)
-                        self.logger.info("LLM question accepted: idx=%d len=%d", len(q), len(llm_q))
+                        questions.append(llm_q)
+                        self.logger.info(
+                            "LLM вопрос принят: номер=%d длина=%d",
+                            len(questions),
+                            len(llm_q),
+                        )
                     else:
-                        self.logger.info("LLM question rejected (too short): len=%d", len(llm_q))
-
-                except Exception as e:  # noqa: BLE001
-                    self.logger.exception("LLM generation failed at fragment_index=%d: %s", i, e)
-                    continue
-
-                if len(q) >= count:
+                        self.logger.info(
+                            "LLM вопрос отклонён (слишком короткий): длина=%d",
+                            len(llm_q),
+                        )
+
+                except Exception as exc:  # noqa: BLE001
+                    self.logger.exception(
+                        "Ошибка генерации LLM вопроса: индекс=%d ошибка=%s",
+                        i,
+                        exc,
+                    )
+
+                if len(questions) >= count:
                     break
 
-        self.logger.info("LLM questions created: %d", len(q))
-        return q
+        self.logger.info(
+            "LLM вопросы сформированы: %d",
+            len(questions),
+        )
+        return questions
 
     def generate_all(self) -> List[str]:
-        """Генерирует полный набор вопросов: эвристики + LLM."""
-        with timed(self.logger, "generate_all_total"):
+        with log_timed(self.logger, "полная генерация вопросов"):
             result: List[str] = []
-            # Требование: для эвристической генерации можно время создания всех вопросов
-            with timed(self.logger, "generate_heuristic_block"):
+
+            with log_timed(self.logger, "эвристический блок"):
                 result.extend(self.heuristic_questions())
 
-            result.extend(["--- rut5-base-multitask вопросы ---"])
+            result.append("--- rut5-base-multitask вопросы ---")
 
-            with timed(self.logger, "generate_llm_block"):
+            with log_timed(self.logger, "LLM блок"):
                 result.extend(self.generate_llm_questions(count=10))
 
             deduped = list(dict.fromkeys(result))
 
-        self.logger.info("generate_all done: raw=%d deduped=%d", len(result), len(deduped))
+        self.logger.info(
+            "Генерация завершена: всего=%d уникальных=%d",
+            len(result),
+            len(deduped),
+        )
         return deduped
diff --git a/app/questions_generator/init-volumes.sh b/app/questions_generator/init-volumes.sh
index e528d44..cd348fd 100644
--- a/app/questions_generator/init-volumes.sh
+++ b/app/questions_generator/init-volumes.sh
@@ -8,12 +8,12 @@ echo "MODEL_DIR=${MODEL_DIR}"
 mkdir -p "$MODEL_DIR"
 
 if [ -z "$(ls -A "$MODEL_DIR" 2>/dev/null)" ]; then
-  echo "Model directory is empty. Downloading model to $MODEL_DIR..."
+  echo "Не видно модельки rut5-base, грузим в папку $MODEL_DIR..."
   huggingface-cli download \
     cointegrated/rut5-base-multitask \
     --local-dir "$MODEL_DIR" \
     --local-dir-use-symlinks False
-  echo "Model downloaded."
+  echo "Загрузили"
 else
-  echo "Model directory is not empty, skipping download."
+  echo "В директории модельки что-то есть, не будем ещё раз загружать"
 fi
diff --git a/app/questions_generator/logging_utils.py b/app/questions_generator/logging_utils.py
new file mode 100644
index 0000000..0c5d1ec
--- /dev/null
+++ b/app/questions_generator/logging_utils.py
@@ -0,0 +1,82 @@
+import logging
+import os
+import sys
+import time
+from contextlib import contextmanager
+from typing import Optional
+
+DEFAULT_LOG_PATH = os.environ.get(
+    "VKR_LOG_PATH",
+    "logs/vkr_question_generator.log"
+)
+
+
+def setup_logging(log_path: Optional[str] = None) -> None:
+    """Attach a UTF-8 file handler and a stdout handler to the root logger, at most once."""
+    path = log_path or DEFAULT_LOG_PATH
+    # A bare filename has an empty dirname, and os.makedirs("") raises FileNotFoundError.
+    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+
+    root = logging.getLogger()
+    root.setLevel(logging.INFO)
+    if root.handlers:
+        return
+
+    formatter = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)s | %(name)s:%(lineno)d | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    file_handler = logging.FileHandler(path, encoding="utf-8")
+    file_handler.setFormatter(formatter)
+    file_handler.setLevel(logging.INFO)
+
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(formatter)
+    console_handler.setLevel(logging.INFO)
+    root.addHandler(file_handler)
+    root.addHandler(console_handler)
+
+
+@contextmanager
+def log_timed(
+    logger: logging.Logger,
+    operation: str,
+    level: int = logging.INFO,
+    **extra,
+):
+    start = time.perf_counter()
+    logger.log(
+        level,
+        "Начало операции: %s %s",
+        operation,
+        extra if extra else "",
+    )
+    try:
+        yield
+    finally:
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        logger.log(
+            level,
+            "Завершение операции: %s | %.2f мс %s",
+            operation,
+            elapsed_ms,
+            extra if extra else "",
+        )
+
+
+@contextmanager
+def suppress_console_logs():
+    """Temporarily silence console handlers on the root logger; file handlers keep logging."""
+    saved = []
+    for h in logging.getLogger().handlers:
+        # FileHandler subclasses StreamHandler, so it has to be excluded explicitly.
+        if isinstance(h, logging.FileHandler) or not isinstance(h, logging.StreamHandler):
+            continue
+        saved.append((h, h.level))
+        h.setLevel(logging.CRITICAL + 1)
+    try:
+        yield
+    finally:
+        for h, lvl in saved:
+            h.setLevel(lvl)
diff --git a/app/questions_generator/run.py b/app/questions_generator/run.py
index 3228750..67e047c 100644
--- a/app/questions_generator/run.py
+++ b/app/questions_generator/run.py
@@ -1,196 +1,100 @@
-import sys
-import os
 import argparse
 import logging
-import time
-from contextlib import contextmanager, nullcontext
+import os
+import sys
+from contextlib import nullcontext
 
-from docx import Document
 import nltk
+from docx import Document
 
 from generator import VkrQuestionGenerator
 from validator import VkrQuestionValidator
-
-
-LOG_PATH = os.environ.get("VKR_LOG_PATH", "logs/vkr_question_generator.log")
-
-
-def setup_logging() -> None:
-    os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
-
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    # чтобы не дублировать хендлеры при повторном запуске в том же процессе
-    if any(isinstance(h, logging.FileHandler) for h in logger.handlers):
-        return
-
-    fmt = logging.Formatter(
-        fmt="%(asctime)s | %(levelname)s | %(name)s:%(lineno)d | %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-
-    fh = logging.FileHandler(LOG_PATH, encoding="utf-8")
-    fh.setLevel(logging.INFO)
-    fh.setFormatter(fmt)
-
-    sh = logging.StreamHandler(sys.stdout)
-    sh.setLevel(logging.INFO)
-    sh.setFormatter(fmt)
-
-    logger.addHandler(fh)
-    logger.addHandler(sh)
-
-
-@contextmanager
-def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **extra):
-    start = time.perf_counter()
-    logger.log(level, "START %s %s", operation, (extra if extra else ""))
-    try:
-        yield
-    finally:
-        elapsed_ms = (time.perf_counter() - start) * 1000.0
-        logger.log(level, "END   %s | %.2f ms %s", operation, elapsed_ms, (extra if extra else ""))
-
-
-@contextmanager
-def suppress_console_logs():
-    """
-    Временно отключает вывод логов в консоль (StreamHandler на stdout/stderr),
-    при этом FileHandler продолжает писать в файл.
-    """
-    root = logging.getLogger()
-    saved_levels = []
-
-    for h in root.handlers:
-        if isinstance(h, logging.StreamHandler) and getattr(h, "stream", None) in (sys.stdout, sys.stderr):
-            saved_levels.append((h, h.level))
-            h.setLevel(logging.CRITICAL + 1)  # выше CRITICAL, чтобы ничего не проходило
-
-    try:
-        yield
-    finally:
-        for h, lvl in saved_levels:
-            h.setLevel(lvl)
+from logging_utils import (
+    setup_logging,
+    log_timed,
+    suppress_console_logs,
+)
 
 
 def load_vkr_text(path: str) -> str:
     logger = logging.getLogger(__name__)
 
     if not os.path.exists(path):
-        logger.error("Файл '%s' не найден.", path)
-        print(f"[ERROR] Файл '{path}' не найден.")
+        logger.error("Файл не найден: %s", path)
         sys.exit(1)
 
-    with timed(logger, "parse_docx", path=path):
-        document = Document(path)
-        text = []
-        for paragraph in document.paragraphs:
-            text.append(paragraph.text)
-        result = "\n".join(text)
-
-    logger.info("DOCX parsed: chars=%d, paragraphs=%d", len(result), len(document.paragraphs))
-    return result
-
+    with log_timed(logger, "чтение DOCX", путь=path):
+        doc = Document(path)
+        text = "\n".join(p.text for p in doc.paragraphs)
 
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Генерация экзаменационных вопросов по тексту ВКР"
+    logger.info(
+        "DOCX обработан: символов=%d абзацев=%d",
+        len(text),
+        len(doc.paragraphs),
     )
-    parser.add_argument(
-        "vkr_path",
-        nargs="?",
-        default="vkr_examples/VKR1.docx",
-        help="Путь к .docx файлу с текстом ВКР (по умолчанию: vkr_examples/VKR1.docx)",
-    )
-    parser.add_argument(
-        "--no-overflow-logs",
-        action="store_true",
-        help="Отключить вывод логов в консоль во время печати вопросов/результатов (логи в файл сохраняются).",
-    )
-    return parser.parse_args()
+    return text
 
 
 def main():
     setup_logging()
     logger = logging.getLogger(__name__)
 
-    args = parse_args()
-    vkr_path = args.vkr_path
+    parser = argparse.ArgumentParser()
+    parser.add_argument("vkr_path")
+    parser.add_argument("--no-overflow-logs", action="store_true")
+    args = parser.parse_args()
 
-    logger.info("=== RUN START === vkr_path=%s log_path=%s", vkr_path, LOG_PATH)
+    logger.info("Запуск генерации: файл=%s", args.vkr_path)
 
-    with timed(logger, "nltk_check_download"):
+    with log_timed(logger, "проверка NLTK"):
         try:
             nltk.data.find("tokenizers/punkt_tab/english")
         except LookupError:
-            logger.info("NLTK punkt_tab not found. Downloading...")
-            print("Загрузка необходимых данных NLTK...")
+            logger.info("Загрузка данных NLTK")
             nltk.download("punkt_tab")
             nltk.download("stopwords")
 
-    print(f"=== Загрузка текста ВКР из '{vkr_path}' ===")
-    text = load_vkr_text(vkr_path)
+    text = load_vkr_text(args.vkr_path)
 
-    print("=== Инициализация генератора ===")
-    with timed(logger, "init_generator"):
-        gen = VkrQuestionGenerator(text, model_path="/app/question_generator/rut5-base", heuristic_csv_path="static/heuristic_questions.csv")
+    with log_timed(logger, "инициализация генератора"):
+        gen = VkrQuestionGenerator(text)
 
-    print("=== Инициализация валидатора ===")
-    with timed(logger, "init_validator"):
+    with log_timed(logger, "инициализация валидатора"):
         validator = VkrQuestionValidator(text)
 
-    print("=== Генерация вопросов ===")
-    with timed(logger, "generate_all_questions"):
+    with log_timed(logger, "генерация вопросов"):
         questions = gen.generate_all()
 
-    logger.info("Questions generated: total=%d", len(questions))
-
-    print("\n=== Результаты ===")
-    ok_count = 0
-    fail_count = 0
-
-    quiet_ctx = suppress_console_logs() if args.no_overflow_logs else nullcontext()
-
-    with quiet_ctx:
-        with timed(logger, "validate_all_questions", total=len(questions)):
-            for idx, q in enumerate(questions, start=1):
-                # маркер-разделитель (ваш текстовый разделитель)
-                if q.strip().startswith("---"):
-                    logger.info("Separator encountered at %d: %s", idx, q.strip())
-                    print(f"\n{q}")
-                    continue
-
-                with timed(logger, "validate_question", index=idx):
-                    with timed(logger, "check_relevance", index=idx):
-                        rel = validator.check_relevance(q)
-                    with timed(logger, "check_clarity", index=idx):
-                        clr = validator.check_clarity(q)
-                    with timed(logger, "check_difficulty", index=idx):
-                        diff = validator.check_difficulty(q)
-
-                passed = (int(rel) + int(clr) + int(diff) >= 2)
-                status = "✔ OK" if passed else "✖ FAIL"
-
-                if passed:
-                    ok_count += 1
-                else:
-                    fail_count += 1
-
-                logger.info(
-                    "Question %d status=%s rel=%s clr=%s diff=%s text=%r",
-                    idx, ("OK" if passed else "FAIL"), rel, clr, diff, q
-                )
-
-                print(f"\n[{status}] {q}")
-                print(f"  - relevance: {rel}")
-                print(f"  - clarity:   {clr}")
-                print(f"  - difficulty:{diff}")
-
-    logger.info("Validation summary: ok=%d fail=%d total=%d", ok_count, fail_count, len(questions))
-    logger.info("=== RUN END ===")
-
-    print("\n=== Готово ===")
+    logger.info("Сгенерировано вопросов: %d", len(questions))
+
+    quiet = suppress_console_logs() if args.no_overflow_logs else nullcontext()
+
+    with quiet:
+        for idx, q in enumerate(questions, 1):
+            if q.startswith("---"):
+                print(f"\n{q}")
+                continue
+
+            rel = validator.check_relevance(q)
+            clr = validator.check_clarity(q)
+            diff = validator.check_difficulty(q)
+
+            passed = (int(rel) + int(clr) + int(diff) >= 2)
+            status = "✔ OK" if passed else "✖ FAIL"
+
+            logger.info(
+                "Вопрос %d статус=%s релевантность=%s ясность=%s сложность=%s",
+                idx,
+                "OK" if passed else "FAIL",
+                rel,
+                clr,
+                diff,
+            )
+
+            print(f"\n[{status}] {q}")
+            print(f"  - релевантность: {rel}")
+            print(f"  - ясность:   {clr}")
+            print(f"  - сложность:{diff}")
 
 
 if __name__ == "__main__":
diff --git a/app/questions_generator/run_docker.py b/app/questions_generator/run_docker.py
index 1b066be..ca79ab5 100644
--- a/app/questions_generator/run_docker.py
+++ b/app/questions_generator/run_docker.py
@@ -1,26 +1,26 @@
-import os
-import sys
 import argparse
+import logging
+import os
 import subprocess
+import sys
+
+from logging_utils import setup_logging
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Запуск генератора вопросов по ВКР внутри Docker"
-    )
-    parser.add_argument(
-        "vkr_path",
-        help="Путь к .docx файлу с текстом ВКР (на хосте)",
-    )
+    setup_logging()
+    logger = logging.getLogger(__name__)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("vkr_path")
     args = parser.parse_args()
 
     host_path = os.path.abspath(args.vkr_path)
 
     if not os.path.exists(host_path):
-        print(f"[ERROR] Файл не найден: {host_path}")
+        logger.error("Файл не найден: %s", host_path)
         sys.exit(1)
 
-    # Путь внутри контейнера — фиксированный, один и тот же для всех ОС
     container_path = "/app/questions_generator/static/vkr_examples/vkr.docx"
 
     cmd = [
@@ -32,13 +32,16 @@ def main():
         "python", "run.py", container_path,
     ]
 
-    print(">> Запускаю команду:")
-    print(" ".join(cmd))
+    logger.info("Запуск Docker команды: %s", " ".join(cmd))
+
     try:
         subprocess.run(cmd, check=True)
-    except subprocess.CalledProcessError as e:
-        print(f"[ERROR] docker run завершился с ошибкой: {e.returncode}")
-        sys.exit(e.returncode)
+    except subprocess.CalledProcessError as exc:
+        logger.exception(
+            "Docker завершился с ошибкой, код=%d",
+            exc.returncode,
+        )
+        sys.exit(exc.returncode)
 
 
 if __name__ == "__main__":
diff --git a/app/questions_generator/validator.py b/app/questions_generator/validator.py
index 463f177..1f60359 100644
--- a/app/questions_generator/validator.py
+++ b/app/questions_generator/validator.py
@@ -1,83 +1,63 @@
 import re
 import logging
-import time
-from contextlib import contextmanager
-from typing import List, Dict, Set
+from datetime import datetime
 from collections import Counter
+from typing import List, Dict, Set
+
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
-from datetime import datetime
-
 
-@contextmanager
-def timed(logger: logging.Logger, operation: str, level: int = logging.INFO, **extra):
-    start = time.perf_counter()
-    logger.log(level, "START %s %s", operation, (extra if extra else ""))
-    try:
-        yield
-    finally:
-        elapsed_ms = (time.perf_counter() - start) * 1000.0
-        logger.log(level, "END   %s | %.2f ms %s", operation, elapsed_ms, (extra if extra else ""))
+from logging_utils import log_timed
 
 
 class VkrQuestionValidator:
     def __init__(self, vkr_text: str):
-        """
-        Инициализация валидатора с текстом ВКР
-
-        Args:
-            vkr_text: Полный текст ВКР
-        """
         self.logger = logging.getLogger(__name__)
 
-        with timed(self.logger, "validator_init"):
+        with log_timed(self.logger, "инициализация валидатора"):
             self.vkr_text = vkr_text.lower()
 
-            with timed(self.logger, "validator_load_stopwords"):
-                self.stopwords = set(stopwords.words('russian'))
+            with log_timed(self.logger, "загрузка стоп-слов"):
+                self.stopwords = set(stopwords.words("russian"))
 
-            with timed(self.logger, "validator_extract_keywords_total"):
+            with log_timed(self.logger, "извлечение ключевых слов"):
                 self.keywords = self._extract_keywords()
 
         self.logger.info(
-            "Validator ready: stopwords=%d theme=%d goals=%d methodology=%d",
+            "Валидатор готов: стоп-слов=%d тема=%d цели=%d методология=%d",
             len(self.stopwords),
-            len(self.keywords.get("theme", set())),
-            len(self.keywords.get("goals", set())),
-            len(self.keywords.get("methodology", set())),
+            len(self.keywords["theme"]),
+            len(self.keywords["goals"]),
+            len(self.keywords["methodology"]),
         )
 
     def _extract_keywords(self) -> Dict[str, Set[str]]:
-        """
-        Извлечение ключевых слов из текста ВКР
-
-        Returns:
-            Словарь с категориями ключевых слов
-        """
         keywords = {
-            'theme': set(),       # Тематические слова
-            'goals': set(),       # Слова, связанные с целями
-            'methodology': set()  # Методологические термины
+            "theme": set(),
+            "goals": set(),
+            "methodology": set(),
         }
 
-        with timed(self.logger, "extract_introduction"):
-            intro_section = self._extract_introduction()
-        with timed(self.logger, "tokenize_filter_intro", intro_len=len(intro_section)):
-            keywords['theme'] = self._tokenize_and_filter(intro_section)
+        with log_timed(self.logger, "извлечение введения"):
+            intro = self._extract_introduction()
+        with log_timed(self.logger, "токенизация введения"):
+            keywords["theme"] = self._tokenize_and_filter(intro)
 
-        with timed(self.logger, "extract_goals_section"):
-            goals_section = self._extract_goals_section()
-        with timed(self.logger, "tokenize_filter_goals", goals_len=len(goals_section)):
-            keywords['goals'] = self._tokenize_and_filter(goals_section)
+        with log_timed(self.logger, "извлечение целей"):
+            goals = self._extract_goals_section()
+        with log_timed(self.logger, "токенизация целей"):
+            keywords["goals"] = self._tokenize_and_filter(goals)
 
-        with timed(self.logger, "extract_methodology_section"):
-            meth_section = self._extract_methodology_section()
-        with timed(self.logger, "tokenize_filter_methodology", meth_len=len(meth_section)):
-            keywords['methodology'] = self._tokenize_and_filter(meth_section)
+        with log_timed(self.logger, "извлечение методологии"):
+            meth = self._extract_methodology_section()
+        with log_timed(self.logger, "токенизация методологии"):
+            keywords["methodology"] = self._tokenize_and_filter(meth)
 
         self.logger.info(
-            "Keywords extracted: theme=%d goals=%d methodology=%d",
-            len(keywords["theme"]), len(keywords["goals"]), len(keywords["methodology"])
+            "Ключевые слова извлечены: тема=%d цели=%d методология=%d",
+            len(keywords["theme"]),
+            len(keywords["goals"]),
+            len(keywords["methodology"]),
         )
         return keywords
 
@@ -110,25 +90,19 @@ def _extract_methodology_section(self) -> str:
         return match.group(0) if match else ""
 
     def check_relevance(self, question: str) -> bool:
-        with timed(self.logger, "validator_check_relevance", q_len=len(question)):
+        with log_timed(self.logger, "проверка релевантности", длина=len(question)):
             score = 0
-
-            theme_match = len(set(question.lower().split()) &
-                              set(self.keywords['theme']))
-            if theme_match > 0:
-                score += 1
-
-            actuality_score = self._calculate_actuality_score(question)
-            score += actuality_score
-
-            goal_match = len(set(question.lower().split()) &
-                             set(self.keywords['goals']))
-            if goal_match > 0:
-                score += 1
-
+            score += bool(set(question.lower().split()) & self.keywords["theme"])
+            score += self._calculate_actuality_score(question)
+            score += bool(set(question.lower().split()) & self.keywords["goals"])
             result = score >= 2
 
-        self.logger.info("relevance=%s score=%d q=%r", result, score, question)
+        self.logger.info(
+            "Релевантность=%s балл=%d вопрос=%r",
+            result,
+            score,
+            question,
+        )
         return result
 
     def _calculate_actuality_score(self, question: str) -> int:
@@ -138,15 +112,23 @@ def _calculate_actuality_score(self, question: str) -> int:
         return max(0, min(1, len(year_mentions)))
 
     def check_completeness(self, questions_list: List[str]) -> bool:
-        with timed(self.logger, "validator_check_completeness", total=len(questions_list)):
+        with log_timed(
+                self.logger,
+                "проверка полноты набора вопросов",
+                всего=len(questions_list),
+        ):
             coverage = {
-                'theoretical': self._check_theory_coverage(questions_list),
-                'practical': self._check_practice_coverage(questions_list),
-                'analysis_levels': self._check_analysis_depth(questions_list)
+                "теория": self._check_theory_coverage(questions_list),
+                "практика": self._check_practice_coverage(questions_list),
+                "глубина_анализа": self._check_analysis_depth(questions_list),
             }
             result = all(value >= 0.7 for value in coverage.values())
 
-        self.logger.info("completeness=%s coverage=%s", result, coverage)
+        self.logger.info(
+            "Полнота набора вопросов=%s покрытие=%s",
+            result,
+            coverage,
+        )
         return result
 
     def _check_theory_coverage(self, questions: List[str]) -> float:
@@ -189,15 +171,20 @@ def _check_analysis_depth(self, questions: List[str]) -> float:
         return sum(depths) / (len(depths) * 2) if depths else 0
 
     def check_clarity(self, question: str) -> bool:
-        with timed(self.logger, "validator_check_clarity", q_len=len(question)):
+        with log_timed(self.logger, "проверка ясности", длина=len(question)):
             metrics = {
-                'length': self._check_length(question),
-                'complexity': self._calculate_complexity(question),
-                'ambiguity': self._check_ambiguity(question)
+                "length": self._check_length(question),
+                "complexity": self._calculate_complexity(question),
+                "ambiguity": self._check_ambiguity(question),
             }
-            result = all(value >= 0.7 for value in metrics.values())
+            result = all(v >= 0.7 for v in metrics.values())
 
-        self.logger.info("clarity=%s metrics=%s q=%r", result, metrics, question)
+        self.logger.info(
+            "Ясность=%s метрики=%s вопрос=%r",
+            result,
+            metrics,
+            question,
+        )
         return result
 
     def _check_length(self, question: str) -> float:
@@ -225,15 +212,20 @@ def _check_ambiguity(self, question: str) -> float:
         return max(0.0, ambiguity_score)
 
     def check_difficulty(self, question: str) -> bool:
-        with timed(self.logger, "validator_check_difficulty", q_len=len(question)):
-            difficulty_metrics = {
-                'abstraction_level': self._assess_abstraction(question),
-                'question_type': self._identify_question_type(question),
-                'student_level_match': self._match_student_level(question)
+        with log_timed(self.logger, "проверка сложности", длина=len(question)):
+            metrics = {
+                "abstraction": self._assess_abstraction(question),
+                "type": self._identify_question_type(question),
+                "level": self._match_student_level(question),
             }
-            result = all(value == 'optimal' for value in difficulty_metrics.values())
+            result = all(v == "optimal" for v in metrics.values())
 
-        self.logger.info("difficulty=%s metrics=%s q=%r", result, difficulty_metrics, question)
+        self.logger.info(
+            "Сложность=%s метрики=%s вопрос=%r",
+            result,
+            metrics,
+            question,
+        )
         return result
 
     def _assess_abstraction(self, question: str) -> str: