diff --git a/app/celery_setup/app.py b/app/celery_setup/app.py index 0b50834..d123eeb 100644 --- a/app/celery_setup/app.py +++ b/app/celery_setup/app.py @@ -3,4 +3,4 @@ app = Celery('llm') app.conf.update(**generate_config_from_env()) -app.autodiscover_tasks(packages=['topic_prompt']) +app.autodiscover_tasks(packages=['topic_prompt', 'sheet_scoring']) diff --git a/app/llm_interface/sefaria_llm_interface/sheet_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/sheet_scoring/__init__.py new file mode 100644 index 0000000..f0aaabe --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/sheet_scoring/__init__.py @@ -0,0 +1,3 @@ +from sefaria_llm_interface.sheet_scoring.sheet_scoring_input import * +from sefaria_llm_interface.sheet_scoring.sheet_scoring_output import * + diff --git a/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_input.py b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_input.py new file mode 100644 index 0000000..e6fcb9d --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_input.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass +from typing import List, Dict, Union + + +@dataclass +class SheetScoringInput: + # str version of id + sheet_id: str + title: str + sources: List[Dict[str, Union[str, Dict[str, str]]]] + expanded_refs: List[str] + diff --git a/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_output.py b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_output.py new file mode 100644 index 0000000..22f5ba6 --- /dev/null +++ b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_output.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from typing import Dict +from datetime import datetime + + +@dataclass +class SheetScoringOutput: + sheet_id: str + processed_datetime: str + language: str + title_interest_level: int + title_interest_reason: str + creativity_score: float + ref_levels: Dict[str, int] + ref_scores: Dict[str, float] + request_status: int + request_status_message: str + + def __post_init__(self): + if isinstance(self.processed_datetime, datetime): + self.processed_datetime = self.processed_datetime.isoformat() \ No newline at end of file diff --git a/app/requirements.txt b/app/requirements.txt index 9d11f97..83e0398 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,8 +1,11 @@ -langchain[llms]~=0.2.1 +langchain==0.2.1 +langchain-core==0.2.2 +langchain-openai==0.1.8 langsmith~=0.1.0 anthropic~=0.26.1 stanza~=1.5.0 openai~=1.30.0 +httpx~=0.27.0 typer~=0.4.1 pydantic~=2.7.1 loguru~=0.7.2 @@ -10,9 +13,9 @@ tqdm~=4.66.1 celery[redis]~=5.2.7 diff-match-patch dnspython~=2.5.0 -tiktoken~=0.4.0 +tiktoken readability_lxml tenacity==8.3.0 requests numpy -git+https://github.com/Sefaria/LLM@v1.0.3#egg=sefaria_llm_interface&subdirectory=app/llm_interface +git+https://github.com/Sefaria/LLM@v1.3.6#egg=sefaria_llm_interface&subdirectory=app/llm_interface diff --git a/app/sheet_scoring/README.md b/app/sheet_scoring/README.md new file mode 100644 index 0000000..1c789fe --- /dev/null +++ b/app/sheet_scoring/README.md @@ -0,0 +1,231 @@ +# SheetScorer - Jewish Study Sheet Analysis Tool + +**SheetScorer** is a Python tool that uses **LLMs** to automatically analyze +and score Jewish study sheets for reference relevance and title interest. +It processes sheets, evaluates how well each cited reference +is discussed, and assigns engagement scores to sheet titles. 
+
+## Scores Extracted
+
+- **Reference Discussion Scoring**: Analyzes how thoroughly each reference is discussed (**0-4 scale**)
+- **Title Interest Scoring**: Evaluates how engaging sheet titles are to potential readers (**0-4 scale**)
+- **Creativity Assessment**: Computes a creativity score based on the percentage of **user-generated content**.
+- **Title Interest Reason**: Explanation of the title score.
+- **Language**: Language of the sheet (all languages are supported, not only `he` and `en`).
+
+## Quick Start
+
+```python
+from sheet_scoring.sheet_scoring import score_one_sheet
+from sefaria_llm_interface.sheet_scoring import SheetScoringInput
+
+input_data = SheetScoringInput(
+    sheet_id="123",
+    title="Understanding Genesis Creation",
+    expanded_refs=["Genesis 1:1", "Genesis 1:2"],
+    sources=[
+        {"outsideText": "This commentary explores..."},
+        {"ref": "Genesis 1:1", "text": {"en": "In the beginning..."}, "comment": "Analysis here..."}
+    ]
+)
+
+result = score_one_sheet(input_data)
+print(f"Title score: {result.title_interest_level}")
+print(f"Ref scores: {result.ref_scores}")
+print(result)
+```
+
+## Scoring System
+
+### Architecture
+
+#### sheet_scoring (package)
+- sheet_scoring.py - Main API with score_one_sheet() function
+- tasks.py - Celery task wrapper for async processing
+- text_utils.py - Content parsing and token counting utilities
+- openai_sheets_scorer.py - Core LLM scoring engine
+- README.md
+
+### Reference Discussion Levels
+
+The tool evaluates how well each reference is discussed using a **0-4 scale**:
+
+| Level | Description |
+|-------|-------------|
+| **0 - Not Discussed** | Reference is **quoted only**, no discussion or commentary |
+| **1 - Minimal** | Mentioned only through **neighboring verses**, minimal engagement |
+| **2 - Moderate** | Some discussion present with **basic commentary** |
+| **3 - Significant** | **Substantial discussion** with detailed commentary |
+| **4 - Central** | Reference is a **central focus** of the entire sheet |
+
+### Title Interest Levels
+
+Sheet titles are scored for **user engagement** on a **0-4 scale**:
+
+| Level | Description |
+|-------|-------------|
+| **0 - Not Interesting** | **Off-topic** or unengaging for target users |
+| **1 - Slight Relevance** | **Low appeal**, users unlikely to engage |
+| **2 - Somewhat Interesting** | Users might **skim**, moderate appeal |
+| **3 - Interesting** | Users **likely to open** and read |
+| **4 - Very Compelling** | **Must-read content**, high engagement expected |
+
+### Creativity Score
+
+Computed as user_tokens / total_tokens; higher values mean more original content relative to canonical quotes.
+
+### Language
+ISO-639-1 language code of the sheet; if the sheet has no user-generated content, the language code of the title is used instead.
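+
+### Async Scoring via Celery (sketch)
+
+Besides the synchronous `score_one_sheet()` call shown in Quick Start, this package
+registers a Celery task named `llm.score_sheet` (see `tasks.py`). The sketch below
+shows how another service might queue a sheet for scoring; the broker/backend URLs
+and the queue name are deployment-specific assumptions (the worker consumes the
+queue named by the `QUEUE_NAME` environment variable in `build/entrypoint.sh`).
+
+```python
+from dataclasses import asdict
+
+from celery import Celery
+from sefaria_llm_interface.sheet_scoring import SheetScoringInput
+
+# Assumed broker/backend URLs and queue name; adjust to your deployment.
+client = Celery(broker="redis://localhost:6379/0", backend="redis://localhost:6379/1")
+
+payload = asdict(SheetScoringInput(
+    sheet_id="123",
+    title="Understanding Genesis Creation",
+    expanded_refs=["Genesis 1:1", "Genesis 1:2"],
+    sources=[{"outsideText": "This commentary explores..."}],
+))
+
+# The task takes a plain dict and returns the SheetScoringOutput as a dict.
+async_result = client.send_task("llm.score_sheet", args=[payload], queue="llm")
+print(async_result.get(timeout=300))
+```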
+
+## Data Structures
+#### Input (SheetScoringInput)
+
+```python
+{
+    "sheet_id": "123",
+    "title": "Sheet title",
+    "expanded_refs": ["Genesis 1:1", "Exodus 2:3"],
+    "sources": [
+        {"outsideText": "User commentary"},
+        {"outsideBiText": {"en": "English", "he": "Hebrew"}},
+        {"ref": "Genesis 1:1", "text": {"en": "Quote"}, "comment": "Analysis"}
+    ]
+}
+```
+#### Output (SheetScoringOutput)
+```python
+{
+    "sheet_id": "123",
+    "ref_levels": {"Genesis 1:1": 3, "Exodus 2:3": 2},        # Raw 0-4 scores
+    "ref_scores": {"Genesis 1:1": 60.0, "Exodus 2:3": 40.0},  # Normalized %
+    "title_interest_level": 3,
+    "title_interest_reason": "Compelling theological question",
+    "language": "en",
+    "creativity_score": 0.75,
+    "processed_datetime": "2025-01-31T10:30:00Z",
+    "request_status": 1,  # 1=success, 0=failure
+    "request_status_message": ""
+}
+```
+
+## Configuration Options
+
+### Initialization Parameters
+
+```python
+import os
+
+with SheetScorer(
+    api_key=os.getenv("OPENAI_API_KEY"),
+    model="gpt-4o-mini",          # Default model
+    max_prompt_tokens=128000,     # Input token budget
+    token_margin=16384,           # Reserved for output
+    max_ref_to_process=800,       # Max number of refs that can be processed
+    chunk_size=80                 # Refs per LLM call
+) as scorer:
+    result = scorer.process_sheet_by_content(...)
+```
+
+The constants DEFAULT_MAX_OUTPUT_TOKENS and DEFAULT_MAX_INPUT_OUTPUT_TOKENS are model-specific;
+consult the model provider's documentation for the correct values.
+
+## Content Processing Strategy
+
+The tool uses a **configurable, all-or-nothing approach** for canonical quotations:
+
+1. **Always includes** all user commentary and **original content**
+2. **Conditionally includes** canonical quotes only if the **entire bundle** fits within token limits and **add_full_commentary is set to True**
+3. **Truncates intelligently** using **LLM summarization** when content exceeds limits
+
+   1. ***LLM Summarization***: Uses a secondary LLM to compress content while preserving key information
+   2. ***Reference Preservation***: Maintains all biblical reference tags during compression
+   3. ***Character Fallback***: Falls back to character-based truncation if summarization fails
+
+## Grading Strategy
+Processed content is sent to the LLM together with the references for grading:
+
+### Resilient Grading List Processing
+
+- **Chunking**: Large reference lists are processed in **chunks** to stay within model limits
+- **Overlap Handling**: Smart overlap between chunks prevents **reference boundary issues**
+
+### Resilient Reference Grading
+
+- **Primary attempt**: Process **all references together**
+- **Fallback**: Split the reference list in **half** and process **recursively**
+- **Final fallback**: Assign a **default score of 0** to problematic references
+
+### Resilient Score Extraction
+
+Uses **OpenAI's function calling** feature with **strict schemas**:
+
+#### Middle Chunk Scoring Schema
+```python
+{
+    "name": "score_references",
+    "parameters": {
+        "ref_levels": {
+            "Genesis 1:1": {"type": "integer", "minimum": 0, "maximum": 4},
+            # ... for each reference
+        }
+    }
+}
+```
+
+#### Title Scoring Schema
+```python
+{
+    "name": "score_title",
+    "parameters": {
+        "language": {"type": "string"},
+        "title_interest_level": {"type": "integer", "minimum": 0, "maximum": 4},
+        "title_interest_reason": {"type": "string", "maxLength": 100}
+    }
+}
+```
+
+## Database Integration
+
+Designed for **MongoDB integration** with the expected document structure:
+
+```python
+{
+    "id": "unique id",
+    "title": "Sheet Title",
+    "expandedRefs": ["Genesis 1:1", "Exodus 2:3"],
+    # Additional sheet content fields...
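+    # Hypothetical mapping sketch (the exact MongoDB field names are assumptions):
+    # such a document would feed the scorer as
+    #   SheetScoringInput(sheet_id=str(doc["id"]), title=doc["title"],
+    #                     expanded_refs=doc["expandedRefs"], sources=doc["sources"])
+    # where "sources" holds the source blocks in the format shown under
+    # "Data Structures" above.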
+} +``` + +## Output Fields + +| Field | Description | +|------------------------------|------------------------------------------------| +| **`ref_levels`** | Raw **0-4 scores** for each reference | +| **`ref_scores`** | **Normalized percentage scores** (sum to 100%) | +| **`title_interest_level`** | Title **engagement score** (0-4) | +| **`title_interest_reason`** | **Brief explanation** of title score | +| **`language`** | **Detected language code** | +| **`creativity_score`** | **Percentage** of user-generated content | +| **`processed_datetime`** | **Processing timestamp** | +| **`request_status`** | **Whether scoring succeded/failed** | +| **`request_status_message`** | **The reason why scoring failed** | + + + + +## Logging + +**Comprehensive logging** for monitoring and debugging: + +- **Info**: Processing decisions and **content statistics** +- **Warning**: **Score validation** and fallback usage +- **Error**: **LLM failures** and processing errors + +Configure logging level as needed: +```python +import logging +logging.getLogger('sheet_scorer').setLevel(logging.INFO) +``` + + diff --git a/app/sheet_scoring/__init__.py b/app/sheet_scoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/sheet_scoring/openai_sheets_scorer.py b/app/sheet_scoring/openai_sheets_scorer.py new file mode 100644 index 0000000..740146e --- /dev/null +++ b/app/sheet_scoring/openai_sheets_scorer.py @@ -0,0 +1,803 @@ +import json +import logging +from datetime import datetime +from enum import IntEnum +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union +import textwrap +import tiktoken +import httpx +from langchain.schema import HumanMessage +from langchain_openai import ChatOpenAI +from sheet_scoring.text_utils import sheet_to_text_views +from sefaria_llm_interface.sheet_scoring import SheetScoringOutput +# Configure logging +logger = logging.getLogger(__name__) + + +class IncompleteScoreError(Exception): + """Raised when LLM JSON is valid but doesn’t cover every reference.""" + pass + + +class RequestStatusOptions(IntEnum): + """Enumeration for tracking the status of LLM processing requests.""" + SUCCESS = 1 + FAILURE = 0 + + +class ScoreLevel(IntEnum): + """Reference discussion and title interest levels.""" + NOT_DISCUSSED = 0 + MINIMAL = 1 + MODERATE = 2 + SIGNIFICANT = 3 + CENTRAL = 4 + + +class LanguageCode: + """Supported language codes.""" + ENGLISH = 'en' + HEBREW = 'he' + DEFAULT = ENGLISH + + +class SheetScorer: + """ + Scores Jewish study sheets for reference relevance and title interest using LLMs, + computes creativity score based on percentage of user generated content. + + This class processes sheets from MongoDB, analyzes their content using OpenAI's GPT models, + and assigns scores for how well each reference is discussed and how interesting + the sheet title is to users. + """ + + # Configuration constants - + # DEFAULT_MAX_INPUT_OUTPUT_TOKENS: total + # tokens (prompt+response) we’ll send in one API call. Lowering this + # shrinks your available context; raising it risks exceeding the model’s + # limit. + # DEFAULT_MAX_OUTPUT_TOKENS: cap on how many tokens the model + # may generate. If you set this too low, responses may be cut off; too + # high wastes quota. + # DEFAULT_CHUNK_SIZE: how many references to score + # in each batch. Larger chunks use more context (better global view) but + # may exceed token budgets. + # MAX_CHUNK_OVERLAP: how many refs to repeat + # between chunks. 
More overlap reduces missing-edge-case errors at the + # cost of redundant API calls. + # DEFAULT_MAX_REFS_TO_PROCESS: total refs + # before falling back to equal-distribution scoring. Hitting this limit + # skips heavy LLM work to avoid runaway costs. - + # DEFAULT_TOKEN_CHAR_RATIO: fallback characters‐per‐token estimate when + # encoding fails. Tweak if you find your actual token counts diverge + # significantly from this estimate. + DEFAULT_MAX_OUTPUT_TOKENS = 16384 + DEFAULT_CHUNK_SIZE = 80 + DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 128000 + DEFAULT_MAX_REFS_TO_PROCESS = 800 + DEFAULT_TOKEN_CHAR_RATIO = 3 + MAX_CHUNK_OVERLAP = 10 + # Database field names + REF_SCORES_FIELD = "ref_scores" + REF_LEVELS_FIELD = "ref_levels" + TITLE_INTEREST_FIELD = "title_interest_level" + LANGUAGE_FIELD = "language" + TITLE_INTEREST_REASON_FIELD = 'title_interest_reason' + PROCESSED_DATETIME_FIELD = "processed_datetime" + CREATIVITY_SCORE_FIELD = 'creativity_score' + + # Valid score levels + VALID_LEVELS: Set[int] = {level.value for level in ScoreLevel} + + def __init__( + self, + api_key: Optional[str], + model: str = "gpt-4o-mini", + max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS, + token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS, + max_ref_to_process: int = DEFAULT_MAX_REFS_TO_PROCESS, + chunk_size: int = DEFAULT_CHUNK_SIZE, + ): + self.max_prompt_tokens = max_prompt_tokens + self.token_margin = token_margin + self.model = model + self.chunk_size = chunk_size + self.max_ref_to_process = max_ref_to_process + self._http_client_json = httpx.Client() + self._http_client_text = httpx.Client() + self.llm = self._create_json_llm(api_key, model) + self.summarizer = self._create_text_llm(api_key, model) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + return False + + def close(self): + """Close HTTP clients to release resources.""" + self._http_client_json.close() + self._http_client_text.close() + + def _create_json_llm(self, api_key: str, model: str) -> ChatOpenAI: + """Create LLM client for JSON responses.""" + return ChatOpenAI( + model=model, + temperature=0, + top_p=0, + frequency_penalty=0, + presence_penalty=0, + seed=42, + api_key=api_key, + http_client=self._http_client_json, + ) + + def _create_text_llm(self, api_key: str, model: str) -> ChatOpenAI: + """Create LLM client for text responses.""" + return ChatOpenAI( + model=model, + temperature=0, + model_kwargs={"response_format": {"type": "text"}}, + api_key=api_key, + http_client=self._http_client_text, + ) + + def _invoke_llm_with_function(self, prompt: str, + function_schema: Dict[str, Any]) -> ( + Dict)[str, Any]: + """Invoke LLM using function calling instead of JSON mode.""" + response = self.llm.invoke( + [HumanMessage(content=prompt)], + functions=[function_schema], + function_call={"name": function_schema["name"]} + ) + + function_call = getattr(response, "additional_kwargs", {}).get( + "function_call" + ) + if function_call: + return json.loads(function_call["arguments"]) + + raise ValueError("No function call in response") + + def _get_reference_scoring_function_schema(self, expanded_refs: List[str]) -> \ + Dict[str, Any]: + """Create function schema for reference scoring with exact reference + names.""" + return { + "name": "score_references", + "description": "Score how well each reference is " + "discussed in the sheet", + "parameters": { + "type": "object", + "properties": { + self.REF_LEVELS_FIELD: { + "type": "object", + "description": "Scores for each reference (0-4 
scale)", + "properties": { + ref_name: { + "type": "integer", + "description": f"Discussion level for {ref_name}", + "minimum": 0, + "maximum": 4 + } + for ref_name in expanded_refs + }, + "required": expanded_refs, + "additionalProperties": False + } + }, + "required": [self.REF_LEVELS_FIELD], + "additionalProperties": False + } + } + + def _get_title_scoring_schema(self) -> Dict[str, Any]: + """Create function schema for both reference and title scoring.""" + return { + "name": "score_title", + "description": "Score title interest for a Jewish study sheet", + "parameters": { + "type": "object", + "properties": { + self.LANGUAGE_FIELD: { + "type": "string", + "description": "ISO-639-1 title language code", + }, + self.TITLE_INTEREST_FIELD: { + "type": "integer", + "description": "How interesting the title is to " + "users (0-4 scale)", + "minimum": 0, + "maximum": 4 + }, + self.TITLE_INTEREST_REASON_FIELD: { + "type": "string", + "description": "Brief explanation of title interest " + "score (max 20 words)", + "maxLength": 100 + } + }, + "required": [self.LANGUAGE_FIELD, self.TITLE_INTEREST_FIELD, + self.TITLE_INTEREST_REASON_FIELD], + "additionalProperties": False + } + } + + def _get_full_scoring_function_schema(self, expanded_refs: List[str]) -> ( + Dict)[str, Any]: + """Create function schema for both reference and title scoring.""" + return { + "name": "score_sheet", + "description": "Score references and title interest for a Jewish " + "study sheet", + "parameters": { + "type": "object", + "properties": { + self.LANGUAGE_FIELD: { + "type": "string", + "description": "# ISO‑639‑1 code inferred from " + "*original user‑written* content", + }, + self.REF_LEVELS_FIELD: { + "type": "object", + "description": "Scores for each reference (0-4 scale)", + "properties": { + ref_name: { + "type": "integer", + "description": f"Discussion level for {ref_name}", + "minimum": 0, + "maximum": 4 + } + for ref_name in expanded_refs + }, + "required": expanded_refs, + "additionalProperties": False + }, + self.TITLE_INTEREST_FIELD: { + "type": "integer", + "description": "How interesting the title is to " + "users (0-4 scale)", + "minimum": 0, + "maximum": 4 + }, + self.TITLE_INTEREST_REASON_FIELD: { + "type": "string", + "description": "Brief explanation of title interest " + "score (max 20 words)", + "maxLength": 100 + } + }, + "required": [self.LANGUAGE_FIELD, self.REF_LEVELS_FIELD, + self.TITLE_INTEREST_FIELD, + self.TITLE_INTEREST_REASON_FIELD], + "additionalProperties": False + } + } + + @staticmethod + def chunk_list(lst: List[Any], n: int) -> Iterator[List[Any]]: + """Yield successive n‑sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i: i + n] + + def _count_tokens(self, text: str) -> int: + """Rough token count; if no encoder, fall back to char heuristic.""" + try: + encoding = tiktoken.encoding_for_model(self.model) + return len(encoding.encode(text)) + except (KeyError, ValueError) as e: + logger.warning( + f"Could not get encoding for model {self.model}: {e}" + ) + return len(text) // self.DEFAULT_TOKEN_CHAR_RATIO + + def _invoke_llm(self, prompt: str) -> Dict[str, Any]: + """Invoke LLM with prompt and parse JSON response.""" + response = self.llm([HumanMessage(content=prompt)]) + return json.loads(response.content) + + def _create_title_only_prompt_function(self, sheet_title: str) -> str: + return textwrap.dedent( + f"""You are scoring THE TITLE of a Jewish study sheet for how interesting it would be to users. 
+ + SHEET TITLE: + {sheet_title} + + TASK: Return JSON with keys `title_interest_level` (0-4) and `title_interest_reason` ( < 20 words). + + Title interest level (int 0–4): + 0: Not interesting / off‑topic for users + 1: Slight relevance, low pull + 2: Somewhat interesting; user might skim + 3: Interesting; user likely to open + 4: Very compelling / must‑open + """) + + def _create_chunk_prompt_for_function(self, sheet_content: str, + expanded_refs: List[str]) -> str: + """Create prompt for function calling (no JSON format instructions + needed).""" + refs_md = "\n".join(f"- {r}" for r in expanded_refs) + return textwrap.dedent( + f""" + You are analyzing a Jewish study sheet. Rate how much each listed reference + is discussed or central in the sheet. + + SHEET CONTENT: + {sheet_content} + + REFERENCES TO EVALUATE: + {refs_md} + + Scoring Scale (0-4): + {ScoreLevel.NOT_DISCUSSED}: Quoted only, no discussion + {ScoreLevel.MINIMAL}: Mentioned only through neighboring verses + {ScoreLevel.MODERATE}: Moderate discussion (some commentary) + {ScoreLevel.SIGNIFICANT}: Significant discussion (substantial commentary) + {ScoreLevel.CENTRAL}: Central focus of sheet + + Score each reference based on how thoroughly it's discussed in the content.""" + ) + + def _create_final_chunk_prompt_for_function(self, sheet_content: str, + expanded_refs: List[str], + sheet_title: str) -> str: + """Create prompt for final chunk with title scoring using function + calling.""" + sheet_title_clean = sheet_title.strip() or "(untitled)" + refs_md = "\n".join(f"- {r}" for r in expanded_refs) + + return textwrap.dedent(f""" + Analyze this Jewish study sheet and provide two types of scores: + + SHEET TITLE: {sheet_title_clean} + + SHEET CONTENT: + {sheet_content} + + REFERENCES TO EVALUATE: + {refs_md} + + TASKS: + 1. Reference Discussion Scoring (0-4): + {ScoreLevel.NOT_DISCUSSED}: Quoted only, no discussion + {ScoreLevel.MINIMAL}: Mentioned only through neighboring verses + {ScoreLevel.MODERATE}: Moderate discussion (some commentary) + {ScoreLevel.SIGNIFICANT}: Significant discussion (substantial commentary) + {ScoreLevel.CENTRAL}: Central focus of sheet + + 2. Title Interest Scoring (0-4): + 0: Not interesting/off-topic + 1: Slight relevance, low appeal + 2: Somewhat interesting; user might skim + 3: Interesting; user likely to open + 4: Very compelling/must-open + + Infer the language from the original user-written content. + """) + + def _validate_score_level(self, score: Any, + field_name: str = "score") -> int: + """Validate and normalize score to valid range.""" + if score not in self.VALID_LEVELS: + try: + score = int(score) + except (ValueError, TypeError): + logger.warning( + f"Invalid {field_name}: {score}, defaulting to 0" + ) + return ScoreLevel.NOT_DISCUSSED + + if score not in self.VALID_LEVELS: + clamped = max( + ScoreLevel.NOT_DISCUSSED, + min(ScoreLevel.CENTRAL, score) + ) + logger.warning( + f"{field_name} {score} out of range, clamping to {clamped}" + ) + return clamped + + return score + + def _sheet_to_text( + self, + no_quotes_content: str, + full_content: str, + max_tokens: int, + add_full_commentary: bool + ) -> str: + """ + Build a text snapshot of the sheet with an *all‑or‑nothing* rule: + • Always include every bit of author commentary. + • Append *all* canonical quotations only if the whole bundle still + fits into `max_tokens`. 
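+
+        Illustrative example (hypothetical token counts, with
+        add_full_commentary=True): given max_tokens=1000, 400 tokens of
+        commentary and 900 tokens of quotations, the combined 1300-token
+        bundle does not fit, so only the 400-token commentary is sent;
+        if the commentary alone were 1200 tokens, it would be summarized
+        or truncated down to the budget instead.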
+ """ + comm_tokens = self._count_tokens(no_quotes_content) + # Commentary alone is already bigger than the budget → truncate & quit + full_tokens = self._count_tokens(full_content) + if add_full_commentary: + if full_tokens <= max_tokens: + logger.info("Sending to LLM sheet with quotations") + return full_content + + if comm_tokens >= max_tokens: + logger.info("Truncating user commentaries") + return self._truncate_to_token_budget(no_quotes_content, max_tokens) + logger.info("Sending to LLM sheet without quotations text") + return no_quotes_content + + def _get_title_info(self, sheet_title: str) -> Dict[str, Any]: + """Obtain title-interest score ONLY (used when no content).""" + prompt = self._create_title_only_prompt_function(sheet_title) + try: + function_schema = self._get_title_scoring_schema() + data = self._invoke_llm_with_function(prompt, function_schema) + title_level = self._validate_score_level( + data.get(self.TITLE_INTEREST_FIELD), + self.TITLE_INTEREST_FIELD + ) + + return { + self.TITLE_INTEREST_FIELD: + title_level, + self.TITLE_INTEREST_REASON_FIELD: + data.get(self.TITLE_INTEREST_REASON_FIELD, ""), + self.LANGUAGE_FIELD: data.get( + self.LANGUAGE_FIELD, LanguageCode.DEFAULT + ), + } + except Exception as e: + logger.error(f"Title-only GPT attempt failed: {e}") + return { + self.TITLE_INTEREST_FIELD: ScoreLevel.NOT_DISCUSSED, + self.TITLE_INTEREST_REASON_FIELD: "LLM error", + self.LANGUAGE_FIELD: LanguageCode.DEFAULT + } + + def _normalize_scores_to_percentages( + self, + sheet_tokens: int, + score_levels: Dict[str, int], + beta: float = 1500 # token mass where no penalty + ) -> Dict[str, float]: + """Convert reference scores to percentages with size penalty + for shorter sheets.""" + + total_level = sum(score_levels.values()) or 1 + size_factor = min(1.0, sheet_tokens / beta) # clamp to 1 + + # small sheets (few tokens) → size_factor < 1 → percentages shrink + percentages = { + ref: round(level * 100 / total_level * size_factor, 2) + for ref, level in score_levels.items() + } + + norm = sum(percentages.values()) or 1 + percentages = {r: round(v * 100 / norm, 2) for r, v in + percentages.items()} + return percentages + + def _grade_refs_resilient( + self, + content: str, + refs: List[str], + *, + with_title: bool = False, + sheet_title: str = "" + ) -> Tuple[Optional[Dict[str, Any]], Dict[str, int]]: + """ + Fault-tolerant reference scoring using divide-and-conquer strategy. + Attempts to score all references at once via LLM. If that fails + (due to incomplete responses), + recursively splits the reference list in half and scores each + subset separately until all references have scores. + This prevents total failure when the LLM struggles with large + reference lists or encounters transient errors. 
+ + """ + if not refs: + return {}, {} + + try: + if with_title: + prompt = self._create_final_chunk_prompt_for_function( + content, refs, sheet_title + ) + function_schema = self._get_full_scoring_function_schema(refs) + else: + prompt = self._create_chunk_prompt_for_function(content, refs) + function_schema = self._get_reference_scoring_function_schema( + refs + ) + data, scores = self._get_gpt_ref_scores_function( + prompt, function_schema, refs + ) + return data, scores + except Exception: + pass + + # fallback branch + if len(refs) == 1: # nothing left to split + return {}, {refs[0]: ScoreLevel.NOT_DISCUSSED} + + mid = len(refs) // 2 + ld, ls = self._grade_refs_resilient( + content, refs[:mid], + with_title=with_title, + sheet_title=sheet_title + ) + rd, rs = self._grade_refs_resilient( + content, refs[mid:], + with_title=with_title, + sheet_title=sheet_title + ) + merged_scores = {**ls, **rs} + merged_data = ld or rd + return merged_data, merged_scores + + def _get_gpt_ref_scores_function(self, prompt: str, function_schema, + expected_refs: List[str]): + """Calls the LLM with structured function schema, validates all + returned scores are in valid range (0-4), handles missing references, + and ensures exactly the expected references are scored.""" + try: + data = self._invoke_llm_with_function(prompt, function_schema) + chunk_scores = data.get(self.REF_LEVELS_FIELD, {}) + validated_scores = {} + for ref, score in chunk_scores.items(): + validated_scores[ref] = self._validate_score_level( + score, f"ref_score[{ref}]" + ) + + # Check for missing references and assign default scores (0) + missing_refs = set(expected_refs) - set(validated_scores.keys()) + if missing_refs: + logger.warning( + f"GPT didn't return scores for {len(missing_refs)} " + ) + if len(missing_refs) < 5: + logger.warning(f"Defaulting missing scores to zeros") + for ref in missing_refs: + validated_scores[ref] = ScoreLevel.NOT_DISCUSSED + + else: + raise IncompleteScoreError( + f"Missing {len(missing_refs)} references" + ) + + # Ensure we only include expected references (in case GPT + # returned extras) + final_scores = { + ref: validated_scores.get(ref, ScoreLevel.NOT_DISCUSSED) for ref + in expected_refs} + + data[self.REF_SCORES_FIELD] = final_scores + return data, final_scores + + except IncompleteScoreError: + raise + + except Exception as e: + logger.error(f"Chunk GPT failed: {e}") + return None + + def _last_regular_start(self, n: int, chunk: int, overlap: int) -> int: + """ + Return the index where the *final* chunk (with title) should start. + If the total length fits into one chunk plus the allowed overlap, + analyse everything together (start = 0). 
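+
+        Examples (hypothetical sizes, chunk=80, overlap=10): with n=85 refs,
+        n <= chunk + overlap, so start=0 and all refs are scored in the single
+        final (title-bearing) chunk; with n=200, start=120, so regular chunks
+        cover refs[:120] and the final chunk covers refs[120:200].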
+ """ + if n <= chunk + overlap: + return 0 + step = chunk - overlap + return max(0, n - chunk) if step <= 0 else (n - chunk) + + def _process_reference_chunks( + self, + content: str, + expanded_refs: List[str] + ) -> Optional[Dict[str, int]]: + """Process reference chunks in batches.""" + ref_scores: Dict[str, int] = {} + + last_chunk_start = self._last_regular_start( + len(expanded_refs), self.chunk_size, self.MAX_CHUNK_OVERLAP + ) + + for chunk in self.chunk_list( + expanded_refs[:last_chunk_start], self.chunk_size + ): + # prompt = self._create_chunk_prompt(content,chunk) + _, chunk_scores = self._grade_refs_resilient( + content=content, + refs=chunk, + with_title=False + ) + if chunk_scores is None: + return None + ref_scores.update(chunk_scores) + + return ref_scores + + def _process_final_chunk_with_title( + self, + content: str, + expanded_refs: List[str], + title: str, + ) -> Optional[Dict[str, Any]]: + """Process final chunk and get title scores.""" + start = self._last_regular_start( + len(expanded_refs), self.chunk_size, self.MAX_CHUNK_OVERLAP + ) + final_chunk = expanded_refs[start:] + + # prompt = self._create_final_chunk_prompt(content,final_chunk,title) + result = self._grade_refs_resilient( + content=content, + refs=final_chunk, + with_title=True, + sheet_title=title + ) + + if result is None: + return None + + data, _ = result + return data + + def get_gpt_scores( + self, + content: str, + expanded_refs: List[str], + title: str, + ) -> Optional[Dict[str, Any]]: + """Get GPT scores for references and title.""" + # Process reference chunks + ref_scores = self._process_reference_chunks(content, expanded_refs) + if ref_scores is None: + return None + + # Process final chunk with title + final_data = self._process_final_chunk_with_title( + content, expanded_refs, title + ) + if final_data is None: + return None + + # Combine scores + final_chunk_scores = final_data.get(self.REF_SCORES_FIELD, {}) + ref_scores.update(final_chunk_scores) + + # Normalize to percentages + score_percentages = self._normalize_scores_to_percentages( + score_levels=ref_scores, + sheet_tokens=self._count_tokens(content) + ) + + # Validate title score + title_level = self._validate_score_level( + final_data.get(self.TITLE_INTEREST_FIELD), + self.TITLE_INTEREST_FIELD + ) + + return { + self.LANGUAGE_FIELD: final_data.get( + self.LANGUAGE_FIELD, LanguageCode.DEFAULT + ), + self.REF_LEVELS_FIELD: ref_scores, + self.REF_SCORES_FIELD: score_percentages, + self.TITLE_INTEREST_FIELD: title_level, + self.TITLE_INTEREST_REASON_FIELD: final_data.get( + self.TITLE_INTEREST_REASON_FIELD, "" + ), + } + + def _truncate_to_token_budget(self, text: str, max_tokens: int) -> str: + """Truncate text to fit within token budget using LLM summarization.""" + if self._count_tokens(text) <= max_tokens: + return text + try: + prompt = f""" + Compress the following commentary to ≤ {max_tokens} tokens. + Keep every reference tag like "Genesis 1:1" or "Exodus 2:5". + Use clear sentences; preserve main ideas. 
+ + {text} + """ + summary = self.summarizer( + [HumanMessage(content=prompt)] + ).content.strip() + + if self._count_tokens(summary) <= max_tokens: + return summary + else: + # Fallback: character-based truncation + return summary[:max_tokens * self.DEFAULT_TOKEN_CHAR_RATIO] + + except Exception as e: + logger.error(f"Summarization failed: {e}") + # Fallback: character-based truncation + return text[:max_tokens * self.DEFAULT_TOKEN_CHAR_RATIO] + + def create_failure_output(self, sheet_id: str, request_status_message: str) -> ( + SheetScoringOutput): + """Create a standardized failure output when sheet processing cannot + be completed.""" + return SheetScoringOutput( + sheet_id=sheet_id, + processed_datetime=str(datetime.utcnow()), + language="", + title_interest_level=0, + title_interest_reason="", + creativity_score=0, + ref_levels={}, + ref_scores={}, + request_status=RequestStatusOptions.FAILURE, + request_status_message=request_status_message + ) + + def process_sheet_by_content(self, + sheet_id: str, + expanded_refs: List[str], + title: str, + sources: List[Dict[str, Union[str, Dict[str, str]]]], + add_full_commentary=False) -> SheetScoringOutput: + """Score a single sheet based on its content.""" + if not expanded_refs: + request_status_message = f"No expanded refs for sheet {sheet_id}, skipping" + logger.info(request_status_message) + return self.create_failure_output(sheet_id, + request_status_message=request_status_message) + text_views = sheet_to_text_views(title=title, sources=sources, default_lang=LanguageCode.DEFAULT) + no_quotes_content = text_views["no_quotes"] + full_content = text_views["with_quotes"] + has_original = text_views["has_original"] + creativity_score = text_views["creativity_score"] + + # Check for original content and reference limits + if (not has_original or + len(expanded_refs) > self.max_ref_to_process): + logger.info(f"Sheet {sheet_id}: using equal distribution") + score_percentages = {ref: 0 for ref in expanded_refs} + title_info = self._get_title_info(title) + + return SheetScoringOutput(sheet_id=sheet_id, + ref_levels=score_percentages, + ref_scores=score_percentages, + processed_datetime=str(datetime.utcnow()), + creativity_score=creativity_score, + title_interest_level=title_info[self.TITLE_INTEREST_FIELD], + title_interest_reason=title_info[self.TITLE_INTEREST_REASON_FIELD], + language=title_info[self.LANGUAGE_FIELD], + request_status=RequestStatusOptions.SUCCESS, + request_status_message="The sheet has no user generated content" + ) + + content = self._sheet_to_text( + no_quotes_content=no_quotes_content, + full_content=full_content, + max_tokens=self.max_prompt_tokens - self.token_margin, + add_full_commentary=add_full_commentary) + # Process with GPT + gpt_analysis = self.get_gpt_scores(content, expanded_refs, title) + if not gpt_analysis: + request_status_message = f"Failed to get GPT scores for sheet {sheet_id}" + logger.error(request_status_message) + return self.create_failure_output(sheet_id=sheet_id, + request_status_message=request_status_message) + + return SheetScoringOutput( + sheet_id=sheet_id, + ref_levels=gpt_analysis[self.REF_LEVELS_FIELD], + ref_scores=gpt_analysis[self.REF_SCORES_FIELD], + processed_datetime=str(datetime.utcnow()), + creativity_score=creativity_score, + title_interest_level=gpt_analysis[self.TITLE_INTEREST_FIELD], + title_interest_reason=gpt_analysis[self.TITLE_INTEREST_REASON_FIELD], + language=gpt_analysis[self.LANGUAGE_FIELD], + request_status=RequestStatusOptions.SUCCESS, + request_status_message="" + ) diff 
--git a/app/sheet_scoring/sheet_scoring.py b/app/sheet_scoring/sheet_scoring.py new file mode 100644 index 0000000..2fbb481 --- /dev/null +++ b/app/sheet_scoring/sheet_scoring.py @@ -0,0 +1,14 @@ +from sheet_scoring.openai_sheets_scorer import SheetScorer +import os +from sefaria_llm_interface.sheet_scoring import ( + SheetScoringInput, + SheetScoringOutput, +) + + +def score_one_sheet(inp: SheetScoringInput) -> SheetScoringOutput: + with SheetScorer(api_key=os.getenv("OPENAI_API_KEY")) as scorer: + return scorer.process_sheet_by_content(sheet_id=inp.sheet_id, + title=inp.title, + sources=inp.sources, + expanded_refs=inp.expanded_refs) \ No newline at end of file diff --git a/app/sheet_scoring/tasks.py b/app/sheet_scoring/tasks.py new file mode 100644 index 0000000..eb4aa55 --- /dev/null +++ b/app/sheet_scoring/tasks.py @@ -0,0 +1,13 @@ +from celery import shared_task +from sheet_scoring.sheet_scoring import score_one_sheet +from sefaria_llm_interface.sheet_scoring import ( + SheetScoringInput +) +from dataclasses import asdict + + +@shared_task(name='llm.score_sheet') +def score_sheet_task(raw_input: dict) -> dict: + inp = SheetScoringInput(**raw_input) + out = score_one_sheet(inp) + return asdict(out) \ No newline at end of file diff --git a/app/sheet_scoring/text_utils.py b/app/sheet_scoring/text_utils.py new file mode 100644 index 0000000..551e282 --- /dev/null +++ b/app/sheet_scoring/text_utils.py @@ -0,0 +1,114 @@ +import re +from typing import Dict, List, Union, Any + +TOKEN_RE = re.compile(r"\b\w+\b", re.UNICODE) + + +def strip_html(raw: str) -> str: + """Remove tags & entities, collapse whitespace.""" + if not raw: + return "" + return '\n'.join([' '.join(line.split()) for line in raw.split('\n')]) + + +def token_count(text: str) -> int: + """Approximate word tokens (both English & Hebrew).""" + return len(TOKEN_RE.findall(text)) + + +def sheet_to_text_views(title: str, + sources: List[Dict[str, Union[str, Dict[str, str]]]], + default_lang: str = "en") -> Dict[str, Any]: + """ + Build three plain‑text snapshots of a Sefaria sheet **and** compute a + creativity score. 
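+
+    The creativity score is original_tokens / (original_tokens + quoted_tokens),
+    counted with a simple word-token regex. For example (hypothetical counts),
+    a sheet with 300 words of title/commentary and 700 words of quoted
+    canonical text scores 0.3.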
+ + Returns + ------- + quotes_only str – ref + canonical text blocks + no_quotes str – title & user commentary, refs only for quotes + with_quotes str – full sheet (title, commentary, *and* quotes) + has_original bool – True if any user commentary exists + creativity_score float – user_token_count / total_token_count + """ + + quotes: List[str] = [] + no_quotes: List[str] = [] + with_quotes: List[str] = [] + + original_tokens = 0 + quoted_tokens = 0 + has_original = False + + if title: + tok = token_count(title) + original_tokens += tok + no_quotes.append(title) + with_quotes.append(title) + + for blk in sources: + # --- outsideText (single‑lang commentary) + if "outsideText" in blk: + txt = strip_html(blk["outsideText"]).strip() + if txt: + has_original = True + t = token_count(txt) + original_tokens += t + no_quotes.append(txt) + with_quotes.append(txt) + + if "outsideBiText" in blk: + for lang in ("en", "he"): + txt = strip_html(blk["outsideBiText"].get(lang, "")).strip() + if txt: + has_original = True + original_tokens += token_count(txt) + no_quotes.append(txt) + with_quotes.append(txt) + + if "text" in blk: + ref = blk.get("ref", "").strip() + canon = strip_html(blk["text"].get(default_lang, "")).strip() + + # show ref label in all views + if ref: + no_quotes.append(ref) + header = f"{ref}:" + else: + header = "" + + if canon: + # quote tokens count toward quoted_tokens + qtok = token_count(canon) + quoted_tokens += qtok + + # add to quotes‑only and with_quotes + if header: + quotes.append(header) + with_quotes.append(header) + quotes.append(canon) + with_quotes.append(canon) + + if "comment" in blk: + txt = strip_html(blk["comment"]).strip() + if txt: + has_original = True + original_tokens += token_count(txt) + no_quotes.append(txt) + with_quotes.append(txt) + + joiner = "\n\n" + quotes_only = joiner.join(quotes) + commentary = joiner.join(no_quotes) + full_sheet = joiner.join(with_quotes) + + total_tokens = original_tokens + quoted_tokens or 1 # avoid div‑by‑zero + creativity = original_tokens / total_tokens + + return { + "quotes_only": quotes_only, + "no_quotes": commentary, + "with_quotes": full_sheet, + "has_original": has_original, + "creativity_score": creativity + } \ No newline at end of file diff --git a/app/util/sentencizer.py b/app/util/sentencizer.py index 363876c..d76da17 100644 --- a/app/util/sentencizer.py +++ b/app/util/sentencizer.py @@ -74,7 +74,7 @@ def claude_sentencizer_first_sentence(text): from basic_langchain.chat_models import ChatAnthropic from basic_langchain.schema import SystemMessage, HumanMessage from util.general import get_by_xml_tag - system = SystemMessage(content="Given a text discussing Torah topics will little to no punctuation, " + system = SystemMessage(content="Given a text discussing Torah topics with little to no punctuation, " "output the first sentence. Input is in tags. The first sentence " "should be output verbatim as it appears in wrapped in " " tags. Since the input text has no punctuation, use your judgement as to where the first sentence ends. Prefer smaller sentences.") diff --git a/build/entrypoint.sh b/build/entrypoint.sh index fbb070e..7c5136d 100644 --- a/build/entrypoint.sh +++ b/build/entrypoint.sh @@ -1,3 +1,3 @@ #!/bin/bash -celery -A celery_setup.app worker -Q ${QUEUE_NAME} -l INFO --concurrency 50 \ No newline at end of file +celery -A celery_setup.app worker -Q ${QUEUE_NAME} -l INFO --concurrency 4 \ No newline at end of file