diff --git a/app/celery_setup/app.py b/app/celery_setup/app.py
index 0b50834..d123eeb 100644
--- a/app/celery_setup/app.py
+++ b/app/celery_setup/app.py
@@ -3,4 +3,4 @@
app = Celery('llm')
app.conf.update(**generate_config_from_env())
-app.autodiscover_tasks(packages=['topic_prompt'])
+app.autodiscover_tasks(packages=['topic_prompt', 'sheet_scoring'])
diff --git a/app/llm_interface/sefaria_llm_interface/sheet_scoring/__init__.py b/app/llm_interface/sefaria_llm_interface/sheet_scoring/__init__.py
new file mode 100644
index 0000000..f0aaabe
--- /dev/null
+++ b/app/llm_interface/sefaria_llm_interface/sheet_scoring/__init__.py
@@ -0,0 +1,3 @@
+from sefaria_llm_interface.sheet_scoring.sheet_scoring_input import *
+from sefaria_llm_interface.sheet_scoring.sheet_scoring_output import *
+
diff --git a/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_input.py b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_input.py
new file mode 100644
index 0000000..e6fcb9d
--- /dev/null
+++ b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_input.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+from typing import List, Dict, Union
+
+
+@dataclass
+class SheetScoringInput:
+    # string version of the sheet id
+ sheet_id: str
+ title: str
+ sources: List[Dict[str, Union[str, Dict[str, str]]]]
+ expanded_refs: List[str]
+
diff --git a/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_output.py b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_output.py
new file mode 100644
index 0000000..22f5ba6
--- /dev/null
+++ b/app/llm_interface/sefaria_llm_interface/sheet_scoring/sheet_scoring_output.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import Dict
+from datetime import datetime
+
+
+@dataclass
+class SheetScoringOutput:
+ sheet_id: str
+ processed_datetime: str
+ language: str
+ title_interest_level: int
+ title_interest_reason: str
+ creativity_score: float
+ ref_levels: Dict[str, int]
+ ref_scores: Dict[str, float]
+ request_status: int
+ request_status_message: str
+
+ def __post_init__(self):
+ if isinstance(self.processed_datetime, datetime):
+ self.processed_datetime = self.processed_datetime.isoformat()
\ No newline at end of file
diff --git a/app/requirements.txt b/app/requirements.txt
index 9d11f97..83e0398 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -1,8 +1,11 @@
-langchain[llms]~=0.2.1
+langchain==0.2.1
+langchain-core==0.2.2
+langchain-openai==0.1.8
langsmith~=0.1.0
anthropic~=0.26.1
stanza~=1.5.0
openai~=1.30.0
+httpx~=0.27.0
typer~=0.4.1
pydantic~=2.7.1
loguru~=0.7.2
@@ -10,9 +13,9 @@ tqdm~=4.66.1
celery[redis]~=5.2.7
diff-match-patch
dnspython~=2.5.0
-tiktoken~=0.4.0
+tiktoken
readability_lxml
tenacity==8.3.0
requests
numpy
-git+https://github.com/Sefaria/LLM@v1.0.3#egg=sefaria_llm_interface&subdirectory=app/llm_interface
+git+https://github.com/Sefaria/LLM@v1.3.6#egg=sefaria_llm_interface&subdirectory=app/llm_interface
diff --git a/app/sheet_scoring/README.md b/app/sheet_scoring/README.md
new file mode 100644
index 0000000..1c789fe
--- /dev/null
+++ b/app/sheet_scoring/README.md
@@ -0,0 +1,231 @@
+# SheetScorer - Jewish Study Sheet Analysis Tool
+
+**SheetScorer** is a Python tool that uses **LLMs** to automatically analyze
+and score Jewish study sheets for reference relevance and title interest.
+It processes sheets, evaluates how well each cited reference
+is discussed, and assigns engagement scores to sheet titles.
+
+## Scores Extracted
+
+- **Reference Discussion Scoring**: Analyzes how thoroughly each reference is discussed (**0-4 scale**)
+- **Title Interest Scoring**: Evaluates how engaging sheet titles are to potential readers (**0-4 scale**)
+- **Creativity Assessment**: Computes creativity scores based on percentage of **user-generated content**.
+- **Title Interest Reason**: Explanation of title scoring.
+- **Language**: Language of the sheet (all languages are supported, not only `he` and `en`).
+
+## Quick Start
+
+```python
+from sheet_scoring.sheet_scoring import score_one_sheet
+from sefaria_llm_interface.sheet_scoring import SheetScoringInput
+
+input_data = SheetScoringInput(
+ sheet_id="123",
+ title="Understanding Genesis Creation",
+ expanded_refs=["Genesis 1:1", "Genesis 1:2"],
+ sources=[
+ {"outsideText": "This commentary explores..."},
+ {"ref": "Genesis 1:1", "text": {"en": "In the beginning..."}, "comment": "Analysis here..."}
+ ]
+)
+
+result = score_one_sheet(input_data)
+print(f"Title score: {result.title_interest_level}")
+print(f"Ref scores: {result.ref_scores}")
+print(result)
+```
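+
+The same scoring is also exposed as a Celery task (`llm.score_sheet` in
+`tasks.py`). A minimal sketch of asynchronous usage, reusing `input_data` from
+above and assuming the Celery broker/result backend are configured via the
+environment (as in `celery_setup.app`):
+
+```python
+from dataclasses import asdict
+from sheet_scoring.tasks import score_sheet_task
+
+# Queue the job; the task takes and returns plain dicts.
+async_result = score_sheet_task.delay(asdict(input_data))
+scores = async_result.get(timeout=120)  # dict form of SheetScoringOutput
+print(scores["title_interest_level"], scores["ref_scores"])
+```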
+
+## Scoring System
+
+### Architecture
+
+#### sheet_scoring (package)
+- `sheet_scoring.py` - Main API with the `score_one_sheet()` function
+- `tasks.py` - Celery task wrapper for async processing
+- `text_utils.py` - Content parsing and token counting utilities
+- `openai_sheets_scorer.py` - Core LLM scoring engine
+- `README.md`
+
+### Reference Discussion Levels
+
+The tool evaluates how well each reference is discussed using a **0-4 scale**:
+
+| Level | Description |
+|-------|-------------|
+| **0 - Not Discussed** | Reference is **quoted only**, no discussion or commentary |
+| **1 - Minimal** | Mentioned only through **neighboring verses**, minimal engagement |
+| **2 - Moderate** | Some discussion present with **basic commentary** |
+| **3 - Significant** | **Substantial discussion** with detailed commentary |
+| **4 - Central** | Reference is a **central focus** of the entire sheet |
+
+### Title Interest Levels
+
+Sheet titles are scored for **user engagement** on a **0-4 scale**:
+
+| Level | Description |
+|-------|-------------|
+| **0 - Not Interesting** | **Off-topic** or unengaging for target users |
+| **1 - Slight Relevance** | **Low appeal**, users unlikely to engage |
+| **2 - Somewhat Interesting** | Users might **skim**, moderate appeal |
+| **3 - Interesting** | Users **likely to open** and read |
+| **4 - Very Compelling** | **Must-read content**, high engagement expected |
+
+### Creativity Score
+
+Computed as `user_tokens / total_tokens`; higher values mean more original (user-written) content relative to canonical quotes.
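+
+Illustrative computation (the token counts here are hypothetical):
+
+```python
+user_tokens = 320    # tokens in user-written commentary and the title
+quote_tokens = 180   # tokens in canonical quotations
+creativity_score = user_tokens / (user_tokens + quote_tokens)  # 0.64
+```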
+
+### Language
+ISO-639-1 language code of the sheet. If the sheet has no user-generated content, the language of the title is used instead.
+
+## Data Structures
+#### Input (SheetScoringInput)
+
+```python
+{
+ "sheet_id": "123",
+ "title": "Sheet title",
+ "expanded_refs": ["Genesis 1:1", "Exodus 2:3"],
+ "sources": [
+ {"outsideText": "User commentary"},
+ {"outsideBiText": {"en": "English", "he": "Hebrew"}},
+ {"ref": "Genesis 1:1", "text": {"en": "Quote"}, "comment": "Analysis"}
+ ]
+}
+```
+#### Output (SheetScoringOutput)
+```python
+{
+ "sheet_id": "123",
+ "ref_levels": {"Genesis 1:1": 3, "Exodus 2:3": 2}, # Raw 0-4 scores
+ "ref_scores": {"Genesis 1:1": 60.0, "Exodus 2:3": 40.0}, # Normalized %
+ "title_interest_level": 3,
+ "title_interest_reason": "Compelling theological question",
+ "language": "en",
+ "creativity_score": 0.75,
+ "processed_datetime": "2025-01-31T10:30:00Z",
+ "request_status": 1, # 1=success, 0=failure
+ "request_status_message": ""
+}
+```
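+
+`ref_levels` are converted to `ref_scores` roughly as in this simplified sketch
+(the size penalty that `_normalize_scores_to_percentages` applies to very short
+sheets is omitted here):
+
+```python
+levels = {"Genesis 1:1": 3, "Exodus 2:3": 2}
+total = sum(levels.values()) or 1
+ref_scores = {ref: round(level * 100 / total, 2) for ref, level in levels.items()}
+# {'Genesis 1:1': 60.0, 'Exodus 2:3': 40.0}
+```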
+
+## Configuration Options
+
+### Initialization Parameters
+
+```python
+with SheetScorer(
+ api_key=os.getenv("OPENAI_API_KEY"),
+ model="gpt-4o-mini", # Default model
+ max_prompt_tokens=128000, # Input token budget
+ token_margin=16384, # Reserved for output
+ max_ref_to_process=800, # Max num of refs that can be processed
+ chunk_size=80 # Refs per LLM call
+) as scorer:
+ result = scorer.process_sheet_by_content(...)
+```
+
+The constants `DEFAULT_MAX_OUTPUT_TOKENS` and `DEFAULT_MAX_INPUT_OUTPUT_TOKENS` are
+model-specific; consult your model's documentation for the correct values.
+
+## Content Processing Strategy
+
+The tool uses a **configurable, all-or-nothing approach** for canonical quotations (see the sketch after this list):
+
+1. **Always includes** all user commentary and **original content**
+2. **Conditionally includes** canonical quotes only if the **entire bundle** fits within token limits
+and **add_full_commentary is set to True**
+3. **Truncates intelligently** using **LLM summarization** when content exceeds limits
+
+ 1. ***LLM Summarization***: Uses secondary LLM to compress content while preserving key information
+ 2. ***Reference Preservation***: Maintains all biblical reference tags during compression
+ 3. ***Character Fallback***: Falls back to character-based truncation if summarization fails
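+
+A minimal sketch of the selection rule (simplified from `_sheet_to_text`;
+`count_tokens` and `summarize_or_truncate` stand in for the scorer's helpers):
+
+```python
+def choose_content(no_quotes, with_quotes, max_tokens, add_full_commentary):
+    """Pick what to send to the LLM: all-or-nothing for canonical quotes."""
+    if add_full_commentary and count_tokens(with_quotes) <= max_tokens:
+        return with_quotes                                   # everything fits
+    if count_tokens(no_quotes) >= max_tokens:                # commentary alone too big
+        return summarize_or_truncate(no_quotes, max_tokens)  # LLM summary, then chars
+    return no_quotes                                         # commentary + ref labels only
+```
+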
+## Grading Strategy
+Processed content is sent to the LLM together with the references for grading:
+
+### Resilient Grading List Processing
+
+- **Chunking**: Large reference lists are processed in **chunks** to stay within model limits
+- **Overlap Handling**: Smart overlap between chunks prevents **reference boundary issues**
+
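+A sketch of how the chunk boundaries are computed (simplified from `chunk_list`
+and `_last_regular_start`; `MAX_CHUNK_OVERLAP` acts as a tolerance so that short
+reference lists are scored in a single chunk together with the title):
+
+```python
+def plan_chunks(refs, chunk_size=80, overlap=10):
+    """Middle chunks of `chunk_size` refs plus a final chunk scored with the title."""
+    if len(refs) <= chunk_size + overlap:
+        final_start = 0                       # small lists: one chunk for everything
+    else:
+        final_start = len(refs) - chunk_size  # final chunk is the last `chunk_size` refs
+    middle = refs[:final_start]
+    middle_chunks = [middle[i:i + chunk_size]
+                     for i in range(0, len(middle), chunk_size)]
+    return middle_chunks, refs[final_start:]
+```
+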
+### Resilient Reference Grading
+
+- **Primary attempt**: Process **all references together**
+- **Fallback**: Split reference list in **half** and process **recursively**
+- **Final fallback**: Assign **default score of 0** to problematic references
+
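+The fallback behaves roughly like this sketch (simplified from
+`_grade_refs_resilient`; title scoring and error details are omitted, and
+`score_with_llm` stands in for the actual LLM call):
+
+```python
+def grade_refs_resilient(content, refs):
+    if not refs:
+        return {}
+    try:
+        return score_with_llm(content, refs)  # primary attempt: all refs at once
+    except Exception:
+        if len(refs) == 1:                    # nothing left to split
+            return {refs[0]: 0}               # final fallback: default score
+        mid = len(refs) // 2                  # split in half and recurse
+        left = grade_refs_resilient(content, refs[:mid])
+        right = grade_refs_resilient(content, refs[mid:])
+        return {**left, **right}
+```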
+
+### Resilient score extraction
+
+Uses **OpenAI's function calling** feature with **strict schemas**:
+
+#### Middle Chunk Scoring Schema
+```python
+{
+ "name": "score_references",
+ "parameters": {
+ "ref_levels": {
+ "Genesis 1:1": {"type": "integer", "minimum": 0, "maximum": 4},
+ # ... for each reference
+ }
+ }
+}
+```
+
+#### Title Scoring Schema
+```python
+{
+ "name": "score_title",
+ "parameters": {
+ "language": {"type": "string"},
+ "title_interest_level": {"type": "integer", "minimum": 0, "maximum": 4},
+ "title_interest_reason": {"type": "string", "maxLength": 100}
+ }
+}
+```
+
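+Each schema is passed to the model via forced function calling, roughly as in
+`_invoke_llm_with_function` (here `llm` is the `ChatOpenAI` instance and
+`schema` is one of the dicts above):
+
+```python
+import json
+from langchain.schema import HumanMessage
+
+response = llm.invoke(
+    [HumanMessage(content=prompt)],
+    functions=[schema],
+    function_call={"name": schema["name"]},  # force the model to call this function
+)
+arguments = json.loads(response.additional_kwargs["function_call"]["arguments"])
+```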
+
+## Database Integration
+
+Designed for **MongoDB integration** with expected document structure:
+
+```python
+{
+ "id": "unique id",
+ "title": "Sheet Title",
+ "expandedRefs": ["Genesis 1:1", "Exodus 2:3"],
+ # Additional sheet content fields...
+}
+```
+
+## Output Fields
+
+| Field | Description |
+|------------------------------|------------------------------------------------|
+| **`ref_levels`** | Raw **0-4 scores** for each reference |
+| **`ref_scores`** | **Normalized percentage scores** (sum to 100%) |
+| **`title_interest_level`** | Title **engagement score** (0-4) |
+| **`title_interest_reason`** | **Brief explanation** of title score |
+| **`language`** | **Detected language code** |
+| **`creativity_score`** | **Percentage** of user-generated content |
+| **`processed_datetime`** | **Processing timestamp** |
+| **`request_status`**         | **Whether scoring succeeded or failed**        |
+| **`request_status_message`** | **Reason for failure (empty on success)**      |
+
+
+
+
+## Logging
+
+**Comprehensive logging** for monitoring and debugging:
+
+- **Info**: Processing decisions and **content statistics**
+- **Warning**: **Score validation** and fallback usage
+- **Error**: **LLM failures** and processing errors
+
+Configure logging level as needed:
+```python
+import logging
+logging.getLogger('sheet_scorer').setLevel(logging.INFO)
+```
+
+
diff --git a/app/sheet_scoring/__init__.py b/app/sheet_scoring/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/sheet_scoring/openai_sheets_scorer.py b/app/sheet_scoring/openai_sheets_scorer.py
new file mode 100644
index 0000000..740146e
--- /dev/null
+++ b/app/sheet_scoring/openai_sheets_scorer.py
@@ -0,0 +1,803 @@
+import json
+import logging
+from datetime import datetime
+from enum import IntEnum
+from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
+import textwrap
+import tiktoken
+import httpx
+from langchain.schema import HumanMessage
+from langchain_openai import ChatOpenAI
+from sheet_scoring.text_utils import sheet_to_text_views
+from sefaria_llm_interface.sheet_scoring import SheetScoringOutput
+# Configure logging
+logger = logging.getLogger(__name__)
+
+
+class IncompleteScoreError(Exception):
+ """Raised when LLM JSON is valid but doesn’t cover every reference."""
+ pass
+
+
+class RequestStatusOptions(IntEnum):
+ """Enumeration for tracking the status of LLM processing requests."""
+ SUCCESS = 1
+ FAILURE = 0
+
+
+class ScoreLevel(IntEnum):
+ """Reference discussion and title interest levels."""
+ NOT_DISCUSSED = 0
+ MINIMAL = 1
+ MODERATE = 2
+ SIGNIFICANT = 3
+ CENTRAL = 4
+
+
+class LanguageCode:
+ """Supported language codes."""
+ ENGLISH = 'en'
+ HEBREW = 'he'
+ DEFAULT = ENGLISH
+
+
+class SheetScorer:
+ """
+ Scores Jewish study sheets for reference relevance and title interest using LLMs,
+ computes creativity score based on percentage of user generated content.
+
+ This class processes sheets from MongoDB, analyzes their content using OpenAI's GPT models,
+ and assigns scores for how well each reference is discussed and how interesting
+ the sheet title is to users.
+ """
+
+    # Configuration constants:
+    # - DEFAULT_MAX_INPUT_OUTPUT_TOKENS: total tokens (prompt + response) allowed
+    #   in one API call. Lowering this shrinks the available context; raising it
+    #   risks exceeding the model's limit.
+    # - DEFAULT_MAX_OUTPUT_TOKENS: cap on how many tokens the model may generate.
+    #   Too low and responses may be cut off; too high wastes quota.
+    # - DEFAULT_CHUNK_SIZE: how many references to score in each batch. Larger
+    #   chunks use more context (better global view) but may exceed token budgets.
+    # - MAX_CHUNK_OVERLAP: how many refs to repeat between chunks. More overlap
+    #   reduces missed-edge-case errors at the cost of redundant API calls.
+    # - DEFAULT_MAX_REFS_TO_PROCESS: total refs before falling back to
+    #   equal-distribution scoring. Hitting this limit skips heavy LLM work to
+    #   avoid runaway costs.
+    # - DEFAULT_TOKEN_CHAR_RATIO: fallback characters-per-token estimate when
+    #   encoding fails. Tweak if actual token counts diverge significantly from
+    #   this estimate.
+ DEFAULT_MAX_OUTPUT_TOKENS = 16384
+ DEFAULT_CHUNK_SIZE = 80
+ DEFAULT_MAX_INPUT_OUTPUT_TOKENS = 128000
+ DEFAULT_MAX_REFS_TO_PROCESS = 800
+ DEFAULT_TOKEN_CHAR_RATIO = 3
+ MAX_CHUNK_OVERLAP = 10
+ # Database field names
+ REF_SCORES_FIELD = "ref_scores"
+ REF_LEVELS_FIELD = "ref_levels"
+ TITLE_INTEREST_FIELD = "title_interest_level"
+ LANGUAGE_FIELD = "language"
+ TITLE_INTEREST_REASON_FIELD = 'title_interest_reason'
+ PROCESSED_DATETIME_FIELD = "processed_datetime"
+ CREATIVITY_SCORE_FIELD = 'creativity_score'
+
+ # Valid score levels
+ VALID_LEVELS: Set[int] = {level.value for level in ScoreLevel}
+
+ def __init__(
+ self,
+ api_key: Optional[str],
+ model: str = "gpt-4o-mini",
+ max_prompt_tokens: int = DEFAULT_MAX_INPUT_OUTPUT_TOKENS,
+ token_margin: int = DEFAULT_MAX_OUTPUT_TOKENS,
+ max_ref_to_process: int = DEFAULT_MAX_REFS_TO_PROCESS,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+ ):
+ self.max_prompt_tokens = max_prompt_tokens
+ self.token_margin = token_margin
+ self.model = model
+ self.chunk_size = chunk_size
+ self.max_ref_to_process = max_ref_to_process
+ self._http_client_json = httpx.Client()
+ self._http_client_text = httpx.Client()
+ self.llm = self._create_json_llm(api_key, model)
+ self.summarizer = self._create_text_llm(api_key, model)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.close()
+ return False
+
+ def close(self):
+ """Close HTTP clients to release resources."""
+ self._http_client_json.close()
+ self._http_client_text.close()
+
+ def _create_json_llm(self, api_key: str, model: str) -> ChatOpenAI:
+ """Create LLM client for JSON responses."""
+ return ChatOpenAI(
+ model=model,
+ temperature=0,
+ top_p=0,
+ frequency_penalty=0,
+ presence_penalty=0,
+ seed=42,
+ api_key=api_key,
+ http_client=self._http_client_json,
+ )
+
+ def _create_text_llm(self, api_key: str, model: str) -> ChatOpenAI:
+ """Create LLM client for text responses."""
+ return ChatOpenAI(
+ model=model,
+ temperature=0,
+ model_kwargs={"response_format": {"type": "text"}},
+ api_key=api_key,
+ http_client=self._http_client_text,
+ )
+
+    def _invoke_llm_with_function(
+            self, prompt: str, function_schema: Dict[str, Any]
+    ) -> Dict[str, Any]:
+ """Invoke LLM using function calling instead of JSON mode."""
+ response = self.llm.invoke(
+ [HumanMessage(content=prompt)],
+ functions=[function_schema],
+ function_call={"name": function_schema["name"]}
+ )
+
+ function_call = getattr(response, "additional_kwargs", {}).get(
+ "function_call"
+ )
+ if function_call:
+ return json.loads(function_call["arguments"])
+
+ raise ValueError("No function call in response")
+
+    def _get_reference_scoring_function_schema(
+            self, expanded_refs: List[str]
+    ) -> Dict[str, Any]:
+ """Create function schema for reference scoring with exact reference
+ names."""
+ return {
+ "name": "score_references",
+ "description": "Score how well each reference is "
+ "discussed in the sheet",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ self.REF_LEVELS_FIELD: {
+ "type": "object",
+ "description": "Scores for each reference (0-4 scale)",
+ "properties": {
+ ref_name: {
+ "type": "integer",
+ "description": f"Discussion level for {ref_name}",
+ "minimum": 0,
+ "maximum": 4
+ }
+ for ref_name in expanded_refs
+ },
+ "required": expanded_refs,
+ "additionalProperties": False
+ }
+ },
+ "required": [self.REF_LEVELS_FIELD],
+ "additionalProperties": False
+ }
+ }
+
+ def _get_title_scoring_schema(self) -> Dict[str, Any]:
+        """Create function schema for title-interest scoring only."""
+ return {
+ "name": "score_title",
+ "description": "Score title interest for a Jewish study sheet",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ self.LANGUAGE_FIELD: {
+ "type": "string",
+ "description": "ISO-639-1 title language code",
+ },
+ self.TITLE_INTEREST_FIELD: {
+ "type": "integer",
+ "description": "How interesting the title is to "
+ "users (0-4 scale)",
+ "minimum": 0,
+ "maximum": 4
+ },
+ self.TITLE_INTEREST_REASON_FIELD: {
+ "type": "string",
+ "description": "Brief explanation of title interest "
+ "score (max 20 words)",
+ "maxLength": 100
+ }
+ },
+ "required": [self.LANGUAGE_FIELD, self.TITLE_INTEREST_FIELD,
+ self.TITLE_INTEREST_REASON_FIELD],
+ "additionalProperties": False
+ }
+ }
+
+    def _get_full_scoring_function_schema(
+            self, expanded_refs: List[str]
+    ) -> Dict[str, Any]:
+ """Create function schema for both reference and title scoring."""
+ return {
+ "name": "score_sheet",
+ "description": "Score references and title interest for a Jewish "
+ "study sheet",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ self.LANGUAGE_FIELD: {
+ "type": "string",
+                        "description": "ISO-639-1 code inferred from the "
+                                       "original user-written content",
+ },
+ self.REF_LEVELS_FIELD: {
+ "type": "object",
+ "description": "Scores for each reference (0-4 scale)",
+ "properties": {
+ ref_name: {
+ "type": "integer",
+ "description": f"Discussion level for {ref_name}",
+ "minimum": 0,
+ "maximum": 4
+ }
+ for ref_name in expanded_refs
+ },
+ "required": expanded_refs,
+ "additionalProperties": False
+ },
+ self.TITLE_INTEREST_FIELD: {
+ "type": "integer",
+ "description": "How interesting the title is to "
+ "users (0-4 scale)",
+ "minimum": 0,
+ "maximum": 4
+ },
+ self.TITLE_INTEREST_REASON_FIELD: {
+ "type": "string",
+ "description": "Brief explanation of title interest "
+ "score (max 20 words)",
+ "maxLength": 100
+ }
+ },
+ "required": [self.LANGUAGE_FIELD, self.REF_LEVELS_FIELD,
+ self.TITLE_INTEREST_FIELD,
+ self.TITLE_INTEREST_REASON_FIELD],
+ "additionalProperties": False
+ }
+ }
+
+ @staticmethod
+ def chunk_list(lst: List[Any], n: int) -> Iterator[List[Any]]:
+ """Yield successive n‑sized chunks from lst."""
+ for i in range(0, len(lst), n):
+ yield lst[i: i + n]
+
+ def _count_tokens(self, text: str) -> int:
+ """Rough token count; if no encoder, fall back to char heuristic."""
+ try:
+ encoding = tiktoken.encoding_for_model(self.model)
+ return len(encoding.encode(text))
+ except (KeyError, ValueError) as e:
+ logger.warning(
+ f"Could not get encoding for model {self.model}: {e}"
+ )
+ return len(text) // self.DEFAULT_TOKEN_CHAR_RATIO
+
+ def _invoke_llm(self, prompt: str) -> Dict[str, Any]:
+ """Invoke LLM with prompt and parse JSON response."""
+        response = self.llm.invoke([HumanMessage(content=prompt)])
+ return json.loads(response.content)
+
+ def _create_title_only_prompt_function(self, sheet_title: str) -> str:
+ return textwrap.dedent(
+ f"""You are scoring THE TITLE of a Jewish study sheet for how interesting it would be to users.
+
+ SHEET TITLE:
+ {sheet_title}
+
+ TASK: Return JSON with keys `title_interest_level` (0-4) and `title_interest_reason` ( < 20 words).
+
+ Title interest level (int 0–4):
+ 0: Not interesting / off‑topic for users
+ 1: Slight relevance, low pull
+ 2: Somewhat interesting; user might skim
+ 3: Interesting; user likely to open
+ 4: Very compelling / must‑open
+ """)
+
+ def _create_chunk_prompt_for_function(self, sheet_content: str,
+ expanded_refs: List[str]) -> str:
+ """Create prompt for function calling (no JSON format instructions
+ needed)."""
+ refs_md = "\n".join(f"- {r}" for r in expanded_refs)
+ return textwrap.dedent(
+ f"""
+ You are analyzing a Jewish study sheet. Rate how much each listed reference
+ is discussed or central in the sheet.
+
+ SHEET CONTENT:
+ {sheet_content}
+
+ REFERENCES TO EVALUATE:
+ {refs_md}
+
+ Scoring Scale (0-4):
+ {ScoreLevel.NOT_DISCUSSED}: Quoted only, no discussion
+ {ScoreLevel.MINIMAL}: Mentioned only through neighboring verses
+ {ScoreLevel.MODERATE}: Moderate discussion (some commentary)
+ {ScoreLevel.SIGNIFICANT}: Significant discussion (substantial commentary)
+ {ScoreLevel.CENTRAL}: Central focus of sheet
+
+ Score each reference based on how thoroughly it's discussed in the content."""
+ )
+
+ def _create_final_chunk_prompt_for_function(self, sheet_content: str,
+ expanded_refs: List[str],
+ sheet_title: str) -> str:
+ """Create prompt for final chunk with title scoring using function
+ calling."""
+ sheet_title_clean = sheet_title.strip() or "(untitled)"
+ refs_md = "\n".join(f"- {r}" for r in expanded_refs)
+
+ return textwrap.dedent(f"""
+ Analyze this Jewish study sheet and provide two types of scores:
+
+ SHEET TITLE: {sheet_title_clean}
+
+ SHEET CONTENT:
+ {sheet_content}
+
+ REFERENCES TO EVALUATE:
+ {refs_md}
+
+ TASKS:
+ 1. Reference Discussion Scoring (0-4):
+ {ScoreLevel.NOT_DISCUSSED}: Quoted only, no discussion
+ {ScoreLevel.MINIMAL}: Mentioned only through neighboring verses
+ {ScoreLevel.MODERATE}: Moderate discussion (some commentary)
+ {ScoreLevel.SIGNIFICANT}: Significant discussion (substantial commentary)
+ {ScoreLevel.CENTRAL}: Central focus of sheet
+
+ 2. Title Interest Scoring (0-4):
+ 0: Not interesting/off-topic
+ 1: Slight relevance, low appeal
+ 2: Somewhat interesting; user might skim
+ 3: Interesting; user likely to open
+ 4: Very compelling/must-open
+
+ Infer the language from the original user-written content.
+ """)
+
+ def _validate_score_level(self, score: Any,
+ field_name: str = "score") -> int:
+ """Validate and normalize score to valid range."""
+ if score not in self.VALID_LEVELS:
+ try:
+ score = int(score)
+ except (ValueError, TypeError):
+ logger.warning(
+ f"Invalid {field_name}: {score}, defaulting to 0"
+ )
+ return ScoreLevel.NOT_DISCUSSED
+
+ if score not in self.VALID_LEVELS:
+ clamped = max(
+ ScoreLevel.NOT_DISCUSSED,
+ min(ScoreLevel.CENTRAL, score)
+ )
+ logger.warning(
+ f"{field_name} {score} out of range, clamping to {clamped}"
+ )
+ return clamped
+
+ return score
+
+ def _sheet_to_text(
+ self,
+ no_quotes_content: str,
+ full_content: str,
+ max_tokens: int,
+ add_full_commentary: bool
+ ) -> str:
+ """
+ Build a text snapshot of the sheet with an *all‑or‑nothing* rule:
+ • Always include every bit of author commentary.
+ • Append *all* canonical quotations only if the whole bundle still
+ fits into `max_tokens`.
+ """
+        comm_tokens = self._count_tokens(no_quotes_content)
+        full_tokens = self._count_tokens(full_content)
+        if add_full_commentary and full_tokens <= max_tokens:
+            logger.info("Sending sheet with quotations to LLM")
+            return full_content
+
+        # Commentary alone is already bigger than the budget: truncate it.
+        if comm_tokens >= max_tokens:
+            logger.info("Truncating user commentaries")
+            return self._truncate_to_token_budget(no_quotes_content, max_tokens)
+        logger.info("Sending sheet without quotation texts to LLM")
+ return no_quotes_content
+
+ def _get_title_info(self, sheet_title: str) -> Dict[str, Any]:
+ """Obtain title-interest score ONLY (used when no content)."""
+ prompt = self._create_title_only_prompt_function(sheet_title)
+ try:
+ function_schema = self._get_title_scoring_schema()
+ data = self._invoke_llm_with_function(prompt, function_schema)
+ title_level = self._validate_score_level(
+ data.get(self.TITLE_INTEREST_FIELD),
+ self.TITLE_INTEREST_FIELD
+ )
+
+ return {
+ self.TITLE_INTEREST_FIELD:
+ title_level,
+ self.TITLE_INTEREST_REASON_FIELD:
+ data.get(self.TITLE_INTEREST_REASON_FIELD, ""),
+ self.LANGUAGE_FIELD: data.get(
+ self.LANGUAGE_FIELD, LanguageCode.DEFAULT
+ ),
+ }
+ except Exception as e:
+ logger.error(f"Title-only GPT attempt failed: {e}")
+ return {
+ self.TITLE_INTEREST_FIELD: ScoreLevel.NOT_DISCUSSED,
+ self.TITLE_INTEREST_REASON_FIELD: "LLM error",
+ self.LANGUAGE_FIELD: LanguageCode.DEFAULT
+ }
+
+ def _normalize_scores_to_percentages(
+ self,
+ sheet_tokens: int,
+ score_levels: Dict[str, int],
+ beta: float = 1500 # token mass where no penalty
+ ) -> Dict[str, float]:
+ """Convert reference scores to percentages with size penalty
+ for shorter sheets."""
+
+ total_level = sum(score_levels.values()) or 1
+ size_factor = min(1.0, sheet_tokens / beta) # clamp to 1
+
+ # small sheets (few tokens) → size_factor < 1 → percentages shrink
+ percentages = {
+ ref: round(level * 100 / total_level * size_factor, 2)
+ for ref, level in score_levels.items()
+ }
+
+ norm = sum(percentages.values()) or 1
+ percentages = {r: round(v * 100 / norm, 2) for r, v in
+ percentages.items()}
+ return percentages
+
+ def _grade_refs_resilient(
+ self,
+ content: str,
+ refs: List[str],
+ *,
+ with_title: bool = False,
+ sheet_title: str = ""
+ ) -> Tuple[Optional[Dict[str, Any]], Dict[str, int]]:
+ """
+ Fault-tolerant reference scoring using divide-and-conquer strategy.
+ Attempts to score all references at once via LLM. If that fails
+ (due to incomplete responses),
+ recursively splits the reference list in half and scores each
+ subset separately until all references have scores.
+ This prevents total failure when the LLM struggles with large
+ reference lists or encounters transient errors.
+
+ """
+ if not refs:
+ return {}, {}
+
+ try:
+ if with_title:
+ prompt = self._create_final_chunk_prompt_for_function(
+ content, refs, sheet_title
+ )
+ function_schema = self._get_full_scoring_function_schema(refs)
+ else:
+ prompt = self._create_chunk_prompt_for_function(content, refs)
+ function_schema = self._get_reference_scoring_function_schema(
+ refs
+ )
+ data, scores = self._get_gpt_ref_scores_function(
+ prompt, function_schema, refs
+ )
+ return data, scores
+ except Exception:
+ pass
+
+ # fallback branch
+ if len(refs) == 1: # nothing left to split
+ return {}, {refs[0]: ScoreLevel.NOT_DISCUSSED}
+
+ mid = len(refs) // 2
+ ld, ls = self._grade_refs_resilient(
+ content, refs[:mid],
+ with_title=with_title,
+ sheet_title=sheet_title
+ )
+ rd, rs = self._grade_refs_resilient(
+ content, refs[mid:],
+ with_title=with_title,
+ sheet_title=sheet_title
+ )
+ merged_scores = {**ls, **rs}
+ merged_data = ld or rd
+ return merged_data, merged_scores
+
+ def _get_gpt_ref_scores_function(self, prompt: str, function_schema,
+ expected_refs: List[str]):
+ """Calls the LLM with structured function schema, validates all
+ returned scores are in valid range (0-4), handles missing references,
+ and ensures exactly the expected references are scored."""
+ try:
+ data = self._invoke_llm_with_function(prompt, function_schema)
+ chunk_scores = data.get(self.REF_LEVELS_FIELD, {})
+ validated_scores = {}
+ for ref, score in chunk_scores.items():
+ validated_scores[ref] = self._validate_score_level(
+ score, f"ref_score[{ref}]"
+ )
+
+ # Check for missing references and assign default scores (0)
+ missing_refs = set(expected_refs) - set(validated_scores.keys())
+ if missing_refs:
+                logger.warning(
+                    f"GPT didn't return scores for {len(missing_refs)} references"
+                )
+                if len(missing_refs) < 5:
+                    logger.warning("Defaulting missing scores to zeros")
+ for ref in missing_refs:
+ validated_scores[ref] = ScoreLevel.NOT_DISCUSSED
+
+ else:
+ raise IncompleteScoreError(
+ f"Missing {len(missing_refs)} references"
+ )
+
+ # Ensure we only include expected references (in case GPT
+ # returned extras)
+ final_scores = {
+ ref: validated_scores.get(ref, ScoreLevel.NOT_DISCUSSED) for ref
+ in expected_refs}
+
+ data[self.REF_SCORES_FIELD] = final_scores
+ return data, final_scores
+
+ except IncompleteScoreError:
+ raise
+
+        except Exception as e:
+            logger.error(f"Chunk GPT failed: {e}")
+            raise
+
+ def _last_regular_start(self, n: int, chunk: int, overlap: int) -> int:
+ """
+ Return the index where the *final* chunk (with title) should start.
+ If the total length fits into one chunk plus the allowed overlap,
+ analyse everything together (start = 0).
+ """
+ if n <= chunk + overlap:
+ return 0
+ step = chunk - overlap
+ return max(0, n - chunk) if step <= 0 else (n - chunk)
+
+ def _process_reference_chunks(
+ self,
+ content: str,
+ expanded_refs: List[str]
+ ) -> Optional[Dict[str, int]]:
+ """Process reference chunks in batches."""
+ ref_scores: Dict[str, int] = {}
+
+ last_chunk_start = self._last_regular_start(
+ len(expanded_refs), self.chunk_size, self.MAX_CHUNK_OVERLAP
+ )
+
+ for chunk in self.chunk_list(
+ expanded_refs[:last_chunk_start], self.chunk_size
+ ):
+ _, chunk_scores = self._grade_refs_resilient(
+ content=content,
+ refs=chunk,
+ with_title=False
+ )
+ if chunk_scores is None:
+ return None
+ ref_scores.update(chunk_scores)
+
+ return ref_scores
+
+ def _process_final_chunk_with_title(
+ self,
+ content: str,
+ expanded_refs: List[str],
+ title: str,
+ ) -> Optional[Dict[str, Any]]:
+ """Process final chunk and get title scores."""
+ start = self._last_regular_start(
+ len(expanded_refs), self.chunk_size, self.MAX_CHUNK_OVERLAP
+ )
+ final_chunk = expanded_refs[start:]
+
+ result = self._grade_refs_resilient(
+ content=content,
+ refs=final_chunk,
+ with_title=True,
+ sheet_title=title
+ )
+
+ if result is None:
+ return None
+
+ data, _ = result
+ return data
+
+ def get_gpt_scores(
+ self,
+ content: str,
+ expanded_refs: List[str],
+ title: str,
+ ) -> Optional[Dict[str, Any]]:
+ """Get GPT scores for references and title."""
+ # Process reference chunks
+ ref_scores = self._process_reference_chunks(content, expanded_refs)
+ if ref_scores is None:
+ return None
+
+ # Process final chunk with title
+ final_data = self._process_final_chunk_with_title(
+ content, expanded_refs, title
+ )
+ if final_data is None:
+ return None
+
+ # Combine scores
+ final_chunk_scores = final_data.get(self.REF_SCORES_FIELD, {})
+ ref_scores.update(final_chunk_scores)
+
+ # Normalize to percentages
+ score_percentages = self._normalize_scores_to_percentages(
+ score_levels=ref_scores,
+ sheet_tokens=self._count_tokens(content)
+ )
+
+ # Validate title score
+ title_level = self._validate_score_level(
+ final_data.get(self.TITLE_INTEREST_FIELD),
+ self.TITLE_INTEREST_FIELD
+ )
+
+ return {
+ self.LANGUAGE_FIELD: final_data.get(
+ self.LANGUAGE_FIELD, LanguageCode.DEFAULT
+ ),
+ self.REF_LEVELS_FIELD: ref_scores,
+ self.REF_SCORES_FIELD: score_percentages,
+ self.TITLE_INTEREST_FIELD: title_level,
+ self.TITLE_INTEREST_REASON_FIELD: final_data.get(
+ self.TITLE_INTEREST_REASON_FIELD, ""
+ ),
+ }
+
+ def _truncate_to_token_budget(self, text: str, max_tokens: int) -> str:
+ """Truncate text to fit within token budget using LLM summarization."""
+ if self._count_tokens(text) <= max_tokens:
+ return text
+ try:
+ prompt = f"""
+ Compress the following commentary to ≤ {max_tokens} tokens.
+ Keep every reference tag like "Genesis 1:1" or "Exodus 2:5".
+ Use clear sentences; preserve main ideas.
+
+ {text}
+ """
+            summary = self.summarizer.invoke(
+                [HumanMessage(content=prompt)]
+            ).content.strip()
+
+ if self._count_tokens(summary) <= max_tokens:
+ return summary
+ else:
+ # Fallback: character-based truncation
+ return summary[:max_tokens * self.DEFAULT_TOKEN_CHAR_RATIO]
+
+ except Exception as e:
+ logger.error(f"Summarization failed: {e}")
+ # Fallback: character-based truncation
+ return text[:max_tokens * self.DEFAULT_TOKEN_CHAR_RATIO]
+
+    def create_failure_output(
+            self, sheet_id: str, request_status_message: str
+    ) -> SheetScoringOutput:
+ """Create a standardized failure output when sheet processing cannot
+ be completed."""
+ return SheetScoringOutput(
+ sheet_id=sheet_id,
+ processed_datetime=str(datetime.utcnow()),
+ language="",
+ title_interest_level=0,
+ title_interest_reason="",
+ creativity_score=0,
+ ref_levels={},
+ ref_scores={},
+ request_status=RequestStatusOptions.FAILURE,
+ request_status_message=request_status_message
+ )
+
+ def process_sheet_by_content(self,
+ sheet_id: str,
+ expanded_refs: List[str],
+ title: str,
+ sources: List[Dict[str, Union[str, Dict[str, str]]]],
+ add_full_commentary=False) -> SheetScoringOutput:
+ """Score a single sheet based on its content."""
+ if not expanded_refs:
+ request_status_message = f"No expanded refs for sheet {sheet_id}, skipping"
+ logger.info(request_status_message)
+ return self.create_failure_output(sheet_id,
+ request_status_message=request_status_message)
+ text_views = sheet_to_text_views(title=title, sources=sources, default_lang=LanguageCode.DEFAULT)
+ no_quotes_content = text_views["no_quotes"]
+ full_content = text_views["with_quotes"]
+ has_original = text_views["has_original"]
+ creativity_score = text_views["creativity_score"]
+
+ # Check for original content and reference limits
+        if (not has_original or
+                len(expanded_refs) > self.max_ref_to_process):
+            request_status_message = (
+                "The sheet has no user generated content" if not has_original
+                else f"Too many expanded refs ({len(expanded_refs)}) to process"
+            )
+            logger.info(f"Sheet {sheet_id}: using equal distribution "
+                        f"({request_status_message})")
+            score_percentages = {ref: 0 for ref in expanded_refs}
+ title_info = self._get_title_info(title)
+
+ return SheetScoringOutput(sheet_id=sheet_id,
+ ref_levels=score_percentages,
+ ref_scores=score_percentages,
+ processed_datetime=str(datetime.utcnow()),
+ creativity_score=creativity_score,
+ title_interest_level=title_info[self.TITLE_INTEREST_FIELD],
+ title_interest_reason=title_info[self.TITLE_INTEREST_REASON_FIELD],
+ language=title_info[self.LANGUAGE_FIELD],
+ request_status=RequestStatusOptions.SUCCESS,
+                                      request_status_message=request_status_message
+ )
+
+ content = self._sheet_to_text(
+ no_quotes_content=no_quotes_content,
+ full_content=full_content,
+ max_tokens=self.max_prompt_tokens - self.token_margin,
+ add_full_commentary=add_full_commentary)
+ # Process with GPT
+ gpt_analysis = self.get_gpt_scores(content, expanded_refs, title)
+ if not gpt_analysis:
+ request_status_message = f"Failed to get GPT scores for sheet {sheet_id}"
+ logger.error(request_status_message)
+ return self.create_failure_output(sheet_id=sheet_id,
+ request_status_message=request_status_message)
+
+ return SheetScoringOutput(
+ sheet_id=sheet_id,
+ ref_levels=gpt_analysis[self.REF_LEVELS_FIELD],
+ ref_scores=gpt_analysis[self.REF_SCORES_FIELD],
+ processed_datetime=str(datetime.utcnow()),
+ creativity_score=creativity_score,
+ title_interest_level=gpt_analysis[self.TITLE_INTEREST_FIELD],
+ title_interest_reason=gpt_analysis[self.TITLE_INTEREST_REASON_FIELD],
+ language=gpt_analysis[self.LANGUAGE_FIELD],
+ request_status=RequestStatusOptions.SUCCESS,
+ request_status_message=""
+ )
diff --git a/app/sheet_scoring/sheet_scoring.py b/app/sheet_scoring/sheet_scoring.py
new file mode 100644
index 0000000..2fbb481
--- /dev/null
+++ b/app/sheet_scoring/sheet_scoring.py
@@ -0,0 +1,14 @@
+from sheet_scoring.openai_sheets_scorer import SheetScorer
+import os
+from sefaria_llm_interface.sheet_scoring import (
+ SheetScoringInput,
+ SheetScoringOutput,
+)
+
+
+def score_one_sheet(inp: SheetScoringInput) -> SheetScoringOutput:
+ with SheetScorer(api_key=os.getenv("OPENAI_API_KEY")) as scorer:
+ return scorer.process_sheet_by_content(sheet_id=inp.sheet_id,
+ title=inp.title,
+ sources=inp.sources,
+ expanded_refs=inp.expanded_refs)
\ No newline at end of file
diff --git a/app/sheet_scoring/tasks.py b/app/sheet_scoring/tasks.py
new file mode 100644
index 0000000..eb4aa55
--- /dev/null
+++ b/app/sheet_scoring/tasks.py
@@ -0,0 +1,13 @@
+from celery import shared_task
+from sheet_scoring.sheet_scoring import score_one_sheet
+from sefaria_llm_interface.sheet_scoring import (
+ SheetScoringInput
+)
+from dataclasses import asdict
+
+
+@shared_task(name='llm.score_sheet')
+def score_sheet_task(raw_input: dict) -> dict:
+ inp = SheetScoringInput(**raw_input)
+ out = score_one_sheet(inp)
+ return asdict(out)
\ No newline at end of file
diff --git a/app/sheet_scoring/text_utils.py b/app/sheet_scoring/text_utils.py
new file mode 100644
index 0000000..551e282
--- /dev/null
+++ b/app/sheet_scoring/text_utils.py
@@ -0,0 +1,114 @@
+import html
+import re
+from typing import Dict, List, Union, Any
+
+TOKEN_RE = re.compile(r"\b\w+\b", re.UNICODE)
+TAG_RE = re.compile(r"<[^>]+>")
+
+
+def strip_html(raw: str) -> str:
+    """Remove tags & entities, collapse whitespace."""
+    if not raw:
+        return ""
+    text = html.unescape(TAG_RE.sub(" ", raw))
+    return '\n'.join(' '.join(line.split()) for line in text.split('\n'))
+
+
+def token_count(text: str) -> int:
+ """Approximate word tokens (both English & Hebrew)."""
+ return len(TOKEN_RE.findall(text))
+
+
+def sheet_to_text_views(title: str,
+ sources: List[Dict[str, Union[str, Dict[str, str]]]],
+ default_lang: str = "en") -> Dict[str, Any]:
+ """
+ Build three plain‑text snapshots of a Sefaria sheet **and** compute a
+ creativity score.
+
+ Returns
+ -------
+ quotes_only str – ref + canonical text blocks
+ no_quotes str – title & user commentary, refs only for quotes
+ with_quotes str – full sheet (title, commentary, *and* quotes)
+ has_original bool – True if any user commentary exists
+ creativity_score float – user_token_count / total_token_count
+ """
+
+ quotes: List[str] = []
+ no_quotes: List[str] = []
+ with_quotes: List[str] = []
+
+ original_tokens = 0
+ quoted_tokens = 0
+ has_original = False
+
+ if title:
+ tok = token_count(title)
+ original_tokens += tok
+ no_quotes.append(title)
+ with_quotes.append(title)
+
+ for blk in sources:
+ # --- outsideText (single‑lang commentary)
+ if "outsideText" in blk:
+ txt = strip_html(blk["outsideText"]).strip()
+ if txt:
+ has_original = True
+ t = token_count(txt)
+ original_tokens += t
+ no_quotes.append(txt)
+ with_quotes.append(txt)
+
+ if "outsideBiText" in blk:
+ for lang in ("en", "he"):
+ txt = strip_html(blk["outsideBiText"].get(lang, "")).strip()
+ if txt:
+ has_original = True
+ original_tokens += token_count(txt)
+ no_quotes.append(txt)
+ with_quotes.append(txt)
+
+ if "text" in blk:
+ ref = blk.get("ref", "").strip()
+ canon = strip_html(blk["text"].get(default_lang, "")).strip()
+
+ # show ref label in all views
+ if ref:
+ no_quotes.append(ref)
+ header = f"{ref}:"
+ else:
+ header = ""
+
+ if canon:
+ # quote tokens count toward quoted_tokens
+ qtok = token_count(canon)
+ quoted_tokens += qtok
+
+ # add to quotes‑only and with_quotes
+ if header:
+ quotes.append(header)
+ with_quotes.append(header)
+ quotes.append(canon)
+ with_quotes.append(canon)
+
+ if "comment" in blk:
+ txt = strip_html(blk["comment"]).strip()
+ if txt:
+ has_original = True
+ original_tokens += token_count(txt)
+ no_quotes.append(txt)
+ with_quotes.append(txt)
+
+ joiner = "\n\n"
+ quotes_only = joiner.join(quotes)
+ commentary = joiner.join(no_quotes)
+ full_sheet = joiner.join(with_quotes)
+
+ total_tokens = original_tokens + quoted_tokens or 1 # avoid div‑by‑zero
+ creativity = original_tokens / total_tokens
+
+ return {
+ "quotes_only": quotes_only,
+ "no_quotes": commentary,
+ "with_quotes": full_sheet,
+ "has_original": has_original,
+ "creativity_score": creativity
+ }
\ No newline at end of file
diff --git a/app/util/sentencizer.py b/app/util/sentencizer.py
index 363876c..d76da17 100644
--- a/app/util/sentencizer.py
+++ b/app/util/sentencizer.py
@@ -74,7 +74,7 @@ def claude_sentencizer_first_sentence(text):
from basic_langchain.chat_models import ChatAnthropic
from basic_langchain.schema import SystemMessage, HumanMessage
from util.general import get_by_xml_tag
- system = SystemMessage(content="Given a text discussing Torah topics will little to no punctuation, "
+ system = SystemMessage(content="Given a text discussing Torah topics with little to no punctuation, "
"output the first sentence. Input is in tags. The first sentence "
"should be output verbatim as it appears in wrapped in "
" tags. Since the input text has no punctuation, use your judgement as to where the first sentence ends. Prefer smaller sentences.")
diff --git a/build/entrypoint.sh b/build/entrypoint.sh
index fbb070e..7c5136d 100644
--- a/build/entrypoint.sh
+++ b/build/entrypoint.sh
@@ -1,3 +1,3 @@
#!/bin/bash
-celery -A celery_setup.app worker -Q ${QUEUE_NAME} -l INFO --concurrency 50
\ No newline at end of file
+celery -A celery_setup.app worker -Q ${QUEUE_NAME} -l INFO --concurrency 4
\ No newline at end of file