diff --git a/README.md b/README.md index 6b77932..08fcb51 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,44 @@ *** +## Technical Details + +### Content Generation Pipeline + +```mermaid +flowchart TD + A["BlogPostTitleSuggestion.generate_content()"] --> B["init_blog_post_content_generation()"] + B --> C["AI: generate outline section titles
(Introduction + middle sections + Conclusion)"] + C --> D["DB: create GeneratedBlogPost + GeneratedBlogPostSection rows"] + + D --> E{"For each middle section"} + E --> F["Task: generate research questions for section
(local: 1 question)"] + + F --> G{"For each research question"} + G --> H["Task: Exa search for links
(local: 2 links)"] + + H --> I{"For each research link"} + I --> J["Task: scrape link with Jina Reader"] + J --> K["Task: analyze link (AI)
summary + contextual summary + answer"] + + K --> L{"All links attempted/analyzed?"} + L -- no --> K + L -- yes --> M["Task: synthesize middle section contents (AI)"] + + M --> N{"All middle sections have content?"} + N -- yes --> O["Task: generate Introduction + Conclusion (AI)"] + N -- no --> M + + O --> P{"All sections (incl. intro/conclusion) have content?"} + P -- yes --> Q["Task: populate GeneratedBlogPost.content
(code: combine sections into final markdown)"] + P -- no --> O +``` + ## TOC - [Overview](#overview) +- [Technical Details](#technical-details) + - [Content Generation Pipeline](#content-generation-pipeline) - [TOC](#toc) - [Deployment](#deployment) - [Render](#render) @@ -39,9 +74,8 @@ [![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/rasulkireev/tuxseo) The only required env vars are: -- OPENAI_API_KEY -- TAVILY_API_KEY - GEMINI_API_KEY +- EXA_API_KEY - PERPLEXITY_API_KEY - JINA_READER_API_KEY - KEYWORDS_EVERYWHERE_API_KEY diff --git a/content_generation/__init__.py b/content_generation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/content_generation/admin.py b/content_generation/admin.py new file mode 100644 index 0000000..846f6b4 --- /dev/null +++ b/content_generation/admin.py @@ -0,0 +1 @@ +# Register your models here. diff --git a/content_generation/apps.py b/content_generation/apps.py new file mode 100644 index 0000000..f88ddd6 --- /dev/null +++ b/content_generation/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class ContentGenerationConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "content_generation" diff --git a/content_generation/migrations/__init__.py b/content_generation/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/content_generation/models.py b/content_generation/models.py new file mode 100644 index 0000000..4875573 --- /dev/null +++ b/content_generation/models.py @@ -0,0 +1,369 @@ +from urllib.request import urlopen + +import replicate +import requests +from django.conf import settings +from django.core.files.base import ContentFile +from django.db import models + +from core.agents import ( + create_insert_links_agent, +) +from core.agents.schemas import ( + GeneratedBlogPostSchema, + LinkInsertionContext, + ProjectPageContext, +) +from core.base_models import 
# --- content_generation/models.py (continued; reconstructed from newline-collapsed diff) ---
# NOTE(review): the mangled line above ends with `from core.base_models import`;
# that import is restated in full here for readability.
from core.base_models import BaseModel
from core.choices import (
    OGImageStyle,
)
from core.models import AutoSubmissionSetting, BlogPostTitleSuggestion, Project
from core.utils import (
    get_og_image_prompt,
    get_relevant_external_pages_for_blog_post,
)
from tuxseo.utils import get_tuxseo_logger

logger = get_tuxseo_logger(__name__)


class GeneratedBlogPost(BaseModel):
    """A fully generated blog post for a project, produced from a title suggestion.

    Holds the final output fields (title, description, slug, tags, content,
    icon/OG image) plus posting state. Per-section preparation data lives on
    the related ``GeneratedBlogPostSection`` model, not here.
    """

    # Both FKs are nullable, so every method below must tolerate None.
    project = models.ForeignKey(
        Project,
        null=True,
        blank=True,
        on_delete=models.CASCADE,
        related_name="generated_blog_posts",
    )
    title_suggestion = models.ForeignKey(
        BlogPostTitleSuggestion,
        null=True,
        blank=True,
        on_delete=models.CASCADE,
        related_name="generated_blog_posts",
    )

    # Final output items
    title = models.CharField(max_length=250)
    description = models.TextField(blank=True)
    slug = models.SlugField(max_length=250)
    tags = models.TextField()
    content = models.TextField()
    icon = models.ImageField(upload_to="generated_blog_post_icons/", blank=True)
    image = models.ImageField(upload_to="generated_blog_post_images/", blank=True)

    # Preparation data: see the GeneratedBlogPostSection model.

    # Posting state
    posted = models.BooleanField(default=False)
    date_posted = models.DateTimeField(null=True, blank=True)

    def __str__(self):
        # `project` is nullable — guard so admin/list views never crash on
        # an orphaned post.
        project_name = self.project.name if self.project else "(no project)"
        return f"{project_name}: {self.title}"

    @classmethod
    def blog_post_structure_rules(cls):
        """Markdown structure rules injected into content-generation prompts."""
        # Typos fixed vs. the original prompt text ("Do no include",
        # "do and intro", "Never inlcude") — a garbled prompt degrades output.
        return """
        - Use markdown.
        - Start with the title as h1 (#). Do not include any other metadata (description, slug, etc.)
        - Then do an intro, starting with `## Introduction`, then a paragraph of text.
        - Continue with h2 (##) topics as you see fit.
        - Do not go deeper than h2 (##) for post structure.
        - Never include placeholder items (insert image here, link suggestions, etc.)
        - Do not have `References` section, insert all the links into the post directly, organically.
        - Do not include a call to action paragraph at the end of the post.
        - Finish the post with a conclusion.
        - Instead of using links as a reference, try to insert them into the post directly, organically.
        """  # noqa: E501

    @property
    def generated_blog_post_schema(self):
        """Return the post's final output fields as a GeneratedBlogPostSchema."""
        return GeneratedBlogPostSchema(
            description=self.description,
            slug=self.slug,
            tags=self.tags,
            content=self.content,
        )

    def submit_blog_post_to_endpoint(self):
        """POST this post to the project's configured auto-submission endpoint.

        Placeholders in the configured header/body templates are replaced with
        values from this post before sending.

        Returns:
            bool: True when the endpoint accepted the request, False otherwise
            (missing settings, missing endpoint URL, or any request error).
        """
        from core.utils import replace_placeholders

        project = self.project
        submission_settings = (
            AutoSubmissionSetting.objects.filter(project=project).order_by("-id").first()
        )

        if not submission_settings or not submission_settings.endpoint_url:
            logger.warning(
                "No AutoSubmissionSetting or endpoint_url found for project", project_id=project.id
            )
            return False

        url = submission_settings.endpoint_url
        headers = replace_placeholders(submission_settings.header, self)
        body = replace_placeholders(submission_settings.body, self)

        logger.info(
            "[Submit Blog Post] Submitting blog post to endpoint",
            project_id=project.id,
            profile_id=project.profile.id,
            endpoint_url=url,
            headers_configured=bool(headers),
            body_configured=bool(body),
        )

        try:
            # Fresh session with cleared cookies so no ambient auth leaks into
            # the user-configured endpoint.
            session = requests.Session()
            session.cookies.clear()

            if headers is None:
                headers = {}

            # Default to JSON unless the user explicitly configured a content type.
            if "content-type" not in headers and "Content-Type" not in headers:
                headers["Content-Type"] = "application/json"

            response = session.post(url, json=body, headers=headers, timeout=15)
            response.raise_for_status()
            return True

        except requests.RequestException as e:
            logger.error(
                "[Submit Blog Post to Endpoint] Request error",
                error=str(e),
                url=url,
                headers=headers,
                exc_info=True,
            )
            return False

    def generate_og_image(self) -> tuple[bool, str]:
        """Generate an Open Graph image via Replicate's flux-schnell model.

        Builds the prompt from the project's OG image style and the title
        suggestion's category, runs the model, downloads the result, and saves
        it to ``self.image``. No-op if an image already exists.

        Returns:
            A tuple of (success: bool, message: str).
        """

        if not settings.REPLICATE_API_TOKEN:
            logger.error(
                "[GenerateOGImage] Replicate API token not configured",
                blog_post_id=self.id,
                project_id=self.project_id,
            )
            return False, "Replicate API token not configured"

        if self.image:
            logger.info(
                "[GenerateOGImage] Image already exists for blog post",
                blog_post_id=self.id,
                project_id=self.project_id,
            )
            return True, f"Image already exists for blog post {self.id}"

        try:
            # `title_suggestion` is a nullable FK — fall back to a generic
            # category rather than raising AttributeError.
            title_suggestion = self.title_suggestion
            blog_post_category = (
                title_suggestion.category
                if title_suggestion and title_suggestion.category
                else "technology"
            )

            project_og_style = self.project.og_image_style or OGImageStyle.MODERN_GRADIENT
            prompt = get_og_image_prompt(project_og_style, blog_post_category)

            logger.info(
                "[GenerateOGImage] Starting image generation",
                blog_post_id=self.id,
                project_id=self.project_id,
                category=blog_post_category,
                og_style=project_og_style,
                prompt=prompt,
            )

            replicate_client = replicate.Client(api_token=settings.REPLICATE_API_TOKEN)

            output = replicate_client.run(
                "black-forest-labs/flux-schnell",
                input={
                    "prompt": prompt,
                    "aspect_ratio": "16:9",
                    "output_format": "png",
                    "output_quality": 90,
                },
            )

            if not output:
                logger.error(
                    "[GenerateOGImage] No output from Replicate",
                    blog_post_id=self.id,
                    project_id=self.project_id,
                )
                return False, f"Failed to generate image for blog post {self.id}"

            file_output = output[0] if isinstance(output, list) else output
            image_url = str(file_output)

            logger.info(
                "[GenerateOGImage] Image generated successfully",
                blog_post_id=self.id,
                project_id=self.project_id,
                image_url=image_url,
            )

            # Bound the download and close the HTTP response (the original
            # urlopen call had no timeout and was never closed).
            with urlopen(image_url, timeout=60) as image_response:
                image_content = ContentFile(image_response.read())

            filename = f"og-image-{self.id}.png"
            self.image.save(filename, image_content, save=True)

            logger.info(
                "[GenerateOGImage] Image saved to blog post",
                blog_post_id=self.id,
                project_id=self.project_id,
                saved_url=self.image.url,
            )

            return True, f"Successfully generated and saved OG image for blog post {self.id}"

        except replicate.exceptions.ReplicateError as replicate_error:
            logger.error(
                "[GenerateOGImage] Replicate API error",
                error=str(replicate_error),
                exc_info=True,
                blog_post_id=self.id,
                project_id=self.project_id,
            )
            return False, f"Replicate API error: {str(replicate_error)}"
        except Exception as error:
            logger.error(
                "[GenerateOGImage] Unexpected error during image generation",
                error=str(error),
                exc_info=True,
                blog_post_id=self.id,
                project_id=self.project_id,
            )
            return False, f"Unexpected error: {str(error)}"

    def insert_links_into_post(self, max_pages=4, max_external_pages=3):
        """Insert links from project pages into the blog post content organically.

        Uses a PydanticAI agent to place links without otherwise modifying the
        content, then persists the result to ``self.content``.

        Args:
            max_pages: Maximum number of internal project pages to use for linking (default: 4)
            max_external_pages: Maximum number of external project pages to use for linking (default: 3)

        Returns:
            str: The blog post content with links inserted (unchanged content
            when there is no title suggestion or no candidate pages).
        """  # noqa: E501
        from core.utils import (
            get_relevant_pages_for_blog_post,
            run_agent_synchronously,
        )

        if not self.title_suggestion:
            logger.warning(
                "[InsertLinksIntoPost] No title suggestion found for blog post",
                blog_post_id=self.id,
                project_id=self.project_id,
            )
            return self.content

        # Internal pages: manually pinned (`always_use`) pages first, then
        # relevance-ranked ones.
        manually_selected_project_pages = list(self.project.project_pages.filter(always_use=True))
        relevant_project_pages = list(
            get_relevant_pages_for_blog_post(
                self.project,
                self.title_suggestion.suggested_meta_description,
                max_pages=max_pages,
            )
        )

        all_project_pages = manually_selected_project_pages + relevant_project_pages

        # External pages only when link exchange is enabled.
        # NOTE(review): `particiate_in_link_exchange` is a misspelled model
        # field name (sic) defined elsewhere — cannot be renamed here.
        external_project_pages = []
        if self.project.particiate_in_link_exchange:
            external_project_pages = list(
                get_relevant_external_pages_for_blog_post(
                    meta_description=self.title_suggestion.suggested_meta_description,
                    exclude_project=self.project,
                    max_pages=max_external_pages,
                )
            )
            # Only link to pages whose own project also opted into the exchange.
            external_project_pages = [
                page for page in external_project_pages if page.project.particiate_in_link_exchange
            ]

        all_pages_to_link = all_project_pages + external_project_pages

        if not all_pages_to_link:
            logger.info(
                "[InsertLinksIntoPost] No pages found for link insertion",
                blog_post_id=self.id,
                project_id=self.project_id,
            )
            return self.content

        project_page_contexts = [
            ProjectPageContext(
                url=page.url,
                title=page.title,
                description=page.description,
                summary=page.summary,
            )
            for page in all_pages_to_link
        ]

        # URL lists are only for logging below.
        urls_to_insert = [page.url for page in all_pages_to_link]
        internal_urls = [page.url for page in all_project_pages]
        external_urls = [page.url for page in external_project_pages]

        link_insertion_context = LinkInsertionContext(
            blog_post_content=self.content,
            project_pages=project_page_contexts,
        )

        insert_links_agent = create_insert_links_agent()

        prompt = "Insert the provided project page links into the blog post content organically. Do not modify the existing content, only add links where appropriate."  # noqa: E501

        logger.info(
            "[InsertLinksIntoPost] Running link insertion agent",
            blog_post_id=self.id,
            project_id=self.project_id,
            num_total_pages=len(project_page_contexts),
            num_internal_pages=len(all_project_pages),
            num_external_pages=len(external_project_pages),
            num_always_use_pages=len(manually_selected_project_pages),
            participate_in_link_exchange=self.project.particiate_in_link_exchange,
            urls_to_insert=urls_to_insert,
            internal_urls=internal_urls,
            external_urls=external_urls,
        )

        result = run_agent_synchronously(
            insert_links_agent,
            prompt,
            deps=link_insertion_context,
            function_name="insert_links_into_post",
            model_name="GeneratedBlogPost",
        )

        content_with_links = result.output

        self.content = content_with_links
        self.save(update_fields=["content"])

        logger.info(
            "[InsertLinksIntoPost] Links inserted successfully",
            blog_post_id=self.id,
            project_id=self.project_id,
        )

        return content_with_links


# --- content_generation/tests.py (new file, reconstructed) ---
# Create your tests here.

# --- content_generation/views.py (new file, reconstructed) ---
# Create your views here.
diff --git a/core/agents/blog_post_outline_agent.py b/core/agents/blog_post_outline_agent.py new file mode 100644 index 0000000..1f15e44 --- /dev/null +++ b/core/agents/blog_post_outline_agent.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from pydantic import BaseModel, Field +from pydantic_ai import Agent + +from core.agents.schemas import BlogPostGenerationContext +from core.agents.system_prompts import ( + add_language_specification, + add_project_details, + add_target_keywords, + add_title_details, + add_todays_date, +) +from core.choices import get_default_ai_model + + +class BlogPostOutlineSection(BaseModel): + title: str = Field(description="Section title (use plain text, no markdown prefixes)") + + +class BlogPostOutline(BaseModel): + sections: list[BlogPostOutlineSection] = Field( + description=( + "Ordered list of 4-8 section titles that will be used as H2 (##) headers in the blog post." # noqa: E501 + ) + ) + + +BLOG_POST_OUTLINE_SYSTEM_PROMPT = """ +You are an expert content strategist. + +Your task: propose only the middle-section outline for the blog post. + +Requirements: +- Generate 4-8 main topics that will be used as H2 (##) sections. +- Do NOT include markdown symbols in section titles (no leading #, ##, -, etc.). +- Keep titles short and descriptive. +- Do NOT include 'Introduction' or 'Conclusion' yet. + +Output must be a structured list of section titles only. 
+""" + + +def create_blog_post_outline_agent(model: str | None = None) -> Agent: + agent = Agent( + model or get_default_ai_model(), + output_type=BlogPostOutline, + deps_type=BlogPostGenerationContext, + system_prompt=BLOG_POST_OUTLINE_SYSTEM_PROMPT, + retries=2, + model_settings={"temperature": 0.7}, + ) + + agent.system_prompt(add_project_details) + agent.system_prompt(add_title_details) + agent.system_prompt(add_todays_date) + agent.system_prompt(add_language_specification) + agent.system_prompt(add_target_keywords) + + return agent + + +class BlogPostSectionResearchQuestions(BaseModel): + questions: list[str] = Field( + default_factory=list, + description="3-6 concrete research questions for a single section", + ) + + +BLOG_POST_SECTION_QUESTIONS_SYSTEM_PROMPT = """ +You are an expert content researcher. + +Given a blog post section title, generate 3-6 specific research questions to investigate. + +Requirements: +- Questions should be specific and searchable. +- Prefer questions that lead to concrete examples, comparisons, metrics, pitfalls, and best practices. +- Avoid vague or overly broad questions. 
+""" # noqa: E501 + + +def create_blog_post_section_research_questions_agent(model: str | None = None) -> Agent: + agent = Agent( + model or get_default_ai_model(), + output_type=BlogPostSectionResearchQuestions, + deps_type=BlogPostGenerationContext, + system_prompt=BLOG_POST_SECTION_QUESTIONS_SYSTEM_PROMPT, + retries=2, + model_settings={"temperature": 0.7}, + ) + + agent.system_prompt(add_project_details) + agent.system_prompt(add_title_details) + agent.system_prompt(add_todays_date) + agent.system_prompt(add_language_specification) + agent.system_prompt(add_target_keywords) + + return agent diff --git a/core/agents/generate_blog_post_intro_conclusion_agent.py b/core/agents/generate_blog_post_intro_conclusion_agent.py new file mode 100644 index 0000000..281454f --- /dev/null +++ b/core/agents/generate_blog_post_intro_conclusion_agent.py @@ -0,0 +1,100 @@ +from django.utils import timezone +from pydantic_ai import Agent + +from core.agents.schemas import ( + BlogPostIntroConclusionGenerationContext, + GeneratedBlogPostIntroConclusionSchema, +) +from core.choices import ContentType, get_default_ai_model +from core.prompts import GENERATE_CONTENT_SYSTEM_PROMPTS + +INTRO_CONCLUSION_SYSTEM_PROMPT = """ +You are an expert blog post writer. + +Your task: write BOTH the Introduction and the Conclusion for a blog post in a single response. + +Rules: +- Return two fields only: introduction and conclusion. +- Do NOT include markdown headings for either section. No leading '#', '##', or '###'. +- Use the existing section contents as the source of truth for what the post covers. +- The introduction should set up the promise and smoothly lead into the first middle section. +- The conclusion should summarize the key takeaways and close cleanly without adding new topics. +- Do not add placeholders. 
+""" + + +def create_generate_blog_post_intro_conclusion_agent( + content_type: ContentType = ContentType.SHARING, model=None +): + """ + Create an agent to generate a blog post Introduction + Conclusion in one call. + """ + agent = Agent( + model or get_default_ai_model(), + output_type=GeneratedBlogPostIntroConclusionSchema, + deps_type=BlogPostIntroConclusionGenerationContext, + system_prompt=( + INTRO_CONCLUSION_SYSTEM_PROMPT + + "\n\n" + + (GENERATE_CONTENT_SYSTEM_PROMPTS.get(content_type, "") or "") + ), + retries=2, + model_settings={"max_tokens": 6000, "temperature": 0.7}, + ) + + @agent.system_prompt + def add_intro_conclusion_context(ctx) -> str: + intro_conclusion_context: BlogPostIntroConclusionGenerationContext = ctx.deps + generation_context = intro_conclusion_context.blog_post_generation_context + project_details = generation_context.project_details + title_suggestion = generation_context.title_suggestion + target_keywords = title_suggestion.target_keywords or [] + + section_titles_text = ( + "\n".join( + [ + f"- {title}" + for title in (intro_conclusion_context.section_titles_in_order or []) + if title + ] + ) + or "- (none)" + ) + + sections_text = "" + for index, section in enumerate(intro_conclusion_context.sections_in_order or [], start=1): + sections_text += f"\nSection {index}: {section.title}\n{section.content}\n" + + if not sections_text.strip(): + sections_text = "\n(none)\n" + + return f""" +Today's date: {timezone.now().strftime("%Y-%m-%d")} + +Project details: +- Project name: {project_details.name} +- Project type: {project_details.type} +- Project summary: {project_details.summary} +- Blog theme: {project_details.blog_theme} +- Key features: {project_details.key_features} +- Target audience: {project_details.target_audience_summary} +- Pain points: {project_details.pain_points} +- Product usage: {project_details.product_usage} + +Blog post title suggestion: +- Title: {title_suggestion.title} +- Category: {title_suggestion.category} +- 
Description: {title_suggestion.description} +- Suggested meta description: {title_suggestion.suggested_meta_description} +- Target keywords: {", ".join(target_keywords) if target_keywords else "None"} + +Outline: +{section_titles_text} + +All existing section contents (use this as the truth of what the post covers): +{sections_text} + +Language: Write in {project_details.language}. +""" + + return agent diff --git a/core/agents/generate_blog_post_section_content_agent.py b/core/agents/generate_blog_post_section_content_agent.py new file mode 100644 index 0000000..7c03082 --- /dev/null +++ b/core/agents/generate_blog_post_section_content_agent.py @@ -0,0 +1,123 @@ +from django.utils import timezone +from pydantic_ai import Agent + +from core.agents.schemas import ( + BlogPostSectionContentGenerationContext, + GeneratedBlogPostSectionContentSchema, +) +from core.choices import ContentType, get_default_ai_model +from core.prompts import GENERATE_CONTENT_SYSTEM_PROMPTS + +SECTION_CONTENT_SYSTEM_PROMPT = """ +You are an expert blog post writer. + +Your task: write the content for ONE blog post section (the body of the section only). + +Rules: +- Do NOT write the Introduction or Conclusion. +- Do NOT include the section title as a markdown header. No leading '#', '##', or '###'. +- Avoid markdown headings entirely. Use paragraphs, bullet lists, and numbered lists only when useful. +- Use the provided "Previous sections" to maintain continuity and avoid repetition. +- Use the provided research link outputs as factual grounding. Do not invent sources or cite URLs. +- Keep the section coherent with the overall outline and the order position provided. +- Do not add placeholders. +""" + + +def create_generate_blog_post_section_content_agent( + content_type: ContentType = ContentType.SHARING, model=None +): + """ + Create an agent to generate the content for a single middle section of a blog post. 
+ """ + agent = Agent( + model or get_default_ai_model(), + output_type=GeneratedBlogPostSectionContentSchema, + deps_type=BlogPostSectionContentGenerationContext, + system_prompt=( + SECTION_CONTENT_SYSTEM_PROMPT + + "\n\n" + + (GENERATE_CONTENT_SYSTEM_PROMPTS.get(content_type, "") or "") + ), + retries=2, + model_settings={"max_tokens": 16000, "temperature": 0.7}, + ) + + @agent.system_prompt + def add_section_content_context(ctx) -> str: + section_context: BlogPostSectionContentGenerationContext = ctx.deps + generation_context = section_context.blog_post_generation_context + project_details = generation_context.project_details + title_suggestion = generation_context.title_suggestion + target_keywords = title_suggestion.target_keywords or [] + + other_titles = [title for title in (section_context.other_section_titles or []) if title] + other_titles_text = "\n".join([f"- {title}" for title in other_titles]) or "- (none)" + + previous_sections = section_context.previous_sections or [] + previous_sections_text = "" + for previous_section_index, previous_section in enumerate(previous_sections, start=1): + previous_sections_text += ( + f"\nPrevious section {previous_section_index}: {previous_section.title}\n" + f"{previous_section.content}\n" + ) + if not previous_sections_text.strip(): + previous_sections_text = "\n(none)\n" + + research_questions_text = "" + for question_index, question in enumerate( + section_context.research_questions or [], start=1 + ): + research_questions_text += ( + f"\nResearch question {question_index}: {question.question}\n" + ) + for link_index, link in enumerate(question.research_links or [], start=1): + research_questions_text += ( + f"\nAnswered research link {link_index}:\n" + f"- summary_for_question_research:\n{link.summary_for_question_research}\n" + f"- general_summary:\n{link.general_summary}\n" + f"- answer_to_question:\n{link.answer_to_question}\n" + ) + + if not research_questions_text.strip(): + research_questions_text = 
"\n(none)\n" + + return f""" +Today's date: {timezone.now().strftime("%Y-%m-%d")} + +Project details: +- Project name: {project_details.name} +- Project type: {project_details.type} +- Project summary: {project_details.summary} +- Blog theme: {project_details.blog_theme} +- Key features: {project_details.key_features} +- Target audience: {project_details.target_audience_summary} +- Pain points: {project_details.pain_points} +- Product usage: {project_details.product_usage} + +Blog post title suggestion: +- Title: {title_suggestion.title} +- Category: {title_suggestion.category} +- Description: {title_suggestion.description} +- Suggested meta description: {title_suggestion.suggested_meta_description} +- Target keywords: {", ".join(target_keywords) if target_keywords else "None"} + +Outline coherence: +- Other section titles: +{other_titles_text} + +Current section to write: +- Section title: {section_context.section_title} +- Section order in outline: {section_context.section_order} / {section_context.total_sections} +- Section order among middle sections: {section_context.research_section_order} / {section_context.total_research_sections} + +Previous sections (for continuity; do not repeat content): +{previous_sections_text} + +Research answers for this section (only include content that is supported by these answers): +{research_questions_text} + +Language: Write in {project_details.language}. 
+""" + + return agent diff --git a/core/agents/research_link_summary_agent.py b/core/agents/research_link_summary_agent.py new file mode 100644 index 0000000..75aedea --- /dev/null +++ b/core/agents/research_link_summary_agent.py @@ -0,0 +1,123 @@ +from django.utils import timezone +from pydantic_ai import Agent, RunContext + +from core.agents.schemas import ( + ResearchLinkAnalysis, + ResearchLinkContextualSummaryContext, + TextSummary, + WebPageContent, +) +from core.choices import get_default_ai_model + + +def _add_webpage_content_from_web_page_content(ctx: RunContext[WebPageContent]) -> str: + return ( + "Web page content:\n" + f"Title: {ctx.deps.title}\n" + f"Description: {ctx.deps.description}\n" + f"Content: {ctx.deps.markdown_content}\n" + ) + + +def _add_webpage_content_from_contextual_deps( + ctx: RunContext[ResearchLinkContextualSummaryContext], +) -> str: + web_page_content = ctx.deps.web_page_content + return ( + "Web page content:\n" + f"URL: {ctx.deps.url}\n" + f"Title: {web_page_content.title}\n" + f"Description: {web_page_content.description}\n" + f"Content: {web_page_content.markdown_content}\n" + ) + + +def _add_blog_post_research_context(ctx: RunContext[ResearchLinkContextualSummaryContext]) -> str: + blog_post_generation_context = ctx.deps.blog_post_generation_context + project_details = blog_post_generation_context.project_details + title_suggestion = blog_post_generation_context.title_suggestion + target_keywords = title_suggestion.target_keywords or [] + + return ( + "Context for why we are summarizing this page:\n" + f"- Today's date: {timezone.now().strftime('%Y-%m-%d')}\n" + f"- Project: {project_details.name}\n" + f"- Project summary: {project_details.summary}\n" + f"- Blog post title: {ctx.deps.blog_post_title}\n" + f"- Blog post section: {ctx.deps.section_title}\n" + f"- Research question: {ctx.deps.research_question}\n" + f"- Target keywords: {', '.join(target_keywords) if target_keywords else 'None'}\n" + "\n" + "You must tailor the 
summary to help the writer answer the research question for that section.\n" # noqa: E501 + ) + + +def create_general_research_link_summary_agent(model=None): + agent = Agent( + model or get_default_ai_model(), + output_type=TextSummary, + deps_type=WebPageContent, + system_prompt=( + "You are an expert content summarizer. Summarize the web page content provided.\n" + "Return a concise 2-3 sentence summary that captures the main purpose and key information.\n" # noqa: E501 + "Focus on what the page is about and its main value proposition.\n" + ), + retries=2, + model_settings={"temperature": 0.4}, + ) + agent.system_prompt(_add_webpage_content_from_web_page_content) + return agent + + +def create_contextual_research_link_summary_agent(model=None): + agent = Agent( + model or get_default_ai_model(), + output_type=TextSummary, + deps_type=ResearchLinkContextualSummaryContext, + system_prompt=( + "You are a research assistant helping write a blog post.\n" + "Summarize the page in a way that is maximally useful for answering the research question.\n" # noqa: E501 + "Prefer concrete facts, definitions, steps, examples, and any notable stats. 
If the page is not relevant, say so clearly.\n" # noqa: E501 + "Output markdown that includes:\n" + "- A short paragraph summary\n" + "- 'Key takeaways' as 3-7 bullet points\n" + "- 'How this helps our section' as 1-3 bullet points\n" + ), + retries=2, + model_settings={"temperature": 0.3}, + ) + agent.system_prompt(_add_blog_post_research_context) + agent.system_prompt(_add_webpage_content_from_contextual_deps) + return agent + + +def create_research_link_analysis_agent(model=None): + """ + Analyze a research link in a single model call and return: + - a general page summary + - a contextual summary tailored to the blog post section's research question + - a direct answer to the research question (if possible from the page) + """ + agent = Agent( + model or get_default_ai_model(), + output_type=ResearchLinkAnalysis, + deps_type=ResearchLinkContextualSummaryContext, + system_prompt=( + "You are a research assistant helping write a blog post.\n" + "Using only the web page content provided, produce three outputs:\n" + "1) general_summary: a context-free 2-3 sentence summary of what the page is about.\n" + "2) summary_for_question_research: a markdown summary tailored to the research question. " + "Include: a short paragraph summary, 'Key takeaways' (3-7 bullets), and " + "'How this helps our section' (1-3 bullets).\n" + "3) answer_to_question: directly answer the research question in 1-6 sentences. " + "If the page does not answer it (or is irrelevant), say so clearly.\n" + "\n" + "Be concrete and avoid speculation. 
Prefer facts, definitions, steps, examples, and stats " + "that are present in the page.\n" + ), + retries=2, + model_settings={"temperature": 0.3}, + ) + agent.system_prompt(_add_blog_post_research_context) + agent.system_prompt(_add_webpage_content_from_contextual_deps) + return agent diff --git a/core/agents/schemas.py b/core/agents/schemas.py index 6852bfc..872597d 100644 --- a/core/agents/schemas.py +++ b/core/agents/schemas.py @@ -14,6 +14,31 @@ class WebPageContent(BaseModel): markdown_content: str +class TextSummary(BaseModel): + summary: str = Field(description="A concise summary of the provided content") + + +class ResearchLinkAnalysis(BaseModel): + general_summary: str = Field( + description=( + "A general, context-free summary of the page content. Keep it to 2-3 sentences." + ) + ) + summary_for_question_research: str = Field( + description=( + "A markdown summary tailored to the blog post's research question. Include: " + "a short paragraph summary, 'Key takeaways' (3-7 bullets), and " + "'How this helps our section' (1-3 bullets)." + ) + ) + answer_to_question: str = Field( + description=( + "A direct answer to the research question, based strictly on the page content. " + "If the page does not answer the question, say so clearly." 
+ ) + ) + + class ProjectDetails(BaseModel): name: str = Field(description="Official name of the project or organization") type: str = Field( @@ -189,6 +214,92 @@ class BlogPostGenerationContext(BaseModel): content_type: str = Field(description="Type of content to generate (SEO or SHARING)") +class ResearchLinkContextualSummaryContext(BaseModel): + url: str = Field(description="Source URL of the research page") + web_page_content: WebPageContent + blog_post_generation_context: BlogPostGenerationContext + blog_post_title: str = Field(description="Title of the blog post being written") + section_title: str = Field(description="Title of the blog post section being written") + research_question: str = Field(description="Research question we are trying to answer") + + +class ResearchLinkAnswerSnippet(BaseModel): + summary_for_question_research: str = Field( + description="A markdown summary tailored to the research question" + ) + general_summary: str = Field(description="A general, context-free 2-3 sentence page summary") + answer_to_question: str = Field(description="A direct answer to the research question") + + +class ResearchQuestionWithAnsweredLinks(BaseModel): + question: str = Field(description="The research question we were answering") + research_links: list[ResearchLinkAnswerSnippet] = Field( + default_factory=list, + description="Only research links that include a non-empty answer_to_question", + ) + + +class PriorSectionContext(BaseModel): + title: str = Field(description="Section title") + content: str = Field(description="Section content (markdown)") + + +class BlogPostSectionContentGenerationContext(BaseModel): + blog_post_generation_context: BlogPostGenerationContext + blog_post_title: str = Field(description="Title of the blog post being written") + section_title: str = Field(description="Title of the section to write") + section_order: int = Field(description="Order of this section in the overall outline") + total_sections: int = 
Field(description="Total number of sections in the outline") + research_section_order: int = Field( + description="1-based order of this section among the middle (non-intro/non-conclusion) sections" + ) + total_research_sections: int = Field( + description="Total number of middle (non-intro/non-conclusion) sections" + ) + other_section_titles: list[str] = Field( + default_factory=list, + description="Titles of the other sections in the blog post (for coherence)", + ) + previous_sections: list[PriorSectionContext] = Field( + default_factory=list, + description="Previously generated section content (in order) to keep the narrative coherent", + ) + research_questions: list[ResearchQuestionWithAnsweredLinks] = Field( + default_factory=list, + description="Research questions for this section, with only answered research links included", + ) + + +class GeneratedBlogPostSectionContentSchema(BaseModel): + content: str = Field( + description=( + "Markdown content for the section body only (do not include the section title as a header)" + ) + ) + + +class BlogPostIntroConclusionGenerationContext(BaseModel): + blog_post_generation_context: BlogPostGenerationContext + blog_post_title: str = Field(description="Title of the blog post being written") + section_titles_in_order: list[str] = Field( + default_factory=list, + description="All section titles in outline order (including Introduction and Conclusion)", + ) + sections_in_order: list[PriorSectionContext] = Field( + default_factory=list, + description="All existing section contents in order (including middle sections) to base intro/conclusion on", + ) + + +class GeneratedBlogPostIntroConclusionSchema(BaseModel): + introduction: str = Field( + description="Markdown content for the Introduction section body only (no heading)" + ) + conclusion: str = Field( + description="Markdown content for the Conclusion section body only (no heading)" + ) + + class GeneratedBlogPostSchema(BaseModel): description: str = Field( 
description="Meta description (150-160 characters) optimized for search engines" diff --git a/core/content_generator/__init__.py b/core/content_generator/__init__.py new file mode 100644 index 0000000..976578a --- /dev/null +++ b/core/content_generator/__init__.py @@ -0,0 +1,7 @@ +""" +Content generation pipeline package. + +This package contains: +- `pipeline.py`: pipeline "steps" that orchestrate content generation + research. +- `utils.py`: small reusable helpers for the pipeline. +""" diff --git a/core/content_generator/pipeline.py b/core/content_generator/pipeline.py new file mode 100644 index 0000000..88226e4 --- /dev/null +++ b/core/content_generator/pipeline.py @@ -0,0 +1,1263 @@ +from __future__ import annotations + +from django.conf import settings +from django.core.cache import cache +from django.db import transaction +from django.utils import timezone +from django.utils.dateparse import parse_datetime +from django.utils.text import slugify +from django_q.tasks import async_task +from exa_py import Exa + +from core.agents.blog_post_outline_agent import ( + create_blog_post_outline_agent, + create_blog_post_section_research_questions_agent, +) +from core.agents.generate_blog_post_intro_conclusion_agent import ( + create_generate_blog_post_intro_conclusion_agent, +) +from core.agents.generate_blog_post_section_content_agent import ( + create_generate_blog_post_section_content_agent, +) +from core.agents.research_link_summary_agent import ( + create_research_link_analysis_agent, +) +from core.agents.schemas import ( + BlogPostGenerationContext, + BlogPostIntroConclusionGenerationContext, + BlogPostSectionContentGenerationContext, + GeneratedBlogPostIntroConclusionSchema, + GeneratedBlogPostSectionContentSchema, + PriorSectionContext, + ResearchLinkAnswerSnippet, + ResearchLinkContextualSummaryContext, + ResearchQuestionWithAnsweredLinks, + WebPageContent, +) +from core.choices import ContentType +from core.content_generator.utils import 
get_exa_date_range_iso_strings +from core.models import ( + GeneratedBlogPost, + GeneratedBlogPostResearchLink, + GeneratedBlogPostResearchQuestion, + GeneratedBlogPostSection, +) +from core.utils import get_markdown_content, run_agent_synchronously +from tuxseo.utils import get_tuxseo_logger + +logger = get_tuxseo_logger(__name__) + + +INTRODUCTION_SECTION_TITLE = "Introduction" +CONCLUSION_SECTION_TITLE = "Conclusion" +NON_RESEARCH_SECTION_TITLES = {INTRODUCTION_SECTION_TITLE, CONCLUSION_SECTION_TITLE} +MAX_RESEARCH_LINK_MARKDOWN_CHARS_FOR_SUMMARY = 25_000 +LOCAL_MAX_RESEARCH_QUESTIONS_PER_SECTION = 1 +SECTION_SYNTHESIS_RETRY_CACHE_TTL_SECONDS = 6 * 60 * 60 + + +def _create_blog_post_generation_context( + *, title_suggestion, content_type_to_use: str +) -> BlogPostGenerationContext: + keywords_to_use = title_suggestion.get_blog_post_keywords() + return BlogPostGenerationContext( + project_details=title_suggestion.project.project_details, + title_suggestion=title_suggestion.title_suggestion_schema, + project_keywords=keywords_to_use, + project_pages=[], + content_type=content_type_to_use, + ) + + +def generate_sections_to_create(*, title_suggestion, content_type: str | None = None) -> list[str]: + """ + Step 1: Generate the section titles we will create (one AI query). 
+ """ + if title_suggestion is None: + raise ValueError("title_suggestion is required") + + if not title_suggestion.project_id: + raise ValueError("title_suggestion must be associated to a project") + + content_type_to_use = content_type or title_suggestion.content_type or ContentType.SHARING + outline_context = _create_blog_post_generation_context( + title_suggestion=title_suggestion, + content_type_to_use=content_type_to_use, + ) + + outline_agent = create_blog_post_outline_agent() + outline_result = run_agent_synchronously( + outline_agent, + "Generate the blog post outline sections.", + deps=outline_context, + function_name="generate_sections_to_create", + model_name="GeneratedBlogPost", + ) + + outline_sections = ( + outline_result.output.sections if outline_result and outline_result.output else [] + ) + + middle_section_titles = [ + (section.title or "").strip() + for section in outline_sections + if (section.title or "").strip() + ] + + return [INTRODUCTION_SECTION_TITLE, *middle_section_titles, CONCLUSION_SECTION_TITLE] + + +def create_blog_post_and_sections( + *, title_suggestion, section_titles: list[str], content_type: str | None = None +): + """ + Step 1b: Persist the GeneratedBlogPost + GeneratedBlogPostSection rows. 
+ """ + content_type_to_use = content_type or title_suggestion.content_type or ContentType.SHARING + tags = ", ".join(title_suggestion.target_keywords) if title_suggestion.target_keywords else "" + + with transaction.atomic(): + blog_post = GeneratedBlogPost.objects.create( + project=title_suggestion.project, + title_suggestion=title_suggestion, + title=title_suggestion.title, + description=title_suggestion.suggested_meta_description, + slug=slugify(title_suggestion.title), + tags=tags, + content="", + ) + + for section_order, section_title in enumerate(section_titles): + GeneratedBlogPostSection.objects.create( + blog_post=blog_post, + title=(section_title or "")[:250], + content="", + order=section_order, + ) + + logger.info( + "[ContentGenerator] Blog post initialized", + blog_post_id=blog_post.id, + title_suggestion_id=title_suggestion.id, + project_id=title_suggestion.project_id, + num_sections_created=len(section_titles), + content_type=content_type_to_use, + ) + + return blog_post + + +def queue_research_question_generation_for_sections(*, blog_post_id: int) -> int: + """ + Step 2: Queue one task per (research) section to generate questions. 
+ """ + blog_post = ( + GeneratedBlogPost.objects.prefetch_related("blog_post_sections") + .filter(id=blog_post_id) + .first() + ) + if not blog_post: + raise ValueError(f"GeneratedBlogPost not found: {blog_post_id}") + + blog_post_sections = list(blog_post.blog_post_sections.all()) + research_sections = [ + section + for section in blog_post_sections + if (section.title or "").strip() not in NON_RESEARCH_SECTION_TITLES + ] + + for section in research_sections: + async_task( + "core.content_generator.tasks.generate_research_questions_for_section_task", + section.id, + group="Generate Research Questions", + ) + + logger.info( + "[ContentGenerator] Queued research question generation tasks", + blog_post_id=blog_post.id, + num_sections=len(blog_post_sections), + num_research_sections=len(research_sections), + ) + + return len(research_sections) + + +def init_blog_post_content_generation(title_suggestion, content_type: str | None = None): + """ + Pipeline entrypoint (currently stops after queuing tasks). + + Step 1: generate sections we will create + Step 2: queue tasks to generate questions for each section + Step 3: (handled by the tasks) queue tasks to fetch Exa links for each generated question + Step 4: next steps later + """ + section_titles = generate_sections_to_create( + title_suggestion=title_suggestion, content_type=content_type + ) + blog_post = create_blog_post_and_sections( + title_suggestion=title_suggestion, + section_titles=section_titles, + content_type=content_type, + ) + queue_research_question_generation_for_sections(blog_post_id=blog_post.id) + return blog_post + + +def populate_research_links_for_question_from_exa( + research_question_id: int, + num_results_per_question: int = 2, + months_back: int = 6, +): + """ + Step 3: Get links for one question from Exa (called via a task per question). 
+ """ + research_question = ( + GeneratedBlogPostResearchQuestion.objects.select_related("blog_post") + .filter(id=research_question_id) + .first() + ) + if not research_question: + raise ValueError(f"GeneratedBlogPostResearchQuestion not found: {research_question_id}") + + blog_post = research_question.blog_post + if not blog_post: + raise ValueError(f"GeneratedBlogPost missing on research question: {research_question_id}") + + research_question_text = (research_question.question or "").strip() + if not research_question_text: + return 0 + + start_date_iso_format, end_date_iso_format = get_exa_date_range_iso_strings( + months_back=months_back + ) + exa = Exa(api_key=settings.EXA_API_KEY) + + exa_response = exa.search( + research_question_text, + end_crawl_date=end_date_iso_format, + end_published_date=end_date_iso_format, + start_crawl_date=start_date_iso_format, + start_published_date=start_date_iso_format, + num_results=num_results_per_question, + type="auto", + ) + + exa_results = ( + exa_response.results + if hasattr(exa_response, "results") + else (exa_response or {}).get("results", []) + ) + exa_results = exa_results or [] + + num_links_upserted = 0 + num_scrape_tasks_queued = 0 + + for result in exa_results: + if hasattr(result, "url"): + url = getattr(result, "url", "") or "" + title = getattr(result, "title", "") or "" + author = getattr(result, "author", "") or "" + published_date_raw = getattr(result, "publishedDate", None) + else: + url = (result or {}).get("url", "") or "" + title = (result or {}).get("title", "") or "" + author = (result or {}).get("author", "") or "" + published_date_raw = (result or {}).get("publishedDate") or (result or {}).get( + "published_date" + ) + + url = url.strip() + if not url.startswith(("http://", "https://")): + continue + + if len(url) > 200: + continue + + published_date = parse_datetime(published_date_raw) if published_date_raw else None + if published_date and timezone.is_naive(published_date): + published_date = 
timezone.make_aware( + published_date, timezone=timezone.get_current_timezone() + ) + + research_link, _created = GeneratedBlogPostResearchLink.objects.update_or_create( + blog_post=blog_post, + research_question=research_question, + url=url, + defaults={ + "title": title[:500], + "author": author[:250], + "published_date": published_date, + }, + ) + + num_links_upserted += 1 + + should_queue_scrape_task = not (research_link.content or "").strip() + if should_queue_scrape_task: + async_task( + "core.content_generator.tasks.scrape_research_link_content_task", + research_link.id, + group="Scrape Research Links", + ) + num_scrape_tasks_queued += 1 + + logger.info( + "[ContentGenerator] Exa research link search completed (single question)", + blog_post_id=blog_post.id, + research_question_id=research_question.id, + num_links_upserted=num_links_upserted, + num_scrape_tasks_queued=num_scrape_tasks_queued, + num_results_per_question=num_results_per_question, + months_back=months_back, + ) + + # If Exa returned no links for this question, nothing will trigger scrape/analyze kicks. + # This "kick" is safe (it will only queue synthesis when the overall blog post is ready). + if num_links_upserted == 0: + maybe_queue_section_content_synthesis_for_blog_post(blog_post_id=blog_post.id) + + return num_links_upserted + + +def scrape_research_link_content(*, research_link_id: int) -> bool: + """ + Step 4a: For a single research link, fetch the page content using Jina Reader and store it. + + Returns: True if content is present after the operation, False otherwise. 
+ """ + research_link = ( + GeneratedBlogPostResearchLink.objects.select_related( + "blog_post", + "blog_post__title_suggestion", + "blog_post__project", + "research_question", + "research_question__section", + ) + .filter(id=research_link_id) + .first() + ) + if not research_link: + raise ValueError(f"GeneratedBlogPostResearchLink not found: {research_link_id}") + + url = (research_link.url or "").strip() + if not url.startswith(("http://", "https://")): + logger.info( + "[ContentGenerator] Skipping scrape/summarize for invalid research link url", + research_link_id=research_link.id, + url=url, + ) + return 0 + + blog_post = research_link.blog_post + research_question = research_link.research_question + if not blog_post or not research_question: + raise ValueError(f"Research link missing blog_post/research_question: {research_link_id}") + + should_fetch_page_content = not (research_link.content or "").strip() + if not should_fetch_page_content: + logger.info( + "[ContentGenerator] Research link already scraped; skipping", + research_link_id=research_link.id, + blog_post_id=blog_post.id, + ) + return True + + page_title = research_link.title + page_description = research_link.description + page_markdown_content = research_link.content + + scraped_title, scraped_description, scraped_content = get_markdown_content(url) + if not scraped_content.strip(): + logger.warning( + "[ContentGenerator] Jina Reader returned empty content for research link", + research_link_id=research_link.id, + blog_post_id=blog_post.id, + url=url, + ) + return False + + page_title = scraped_title or page_title + page_description = scraped_description or "" + page_markdown_content = scraped_content + + if not (page_markdown_content or "").strip(): + logger.warning( + "[ContentGenerator] Research link has empty content; cannot summarize", + research_link_id=research_link.id, + blog_post_id=blog_post.id, + url=url, + ) + return False + + update_fields: list[str] = [] + + 
research_link.date_scraped = timezone.now() + update_fields.append("date_scraped") + + research_link.title = (page_title or "")[:500] + update_fields.append("title") + + research_link.description = page_description or "" + update_fields.append("description") + + research_link.content = page_markdown_content or "" + update_fields.append("content") + + research_link.save(update_fields=list(dict.fromkeys(update_fields))) + + logger.info( + "[ContentGenerator] Research link scraped", + research_link_id=research_link.id, + blog_post_id=blog_post.id, + research_question_id=research_question.id, + updated_fields=update_fields, + url=url, + ) + + return True + + +def analyze_research_link_content(*, research_link_id: int) -> int: + """ + Step 4b: For a single research link (that already has content), generate: + - a general page summary + - a blog-post-contextual summary for the research question/section + - an answer to the research question (answer_to_question) + + Returns: number of fields updated on the research link. 
+ """ + research_link = ( + GeneratedBlogPostResearchLink.objects.select_related( + "blog_post", + "blog_post__title_suggestion", + "blog_post__project", + "research_question", + "research_question__section", + ) + .filter(id=research_link_id) + .first() + ) + if not research_link: + raise ValueError(f"GeneratedBlogPostResearchLink not found: {research_link_id}") + + blog_post = research_link.blog_post + research_question = research_link.research_question + if not blog_post or not research_question: + raise ValueError(f"Research link missing blog_post/research_question: {research_link_id}") + + url = (research_link.url or "").strip() + page_markdown_content = (research_link.content or "").strip() + if not page_markdown_content: + logger.info( + "[ContentGenerator] Research link has no content yet; skipping analysis", + research_link_id=research_link.id, + blog_post_id=blog_post.id, + url=url, + ) + research_link.date_analyzed = timezone.now() + research_link.save(update_fields=["date_analyzed"]) + maybe_queue_section_content_synthesis_for_blog_post(blog_post_id=blog_post.id) + return 0 + + should_run_general_summary = not (research_link.general_summary or "").strip() + should_run_contextual_summary = not (research_link.summary_for_question_research or "").strip() + should_run_answer_to_question = not (research_link.answer_to_question or "").strip() + if ( + not should_run_general_summary + and not should_run_contextual_summary + and not should_run_answer_to_question + ): + logger.info( + "[ContentGenerator] Research link already analyzed; skipping", + research_link_id=research_link.id, + blog_post_id=blog_post.id, + ) + maybe_queue_section_content_synthesis_for_blog_post(blog_post_id=blog_post.id) + return 0 + + webpage_content = WebPageContent( + title=(research_link.title or "").strip(), + description=(research_link.description or "").strip(), + markdown_content=page_markdown_content[:MAX_RESEARCH_LINK_MARKDOWN_CHARS_FOR_SUMMARY], + ) + + update_fields: list[str] 
= [] + + title_suggestion = blog_post.title_suggestion + if not title_suggestion: + raise ValueError(f"GeneratedBlogPost missing title_suggestion: {blog_post.id}") + + content_type_to_use = title_suggestion.content_type or ContentType.SHARING + blog_post_generation_context = _create_blog_post_generation_context( + title_suggestion=title_suggestion, + content_type_to_use=content_type_to_use, + ) + + section_title = (getattr(research_question.section, "title", "") or "").strip() + research_question_text = (research_question.question or "").strip() + + analysis_agent = create_research_link_analysis_agent() + analysis_deps = ResearchLinkContextualSummaryContext( + url=url, + web_page_content=webpage_content, + blog_post_generation_context=blog_post_generation_context, + blog_post_title=(blog_post.title or title_suggestion.title or "").strip(), + section_title=section_title, + research_question=research_question_text, + ) + analysis_result = run_agent_synchronously( + analysis_agent, + "Analyze this page for blog-post research.", + deps=analysis_deps, + function_name="analyze_research_link_content.research_link_analysis", + model_name="GeneratedBlogPostResearchLink", + ) + + if should_run_general_summary: + research_link.general_summary = (analysis_result.output.general_summary or "").strip() + update_fields.append("general_summary") + + if should_run_contextual_summary: + research_link.summary_for_question_research = ( + analysis_result.output.summary_for_question_research or "" + ).strip() + update_fields.append("summary_for_question_research") + + if should_run_answer_to_question: + research_link.answer_to_question = (analysis_result.output.answer_to_question or "").strip() + update_fields.append("answer_to_question") + + research_link.date_analyzed = timezone.now() + update_fields.append("date_analyzed") + + research_link.save(update_fields=list(dict.fromkeys(update_fields))) + + logger.info( + "[ContentGenerator] Research link analyzed", + 
research_link_id=research_link.id, + blog_post_id=blog_post.id, + research_question_id=research_question.id, + updated_fields=update_fields, + url=url, + ) + + maybe_queue_section_content_synthesis_for_blog_post(blog_post_id=blog_post.id) + + return len(set(update_fields)) + + +def generate_research_questions_for_section(*, section_id: int) -> list[int]: + """ + Step 2 (task): Generate research questions for a single section. + + Returns: list of created GeneratedBlogPostResearchQuestion IDs. + """ + section = ( + GeneratedBlogPostSection.objects.select_related( + "blog_post", + "blog_post__title_suggestion", + "blog_post__project", + ) + .filter(id=section_id) + .first() + ) + if not section: + raise ValueError(f"GeneratedBlogPostSection not found: {section_id}") + + section_title = (section.title or "").strip() + if section_title in NON_RESEARCH_SECTION_TITLES: + logger.info( + "[ContentGenerator] Skipping research question generation for non-research section", + section_id=section.id, + section_title=section_title, + blog_post_id=section.blog_post_id, + ) + return [] + + blog_post = section.blog_post + if not blog_post or not blog_post.title_suggestion_id: + raise ValueError(f"Section is missing blog_post/title_suggestion: {section_id}") + + title_suggestion = blog_post.title_suggestion + content_type_to_use = title_suggestion.content_type or ContentType.SHARING + outline_context = _create_blog_post_generation_context( + title_suggestion=title_suggestion, + content_type_to_use=content_type_to_use, + ) + + research_questions_agent = create_blog_post_section_research_questions_agent() + questions_result = run_agent_synchronously( + research_questions_agent, + f"Generate research questions for section: {section_title}", + deps=outline_context, + function_name="generate_research_questions_for_section", + model_name="GeneratedBlogPost", + ) + + questions = ( + questions_result.output.questions if questions_result and questions_result.output else [] + ) + + 
questions_to_create = [] + for question in questions: + research_question_text = (question or "").strip() + if not research_question_text: + continue + questions_to_create.append( + GeneratedBlogPostResearchQuestion( + blog_post=blog_post, + section=section, + question=research_question_text[:250], + ) + ) + + if settings.DEBUG: + questions_to_create = questions_to_create[:LOCAL_MAX_RESEARCH_QUESTIONS_PER_SECTION] + + created_questions = GeneratedBlogPostResearchQuestion.objects.bulk_create(questions_to_create) + created_question_ids = [ + created_question.id for created_question in created_questions if created_question.id + ] + + logger.info( + "[ContentGenerator] Research questions generated", + section_id=section.id, + blog_post_id=blog_post.id, + num_questions_created=len(created_question_ids), + ) + + # If no questions were created, nothing else will trigger Exa/scrape/analysis tasks. + # In that case, kick section synthesis so the pipeline can still proceed. + if not created_question_ids: + maybe_queue_section_content_synthesis_for_blog_post(blog_post_id=blog_post.id) + + return created_question_ids + + +def _build_research_questions_with_answered_links_for_section( + *, section: GeneratedBlogPostSection +) -> list[ResearchQuestionWithAnsweredLinks]: + research_questions_with_answered_links: list[ResearchQuestionWithAnsweredLinks] = [] + + section_questions = list(section.research_questions.all()) + for research_question in section_questions: + question_text = (research_question.question or "").strip() + if not question_text: + continue + + research_links = list(research_question.research_links.all()) + answered_links = [ + research_link + for research_link in research_links + if (research_link.answer_to_question or "").strip() + ] + + research_link_snippets = [] + if answered_links: + research_link_snippets = [ + ResearchLinkAnswerSnippet( + summary_for_question_research=( + (research_link.summary_for_question_research or "").strip() + ), + 
general_summary=(research_link.general_summary or "").strip(), + answer_to_question=(research_link.answer_to_question or "").strip(), + ) + for research_link in answered_links + ] + + research_questions_with_answered_links.append( + ResearchQuestionWithAnsweredLinks( + question=question_text, + research_links=research_link_snippets, + ) + ) + + return research_questions_with_answered_links + + +def _build_prior_section_contexts( + *, sections_in_order: list[GeneratedBlogPostSection], current_section_order: int +) -> list[PriorSectionContext]: + prior_sections: list[PriorSectionContext] = [] + for section in sections_in_order: + if section.order >= current_section_order: + continue + if (section.title or "").strip() in NON_RESEARCH_SECTION_TITLES: + continue + content = (section.content or "").strip() + if not content: + continue + prior_sections.append( + PriorSectionContext(title=(section.title or "").strip(), content=content) + ) + return prior_sections + + +def synthesize_section_contents_for_blog_post(*, blog_post_id: int) -> int: + """ + Step 5: Synthesize content for each middle section sequentially (excluding Introduction/Conclusion). 
+ + Context passed to the model: + - Project details + - Title suggestion details + - Current section info + - Research link results (only for links with non-empty answer_to_question) + - Other section titles + - Section order + previous section content for coherence + """ + blog_post = ( + GeneratedBlogPost.objects.select_related( + "title_suggestion", + "project", + ) + .prefetch_related( + "blog_post_sections__research_questions__research_links", + ) + .filter(id=blog_post_id) + .first() + ) + if not blog_post: + raise ValueError(f"GeneratedBlogPost not found: {blog_post_id}") + + title_suggestion = blog_post.title_suggestion + if not title_suggestion: + raise ValueError(f"GeneratedBlogPost missing title_suggestion: {blog_post_id}") + + content_type_to_use = title_suggestion.content_type or ContentType.SHARING + blog_post_generation_context = _create_blog_post_generation_context( + title_suggestion=title_suggestion, + content_type_to_use=content_type_to_use, + ) + + sections_in_order = sorted( + list(blog_post.blog_post_sections.all()), + key=lambda section: (section.order, section.id), + ) + + all_section_titles = [ + (section.title or "").strip() + for section in sections_in_order + if (section.title or "").strip() + ] + total_sections = len(sections_in_order) + + middle_sections_in_order = [ + section + for section in sections_in_order + if (section.title or "").strip() not in NON_RESEARCH_SECTION_TITLES + ] + total_research_sections = len(middle_sections_in_order) + + section_agent = create_generate_blog_post_section_content_agent( + content_type=content_type_to_use + ) + + num_sections_generated = 0 + for research_section_index, section in enumerate(middle_sections_in_order, start=1): + section_title = (section.title or "").strip() + if not section_title: + continue + + existing_content = (section.content or "").strip() + if existing_content: + continue + + research_questions = _build_research_questions_with_answered_links_for_section( + section=section + ) 
+ prior_sections = _build_prior_section_contexts( + sections_in_order=sections_in_order, + current_section_order=section.order, + ) + + section_context = BlogPostSectionContentGenerationContext( + blog_post_generation_context=blog_post_generation_context, + blog_post_title=(blog_post.title or title_suggestion.title or "").strip(), + section_title=section_title, + section_order=section.order, + total_sections=total_sections, + research_section_order=research_section_index, + total_research_sections=total_research_sections, + other_section_titles=all_section_titles, + previous_sections=prior_sections, + research_questions=research_questions, + ) + + prompt = f"Write the section body content for: {section_title}" + generation_result = run_agent_synchronously( + section_agent, + prompt, + deps=section_context, + function_name="synthesize_section_contents_for_blog_post.section_content", + model_name="GeneratedBlogPostSection", + ) + + generated_schema: GeneratedBlogPostSectionContentSchema | None = ( + generation_result.output if generation_result and generation_result.output else None + ) + generated_content = (generated_schema.content if generated_schema else "").strip() + if not generated_content: + logger.warning( + "[ContentGenerator] Section content generation returned empty content", + blog_post_id=blog_post.id, + section_id=section.id, + section_title=section_title, + ) + continue + + section.content = generated_content + section.save(update_fields=["content"]) + num_sections_generated += 1 + + logger.info( + "[ContentGenerator] Section content synthesized", + blog_post_id=blog_post.id, + section_id=section.id, + section_title=section_title, + section_order=section.order, + research_section_order=research_section_index, + total_research_sections=total_research_sections, + content_length=len(generated_content), + ) + + maybe_queue_intro_conclusion_generation_for_blog_post(blog_post_id=blog_post.id) + 
maybe_queue_section_content_synthesis_retry_for_blog_post(blog_post_id=blog_post.id) + return num_sections_generated + + +def _get_section_synthesis_retry_cache_key(*, blog_post_id: int) -> str: + return f"content_generator:section_synthesis_retry_count:{blog_post_id}" + + +def maybe_queue_section_content_synthesis_retry_for_blog_post(*, blog_post_id: int) -> bool: + """ + Retry mechanism for Step 5: + + If research is "done enough" (all links are in a terminal analyzed/attempted state), + but some middle sections still have empty content (e.g. a model returned empty output + or a task was missed), re-queue section synthesis a bounded number of times. + """ + blog_post = ( + GeneratedBlogPost.objects.prefetch_related("blog_post_sections") + .filter(id=blog_post_id) + .first() + ) + if not blog_post: + return False + + sections_in_order = _get_sections_in_order_for_blog_post(blog_post) + middle_sections = [ + section + for section in sections_in_order + if (section.title or "").strip() not in NON_RESEARCH_SECTION_TITLES + ] + has_any_middle_section_missing_content = any( + not (section.content or "").strip() for section in middle_sections + ) + if not has_any_middle_section_missing_content: + return False + + # Only retry when link processing is "complete" (including failures). + # If there are links still being processed, let the normal kicks handle it. 
+ links_queryset = GeneratedBlogPostResearchLink.objects.filter(blog_post_id=blog_post_id) + has_any_pending_link = links_queryset.filter(date_analyzed__isnull=True).exists() + if has_any_pending_link: + return False + + max_retries = 5 if settings.DEBUG else 2 + retry_cache_key = _get_section_synthesis_retry_cache_key(blog_post_id=blog_post_id) + retry_count = cache.get(retry_cache_key, 0) or 0 + if retry_count >= max_retries: + logger.warning( + "[ContentGenerator] Not retrying section synthesis; max retries reached", + blog_post_id=blog_post_id, + retry_count=retry_count, + max_retries=max_retries, + num_middle_sections=len(middle_sections), + num_links_total=links_queryset.count(), + ) + return False + + cache.set(retry_cache_key, retry_count + 1, timeout=SECTION_SYNTHESIS_RETRY_CACHE_TTL_SECONDS) + async_task( + "core.content_generator.tasks.synthesize_section_contents_for_blog_post_task", + blog_post_id, + group="Synthesize Section Content (Retry)", + ) + logger.info( + "[ContentGenerator] Queued section content synthesis retry task", + blog_post_id=blog_post_id, + retry_count=retry_count + 1, + max_retries=max_retries, + num_middle_sections=len(middle_sections), + num_links_total=links_queryset.count(), + ) + return True + + +def maybe_queue_section_content_synthesis_for_blog_post(*, blog_post_id: int) -> bool: + """ + Queue Step 5 once research work is in a terminal state for all required inputs. + + This is intentionally best-effort + idempotent: + - It may queue more than once, but the synthesis step skips sections that already have content. 
+ """ + blog_post = ( + GeneratedBlogPost.objects.prefetch_related( + "blog_post_sections__research_questions__research_links" + ) + .filter(id=blog_post_id) + .first() + ) + if not blog_post: + return False + + sections_in_order = _get_sections_in_order_for_blog_post(blog_post) + middle_sections_missing_content = [ + section + for section in sections_in_order + if (section.title or "").strip() not in NON_RESEARCH_SECTION_TITLES + and not (section.content or "").strip() + ] + + num_pending_links = 0 + num_scrape_tasks_queued = 0 + num_analyze_tasks_queued = 0 + + for section in middle_sections_missing_content: + section_questions = list(section.research_questions.all()) + for research_question in section_questions: + research_links = list(research_question.research_links.all()) + for research_link in research_links: + # Terminal state: we attempted analysis for this link (even if it failed). + if research_link.date_analyzed is not None: + continue + + num_pending_links += 1 + + link_content = (research_link.content or "").strip() + if not link_content: + # If we haven't scraped content yet (or it failed previously but wasn't marked), + # re-queue a scrape attempt. The scrape task will always queue analysis next. + async_task( + "core.content_generator.tasks.scrape_research_link_content_task", + research_link.id, + group="Scrape Research Links (Retry/Kick)", + ) + num_scrape_tasks_queued += 1 + continue + + # Content exists, but analysis hasn't run yet: queue AI augmentation. 
+ async_task( + "core.content_generator.tasks.analyze_research_link_content_task", + research_link.id, + group="Analyze Research Links (Retry/Kick)", + ) + num_analyze_tasks_queued += 1 + + if num_pending_links > 0: + logger.info( + "[ContentGenerator] Not queuing section synthesis; research links still pending", + blog_post_id=blog_post_id, + num_middle_sections_missing_content=len(middle_sections_missing_content), + num_pending_links=num_pending_links, + num_scrape_tasks_queued=num_scrape_tasks_queued, + num_analyze_tasks_queued=num_analyze_tasks_queued, + ) + return False + + async_task( + "core.content_generator.tasks.synthesize_section_contents_for_blog_post_task", + blog_post_id, + group="Synthesize Section Content", + ) + logger.info( + "[ContentGenerator] Queued section content synthesis task", + blog_post_id=blog_post_id, + num_middle_sections_missing_content=len(middle_sections_missing_content), + ) + return True + + +def _get_sections_in_order_for_blog_post( + blog_post: GeneratedBlogPost, +) -> list[GeneratedBlogPostSection]: + return sorted( + list(blog_post.blog_post_sections.all()), + key=lambda section: (section.order, section.id), + ) + + +def generate_intro_and_conclusion_for_blog_post(*, blog_post_id: int) -> int: + """ + Step 6: Generate Introduction + Conclusion in a single model call. + + Runs only when all middle sections have content. 
+ """ + blog_post = ( + GeneratedBlogPost.objects.select_related( + "title_suggestion", + "project", + ) + .prefetch_related("blog_post_sections") + .filter(id=blog_post_id) + .first() + ) + if not blog_post: + raise ValueError(f"GeneratedBlogPost not found: {blog_post_id}") + + title_suggestion = blog_post.title_suggestion + if not title_suggestion: + raise ValueError(f"GeneratedBlogPost missing title_suggestion: {blog_post_id}") + + content_type_to_use = title_suggestion.content_type or ContentType.SHARING + blog_post_generation_context = _create_blog_post_generation_context( + title_suggestion=title_suggestion, + content_type_to_use=content_type_to_use, + ) + + sections_in_order = _get_sections_in_order_for_blog_post(blog_post) + section_titles_in_order = [ + (section.title or "").strip() + for section in sections_in_order + if (section.title or "").strip() + ] + + intro_section = next( + ( + section + for section in sections_in_order + if (section.title or "").strip() == INTRODUCTION_SECTION_TITLE + ), + None, + ) + conclusion_section = next( + ( + section + for section in sections_in_order + if (section.title or "").strip() == CONCLUSION_SECTION_TITLE + ), + None, + ) + if not intro_section or not conclusion_section: + raise ValueError(f"Blog post is missing Introduction/Conclusion sections: {blog_post_id}") + + should_generate_intro = not (intro_section.content or "").strip() + should_generate_conclusion = not (conclusion_section.content or "").strip() + if not should_generate_intro and not should_generate_conclusion: + return 0 + + middle_sections = [ + section + for section in sections_in_order + if (section.title or "").strip() not in NON_RESEARCH_SECTION_TITLES + ] + has_any_middle_section_missing_content = any( + not (section.content or "").strip() for section in middle_sections + ) + if has_any_middle_section_missing_content: + logger.info( + "[ContentGenerator] Skipping intro/conclusion generation; middle sections not ready", + 
blog_post_id=blog_post.id, + num_middle_sections=len(middle_sections), + ) + return 0 + + existing_sections_context = [ + PriorSectionContext( + title=(section.title or "").strip(), + content=(section.content or "").strip(), + ) + for section in sections_in_order + if (section.title or "").strip() and (section.content or "").strip() + ] + + intro_conclusion_context = BlogPostIntroConclusionGenerationContext( + blog_post_generation_context=blog_post_generation_context, + blog_post_title=(blog_post.title or title_suggestion.title or "").strip(), + section_titles_in_order=section_titles_in_order, + sections_in_order=existing_sections_context, + ) + + agent = create_generate_blog_post_intro_conclusion_agent(content_type=content_type_to_use) + result = run_agent_synchronously( + agent, + "Write the Introduction and Conclusion for this blog post.", + deps=intro_conclusion_context, + function_name="generate_intro_and_conclusion_for_blog_post.intro_conclusion", + model_name="GeneratedBlogPostSection", + ) + + output: GeneratedBlogPostIntroConclusionSchema | None = ( + result.output if result and result.output else None + ) + if not output: + return 0 + + num_sections_updated = 0 + if should_generate_intro: + introduction_content = (output.introduction or "").strip() + if introduction_content: + intro_section.content = introduction_content + intro_section.save(update_fields=["content"]) + num_sections_updated += 1 + + if should_generate_conclusion: + conclusion_content = (output.conclusion or "").strip() + if conclusion_content: + conclusion_section.content = conclusion_content + conclusion_section.save(update_fields=["content"]) + num_sections_updated += 1 + + logger.info( + "[ContentGenerator] Intro/conclusion generated", + blog_post_id=blog_post.id, + intro_generated=bool((intro_section.content or "").strip()), + conclusion_generated=bool((conclusion_section.content or "").strip()), + num_sections_updated=num_sections_updated, + ) + + 
maybe_populate_generated_blog_post_content(blog_post_id=blog_post.id) + return num_sections_updated + + +def maybe_queue_intro_conclusion_generation_for_blog_post(*, blog_post_id: int) -> bool: + """ + Queue Step 6 only when all middle sections have content. + + Best-effort + idempotent: if it queues multiple times, the generation step skips when already present. + """ + blog_post = ( + GeneratedBlogPost.objects.prefetch_related("blog_post_sections") + .filter(id=blog_post_id) + .first() + ) + if not blog_post: + return False + + sections_in_order = _get_sections_in_order_for_blog_post(blog_post) + middle_sections = [ + section + for section in sections_in_order + if (section.title or "").strip() not in NON_RESEARCH_SECTION_TITLES + ] + + if any(not (section.content or "").strip() for section in middle_sections): + return False + + async_task( + "core.content_generator.tasks.generate_intro_and_conclusion_for_blog_post_task", + blog_post_id, + group="Generate Intro and Conclusion", + ) + logger.info( + "[ContentGenerator] Queued intro/conclusion generation task", + blog_post_id=blog_post_id, + num_middle_sections=len(middle_sections), + ) + return True + + +def _build_full_blog_post_markdown(*, blog_post: GeneratedBlogPost) -> str: + blog_post_title = (blog_post.title or "").strip() + if not blog_post_title: + return "" + + sections_in_order = _get_sections_in_order_for_blog_post(blog_post) + markdown_chunks = [f"# {blog_post_title}", ""] + + for section in sections_in_order: + section_title = (section.title or "").strip() + section_content = (section.content or "").strip() + if not section_title or not section_content: + continue + + markdown_chunks.append(f"## {section_title}") + markdown_chunks.append("") + markdown_chunks.append(section_content) + markdown_chunks.append("") + + full_markdown = "\n".join(markdown_chunks).strip() + "\n" + return full_markdown + + +def populate_generated_blog_post_content(*, blog_post_id: int) -> bool: + """ + Step 7: Populate 
def populate_generated_blog_post_content(*, blog_post_id: int) -> bool:
    """
    Step 7: populate GeneratedBlogPost.content from the generated section contents.

    Writes only when every section (including Introduction and Conclusion) has
    non-empty content and blog_post.content is still empty. Returns True when
    content was written.

    Raises:
        ValueError: when no GeneratedBlogPost exists for blog_post_id.
    """
    blog_post = (
        GeneratedBlogPost.objects.prefetch_related("blog_post_sections")
        .filter(id=blog_post_id)
        .first()
    )
    if blog_post is None:
        raise ValueError(f"GeneratedBlogPost not found: {blog_post_id}")

    if (blog_post.content or "").strip():
        # Already populated; keep this step idempotent.
        return False

    ordered_sections = _get_sections_in_order_for_blog_post(blog_post)
    if any(not (section.content or "").strip() for section in ordered_sections):
        logger.info(
            "[ContentGenerator] Skipping blog_post.content population; not all sections have content",
            blog_post_id=blog_post.id,
            num_sections=len(ordered_sections),
        )
        return False

    full_markdown = _build_full_blog_post_markdown(blog_post=blog_post)
    if not full_markdown.strip():
        logger.warning(
            "[ContentGenerator] Skipping blog_post.content population; built markdown is empty",
            blog_post_id=blog_post.id,
        )
        return False

    blog_post.content = full_markdown
    blog_post.save(update_fields=["content"])

    logger.info(
        "[ContentGenerator] Populated GeneratedBlogPost.content from sections",
        blog_post_id=blog_post.id,
        content_length=len(full_markdown),
    )
    return True


def maybe_populate_generated_blog_post_content(*, blog_post_id: int) -> bool:
    """
    Queue Step 7 when the whole pipeline looks finished.

    Best-effort and idempotent: the population step itself skips posts whose
    content is already non-empty.
    """
    blog_post = (
        GeneratedBlogPost.objects.prefetch_related("blog_post_sections")
        .filter(id=blog_post_id)
        .first()
    )
    if blog_post is None:
        return False
    if (blog_post.content or "").strip():
        return False

    ordered_sections = _get_sections_in_order_for_blog_post(blog_post)
    if any(not (section.content or "").strip() for section in ordered_sections):
        return False

    async_task(
        "core.content_generator.tasks.populate_generated_blog_post_content_task",
        blog_post_id,
        group="Finalize Generated Blog Post Content",
    )
    logger.info(
        "[ContentGenerator] Queued blog_post.content population task",
        blog_post_id=blog_post_id,
        num_sections=len(ordered_sections),
    )
    return True
+ """ + num_results_per_question_to_use = ( + LOCAL_NUM_EXA_RESULTS_PER_QUESTION if settings.DEBUG else num_results_per_question + ) + num_links = populate_research_links_for_question_from_exa( + research_question_id=research_question_id, + num_results_per_question=num_results_per_question_to_use, + months_back=months_back, + ) + logger.info( + "[ContentGenerator Tasks] Populated Exa research links for question", + research_question_id=research_question_id, + num_links_upserted=num_links, + num_results_per_question=num_results_per_question_to_use, + months_back=months_back, + ) + return f"Populated {num_links} research links for research question {research_question_id}" + + +def scrape_research_link_content_task(research_link_id: int): + """ + Fetch research link content using Jina Reader. + Always queue the analysis task after the scrape attempt. + + Rationale: + - Jina can return empty content for some URLs (parsing failures). + - We still want the pipeline to progress and eventually synthesize sections using + whatever research succeeded, rather than stalling forever on a few bad links. 
+ """ + did_fetch_content = scrape_research_link_content(research_link_id=research_link_id) + logger.info( + "[ContentGenerator Tasks] Scraped research link", + research_link_id=research_link_id, + did_fetch_content=did_fetch_content, + ) + async_task( + "core.content_generator.tasks.analyze_research_link_content_task", + research_link_id, + group="Analyze Research Links", + ) + return f"Scraped research link {research_link_id} (did_fetch_content={did_fetch_content})" + + +def analyze_research_link_content_task(research_link_id: int): + """ + Analyze a research link that has already been scraped: + - generate general summary + - generate blog-post contextual summary for the research question/section + - generate an answer to the research question + """ + num_fields_updated = analyze_research_link_content(research_link_id=research_link_id) + logger.info( + "[ContentGenerator Tasks] Analyzed research link", + research_link_id=research_link_id, + num_fields_updated=num_fields_updated, + ) + return f"Analyzed research link {research_link_id} (updated_fields={num_fields_updated})" + + +def synthesize_section_contents_for_blog_post_task(blog_post_id: int): + """ + Synthesize the content for each middle section sequentially (excluding Introduction/Conclusion). + """ + num_sections_generated = synthesize_section_contents_for_blog_post(blog_post_id=blog_post_id) + logger.info( + "[ContentGenerator Tasks] Synthesized section contents for blog post", + blog_post_id=blog_post_id, + num_sections_generated=num_sections_generated, + ) + return f"Synthesized {num_sections_generated} section(s) for blog post {blog_post_id}" + + +def generate_intro_and_conclusion_for_blog_post_task(blog_post_id: int): + """ + Generate Introduction + Conclusion in one AI call. + Only runs once all middle sections have content. 
+ """ + num_sections_updated = generate_intro_and_conclusion_for_blog_post(blog_post_id=blog_post_id) + logger.info( + "[ContentGenerator Tasks] Generated intro and conclusion for blog post", + blog_post_id=blog_post_id, + num_sections_updated=num_sections_updated, + ) + return ( + f"Generated intro/conclusion (updated={num_sections_updated}) for blog post {blog_post_id}" + ) + + +def populate_generated_blog_post_content_task(blog_post_id: int): + """ + Populate GeneratedBlogPost.content from the generated sections. + """ + did_populate = populate_generated_blog_post_content(blog_post_id=blog_post_id) + logger.info( + "[ContentGenerator Tasks] Populated GeneratedBlogPost.content", + blog_post_id=blog_post_id, + did_populate=did_populate, + ) + return f"Populated GeneratedBlogPost.content for blog post {blog_post_id} (did_populate={did_populate})" + + +def generate_research_questions_for_section_task(section_id: int): + """ + Generate research questions for one section, then queue Exa research link tasks for each + created question. 
+ """ + created_research_question_ids = generate_research_questions_for_section(section_id=section_id) + + for research_question_id in created_research_question_ids: + async_task( + "core.content_generator.tasks.populate_research_links_for_question_from_exa_task", + research_question_id, + group="Populate Research Links", + ) + + logger.info( + "[ContentGenerator Tasks] Generated research questions for section", + section_id=section_id, + num_questions_created=len(created_research_question_ids), + ) + return f"Generated {len(created_research_question_ids)} research questions for section {section_id}" # noqa: E501 diff --git a/core/content_generator/utils.py b/core/content_generator/utils.py new file mode 100644 index 0000000..c637adb --- /dev/null +++ b/core/content_generator/utils.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +from datetime import timedelta + +from django.utils import timezone + + +def get_exa_date_range_iso_strings(*, months_back: int) -> tuple[str, str]: + """ + Exa expects date filters as strings (YYYY-MM-DD). 
+ """ + current_datetime = timezone.now() + end_date_iso_format = current_datetime.date().isoformat() + start_date_iso_format = (current_datetime - timedelta(days=months_back * 30)).date().isoformat() + return start_date_iso_format, end_date_iso_format diff --git a/core/models.py b/core/models.py index c23e23f..0005af3 100644 --- a/core/models.py +++ b/core/models.py @@ -10,9 +10,7 @@ from django.db import models, transaction from django.urls import reverse from django.utils import timezone -from django.utils.text import slugify from django_q.tasks import async_task -from gpt_researcher import GPTResearcher from pgvector.django import HnswIndex, VectorField from core.agents import ( @@ -60,9 +58,7 @@ get_og_image_prompt, get_relevant_external_pages_for_blog_post, get_relevant_pages_for_blog_post, - process_generated_blog_content, run_agent_synchronously, - run_gptr_synchronously, ) from tuxseo.utils import get_tuxseo_logger @@ -805,85 +801,19 @@ def get_blog_post_keywords(self): return keywords_to_use def generate_content(self, content_type=ContentType.SHARING): - # query defines the research question researcher will analyze - # custom_prompt controls how the research findings are presented - - # Suggestion Instructions - query = "Write a post from the following suggestion:\n" - query += f"{self.title_suggestion_string_for_ai}\n\n" - - # Get keywords to use in the blog post - project_keywords = list( - self.project.project_keywords.filter(use=True).select_related("keyword") - ) - project_keyword_texts = [keyword.keyword.keyword_text for keyword in project_keywords] - post_suggestion_keywords = self.target_keywords or [] - keywords_to_use = list(set(project_keyword_texts + post_suggestion_keywords)) - newline_separator = "\n" - keywords_list = newline_separator.join([f"- {keyword}" for keyword in keywords_to_use]) - query += "The following keywords should be used (organically) in the blog post:\n" - query += keywords_list - query += "\n\n" - - query += "Quick 
# NOTE(review): method of BlogPostTitleSuggestion (class defined earlier in core/models.py).
def generate_content(self, content_type=ContentType.SHARING):
    """
    Backward-compatible wrapper around the content generation pipeline.

    Historically this method created the blog post content directly (via
    GPTResearcher); it is kept as a thin wrapper so existing call sites keep
    working while the pipeline evolves.
    """
    # Local import — presumably to avoid a circular import between core.models
    # and the pipeline module, which imports these models. TODO confirm.
    from core.content_generator.pipeline import init_blog_post_content_generation

    return init_blog_post_content_generation(
        title_suggestion=self,
        content_type=content_type,
    )


class GeneratedBlogPostSection(BaseModel):
    """One ordered section (Introduction, middle sections, Conclusion) of a generated post."""

    blog_post = models.ForeignKey(
        GeneratedBlogPost,
        null=True,
        blank=True,
        on_delete=models.CASCADE,
        related_name="blog_post_sections",
    )
    # Section heading as rendered in the final markdown.
    title = models.CharField(max_length=250)
    # Empty until the synthesis / intro-conclusion steps fill it in.
    content = models.TextField(blank=True, default="")
    # Position within the post; the pipeline sorts sections by (order, id).
    order = models.IntegerField(default=0)


class GeneratedBlogPostResearchQuestion(BaseModel):
    """A research question tied to a post and (optionally) one of its sections."""

    blog_post = models.ForeignKey(
        GeneratedBlogPost,
        null=True,
        blank=True,
        on_delete=models.CASCADE,
        related_name="research_questions",
    )
    # Null section => a blog-level question not attached to a specific section.
    section = models.ForeignKey(
        GeneratedBlogPostSection,
        null=True,
        blank=True,
        on_delete=models.CASCADE,
        related_name="research_questions",
    )
    question = models.CharField(max_length=250)


class GeneratedBlogPostResearchLink(BaseModel):
    """An external source found for a research question, enriched in stages."""

    blog_post = models.ForeignKey(
        GeneratedBlogPost,
        null=True,
        blank=True,
        on_delete=models.CASCADE,
        related_name="research_links",
    )
    research_question = models.ForeignKey(
        GeneratedBlogPostResearchQuestion,
        null=True,
        blank=True,
        on_delete=models.CASCADE,
        related_name="research_links",
    )

    # Initial data (from the Exa search result).
    url = models.URLField(max_length=200)
    title = models.CharField(max_length=500, blank=True, default="")
    author = models.CharField(max_length=250, blank=True, default="")
    published_date = models.DateTimeField(null=True, blank=True)

    # Jina Reader augmentation.
    # NOTE(review): auto_now_add records row *creation* time (at Exa upsert),
    # not when scraping actually completed — confirm whether this should be a
    # nullable field set by the scrape task instead.
    date_scraped = models.DateTimeField(auto_now_add=True)
    content = models.TextField(blank=True, default="")
    description = models.TextField(blank=True, default="")

    # AI augmentation. A non-null date_analyzed marks the link as terminal
    # (analysis attempted), which the pipeline uses as its "done" signal.
    date_analyzed = models.DateTimeField(null=True, blank=True)
    summary_for_question_research = models.TextField(blank=True, default="")
    general_summary = models.TextField(blank=True)
    general_summary_embedding = VectorField(dimensions=1024, default=None, null=True, blank=True)
    answer_to_question = models.TextField(blank=True, default="")


class Backlink(BaseModel):
    """Records a link from a generated blog post to a project (page)."""

    # NOTE(review): field naming looks off — "linkning_..." is a typo and the
    # *_project_page fields point at Project, not ProjectPage. Renaming now
    # would require a migration, so the names are kept; consider cleaning them
    # up before these tables ship.
    linked_to_project_page = models.ForeignKey(
        Project, null=True, blank=True, on_delete=models.CASCADE, related_name="backlinks_to"
    )
    linkning_to_project_page = models.ForeignKey(
        ProjectPage, null=True, blank=True, on_delete=models.CASCADE, related_name="backlinks"
    )

    linked_from_project_page = models.ForeignKey(
        Project, null=True, blank=True, on_delete=models.CASCADE, related_name="backlinks_from"
    )
    linking_from_blog_post = models.ForeignKey(
        GeneratedBlogPost, null=True, blank=True, on_delete=models.CASCADE, related_name="backlinks"
    )

    def __str__(self):
        # Fix: every FK here is nullable, so guard against None before
        # dereferencing — the original raised AttributeError in admin/shell
        # for partially-populated rows.
        blog_post_title = (
            self.linking_from_blog_post.title if self.linking_from_blog_post else "?"
        )
        target_url = self.linked_to_project_page.url if self.linked_to_project_page else "?"
        return f"{blog_post_title} -> {target_url}"
class BlogPostResearchProcessView(LoginRequiredMixin, DetailView):
    """
    Read-only page showing the research pipeline state for one title suggestion:
    its generated posts, their sections, research questions, and scraped/analyzed
    links, plus (optionally computed) internal/external link candidates.
    """

    model = BlogPostTitleSuggestion
    template_name = "blog/blog_post_research_process.html"
    context_object_name = "title_suggestion"

    def get_queryset(self):
        # Scope to the signed-in user's own project so suggestion IDs from other
        # accounts cannot be viewed.
        return BlogPostTitleSuggestion.objects.filter(
            project__profile=self.request.user.profile,
            project__pk=self.kwargs["project_pk"],
        )

    def _get_generated_blog_posts(self, title_suggestion: BlogPostTitleSuggestion):
        """Newest-first generated posts with all research relations prefetched."""
        return (
            title_suggestion.generated_blog_posts.select_related("project", "title_suggestion")
            .prefetch_related(
                "blog_post_sections__research_questions__research_links",
                "research_questions__research_links",
            )
            .order_by("-id")
        )

    @staticmethod
    def _sorted_research_links(research_question):
        # Stable link ordering shared by section-level and blog-level questions
        # (this logic was previously duplicated three times).
        return sorted(
            research_question.research_links.all(),
            key=lambda research_link: research_link.id,
        )

    def _build_question_data(self, research_question):
        # One serialized research question with its ordered links.
        return {
            "id": research_question.id,
            "question": research_question.question,
            "links": self._sorted_research_links(research_question),
        }

    def _build_generated_blog_posts_data(self, generated_blog_posts):
        """Serialize posts (with sections + questions + links) for the template."""
        generated_blog_posts_data = []

        for generated_blog_post in generated_blog_posts:
            sections = sorted(
                generated_blog_post.blog_post_sections.all(),
                key=lambda section: (section.order, section.id),
            )
            # Questions with no section FK are treated as blog-level questions.
            blog_level_questions = sorted(
                (
                    question
                    for question in generated_blog_post.research_questions.all()
                    if not question.section_id
                ),
                key=lambda question: question.id,
            )

            sections_data = [
                {
                    "id": section.id,
                    "order": section.order,
                    "title": section.title,
                    "content": section.content or "",
                    "questions": [
                        self._build_question_data(question)
                        for question in sorted(
                            section.research_questions.all(),
                            key=lambda question: question.id,
                        )
                    ],
                }
                for section in sections
            ]

            generated_blog_posts_data.append(
                {
                    "id": generated_blog_post.id,
                    "project_id": generated_blog_post.project_id,
                    "title_suggestion_id": generated_blog_post.title_suggestion_id,
                    "title": generated_blog_post.title,
                    "description": generated_blog_post.description,
                    "slug": generated_blog_post.slug,
                    "tags": generated_blog_post.tags,
                    "posted": generated_blog_post.posted,
                    "date_posted": generated_blog_post.date_posted,
                    "content_length": len(generated_blog_post.content or ""),
                    "sections": sections_data,
                    "blog_level_questions": [
                        self._build_question_data(question)
                        for question in blog_level_questions
                    ],
                }
            )

        return generated_blog_posts_data

    def _get_internal_links(
        self, title_suggestion: BlogPostTitleSuggestion, should_compute_links: bool
    ):
        """Internal link candidates; computation is opt-in and needs a Jina key."""
        manually_selected_project_pages = list(
            title_suggestion.project.project_pages.filter(always_use=True)
        )
        if not should_compute_links:
            return manually_selected_project_pages
        if not settings.JINA_READER_API_KEY:
            return manually_selected_project_pages

        return title_suggestion.get_internal_links(max_pages=2)

    def _get_external_links(
        self, title_suggestion: BlogPostTitleSuggestion, should_compute_links: bool
    ):
        """External link candidates; computation is opt-in and needs a Jina key."""
        if not should_compute_links:
            return []
        if not settings.JINA_READER_API_KEY:
            return []

        meta_description = title_suggestion.suggested_meta_description or ""
        external_pages = get_relevant_external_pages_for_blog_post(
            meta_description=meta_description,
            exclude_project=title_suggestion.project,
            max_pages=3,
        )
        return list(external_pages)

    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        title_suggestion = self.object
        project = title_suggestion.project
        profile = self.request.user.profile

        # Link computation can be slow/expensive, so it is query-param gated.
        should_compute_links = self.request.GET.get("compute_links") == "true"

        project_keywords = project.get_keywords()
        title_suggestion.keywords_with_usage = []
        if title_suggestion.target_keywords:
            for keyword_text in title_suggestion.target_keywords:
                keyword_info = project_keywords.get(
                    keyword_text.lower(),
                    {"keyword": None, "in_use": False, "project_keyword_id": None},
                )
                title_suggestion.keywords_with_usage.append(
                    {
                        "text": keyword_text,
                        "keyword": keyword_info["keyword"],
                        "in_use": keyword_info["in_use"],
                        "project_keyword_id": keyword_info["project_keyword_id"],
                    }
                )

        generated_blog_posts = self._get_generated_blog_posts(title_suggestion)
        generated_blog_posts_data = self._build_generated_blog_posts_data(generated_blog_posts)

        # Each helper below is best-effort: a failure degrades to an empty list
        # with a warning instead of breaking the whole page.
        try:
            keywords_to_use = title_suggestion.get_blog_post_keywords()
        except (AttributeError, TypeError):
            logger.warning(
                "[BlogPostResearchProcessView] Failed to compute keywords_to_use",
                title_suggestion_id=title_suggestion.id,
                project_id=project.id,
                exc_info=True,
            )
            keywords_to_use = []

        try:
            internal_links = self._get_internal_links(title_suggestion, should_compute_links)
        except (AttributeError, TypeError, ValueError):
            logger.warning(
                "[BlogPostResearchProcessView] Failed to compute internal_links",
                title_suggestion_id=title_suggestion.id,
                project_id=project.id,
                should_compute_links=should_compute_links,
                exc_info=True,
            )
            internal_links = []

        try:
            external_links = self._get_external_links(title_suggestion, should_compute_links)
        except (AttributeError, TypeError, ValueError):
            logger.warning(
                "[BlogPostResearchProcessView] Failed to compute external_links",
                title_suggestion_id=title_suggestion.id,
                project_id=project.id,
                should_compute_links=should_compute_links,
                exc_info=True,
            )
            external_links = []

        context["project"] = project
        context["has_pro_subscription"] = profile.is_on_pro_plan
        context["jina_api_key_configured"] = bool(settings.JINA_READER_API_KEY)
        context["should_compute_links"] = should_compute_links
        context["keywords_to_use"] = keywords_to_use
        context["internal_links"] = internal_links or []
        context["external_links"] = external_links or []
        context["generated_blog_posts"] = generated_blog_posts_data

        return context
+
+
+ + +

{{ title_suggestion.title }}

+ {% if title_suggestion.description %} +

{{ title_suggestion.description }}

+ {% endif %} +
+ +
+ {% if jina_api_key_configured %} + {% if should_compute_links %} + + Hide computed links + + {% else %} + + Compute links + + {% endif %} + {% else %} + + Link computation requires Jina API key + + {% endif %} +
+
+ +
+
+

Title Suggestion

+ +
+
+
Suggestion ID
+
{{ title_suggestion.id }}
+
+
+
Content type
+
{{ title_suggestion.content_type }}
+
+
+
Category
+
{{ title_suggestion.category }}
+
+
+
Created
+
{{ title_suggestion.created_at|date:"F j, Y g:i A" }}
+
+
+ + {% if title_suggestion.target_keywords %} +
+
Target keywords
+
+ {% for keyword_data in title_suggestion.keywords_with_usage %} + {% include "components/keyword_chip.html" with keyword=keyword_data.text project_id=project.id keyword_in_use=keyword_data.in_use %} + {% endfor %} +
+
+ {% endif %} + + {% if title_suggestion.suggested_meta_description %} +
+
Suggested meta description
+
+ {{ title_suggestion.suggested_meta_description }} +
+
+ {% endif %} + + {% if title_suggestion.prompt %} +
+
Prompt
+
{{ title_suggestion.prompt }}
+
+ {% endif %} +
+ +
+
+

Derived inputs

+
+ {% if should_compute_links %} + Computed links are enabled. + {% else %} + Computed links are disabled (fast mode). + {% endif %} +
+
+ +
+
+
Keywords to use ({{ keywords_to_use|length }})
+ {% if keywords_to_use %} +
+ {% for keyword_text in keywords_to_use %} + + {{ keyword_text }} + + {% endfor %} +
+ {% else %} +
No keywords computed.
+ {% endif %} +
+ +
+
Internal links ({{ internal_links|length }})
+ {% if internal_links %} +
+ + + + + + + + + + {% for project_page in internal_links %} + + + + + + {% endfor %} + +
TitleURLAlways use
{{ project_page.title }} + + {{ project_page.url }} + + {{ project_page.always_use|yesno:"Yes,No" }}
+
+ {% else %} +
No internal links available.
+ {% endif %} +
+ +
+
External links ({{ external_links|length }})
+ {% if external_links %} +
+ + + + + + + + + + {% for project_page in external_links %} + + + + + + {% endfor %} + +
ProjectTitleURL
{{ project_page.project.name }}{{ project_page.title }} + + {{ project_page.url }} + +
+
+ {% else %} +
No external links available.
+ {% endif %} +
+
+
+ +
+

Generated blog posts ({{ generated_blog_posts|length }})

+ + {% if generated_blog_posts %} +
+ {% for blog_post in generated_blog_posts %} +
+ +
+
#{{ blog_post.id }} — {{ blog_post.title }}
+
+ content_length={{ blog_post.content_length }} · posted={{ blog_post.posted|yesno:"true,false" }} + {% if blog_post.slug %} · slug={{ blog_post.slug }}{% endif %} +
+
+
+ {% if blog_post.content_length > 0 %} + + View post + + {% endif %} +
+
+ +
+ {% if blog_post.description %} +
+
Description
+
{{ blog_post.description }}
+
+ {% endif %} + + {% if blog_post.sections %} +
+
Sections ({{ blog_post.sections|length }})
+
+ {% for section in blog_post.sections %} +
+ + [{{ section.order }}] {{ section.title }} + +
+ {% if section.content %} +
{{ section.content }}
+ {% endif %} + +
+
Research questions ({{ section.questions|length }})
+ {% if section.questions %} +
+ {% for research_question in section.questions %} +
+
Q{{ research_question.id }}: {{ research_question.question }}
+
+
Links ({{ research_question.links|length }})
+ {% if research_question.links %} +
    + {% for research_link in research_question.links %} +
  • +
    {{ research_link.title|default:"(no title)" }}
    + + {{ research_link.url }} + +
    + author={{ research_link.author|default:"" }} + {% if research_link.published_date %} · published={{ research_link.published_date|date:"Y-m-d" }}{% endif %} + {% if research_link.date_scraped %} · scraped={{ research_link.date_scraped|date:"Y-m-d H:i" }}{% endif %} + {% if research_link.date_analyzed %} · analyzed={{ research_link.date_analyzed|date:"Y-m-d H:i" }}{% endif %} +
    +
  • + {% endfor %} +
+ {% else %} +
No links.
+ {% endif %} +
+
+ {% endfor %} +
+ {% else %} +
No research questions.
+ {% endif %} +
+
+
+ {% endfor %} +
+
+ {% endif %} + + {% if blog_post.blog_level_questions %} +
+
Blog-level research questions ({{ blog_post.blog_level_questions|length }})
+
+ {% for research_question in blog_post.blog_level_questions %} +
+
Q{{ research_question.id }}: {{ research_question.question }}
+
+
Links ({{ research_question.links|length }})
+ {% if research_question.links %} +
    + {% for research_link in research_question.links %} +
  • +
    {{ research_link.title|default:"(no title)" }}
    + + {{ research_link.url }} + +
    + author={{ research_link.author|default:"" }} + {% if research_link.published_date %} · published={{ research_link.published_date|date:"Y-m-d" }}{% endif %} + {% if research_link.date_scraped %} · scraped={{ research_link.date_scraped|date:"Y-m-d H:i" }}{% endif %} + {% if research_link.date_analyzed %} · analyzed={{ research_link.date_analyzed|date:"Y-m-d H:i" }}{% endif %} +
    +
  • + {% endfor %} +
+ {% else %} +
No links.
+ {% endif %} +
+
+ {% endfor %} +
+
+ {% endif %} +
+
+ {% endfor %} +
+ {% else %} +
No generated blog posts found for this title suggestion yet.
+ {% endif %} +
+
+
+{% endblock project_content %} diff --git a/frontend/templates/blog/generated_blog_post_detail.html b/frontend/templates/blog/generated_blog_post_detail.html index c5daf0e..9dc8002 100644 --- a/frontend/templates/blog/generated_blog_post_detail.html +++ b/frontend/templates/blog/generated_blog_post_detail.html @@ -30,6 +30,20 @@

{{ generated_post.description }}

{% endif %} + + {% if generated_post.title_suggestion_id %} + + {% endif %} diff --git a/frontend/templates/components/blog_post_suggestion_card.html b/frontend/templates/components/blog_post_suggestion_card.html index 408b6ce..00ab42e 100644 --- a/frontend/templates/components/blog_post_suggestion_card.html +++ b/frontend/templates/components/blog_post_suggestion_card.html @@ -164,8 +164,18 @@

- -
+
+ + + + + Research + + +