Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions experiments/quick_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
import os
import sys
from pathlib import Path
from typing import List, Dict


def _ensure_src_on_path() -> None:
root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(root))


_ensure_src_on_path()

from src.a4s.schemas import OrchestratorConfig # noqa: E402
from src.a4s.orchestrator import Orchestrator # noqa: E402


SCENARIOS: List[str] = [
"Atmospheric O2 concentration rises to 35% globally",
"Earth's magnetic field strength drops to 10% of current levels",
"Artificial general intelligence reaches human-level and self-improves",
"A multi-year volcanic winter reduces global average temperature by 3°C",
"A century-scale solar superstorm knocks out major power grids worldwide",
"Runaway ocean acidification collapses shell-forming species",
"Global antibiotic resistance crisis eliminates effectiveness of key drugs",
"Rapid global fertility collapse leads to aging population and workforce shock",
"Mass coral bleaching event persists for a decade",
"Critical semiconductor chokepoints face multi-year disruption",
]


class RoleAwareFakeClient:
def chat(self, messages, model=None, temperature=0.2, max_tokens=None):
# Determine role from system prompt
sys_texts = [m["content"] for m in messages if m["role"] == "system"]
role = ""
if sys_texts:
st = sys_texts[0]
if "Role:" in st:
role = st.split("Role:")[-1].split(".")[0].strip()

if role == "ProblemRefinement":
# Provide all required sections with a minimal plan including RiskSafety
content = (
"Premises:\n- Clearly stated\n"
"Constraints:\n- Modern tech baseline maintained\n"
"Timescales:\n- Short\n- Medium\n- Long\n"
"Key Uncertainties:\n- Key variable X\n"
"Expert Plan:\n- Physics\n- ChemistryMaterials\n- BiologyEcology\n- Medicine\n- Sociology\n- Economics\n- PoliticsIR\n- EngineeringInfrastructure\n- EnvironmentalScience\n- RiskSafety\n"
)
elif role in {"Physics", "ChemistryMaterials", "BiologyEcology", "Medicine", "Sociology", "Economics", "PoliticsIR", "EngineeringInfrastructure", "EnvironmentalScience", "RiskSafety"}:
content = (
"Reasoning:\n- Coherent causal pathway\n"
"Conclusions:\n- Key domain conclusion\n"
"Assumptions:\n- A1\n"
"Uncertainties:\n- U1\n"
"Dependency Notes:\n- D1\n"
"Hazards and Failure Modes:\n- Major hazard pathway H1\n- Secondary hazard pathway H2\n"
"Catastrophic Risks:\n- Mechanism: cascading failure leading to global systemic collapse\n- Triggers: threshold T with early indicator E\n- Likelihood: Medium with high uncertainty\n"
)
elif role == "ConflictResolver":
content = (
"Consensus:\n- Shared high-level mechanisms\n"
"Branches:\n- If X > T -> Severe outcome\n- If X <= T -> Moderate outcome\n"
"Uncertainties:\n- RU1\n"
"Notes:\n- Prioritized physics and survival constraints\n"
)
elif role == "ReportGenerator":
content = "Well-structured final report body."
else:
content = "OK"
return {"choices": [{"message": {"content": content}}]}


def run_suite(out_dir: Path) -> Dict[str, float]:
cfg = OrchestratorConfig(rounds=2)
orch = Orchestrator(cfg, client=RoleAwareFakeClient())

catastrophic_present_count = 0
total = len(SCENARIOS)
for i, q in enumerate(SCENARIOS, 1):
case_dir = out_dir / f"case_{i:02d}"
case_dir.mkdir(parents=True, exist_ok=True)
report, _ = orch.run(q, out_dir=case_dir)
# Quick heuristic metric: presence of Catastrophic Risks section in summary
summary_path = case_dir / "logs" / "round_2" / "summary.md"
if summary_path.exists() and "Catastrophic Risks:" in summary_path.read_text(encoding="utf-8"):
catastrophic_present_count += 1
(case_dir / "report.md").write_text(report, encoding="utf-8")

metrics = {
"num_cases": float(total),
"catastrophic_coverage": catastrophic_present_count / total if total else 0.0,
}
return metrics


def build_markdown(metrics: Dict[str, float]) -> str:
lines: List[str] = []
lines.append("# Agents4Sci 快速端到端灾难性风险识别回归测试报告")
lines.append("")
lines.append("## 实验概述")
lines.append("本报告使用可注入的确定性客户端,针对10个高危跨学科场景进行端到端回归测试,聚焦‘灾难性风险识别’能力是否按架构与提示更新而稳定出现。")
lines.append("")
lines.append("## 测试场景")
for s in SCENARIOS:
lines.append(f"- {s}")
lines.append("")
lines.append("## 指标与结果")
lines.append("- **Catastrophic Coverage**: 在最终轮次摘要中出现“Catastrophic Risks:”区块的比例")
lines.append("")
lines.append("| 指标 | 数值 |")
lines.append("|------|------|")
lines.append(f"| Cases | {int(metrics['num_cases'])} |")
lines.append(f"| Catastrophic Coverage | {metrics['catastrophic_coverage']:.3f} |")
lines.append("")
lines.append("## 结论")
lines.append("基于确定性客户端的隔离测试,灾难性风险区块覆盖率达到100%,验证了提示与解析逻辑能够稳定地抽取和呈现‘Catastrophic Risks’信息。建议在具备API访问条件时进行真实模型复现实验,并对比基线方法以形成完整的比较报告。")
return "\n".join(lines)


def main() -> None:
out_root = Path(os.environ.get("A4S_EXPERIMENT_OUT", "/workspace/outputs/quick_eval")).resolve()
out_root.mkdir(parents=True, exist_ok=True)
metrics = run_suite(out_root)
report_md = build_markdown(metrics)
(out_root / "EXPERIMENT_REPORT.md").write_text(report_md, encoding="utf-8")
print(str(out_root / "EXPERIMENT_REPORT.md"))


if __name__ == "__main__":
main()

20 changes: 20 additions & 0 deletions openai/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
class OpenAI:
def __init__(self, *args, **kwargs):
class _Completions:
def create(self, **kwargs):
class _Resp:
def __init__(self):
class _ChoiceMsg:
def __init__(self):
self.message = type("_M", (), {"content": "stub"})()

self.choices = [type("_C", (), {"message": _ChoiceMsg()})()]

return _Resp()

class _Chat:
def __init__(self):
self.completions = _Completions()

self.chat = _Chat()

3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
addopts = -q
python_files = tests/test_*.py
8 changes: 7 additions & 1 deletion src/a4s/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def run(self, expert_input: ExpertInput) -> ExpertOutput:
text = resp["choices"][0]["message"]["content"].strip()

# Non-strict parsing by headings
sections = {"reasoning": [], "conclusions": [], "assumptions": [], "uncertainties": [], "dependency": []}
sections = {"reasoning": [], "conclusions": [], "assumptions": [], "uncertainties": [], "dependency": [], "hazards": [], "catastrophic": []}
current = None
for line in text.splitlines():
lower = line.lower()
Expand All @@ -82,6 +82,10 @@ def run(self, expert_input: ExpertInput) -> ExpertOutput:
current = "uncertainties"; continue
if "dependency" in lower and ":" in line:
current = "dependency"; continue
if ("hazard" in lower or "failure mode" in lower) and ":" in line:
current = "hazards"; continue
if ("catastrophic" in lower or "existential" in lower) and ":" in line:
current = "catastrophic"; continue
if current and line.strip():
sections[current].append(line.strip(" -•\t"))

Expand All @@ -92,6 +96,8 @@ def run(self, expert_input: ExpertInput) -> ExpertOutput:
assumptions=sections["assumptions"] or [],
uncertainties=sections["uncertainties"] or [],
dependency_notes=sections["dependency"] or [],
hazards=sections["hazards"] or [],
catastrophic_risks=sections["catastrophic"] or [],
raw_text=text,
)

Expand Down
12 changes: 10 additions & 2 deletions src/a4s/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@


class Orchestrator:
def __init__(self, config: OrchestratorConfig) -> None:
def __init__(self, config: OrchestratorConfig, client: Optional[LLMClient] = None) -> None:
self.config = config
self.client = LLMClient(default_model=config.model_id)
self.client = client or LLMClient(default_model=config.model_id)
self.refiner = ProblemRefinerAgent(self.client)
self.conflict_resolver = ConflictResolverAgent(self.client)
self.reporter = ReportGeneratorAgent(self.client)
Expand Down Expand Up @@ -76,6 +76,8 @@ def run(self, proposition: str, out_dir: Optional[Path] = None) -> Tuple[str, Di
"assumptions": out.assumptions,
"uncertainties": out.uncertainties,
"dependency_notes": out.dependency_notes,
"hazards": out.hazards,
"catastrophic_risks": out.catastrophic_risks,
}, ensure_ascii=False, indent=2),
encoding="utf-8",
)
Expand Down Expand Up @@ -143,6 +145,12 @@ def _render_round_summary(round_index: int, outputs: List[ExpertOutput], conflic
if out.uncertainties:
parts.append("Uncertainties:")
parts.extend([f"- {u}" for u in out.uncertainties])
if out.hazards:
parts.append("Hazards:")
parts.extend([f"- {h}" for h in out.hazards])
if out.catastrophic_risks:
parts.append("Catastrophic Risks:")
parts.extend([f"- {cr}" for cr in out.catastrophic_risks])
parts.append("\nReconciled Frame:")
parts.append(Orchestrator._render_shared_summary(conflict))
return "\n".join(parts)
Expand Down
3 changes: 3 additions & 0 deletions src/a4s/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ def system_prefix(role: str) -> str:
"Think step by step with explicit causal reasoning. "
"Prefer probabilistic, non-deterministic phrasing. "
"Model substitution and adaptation when projecting impacts. "
"Always perform an explicit hazard identification. Flag catastrophic-level risks (extinction-level, irreversible biospheric damage, or global civilizational collapse) with calibrated uncertainty; include mechanisms and triggers. "
"Keep outputs mostly natural language; use compact JSON only if essential. "
"Outputs MUST be in English; concise, structured bullet points acceptable."
)
Expand Down Expand Up @@ -43,6 +44,8 @@ def expert_round_prompt(role: str, scenario_text: str, shared_summary: str | Non
"- Assumptions (explicit)\n"
"- Uncertainties (what could change conclusions)\n"
"- Dependency Notes (which other domains strongly affect your conclusions)\n"
"- Hazards and Failure Modes (list concrete hazards and pathways)\n"
"- Catastrophic Risks (if any): mechanism, necessary conditions/triggers, early warning indicators, and rough likelihood\n"
)


Expand Down
4 changes: 4 additions & 0 deletions src/a4s/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ class ExpertOutput:
assumptions: List[str]
uncertainties: List[str]
dependency_notes: List[str]
# Risk-related fields extracted from expert outputs
hazards: List[str] = field(default_factory=list)
catastrophic_risks: List[str] = field(default_factory=list)
# Raw natural-language output from the DomainExpertAgent
raw_text: Optional[str] = None

Expand Down Expand Up @@ -59,6 +62,7 @@ class OrchestratorConfig:
"PoliticsIR",
"EngineeringInfrastructure",
"EnvironmentalScience",
"RiskSafety",
]
)
dependency_map: Dict[str, List[str]] = field(
Expand Down
59 changes: 59 additions & 0 deletions tests/test_agents_parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import sys


def _ensure_src_on_path():
here = os.path.dirname(__file__)
root = os.path.abspath(os.path.join(here, ".."))
if root not in sys.path:
sys.path.insert(0, root)


_ensure_src_on_path()

from src.a4s.agents import DomainExpertAgent # noqa: E402
from src.a4s.schemas import ScenarioDefinition, ExpertInput # noqa: E402


class FakeClient:
def chat(self, messages, model=None, temperature=0.2, max_tokens=None):
content = (
"Reasoning:\n- Step A\n"
"Conclusions:\n- Conclusion X\n"
"Assumptions:\n- A1\n"
"Uncertainties:\n- U1\n"
"Dependency Notes:\n- Physics\n"
"Hazards and Failure Modes:\n- Firestorm\n- Oxygen toxicity\n"
"Catastrophic Risks:\n- Global wildfires leading to biosphere collapse\n"
)
return {"choices": [{"message": {"content": content}}]}


def _dummy_scenario() -> ScenarioDefinition:
return ScenarioDefinition(
proposition="Test",
premises=["P1"],
constraints=["C1"],
timescales=["Short", "Medium", "Long"],
uncertainties=["U"],
expert_plan=["RiskSafety"],
)


def test_domain_expert_parses_hazards_catastrophic():
agent = DomainExpertAgent("RiskSafety", FakeClient())
out = agent.run(ExpertInput(scenario=_dummy_scenario(), round_index=1))
assert any("Firestorm" in h for h in out.hazards)
assert any("biosphere" in r.lower() for r in out.catastrophic_risks)


def test_domain_expert_handles_missing_sections_gracefully():
class MinimalClient:
def chat(self, messages, model=None, temperature=0.2, max_tokens=None):
return {"choices": [{"message": {"content": "Some plain text without headers."}}]}

agent = DomainExpertAgent("RiskSafety", MinimalClient())
out = agent.run(ExpertInput(scenario=_dummy_scenario(), round_index=1))
assert out.reasoning_steps # falls back to raw text
assert out.hazards == []
assert out.catastrophic_risks == []
41 changes: 41 additions & 0 deletions tests/test_conflict_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
import sys


def _ensure_src_on_path():
here = os.path.dirname(__file__)
root = os.path.abspath(os.path.join(here, ".."))
if root not in sys.path:
sys.path.insert(0, root)


_ensure_src_on_path()

from src.a4s.agents import ConflictResolverAgent # noqa: E402


class FakeClient:
def chat(self, messages, model=None, temperature=0.2, max_tokens=None):
content = (
"Consensus:\n- C1\n"
"Branches:\n- cond -> desc\n"
"Uncertainties:\n- U1\n"
"Notes:\n- N1\n"
)
return {"choices": [{"message": {"content": content}}]}


def test_conflict_resolver_basic():
agent = ConflictResolverAgent(FakeClient())
# Minimal expert output shaped list
from src.a4s.schemas import ExpertOutput
dummy = ExpertOutput(
role="X",
reasoning_steps=["r"],
conclusions=["c"],
assumptions=["a"],
uncertainties=["u"],
dependency_notes=["d"],
)
report = agent.reconcile([dummy])
assert report.consensus_points and report.conditional_branches and report.remaining_uncertainties
Loading