AgentSkillsCN

Agent Testing

利用 Mock 进行单元测试、开展集成测试、借助 LangSmith 实施评估与基准测试

SKILL.md
--- frontmatter
description: Unit testing with mocks, integration testing, LangSmith evaluation, benchmarking

Agent Testing

Unit testing with mocks, integration testing, LangSmith evaluation, benchmarking

Agent Testing Skill

Implement comprehensive testing strategies for agents - unit tests with mocks, integration tests, LangSmith evaluation, and benchmarking.

Agent Testing Skill

Implement comprehensive testing strategies for agents - unit tests with mocks, integration tests, LangSmith evaluation, and benchmarking.

Process

Step 1: Unit Testing with Mocks

Write unit tests with mocked LLM and tools:

python
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain_core.tools import tool

# Simple agent function to test
async def simple_agent(prompt: str, llm, tools: list) -> str:
    """Run one LLM turn and execute any tool calls the model requests.

    Args:
        prompt: Input forwarded unchanged to the tool-bound model.
        llm: Chat model exposing ``bind_tools(tools)``; the bound model must
            provide an awaitable ``ainvoke``.
        tools: Tools offered to the model. A requested call is matched to a
            tool through the tool's ``name`` attribute.

    Returns:
        ``"Tool executed"`` when the response carries tool calls, otherwise
        the response's ``content``. Tool return values are discarded.
    """
    llm_with_tools = llm.bind_tools(tools)
    response = await llm_with_tools.ainvoke(prompt)

    if not response.tool_calls:
        return response.content

    tools_by_name = {getattr(candidate, "name", None): candidate for candidate in tools}
    for call in response.tool_calls:
        requested = tools_by_name.get(call["name"])
        if requested is None:
            # The model asked for a tool that was not supplied; there is
            # nothing to run, so the call is skipped (simplified agent).
            continue
        await requested.ainvoke(call["args"])

    return "Tool executed"

# Unit test with mocked LLM
@pytest.mark.asyncio
async def test_simple_agent_with_mock():
    """Test agent with mocked LLM."""
    # bind_tools() is called synchronously, so the LLM itself must be a
    # MagicMock: attributes of an AsyncMock are AsyncMocks too, and calling
    # one returns a coroutine instead of the configured return_value.
    mock_llm = MagicMock()
    mock_response = AIMessage(content="Test response")
    bound_llm = mock_llm.bind_tools.return_value
    bound_llm.ainvoke = AsyncMock(return_value=mock_response)

    # Test
    result = await simple_agent("test prompt", mock_llm, [])

    assert result == "Test response"
    mock_llm.bind_tools.assert_called_once_with([])
    # assert_awaited_* also proves the coroutine was awaited, not just created.
    bound_llm.ainvoke.assert_awaited_once_with("test prompt")

# Test with tool calls
@pytest.mark.asyncio
async def test_agent_with_tool_calls():
    """Test agent that calls tools."""
    # MagicMock keeps the synchronous bind_tools() call synchronous; only the
    # bound model's ainvoke needs to be awaitable.
    mock_llm = MagicMock()
    mock_response = AIMessage(
        content="",
        tool_calls=[{
            "name": "test_tool",
            "args": {"arg1": "value1"},
            "id": "call_123"
        }]
    )
    mock_llm.bind_tools.return_value.ainvoke = AsyncMock(return_value=mock_response)

    @tool
    def test_tool(arg1: str) -> str:
        """Test tool."""
        return f"Result: {arg1}"

    result = await simple_agent("test prompt", mock_llm, [test_tool])

    assert result == "Tool executed"
    mock_llm.bind_tools.assert_called_once_with([test_tool])
    mock_llm.bind_tools.return_value.ainvoke.assert_awaited_once_with("test prompt")

# Using pytest fixtures
@pytest.fixture
def mock_llm():
    """Fixture for mocked LLM.

    Returns a MagicMock whose ``bind_tools()`` result has an awaitable
    ``ainvoke`` replying with ``AIMessage(content="Mocked response")``.
    """
    # The LLM must not be an AsyncMock: its bind_tools attribute would then be
    # an AsyncMock as well, and calling it would return a coroutine.
    llm = MagicMock()
    llm.bind_tools.return_value.ainvoke = AsyncMock(
        return_value=AIMessage(content="Mocked response")
    )
    return llm

@pytest.fixture
def sample_tools():
    """Provide the list of tools handed to the agent under test."""
    def get_weather(location: str) -> str:
        """Get weather for location."""
        return f"Weather in {location}: Sunny"

    # Applying tool() directly is what the @tool decorator form does.
    weather_tool = tool(get_weather)
    return [weather_tool]

@pytest.mark.asyncio
async def test_agent_with_fixtures(mock_llm, sample_tools):
    """Test using fixtures."""
    result = await simple_agent("What's the weather?", mock_llm, sample_tools)

    # The mock_llm fixture replies with this content and no tool calls, so the
    # agent must pass it through; "is not None" would accept any return value.
    assert result == "Mocked response"
    mock_llm.bind_tools.assert_called_once_with(sample_tools)

Step 2: Integration Testing

Test agents with real or test LLMs:

python
import pytest
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
import os

# Integration test with test LLM
@pytest.mark.asyncio
@pytest.mark.integration
async def test_agent_integration():
    """Integration test with real LLM."""
    # Skip rather than error when no credentials are configured, so the
    # suite stays green on machines without access to the API.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        pytest.skip("OPENAI_API_KEY is not set")

    # Use test API key or test model
    llm = ChatOpenAI(
        model="gpt-3.5-turbo",  # Cheaper model for testing
        temperature=0,
        api_key=api_key
    )

    @tool
    def add_numbers(a: int, b: int) -> str:
        """Add two numbers."""
        return str(a + b)

    # Test agent
    result = await simple_agent("Add 2 and 3", llm, [add_numbers])

    # Verify result contains expected content: either a direct answer or the
    # agent's marker for a tool call.
    assert "5" in result or "tool" in result.lower()

# Test with test database
@pytest.fixture
def test_db():
    """Yield an empty in-memory dict standing in for a database.

    The dict is emptied again once the test using it has finished.
    """
    store = dict()
    yield store
    # Teardown
    store.clear()

@pytest.mark.asyncio
@pytest.mark.integration
async def test_agent_with_database(test_db):
    """Test agent that interacts with database."""
    if not os.getenv("OPENAI_API_KEY"):
        pytest.skip("OPENAI_API_KEY is not set")

    @tool
    def store_data(key: str, value: str) -> str:
        """Store data in test database."""
        test_db[key] = value
        return f"Stored {key}"

    @tool
    def get_data(key: str) -> str:
        """Get data from test database."""
        return test_db.get(key, "Not found")

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    # Test storing and retrieving. The stored value only appears if
    # simple_agent actually runs the store_data call the model requests.
    await simple_agent("Store test_key with value test_value", llm, [store_data])

    # Read back through the retrieval tool: a missing key yields "Not found"
    # and a readable assertion failure instead of a KeyError.
    assert get_data.invoke({"key": "test_key"}) == "test_value"

Step 3: LangSmith Evaluation

Use LangSmith for agent evaluation:

python
from langsmith import Client, evaluate
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
import os

# Initialize LangSmith client
client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

# Define evaluation dataset
dataset_name = "agent_test_dataset"

# Reuse the dataset on re-runs: creating a dataset under a name that already
# exists is rejected, and re-adding the examples would duplicate them.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    # Create dataset
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Test dataset for agent evaluation"
    )

    # Add examples (inputs[i] is paired with outputs[i])
    client.create_examples(
        inputs=[
            {"input": "What is Python?"},
            {"input": "Add 2 and 3"},
            {"input": "What's the weather in NYC?"}
        ],
        outputs=[
            {"output": "Python is a programming language"},
            {"output": "5"},
            {"output": "Weather information"}
        ],
        dataset_id=dataset.id
    )

# Define agent function
async def test_agent(input: dict) -> dict:
    """Answer ``input["input"]`` with a chat model.

    Returns a dict whose ``"output"`` entry is the model reply's content.
    """
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    question = input["input"]
    reply = await chat_model.ainvoke(question)
    answer = {"output": reply.content}
    return answer

# Evaluation function
def _output_text(outputs) -> str:
    """Return the ``"output"`` entry of an outputs mapping as text ("" if absent)."""
    value = (outputs or {}).get("output")
    return "" if value is None else str(value)


def evaluate_agent(run, example):
    """Custom evaluation function.

    Scores a run 1.0 when the reference output occurs (case-insensitively)
    inside the predicted output, else 0.0.

    Args:
        run: Object whose ``outputs`` is a mapping with an ``"output"`` entry,
            or ``None`` (treated as an empty prediction).
        example: Object whose ``outputs`` holds the reference ``"output"``.

    Returns:
        Dict with the feedback fields ``key``/``score`` plus the original
        ``accuracy``, ``predicted`` and ``expected`` entries.
    """
    predicted = _output_text(run.outputs)
    expected = _output_text(example.outputs)

    # An empty reference is a substring of every string; without this guard a
    # missing reference would score as a perfect match.
    matched = bool(expected) and expected.lower() in predicted.lower()
    accuracy = 1.0 if matched else 0.0

    return {
        "key": "accuracy",
        "score": accuracy,
        "accuracy": accuracy,
        "predicted": predicted,
        "expected": expected
    }

# Run evaluation
# test_agent is a coroutine function; the synchronous evaluate() only accepts
# sync targets, so both experiments go through aevaluate().
import asyncio

from langsmith.evaluation import LangChainStringEvaluator, aevaluate


async def _run_custom_evaluation():
    """Run the custom-evaluator experiment and print the feedback per row."""
    experiment = await aevaluate(
        test_agent,
        data=dataset_name,
        evaluators=[evaluate_agent],
        experiment_prefix="agent_test"
    )

    # View results: each row carries the run, the dataset example and the
    # evaluator feedback; evaluator return values are not the rows themselves.
    async for row in experiment:
        predicted = (row["run"].outputs or {}).get("output", "")
        expected = (row["example"].outputs or {}).get("output", "")
        for feedback in row["evaluation_results"]["results"]:
            print(f"{feedback.key}: {feedback.score}")
        print(f"Predicted: {predicted}")
        print(f"Expected: {expected}")

    return experiment


results = asyncio.run(_run_custom_evaluation())

# Using LangSmith's built-in evaluators
# Create evaluator (evaluator options are passed through `config`)
evaluator = LangChainStringEvaluator(
    "labeled_score_string",
    config={
        "criteria": {
            "helpfulness": "Is the response helpful?",
            "accuracy": "Is the response accurate?"
        }
    }
)

# Run evaluation with built-in evaluator
results = asyncio.run(
    aevaluate(
        test_agent,
        data=dataset_name,
        evaluators=[evaluator],
        experiment_prefix="agent_eval"
    )
)

Step 4: Benchmarking Agents

Measure agent performance:

python
import time
import asyncio
from typing import List, Dict
import statistics

class AgentBenchmark:
    """Measure latency and success rate of an async agent callable.

    ``results`` accumulates one record per call and is not reset between
    ``benchmark()`` invocations on the same instance.
    """

    def __init__(self, agent_func):
        # Coroutine function awaited once per test case and iteration.
        self.agent_func = agent_func
        # Per-call records appended by benchmark().
        self.results: List[Dict] = []

    @staticmethod
    def _p95(latencies: List[float]) -> float:
        """Return the 95th-percentile latency, or 0 when nothing succeeded."""
        if not latencies:
            return 0
        if len(latencies) == 1:
            # quantiles() needs at least two data points.
            return latencies[0]
        # n=20 yields 19 cut points; index 18 is the 95th percentile.
        # "inclusive" interpolates between observed samples, so the value
        # never exceeds the slowest measured call even for small samples.
        return statistics.quantiles(latencies, n=20, method="inclusive")[18]

    async def benchmark(
        self,
        test_cases: List[str],
        iterations: int = 10
    ) -> Dict:
        """Run benchmark on test cases.

        Args:
            test_cases: Prompts passed one at a time to ``agent_func``.
            iterations: Number of sequential calls per prompt.

        Returns:
            Summary dict with ``total_tests``, ``successes``, ``errors``,
            ``success_rate`` and avg/median/p95/min/max latency in seconds.
            Latency figures cover successful calls only and are 0 when no
            call succeeded.
        """
        latencies = []
        errors = 0

        for test_case in test_cases:
            for _ in range(iterations):
                # perf_counter() is monotonic, so a system clock adjustment
                # cannot produce negative or inflated latencies.
                start_time = time.perf_counter()

                try:
                    result = await self.agent_func(test_case)
                except Exception as e:
                    # A failing call is recorded, not raised, so one bad
                    # prompt does not abort the whole benchmark.
                    errors += 1
                    self.results.append({
                        "test_case": test_case,
                        "success": False,
                        "error": str(e),
                        "latency": None
                    })
                else:
                    latency = time.perf_counter() - start_time
                    latencies.append(latency)
                    self.results.append({
                        "test_case": test_case,
                        "success": True,
                        "latency": latency,
                        "result_length": len(str(result))
                    })

        successes = len(latencies)
        attempts = successes + errors

        return {
            "total_tests": attempts,
            "successes": successes,
            "errors": errors,
            "success_rate": successes / attempts if attempts > 0 else 0,
            "avg_latency": statistics.mean(latencies) if latencies else 0,
            "median_latency": statistics.median(latencies) if latencies else 0,
            "p95_latency": self._p95(latencies),
            "min_latency": min(latencies) if latencies else 0,
            "max_latency": max(latencies) if latencies else 0
        }

# Usage
async def benchmark_agent(prompt: str) -> str:
    """Send ``prompt`` to a freshly built chat model and return the reply's content."""
    reply = await ChatOpenAI(model="gpt-3.5-turbo").ainvoke(prompt)
    return reply.content

benchmark = AgentBenchmark(benchmark_agent)
# `await` is only valid inside a coroutine; at module level the benchmark
# coroutine has to be driven by an event loop.
results = asyncio.run(
    benchmark.benchmark(
        test_cases=["What is Python?", "Explain async/await"],
        iterations=5
    )
)

print(f"Success Rate: {results['success_rate']:.2%}")
print(f"Average Latency: {results['avg_latency']:.2f}s")
print(f"P95 Latency: {results['p95_latency']:.2f}s")

Step 5: Testing Tool Execution

Test agent tool usage:

python
import pytest
from unittest.mock import AsyncMock, patch

@pytest.mark.asyncio
async def test_tool_execution():
    """Test that agent executes tools correctly."""
    @tool
    def test_tool(param: str) -> str:
        """Test tool."""
        return f"Result: {param}"

    # Drive the tool the way an agent would: pick it by the name in the
    # model's tool call and pass that call's arguments. Asserting on a patched
    # `invoke` without any code calling it can never succeed.
    tool_call = {"name": "test_tool", "args": {"param": "test"}, "id": "call_1"}
    assert test_tool.name == tool_call["name"]

    result = await test_tool.ainvoke(tool_call["args"])

    # Verify tool was executed with the requested arguments
    assert result == "Result: test"

# Test tool error handling
@pytest.mark.asyncio
async def test_tool_error_handling():
    """Test agent handles tool errors."""
    @tool
    def failing_tool() -> str:
        """Tool that always fails."""
        raise ValueError("Tool error")

    # Pin the tool-level contract an agent's error handling builds on: the
    # failure reaches the caller as the original exception.
    # NOTE(review): agent-level recovery is not shown in this file; extend
    # this test once the agent under test defines how it reports tool errors.
    with pytest.raises(ValueError, match="Tool error"):
        failing_tool.invoke({})

Step 6: End-to-End Testing

Test complete agent workflows:

python
import pytest
from langchain_core.messages import HumanMessage

@pytest.mark.asyncio
@pytest.mark.e2e
async def test_complete_agent_workflow():
    """Check that a tool-bound live model requests one of the offered tools."""
    # Setup
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    def get_weather(location: str) -> str:
        """Get weather."""
        return f"Weather in {location}: Sunny, 72°F"

    def search_kb(query: str) -> str:
        """Search knowledge base."""
        return f"Results for: {query}"

    # Applying tool() directly is what the @tool decorator form does.
    get_weather = tool(get_weather)
    search_kb = tool(search_kb)
    bound_model = chat_model.bind_tools([get_weather, search_kb])

    # Execute workflow
    question = HumanMessage(content="What's the weather in NYC and search for Python?")
    response = await bound_model.ainvoke([question])

    # Verify response
    assert response is not None
    assert len(response.tool_calls) > 0

    # Verify tool calls: at least one offered tool must have been requested
    requested = {call["name"] for call in response.tool_calls}
    assert requested & {"get_weather", "search_kb"}

# Test multi-turn conversation
@pytest.mark.asyncio
@pytest.mark.e2e
async def test_multi_turn_conversation():
    """Test agent maintains context across turns."""
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    messages = []

    # Turn 1
    messages.append(HumanMessage(content="My name is Alice"))
    response1 = await llm.ainvoke(messages)
    messages.append(response1)

    # Turn 2
    messages.append(HumanMessage(content="What's my name?"))
    response2 = await llm.ainvoke(messages)

    # Verify agent remembers. The reply is lower-cased, so the needle must be
    # lower-case as well; "Alice" can never occur in a lower-cased string.
    assert "alice" in response2.content.lower()

Step 7: Property-Based Testing

Test agent properties:

python
import asyncio
import time

import pytest
from hypothesis import given, settings, strategies as st

# Live LLM calls take far longer than Hypothesis' default 200 ms deadline and
# the default 100 examples would mean 100 paid requests, so both are capped.
@settings(max_examples=5, deadline=None)
@given(st.text(min_size=1, max_size=100))
@pytest.mark.asyncio
async def test_agent_always_responds(prompt: str):
    """Property: Agent always returns a response."""
    llm = ChatOpenAI(model="gpt-3.5-turbo")
    response = await llm.ainvoke(prompt)

    assert response is not None
    assert hasattr(response, 'content')
    assert len(response.content) > 0

# Live LLM calls take far longer than Hypothesis' default 200 ms deadline and
# the default 100 examples would mean 100 paid requests, so both are capped.
@settings(max_examples=5, deadline=None)
@given(st.text(min_size=1, max_size=50))
@pytest.mark.asyncio
async def test_agent_response_time(prompt: str):
    """Property: Agent responds within timeout."""
    llm = ChatOpenAI(model="gpt-3.5-turbo")

    # perf_counter() is monotonic, unlike the wall clock.
    start_time = time.perf_counter()
    # wait_for() raises a timeout error when the call exceeds the limit.
    response = await asyncio.wait_for(
        llm.ainvoke(prompt),
        timeout=30.0
    )
    latency = time.perf_counter() - start_time

    assert latency < 30.0
    assert response is not None
python
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain_core.tools import tool

# Simple agent function to test
async def simple_agent(prompt: str, llm, tools: list) -> str:
    """Run one LLM turn and execute any tool calls the model requests.

    Args:
        prompt: Input forwarded unchanged to the tool-bound model.
        llm: Chat model exposing ``bind_tools(tools)``; the bound model must
            provide an awaitable ``ainvoke``.
        tools: Tools offered to the model. A requested call is matched to a
            tool through the tool's ``name`` attribute.

    Returns:
        ``"Tool executed"`` when the response carries tool calls, otherwise
        the response's ``content``. Tool return values are discarded.
    """
    llm_with_tools = llm.bind_tools(tools)
    response = await llm_with_tools.ainvoke(prompt)

    if not response.tool_calls:
        return response.content

    tools_by_name = {getattr(candidate, "name", None): candidate for candidate in tools}
    for call in response.tool_calls:
        requested = tools_by_name.get(call["name"])
        if requested is None:
            # The model asked for a tool that was not supplied; there is
            # nothing to run, so the call is skipped (simplified agent).
            continue
        await requested.ainvoke(call["args"])

    return "Tool executed"

# Unit test with mocked LLM
@pytest.mark.asyncio
async def test_simple_agent_with_mock():
    """Test agent with mocked LLM."""
    # bind_tools() is called synchronously, so the LLM itself must be a
    # MagicMock: attributes of an AsyncMock are AsyncMocks too, and calling
    # one returns a coroutine instead of the configured return_value.
    mock_llm = MagicMock()
    mock_response = AIMessage(content="Test response")
    bound_llm = mock_llm.bind_tools.return_value
    bound_llm.ainvoke = AsyncMock(return_value=mock_response)

    # Test
    result = await simple_agent("test prompt", mock_llm, [])

    assert result == "Test response"
    mock_llm.bind_tools.assert_called_once_with([])
    # assert_awaited_* also proves the coroutine was awaited, not just created.
    bound_llm.ainvoke.assert_awaited_once_with("test prompt")

# Test with tool calls
@pytest.mark.asyncio
async def test_agent_with_tool_calls():
    """Test agent that calls tools."""
    # MagicMock keeps the synchronous bind_tools() call synchronous; only the
    # bound model's ainvoke needs to be awaitable.
    mock_llm = MagicMock()
    mock_response = AIMessage(
        content="",
        tool_calls=[{
            "name": "test_tool",
            "args": {"arg1": "value1"},
            "id": "call_123"
        }]
    )
    mock_llm.bind_tools.return_value.ainvoke = AsyncMock(return_value=mock_response)

    @tool
    def test_tool(arg1: str) -> str:
        """Test tool."""
        return f"Result: {arg1}"

    result = await simple_agent("test prompt", mock_llm, [test_tool])

    assert result == "Tool executed"
    mock_llm.bind_tools.assert_called_once_with([test_tool])
    mock_llm.bind_tools.return_value.ainvoke.assert_awaited_once_with("test prompt")

# Using pytest fixtures
@pytest.fixture
def mock_llm():
    """Fixture for mocked LLM.

    Returns a MagicMock whose ``bind_tools()`` result has an awaitable
    ``ainvoke`` replying with ``AIMessage(content="Mocked response")``.
    """
    # The LLM must not be an AsyncMock: its bind_tools attribute would then be
    # an AsyncMock as well, and calling it would return a coroutine.
    llm = MagicMock()
    llm.bind_tools.return_value.ainvoke = AsyncMock(
        return_value=AIMessage(content="Mocked response")
    )
    return llm

@pytest.fixture
def sample_tools():
    """Provide the list of tools handed to the agent under test."""
    def get_weather(location: str) -> str:
        """Get weather for location."""
        return f"Weather in {location}: Sunny"

    # Applying tool() directly is what the @tool decorator form does.
    weather_tool = tool(get_weather)
    return [weather_tool]

@pytest.mark.asyncio
async def test_agent_with_fixtures(mock_llm, sample_tools):
    """Test using fixtures."""
    result = await simple_agent("What's the weather?", mock_llm, sample_tools)

    # The mock_llm fixture replies with this content and no tool calls, so the
    # agent must pass it through; "is not None" would accept any return value.
    assert result == "Mocked response"
    mock_llm.bind_tools.assert_called_once_with(sample_tools)
python
import pytest
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
import os

# Integration test with test LLM
@pytest.mark.asyncio
@pytest.mark.integration
async def test_agent_integration():
    """Integration test with real LLM."""
    # Skip rather than error when no credentials are configured, so the
    # suite stays green on machines without access to the API.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        pytest.skip("OPENAI_API_KEY is not set")

    # Use test API key or test model
    llm = ChatOpenAI(
        model="gpt-3.5-turbo",  # Cheaper model for testing
        temperature=0,
        api_key=api_key
    )

    @tool
    def add_numbers(a: int, b: int) -> str:
        """Add two numbers."""
        return str(a + b)

    # Test agent
    result = await simple_agent("Add 2 and 3", llm, [add_numbers])

    # Verify result contains expected content: either a direct answer or the
    # agent's marker for a tool call.
    assert "5" in result or "tool" in result.lower()

# Test with test database
@pytest.fixture
def test_db():
    """Yield an empty in-memory dict standing in for a database.

    The dict is emptied again once the test using it has finished.
    """
    store = dict()
    yield store
    # Teardown
    store.clear()

@pytest.mark.asyncio
@pytest.mark.integration
async def test_agent_with_database(test_db):
    """Test agent that interacts with database."""
    if not os.getenv("OPENAI_API_KEY"):
        pytest.skip("OPENAI_API_KEY is not set")

    @tool
    def store_data(key: str, value: str) -> str:
        """Store data in test database."""
        test_db[key] = value
        return f"Stored {key}"

    @tool
    def get_data(key: str) -> str:
        """Get data from test database."""
        return test_db.get(key, "Not found")

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    # Test storing and retrieving. The stored value only appears if
    # simple_agent actually runs the store_data call the model requests.
    await simple_agent("Store test_key with value test_value", llm, [store_data])

    # Read back through the retrieval tool: a missing key yields "Not found"
    # and a readable assertion failure instead of a KeyError.
    assert get_data.invoke({"key": "test_key"}) == "test_value"
python
from langsmith import Client, evaluate
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
import os

# Initialize LangSmith client
client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

# Define evaluation dataset
dataset_name = "agent_test_dataset"

# Reuse the dataset on re-runs: creating a dataset under a name that already
# exists is rejected, and re-adding the examples would duplicate them.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    # Create dataset
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Test dataset for agent evaluation"
    )

    # Add examples (inputs[i] is paired with outputs[i])
    client.create_examples(
        inputs=[
            {"input": "What is Python?"},
            {"input": "Add 2 and 3"},
            {"input": "What's the weather in NYC?"}
        ],
        outputs=[
            {"output": "Python is a programming language"},
            {"output": "5"},
            {"output": "Weather information"}
        ],
        dataset_id=dataset.id
    )

# Define agent function
async def test_agent(input: dict) -> dict:
    """Answer ``input["input"]`` with a chat model.

    Returns a dict whose ``"output"`` entry is the model reply's content.
    """
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    question = input["input"]
    reply = await chat_model.ainvoke(question)
    answer = {"output": reply.content}
    return answer

# Evaluation function
def _output_text(outputs) -> str:
    """Return the ``"output"`` entry of an outputs mapping as text ("" if absent)."""
    value = (outputs or {}).get("output")
    return "" if value is None else str(value)


def evaluate_agent(run, example):
    """Custom evaluation function.

    Scores a run 1.0 when the reference output occurs (case-insensitively)
    inside the predicted output, else 0.0.

    Args:
        run: Object whose ``outputs`` is a mapping with an ``"output"`` entry,
            or ``None`` (treated as an empty prediction).
        example: Object whose ``outputs`` holds the reference ``"output"``.

    Returns:
        Dict with the feedback fields ``key``/``score`` plus the original
        ``accuracy``, ``predicted`` and ``expected`` entries.
    """
    predicted = _output_text(run.outputs)
    expected = _output_text(example.outputs)

    # An empty reference is a substring of every string; without this guard a
    # missing reference would score as a perfect match.
    matched = bool(expected) and expected.lower() in predicted.lower()
    accuracy = 1.0 if matched else 0.0

    return {
        "key": "accuracy",
        "score": accuracy,
        "accuracy": accuracy,
        "predicted": predicted,
        "expected": expected
    }

# Run evaluation
# test_agent is a coroutine function; the synchronous evaluate() only accepts
# sync targets, so both experiments go through aevaluate().
import asyncio

from langsmith.evaluation import LangChainStringEvaluator, aevaluate


async def _run_custom_evaluation():
    """Run the custom-evaluator experiment and print the feedback per row."""
    experiment = await aevaluate(
        test_agent,
        data=dataset_name,
        evaluators=[evaluate_agent],
        experiment_prefix="agent_test"
    )

    # View results: each row carries the run, the dataset example and the
    # evaluator feedback; evaluator return values are not the rows themselves.
    async for row in experiment:
        predicted = (row["run"].outputs or {}).get("output", "")
        expected = (row["example"].outputs or {}).get("output", "")
        for feedback in row["evaluation_results"]["results"]:
            print(f"{feedback.key}: {feedback.score}")
        print(f"Predicted: {predicted}")
        print(f"Expected: {expected}")

    return experiment


results = asyncio.run(_run_custom_evaluation())

# Using LangSmith's built-in evaluators
# Create evaluator (evaluator options are passed through `config`)
evaluator = LangChainStringEvaluator(
    "labeled_score_string",
    config={
        "criteria": {
            "helpfulness": "Is the response helpful?",
            "accuracy": "Is the response accurate?"
        }
    }
)

# Run evaluation with built-in evaluator
results = asyncio.run(
    aevaluate(
        test_agent,
        data=dataset_name,
        evaluators=[evaluator],
        experiment_prefix="agent_eval"
    )
)
python
import time
import asyncio
from typing import List, Dict
import statistics

class AgentBenchmark:
    """Measure latency and success rate of an async agent callable.

    ``results`` accumulates one record per call and is not reset between
    ``benchmark()`` invocations on the same instance.
    """

    def __init__(self, agent_func):
        # Coroutine function awaited once per test case and iteration.
        self.agent_func = agent_func
        # Per-call records appended by benchmark().
        self.results: List[Dict] = []

    @staticmethod
    def _p95(latencies: List[float]) -> float:
        """Return the 95th-percentile latency, or 0 when nothing succeeded."""
        if not latencies:
            return 0
        if len(latencies) == 1:
            # quantiles() needs at least two data points.
            return latencies[0]
        # n=20 yields 19 cut points; index 18 is the 95th percentile.
        # "inclusive" interpolates between observed samples, so the value
        # never exceeds the slowest measured call even for small samples.
        return statistics.quantiles(latencies, n=20, method="inclusive")[18]

    async def benchmark(
        self,
        test_cases: List[str],
        iterations: int = 10
    ) -> Dict:
        """Run benchmark on test cases.

        Args:
            test_cases: Prompts passed one at a time to ``agent_func``.
            iterations: Number of sequential calls per prompt.

        Returns:
            Summary dict with ``total_tests``, ``successes``, ``errors``,
            ``success_rate`` and avg/median/p95/min/max latency in seconds.
            Latency figures cover successful calls only and are 0 when no
            call succeeded.
        """
        latencies = []
        errors = 0

        for test_case in test_cases:
            for _ in range(iterations):
                # perf_counter() is monotonic, so a system clock adjustment
                # cannot produce negative or inflated latencies.
                start_time = time.perf_counter()

                try:
                    result = await self.agent_func(test_case)
                except Exception as e:
                    # A failing call is recorded, not raised, so one bad
                    # prompt does not abort the whole benchmark.
                    errors += 1
                    self.results.append({
                        "test_case": test_case,
                        "success": False,
                        "error": str(e),
                        "latency": None
                    })
                else:
                    latency = time.perf_counter() - start_time
                    latencies.append(latency)
                    self.results.append({
                        "test_case": test_case,
                        "success": True,
                        "latency": latency,
                        "result_length": len(str(result))
                    })

        successes = len(latencies)
        attempts = successes + errors

        return {
            "total_tests": attempts,
            "successes": successes,
            "errors": errors,
            "success_rate": successes / attempts if attempts > 0 else 0,
            "avg_latency": statistics.mean(latencies) if latencies else 0,
            "median_latency": statistics.median(latencies) if latencies else 0,
            "p95_latency": self._p95(latencies),
            "min_latency": min(latencies) if latencies else 0,
            "max_latency": max(latencies) if latencies else 0
        }

# Usage
async def benchmark_agent(prompt: str) -> str:
    """Send ``prompt`` to a freshly built chat model and return the reply's content."""
    reply = await ChatOpenAI(model="gpt-3.5-turbo").ainvoke(prompt)
    return reply.content

benchmark = AgentBenchmark(benchmark_agent)
# `await` is only valid inside a coroutine; at module level the benchmark
# coroutine has to be driven by an event loop.
results = asyncio.run(
    benchmark.benchmark(
        test_cases=["What is Python?", "Explain async/await"],
        iterations=5
    )
)

print(f"Success Rate: {results['success_rate']:.2%}")
print(f"Average Latency: {results['avg_latency']:.2f}s")
print(f"P95 Latency: {results['p95_latency']:.2f}s")
python
import pytest
from unittest.mock import AsyncMock, patch

@pytest.mark.asyncio
async def test_tool_execution():
    """Test that agent executes tools correctly."""
    @tool
    def test_tool(param: str) -> str:
        """Test tool."""
        return f"Result: {param}"

    # Drive the tool the way an agent would: pick it by the name in the
    # model's tool call and pass that call's arguments. Asserting on a patched
    # `invoke` without any code calling it can never succeed.
    tool_call = {"name": "test_tool", "args": {"param": "test"}, "id": "call_1"}
    assert test_tool.name == tool_call["name"]

    result = await test_tool.ainvoke(tool_call["args"])

    # Verify tool was executed with the requested arguments
    assert result == "Result: test"

# Test tool error handling
@pytest.mark.asyncio
async def test_tool_error_handling():
    """Test agent handles tool errors."""
    @tool
    def failing_tool() -> str:
        """Tool that always fails."""
        raise ValueError("Tool error")

    # Pin the tool-level contract an agent's error handling builds on: the
    # failure reaches the caller as the original exception.
    # NOTE(review): agent-level recovery is not shown in this file; extend
    # this test once the agent under test defines how it reports tool errors.
    with pytest.raises(ValueError, match="Tool error"):
        failing_tool.invoke({})
python
import pytest
from langchain_core.messages import HumanMessage

@pytest.mark.asyncio
@pytest.mark.e2e
async def test_complete_agent_workflow():
    """Check that a tool-bound live model requests one of the offered tools."""
    # Setup
    chat_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    def get_weather(location: str) -> str:
        """Get weather."""
        return f"Weather in {location}: Sunny, 72°F"

    def search_kb(query: str) -> str:
        """Search knowledge base."""
        return f"Results for: {query}"

    # Applying tool() directly is what the @tool decorator form does.
    get_weather = tool(get_weather)
    search_kb = tool(search_kb)
    bound_model = chat_model.bind_tools([get_weather, search_kb])

    # Execute workflow
    question = HumanMessage(content="What's the weather in NYC and search for Python?")
    response = await bound_model.ainvoke([question])

    # Verify response
    assert response is not None
    assert len(response.tool_calls) > 0

    # Verify tool calls: at least one offered tool must have been requested
    requested = {call["name"] for call in response.tool_calls}
    assert requested & {"get_weather", "search_kb"}

# Test multi-turn conversation
@pytest.mark.asyncio
@pytest.mark.e2e
async def test_multi_turn_conversation():
    """Test agent maintains context across turns."""
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    messages = []

    # Turn 1
    messages.append(HumanMessage(content="My name is Alice"))
    response1 = await llm.ainvoke(messages)
    messages.append(response1)

    # Turn 2
    messages.append(HumanMessage(content="What's my name?"))
    response2 = await llm.ainvoke(messages)

    # Verify agent remembers. The reply is lower-cased, so the needle must be
    # lower-case as well; "Alice" can never occur in a lower-cased string.
    assert "alice" in response2.content.lower()
python
from hypothesis import given, strategies as st
import pytest
import time
import asyncio

# Live LLM calls take far longer than Hypothesis' default 200 ms deadline and
# the default 100 examples would mean 100 paid requests, so both are capped.
@settings(max_examples=5, deadline=None)
@given(st.text(min_size=1, max_size=100))
@pytest.mark.asyncio
async def test_agent_always_responds(prompt: str):
    """Property: Agent always returns a response."""
    llm = ChatOpenAI(model="gpt-3.5-turbo")
    response = await llm.ainvoke(prompt)

    assert response is not None
    assert hasattr(response, 'content')
    assert len(response.content) > 0

# Live LLM calls take far longer than Hypothesis' default 200 ms deadline and
# the default 100 examples would mean 100 paid requests, so both are capped.
@settings(max_examples=5, deadline=None)
@given(st.text(min_size=1, max_size=50))
@pytest.mark.asyncio
async def test_agent_response_time(prompt: str):
    """Property: Agent responds within timeout."""
    llm = ChatOpenAI(model="gpt-3.5-turbo")

    # perf_counter() is monotonic, unlike the wall clock.
    start_time = time.perf_counter()
    # wait_for() raises a timeout error when the call exceeds the limit.
    response = await asyncio.wait_for(
        llm.ainvoke(prompt),
        timeout=30.0
    )
    latency = time.perf_counter() - start_time

    assert latency < 30.0
    assert response is not None

Testing Strategies

| Strategy | Use Case | Tools |
|---|---|---|
| Unit Tests | Isolated components | pytest, unittest.mock |
| Integration Tests | Component interactions | pytest, test databases |
| E2E Tests | Complete workflows | pytest, real LLMs |
| Property Tests | Invariants | hypothesis |
| Performance Tests | Latency, throughput | Custom benchmarks |
| Evaluation | Quality metrics | LangSmith |

Best Practices

  • Mock external dependencies in unit tests
  • Use fixtures for common test setup
  • Test error cases and edge cases
  • Use LangSmith for evaluation
  • Benchmark performance regularly
  • Test tool execution separately
  • Use property-based testing for invariants
  • Maintain test coverage > 80%
  • Test async code with pytest-asyncio
  • Use test databases for integration tests

Anti-Patterns

| Anti-Pattern | Fix |
|---|---|
| Testing with production LLM | Use test models or mocks |
| No error case testing | Test all error paths |
| Slow tests | Mock expensive operations |
| No test isolation | Use fixtures and teardown |
| Testing implementation details | Test behavior, not implementation |
| No performance testing | Add benchmarks |
| Ignoring flaky tests | Fix or remove flaky tests |
| No test data management | Use fixtures and factories |
| Synchronous async tests | Use pytest-asyncio |
| No evaluation metrics | Use LangSmith evaluation |

Related

  • Knowledge: knowledge/agent-testing-patterns.json
  • Skill: langsmith-tracing
  • Skill: langsmith-prompts
  • Skill: error-handling

Prerequisites

[!IMPORTANT] Requirements:

  • Packages: pytest, pytest-asyncio, pytest-mock, hypothesis, langchain-core, langchain-openai, langsmith
  • Knowledge: agent-testing-patterns.json