AgentSkillsCN

LangSmith Prompts

通过 LangSmith Hub 进行提示管理——版本控制、测试与评估

SKILL.md
--- frontmatter
description: Prompt management with LangSmith Hub - versioning, testing, and evaluation

LangSmith Prompts

Prompt management with LangSmith Hub - versioning, testing, and evaluation

LangSmith Prompt Management Skill

Manage prompts professionally with versioning, A/B testing, and evaluation using LangSmith Hub.

LangSmith Prompt Management Skill

Manage prompts professionally with versioning, A/B testing, and evaluation using LangSmith Hub.

Process

Step 1: Push Prompts to Hub

python
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate

# Create prompt: a system message plus a human message. {role},
# {instructions} and {input} are template variables that must be supplied
# when the prompt is invoked.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a {role}. {instructions}"),
    ("human", "{input}")
])

# Push to Hub (creates versioned prompt) under the "my-org/agent-prompt" name.
# NOTE(review): new_repo_is_public=False presumably keeps a newly created
# repo private -- confirm against the hub.push documentation.
hub.push("my-org/agent-prompt", prompt, new_repo_is_public=False)

Step 2: Pull Prompts from Hub

python
from langchain import hub

# Pull latest version (no version suffix after the prompt name)
prompt = hub.pull("my-org/agent-prompt")

# Pull specific version by appending ":<version>" to the prompt name
prompt_v2 = hub.pull("my-org/agent-prompt:v2")

# Use in chain: pipe the pulled prompt into a model and an output parser.
# NOTE: `llm` and `parser` are not defined anywhere in this snippet; they
# must be created before this line runs.
chain = prompt | llm | parser

Step 3: Prompt Versioning Strategy

python
# Development workflow
def update_prompt(prompt_name: str, new_prompt: ChatPromptTemplate):
    """Push ``new_prompt`` to the Hub as ``prompt_name``, printing what it replaces.

    Args:
        prompt_name: Hub identifier that is pulled from and pushed to.
        new_prompt: Prompt template to push.

    Returns:
        Whatever ``hub.push`` returns, so callers can inspect the result.
    """
    # Pull current to compare. Only the pull sits inside the ``try`` so that a
    # failure while printing is not mistaken for a missing prompt.
    try:
        current = hub.pull(prompt_name)
    except Exception as exc:
        # A failed pull does not necessarily mean the prompt is absent (it
        # could be an auth or network problem), so surface the actual cause.
        print(f"Could not pull {prompt_name} ({exc!r}); creating new prompt")
    else:
        print(f"Current: {current}")

    # Push new version
    pushed = hub.push(prompt_name, new_prompt)
    print(f"Pushed new version of {prompt_name}")
    return pushed

Step 4: A/B Testing Prompts

python
import random
from langsmith import traceable

@traceable(tags=["ab-test"])
async def run_with_ab_test(input_data: dict, test_name: str = "prompt_v1_vs_v2"):
    """Pick a prompt variant at random, run the chain with it, and report which ran.

    Args:
        input_data: Passed unchanged to the chain's ``ainvoke``.
        test_name: Label for the experiment; not read inside the function body.

    Returns:
        Dict with ``"result"`` (the chain output) and ``"variant"``
        (``"control"`` or ``"treatment"``).
    """
    # Hub reference used for each arm of the experiment
    prompt_refs = {
        "control": "my-org/agent-prompt:v1",
        "treatment": "my-org/agent-prompt:v2",
    }

    # Choose the arm, then fetch the matching prompt
    variant = random.choice(["control", "treatment"])
    selected_prompt = hub.pull(prompt_refs[variant])

    pipeline = selected_prompt | llm | parser
    output = await pipeline.ainvoke(input_data)

    # The variant is returned alongside the result so it can be analysed later
    return {"result": output, "variant": variant}

Step 5: Prompt Evaluation

python
from langsmith import Client
from langsmith.evaluation import evaluate

# Client is constructed with no arguments here.
# NOTE(review): presumably it reads the LangSmith API key/endpoint from the
# environment -- confirm against the langsmith Client documentation.
client = Client()

# Create evaluation dataset named "prompt-eval"
dataset = client.create_dataset("prompt-eval")

# Add examples: `outputs["expected"]` is the reference answer that
# relevance_evaluator reads.
# NOTE(review): the prompt pushed in Step 1 also declares {role} and
# {instructions}; if the evaluated chain is built from that prompt, the
# example inputs presumably need those keys as well -- verify.
client.create_example(
    dataset_id=dataset.id,
    inputs={"input": "What is Python?"},
    outputs={"expected": "Python is a programming language..."}
)

# Define evaluator
def relevance_evaluator(run, example):
    """Score how much of the expected answer's vocabulary the output covers.

    Args:
        run: Object whose ``outputs`` mapping holds the generated text under
            the ``"output"`` key.
        example: Object whose ``outputs`` mapping holds the reference text
            under the ``"expected"`` key.

    Returns:
        ``{"score": float}`` with a value in ``[0.0, 1.0]``: the fraction of
        distinct expected words that also occur in the output. The score is
        ``0.0`` when either text is empty or missing.
    """
    # Guard against a missing outputs mapping or a missing/None value
    output = (run.outputs or {}).get("output") or ""
    expected = (example.outputs or {}).get("expected") or ""

    # Compare case-insensitively so "Python" and "python" count as a match
    expected_words = set(str(expected).casefold().split())
    if not expected_words:
        return {"score": 0.0}

    # Simple keyword overlap (use LLM for better eval). Normalising by the
    # reference size lets a full match reach 1.0 regardless of its length.
    output_words = set(str(output).casefold().split())
    overlap = len(output_words & expected_words)
    return {"score": overlap / len(expected_words)}

# Run evaluation against the "prompt-eval" dataset, scoring with
# relevance_evaluator.
# NOTE(review): presumably each example's inputs dict is passed to the lambda
# as `x` -- confirm against the langsmith evaluate documentation.
# NOTE: `chain` is not defined in this snippet; it must already exist
# (e.g. the chain built after pulling the prompt).
results = evaluate(
    lambda x: chain.invoke(x),
    data="prompt-eval",
    evaluators=[relevance_evaluator]
)

Step 6: Prompt Templates with Variables

python
import asyncio

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Complex prompt with all features: a multi-line system message with four
# template variables, an optional chat-history slot, and the human input.
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a {role} assistant.

Context: {context}

Instructions:
{instructions}

Output Format: {output_format}
"""),
    # optional=True, so callers may omit "history" (as the call below does)
    MessagesPlaceholder("history", optional=True),
    ("human", "{input}")
])

# Partial application for reuse: role and output_format are fixed here, so
# callers only supply context, instructions and input.
analyst_prompt = prompt.partial(
    role="data analyst",
    output_format="JSON with 'analysis' and 'confidence' keys"
)


async def main():
    """Invoke the analyst prompt piped into ``llm`` once and return the result.

    NOTE: ``llm`` is not defined in this snippet; it must exist before
    ``main`` runs.
    """
    return await (analyst_prompt | llm).ainvoke({
        "context": "Sales data Q4 2024",
        "instructions": "Analyze trends and anomalies",
        "input": "What are the key insights?"
    })


# Use: `await` is only valid inside a coroutine, so drive it with asyncio.run
# instead of awaiting at module level.
result = asyncio.run(main())
python
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate

# Create prompt: a system message plus a human message. {role},
# {instructions} and {input} are template variables that must be supplied
# when the prompt is invoked.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a {role}. {instructions}"),
    ("human", "{input}")
])

# Push to Hub (creates versioned prompt) under the "my-org/agent-prompt" name.
# NOTE(review): new_repo_is_public=False presumably keeps a newly created
# repo private -- confirm against the hub.push documentation.
hub.push("my-org/agent-prompt", prompt, new_repo_is_public=False)
python
from langchain import hub

# Pull latest version (no version suffix after the prompt name)
prompt = hub.pull("my-org/agent-prompt")

# Pull specific version by appending ":<version>" to the prompt name
prompt_v2 = hub.pull("my-org/agent-prompt:v2")

# Use in chain: pipe the pulled prompt into a model and an output parser.
# NOTE: `llm` and `parser` are not defined anywhere in this snippet; they
# must be created before this line runs.
chain = prompt | llm | parser
python
# Development workflow
def update_prompt(prompt_name: str, new_prompt: ChatPromptTemplate):
    """Push ``new_prompt`` to the Hub as ``prompt_name``, printing what it replaces.

    Args:
        prompt_name: Hub identifier that is pulled from and pushed to.
        new_prompt: Prompt template to push.

    Returns:
        Whatever ``hub.push`` returns, so callers can inspect the result.
    """
    # Pull current to compare. Only the pull sits inside the ``try`` so that a
    # failure while printing is not mistaken for a missing prompt.
    try:
        current = hub.pull(prompt_name)
    except Exception as exc:
        # A failed pull does not necessarily mean the prompt is absent (it
        # could be an auth or network problem), so surface the actual cause.
        print(f"Could not pull {prompt_name} ({exc!r}); creating new prompt")
    else:
        print(f"Current: {current}")

    # Push new version
    pushed = hub.push(prompt_name, new_prompt)
    print(f"Pushed new version of {prompt_name}")
    return pushed
python
import random
from langsmith import traceable

@traceable(tags=["ab-test"])
async def run_with_ab_test(input_data: dict, test_name: str = "prompt_v1_vs_v2"):
    """Pick a prompt variant at random, run the chain with it, and report which ran.

    Args:
        input_data: Passed unchanged to the chain's ``ainvoke``.
        test_name: Label for the experiment; not read inside the function body.

    Returns:
        Dict with ``"result"`` (the chain output) and ``"variant"``
        (``"control"`` or ``"treatment"``).
    """
    # Hub reference used for each arm of the experiment
    prompt_refs = {
        "control": "my-org/agent-prompt:v1",
        "treatment": "my-org/agent-prompt:v2",
    }

    # Choose the arm, then fetch the matching prompt
    variant = random.choice(["control", "treatment"])
    selected_prompt = hub.pull(prompt_refs[variant])

    pipeline = selected_prompt | llm | parser
    output = await pipeline.ainvoke(input_data)

    # The variant is returned alongside the result so it can be analysed later
    return {"result": output, "variant": variant}
python
from langsmith import Client
from langsmith.evaluation import evaluate

# Client is constructed with no arguments here.
# NOTE(review): presumably it reads the LangSmith API key/endpoint from the
# environment -- confirm against the langsmith Client documentation.
client = Client()

# Create evaluation dataset named "prompt-eval"
dataset = client.create_dataset("prompt-eval")

# Add examples: `outputs["expected"]` is the reference answer that
# relevance_evaluator reads.
# NOTE(review): the prompt pushed in Step 1 also declares {role} and
# {instructions}; if the evaluated chain is built from that prompt, the
# example inputs presumably need those keys as well -- verify.
client.create_example(
    dataset_id=dataset.id,
    inputs={"input": "What is Python?"},
    outputs={"expected": "Python is a programming language..."}
)

# Define evaluator
def relevance_evaluator(run, example):
    """Score how much of the expected answer's vocabulary the output covers.

    Args:
        run: Object whose ``outputs`` mapping holds the generated text under
            the ``"output"`` key.
        example: Object whose ``outputs`` mapping holds the reference text
            under the ``"expected"`` key.

    Returns:
        ``{"score": float}`` with a value in ``[0.0, 1.0]``: the fraction of
        distinct expected words that also occur in the output. The score is
        ``0.0`` when either text is empty or missing.
    """
    # Guard against a missing outputs mapping or a missing/None value
    output = (run.outputs or {}).get("output") or ""
    expected = (example.outputs or {}).get("expected") or ""

    # Compare case-insensitively so "Python" and "python" count as a match
    expected_words = set(str(expected).casefold().split())
    if not expected_words:
        return {"score": 0.0}

    # Simple keyword overlap (use LLM for better eval). Normalising by the
    # reference size lets a full match reach 1.0 regardless of its length.
    output_words = set(str(output).casefold().split())
    overlap = len(output_words & expected_words)
    return {"score": overlap / len(expected_words)}

# Run evaluation against the "prompt-eval" dataset, scoring with
# relevance_evaluator.
# NOTE(review): presumably each example's inputs dict is passed to the lambda
# as `x` -- confirm against the langsmith evaluate documentation.
# NOTE: `chain` is not defined in this snippet; it must already exist
# (e.g. the chain built after pulling the prompt).
results = evaluate(
    lambda x: chain.invoke(x),
    data="prompt-eval",
    evaluators=[relevance_evaluator]
)
python
import asyncio

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Complex prompt with all features: a multi-line system message with four
# template variables, an optional chat-history slot, and the human input.
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a {role} assistant.

Context: {context}

Instructions:
{instructions}

Output Format: {output_format}
"""),
    # optional=True, so callers may omit "history" (as the call below does)
    MessagesPlaceholder("history", optional=True),
    ("human", "{input}")
])

# Partial application for reuse: role and output_format are fixed here, so
# callers only supply context, instructions and input.
analyst_prompt = prompt.partial(
    role="data analyst",
    output_format="JSON with 'analysis' and 'confidence' keys"
)


async def main():
    """Invoke the analyst prompt piped into ``llm`` once and return the result.

    NOTE: ``llm`` is not defined in this snippet; it must exist before
    ``main`` runs.
    """
    return await (analyst_prompt | llm).ainvoke({
        "context": "Sales data Q4 2024",
        "instructions": "Analyze trends and anomalies",
        "input": "What are the key insights?"
    })


# Use: `await` is only valid inside a coroutine, so drive it with asyncio.run
# instead of awaiting at module level.
result = asyncio.run(main())

Prompt Organization

code
prompts/
├── agents/
│   ├── analyst-v1
│   ├── analyst-v2
│   └── researcher
├── tools/
│   ├── sql-generator
│   └── api-caller
└── evaluation/
    ├── relevance-judge
    └── quality-judge
code
prompts/
├── agents/
│   ├── analyst-v1
│   ├── analyst-v2
│   └── researcher
├── tools/
│   ├── sql-generator
│   └── api-caller
└── evaluation/
    ├── relevance-judge
    └── quality-judge

Best Practices

  • Use explicit, incrementing version labels for prompts (v1, v2, etc.)
  • Include clear descriptions when pushing
  • Test prompts before deploying to production
  • Use evaluation datasets for regression testing
  • Tag prompts by use case and domain
  • Keep production and dev prompts separate

Anti-Patterns

| Anti-Pattern | Fix |
| --- | --- |
| Hardcoded prompts | Use Hub for all prompts |
| No versioning | Always version prompts |
| No testing | Create evaluation datasets |
| Shared dev/prod | Separate environments |

Related

  • Knowledge: knowledge/langsmith-prompts-patterns.json
  • Skill: langsmith-tracing
  • Skill: agent-testing

Prerequisites

> [!IMPORTANT]
> Requirements:

  • Packages: langsmith, langchain-core, langchain (provides the `langchain.hub` module imported in the examples)
  • Knowledge: langsmith-prompts-patterns.json