Reward Configure
Set up and configure reward models for scoring LLM outputs.
AutoRM: Universal Loader
python
from reward_hub import AutoRM
# Load any supported reward model
rm = AutoRM.load("Qwen/Qwen2.5-Math-PRM-7B", load_method="vllm")
# Score responses
scores = rm.score(messages)
Reward Model Types
Outcome Reward Models (ORM)
Score complete responses.
python
# Load ORM
orm = AutoRM.load("internlm/internlm2-7b-reward", load_method="vllm")
# Score a response
messages = [
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "2+2 equals 4."},
]
score = orm.score(messages)
print(score) # e.g., 0.85
Process Reward Models (PRM)
Score each step in a reasoning chain.
python
# Load PRM
prm = AutoRM.load("Qwen/Qwen2.5-Math-PRM-7B", load_method="vllm")
# Score step by step
response = """Let me solve this step by step.
Step 1: First, I'll add 2+2.
Step 2: 2+2 = 4.
Therefore, the answer is 4."""
step_scores = prm.score_steps(
prompt="What is 2+2?",
response=response,
step_delimiter="\n",
)
print(step_scores) # [0.9, 0.95, 0.98]
LLM-as-Judge
Use an LLM to score responses.
python
from reward_hub import LLMJudge
judge = LLMJudge(
model="gpt-4",
api_base="https://api.openai.com/v1",
api_key="sk-...",
rubric="""
Score the response from 1-5 based on:
- Correctness (weight: 0.5)
- Clarity (weight: 0.3)
- Completeness (weight: 0.2)
""",
)
score = judge.score(prompt="Explain gravity", response="...")
DrSow (Data-efficient Reward via Self-Optimized Weak-to-strong)
Efficient reward modeling with minimal data.
python
from reward_hub import DrSow
rm = DrSow(
base_model="meta-llama/Llama-3.1-8B",
preference_data="./preferences.jsonl",
)
rm.train()
score = rm.score(messages)
Loading Methods
vLLM (recommended for speed)
python
rm = AutoRM.load(
"Qwen/Qwen2.5-Math-PRM-7B",
load_method="vllm",
tensor_parallel_size=2, # Multi-GPU
gpu_memory_utilization=0.8,
)
HuggingFace Transformers
python
rm = AutoRM.load(
"internlm/internlm2-7b-reward",
load_method="transformers",
device="cuda",
torch_dtype="bfloat16",
)
API endpoint
python
rm = AutoRM.load(
"custom-rm",
load_method="api",
api_base="http://localhost:8000/v1",
)
Batch Scoring
python
# Score multiple responses efficiently
batch_messages = [
[{"role": "user", "content": "Q1"}, {"role": "assistant", "content": "A1"}],
[{"role": "user", "content": "Q2"}, {"role": "assistant", "content": "A2"}],
[{"role": "user", "content": "Q3"}, {"role": "assistant", "content": "A3"}],
]
scores = rm.score_batch(batch_messages, batch_size=32)
print(scores) # [0.8, 0.6, 0.9]
Popular Models
Math Reasoning
| Model | Type | Notes |
|---|---|---|
| Qwen/Qwen2.5-Math-PRM-7B | PRM | Excellent for math step scoring |
| Qwen/Qwen2.5-Math-RM-72B | ORM | High-quality math outcome scoring |
General
| Model | Type | Notes |
|---|---|---|
| internlm/internlm2-7b-reward | ORM | General-purpose reward |
| sfairXC/FsfairX-LLaMA3-RM-v0.1 | ORM | Good for instruction following |
Code
| Model | Type | Notes |
|---|---|---|
| Qwen/Qwen2.5-Coder-7B-Instruct + judge | LLM-as-Judge | Code quality evaluation |
Custom Reward Functions
python
from reward_hub import BaseRM
class CustomRM(BaseRM):
    """Custom reward model implementation.

    Scores a conversation with simple text heuristics applied to the
    ``content`` of the last message. Extend ``score`` with additional
    criteria as needed.
    """

    def __init__(self, **config):
        """Keep arbitrary keyword configuration on the instance.

        Args:
            **config: Free-form settings, stored as ``self.config``.
        """
        super().__init__()
        self.config = config

    def score(self, messages: list[dict]) -> float:
        """Return a heuristic score for the last message in ``messages``.

        Args:
            messages: Chat-style list of dicts; the last entry must have a
                ``"content"`` string (presumably the assistant response).

        Returns:
            A score capped at 1.0. An empty ``messages`` list scores 0.0.
        """
        # Nothing to evaluate: return the baseline instead of raising
        # IndexError on messages[-1].
        if not messages:
            return 0.0

        response = messages[-1]["content"]

        # Custom scoring logic
        score = 0.0
        if "step by step" in response.lower():
            score += 0.2
        if len(response) > 100:
            score += 0.3
        # Add more criteria...

        # Cap the total so added criteria cannot push it above 1.0.
        return min(score, 1.0)

    def score_batch(self, batch: list, **kwargs) -> list[float]:
        """Score each conversation in ``batch`` with ``score``.

        Args:
            batch: List of message lists, one per conversation.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            One score per conversation, in input order.
        """
        return [self.score(msgs) for msgs in batch]
Combining Reward Models
python
from reward_hub import EnsembleRM
# Combine multiple RMs
ensemble = EnsembleRM([
(AutoRM.load("model1"), 0.5), # (model, weight)
(AutoRM.load("model2"), 0.3),
(custom_rm, 0.2),
])
score = ensemble.score(messages)
Integration with ITS
python
from its_hub.algorithms import BestOfN
from its_hub.integration.reward_hub import RewardHubORM
from reward_hub import AutoRM
# Load reward model
rm = AutoRM.load("internlm/internlm2-7b-reward", load_method="vllm")
# Wrap for ITS compatibility
orm = RewardHubORM(rm)
# Use with Best-of-N
alg = BestOfN(orm=orm)
result = alg.infer(lm, prompt, budget=8)
Caching & Performance
python
from reward_hub import AutoRM, CachedRM
# Wrap with caching for repeated queries
rm = AutoRM.load("Qwen/Qwen2.5-Math-PRM-7B", load_method="vllm")
cached_rm = CachedRM(rm, cache_dir="./rm_cache")
# Same query returns cached result
score1 = cached_rm.score(messages) # Computed
score2 = cached_rm.score(messages) # Cached
Debugging
python
# Enable detailed logging
import logging
logging.getLogger("reward_hub").setLevel(logging.DEBUG)
# Get detailed score breakdown
score, details = rm.score(messages, return_details=True)
print(details)
# {'raw_logits': [...], 'normalized_score': 0.85, 'tokens_scored': 150}
Related Skills
- /its-setup - Configure inference-time scaling
- /pipeline-design - Design end-to-end pipelines