SDG Create Block
Scaffold custom synthetic data generation blocks for sdg_hub.
BaseBlock Pattern
All blocks inherit from BaseBlock and implement the generate method:
python
from typing import Any
from datasets import Dataset
from sdg_hub.core.blocks.base import BaseBlock
class MyCustomBlock(BaseBlock):
    """Example block that derives a new column from an existing one."""

    # Configuration fields (populated from the flow YAML or the constructor).
    my_param: str = "default_value"
    another_param: int = 10

    def generate(self, dataset: Dataset, **kwargs: Any) -> Dataset:
        """Add a ``new_column`` value to every row of ``dataset``.

        Args:
            dataset: Input HuggingFace Dataset; each row is read through
                its ``"input"`` key.
            **kwargs: Additional runtime arguments (not used here).

        Returns:
            The Dataset produced by mapping the row helper over ``dataset``.
        """

        def add_prefixed_column(row):
            # Swap this line for your own transformation logic.
            row["new_column"] = f"{self.my_param}: {row['input']}"
            return row

        return dataset.map(add_prefixed_column)
Block Types
LLMBlock (for model calls)
python
from sdg_hub.core.blocks.llm import LLMBlock
class MyLLMBlock(LLMBlock):
    """Example block that sends each row's prompt to an LLM."""

    system_prompt: str = "You are a helpful assistant."
    temperature: float = 0.7

    def generate(self, dataset: Dataset, **kwargs: Any) -> Dataset:
        """Store the result of ``self.call_llm`` in a ``response`` column.

        Args:
            dataset: Input Dataset; each row is read through its
                ``"prompt"`` key.
            **kwargs: Additional runtime arguments (not used here).

        Returns:
            The Dataset produced by mapping the row helper over ``dataset``.
        """

        def ask_model(row):
            # System prompt first, then the row's prompt as the user turn.
            chat = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": row["prompt"]},
            ]
            row["response"] = self.call_llm(
                messages=chat,
                temperature=self.temperature,
            )
            return row

        return dataset.map(ask_model)
FilterBlock (for data filtering)
python
from sdg_hub.core.blocks.base import BaseBlock
class QualityFilterBlock(BaseBlock):
    """Filter rows based on the length of their ``text`` value."""

    # Inclusive bounds on ``len(text)`` for a row to be kept.
    min_length: int = 100
    max_length: int = 10000

    def generate(self, dataset: Dataset, **kwargs: Any) -> Dataset:
        """Keep only rows whose ``text`` length is within the configured bounds.

        Args:
            dataset: Input Dataset; each row is read through its
                ``"text"`` key.
            **kwargs: Additional runtime arguments (not used here).

        Returns:
            The Dataset produced by ``dataset.filter`` with the length check.
        """

        def filter_fn(example):
            # ``.get`` only applies its default when the key is absent; a
            # present-but-null value comes back as None and ``len(None)``
            # raises TypeError. Treat a missing or null text as empty.
            text = example.get("text") or ""
            return self.min_length <= len(text) <= self.max_length

        return dataset.filter(filter_fn)
TransformBlock (for data transformation)
python
from sdg_hub.core.blocks.base import BaseBlock
class FormatConverterBlock(BaseBlock):
    """Convert rows from one conversation data format to another."""

    input_format: str = "alpaca"
    output_format: str = "sharegpt"

    def generate(self, dataset: Dataset, **kwargs: Any) -> Dataset:
        """Dispatch to the converter for the configured format pair.

        Args:
            dataset: Input Dataset in ``input_format``.
            **kwargs: Additional runtime arguments (not used here).

        Returns:
            The converted Dataset.

        Raises:
            ValueError: If the format pair is anything other than
                alpaca -> sharegpt.
        """
        requested = (self.input_format, self.output_format)
        if requested != ("alpaca", "sharegpt"):
            raise ValueError(f"Unsupported conversion: {self.input_format} -> {self.output_format}")
        return self._alpaca_to_sharegpt(dataset)

    def _alpaca_to_sharegpt(self, dataset: Dataset) -> Dataset:
        """Map each row's ``instruction``/``output`` pair to a ``conversations`` list."""

        def to_sharegpt(row):
            # One human turn followed by one model turn.
            turns = [
                {"from": "human", "value": row["instruction"]},
                {"from": "gpt", "value": row["output"]},
            ]
            return {"conversations": turns}

        return dataset.map(to_sharegpt)
YAML Registration
Register your block in a flow YAML:
yaml
blocks:
  - block_type: my_custom_block
    block_config:
      my_param: "custom_value"
      another_param: 20
Block Configuration
Using Pydantic validation
python
from pydantic import Field, field_validator
from sdg_hub.core.blocks.base import BaseBlock
class ValidatedBlock(BaseBlock):
    """Example block whose fields are constrained with pydantic ``Field``."""

    # Bounds are declared on the fields themselves.
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    max_tokens: int = Field(default=1024, gt=0)

    @field_validator("temperature")
    @classmethod
    def validate_temperature(cls, v):
        """Warn when the temperature is above 1.5, then return it unchanged."""
        if v > 1.5:
            import warnings

            warnings.warn("High temperature may produce unstable outputs")
        # The value is returned whether or not the warning fired.
        return v
Testing Your Block
python
from datasets import Dataset
# Build a small in-memory dataset to exercise the block.
sample_columns = {
    "input": ["test input 1", "test input 2"],
    "label": ["label1", "label2"],
}
test_data = Dataset.from_dict(sample_columns)

# Construct the block and run it over the sample data.
block = MyCustomBlock(my_param="test")
result = block.generate(test_data)

# Confirm the block added its output column, then show the first row.
assert "new_column" in result.column_names
print(result[0])
Best Practices
- Idempotency: Blocks should produce the same output for the same input
- Column preservation: Don't remove columns unless explicitly intended
- Error handling: Catch and log errors, optionally filter failed rows
- Batching: Use `dataset.map(fn, batched=True)` for efficiency
- Progress: Use `tqdm` for long-running operations
Related Skills
- `/sdg-discover-flows` - Find existing flows
- `/sdg-run-flow` - Execute flows