Hugging Face Ecosystem
Expert guidance for the full Hugging Face ecosystem: Transformers, Datasets, Tokenizers, Accelerate, PEFT, Evaluate, TGI, Spaces, Hub API, Safetensors, and Optimum. Covers model loading, training, evaluation, optimization, and deployment.
Transformers Library
AutoModel and AutoTokenizer
python
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
# Load model and tokenizer (auto-detects architecture)
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto", # Automatic device placement
attn_implementation="flash_attention_2", # Flash Attention for speed
)
# Inspect config without downloading weights
config = AutoConfig.from_pretrained(model_name)
print(f"Hidden size: {config.hidden_size}")
print(f"Num layers: {config.num_hidden_layers}")
print(f"Vocab size: {config.vocab_size}")
AutoModel Classes by Task
| Task | Class | Example Use |
|---|---|---|
| Text generation | AutoModelForCausalLM | GPT, Llama, Mistral |
| Fill mask | AutoModelForMaskedLM | BERT, RoBERTa |
| Classification | AutoModelForSequenceClassification | Sentiment, topic |
| Token classification | AutoModelForTokenClassification | NER, POS tagging |
| Question answering | AutoModelForQuestionAnswering | Extractive QA |
| Seq2Seq | AutoModelForSeq2SeqLM | T5, BART |
| Image classification | AutoModelForImageClassification | ViT, ResNet |
| Object detection | AutoModelForObjectDetection | DETR, YOLOS |
| Speech recognition | AutoModelForSpeechSeq2Seq | Whisper |
| Vision-language | AutoModelForVision2Seq | LLaVA, PaliGemma |
Pipeline API (Quick Inference)
python
from transformers import pipeline
# Text generation
generator = pipeline(
"text-generation",
model="meta-llama/Llama-3.1-8B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto",
)
messages = [
{"role": "system", "content": "You are a helpful coding assistant."},
{"role": "user", "content": "Write a Python function to calculate fibonacci numbers."},
]
output = generator(
messages,
max_new_tokens=512,
temperature=0.7,
do_sample=True,
top_p=0.9,
)
print(output[0]["generated_text"][-1]["content"])
# Sentiment analysis
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
result = classifier("This product is amazing!")
# [{'label': 'POSITIVE', 'score': 0.9998}]
# Named entity recognition
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
entities = ner("Apple Inc. is headquartered in Cupertino, California.")
# [{'entity_group': 'ORG', 'word': 'Apple Inc.', 'score': 0.99},
# {'entity_group': 'LOC', 'word': 'Cupertino, California', 'score': 0.98}]
# Zero-shot classification (no fine-tuning needed)
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = zero_shot(
"I need to return this broken laptop",
candidate_labels=["refund", "technical support", "billing", "shipping"],
)
# {'labels': ['refund', 'shipping', ...], 'scores': [0.82, 0.09, ...]}
# Image classification
img_classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
result = img_classifier("photo.jpg")
# Speech-to-text
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
result = transcriber("audio.mp3", return_timestamps=True)
Generation Configuration
python
from transformers import GenerationConfig
# Sampling configuration shared by the generate() call below.
# Tokenizers for decoder-only models often define no pad token; fall back to
# EOS so generate() always receives a valid pad_token_id.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

gen_config = GenerationConfig(
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
    do_sample=True,
    num_beams=1,  # 1 = greedy/sampling, >1 = beam search
    # early_stopping is a beam-search-only flag; add it back only together
    # with num_beams > 1, otherwise transformers warns that it is unused.
    pad_token_id=pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Apply chat template and generate.
# return_dict=True yields both input_ids and attention_mask, so generate()
# does not have to guess which positions are padding.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, generation_config=gen_config)
# Strip the prompt tokens so only the newly generated reply is decoded.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
Model Hub
Finding Models
python
from huggingface_hub import HfApi
api = HfApi()
# Search models by task and library
models = api.list_models(
filter="text-generation",
library="transformers",
sort="downloads",
direction=-1,
limit=10,
)
for model in models:
print(f"{model.id}: {model.downloads:,} downloads")
# Search by specific criteria
models = api.list_models(
search="llama 8b instruct",
sort="likes",
direction=-1,
)
Uploading Models
python
from huggingface_hub import HfApi, create_repo
api = HfApi()
# Create repository
create_repo("my-org/my-model", private=True, repo_type="model")
# Upload entire directory
api.upload_folder(
folder_path="./my-model",
repo_id="my-org/my-model",
commit_message="Upload fine-tuned model",
ignore_patterns=["*.pyc", "__pycache__", "*.log"],
)
# Or use the model's save method
model.push_to_hub("my-org/my-model", private=True)
tokenizer.push_to_hub("my-org/my-model")
Datasets Library
Loading Datasets
python
from datasets import load_dataset, DatasetDict, Dataset
# From Hub
dataset = load_dataset("squad")
dataset = load_dataset("imdb", split="train")
dataset = load_dataset("json", data_files="data.jsonl")
dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv"})
# Streaming (for large datasets, no download required)
dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
for example in dataset.take(10):
print(example["text"][:100])
# From pandas
import pandas as pd
df = pd.read_csv("data.csv")
dataset = Dataset.from_pandas(df)
# From dict
dataset = Dataset.from_dict({
"text": ["Hello world", "How are you"],
"label": [0, 1],
})
Processing Datasets
python
# Map (apply function to each example)
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch.

    Every sequence is padded or truncated to exactly 512 tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)
tokenized = dataset.map(
tokenize_function,
batched=True, # Process in batches for speed
batch_size=1000,
num_proc=4, # Parallel processing
remove_columns=["text"],
)
# Filter
filtered = dataset.filter(lambda x: len(x["text"]) > 100)
filtered = dataset.filter(lambda x: x["label"] in [0, 1], num_proc=4)
# Select/shuffle/split
shuffled = dataset.shuffle(seed=42)
subset = dataset.select(range(1000))
split = dataset.train_test_split(test_size=0.1, seed=42)
# Sort
sorted_ds = dataset.sort("length", reverse=True)
# Rename and remove columns
dataset = dataset.rename_column("old_name", "new_name")
dataset = dataset.remove_columns(["unnecessary_col1", "unnecessary_col2"])
# Add new column
dataset = dataset.add_column("idx", list(range(len(dataset))))
Creating Custom Datasets
python
from datasets import DatasetDict, Dataset, Features, Value, ClassLabel
# Define schema
features = Features({
"text": Value("string"),
"label": ClassLabel(names=["negative", "neutral", "positive"]),
"score": Value("float32"),
})
# Create with schema
dataset = Dataset.from_dict(
{"text": ["great!", "ok", "terrible"], "label": [2, 1, 0], "score": [0.95, 0.5, 0.1]},
features=features,
)
# Upload to Hub
dataset.push_to_hub("my-org/my-dataset", private=True)
Tokenizers
Using Tokenizers
python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
# Basic tokenization
tokens = tokenizer("Hello, world!")
print(tokens)
# {'input_ids': [128000, 9906, 11, 1917, 0], 'attention_mask': [1, 1, 1, 1, 1]}
# Decode back
text = tokenizer.decode(tokens["input_ids"], skip_special_tokens=True)
# Batch tokenization with padding/truncation
batch = tokenizer(
["Short text", "This is a much longer piece of text that needs truncation"],
padding=True,
truncation=True,
max_length=128,
return_tensors="pt",
)
# Special tokens
print(f"BOS: {tokenizer.bos_token} ({tokenizer.bos_token_id})")
print(f"EOS: {tokenizer.eos_token} ({tokenizer.eos_token_id})")
print(f"PAD: {tokenizer.pad_token} ({tokenizer.pad_token_id})")
# Set pad token if missing (common for decoder-only models)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
Training Custom Tokenizers
python
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers

# BPE tokenizer from scratch
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# Pair the byte-level pre-tokenizer with its decoder so decode() turns the
# byte-level symbols (e.g. "Ġ" for a leading space) back into plain text.
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=32000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    min_frequency=2,
    show_progress=True,
    # Seed the vocabulary with every byte symbol so bytes absent from the
    # training corpus can still be encoded.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
# Train on files
tokenizer.train(files=["corpus.txt"], trainer=trainer)
# Or extend an existing tokenizer's vocabulary with new tokens (no retraining)
from transformers import AutoTokenizer
base_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
new_tokens = ["<custom_tag>", "<end_custom>", "[DOMAIN_TERM]"]
num_added = base_tokenizer.add_tokens(new_tokens)
print(f"Added {num_added} tokens")
# Resize model embeddings to match
model.resize_token_embeddings(len(base_tokenizer))
Accelerate (Multi-GPU Training)
Basic Usage
python
from accelerate import Accelerator
accelerator = Accelerator(
mixed_precision="bf16",
gradient_accumulation_steps=4,
log_with="wandb",
)
# Prepare model, optimizer, and dataloader
model, optimizer, train_dataloader = accelerator.prepare(
model, optimizer, train_dataloader
)
# Start the trackers selected with `log_with`; accelerator.log() only writes
# to trackers that have been initialised.
accelerator.init_trackers("my-project")

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # accumulate() applies the gradient_accumulation_steps setting given
        # to the Accelerator, so step()/zero_grad() can be called every batch.
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
        accelerator.log({"train_loss": loss.item()})

# Save model (handles distributed state)
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    "output",
    is_main_process=accelerator.is_main_process,  # write files from one process only
    save_function=accelerator.save,
    state_dict=accelerator.get_state_dict(model),
)

# Close the trackers last, after everything has been logged and saved.
accelerator.end_training()
Accelerate Config
bash
# Interactive configuration
accelerate config
# Launch training
accelerate launch --num_processes 4 train.py
# Or with config file
accelerate launch --config_file accelerate_config.yaml train.py
yaml
# accelerate_config.yaml
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
num_processes: 4
mixed_precision: bf16
use_cpu: false
gpu_ids: 0,1,2,3
Big Model Inference
python
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Load model across multiple devices (model parallelism)
with init_empty_weights():
model = AutoModelForCausalLM.from_config(config)
model = load_checkpoint_and_dispatch(
model,
checkpoint="./model-weights",
device_map="auto",
no_split_module_classes=["LlamaDecoderLayer"],
max_memory={0: "20GiB", 1: "20GiB", "cpu": "30GiB"},
)
PEFT (Parameter-Efficient Fine-Tuning)
Adapter Types
python
from peft import (
LoraConfig,
PrefixTuningConfig,
PromptTuningConfig,
IA3Config,
get_peft_model,
)
# LoRA (most popular, see fine-tuning skill for details)
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules="all-linear")
# Prefix Tuning (prepends trainable tokens to each layer)
prefix_config = PrefixTuningConfig(
task_type="CAUSAL_LM",
num_virtual_tokens=20,
prefix_projection=True,
)
# Prompt Tuning (soft prompts prepended to input)
prompt_config = PromptTuningConfig(
task_type="CAUSAL_LM",
num_virtual_tokens=20,
prompt_tuning_init="TEXT",
prompt_tuning_init_text="Classify the sentiment of this text:",
tokenizer_name_or_path="model-name",
)
# IA3 (smallest adapter, modifies activations)
ia3_config = IA3Config(
task_type="CAUSAL_LM",
target_modules=["k_proj", "v_proj", "down_proj"],
feedforward_modules=["down_proj"],
)
model = get_peft_model(base_model, lora_config) # Or any config
Loading and Switching Adapters
python
from peft import PeftModel
# Load base model with adapter
model = PeftModel.from_pretrained(base_model, "path/to/adapter")
# Multiple adapters
model.load_adapter("path/to/code-adapter", adapter_name="code")
model.load_adapter("path/to/chat-adapter", adapter_name="chat")
# Switch between adapters
model.set_adapter("code")
code_output = model.generate(**inputs)
model.set_adapter("chat")
chat_output = model.generate(**inputs)
# Merge adapter into base model (for deployment)
merged = model.merge_and_unload()
Evaluate Library
python
import evaluate
# Load metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
# Compute metrics
accuracy_result = accuracy.compute(
predictions=[0, 1, 1, 0],
references=[0, 1, 0, 0],
)
# {'accuracy': 0.75}
rouge_result = rouge.compute(
predictions=["The cat sat on the mat"],
references=["The cat is sitting on the mat"],
)
# {'rouge1': 0.769, 'rouge2': 0.545, 'rougeL': 0.769, 'rougeLsum': 0.769}
bertscore_result = bertscore.compute(
predictions=["The weather is nice today"],
references=["It is a beautiful day"],
lang="en",
)
# Combine metrics
combined = evaluate.combine(["accuracy", "f1", "precision", "recall"])
results = combined.compute(predictions=[0, 1, 1], references=[0, 1, 0])
Text Generation Inference (TGI)
Deployment
bash
# Docker deployment
docker run --gpus all --shm-size 1g \
-p 8080:80 \
-v /path/to/model:/model \
ghcr.io/huggingface/text-generation-inference:latest \
--model-id /model \
--dtype bfloat16 \
--max-input-length 4096 \
--max-total-tokens 8192 \
--max-batch-prefill-tokens 4096 \
--max-concurrent-requests 128
Client Usage
python
from huggingface_hub import InferenceClient
client = InferenceClient("http://localhost:8080")
# Simple generation
output = client.text_generation(
"Explain quantum computing:",
max_new_tokens=256,
temperature=0.7,
)
# Chat completions (OpenAI-compatible)
response = client.chat_completion(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is machine learning?"},
],
max_tokens=512,
temperature=0.7,
)
print(response.choices[0].message.content)
# Streaming
for token in client.text_generation(
"Once upon a time",
max_new_tokens=100,
stream=True,
):
print(token, end="")
Spaces (Gradio / Streamlit)
Gradio App
python
import gradio as gr
from transformers import pipeline
# Load model
generator = pipeline("text-generation", model="gpt2")
def generate_text(prompt, max_length, temperature):
    """Generate a continuation of ``prompt`` with the module-level pipeline.

    Args:
        prompt: Text to continue.
        max_length: Number of new tokens to generate.
        temperature: Sampling temperature.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    result = generator(
        prompt,
        # UI widgets may deliver numbers as floats; max_new_tokens must be an
        # integer, so coerce both values before calling the pipeline.
        max_new_tokens=int(max_length),
        temperature=float(temperature),
        do_sample=True,
    )
    return result[0]["generated_text"]
# Build interface
demo = gr.Interface(
fn=generate_text,
inputs=[
gr.Textbox(label="Prompt", lines=3),
gr.Slider(50, 500, value=200, label="Max Length"),
gr.Slider(0.1, 2.0, value=0.7, label="Temperature"),
],
outputs=gr.Textbox(label="Generated Text", lines=10),
title="Text Generator",
examples=[["Once upon a time", 200, 0.7]],
)
demo.launch()
Hub API
Programmatic Management
python
from huggingface_hub import HfApi, hf_hub_download, snapshot_download
api = HfApi()
# Download specific file
file_path = hf_hub_download(
repo_id="meta-llama/Llama-3.1-8B",
filename="config.json",
token="hf_...",
)
# Download entire model
snapshot_download(
repo_id="meta-llama/Llama-3.1-8B",
local_dir="./llama-3.1-8b",
ignore_patterns=["*.bin"], # Skip pytorch_model.bin if using safetensors
)
# Repository management
api.create_repo("my-org/new-model", private=True)
api.delete_repo("my-org/old-model")
# update_repo_visibility() is deprecated in recent huggingface_hub releases;
# update_repo_settings() is its replacement.
api.update_repo_settings("my-org/my-model", private=False)
# File management
api.upload_file(
path_or_fileobj="./model.safetensors",
path_in_repo="model.safetensors",
repo_id="my-org/my-model",
)
# Create model card
# ModelCardData holds the YAML metadata header; it must be imported alongside
# ModelCard or the call below raises NameError.
from huggingface_hub import ModelCard, ModelCardData

card = ModelCard.from_template(
    card_data=ModelCardData(
        language="en",
        license="mit",
        library_name="transformers",
        tags=["text-generation", "llama"],
        datasets=["my-dataset"],
        metrics=["accuracy"],
    ),
    model_id="my-org/my-model",
    model_description="Fine-tuned Llama for customer support.",
)
card.push_to_hub("my-org/my-model")
Safetensors
python
from safetensors.torch import save_file, load_file
# Save tensors (safe, fast, memory-mapped)
tensors = {"weight": torch.randn(10, 10), "bias": torch.randn(10)}
save_file(tensors, "model.safetensors")
# Load tensors
loaded = load_file("model.safetensors")
# Benefits over pickle-based formats:
# - No arbitrary code execution risk
# - Memory-mapped loading (fast, low memory)
# - Zero-copy loading on GPU
# - Format validation before loading
Optimum (ONNX / Quantization / Optimization)
ONNX Export
python
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer
# Export and load in one step
model = ORTModelForSequenceClassification.from_pretrained(
"distilbert-base-uncased-finetuned-sst-2-english",
export=True,
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
# Use with pipeline (same API, faster inference)
from optimum.pipelines import pipeline
ort_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
result = ort_pipeline("This is great!")
# Save ONNX model
model.save_pretrained("./onnx_model")
Quantization with Optimum
python
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
# Dynamic quantization (no calibration data needed)
quantizer = ORTQuantizer.from_pretrained("./onnx_model")
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
quantizer.quantize(save_dir="./quantized_model", quantization_config=dqconfig)
# Static quantization (needs calibration data; activation ranges are computed
# ahead of time instead of at inference time)
from optimum.onnxruntime.configuration import AutoCalibrationConfig

qconfig = AutoQuantizationConfig.avx512_vnni(is_static=True, per_channel=True)
calibration_config = AutoCalibrationConfig.minmax(calibration_dataset)
# Run the calibration pass first: quantize() does not accept a calibration
# config, it expects the tensor ranges that fit() computes.
ranges = quantizer.fit(
    dataset=calibration_dataset,
    calibration_config=calibration_config,
    operators_to_quantize=qconfig.operators_to_quantize,
)
quantizer.quantize(
    save_dir="./static_quantized",
    quantization_config=qconfig,
    calibration_tensors_range=ranges,
)
Trainer API
Trainer with Custom Callbacks and Metrics
python
from transformers import (
Trainer, TrainingArguments,
EarlyStoppingCallback,
TrainerCallback,
)
# Custom callback
class LoggingCallback(TrainerCallback):
    """Trainer callback that prints log payloads and epoch boundaries."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the log payload with the current global step, if any."""
        if not logs:
            return
        print(f"Step {state.global_step}: {logs}")

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the value of ``state.epoch`` when an epoch finishes."""
        print(f"Epoch {state.epoch} completed")
# Custom metric computation
import numpy as np


def _weighted_f1(labels, predictions):
    """Return the mean of per-class F1 scores weighted by class support."""
    if labels.size == 0:
        return 0.0
    classes, support = np.unique(labels, return_counts=True)
    per_class = []
    for cls in classes:
        true_pos = np.sum((predictions == cls) & (labels == cls))
        false_pos = np.sum((predictions == cls) & (labels != cls))
        false_neg = np.sum((predictions != cls) & (labels == cls))
        denom = 2 * true_pos + false_pos + false_neg
        per_class.append(2 * true_pos / denom if denom else 0.0)
    # Classes that appear only in the predictions have zero support and
    # therefore contribute nothing to the weighted mean.
    return float(np.average(per_class, weights=support))


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item sequence unpacked as ``(predictions, labels)``;
            the predicted class is the argmax over the last axis.

    Returns:
        Dict with float ``"accuracy"`` and ``"f1"`` entries.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions).argmax(axis=-1)
    labels = np.asarray(labels)
    accuracy = float((predictions == labels).mean())
    f1 = _weighted_f1(labels, predictions)
    return {"accuracy": accuracy, "f1": f1}
training_args = TrainingArguments(
output_dir="./results",
num_train_epochs=5,
per_device_train_batch_size=16,
per_device_eval_batch_size=32,
learning_rate=2e-5,
warmup_steps=500,
weight_decay=0.01,
eval_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="f1",
greater_is_better=True,
report_to="wandb",
bf16=True,
dataloader_num_workers=4,
group_by_length=True, # Group similar lengths for efficiency
save_total_limit=3,
logging_steps=50,
push_to_hub=True,
hub_model_id="my-org/my-model",
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
callbacks=[
EarlyStoppingCallback(early_stopping_patience=3),
LoggingCallback(),
],
)
trainer.train()
trainer.push_to_hub()
Resuming from Checkpoint
python
# Training automatically saves checkpoints; resume from the latest one
trainer.train(resume_from_checkpoint=True)
# Or specify a specific checkpoint
trainer.train(resume_from_checkpoint="./results/checkpoint-1000")
Decision Framework: Choosing the Right Tool
| Need | Tool | Example |
|---|---|---|
| Quick prototype | pipeline() | Sentiment analysis demo |
| Custom training | Trainer + Accelerate | Fine-tune BERT for NER |
| Efficient fine-tuning | PEFT + TRL | LoRA on Llama 8B |
| Production serving | TGI or vLLM | API endpoint for chat |
| Demo/app | Gradio on Spaces | Interactive model showcase |
| Edge deployment | Optimum (ONNX) | Mobile inference |
| Large datasets | datasets streaming | Process C4 corpus |
| Multi-GPU | Accelerate | Distributed training |
| Model evaluation | evaluate | Benchmark on standard metrics |
| Safe storage | Safetensors | Model weight persistence |