AgentSkillsCN

Monitoring Observability

监控与可观测性

SKILL.md

Monitoring & Observability

Tracing, metrics, logging, and alerting across multiple stacks.

Metadata

  • Category: operations
  • Scope: Backend (Rust 60%, Go 15%, Python 15%, Node.js 10%)
  • Complexity: Intermediate
  • Maturity: Stable

Overview

Observability encompasses the three pillars: Logs (events), Metrics (measurements), and Traces (request flows).

Observability Stack

| Component  | Tools                             |
|------------|-----------------------------------|
| Logs       | Loki, Elasticsearch, CloudWatch   |
| Metrics    | Prometheus, Datadog, CloudWatch   |
| Traces     | Jaeger, Tempo, Honeycomb          |
| All-in-One | Grafana Cloud, Datadog, New Relic |

OpenTelemetry

OpenTelemetry (OTel) is the standard for collecting telemetry data across all stacks.

Quick Start

Rust - tracing + OpenTelemetry

rust
// Cargo.toml
// tracing = "0.1"
// tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
// tracing-opentelemetry = "0.22"
// opentelemetry = "0.21"
// opentelemetry_sdk = { version = "0.21", features = ["rt-tokio"] }
// opentelemetry-otlp = "0.14"

use tracing::{info, instrument, span, Level};
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};

/// Sets up tracing for the process: spans go to an OTLP exporter and events
/// are additionally formatted as JSON by the `fmt` layer.
///
/// # Errors
/// Returns the error produced by `install_batch` if the OTLP pipeline cannot
/// be installed.
pub fn init_telemetry() -> Result<(), Box<dyn std::error::Error>> {
    // Resource attributes attached to the trace configuration.
    let resource = opentelemetry_sdk::Resource::new(vec![opentelemetry::KeyValue::new(
        "service.name",
        "my-service",
    )]);
    let trace_config = opentelemetry_sdk::trace::config().with_resource(resource);

    // OTLP pipeline using the tonic exporter, installed in batch mode on the Tokio runtime.
    let exporter = opentelemetry_otlp::new_exporter().tonic();
    let tracer = opentelemetry_otlp::new_pipeline()
        .tracing()
        .with_exporter(exporter)
        .with_trace_config(trace_config)
        .install_batch(opentelemetry_sdk::runtime::Tokio)?;

    let otel_layer = tracing_opentelemetry::layer().with_tracer(tracer);
    let env_filter = tracing_subscriber::EnvFilter::from_default_env();
    let json_output = tracing_subscriber::fmt::layer().json();

    // Layers are stacked in the order: env filter, JSON output, OpenTelemetry.
    tracing_subscriber::registry()
        .with(env_filter)
        .with(json_output)
        .with(otel_layer)
        .init();

    Ok(())
}

// Instrument functions automatically
#[instrument(skip(pool), fields(user_id = %user_id))]
pub async fn get_user(pool: &PgPool, user_id: &str) -> Result<User, Error> {
    info!("Fetching user");
    
    let user = sqlx::query_as!(User, "SELECT * FROM users WHERE id = $1", user_id)
        .fetch_optional(pool)
        .await?
        .ok_or(Error::NotFound)?;
    
    info!(user_name = %user.name, "User found");
    Ok(user)
}

// Manual spans
async fn process_order(order: Order) -> Result<(), Error> {
    let span = span!(Level::INFO, "process_order", order_id = %order.id);
    let _guard = span.enter();
    
    info!("Processing order");
    
    // Child span
    let payment_span = span!(Level::INFO, "process_payment");
    let _payment_guard = payment_span.enter();
    process_payment(&order).await?;
    
    Ok(())
}

Rust - Prometheus Metrics

rust
// Cargo.toml: metrics = "0.22", metrics-exporter-prometheus = "0.13"

use axum::{routing::get, Router};
use metrics::{counter, gauge, histogram};
use metrics_exporter_prometheus::PrometheusBuilder;

/// Creates a default `PrometheusBuilder`, installs its recorder and returns
/// the resulting handle.
///
/// # Panics
/// Panics with "Failed to install Prometheus recorder" if installation fails.
pub fn setup_metrics() -> PrometheusHandle {
    let builder = PrometheusBuilder::new();
    builder
        .install_recorder()
        .expect("Failed to install Prometheus recorder")
}

// Metrics middleware
/// Middleware that times the downstream handler and records a latency
/// histogram plus a request counter, labelled by path, method and (for the
/// counter) response status.
///
/// NOTE(review): the raw URI path is used as a label value. If routes embed
/// ids, every distinct path becomes its own series — consider labelling with
/// the matched route template instead.
pub async fn metrics_middleware(
    request: Request,
    next: Next,
) -> Response {
    // Labels are copied out first because `request` is moved into `next.run`.
    let path = request.uri().path().to_string();
    let method = request.method().to_string();

    let started = std::time::Instant::now();
    let response = next.run(request).await;
    let elapsed_secs = started.elapsed().as_secs_f64();

    histogram!("http_request_duration_seconds", "path" => path.clone(), "method" => method.clone())
        .record(elapsed_secs);

    let status = response.status().as_str().to_string();
    counter!("http_requests_total", "path" => path, "method" => method, "status" => status)
        .increment(1);

    response
}

// Expose /metrics endpoint
/// Handler for the `/metrics` endpoint: returns whatever the Prometheus handle
/// held in router state renders.
async fn metrics_handler(State(prometheus): State<PrometheusHandle>) -> String {
    let body = prometheus.render();
    body
}

// Custom metrics
/// Records business-level metrics for one order: an order counter labelled by
/// status, a histogram of the order total, and a gauge of active users.
fn record_business_metrics(order: &Order) {
    let status_label = order.status.to_string();
    counter!("orders_total", "status" => status_label).increment(1);

    let order_value = order.total as f64;
    histogram!("order_value_usd").record(order_value);

    let active_users = get_active_user_count() as f64;
    gauge!("active_users").set(active_users);
}

Go - OpenTelemetry

go
import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    "go.opentelemetry.io/otel/sdk/trace"
    "go.opentelemetry.io/contrib/instrumentation/github.com/gofiber/fiber/otelfiber"
)

// InitTelemetry creates an OTLP/gRPC trace exporter, builds a batching
// TracerProvider tagged with the service name, and registers it as the global
// provider. The provider is returned to the caller.
//
// It returns a nil provider and the exporter error if the exporter cannot be
// created.
func InitTelemetry(ctx context.Context) (*trace.TracerProvider, error) {
    exporter, err := otlptracegrpc.New(ctx)
    if err != nil {
        return nil, err
    }

    // NewWithAttributes takes the schema URL as its first argument, followed
    // by the attributes; passing only the attribute does not type-check.
    res := resource.NewWithAttributes(
        semconv.SchemaURL,
        semconv.ServiceName("my-service"),
    )

    tp := trace.NewTracerProvider(
        trace.WithBatcher(exporter),
        trace.WithResource(res),
    )

    otel.SetTracerProvider(tp)
    return tp, nil
}

// Fiber middleware: registers the otelfiber middleware on the Fiber app.
// NOTE(review): otelfiber appears to be published as
// github.com/gofiber/contrib/otelfiber — confirm the import path used above.
app.Use(otelfiber.Middleware())

// Manual tracing
// GetUser starts a "GetUser" span on the "user-service" tracer, tags it with
// the requested user id, and ends the span when the function returns.
func GetUser(ctx context.Context, userID string) (*User, error) {
    ctx, span := otel.Tracer("user-service").Start(ctx, "GetUser")
    defer span.End()

    userIDAttr := attribute.String("user.id", userID)
    span.SetAttributes(userIDAttr)

    // ... fetch user

    return user, nil
}

Go - Prometheus

go
import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// httpRequestsTotal counts HTTP requests, labelled by method, path and status.
var httpRequestsTotal = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Name: "http_requests_total",
        Help: "Total HTTP requests",
    },
    []string{"method", "path", "status"},
)

// httpRequestDuration observes HTTP request durations using the client
// library's default buckets, labelled by method and path.
var httpRequestDuration = prometheus.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "http_request_duration_seconds",
        Help:    "HTTP request duration",
        Buckets: prometheus.DefBuckets,
    },
    []string{"method", "path"},
)

// init registers both HTTP collectors with the default Prometheus registry.
// MustRegister panics if a registration fails.
func init() {
    prometheus.MustRegister(httpRequestsTotal)
    prometheus.MustRegister(httpRequestDuration)
}

// /metrics endpoint: serves the promhttp handler on the default HTTP mux.
http.Handle("/metrics", promhttp.Handler())

Python - OpenTelemetry

python
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

def init_telemetry():
    """Install a global tracer provider that batches spans to an OTLP exporter."""
    tracer_provider = TracerProvider()
    tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
    trace.set_tracer_provider(tracer_provider)

# Auto-instrument FastAPI: hand the application object to the FastAPI instrumentor.
FastAPIInstrumentor.instrument_app(app)

# Manual tracing: module-level tracer named after this module; the decorator
# on get_user below uses it.
tracer = trace.get_tracer(__name__)

@tracer.start_as_current_span("get_user")
def get_user(user_id: str):
    """Tag the current span (opened by the decorator) with the requested user id."""
    trace.get_current_span().set_attribute("user.id", user_id)
    # ... fetch user

Python - Prometheus

python
from prometheus_client import Counter, Histogram, generate_latest
from fastapi import Response

# Request counter, labelled by method, path and status.
REQUEST_COUNT = Counter('http_requests_total', 'Total requests', ['method', 'path', 'status'])
# Request latency histogram, labelled by method and path.
REQUEST_LATENCY = Histogram('http_request_duration_seconds', 'Request latency', ['method', 'path'])

@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Time the downstream handler, then update the request counter and latency histogram.

    NOTE(review): the raw URL path is used as a label value; if paths embed ids
    each distinct path becomes its own series.
    """
    started_at = time.time()
    response = await call_next(request)
    elapsed = time.time() - started_at

    method, path = request.method, request.url.path
    REQUEST_COUNT.labels(method, path, response.status_code).inc()
    REQUEST_LATENCY.labels(method, path).observe(elapsed)

    return response

@app.get("/metrics")
def metrics():
    """Return the output of generate_latest() as a text/plain response."""
    payload = generate_latest()
    return Response(payload, media_type="text/plain")

Node.js - OpenTelemetry

typescript
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-grpc';

// Configure the Node SDK with an OTLP trace exporter, the Node
// auto-instrumentations, and the service name.
const sdk = new NodeSDK({
  traceExporter: new OTLPTraceExporter(),
  instrumentations: [getNodeAutoInstrumentations()],
  serviceName: 'my-service',
});

// Start the SDK.
sdk.start();

// Manual tracing
import { trace } from '@opentelemetry/api';

// Tracer named 'user-service'; getUser below starts its spans on it.
const tracer = trace.getTracer('user-service');

/**
 * Fetches a user inside an active 'getUser' span tagged with the user id.
 * The span is always ended, whether the lookup resolves or throws.
 *
 * NOTE(review): a thrown error propagates without being recorded on the span.
 */
async function getUser(userId: string) {
  return tracer.startActiveSpan('getUser', async (activeSpan) => {
    activeSpan.setAttribute('user.id', userId);

    try {
      return await db.users.findOne({ id: userId });
    } finally {
      activeSpan.end();
    }
  });
}

Structured Logging

rust
// JSON logging for production: initialise the fmt subscriber with JSON output,
// with the current-span and span-list options both enabled.
tracing_subscriber::fmt()
    .json()
    .with_current_span(true)
    .with_span_list(true)
    .init();

// Log with context: structured fields come first, the message last.
// `%` records a value using its `Display` implementation.
info!(
    user_id = %user.id,
    action = "login",
    ip = %request.ip(),
    "User logged in successfully"
);

Alerting (Prometheus)

yaml
# prometheus/alerts.yml
groups:
  - name: api
    rules:
      # Fires when any single series sustains more than 0.1 5xx responses per
      # second for 5 minutes. This is an absolute rate, not a share of traffic.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"

      # p95 latency over the last 5 minutes. histogram_quantile needs
      # per-second bucket rates aggregated by `le`; raw cumulative bucket
      # counters would give a quantile over the whole process lifetime.
      - alert: SlowResponses
        expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "95th percentile request latency is above 1s"

Related Skills

  • devops - Grafana, Prometheus deployment
  • security - Security event logging
  • databases - Query performance tracing