Tracing Package

OpenTelemetry distributed tracing for ResolveSpec.

Quick Start

import "github.com/bitechdev/ResolveSpec/pkg/tracing"

// Initialize tracer
config := tracing.Config{
    ServiceName:    "my-api",
    ServiceVersion: "1.0.0",
    Endpoint:       "localhost:4317", // OTLP collector
    Enabled:        true,
}

shutdown, err := tracing.InitTracer(config)
if err != nil {
    log.Fatal(err)
}
defer shutdown(context.Background())

// Apply middleware
router.Use(tracing.Middleware)

Configuration

type Config struct {
    ServiceName    string  // Service identifier
    ServiceVersion string  // Version for tracking deployments
    Endpoint       string  // OTLP collector endpoint (e.g., "localhost:4317")
    Enabled        bool    // Enable/disable tracing
}

Environment-based Configuration

import "os"

config := tracing.Config{
    ServiceName:    os.Getenv("SERVICE_NAME"),
    ServiceVersion: os.Getenv("VERSION"),
    Endpoint:       getEnv("OTEL_ENDPOINT", "localhost:4317"),
    Enabled:        getEnv("TRACING_ENABLED", "true") == "true",
}
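
The getEnv helper used above is not part of this package or the standard library; a minimal version might look like this:

func getEnv(key, fallback string) string {
    if value, ok := os.LookupEnv(key); ok {
        return value
    }
    return fallback
}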

Automatic HTTP Tracing

The middleware automatically creates spans for all HTTP requests:

router.Use(tracing.Middleware)

Captured attributes (see the sketch after this list):

  • HTTP method
  • HTTP URL
  • HTTP path
  • HTTP scheme
  • Host name
  • Span kind (server)
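
For illustration, a rough sketch of what such a middleware typically does follows; this is not the package's actual implementation, and the attribute keys simply follow common OpenTelemetry HTTP conventions:

// Illustrative only: wrap each request in a span and record basic
// HTTP attributes before calling the next handler. The real middleware
// also marks the span as a server span.
func exampleTracingMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        scheme := "http"
        if r.TLS != nil {
            scheme = "https"
        }

        ctx, span := tracing.StartSpan(r.Context(), r.Method+" "+r.URL.Path,
            attribute.String("http.method", r.Method),
            attribute.String("http.url", r.URL.String()),
            attribute.String("http.target", r.URL.Path),
            attribute.String("http.scheme", scheme),
            attribute.String("net.host.name", r.Host),
        )
        defer span.End()

        next.ServeHTTP(w, r.WithContext(ctx))
    })
}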

Manual Span Creation

Basic Span

import "go.opentelemetry.io/otel/attribute"

func processOrder(ctx context.Context, orderID string) error {
    ctx, span := tracing.StartSpan(ctx, "process-order",
        attribute.String("order.id", orderID),
    )
    defer span.End()

    // Your logic here...
    return nil
}

Nested Spans

func handleRequest(ctx context.Context) error {
    ctx, span := tracing.StartSpan(ctx, "handle-request")
    defer span.End()

    // Child span 1
    if err := validateInput(ctx); err != nil {
        return err
    }

    // Child span 2
    if err := processData(ctx); err != nil {
        return err
    }

    return nil
}

func validateInput(ctx context.Context) error {
    ctx, span := tracing.StartSpan(ctx, "validate-input")
    defer span.End()

    // Validation logic...
    return nil
}

func processData(ctx context.Context) error {
    ctx, span := tracing.StartSpan(ctx, "process-data")
    defer span.End()

    // Processing logic...
    return nil
}

Adding Attributes

import "go.opentelemetry.io/otel/attribute"

ctx, span := tracing.StartSpan(ctx, "database-query",
    attribute.String("db.table", "users"),
    attribute.String("db.operation", "SELECT"),
    attribute.Int("user.id", 123),
)
defer span.End()

Or add attributes later:

tracing.SetAttributes(ctx,
    attribute.String("result.status", "success"),
    attribute.Int("result.count", 42),
)

Recording Events

tracing.AddEvent(ctx, "cache-miss",
    attribute.String("cache.key", cacheKey),
)

tracing.AddEvent(ctx, "retry-attempt",
    attribute.Int("attempt", 2),
    attribute.String("reason", "timeout"),
)

Error Recording

result, err := someOperation()
if err != nil {
    tracing.RecordError(ctx, err)
    return err
}

With additional context:

if err != nil {
    span := tracing.SpanFromContext(ctx)
    span.RecordError(err)
    span.SetAttributes(
        attribute.String("error.type", "database"),
        attribute.Bool("error.retriable", true),
    )
    return err
}

Complete Example

package main

import (
    "context"
    "database/sql"
    "log"
    "net/http"
    "time"

    "github.com/bitechdev/ResolveSpec/pkg/tracing"
    "github.com/gorilla/mux"
    "go.opentelemetry.io/otel/attribute"
)

func main() {
    // Initialize tracing
    config := tracing.Config{
        ServiceName:    "user-service",
        ServiceVersion: "1.0.0",
        Endpoint:       "localhost:4317",
        Enabled:        true,
    }

    shutdown, err := tracing.InitTracer(config)
    if err != nil {
        log.Fatal(err)
    }
    defer shutdown(context.Background())

    // Create router
    router := mux.NewRouter()

    // Apply tracing middleware
    router.Use(tracing.Middleware)

    // Routes
    router.HandleFunc("/users/{id}", getUserHandler)

    log.Fatal(http.ListenAndServe(":8080", router))
}

func getUserHandler(w http.ResponseWriter, r *http.Request) {
    ctx := r.Context()

    // Extract user ID from path
    vars := mux.Vars(r)
    userID := vars["id"]

    // Create span for this operation
    ctx, span := tracing.StartSpan(ctx, "get-user",
        attribute.String("user.id", userID),
    )
    defer span.End()

    // Fetch user
    user, err := fetchUser(ctx, userID)
    if err != nil {
        tracing.RecordError(ctx, err)
        http.Error(w, "Internal Server Error", 500)
        return
    }

    // Record success
    tracing.SetAttributes(ctx,
        attribute.String("user.name", user.Name),
        attribute.Bool("user.active", user.Active),
    )

    // Return user...
}

func fetchUser(ctx context.Context, userID string) (*User, error) {
    // Create database span
    ctx, span := tracing.StartSpan(ctx, "db.query",
        attribute.String("db.system", "postgresql"),
        attribute.String("db.operation", "SELECT"),
        attribute.String("db.table", "users"),
    )
    defer span.End()

    start := time.Now()

    // Execute query
    user, err := queryUser(ctx, userID)

    // Record duration
    duration := time.Since(start)
    span.SetAttributes(
        attribute.Int64("db.duration_ms", duration.Milliseconds()),
    )

    if err != nil {
        tracing.RecordError(ctx, err)
        return nil, err
    }

    return user, nil
}
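
The example assumes a User type and a queryUser helper defined elsewhere in the service; minimal placeholders (illustrative only) so the file compiles:

// Placeholder type and stub; a real service would query the database here.
type User struct {
    Name   string
    Active bool
}

func queryUser(ctx context.Context, userID string) (*User, error) {
    // Illustrative stub: pretend the lookup always succeeds.
    return &User{Name: "example", Active: true}, nil
}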

OpenTelemetry Collector Setup

Docker Compose

version: '3'
services:
  app:
    build: .
    ports:
      - "8080:8080"
    environment:
      - OTEL_ENDPOINT=otel-collector:4317
    depends_on:
      - otel-collector

  otel-collector:
    image: otel/opentelemetry-collector:latest
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
    ports:
      - "4317:4317"   # OTLP gRPC
      - "4318:4318"   # OTLP HTTP

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686" # Jaeger UI
      - "14250:14250" # Jaeger gRPC

Collector Configuration

otel-collector-config.yaml:

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

exporters:
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true

  logging:
    loglevel: debug

processors:
  batch:
    timeout: 10s

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [jaeger, logging]

Viewing Traces

Jaeger UI

Access at http://localhost:16686

Finding traces:

  1. Select service: "my-api"
  2. Select operation: "GET /users/:id"
  3. Click "Find Traces"

Sample Trace

GET /users/123 (200ms)
├── get-user (180ms)
│   ├── validate-permissions (20ms)
│   ├── db.query (150ms)
│   │   └── SELECT * FROM users WHERE id = 123
│   └── transform-response (10ms)
└── send-response (20ms)

Best Practices

1. Span Naming

Good:

tracing.StartSpan(ctx, "database.query.users")
tracing.StartSpan(ctx, "http.request.external-api")
tracing.StartSpan(ctx, "cache.get")

Bad:

tracing.StartSpan(ctx, "DoStuff")           // Too vague
tracing.StartSpan(ctx, "user_123_query")     // User-specific (high cardinality)

2. Attribute Keys

Follow OpenTelemetry semantic conventions:

// HTTP
attribute.String("http.method", "GET")
attribute.String("http.url", url)
attribute.Int("http.status_code", 200)

// Database
attribute.String("db.system", "postgresql")
attribute.String("db.table", "users")
attribute.String("db.operation", "SELECT")

// Custom
attribute.String("user.id", userID)
attribute.String("order.status", "pending")

3. Error Handling

Always record errors:

if err != nil {
    tracing.RecordError(ctx, err)
    // Also add context
    tracing.SetAttributes(ctx,
        attribute.Bool("error.retriable", isRetriable(err)),
        attribute.String("error.type", errorType(err)),
    )
    return err
}

4. Sampling

For high-traffic services, configure sampling:

// In production: sample 10% of traces
// Currently using AlwaysSample() - update in tracing.go if needed
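
If you do adjust tracing.go, a ratio-based sampler from the OpenTelemetry SDK is a common choice. A sketch, assuming the tracer provider is built with the standard sdktrace package:

import sdktrace "go.opentelemetry.io/otel/sdk/trace"

// Sample 10% of new traces, but always follow the parent's decision
// when a sampled trace context arrives from upstream.
sampler := sdktrace.ParentBased(sdktrace.TraceIDRatioBased(0.1))

tp := sdktrace.NewTracerProvider(
    sdktrace.WithSampler(sampler),
    // ...exporter and resource options as already configured in tracing.go
)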

5. Context Propagation

Always pass context through the call chain:

func handler(w http.ResponseWriter, r *http.Request) {
    ctx := r.Context()  // Get context from request
    processRequest(ctx) // Pass it down
}

func processRequest(ctx context.Context) {
    // Context carries trace information
    ctx, span := tracing.StartSpan(ctx, "process")
    defer span.End()

    // Pass to next function
    saveData(ctx)
}

Performance Impact

  • Overhead: <1% CPU, <5MB memory
  • Latency: <100μs per span
  • Safe for production at high throughput

Troubleshooting

Traces Not Appearing

  1. Check collector is running:

    docker-compose ps
    
  2. Verify endpoint:

    Endpoint: "localhost:4317"  // Correct
    Endpoint: "http://localhost:4317"  // Wrong (no http://)
    
  3. Check logs:

    docker-compose logs otel-collector
    

Disable Tracing

config := tracing.Config{
    Enabled: false, // Tracing disabled
}

TLS in Production

Update the OTLP client construction in tracing.go to use TLS credentials (the credentials package is google.golang.org/grpc/credentials):

import "google.golang.org/grpc/credentials"

// Passing a nil certificate pool uses the system root CAs.
client := otlptracegrpc.NewClient(
    otlptracegrpc.WithEndpoint(config.Endpoint),
    otlptracegrpc.WithTLSCredentials(credentials.NewClientTLSFromCert(nil, "")),
)

Integration with Metrics

Combine with metrics for full observability:

import (
    "github.com/bitechdev/ResolveSpec/pkg/metrics"
    "github.com/bitechdev/ResolveSpec/pkg/tracing"
)

// Apply both
router.Use(metrics.GetProvider().Middleware)
router.Use(tracing.Middleware)

Distributed Tracing

Traces automatically propagate across services via HTTP headers:

Service A:

// Create request with trace context
req, _ := http.NewRequestWithContext(ctx, "GET", "http://service-b/api", nil)
resp, _ := client.Do(req)
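
Propagation only works if the outgoing request actually carries the trace headers. If the HTTP client is not otherwise instrumented, the context can be injected manually before calling client.Do(req). A sketch, assuming a W3C TraceContext propagator has been registered globally (for example by InitTracer):

import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/propagation"
)

// Copy the active trace context into the outgoing request headers
// so Service B's middleware can continue the same trace.
otel.GetTextMapPropagator().Inject(ctx, propagation.HeaderCarrier(req.Header))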

Service B:

// Trace context automatically extracted by middleware
router.Use(tracing.Middleware)

The trace ID propagates across both services, creating a unified trace.