mirror of
https://github.com/bitechdev/ResolveSpec.git
synced 2025-12-14 17:30:35 +00:00
534 lines
10 KiB
Markdown
534 lines
10 KiB
Markdown
# Tracing Package
|
|
|
|
OpenTelemetry distributed tracing for ResolveSpec.
|
|
|
|
## Quick Start
|
|
|
|
```go
|
|
import "github.com/bitechdev/ResolveSpec/pkg/tracing"
|
|
|
|
// Initialize tracer
|
|
config := tracing.Config{
|
|
ServiceName: "my-api",
|
|
ServiceVersion: "1.0.0",
|
|
Endpoint: "localhost:4317", // OTLP collector
|
|
Enabled: true,
|
|
}
|
|
|
|
shutdown, err := tracing.InitTracer(config)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
defer shutdown(context.Background())
|
|
|
|
// Apply middleware
|
|
router.Use(tracing.Middleware)
|
|
```
|
|
|
|
## Configuration
|
|
|
|
```go
|
|
type Config struct {
|
|
ServiceName string // Service identifier
|
|
ServiceVersion string // Version for tracking deployments
|
|
Endpoint string // OTLP collector endpoint (e.g., "localhost:4317")
|
|
Enabled bool // Enable/disable tracing
|
|
}
|
|
```
|
|
|
|
### Environment-based Configuration
|
|
|
|
```go
|
|
import "os"
|
|
|
|
config := tracing.Config{
|
|
ServiceName: os.Getenv("SERVICE_NAME"),
|
|
ServiceVersion: os.Getenv("VERSION"),
|
|
Endpoint: getEnv("OTEL_ENDPOINT", "localhost:4317"),
|
|
Enabled: getEnv("TRACING_ENABLED", "true") == "true",
|
|
}
|
|
```
|
|
|
|
## Automatic HTTP Tracing
|
|
|
|
The middleware automatically creates spans for all HTTP requests:
|
|
|
|
```go
|
|
router.Use(tracing.Middleware)
|
|
```
|
|
|
|
**Captured attributes:**
|
|
- HTTP method
|
|
- HTTP URL
|
|
- HTTP path
|
|
- HTTP scheme
|
|
- Host name
|
|
- Span kind (server)
|
|
|
|
## Manual Span Creation
|
|
|
|
### Basic Span
|
|
|
|
```go
|
|
import "go.opentelemetry.io/otel/attribute"
|
|
|
|
func processOrder(ctx context.Context, orderID string) error {
|
|
ctx, span := tracing.StartSpan(ctx, "process-order",
|
|
attribute.String("order.id", orderID),
|
|
)
|
|
defer span.End()
|
|
|
|
// Your logic here...
|
|
return nil
|
|
}
|
|
```
|
|
|
|
### Nested Spans
|
|
|
|
```go
|
|
func handleRequest(ctx context.Context) error {
|
|
ctx, span := tracing.StartSpan(ctx, "handle-request")
|
|
defer span.End()
|
|
|
|
// Child span 1
|
|
if err := validateInput(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Child span 2
|
|
if err := processData(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func validateInput(ctx context.Context) error {
|
|
ctx, span := tracing.StartSpan(ctx, "validate-input")
|
|
defer span.End()
|
|
|
|
// Validation logic...
|
|
return nil
|
|
}
|
|
|
|
func processData(ctx context.Context) error {
|
|
ctx, span := tracing.StartSpan(ctx, "process-data")
|
|
defer span.End()
|
|
|
|
// Processing logic...
|
|
return nil
|
|
}
|
|
```
|
|
|
|
## Adding Attributes
|
|
|
|
```go
|
|
import "go.opentelemetry.io/otel/attribute"
|
|
|
|
ctx, span := tracing.StartSpan(ctx, "database-query",
|
|
attribute.String("db.table", "users"),
|
|
attribute.String("db.operation", "SELECT"),
|
|
attribute.Int("user.id", 123),
|
|
)
|
|
defer span.End()
|
|
```
|
|
|
|
**Or add attributes later:**
|
|
|
|
```go
|
|
tracing.SetAttributes(ctx,
|
|
attribute.String("result.status", "success"),
|
|
attribute.Int("result.count", 42),
|
|
)
|
|
```
|
|
|
|
## Recording Events
|
|
|
|
```go
|
|
tracing.AddEvent(ctx, "cache-miss",
|
|
attribute.String("cache.key", cacheKey),
|
|
)
|
|
|
|
tracing.AddEvent(ctx, "retry-attempt",
|
|
attribute.Int("attempt", 2),
|
|
attribute.String("reason", "timeout"),
|
|
)
|
|
```
|
|
|
|
## Error Recording
|
|
|
|
```go
|
|
result, err := someOperation()
|
|
if err != nil {
|
|
tracing.RecordError(ctx, err)
|
|
return err
|
|
}
|
|
```
|
|
|
|
**With additional context:**
|
|
|
|
```go
|
|
if err != nil {
|
|
span := tracing.SpanFromContext(ctx)
|
|
span.RecordError(err)
|
|
span.SetAttributes(
|
|
attribute.String("error.type", "database"),
|
|
attribute.Bool("error.retriable", true),
|
|
)
|
|
return err
|
|
}
|
|
```
|
|
|
|
## Complete Example
|
|
|
|
```go
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"log"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/bitechdev/ResolveSpec/pkg/tracing"
|
|
"github.com/gorilla/mux"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
)
|
|
|
|
func main() {
|
|
// Initialize tracing
|
|
config := tracing.Config{
|
|
ServiceName: "user-service",
|
|
ServiceVersion: "1.0.0",
|
|
Endpoint: "localhost:4317",
|
|
Enabled: true,
|
|
}
|
|
|
|
shutdown, err := tracing.InitTracer(config)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
defer shutdown(context.Background())
|
|
|
|
// Create router
|
|
router := mux.NewRouter()
|
|
|
|
// Apply tracing middleware
|
|
router.Use(tracing.Middleware)
|
|
|
|
// Routes
|
|
router.HandleFunc("/users/{id}", getUserHandler)
|
|
|
|
log.Fatal(http.ListenAndServe(":8080", router))
|
|
}
|
|
|
|
func getUserHandler(w http.ResponseWriter, r *http.Request) {
|
|
ctx := r.Context()
|
|
|
|
// Extract user ID from path
|
|
vars := mux.Vars(r)
|
|
userID := vars["id"]
|
|
|
|
// Create span for this operation
|
|
ctx, span := tracing.StartSpan(ctx, "get-user",
|
|
attribute.String("user.id", userID),
|
|
)
|
|
defer span.End()
|
|
|
|
// Fetch user
|
|
user, err := fetchUser(ctx, userID)
|
|
if err != nil {
|
|
tracing.RecordError(ctx, err)
|
|
http.Error(w, "Internal Server Error", 500)
|
|
return
|
|
}
|
|
|
|
// Record success
|
|
tracing.SetAttributes(ctx,
|
|
attribute.String("user.name", user.Name),
|
|
attribute.Bool("user.active", user.Active),
|
|
)
|
|
|
|
// Return user...
|
|
}
|
|
|
|
func fetchUser(ctx context.Context, userID string) (*User, error) {
|
|
// Create database span
|
|
ctx, span := tracing.StartSpan(ctx, "db.query",
|
|
attribute.String("db.system", "postgresql"),
|
|
attribute.String("db.operation", "SELECT"),
|
|
attribute.String("db.table", "users"),
|
|
)
|
|
defer span.End()
|
|
|
|
start := time.Now()
|
|
|
|
// Execute query
|
|
user, err := queryUser(ctx, userID)
|
|
|
|
// Record duration
|
|
duration := time.Since(start)
|
|
span.SetAttributes(
|
|
attribute.Int64("db.duration_ms", duration.Milliseconds()),
|
|
)
|
|
|
|
if err != nil {
|
|
tracing.RecordError(ctx, err)
|
|
return nil, err
|
|
}
|
|
|
|
return user, nil
|
|
}
|
|
```
|
|
|
|
## OpenTelemetry Collector Setup
|
|
|
|
### Docker Compose
|
|
|
|
```yaml
|
|
version: '3'
|
|
services:
|
|
app:
|
|
build: .
|
|
ports:
|
|
- "8080:8080"
|
|
environment:
|
|
- OTEL_ENDPOINT=otel-collector:4317
|
|
depends_on:
|
|
- otel-collector
|
|
|
|
otel-collector:
|
|
image: otel/opentelemetry-collector:latest
|
|
command: ["--config=/etc/otel-collector-config.yaml"]
|
|
volumes:
|
|
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
|
|
ports:
|
|
- "4317:4317" # OTLP gRPC
|
|
- "4318:4318" # OTLP HTTP
|
|
|
|
jaeger:
|
|
image: jaegertracing/all-in-one:latest
|
|
ports:
|
|
- "16686:16686" # Jaeger UI
|
|
- "14250:14250" # Jaeger gRPC
|
|
```
|
|
|
|
### Collector Configuration
|
|
|
|
**otel-collector-config.yaml:**
|
|
|
|
```yaml
|
|
receivers:
|
|
otlp:
|
|
protocols:
|
|
grpc:
|
|
endpoint: 0.0.0.0:4317
|
|
http:
|
|
endpoint: 0.0.0.0:4318
|
|
|
|
exporters:
|
|
jaeger:
|
|
endpoint: jaeger:14250
|
|
tls:
|
|
insecure: true
|
|
|
|
logging:
|
|
loglevel: debug
|
|
|
|
processors:
|
|
batch:
|
|
timeout: 10s
|
|
|
|
service:
|
|
pipelines:
|
|
traces:
|
|
receivers: [otlp]
|
|
processors: [batch]
|
|
exporters: [jaeger, logging]
|
|
```
|
|
|
|
## Viewing Traces
|
|
|
|
### Jaeger UI
|
|
|
|
Access at `http://localhost:16686`
|
|
|
|
**Finding traces:**
|
|
1. Select service: "my-api"
|
|
2. Select operation: "GET /users/:id"
|
|
3. Click "Find Traces"
|
|
|
|
### Sample Trace
|
|
|
|
```
|
|
GET /users/123 (200ms)
|
|
├── get-user (180ms)
|
|
│ ├── validate-permissions (20ms)
|
|
│ ├── db.query (150ms)
|
|
│ │ └── SELECT FROM users WHERE id = 123
|
|
│ └── transform-response (10ms)
|
|
└── send-response (20ms)
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
### 1. Span Naming
|
|
|
|
**Good:**
|
|
```go
|
|
tracing.StartSpan(ctx, "database.query.users")
|
|
tracing.StartSpan(ctx, "http.request.external-api")
|
|
tracing.StartSpan(ctx, "cache.get")
|
|
```
|
|
|
|
**Bad:**
|
|
```go
|
|
tracing.StartSpan(ctx, "DoStuff") // Too vague
|
|
tracing.StartSpan(ctx, "user_123_query") // User-specific (high cardinality)
|
|
```
|
|
|
|
### 2. Attribute Keys
|
|
|
|
Follow OpenTelemetry semantic conventions:
|
|
|
|
```go
|
|
// HTTP
|
|
attribute.String("http.method", "GET")
|
|
attribute.String("http.url", url)
|
|
attribute.Int("http.status_code", 200)
|
|
|
|
// Database
|
|
attribute.String("db.system", "postgresql")
|
|
attribute.String("db.table", "users")
|
|
attribute.String("db.operation", "SELECT")
|
|
|
|
// Custom
|
|
attribute.String("user.id", userID)
|
|
attribute.String("order.status", "pending")
|
|
```
|
|
|
|
### 3. Error Handling
|
|
|
|
Always record errors:
|
|
|
|
```go
|
|
if err != nil {
|
|
tracing.RecordError(ctx, err)
|
|
// Also add context
|
|
tracing.SetAttributes(ctx,
|
|
attribute.Bool("error.retriable", isRetriable(err)),
|
|
attribute.String("error.type", errorType(err)),
|
|
)
|
|
return err
|
|
}
|
|
```
|
|
|
|
### 4. Sampling
|
|
|
|
For high-traffic services, configure sampling:
|
|
|
|
```go
|
|
// In production: sample 10% of traces
|
|
// Currently using AlwaysSample() - update in tracing.go if needed
|
|
```
|
|
|
|
### 5. Context Propagation
|
|
|
|
Always pass context through the call chain:
|
|
|
|
```go
|
|
func handler(w http.ResponseWriter, r *http.Request) {
|
|
ctx := r.Context() // Get context from request
|
|
processRequest(ctx) // Pass it down
|
|
}
|
|
|
|
func processRequest(ctx context.Context) {
|
|
// Context carries trace information
|
|
ctx, span := tracing.StartSpan(ctx, "process")
|
|
defer span.End()
|
|
|
|
// Pass to next function
|
|
saveData(ctx)
|
|
}
|
|
```
|
|
|
|
## Performance Impact
|
|
|
|
- **Overhead**: <1% CPU, <5MB memory
|
|
- **Latency**: <100μs per span
|
|
- **Safe for production** at high throughput
|
|
|
|
## Troubleshooting
|
|
|
|
### Traces Not Appearing
|
|
|
|
1. **Check collector is running:**
|
|
```bash
|
|
docker-compose ps
|
|
```
|
|
|
|
2. **Verify endpoint:**
|
|
```go
|
|
Endpoint: "localhost:4317" // Correct
|
|
Endpoint: "http://localhost:4317" // Wrong (no http://)
|
|
```
|
|
|
|
3. **Check logs:**
|
|
```bash
|
|
docker-compose logs otel-collector
|
|
```
|
|
|
|
### Disable Tracing
|
|
|
|
```go
|
|
config := tracing.Config{
|
|
Enabled: false, // Tracing disabled
|
|
}
|
|
```
|
|
|
|
### TLS in Production
|
|
|
|
Update `tracing.go` line with TLS credentials:
|
|
|
|
```go
|
|
client := otlptracegrpc.NewClient(
|
|
otlptracegrpc.WithEndpoint(config.Endpoint),
|
|
otlptracegrpc.WithTLSCredentials(credentials.NewClientTLSFromCert(nil, "")),
|
|
)
|
|
```
|
|
|
|
## Integration with Metrics
|
|
|
|
Combine with metrics for full observability:
|
|
|
|
```go
|
|
import (
|
|
"github.com/bitechdev/ResolveSpec/pkg/metrics"
|
|
"github.com/bitechdev/ResolveSpec/pkg/tracing"
|
|
)
|
|
|
|
// Apply both
|
|
router.Use(metrics.GetProvider().Middleware)
|
|
router.Use(tracing.Middleware)
|
|
```
|
|
|
|
## Distributed Tracing
|
|
|
|
Traces automatically propagate across services via HTTP headers:
|
|
|
|
**Service A:**
|
|
```go
|
|
// Create request with trace context
|
|
req, _ := http.NewRequestWithContext(ctx, "GET", "http://service-b/api", nil)
|
|
resp, _ := client.Do(req)
|
|
```
|
|
|
|
**Service B:**
|
|
```go
|
|
// Trace context automatically extracted by middleware
|
|
router.Use(tracing.Middleware)
|
|
```
|
|
|
|
The trace ID propagates across both services, creating a unified trace.
|