Better server manager

2026-02-16 13:26:12 +00:00 · 2025-12-29 17:19:16 +02:00
parent 8f83e8fdc1
commit d4a6f9c4c2
8 changed files with 1475 additions and 857 deletions
--- a/pkg/server/manager.go
+++ b/pkg/server/manager.go
@@ -4,26 +4,173 @@ import (
 	"context"
 	"crypto/tls"
 	"fmt"
+	"net"
 	"net/http"
+	"os"
+	"os/signal"
 	"sync"
+	"sync/atomic"
+	"syscall"
 	"time"

 	"github.com/bitechdev/ResolveSpec/pkg/logger"
 	"github.com/bitechdev/ResolveSpec/pkg/middleware"
 	"github.com/klauspost/compress/gzhttp"
-	"golang.org/x/net/http2"
 )

-// serverManager manages a collection of server instances.
+// gracefulServer wraps http.Server with graceful shutdown capabilities (internal type)
+type gracefulServer struct {
+	server           *http.Server
+	shutdownTimeout  time.Duration
+	drainTimeout     time.Duration
+	inFlightRequests atomic.Int64
+	isShuttingDown   atomic.Bool
+	shutdownOnce     sync.Once
+	shutdownComplete chan struct{}
+}
+
+// trackRequestsMiddleware counts in-flight requests and rejects new requests with 503 once shutdown has started
+func (gs *gracefulServer) trackRequestsMiddleware(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Check if shutting down
+		if gs.isShuttingDown.Load() {
+			http.Error(w, `{"error":"service_unavailable","message":"Server is shutting down"}`, http.StatusServiceUnavailable)
+			return
+		}
+
+		// Increment in-flight counter
+		gs.inFlightRequests.Add(1)
+		defer gs.inFlightRequests.Add(-1)
+
+		// Serve the request
+		next.ServeHTTP(w, r)
+	})
+}
+
+// shutdown performs graceful shutdown with request draining
+func (gs *gracefulServer) shutdown(ctx context.Context) error {
+	var shutdownErr error
+
+	gs.shutdownOnce.Do(func() {
+		logger.Info("Starting graceful shutdown...")
+
+		// Mark as shutting down (new requests will be rejected)
+		gs.isShuttingDown.Store(true)
+
+		// Create context with timeout
+		shutdownCtx, cancel := context.WithTimeout(ctx, gs.shutdownTimeout)
+		defer cancel()
+
+		// Wait for in-flight requests to complete (with drain timeout)
+		drainCtx, drainCancel := context.WithTimeout(shutdownCtx, gs.drainTimeout)
+		defer drainCancel()
+
+		shutdownErr = gs.drainRequests(drainCtx)
+		if shutdownErr != nil {
+			logger.Error("Error draining requests: %v", shutdownErr)
+		}
+
+		// Shutdown the server
+		logger.Info("Shutting down HTTP server...")
+		if err := gs.server.Shutdown(shutdownCtx); err != nil {
+			logger.Error("Error shutting down server: %v", err)
+			if shutdownErr == nil {
+				shutdownErr = err
+			}
+		}
+
+		logger.Info("Graceful shutdown complete")
+		close(gs.shutdownComplete)
+	})
+
+	return shutdownErr
+}
+
+// drainRequests waits for in-flight requests to complete
+func (gs *gracefulServer) drainRequests(ctx context.Context) error {
+	ticker := time.NewTicker(100 * time.Millisecond)
+	defer ticker.Stop()
+
+	startTime := time.Now()
+
+	for {
+		inFlight := gs.inFlightRequests.Load()
+
+		if inFlight == 0 {
+			logger.Info("All requests drained in %v", time.Since(startTime))
+			return nil
+		}
+
+		select {
+		case <-ctx.Done():
+			logger.Warn("Drain timeout exceeded with %d requests still in flight", inFlight)
+			return fmt.Errorf("drain timeout exceeded: %d requests still in flight", inFlight)
+		case <-ticker.C:
+			logger.Debug("Waiting for %d in-flight requests to complete...", inFlight)
+		}
+	}
+}
+
+// inFlightRequestsCount returns the current number of in-flight requests
+func (gs *gracefulServer) inFlightRequestsCount() int64 {
+	return gs.inFlightRequests.Load()
+}
+
+// isShutdown returns true if the server is shutting down
+func (gs *gracefulServer) isShutdown() bool {
+	return gs.isShuttingDown.Load()
+}
+
+// wait blocks until shutdown is complete
+func (gs *gracefulServer) wait() {
+	<-gs.shutdownComplete
+}
+
+// healthCheckHandler returns a handler that responds to health checks
+func (gs *gracefulServer) healthCheckHandler() http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if gs.isShutdown() {
+			http.Error(w, `{"status":"shutting_down"}`, http.StatusServiceUnavailable)
+			return
+		}
+
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		_, err := w.Write([]byte(`{"status":"healthy"}`))
+		if err != nil {
+			logger.Warn("Failed to write health check response: %v", err)
+		}
+	}
+}
+
+// readinessHandler returns a handler for readiness checks
+func (gs *gracefulServer) readinessHandler() http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if gs.isShutdown() {
+			http.Error(w, `{"ready":false,"reason":"shutting_down"}`, http.StatusServiceUnavailable)
+			return
+		}
+
+		inFlight := gs.inFlightRequestsCount()
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		fmt.Fprintf(w, `{"ready":true,"in_flight_requests":%d}`, inFlight)
+	}
+}
+
+// serverManager manages a collection of server instances with graceful shutdown support.
 type serverManager struct {
-	instances map[string]Instance
-	mu        sync.RWMutex
+	instances         map[string]Instance
+	mu                sync.RWMutex
+	shutdownCallbacks []ShutdownCallback
+	callbacksMu       sync.Mutex
 }

 // NewManager creates a new server manager.
 func NewManager() Manager {
 	return &serverManager{
-		instances: make(map[string]Instance),
+		instances:         make(map[string]Instance),
+		shutdownCallbacks: make([]ShutdownCallback, 0),
 	}
 }

@@ -74,7 +221,7 @@ func (sm *serverManager) Remove(name string) error {
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
 	if err := instance.Stop(ctx); err != nil {
-		logger.Warn("Failed to gracefully stop server '%s' on remove: %v", name, err, context.Background())
+		logger.Warn("Failed to gracefully stop server '%s' on remove: %v", name, err)
 	}

 	delete(sm.instances, name)
@@ -94,7 +241,6 @@ func (sm *serverManager) StartAll() error {
 	}

 	if len(startErrors) > 0 {
-		// In a real-world scenario, you might want a more sophisticated error handling strategy
 		return fmt.Errorf("encountered errors while starting servers: %v", startErrors)
 	}
 	return nil
@@ -102,6 +248,11 @@ func (sm *serverManager) StartAll() error {

 // StopAll gracefully shuts down all running server instances.
 func (sm *serverManager) StopAll() error {
+	return sm.StopAllWithContext(context.Background())
+}
+
+// StopAllWithContext runs the registered shutdown callbacks, then stops all server instances in parallel, using ctx as the parent context.
+func (sm *serverManager) StopAllWithContext(ctx context.Context) error {
 	sm.mu.RLock()
 	instancesToStop := make([]Instance, 0, len(sm.instances))
 	for _, instance := range sm.instances {
@@ -109,19 +260,38 @@ func (sm *serverManager) StopAll() error {
 	}
 	sm.mu.RUnlock()

-	logger.Info("Shutting down all servers...", context.Background())
+	logger.Info("Shutting down all servers...")

+	// Execute shutdown callbacks first
+	sm.callbacksMu.Lock()
+	callbacks := make([]ShutdownCallback, len(sm.shutdownCallbacks))
+	copy(callbacks, sm.shutdownCallbacks)
+	sm.callbacksMu.Unlock()
+
+	if len(callbacks) > 0 {
+		logger.Info("Executing %d shutdown callbacks...", len(callbacks))
+		for i, cb := range callbacks {
+			if err := cb(ctx); err != nil {
+				logger.Error("Shutdown callback %d failed: %v", i+1, err)
+			}
+		}
+	}
+
+	// Stop all instances in parallel
 	var shutdownErrors []error
 	var wg sync.WaitGroup
+	var errorsMu sync.Mutex

 	for _, instance := range instancesToStop {
 		wg.Add(1)
 		go func(inst Instance) {
 			defer wg.Done()
-			ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+			shutdownCtx, cancel := context.WithTimeout(ctx, 15*time.Second)
 			defer cancel()
-			if err := inst.Stop(ctx); err != nil {
-				shutdownErrors = append(shutdownErrors, fmt.Errorf("failed to stop server '%s': %w", inst.Addr(), err))
+			if err := inst.Stop(shutdownCtx); err != nil {
+				errorsMu.Lock()
+				shutdownErrors = append(shutdownErrors, fmt.Errorf("failed to stop server '%s': %w", inst.Name(), err))
+				errorsMu.Unlock()
 			}
 		}(instance)
 	}
@@ -131,13 +301,13 @@ func (sm *serverManager) StopAll() error {
 	if len(shutdownErrors) > 0 {
 		return fmt.Errorf("encountered errors while stopping servers: %v", shutdownErrors)
 	}
-	logger.Info("All servers stopped gracefully.", context.Background())
+	logger.Info("All servers stopped gracefully.")
 	return nil
 }

 // RestartAll gracefully restarts all running server instances.
 func (sm *serverManager) RestartAll() error {
-	logger.Info("Restarting all servers...", context.Background())
+	logger.Info("Restarting all servers...")
 	if err := sm.StopAll(); err != nil {
 		return fmt.Errorf("failed to stop servers during restart: %w", err)
 	}
@@ -148,7 +318,7 @@ func (sm *serverManager) RestartAll() error {
 	if err := sm.StartAll(); err != nil {
 		return fmt.Errorf("failed to start servers during restart: %w", err)
 	}
-	logger.Info("All servers restarted successfully.", context.Background())
+	logger.Info("All servers restarted successfully.")
 	return nil
 }

@@ -164,13 +334,46 @@ func (sm *serverManager) List() []Instance {
 	return instances
 }

+// RegisterShutdownCallback registers a callback to be called during shutdown.
+func (sm *serverManager) RegisterShutdownCallback(cb ShutdownCallback) {
+	sm.callbacksMu.Lock()
+	defer sm.callbacksMu.Unlock()
+	sm.shutdownCallbacks = append(sm.shutdownCallbacks, cb)
+}
+
+// ServeWithGracefulShutdown starts all servers, blocks until SIGINT or SIGTERM is received, then stops them with a 30-second deadline.
+func (sm *serverManager) ServeWithGracefulShutdown() error {
+	// Start all servers
+	if err := sm.StartAll(); err != nil {
+		return fmt.Errorf("failed to start servers: %w", err)
+	}
+
+	logger.Info("All servers started. Waiting for shutdown signal...")
+
+	// Wait for an interrupt or termination signal
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM, syscall.SIGINT)
+
+	sig := <-sigChan
+	logger.Info("Received signal: %v, initiating graceful shutdown", sig)
+
+	// Create context with timeout for shutdown
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	return sm.StopAllWithContext(ctx)
+}
+
 // serverInstance is a concrete implementation of the Instance interface.
+// It wraps gracefulServer to provide graceful shutdown capabilities.
 type serverInstance struct {
-	cfg        Config
-	httpServer *http.Server
-	mu         sync.RWMutex
-	running    bool
-	stopCh     chan struct{}
+	cfg            Config
+	gracefulServer *gracefulServer
+	certFile       string // Path to certificate file (may be temporary for self-signed)
+	keyFile        string // Path to key file (may be temporary for self-signed)
+	mu             sync.RWMutex
+	running        bool
+	serverErr      chan error
 }

 // newInstance creates a new, unstarted server instance from a config.
@@ -179,12 +382,29 @@ func newInstance(cfg Config) (*serverInstance, error) {
 		return nil, fmt.Errorf("handler cannot be nil")
 	}

+	// Set default timeouts
+	if cfg.ShutdownTimeout == 0 {
+		cfg.ShutdownTimeout = 30 * time.Second
+	}
+	if cfg.DrainTimeout == 0 {
+		cfg.DrainTimeout = 25 * time.Second
+	}
+	if cfg.ReadTimeout == 0 {
+		cfg.ReadTimeout = 15 * time.Second
+	}
+	if cfg.WriteTimeout == 0 {
+		cfg.WriteTimeout = 15 * time.Second
+	}
+	if cfg.IdleTimeout == 0 {
+		cfg.IdleTimeout = 60 * time.Second
+	}
+
 	addr := fmt.Sprintf("%s:%d", cfg.Host, cfg.Port)
 	var handler http.Handler = cfg.Handler

 	// Wrap with GZIP handler if enabled
 	if cfg.GZIP {
-		gz, err := gzhttp.NewWrapper(gzhttp.BestSpeed)
+		gz, err := gzhttp.NewWrapper()
 		if err != nil {
 			return nil, fmt.Errorf("failed to create GZIP wrapper: %w", err)
 		}
@@ -194,20 +414,33 @@ func newInstance(cfg Config) (*serverInstance, error) {
 	// Wrap with the panic recovery middleware
 	handler = middleware.PanicRecovery(handler)

-	// Here you could add other default middleware like request logging, metrics, etc.
+	// Configure TLS if any TLS option is enabled
+	tlsConfig, certFile, keyFile, err := configureTLS(cfg)
+	if err != nil {
+		return nil, fmt.Errorf("failed to configure TLS: %w", err)
+	}

-	httpServer := &http.Server{
-		Addr:         addr,
-		Handler:      handler,
-		ReadTimeout:  15 * time.Second,
-		WriteTimeout: 15 * time.Second,
-		IdleTimeout:  60 * time.Second,
+	// Create gracefulServer
+	gracefulSrv := &gracefulServer{
+		server: &http.Server{
+			Addr:         addr,
+			Handler:      handler,
+			ReadTimeout:  cfg.ReadTimeout,
+			WriteTimeout: cfg.WriteTimeout,
+			IdleTimeout:  cfg.IdleTimeout,
+			TLSConfig:    tlsConfig,
+		},
+		shutdownTimeout:  cfg.ShutdownTimeout,
+		drainTimeout:     cfg.DrainTimeout,
+		shutdownComplete: make(chan struct{}),
 	}

 	return &serverInstance{
-		cfg:        cfg,
-		httpServer: httpServer,
-		stopCh:     make(chan struct{}),
+		cfg:            cfg,
+		gracefulServer: gracefulSrv,
+		certFile:       certFile,
+		keyFile:        keyFile,
+		serverErr:      make(chan error, 1),
 	}, nil
 }

@@ -220,42 +453,69 @@ func (s *serverInstance) Start() error {
 		return fmt.Errorf("server '%s' is already running", s.cfg.Name)
 	}

-	hasSSL := s.cfg.SSLCert != "" && s.cfg.SSLKey != ""
+	// Determine if we're using TLS
+	useTLS := s.cfg.SSLCert != "" || s.cfg.SSLKey != "" || s.cfg.SelfSignedSSL || s.cfg.AutoTLS
+
+	// Wrap handler with request tracking
+	s.gracefulServer.server.Handler = s.gracefulServer.trackRequestsMiddleware(s.gracefulServer.server.Handler)

 	go func() {
 		defer func() {
 			s.mu.Lock()
 			s.running = false
 			s.mu.Unlock()
-			logger.Info("Server '%s' stopped.", s.cfg.Name, context.Background())
+			logger.Info("Server '%s' stopped.", s.cfg.Name)
 		}()

 		var err error
 		protocol := "HTTP"

-		if hasSSL {
+		if useTLS {
 			protocol = "HTTPS"
-			// Configure TLS + HTTP/2
-			s.httpServer.TLSConfig = &tls.Config{
-				MinVersion: tls.VersionTLS12,
+			logger.Info("Starting %s server '%s' on %s", protocol, s.cfg.Name, s.Addr())
+
+			// For AutoTLS, we need to use a TLS listener
+			if s.cfg.AutoTLS {
+				// Create listener
+				ln, lnErr := net.Listen("tcp", s.gracefulServer.server.Addr)
+				if lnErr != nil {
+					err = fmt.Errorf("failed to create listener: %w", lnErr)
+				} else {
+					// Wrap with TLS
+					tlsListener := tls.NewListener(ln, s.gracefulServer.server.TLSConfig)
+					err = s.gracefulServer.server.Serve(tlsListener)
+				}
+			} else {
+				// Use certificate files (regular SSL or self-signed)
+				err = s.gracefulServer.server.ListenAndServeTLS(s.certFile, s.keyFile)
 			}
-			            logger.Info("Starting %s server '%s' on %s", protocol, s.cfg.Name, s.Addr(), context.Background())			err = s.httpServer.ListenAndServeTLS(s.cfg.SSLCert, s.cfg.SSLKey)
 		} else {
-			logger.Info("Starting %s server '%s' on %s", protocol, s.cfg.Name, s.Addr(), context.Background())
-			err = s.httpServer.ListenAndServe()
+			logger.Info("Starting %s server '%s' on %s", protocol, s.cfg.Name, s.Addr())
+			err = s.gracefulServer.server.ListenAndServe()
 		}

-		// If the server stopped for a reason other than a graceful shutdown, log the error.
+		// If the server stopped for a reason other than a graceful shutdown, log and report the error.
 		if err != nil && err != http.ErrServerClosed {
-			logger.Error("Server '%s' failed: %v", s.cfg.Name, err, context.Background())
+			logger.Error("Server '%s' failed: %v", s.cfg.Name, err)
+			select {
+			case s.serverErr <- err:
+			default:
+			}
 		}
 	}()

 	s.running = true
 	// A small delay to allow the goroutine to start and potentially fail on binding.
-	// A more robust solution might involve a channel signal.
 	time.Sleep(50 * time.Millisecond)

+	// Check if the server failed to start
+	select {
+	case err := <-s.serverErr:
+		s.running = false
+		return err
+	default:
+	}
+
 	return nil
 }

@@ -269,7 +529,7 @@ func (s *serverInstance) Stop(ctx context.Context) error {
 	}

 	logger.Info("Gracefully shutting down server '%s'...", s.cfg.Name)
-	err := s.httpServer.Shutdown(ctx)
+	err := s.gracefulServer.shutdown(ctx)
 	if err == nil {
 		s.running = false
 	}
@@ -278,5 +538,35 @@ func (s *serverInstance) Stop(ctx context.Context) error {

 // Addr returns the network address the server is listening on.
 func (s *serverInstance) Addr() string {
-	return s.httpServer.Addr
+	return s.gracefulServer.server.Addr
+}
+
+// Name returns the server instance name.
+func (s *serverInstance) Name() string {
+	return s.cfg.Name
+}
+
+// HealthCheckHandler returns a handler that responds to health checks.
+func (s *serverInstance) HealthCheckHandler() http.HandlerFunc {
+	return s.gracefulServer.healthCheckHandler()
+}
+
+// ReadinessHandler returns a handler for readiness checks.
+func (s *serverInstance) ReadinessHandler() http.HandlerFunc {
+	return s.gracefulServer.readinessHandler()
+}
+
+// InFlightRequests returns the current number of in-flight requests.
+func (s *serverInstance) InFlightRequests() int64 {
+	return s.gracefulServer.inFlightRequestsCount()
+}
+
+// IsShuttingDown returns true if the server is shutting down.
+func (s *serverInstance) IsShuttingDown() bool {
+	return s.gracefulServer.isShutdown()
+}
+
+// Wait blocks until shutdown is complete.
+func (s *serverInstance) Wait() {
+	s.gracefulServer.wait()
 }