Просмотр исходного кода

Add graceful shutdown system and improve watcher reliability

Backend Improvements:
- Added POST /shutdown endpoint for graceful server shutdown
- Implemented gracefulShutdown() method that properly closes all services
- Added closeDatabase() method to DbService with WAL checkpoint
- Stops task queue gracefully (waits for current task)
- Stops file watcher and clears all resources
- Ensures SQLite database integrity with proper WAL checkpoint

Watcher Reliability Fixes:
- Enabled polling mode for better network filesystem support
- Added awaitWriteFinish to handle incomplete file writes (5s threshold)
- Implemented callback timeout cleanup (5min) to prevent memory leaks
- Added activity monitoring every 5 minutes with health checks
- Enhanced error handling with detailed logging and recovery attempts
- Track event count and last activity time
- Log watcher health status periodically
- Cleanup all resources and timeouts on shutdown

UI Features:
- Created ShutdownButton component with confirmation dialog
- Added shutdown button to settings page
- Shows countdown timer when shutdown initiated
- Displays detailed list of services that will be stopped
- Confirmation dialog prevents accidental shutdowns

Benefits:
- Prevents SQLite database corruption on shutdown
- Ensures all file descriptors are properly closed
- Watcher is more stable for long-running operations
- Memory leaks prevented with timeout cleanup
- Better error recovery and logging
- Easy server shutdown via web UI
Timothy Pomeroy 2 недели назад
Родитель
Commit
407308c60a

+ 36 - 0
apps/service/src/app.controller.ts

@@ -119,6 +119,42 @@ export class AppController {
     };
     };
   }
   }
 
 
+  /** Set once a shutdown has been scheduled, so repeated requests cannot start a second one. */
+  private shutdownRequested = false;
+
+  /**
+   * POST /shutdown — schedules a graceful shutdown of the service and replies
+   * immediately; the actual teardown runs after the handler has returned.
+   *
+   * The teardown calls `appService.gracefulShutdown()`, then closes the Nest
+   * application and exits the process (code 0 on success, 1 on failure).
+   *
+   * NOTE(review): no guard/authentication is visible on this route in this
+   * hunk — confirm the endpoint is not reachable by untrusted clients.
+   *
+   * @returns `{ success, message }`, plus `datetime` once a shutdown is scheduled
+   *          or already running.
+   */
+  @Post('shutdown')
+  async shutdown() {
+    // Capture the reference once so the deferred callback needs no non-null assertion.
+    const app = this.app;
+    if (!app) {
+      return {
+        success: false,
+        message: 'Application not properly initialized',
+      };
+    }
+
+    // A second request must not run the teardown again: it would close the
+    // database and the application twice and race the pending process.exit().
+    if (this.shutdownRequested) {
+      return {
+        success: true,
+        message: 'Shutdown already in progress',
+        datetime: new Date().toISOString(),
+      };
+    }
+    this.shutdownRequested = true;
+
+    // Defer the teardown so this handler can return its response first.
+    setImmediate(async () => {
+      try {
+        console.log('Initiating graceful shutdown...');
+
+        // Perform graceful shutdown of all services
+        await this.appService.gracefulShutdown();
+
+        // Close the NestJS application
+        await app.close();
+
+        console.log('Graceful shutdown complete. Exiting...');
+        process.exit(0);
+      } catch (error) {
+        console.error('Error during graceful shutdown:', error);
+        // Force exit even on error to prevent hanging
+        process.exit(1);
+      }
+    });
+
+    return {
+      success: true,
+      message: 'API service shutting down gracefully...',
+      datetime: new Date().toISOString(),
+    };
+  }
+
   // --- Unified files CRUD endpoints below ---
   // --- Unified files CRUD endpoints below ---
 
 
   // Create a file record
   // Create a file record

+ 41 - 0
apps/service/src/app.service.ts

@@ -190,6 +190,47 @@ export class AppService {
     return this.taskQueue.getQueueStatus();
     return this.taskQueue.getQueueStatus();
   }
   }
 
 
+  /**
+   * Gracefully shut down all services to prevent database corruption.
+   *
+   * Runs three steps in order — task queue, file watcher, database — and keeps
+   * going when a step fails, so one failing step never stops the later ones
+   * from being attempted. This method does not throw; failures are logged and
+   * reported in the result.
+   *
+   * @returns `success` is true only when every step completed; `errors` lists
+   *          one "<step>: <reason>" entry per failed step (empty on success).
+   */
+  async gracefulShutdown() {
+    console.log('Starting graceful shutdown of all services...');
+
+    const steps = [
+      {
+        name: 'task queue',
+        starting: 'Stopping task queue...',
+        finished: 'Task queue stopped',
+        failed: 'Error stopping task queue:',
+        // Graceful stop - wait for current task
+        run: async () => {
+          await this.taskQueue.stop(true);
+        },
+      },
+      {
+        name: 'file watcher',
+        starting: 'Stopping file watcher...',
+        finished: 'File watcher stopped',
+        failed: 'Error stopping watcher:',
+        run: async () => {
+          await this.watcher.stop();
+        },
+      },
+      {
+        name: 'database',
+        starting: 'Closing database connections...',
+        finished: 'Database closed',
+        failed: 'Error closing database:',
+        run: async () => {
+          this.db.closeDatabase();
+        },
+      },
+    ];
+
+    const errors: string[] = [];
+    for (const step of steps) {
+      console.log(step.starting);
+      try {
+        await step.run();
+        console.log(step.finished);
+      } catch (error) {
+        console.error(step.failed, error);
+        const reason = error instanceof Error ? error.message : String(error);
+        errors.push(`${step.name}: ${reason}`);
+      }
+    }
+
+    // Report the real outcome instead of claiming success unconditionally.
+    const success = errors.length === 0;
+    console.log(
+      success
+        ? 'Graceful shutdown complete'
+        : `Graceful shutdown finished with ${errors.length} error(s)`,
+    );
+
+    return {
+      success,
+      message: success
+        ? 'All services shut down gracefully'
+        : 'Shutdown finished, but some services failed to stop cleanly',
+      timestamp: new Date().toISOString(),
+      errors,
+    };
+  }
+
   // Task maintenance methods
   // Task maintenance methods
   cleanupTasksByStatus(status: string, olderThanDays?: number) {
   cleanupTasksByStatus(status: string, olderThanDays?: number) {
     return this.db.deleteTasksByStatus(status, olderThanDays);
     return this.db.deleteTasksByStatus(status, olderThanDays);

+ 21 - 0
apps/service/src/db.service.ts

@@ -63,6 +63,27 @@ export class DbService {
     return this.db;
     return this.db;
   }
   }
 
 
+  /**
+   * Gracefully close the database connection.
+   *
+   * Checkpoints the WAL with `wal_checkpoint(TRUNCATE)` and then closes the
+   * connection. The connection is closed even when the checkpoint fails, so a
+   * checkpoint error can no longer leave the handle open.
+   *
+   * Does nothing when there is no connection or it reports itself closed.
+   * NOTE(review): `open` is assumed to be better-sqlite3's connection flag
+   * (the `pragma`/`prepare` usage suggests that driver) — confirm.
+   *
+   * @throws rethrows any error raised by the checkpoint or by `close()`.
+   */
+  closeDatabase(): void {
+    // Only an explicit `open === false` counts as "already closed".
+    if (!this.db || this.db.open === false) {
+      return;
+    }
+
+    try {
+      try {
+        // Checkpoint the WAL file to ensure all data is written to the main database
+        this.db.pragma('wal_checkpoint(TRUNCATE)');
+        console.log('Database WAL checkpoint completed');
+      } finally {
+        // Always release the connection, whether or not the checkpoint succeeded.
+        this.db.close();
+        console.log('Database connection closed');
+      }
+    } catch (error) {
+      console.error('Error closing database:', error);
+      throw error;
+    }
+  }
+
   // List all files
   // List all files
   listAllFiles() {
   listAllFiles() {
     return this.db.prepare('SELECT * FROM files').all();
     return this.db.prepare('SELECT * FROM files').all();

+ 146 - 3
apps/service/src/watcher.service.ts

@@ -24,6 +24,10 @@ export class WatcherService implements OnModuleDestroy {
   private logger = new Logger('WatcherService');
   private logger = new Logger('WatcherService');
   private validationWorker: Worker;
   private validationWorker: Worker;
   private validationCallbacks = new Map<string, (result: any) => void>();
   private validationCallbacks = new Map<string, (result: any) => void>();
+  private callbackTimeouts = new Map<string, NodeJS.Timeout>();
+  private lastEventTime: Date = new Date();
+  private activityCheckInterval: NodeJS.Timeout | null = null;
+  private eventCount = 0;
 
 
   constructor(
   constructor(
     @Inject(DatasetsService) private readonly datasetsService: DatasetsService,
     @Inject(DatasetsService) private readonly datasetsService: DatasetsService,
@@ -159,12 +163,27 @@ export class WatcherService implements OnModuleDestroy {
       });
       });
     };
     };
 
 
-    // Override options to be more conservative for file descriptor limits
+    // Override options with robust settings for long-running stability
     const conservativeOptions = {
     const conservativeOptions = {
       ...options,
       ...options,
+      // Polling is more reliable for network filesystems and prevents watcher from dying
+      usePolling: options.usePolling !== undefined ? options.usePolling : true,
       interval: Math.max(options.interval || 10000, 30000), // Minimum 30 seconds
       interval: Math.max(options.interval || 10000, 30000), // Minimum 30 seconds
+      binaryInterval: 60000, // Check binary files less frequently
       depth: options.depth !== undefined ? options.depth : 1,
       depth: options.depth !== undefined ? options.depth : 1,
       ignorePermissionErrors: true,
       ignorePermissionErrors: true,
+      // Wait for file writes to finish before emitting events
+      awaitWriteFinish: {
+        stabilityThreshold: 5000, // Wait 5 seconds after last change
+        pollInterval: 1000, // Check every second
+      },
+      // Prevent file descriptor leaks
+      persistent: true,
+      // Better error handling
+      ignoreInitial: false,
+      followSymlinks: false,
+      // Atomic write detection
+      atomic: true,
       ignored: (filePath: string) => {
       ignored: (filePath: string) => {
         // Use the shouldWatchFile function to filter files
         // Use the shouldWatchFile function to filter files
         return !shouldWatchFile(filePath);
         return !shouldWatchFile(filePath);
@@ -175,26 +194,43 @@ export class WatcherService implements OnModuleDestroy {
     this.isWatching = true;
     this.isWatching = true;
     this.lastWatches = enabledWatches;
     this.lastWatches = enabledWatches;
     this.lastOptions = conservativeOptions;
     this.lastOptions = conservativeOptions;
+    this.lastEventTime = new Date();
+    this.eventCount = 0;
+
     this.watcher
     this.watcher
       .on('add', (file: string) => {
       .on('add', (file: string) => {
+        this.updateActivity('add');
         this.handleFileAdded(file);
         this.handleFileAdded(file);
       })
       })
       .on('change', (file: string) => {
       .on('change', (file: string) => {
+        this.updateActivity('change');
         this.eventsGateway.emitFileUpdate({ type: 'change', file });
         this.eventsGateway.emitFileUpdate({ type: 'change', file });
       })
       })
       .on('unlink', (file: string) => {
       .on('unlink', (file: string) => {
+        this.updateActivity('unlink');
         this.eventsGateway.emitFileUpdate({ type: 'unlink', file });
         this.eventsGateway.emitFileUpdate({ type: 'unlink', file });
       })
       })
       .on('error', (error: Error) => {
       .on('error', (error: Error) => {
-        this.logger.error(`Watcher error: ${error}`);
+        this.logger.error(`Watcher error: ${error.message}`);
+        this.logger.error(`Error stack: ${error.stack}`);
         this.eventsGateway.emitWatcherUpdate({
         this.eventsGateway.emitWatcherUpdate({
           type: 'error',
           type: 'error',
           error: error.message,
           error: error.message,
         });
         });
+        // Don't let errors kill the watcher - try to recover
+        this.handleWatcherError(error);
       })
       })
       .on('ready', () => {
       .on('ready', () => {
         this.logger.log('Watcher is ready and monitoring for changes');
         this.logger.log('Watcher is ready and monitoring for changes');
+        this.logger.log(`Watching ${enabledWatches.length} path(s)`);
+        this.logger.log(`Polling enabled: ${conservativeOptions.usePolling}`);
+        this.startActivityMonitor();
+      })
+      .on('raw', (event, path, details) => {
+        // Log raw events for debugging (can be disabled in production)
+        this.logger.debug(`Raw event: ${event} on ${path}`);
       });
       });
+
     this.eventsGateway.emitWatcherUpdate({
     this.eventsGateway.emitWatcherUpdate({
       type: 'started',
       type: 'started',
       watches: enabledWatches,
       watches: enabledWatches,
@@ -258,8 +294,15 @@ export class WatcherService implements OnModuleDestroy {
       }
       }
     }
     }
 
 
-    // Offload validation to worker
+    // Offload validation to worker with timeout to prevent memory leaks
     this.validationCallbacks.set(file, (result) => {
     this.validationCallbacks.set(file, (result) => {
+      // Clear timeout when callback is called
+      const timeout = this.callbackTimeouts.get(file);
+      if (timeout) {
+        clearTimeout(timeout);
+        this.callbackTimeouts.delete(file);
+      }
+
       if (!result.isValid) {
       if (!result.isValid) {
         this.logger.warn(`File appears to be corrupted or incomplete: ${file}`);
         this.logger.warn(`File appears to be corrupted or incomplete: ${file}`);
         return;
         return;
@@ -269,6 +312,16 @@ export class WatcherService implements OnModuleDestroy {
       this.processValidFile(file, dataset);
       this.processValidFile(file, dataset);
     });
     });
 
 
+    // Set timeout to cleanup callback if worker doesn't respond within 5 minutes
+    const timeout = setTimeout(() => {
+      if (this.validationCallbacks.has(file)) {
+        this.logger.warn(`Validation timeout for file: ${file}`);
+        this.validationCallbacks.delete(file);
+        this.callbackTimeouts.delete(file);
+      }
+    }, 300000); // 5 minutes
+
+    this.callbackTimeouts.set(file, timeout);
     this.validationWorker.postMessage({ type: 'validate_file', file });
     this.validationWorker.postMessage({ type: 'validate_file', file });
   }
   }
 
 
@@ -519,7 +572,81 @@ export class WatcherService implements OnModuleDestroy {
     return null;
     return null;
   }
   }
 
 
+  /**
+   * Records that a watcher event was received: bumps the event counter,
+   * refreshes the last-activity timestamp, and logs a summary on every
+   * 100th event.
+   *
+   * @param eventType label of the event that triggered the update (used only in the log line)
+   */
+  private updateActivity(eventType: string) {
+    this.eventCount += 1;
+    this.lastEventTime = new Date();
+
+    const reachedLogMilestone = this.eventCount % 100 === 0;
+    if (!reachedLogMilestone) {
+      return;
+    }
+
+    this.logger.log(
+      `Watcher activity: ${this.eventCount} events processed, last: ${eventType}`,
+    );
+  }
+
+  /**
+   * (Re)starts the periodic health check. Any previously scheduled check is
+   * cancelled first. Every 5 minutes the check logs the event count, the
+   * minutes since the last event, and the watching status; while a watcher
+   * is active it also logs how many directories are watched and emits a
+   * `health_alert` update when none are watched although watches were configured.
+   */
+  private startActivityMonitor() {
+    // Replace, rather than stack, an already running monitor
+    if (this.activityCheckInterval) {
+      clearInterval(this.activityCheckInterval);
+    }
+
+    const runHealthCheck = () => {
+      const idleMs = new Date().getTime() - this.lastEventTime.getTime();
+      const idleMinutes = Math.floor(idleMs / 60000);
+
+      this.logger.log(
+        `Watcher health check - Events: ${this.eventCount}, Last activity: ${idleMinutes}m ago, Status: ${this.isWatching ? 'active' : 'inactive'}`,
+      );
+
+      // Nothing more to verify unless a watcher exists and is marked as watching
+      if (!(this.watcher && this.isWatching)) {
+        return;
+      }
+
+      const directoryCount = Object.keys(this.watcher.getWatched()).length;
+      this.logger.log(`Currently watching ${directoryCount} directories`);
+
+      const lostAllPaths = directoryCount === 0 && this.lastWatches.length > 0;
+      if (lostAllPaths) {
+        this.logger.error(
+          'CRITICAL: Watcher has no watched paths but should be watching!',
+        );
+        this.eventsGateway.emitWatcherUpdate({
+          type: 'health_alert',
+          healthy: false,
+          reason: 'Watcher lost all watched paths',
+        });
+      }
+    };
+
+    // Every 5 minutes
+    this.activityCheckInterval = setInterval(runHealthCheck, 300000);
+  }
+
+  /**
+   * Called after a watcher `error` event. Logs that the watcher is attempting
+   * to continue and reports whether it still has watched directories.
+   * It only inspects and logs; it does not restart the watcher.
+   *
+   * @param error the error reported by the watcher (not used here)
+   */
+  private handleWatcherError(error: Error) {
+    this.logger.error(
+      'Watcher encountered an error, attempting to continue...',
+    );
+
+    // Without a watcher instance there is nothing to inspect
+    if (!this.watcher) {
+      return;
+    }
+
+    try {
+      const watchedPaths = this.watcher.getWatched();
+      const pathCount = Object.keys(watchedPaths).length;
+      if (pathCount === 0) {
+        this.logger.error('Watcher has stopped watching paths after error!');
+      } else {
+        this.logger.log(`Watcher still monitoring ${pathCount} directories`);
+      }
+    } catch (e: unknown) {
+      // A thrown value is not guaranteed to be an Error; narrow before reading `.message`
+      const reason = e instanceof Error ? e.message : String(e);
+      this.logger.error(`Cannot check watcher status: ${reason}`);
+    }
+  }
+
   async stop() {
   async stop() {
+    // Stop activity monitor
+    if (this.activityCheckInterval) {
+      clearInterval(this.activityCheckInterval);
+      this.activityCheckInterval = null;
+    }
+
     // If status shows we're watching, force stop regardless of watcher object state
     // If status shows we're watching, force stop regardless of watcher object state
     if (this.isWatching) {
     if (this.isWatching) {
       if (this.watcher) {
       if (this.watcher) {
@@ -553,6 +680,18 @@ export class WatcherService implements OnModuleDestroy {
   async onModuleDestroy() {
   async onModuleDestroy() {
     // Clean up resources on application shutdown
     // Clean up resources on application shutdown
     try {
     try {
+      // Stop activity monitor
+      if (this.activityCheckInterval) {
+        clearInterval(this.activityCheckInterval);
+        this.activityCheckInterval = null;
+      }
+
+      // Clear all callback timeouts
+      for (const timeout of this.callbackTimeouts.values()) {
+        clearTimeout(timeout);
+      }
+      this.callbackTimeouts.clear();
+
       // Close the watcher if it's running
       // Close the watcher if it's running
       if (this.watcher && this.isWatching) {
       if (this.watcher && this.isWatching) {
         await this.watcher.close();
         await this.watcher.close();
@@ -567,6 +706,10 @@ export class WatcherService implements OnModuleDestroy {
 
 
       // Clear callbacks
       // Clear callbacks
       this.validationCallbacks.clear();
       this.validationCallbacks.clear();
+
+      this.logger.log(
+        `Watcher destroyed. Total events processed: ${this.eventCount}`,
+      );
     } catch (error) {
     } catch (error) {
       this.logger.error(`Error during module destroy: ${error}`);
       this.logger.error(`Error during module destroy: ${error}`);
     }
     }

+ 156 - 0
apps/web/src/app/components/ShutdownButton.tsx

@@ -0,0 +1,156 @@
+"use client";
+import { useMutation } from "@tanstack/react-query";
+import { useEffect, useState } from "react";
+import toast from "react-hot-toast";
+import { post } from "../../lib/api";
+
+/** Power-symbol icon shared by the trigger button and the countdown banner. */
+function PowerIcon({ className }: { className: string }) {
+  return (
+    <svg
+      className={className}
+      fill="none"
+      viewBox="0 0 24 24"
+      strokeWidth="1.5"
+      stroke="currentColor"
+    >
+      <path
+        strokeLinecap="round"
+        strokeLinejoin="round"
+        d="M5.636 5.636a9 9 0 1012.728 0M12 3v9"
+      />
+    </svg>
+  );
+}
+
+/**
+ * Button that shuts the server down via `POST /shutdown`.
+ *
+ * Flow: trigger button -> confirmation panel -> request -> 10-second countdown
+ * banner -> final "server has shut down" toast. A failed request shows an
+ * error toast and returns to the trigger button.
+ */
+export default function ShutdownButton() {
+  const [showConfirm, setShowConfirm] = useState(false);
+  // Seconds left on the post-shutdown countdown; null while no countdown is running.
+  const [countdown, setCountdown] = useState<number | null>(null);
+
+  // Drive the countdown from state with one timer per tick. The cleanup cancels
+  // the pending tick, so nothing fires after the component unmounts.
+  useEffect(() => {
+    if (countdown === null) {
+      return;
+    }
+    const tick = setTimeout(() => {
+      if (countdown <= 1) {
+        setCountdown(null);
+        toast.error("Server has shut down. Reconnection may fail.", {
+          duration: 10000,
+        });
+      } else {
+        setCountdown(countdown - 1);
+      }
+    }, 1000);
+    return () => clearTimeout(tick);
+  }, [countdown]);
+
+  const shutdownMutation = useMutation({
+    mutationFn: () => post("/shutdown", {}),
+    onSuccess: () => {
+      toast.success("Server is shutting down gracefully...");
+      setCountdown(10);
+    },
+    onError: (error: any) => {
+      toast.error(
+        `Failed to shutdown server: ${error?.message || "Unknown error"}`
+      );
+    },
+    // Close the dialog only once the request has finished, so the disabled
+    // "Shutting down..." state stays visible while it is in flight.
+    onSettled: () => setShowConfirm(false),
+  });
+
+  const handleShutdown = () => {
+    // Ignore repeated clicks while a request is already in flight.
+    if (shutdownMutation.isPending) {
+      return;
+    }
+    shutdownMutation.mutate();
+  };
+
+  if (countdown !== null) {
+    return (
+      <div className="bg-red-500/10 border border-red-500/20 rounded-lg p-4">
+        <div className="flex items-center gap-3">
+          <div className="flex-shrink-0">
+            <PowerIcon className="h-6 w-6 text-red-400 animate-pulse" />
+          </div>
+          <div className="flex-1">
+            <p className="text-sm font-medium text-red-300">
+              Server shutting down...
+            </p>
+            <p className="text-xs text-red-400">
+              Connection will be lost in {countdown} seconds
+            </p>
+          </div>
+        </div>
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      {!showConfirm ? (
+        <button
+          onClick={() => setShowConfirm(true)}
+          className="inline-flex items-center gap-2 px-4 py-2 text-sm font-medium text-red-300 bg-red-500/10 hover:bg-red-500/20 border border-red-500/30 rounded-lg transition-colors"
+        >
+          <PowerIcon className="h-4 w-4" />
+          Shutdown Server
+        </button>
+      ) : (
+        <div className="bg-red-500/10 border border-red-500/20 rounded-lg p-4">
+          <div className="space-y-3">
+            <div className="flex items-start gap-3">
+              <div className="flex-shrink-0">
+                <svg
+                  className="h-6 w-6 text-red-400"
+                  fill="none"
+                  viewBox="0 0 24 24"
+                  strokeWidth="1.5"
+                  stroke="currentColor"
+                >
+                  <path
+                    strokeLinecap="round"
+                    strokeLinejoin="round"
+                    d="M12 9v3.75m-9.303 3.376c-.866 1.5.217 3.374 1.948 3.374h14.71c1.73 0 2.813-1.874 1.948-3.374L13.949 3.378c-.866-1.5-3.032-1.5-3.898 0L2.697 16.126zM12 15.75h.007v.008H12v-.008z"
+                  />
+                </svg>
+              </div>
+              <div className="flex-1">
+                <h3 className="text-sm font-semibold text-red-300">
+                  Confirm Server Shutdown
+                </h3>
+                <p className="text-xs text-red-400 mt-1">
+                  This will gracefully stop all services including:
+                </p>
+                <ul className="text-xs text-red-400 mt-2 space-y-1 list-disc list-inside">
+                  <li>File watcher monitoring</li>
+                  <li>Task queue processing</li>
+                  <li>Database connections</li>
+                  <li>All active workers</li>
+                </ul>
+                <p className="text-xs text-red-400 mt-2 font-medium">
+                  The server will need to be manually restarted.
+                </p>
+              </div>
+            </div>
+            <div className="flex gap-2">
+              <button
+                onClick={handleShutdown}
+                disabled={shutdownMutation.isPending}
+                className="flex-1 px-3 py-2 text-sm font-medium text-white bg-red-600 hover:bg-red-700 rounded-lg transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
+              >
+                {shutdownMutation.isPending
+                  ? "Shutting down..."
+                  : "Yes, Shutdown"}
+              </button>
+              <button
+                onClick={() => setShowConfirm(false)}
+                disabled={shutdownMutation.isPending}
+                className="flex-1 px-3 py-2 text-sm font-medium text-gray-300 bg-white/10 hover:bg-white/20 rounded-lg transition-colors disabled:opacity-50"
+              >
+                Cancel
+              </button>
+            </div>
+          </div>
+        </div>
+      )}
+    </div>
+  );
+}

+ 27 - 0
apps/web/src/app/settings/page.tsx

@@ -1,5 +1,6 @@
 import SettingsCrud from "../components/SettingsCrud";
 import SettingsCrud from "../components/SettingsCrud";
 import SettingsList from "../components/SettingsList";
 import SettingsList from "../components/SettingsList";
+import ShutdownButton from "../components/ShutdownButton";
 import WatcherControls from "../components/WatcherControls";
 import WatcherControls from "../components/WatcherControls";
 
 
 export default function SettingsPage() {
 export default function SettingsPage() {
@@ -48,6 +49,32 @@ export default function SettingsPage() {
       <div className="bg-white dark:bg-gray-900 rounded-xl shadow-sm ring-1 ring-gray-200 dark:ring-gray-800 overflow-hidden">
       <div className="bg-white dark:bg-gray-900 rounded-xl shadow-sm ring-1 ring-gray-200 dark:ring-gray-800 overflow-hidden">
         <SettingsList />
         <SettingsList />
       </div>
       </div>
+
+      {/* Server Management Section */}
+      <div className="mt-8 bg-white dark:bg-gray-900 rounded-xl shadow-sm ring-1 ring-gray-200 dark:ring-gray-800 overflow-hidden">
+        <div className="px-6 py-4 border-b border-gray-200 dark:border-gray-800">
+          <h3 className="text-base font-semibold text-gray-900 dark:text-white">
+            Server Management
+          </h3>
+          <p className="text-sm text-gray-600 dark:text-gray-400 mt-1">
+            Control server lifecycle and perform maintenance operations
+          </p>
+        </div>
+        <div className="px-6 py-4">
+          <div className="space-y-4">
+            <div>
+              <h4 className="text-sm font-medium text-gray-900 dark:text-white mb-2">
+                Graceful Shutdown
+              </h4>
+              <p className="text-xs text-gray-600 dark:text-gray-400 mb-3">
+                Safely shut down the server by stopping all watchers, completing active tasks,
+                and properly closing database connections to prevent corruption.
+              </p>
+              <ShutdownButton />
+            </div>
+          </div>
+        </div>
+      </div>
     </div>
     </div>
   );
   );
 }
 }