|
@@ -0,0 +1,305 @@
|
|
|
|
|
+import {
|
|
|
|
|
+ Injectable,
|
|
|
|
|
+ Logger,
|
|
|
|
|
+ OnModuleDestroy,
|
|
|
|
|
+ OnModuleInit,
|
|
|
|
|
+} from '@nestjs/common';
|
|
|
|
|
+import { Cron, CronExpression } from '@nestjs/schedule';
|
|
|
|
|
+import { DbService } from './db.service';
|
|
|
|
|
+import { EventsGateway } from './events.gateway';
|
|
|
|
|
+import { WatcherService } from './watcher.service';
|
|
|
|
|
+
|
|
|
|
|
+export interface WatcherHealthRecord { // Point-in-time health snapshot, as returned by WatcherHealthService.getHealthStatus().
|
|
|
|
|
+ timestamp: string; // ISO-8601 time at which this snapshot was built.
|
|
|
|
|
+ isWatching: boolean; // Watcher running state as reported by WatcherService.status() when the snapshot was built.
|
|
|
|
|
+ lastCheckTime: string; // ISO-8601 time of the most recent health check that completed without throwing.
|
|
|
|
|
+ lastErrorTime?: string; // ISO-8601 time of the most recent recorded error; absent when none has been recorded.
|
|
|
|
|
+ lastErrorMessage?: string; // Text of the most recent recorded error; absent when none has been recorded.
|
|
|
|
|
+ isHealthy: boolean; // Monitor's overall verdict; always false while isWatching is false.
|
|
|
|
|
+ recoveryAttempts: number; // Value of the monitor's recovery-attempt counter when the snapshot was built.
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Supervises the file watcher.
 *
 * On a fixed 30-second cron tick the service compares the watcher's current
 * `isWatching` flag with the value recorded by the previous check. A
 * transition from watching to not-watching is treated as an unexpected stop:
 * it is persisted to the `watcher_errors` table, announced through the events
 * gateway and, when auto-recovery is enabled, followed by one restart attempt.
 *
 * NOTE(review): a stop is classified as "unexpected" purely because the
 * previous check saw the watcher running. Nothing visible in this class
 * distinguishes a user-initiated stop from a crash — confirm that
 * WatcherService (or its callers) accounts for this.
 */
@Injectable()
export class WatcherHealthService implements OnModuleInit, OnModuleDestroy {
  private readonly logger = new Logger('WatcherHealthService');

  /** Watcher state recorded at the end of the previous completed health check. */
  private lastKnownStatus: { isWatching: boolean } | null = null;
  private lastCheckTime: Date = new Date();

  /** Most recent error, kept as history even after the problem is resolved. */
  private lastErrorTime: Date | null = null;
  private lastErrorMessage: string | null = null;

  /**
   * True from the moment an error is recorded until the watcher is recovered
   * or a health check observes it running again. Drives `isHealthy` so that a
   * single past error does not mark the service unhealthy forever.
   */
  private hasUnresolvedError = false;

  private recoveryAttempts = 0;
  private readonly maxRecoveryAttempts = 5;
  private readonly recoveryAttemptsResetInterval = 1000 * 60 * 60; // 1 hour
  private lastRecoveryResetTime = Date.now();

  /**
   * Loaded from the `watcher_health_check_interval` setting. The schedule
   * itself is a fixed cron expression, so this value is currently not applied.
   */
  private healthCheckIntervalMs = 30000; // 30 seconds
  private autoRecoveryEnabled = true;

  constructor(
    private readonly watcherService: WatcherService,
    private readonly db: DbService,
    private readonly eventsGateway: EventsGateway,
  ) {
    this.loadConfig();
  }

  /** Runs an initial health check as soon as the module is initialised. */
  onModuleInit(): void {
    this.logger.log('Watcher health monitor initialized');
    // Fire-and-forget on purpose: performHealthCheck handles its own errors
    // and module start-up should not wait for it.
    void this.performHealthCheck();
  }

  /**
   * Loads persisted monitor settings. Any failure (missing table, malformed
   * JSON, ...) is logged and the in-memory defaults are kept.
   */
  private loadConfig(): void {
    try {
      const autoRecovery = this.readSetting('watcher_auto_recovery');
      if (typeof autoRecovery === 'boolean') {
        this.autoRecoveryEnabled = autoRecovery;
      } else if (autoRecovery !== undefined) {
        // setAutoRecovery only ever stores a JSON boolean; anything else is
        // corrupt and must not end up in a field typed as boolean.
        this.logger.warn(
          `Ignoring non-boolean watcher_auto_recovery setting: ${JSON.stringify(autoRecovery)}`,
        );
      }

      const interval = this.readSetting('watcher_health_check_interval');
      if (typeof interval === 'number' && interval > 0) {
        this.healthCheckIntervalMs = interval;
      }
    } catch (error) {
      this.logger.warn(`Failed to load health monitor config: ${error}`);
    }
  }

  /**
   * Reads one JSON-encoded value from the `settings` table.
   *
   * @returns the parsed value, or `undefined` when the row is missing or empty.
   * @throws when the query fails or the stored value is not valid JSON.
   */
  private readSetting(key: string): unknown {
    const row = this.db
      .getDb()
      .prepare('SELECT value FROM settings WHERE key = ?')
      .get(key) as { value?: string } | undefined;
    return row?.value ? (JSON.parse(row.value) as unknown) : undefined;
  }

  // Runs every 30 seconds. The schedule is fixed by the cron expression; the
  // stored watcher_health_check_interval setting does not change it.
  @Cron(CronExpression.EVERY_30_SECONDS)
  async healthCheckTask() {
    await this.performHealthCheck();
  }

  /**
   * Performs one health check: detects a watching -> stopped transition,
   * records and announces it, optionally attempts recovery, and stores the
   * resulting watcher state for the next check. Never rejects.
   */
  private async performHealthCheck(): Promise<void> {
    try {
      const status = this.watcherService.status();
      const isWatchingNow = status.isWatching;
      // State to remember for the next check; updated if recovery succeeds.
      let isWatching = isWatchingNow;

      if (isWatchingNow) {
        // The watcher is running, so whatever failed earlier is resolved.
        this.hasUnresolvedError = false;
      } else if (this.lastKnownStatus?.isWatching) {
        // Watcher was running at the previous check but is now stopped.
        this.logger.error('ALERT: Watcher stopped unexpectedly!');
        const reason =
          'Watcher stopped unexpectedly without being stopped by user';
        this.lastErrorTime = new Date();
        this.lastErrorMessage = reason;
        this.hasUnresolvedError = true;

        this.logWatcherError(reason);
        this.eventsGateway.emitWatcherUpdate({
          type: 'health_alert',
          healthy: false,
          reason,
        });

        if (this.autoRecoveryEnabled) {
          // Remember the post-recovery state; otherwise a second crash before
          // the next tick would look like "still stopped" and go undetected.
          isWatching = await this.attemptRecovery(status);
        }
      }

      this.lastKnownStatus = { isWatching };
      this.lastCheckTime = new Date();
    } catch (error) {
      this.logger.error(`Health check failed: ${error}`);
      this.lastErrorTime = new Date();
      this.lastErrorMessage = `Health check exception: ${error instanceof Error ? error.message : String(error)}`;
      this.hasUnresolvedError = true;
      this.logWatcherError(this.lastErrorMessage);
    }
  }

  /**
   * Makes one attempt to restart the watcher with the configuration found in
   * `lastStatus`. Attempts are capped at `maxRecoveryAttempts`; the counter is
   * reset after a successful recovery or once the reset interval has elapsed.
   *
   * @param lastStatus status object returned by `WatcherService.status()`.
   *   Typed as `any` because this method reads `watches` and `options`, and
   *   the declared shape of the status object is not visible here.
   * @returns `true` when the watcher reported that it started, else `false`.
   */
  private async attemptRecovery(lastStatus: any): Promise<boolean> {
    // Reset attempts counter if an hour has passed
    if (
      Date.now() - this.lastRecoveryResetTime >
      this.recoveryAttemptsResetInterval
    ) {
      this.recoveryAttempts = 0;
      this.lastRecoveryResetTime = Date.now();
      this.logger.log('Recovery attempts counter reset');
    }

    if (this.recoveryAttempts >= this.maxRecoveryAttempts) {
      this.logger.warn(
        `Maximum recovery attempts (${this.maxRecoveryAttempts}) reached. Giving up.`,
      );
      this.logWatcherError(
        `Failed to recover watcher after ${this.maxRecoveryAttempts} attempts`,
      );
      return false;
    }

    this.recoveryAttempts++;
    // Captured now because the counter is reset to 0 on success, before the
    // success message is written.
    const attempt = this.recoveryAttempts;
    this.logger.warn(
      `Attempting to recover watcher (attempt ${attempt}/${this.maxRecoveryAttempts})...`,
    );

    try {
      // Stop any existing watcher; a failure here must not block the restart.
      try {
        await this.watcherService.stop();
      } catch (e) {
        this.logger.debug(`Error stopping watcher during recovery: ${e}`);
      }

      if (!lastStatus.watches || lastStatus.watches.length === 0) {
        this.logger.warn('Cannot recover: no watches configured');
        this.noteRecoveryFailure('Recovery not possible: no watches configured');
        return false;
      }

      // NOTE(review): the result is used synchronously, which assumes
      // WatcherService.start() is not async — confirm.
      const result = this.watcherService.start(
        lastStatus.watches,
        lastStatus.options,
      );
      if (!result.started) {
        this.logger.error('Recovery failed: watcher did not start');
        this.noteRecoveryFailure(
          'Recovery attempt failed: watcher would not start',
        );
        return false;
      }

      this.logger.log('Watcher successfully recovered');
      this.recoveryAttempts = 0; // Reset on successful recovery
      this.hasUnresolvedError = false;
      this.logWatcherError(
        `Watcher recovered successfully on attempt ${attempt}`,
        attempt,
      );
      this.eventsGateway.emitWatcherUpdate({
        type: 'recovered',
        message: `Watcher recovered successfully after failure`,
      });
      return true;
    } catch (error) {
      this.logger.error(`Recovery attempt failed: ${error}`);
      this.noteRecoveryFailure(
        `Recovery attempt ${attempt} failed: ${error instanceof Error ? error.message : String(error)}`,
      );
      return false;
    }
  }

  /** Stores a failed-recovery message as the latest error and persists it. */
  private noteRecoveryFailure(message: string): void {
    this.lastErrorMessage = message;
    this.hasUnresolvedError = true;
    this.logWatcherError(message);
  }

  /**
   * Returns the database handle after making sure the `watcher_errors` table
   * exists. The DDL is idempotent, so it is safe to run on every call.
   */
  private ensureErrorTable() {
    const db = this.db.getDb();
    db.exec(`
      CREATE TABLE IF NOT EXISTS watcher_errors (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT NOT NULL,
        message TEXT NOT NULL,
        recovery_attempt INTEGER DEFAULT 0,
        created_at TEXT NOT NULL
      );
    `);
    return db;
  }

  /**
   * Appends a message to the `watcher_errors` table and prunes the table to
   * its 100 newest rows. Best effort: failures are logged, never thrown.
   *
   * @param message text to persist.
   * @param recoveryAttempt attempt number stored with the row; defaults to
   *   the current attempt counter.
   */
  private logWatcherError(
    message: string,
    recoveryAttempt: number = this.recoveryAttempts,
  ): void {
    try {
      const db = this.ensureErrorTable();
      const now = new Date().toISOString();

      db.prepare(
        'INSERT INTO watcher_errors (timestamp, message, recovery_attempt, created_at) VALUES (?, ?, ?, ?)',
      ).run(now, message, recoveryAttempt, now);

      // Keep only the last 100 errors to prevent unbounded growth
      const pruned = db
        .prepare(
          `DELETE FROM watcher_errors WHERE id NOT IN (
            SELECT id FROM watcher_errors ORDER BY id DESC LIMIT 100
          )`,
        )
        .run();

      if (pruned.changes > 0) {
        this.logger.debug(`Cleaned up old watcher errors`);
      }
    } catch (error) {
      this.logger.error(`Failed to log watcher error to database: ${error}`);
    }
  }

  /**
   * Get current health status.
   *
   * `isHealthy` is true when the watcher is running and no error is currently
   * unresolved; `lastErrorTime` / `lastErrorMessage` keep describing the most
   * recent error even after it has been resolved.
   */
  getHealthStatus(): WatcherHealthRecord {
    const status = this.watcherService.status();
    return {
      timestamp: new Date().toISOString(),
      isWatching: status.isWatching,
      lastCheckTime: this.lastCheckTime.toISOString(),
      lastErrorTime: this.lastErrorTime?.toISOString(),
      lastErrorMessage: this.lastErrorMessage || undefined,
      isHealthy: status.isWatching && !this.hasUnresolvedError,
      recoveryAttempts: this.recoveryAttempts,
    };
  }

  /**
   * Get recent error logs, newest first.
   *
   * @param limit maximum number of rows; fractional values are truncated and
   *   non-finite values fall back to 20.
   * @returns the rows, or an empty array when the query fails.
   */
  getRecentErrors(limit = 20): Array<{ timestamp: string; message: string }> {
    // A fractional or NaN limit would otherwise fail when bound to LIMIT.
    const safeLimit = Number.isFinite(limit) ? Math.trunc(limit) : 20;
    try {
      return this.ensureErrorTable()
        .prepare(
          `SELECT timestamp, message FROM watcher_errors ORDER BY id DESC LIMIT ?`,
        )
        .all(safeLimit) as Array<{ timestamp: string; message: string }>;
    } catch (error) {
      this.logger.warn(`Failed to read watcher error logs: ${error}`);
      return [];
    }
  }

  /**
   * Clear error logs.
   *
   * @returns the number of rows deleted, or 0 when the delete fails.
   */
  clearErrorLogs(): number {
    try {
      const result = this.ensureErrorTable()
        .prepare('DELETE FROM watcher_errors')
        .run();
      this.logger.log(`Cleared ${result.changes} watcher error logs`);
      return result.changes;
    } catch (error) {
      this.logger.error(`Failed to clear error logs: ${error}`);
      return 0;
    }
  }

  /**
   * Set auto-recovery enabled/disabled. The in-memory flag changes
   * immediately; persisting it is best effort and a failure is only logged.
   */
  setAutoRecovery(enabled: boolean): void {
    this.autoRecoveryEnabled = enabled;
    try {
      this.db
        .getDb()
        .prepare('INSERT OR REPLACE INTO settings (key, value) VALUES (?, ?)')
        .run('watcher_auto_recovery', JSON.stringify(enabled));
      this.logger.log(`Auto-recovery set to ${enabled}`);
    } catch (error) {
      this.logger.error(`Failed to save auto-recovery setting: ${error}`);
    }
  }

  /**
   * Get auto-recovery status
   */
  isAutoRecoveryEnabled(): boolean {
    return this.autoRecoveryEnabled;
  }

  onModuleDestroy(): void {
    this.logger.log('Watcher health monitor destroyed');
  }
}
|