import { Injectable, Logger } from '@nestjs/common';
import { Cron, CronExpression } from '@nestjs/schedule';
import crypto from 'crypto';
import fs from 'fs';
import path from 'path';

import { DatasetsService } from './datasets.service';
import { DbService } from './db.service';

@Injectable()
export class MaintenanceService {
  private logger = new Logger('MaintenanceService');

  constructor(
    private readonly db: DbService,
    private readonly datasetsService: DatasetsService,
  ) {}

  // Soft-delete the database record for a file that no longer exists on disk.
  cleanup(file: string, dirs: string[]) {
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      if (file && dir && file.indexOf(dir) > -1) {
        const dataset = dir.replace(/.*\/(.*)/, '$1');
        const exists = fs.existsSync(file);
        if (!exists) this.db.removeFile(dataset, file, true);
      }
    }
  }

  // Permanently remove records soft-deleted more than `dayMs` ago,
  // then re-schedule itself every `cleanerMs`.
  purge(
    dirs: string[],
    dayMs = 24 * 60 * 60 * 1000,
    cleanerMs = 60 * 60 * 1000,
  ) {
    const ago = new Date(Date.now() - dayMs);
    this.logger.log(
      `Checking for "deleted" records older than ${ago.toISOString()}`,
    );
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      const dataset = dir.replace(/.*\/(.*)/, '$1');
      const files = this.db.getDeletedOlderThan(dataset, ago.toISOString());
      for (const file of files as { input: string }[]) {
        if (file && file.input) {
          this.logger.log(
            `Purging "${file.input}" (${new Date().toISOString()})`,
          );
          this.db.removeFile(dataset, file.input, false);
        }
      }
    }
    setTimeout(() => {
      this.purge(dirs, dayMs, cleanerMs);
    }, cleanerMs);
  }

  // Soft-delete records whose files no longer exist on disk.
  prune(dirs: string[]) {
    this.logger.log('Checking for any files that no longer exist on disk.');
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      const dataset = dir.replace(/.*\/(.*)/, '$1');
      // Get all files for this dataset (not just successful ones)
      const allFiles = this.db.getAllFiles(dataset);
      for (const file of allFiles as { input: string; status: string }[]) {
        const exists = fs.existsSync(file.input);
        if (!exists) {
          // Only soft delete if not already marked as deleted
          // Note: Database schema changes must be done via migrations in data/migrations/
          if (file.status !== 'deleted') {
            this.logger.log(
              `Marking missing file as deleted: "${file.input}" (${new Date().toISOString()})`,
            );
            this.db.removeFile(dataset, file.input, true); // soft delete
          }
        }
      }
    }
  }

  // Scheduled task cleanup - runs periodically to prevent the task table from growing too large
  @Cron(CronExpression.EVERY_DAY_AT_2AM) // Run daily at 2 AM
  scheduledTaskCleanup() {
    this.logger.log('Running scheduled task cleanup');
    try {
      // Archive completed tasks older than 30 days
      const archiveResult = this.db.archiveOldTasks(30);
      this.logger.log(`Archived ${archiveResult?.changes || 0} old tasks`);

      // Delete failed tasks older than 7 days
      const failedResult = this.db.deleteTasksByStatus('failed', 7);
      this.logger.log(
        `Deleted ${failedResult.changes} failed tasks older than 7 days`,
      );

      // Delete skipped tasks older than 7 days
      const skippedResult = this.db.deleteTasksByStatus('skipped', 7);
      this.logger.log(
        `Deleted ${skippedResult.changes} skipped tasks older than 7 days`,
      );

      // Delete completed tasks older than 90 days (they were archived at 30 days above)
      const completedResult = this.db.deleteTasksByStatus('completed', 90);
      this.logger.log(
        `Deleted ${completedResult.changes} completed tasks older than 90 days`,
      );
    } catch (error) {
      this.logger.error(
        `Error during scheduled task cleanup: ${(error as Error).message}`,
      );
    }
  }
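  // A minimal sketch (not part of this service as written) of how purge() and
  // prune() might be kicked off at startup; the OnModuleInit hook and the way
  // the dataset directories are derived here are assumptions:
  //
  //   onModuleInit() {
  //     const dirs = Object.values(this.datasetsService.getDatasetConfig())
  //       .map((d: any) => d?.destination)
  //       .filter(Boolean);
  //     this.prune(dirs);
  //     this.purge(dirs); // re-schedules itself via setTimeout
  //   }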
  /**
   * Scan destination folders (as defined in settings.datasets) for duplicate files.
   * Duplicates are recorded in the duplicate_files table for manual review.
   */
  findDuplicateFiles(options: { resetExisting?: boolean } = {}) {
    const { resetExisting = false } = options;
    this.logger.log('Starting duplicate file scan');

    if (resetExisting) {
      this.db.clearDuplicateGroups();
      this.logger.log('Cleared existing duplicate groups');
    }

    const existing = this.db.listDuplicateGroups();
    const existingMap = new Map<
      string,
      { id: number; status: string; files: string[] }
    >();
    for (const row of existing) {
      const key = `${row.dataset}|${row.destination}|${row.hash}|${row.size}`;
      existingMap.set(key, {
        id: row.id,
        status: row.status,
        files: row.files,
      });
    }

    const datasetConfig = this.datasetsService.getDatasetConfig();
    const duplicates: Array<{
      dataset: string;
      destination: string;
      hash: string;
      size: number;
      files: string[];
    }> = [];

    for (const [datasetName, datasetObj] of Object.entries(datasetConfig)) {
      if (
        !datasetObj ||
        datasetObj.enabled === false ||
        datasetObj.enabled === 'false'
      ) {
        this.logger.log(`Skipping disabled dataset: ${datasetName}`);
        continue;
      }

      this.logger.log(`Scanning dataset: ${datasetName}`);
      const destinations = this.collectDestinations(datasetObj);

      for (const destination of destinations) {
        if (!destination || !fs.existsSync(destination)) {
          this.logger.warn(
            `Destination not found for dataset ${datasetName}: ${destination}`,
          );
          continue;
        }

        this.logger.log(`Scanning destination: ${destination}`);
        const groups = this.scanDestinationForDuplicates(destination);
        this.scanForSimilarNames(destination);

        for (const group of groups) {
          const entry = {
            dataset: datasetName,
            destination,
            hash: group.hash,
            size: group.size,
            files: group.files,
          };
          const key = `${entry.dataset}|${entry.destination}|${entry.hash}|${entry.size}`;
          const existingEntry = existingMap.get(key);

          // Skip groups that were marked reviewed/ignored previously
          if (existingEntry && existingEntry.status === 'reviewed') {
            continue;
          }

          duplicates.push(entry);
          if (existingEntry) {
            this.db.updateDuplicateGroupFiles(
              existingEntry.id,
              entry.files,
              'pending',
            );
          } else {
            this.db.saveDuplicateGroup(entry);
          }
        }

        if (groups.length) {
          this.logger.warn(
            `Found ${groups.length} duplicate group(s) in destination ${destination} (dataset: ${datasetName})`,
          );
        }
      }
    }

    this.logger.log(
      `Duplicate scan completed. Processed ${duplicates.length} groups.`,
    );
    return duplicates;
  }
  private collectDestinations(datasetObj: Record<string, any>): Set<string> {
    const destinations = new Set<string>();
    if (datasetObj.destination && typeof datasetObj.destination === 'string') {
      destinations.add(datasetObj.destination);
    }
    for (const [pathKey, cfg] of Object.entries(datasetObj)) {
      if (pathKey === 'enabled') continue;
      if (cfg && typeof cfg === 'object' && cfg.destination) {
        destinations.add(cfg.destination);
      }
    }
    return destinations;
  }

  private scanDestinationForDuplicates(destination: string) {
    const files = this.walkFiles(destination);
    this.logger.log(`Found ${files.length} files to scan in ${destination}`);

    // Group files by content hash + size; any group with more than one file is a duplicate set.
    const groups = new Map<string, { size: number; files: string[] }>();
    let processed = 0;
    for (const filePath of files) {
      try {
        const stat = fs.statSync(filePath);
        if (!stat.isFile()) continue;
        const hash = this.hashFile(filePath);
        if (hash) {
          const key = `${hash}:${stat.size}`;
          const group = groups.get(key) || { size: stat.size, files: [] };
          group.files.push(filePath);
          groups.set(key, group);
        }
        processed++;
        if (processed % 100 === 0) {
          this.logger.log(
            `Processed ${processed}/${files.length} files in ${destination}`,
          );
        }
      } catch (error) {
        this.logger.warn(
          `Failed to process file for duplicate scan: ${filePath} (${error})`,
        );
      }
    }
    this.logger.log(`Completed scanning ${processed} files in ${destination}`);

    return Array.from(groups.entries())
      .filter(([, group]) => group.files.length > 1)
      .map(([key, group]) => ({
        hash: key.split(':')[0],
        size: group.size,
        files: group.files,
      }));
  }

  private scanForSimilarNames(destination: string) {
    const files = this.walkFiles(destination);
    this.logger.log(
      `Checking ${files.length} files for similar names in ${destination}`,
    );

    // Group files by lower-cased base name (extension stripped).
    const nameGroups = new Map<string, string[]>();
    let processed = 0;
    for (const filePath of files) {
      try {
        const stat = fs.statSync(filePath);
        if (!stat.isFile()) continue;
        const baseName = path
          .basename(filePath, path.extname(filePath))
          .toLowerCase();
        const group = nameGroups.get(baseName) || [];
        group.push(filePath);
        nameGroups.set(baseName, group);
        processed++;
        if (processed % 100 === 0) {
          this.logger.log(
            `Processed ${processed}/${files.length} files for similar names in ${destination}`,
          );
        }
      } catch (error) {
        this.logger.warn(
          `Failed to process file for similar name scan: ${filePath} (${error})`,
        );
      }
    }
    this.logger.log(
      `Completed similar name check for ${processed} files in ${destination}`,
    );

    const similarGroups = Array.from(nameGroups.entries())
      .filter(([, files]) => files.length > 1)
      .map(([baseName, files]) => ({ baseName, files }));

    if (similarGroups.length) {
      this.logger.log(
        `Found ${similarGroups.length} groups of files with similar names in ${destination}`,
      );
      for (const group of similarGroups) {
        this.logger.log(
          `Similar: ${group.baseName} - ${group.files.join(', ')}`,
        );
      }
    }
    return similarGroups;
  }

  // Iteratively walk a directory tree and return every regular file path.
  private walkFiles(root: string) {
    const pending = [root];
    const files: string[] = [];
    while (pending.length) {
      const current = pending.pop();
      if (!current) continue;
      let stat: fs.Stats;
      try {
        stat = fs.statSync(current);
      } catch {
        continue;
      }
      if (stat.isDirectory()) {
        const children = fs.readdirSync(current);
        for (const child of children) {
          pending.push(path.join(current, child));
        }
      } else if (stat.isFile()) {
        files.push(current);
      }
    }
    return files;
  }

  // SHA-1 of the full file contents; returns null if the file cannot be read.
  private hashFile(filePath: string): string | null {
    try {
      const hash = crypto.createHash('sha1');
      const data = fs.readFileSync(filePath);
      hash.update(data);
      return hash.digest('hex');
    } catch (error) {
      this.logger.warn(`Hashing failed for ${filePath}: ${error}`);
      return null;
    }
  }
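  // A minimal sketch of a streaming alternative to hashFile() for very large
  // files (an assumption, not used by this service); fs.createReadStream avoids
  // reading the whole file into memory before hashing:
  //
  //   private hashFileStreaming(filePath: string): Promise<string> {
  //     return new Promise((resolve, reject) => {
  //       const hash = crypto.createHash('sha1');
  //       fs.createReadStream(filePath)
  //         .on('data', (chunk) => hash.update(chunk))
  //         .on('error', reject)
  //         .on('end', () => resolve(hash.digest('hex')));
  //     });
  //   }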
  // Delete the selected files from disk and update the duplicate group record.
  purgeDuplicateFiles(id: number, files: string[], note?: string) {
    const group = this.db.getDuplicateGroup(id);
    if (!group) {
      throw new Error('Duplicate group not found');
    }

    const toDelete = files && files.length > 0 ? files : [];
    const deleted: string[] = [];
    const errors: Array<{ file: string; error: string }> = [];

    for (const filePath of toDelete) {
      try {
        if (fs.existsSync(filePath)) {
          fs.unlinkSync(filePath);
          deleted.push(filePath);
        } else {
          errors.push({ file: filePath, error: 'File not found' });
        }
      } catch (error) {
        errors.push({ file: filePath, error: (error as Error).message });
      }
    }

    const remaining = group.files.filter((f) => !toDelete.includes(f));
    const nextStatus = remaining.length > 1 ? 'pending' : 'purged';
    this.db.updateDuplicateGroupFiles(id, remaining, nextStatus, note);

    return { deleted, errors, remaining, status: nextStatus };
  }
}
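// A minimal sketch of how this service might be registered so the @Cron job is
// picked up; the module/file name is hypothetical and ScheduleModule.forRoot()
// may already be registered elsewhere in the app:
//
//   // maintenance.module.ts (hypothetical)
//   import { Module } from '@nestjs/common';
//   import { ScheduleModule } from '@nestjs/schedule';
//   import { DatasetsService } from './datasets.service';
//   import { DbService } from './db.service';
//   import { MaintenanceService } from './maintenance.service';
//
//   @Module({
//     imports: [ScheduleModule.forRoot()],
//     providers: [MaintenanceService, DbService, DatasetsService],
//   })
//   export class MaintenanceModule {}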