import { Injectable, Logger } from '@nestjs/common';
import { Cron, CronExpression } from '@nestjs/schedule';
import crypto from 'crypto';
import fs from 'fs';
import path from 'path';
import { DatasetsService } from './datasets.service';
import { DbService } from './db.service';
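
/**
 * Housekeeping tasks for the library: pruning and purging file records,
 * cleaning up old scheduler tasks, and scanning dataset destinations for
 * duplicate files.
 */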
@Injectable()
export class MaintenanceService {
  private readonly logger = new Logger(MaintenanceService.name);

  constructor(
    private readonly db: DbService,
    private readonly datasetsService: DatasetsService,
  ) {}
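
  /**
   * Soft-delete the database record for `file` when it falls under one of the
   * watched dataset directories but no longer exists on disk.
   */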
  cleanup(file: string, dirs: string[]) {
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      if (file && dir && file.indexOf(dir) > -1) {
        // The dataset name is the last path segment of the watched directory.
        const dataset = dir.replace(/.*\/(.*)/, '$1');
        const exists = fs.existsSync(file);
        if (!exists) this.db.removeFile(dataset, file, true);
      }
    }
  }
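
  /**
   * Hard-delete records that were soft-deleted more than `dayMs` ago, then
   * schedule the next pass after `cleanerMs` via setTimeout.
   */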
  purge(
    dirs: string[],
    dayMs = 24 * 60 * 60 * 1000,
    cleanerMs = 60 * 60 * 1000,
  ) {
    const ago = new Date(Date.now() - dayMs);
    this.logger.log(
      `Checking for "deleted" records older than ${ago.toISOString()}`,
    );
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      const dataset = dir.replace(/.*\/(.*)/, '$1');
      const files = this.db.getDeletedOlderThan(dataset, ago.toISOString());
      for (const file of files as { input: string }[]) {
        this.logger.log(
          `Purging "${file.input}" (${new Date().toISOString()})`,
        );
        if (file && file.input) this.db.removeFile(dataset, file.input, false);
      }
    }
    setTimeout(() => {
      this.purge(dirs, dayMs, cleanerMs);
    }, cleanerMs);
  }
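
  /**
   * Soft-delete records for files that no longer exist on disk.
   */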
  prune(dirs: string[]) {
    this.logger.log('Checking for any files that no longer exist on disk.');
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      const dataset = dir.replace(/.*\/(.*)/, '$1');
      // Get all files for this dataset (not just successful ones)
      const allFiles = this.db.getAllFiles(dataset);
      for (const file of allFiles as { input: string; status: string }[]) {
        const exists = fs.existsSync(file.input);
        if (!exists) {
          // Only soft delete if not already marked as deleted
          // Note: Database schema changes must be done via migrations in data/migrations/
          if (file.status !== 'deleted') {
            this.logger.log(
              `Marking missing file as deleted: "${file.input}" (${new Date().toISOString()})`,
            );
            this.db.removeFile(dataset, file.input, true); // soft delete
          }
        }
      }
    }
  }

  // Scheduled task cleanup - runs periodically to prevent the task table from growing too large
  @Cron(CronExpression.EVERY_DAY_AT_2AM) // Run daily at 2 AM
  scheduledTaskCleanup() {
    this.logger.log('Running scheduled task cleanup');
    try {
      // Archive completed tasks older than 30 days
      const archiveResult = this.db.archiveOldTasks(30);
      this.logger.log(`Archived ${archiveResult?.changes || 0} old tasks`);
      // Delete failed tasks older than 7 days
      const failedResult = this.db.deleteTasksByStatus('failed', 7);
      this.logger.log(
        `Deleted ${failedResult.changes} failed tasks older than 7 days`,
      );
      // Delete skipped tasks older than 7 days
      const skippedResult = this.db.deleteTasksByStatus('skipped', 7);
      this.logger.log(
        `Deleted ${skippedResult.changes} skipped tasks older than 7 days`,
      );
      // Delete completed tasks older than 90 days (they were archived at 30 days above)
      const completedResult = this.db.deleteTasksByStatus('completed', 90);
      this.logger.log(
        `Deleted ${completedResult.changes} completed tasks older than 90 days`,
      );
    } catch (error) {
      this.logger.error(
        `Error during scheduled task cleanup: ${(error as Error).message}`,
      );
    }
  }

  /**
   * Scan destination folders (as defined in settings.datasets) for duplicate files.
   * Duplicates are recorded in the duplicate_files table for manual review.
   */
  findDuplicateFiles(options: { resetExisting?: boolean } = {}) {
    const { resetExisting = false } = options;
    this.logger.log('Starting duplicate file scan');
    if (resetExisting) {
      this.db.clearDuplicateGroups();
      this.logger.log('Cleared existing duplicate groups');
    }
    const existing = this.db.listDuplicateGroups();
    const existingMap = new Map<
      string,
      { id: number; status: string; files: string[] }
    >();
    for (const row of existing) {
      const key = `${row.dataset}|${row.destination}|${row.hash}|${row.size}`;
      existingMap.set(key, {
        id: row.id,
        status: row.status,
        files: row.files,
      });
    }
    const datasetConfig = this.datasetsService.getDatasetConfig();
    const duplicates: Array<{
      dataset: string;
      destination: string;
      hash: string;
      size: number;
      files: string[];
    }> = [];
    for (const [datasetName, datasetObj] of Object.entries(datasetConfig)) {
      if (
        !datasetObj ||
        datasetObj.enabled === false ||
        datasetObj.enabled === 'false'
      ) {
        this.logger.log(`Skipping disabled dataset: ${datasetName}`);
        continue;
      }
      this.logger.log(`Scanning dataset: ${datasetName}`);
      const destinations = this.collectDestinations(datasetObj);
      for (const destination of destinations) {
        if (!destination || !fs.existsSync(destination)) {
          this.logger.warn(
            `Destination not found for dataset ${datasetName}: ${destination}`,
          );
          continue;
        }
        this.logger.log(`Scanning destination: ${destination}`);
        const groups = this.scanDestinationForDuplicates(destination);
        this.scanForSimilarNames(destination);
        for (const group of groups) {
          const entry = {
            dataset: datasetName,
            destination,
            hash: group.hash,
            size: group.size,
            files: group.files,
          };
          const key = `${entry.dataset}|${entry.destination}|${entry.hash}|${entry.size}`;
          const existingEntry = existingMap.get(key);
          // Skip groups that were previously marked as reviewed
          if (existingEntry && existingEntry.status === 'reviewed') {
            continue;
          }
          duplicates.push(entry);
          if (existingEntry) {
            this.db.updateDuplicateGroupFiles(
              existingEntry.id,
              entry.files,
              'pending',
            );
          } else {
            this.db.saveDuplicateGroup(entry);
          }
        }
        if (groups.length) {
          this.logger.warn(
            `Found ${groups.length} duplicate group(s) in destination ${destination} (dataset: ${datasetName})`,
          );
        }
      }
    }
    this.logger.log(
      `Duplicate scan completed. Processed ${duplicates.length} groups.`,
    );
    return duplicates;
  }
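
  /**
   * Gather every destination path configured for a dataset: the top-level
   * `destination` plus the `destination` of each per-path entry.
   */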
  private collectDestinations(datasetObj: Record<string, any>): Set<string> {
    const destinations = new Set<string>();
    if (datasetObj.destination && typeof datasetObj.destination === 'string') {
      destinations.add(datasetObj.destination);
    }
    for (const [pathKey, cfg] of Object.entries(datasetObj)) {
      if (pathKey === 'enabled') continue;
      if (cfg && typeof cfg === 'object' && cfg.destination) {
        destinations.add(cfg.destination);
      }
    }
    return destinations;
  }
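
  /**
   * Hash every file under a destination and group files whose content hash
   * and size match. Only groups with more than one file are returned.
   */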
  private scanDestinationForDuplicates(destination: string) {
    const files = this.walkFiles(destination);
    this.logger.log(`Found ${files.length} files to scan in ${destination}`);
    const groups = new Map<string, { size: number; files: string[] }>();
    let processed = 0;
    for (const filePath of files) {
      try {
        const stat = fs.statSync(filePath);
        if (!stat.isFile()) continue;
        const hash = this.hashFile(filePath);
        if (hash) {
          const key = `${hash}:${stat.size}`;
          const group = groups.get(key) || { size: stat.size, files: [] };
          group.files.push(filePath);
          groups.set(key, group);
        }
        processed++;
        if (processed % 100 === 0) {
          this.logger.log(
            `Processed ${processed}/${files.length} files in ${destination}`,
          );
        }
      } catch (error) {
        this.logger.warn(
          `Failed to process file for duplicate scan: ${filePath} (${error})`,
        );
      }
    }
    this.logger.log(`Completed scanning ${processed} files in ${destination}`);
    return Array.from(groups.entries())
      .filter(([, group]) => group.files.length > 1)
      .map(([key, group]) => ({
        hash: key.split(':')[0],
        size: group.size,
        files: group.files,
      }));
  }
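
  /**
   * Group files under a destination by lowercased base name (extension
   * stripped) and log any groups that share a name. Results are only logged,
   * not persisted.
   */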
  private scanForSimilarNames(destination: string) {
    const files = this.walkFiles(destination);
    this.logger.log(
      `Checking ${files.length} files for similar names in ${destination}`,
    );
    const nameGroups = new Map<string, string[]>();
    let processed = 0;
    for (const filePath of files) {
      try {
        const stat = fs.statSync(filePath);
        if (!stat.isFile()) continue;
        const baseName = path
          .basename(filePath, path.extname(filePath))
          .toLowerCase();
        const group = nameGroups.get(baseName) || [];
        group.push(filePath);
        nameGroups.set(baseName, group);
        processed++;
        if (processed % 100 === 0) {
          this.logger.log(
            `Processed ${processed}/${files.length} files for similar names in ${destination}`,
          );
        }
      } catch (error) {
        this.logger.warn(
          `Failed to process file for similar name scan: ${filePath} (${error})`,
        );
      }
    }
    this.logger.log(
      `Completed similar name check for ${processed} files in ${destination}`,
    );
    const similarGroups = Array.from(nameGroups.entries())
      .filter(([, files]) => files.length > 1)
      .map(([baseName, files]) => ({ baseName, files }));
    if (similarGroups.length) {
      this.logger.log(
        `Found ${similarGroups.length} groups of files with similar names in ${destination}`,
      );
      for (const group of similarGroups) {
        this.logger.log(
          `Similar: ${group.baseName} - ${group.files.join(', ')}`,
        );
      }
    }
    return similarGroups;
  }
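
  /**
   * Iteratively walk a directory tree and return the paths of all regular
   * files. Unreadable entries are skipped.
   */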
  private walkFiles(root: string) {
    const pending = [root];
    const files: string[] = [];
    while (pending.length) {
      const current = pending.pop();
      if (!current) continue;
      let stat: fs.Stats;
      try {
        stat = fs.statSync(current);
      } catch {
        continue;
      }
      if (stat.isDirectory()) {
        const children = fs.readdirSync(current);
        for (const child of children) {
          pending.push(path.join(current, child));
        }
      } else if (stat.isFile()) {
        files.push(current);
      }
    }
    return files;
  }
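
  /**
   * Return the SHA-1 hex digest of a file, or null if it cannot be read.
   * The file is read into memory in full before hashing.
   */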
  private hashFile(filePath: string): string | null {
    try {
      const hash = crypto.createHash('sha1');
      const data = fs.readFileSync(filePath);
      hash.update(data);
      return hash.digest('hex');
    } catch (error) {
      this.logger.warn(`Hashing failed for ${filePath}: ${error}`);
      return null;
    }
  }
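
  /**
   * Delete the selected files of a duplicate group from disk and update the
   * group record: 'pending' if more than one file remains, otherwise 'purged'.
   */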
  purgeDuplicateFiles(id: number, files: string[], note?: string) {
    const group = this.db.getDuplicateGroup(id);
    if (!group) {
      throw new Error('Duplicate group not found');
    }
    const toDelete = files && files.length > 0 ? files : [];
    const deleted: string[] = [];
    const errors: Array<{ file: string; error: string }> = [];
    for (const filePath of toDelete) {
      try {
        if (fs.existsSync(filePath)) {
          fs.unlinkSync(filePath);
          deleted.push(filePath);
        } else {
          errors.push({ file: filePath, error: 'File not found' });
        }
      } catch (error) {
        errors.push({ file: filePath, error: (error as Error).message });
      }
    }
    const remaining = group.files.filter((f) => !toDelete.includes(f));
    const nextStatus = remaining.length > 1 ? 'pending' : 'purged';
    this.db.updateDuplicateGroupFiles(id, remaining, nextStatus, note);
    return { deleted, errors, remaining, status: nextStatus };
  }
}