Forráskód Böngészése

Convert duplicate scanning to async I/O to prevent blocking the event loop

Timothy Pomeroy 4 hete
szülő
commit
7bd0e9aa10

+ 2 - 2
apps/service/src/app.controller.ts

@@ -408,8 +408,8 @@ export class AppController {
 
   // Duplicate file review endpoints
   @Post('maintenance/duplicates/scan')
-  scanDuplicates(@Body('resetExisting') resetExisting?: boolean) {
-    return this.appService.scanDuplicateFiles(resetExisting);
+  async scanDuplicates(@Body('resetExisting') resetExisting?: boolean) {
+    return await this.appService.scanDuplicateFiles(resetExisting);
   }
 
   @Get('maintenance/duplicates')

+ 2 - 2
apps/service/src/app.service.ts

@@ -190,8 +190,8 @@ export class AppService {
   }
 
   // Duplicate management
-  scanDuplicateFiles(resetExisting?: boolean) {
-    return this.maintenance.findDuplicateFiles({ resetExisting });
+  async scanDuplicateFiles(resetExisting?: boolean) {
+    return await this.maintenance.findDuplicateFiles({ resetExisting });
   }
 
   listDuplicateGroups(status?: string, dataset?: string) {

+ 11 - 10
apps/service/src/maintenance.service.ts

@@ -2,6 +2,7 @@ import { Injectable, Logger } from '@nestjs/common';
 import { Cron, CronExpression } from '@nestjs/schedule';
 import crypto from 'crypto';
 import fs from 'fs';
+import fsPromises from 'fs/promises';
 import path from 'path';
 import { DatasetsService } from './datasets.service';
 import { DbService } from './db.service';
@@ -113,7 +114,7 @@ export class MaintenanceService {
    * Scan destination folders (as defined in settings.datasets) for duplicate files.
    * Duplicates are recorded in the duplicate_files table for manual review.
    */
-  findDuplicateFiles(options: { resetExisting?: boolean } = {}) {
+  async findDuplicateFiles(options: { resetExisting?: boolean } = {}) {
     const { resetExisting = false } = options;
 
     this.logger.log('Starting duplicate file scan');
@@ -168,8 +169,8 @@ export class MaintenanceService {
         }
 
         this.logger.log(`Scanning destination: ${destination}`);
-        const groups = this.scanDestinationForDuplicates(destination);
-        this.scanForSimilarNames(destination);
+        const groups = await this.scanDestinationForDuplicates(destination);
+        await this.scanForSimilarNames(destination);
         for (const group of groups) {
           const entry = {
             dataset: datasetName,
@@ -232,7 +233,7 @@ export class MaintenanceService {
     return destinations;
   }
 
-  private scanDestinationForDuplicates(destination: string) {
+  private async scanDestinationForDuplicates(destination: string) {
     const files = this.walkFiles(destination);
     this.logger.log(`Found ${files.length} files to scan in ${destination}`);
     const groups = new Map<string, { size: number; files: string[] }>();
@@ -240,10 +241,10 @@ export class MaintenanceService {
 
     for (const filePath of files) {
       try {
-        const stat = fs.statSync(filePath);
+        const stat = await fsPromises.stat(filePath);
         if (!stat.isFile()) continue;
 
-        const hash = this.hashFile(filePath);
+        const hash = await this.hashFileAsync(filePath);
         if (hash) {
           const key = `${hash}:${stat.size}`;
           const group = groups.get(key) || { size: stat.size, files: [] };
@@ -274,7 +275,7 @@ export class MaintenanceService {
       }));
   }
 
-  private scanForSimilarNames(destination: string) {
+  private async scanForSimilarNames(destination: string) {
     const files = this.walkFiles(destination);
     this.logger.log(
       `Checking ${files.length} files for similar names in ${destination}`,
@@ -284,7 +285,7 @@ export class MaintenanceService {
 
     for (const filePath of files) {
       try {
-        const stat = fs.statSync(filePath);
+        const stat = await fsPromises.stat(filePath);
         if (!stat.isFile()) continue;
 
         const baseName = path
@@ -356,10 +357,10 @@ export class MaintenanceService {
     return files;
   }
 
-  private hashFile(filePath: string): string | null {
+  private async hashFileAsync(filePath: string): Promise<string | null> {
     try {
+      const data = await fsPromises.readFile(filePath);
       const hash = crypto.createHash('sha1');
-      const data = fs.readFileSync(filePath);
       hash.update(data);
       return hash.digest('hex');
     } catch (error) {