|
@@ -1,8 +1,8 @@
|
|
|
-import { parentPort } from 'worker_threads';
|
|
|
|
|
import crypto from 'crypto';
|
|
import crypto from 'crypto';
|
|
|
import fs from 'fs';
|
|
import fs from 'fs';
|
|
|
import fsPromises from 'fs/promises';
|
|
import fsPromises from 'fs/promises';
|
|
|
import path from 'path';
|
|
import path from 'path';
|
|
|
|
|
+import { parentPort } from 'worker_threads';
|
|
|
|
|
|
|
|
interface ScanResult {
|
|
interface ScanResult {
|
|
|
dataset: string;
|
|
dataset: string;
|
|
@@ -57,7 +57,9 @@ async function hashFileAsync(filePath: string): Promise<string | null> {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-async function scanDestinationForDuplicates(destination: string): Promise<ScanResult[]> {
|
|
|
|
|
|
|
+async function scanDestinationForDuplicates(
|
|
|
|
|
+ destination: string,
|
|
|
|
|
+): Promise<ScanResult[]> {
|
|
|
const files = walkFiles(destination);
|
|
const files = walkFiles(destination);
|
|
|
console.log(`Worker: Found ${files.length} files to scan in ${destination}`);
|
|
console.log(`Worker: Found ${files.length} files to scan in ${destination}`);
|
|
|
const groups = new Map<string, { size: number; files: string[] }>();
|
|
const groups = new Map<string, { size: number; files: string[] }>();
|
|
@@ -77,27 +79,40 @@ async function scanDestinationForDuplicates(destination: string): Promise<ScanRe
|
|
|
}
|
|
}
|
|
|
processed++;
|
|
processed++;
|
|
|
if (processed % 100 === 0) {
|
|
if (processed % 100 === 0) {
|
|
|
- console.log(`Worker: Processed ${processed}/${files.length} files in ${destination}`);
|
|
|
|
|
|
|
+ console.log(
|
|
|
|
|
+ `Worker: Processed ${processed}/${files.length} files in ${destination}`,
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
} catch (error) {
|
|
} catch (error) {
|
|
|
- console.warn(`Worker: Failed to process file for duplicate scan: ${filePath} (${error})`);
|
|
|
|
|
|
|
+ console.warn(
|
|
|
|
|
+ `Worker: Failed to process file for duplicate scan: ${filePath} (${error})`,
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- console.log(`Worker: Completed scanning ${processed} files in ${destination}`);
|
|
|
|
|
|
|
+ console.log(
|
|
|
|
|
+ `Worker: Completed scanning ${processed} files in ${destination}`,
|
|
|
|
|
+ );
|
|
|
|
|
|
|
|
return Array.from(groups.entries())
|
|
return Array.from(groups.entries())
|
|
|
.filter(([, group]) => group.files.length > 1)
|
|
.filter(([, group]) => group.files.length > 1)
|
|
|
- .map(([key, group]) => ({
|
|
|
|
|
- hash: key.split(':')[0],
|
|
|
|
|
- size: group.size,
|
|
|
|
|
- files: group.files,
|
|
|
|
|
- } as ScanResult));
|
|
|
|
|
|
|
+ .map(
|
|
|
|
|
+ ([key, group]) =>
|
|
|
|
|
+ ({
|
|
|
|
|
+ hash: key.split(':')[0],
|
|
|
|
|
+ size: group.size,
|
|
|
|
|
+ files: group.files,
|
|
|
|
|
+ }) as ScanResult,
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-async function scanForSimilarNames(destination: string): Promise<SimilarResult[]> {
|
|
|
|
|
|
|
+async function scanForSimilarNames(
|
|
|
|
|
+ destination: string,
|
|
|
|
|
+): Promise<SimilarResult[]> {
|
|
|
const files = walkFiles(destination);
|
|
const files = walkFiles(destination);
|
|
|
- console.log(`Worker: Checking ${files.length} files for similar names in ${destination}`);
|
|
|
|
|
|
|
+ console.log(
|
|
|
|
|
+ `Worker: Checking ${files.length} files for similar names in ${destination}`,
|
|
|
|
|
+ );
|
|
|
const nameGroups = new Map<string, string[]>();
|
|
const nameGroups = new Map<string, string[]>();
|
|
|
let processed = 0;
|
|
let processed = 0;
|
|
|
|
|
|
|
@@ -106,45 +121,55 @@ async function scanForSimilarNames(destination: string): Promise<SimilarResult[]
|
|
|
const stat = await fsPromises.stat(filePath);
|
|
const stat = await fsPromises.stat(filePath);
|
|
|
if (!stat.isFile()) continue;
|
|
if (!stat.isFile()) continue;
|
|
|
|
|
|
|
|
- const baseName = path.basename(filePath, path.extname(filePath)).toLowerCase();
|
|
|
|
|
|
|
+ const baseName = path
|
|
|
|
|
+ .basename(filePath, path.extname(filePath))
|
|
|
|
|
+ .toLowerCase();
|
|
|
const group = nameGroups.get(baseName) || [];
|
|
const group = nameGroups.get(baseName) || [];
|
|
|
group.push(filePath);
|
|
group.push(filePath);
|
|
|
nameGroups.set(baseName, group);
|
|
nameGroups.set(baseName, group);
|
|
|
processed++;
|
|
processed++;
|
|
|
if (processed % 100 === 0) {
|
|
if (processed % 100 === 0) {
|
|
|
- console.log(`Worker: Processed ${processed}/${files.length} files for similar names in ${destination}`);
|
|
|
|
|
|
|
+ console.log(
|
|
|
|
|
+ `Worker: Processed ${processed}/${files.length} files for similar names in ${destination}`,
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
} catch (error) {
|
|
} catch (error) {
|
|
|
- console.warn(`Worker: Failed to process file for similar name scan: ${filePath} (${error})`);
|
|
|
|
|
|
|
+ console.warn(
|
|
|
|
|
+ `Worker: Failed to process file for similar name scan: ${filePath} (${error})`,
|
|
|
|
|
+ );
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- console.log(`Worker: Completed similar name check for ${processed} files in ${destination}`);
|
|
|
|
|
|
|
+ console.log(
|
|
|
|
|
+ `Worker: Completed similar name check for ${processed} files in ${destination}`,
|
|
|
|
|
+ );
|
|
|
|
|
|
|
|
return Array.from(nameGroups.entries())
|
|
return Array.from(nameGroups.entries())
|
|
|
.filter(([, files]) => files.length > 1)
|
|
.filter(([, files]) => files.length > 1)
|
|
|
.map(([baseName, files]) => ({ baseName, files }));
|
|
.map(([baseName, files]) => ({ baseName, files }));
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-parentPort?.on('message', async (message) => {
|
|
|
|
|
- const { type, destination, dataset } = message;
|
|
|
|
|
-
|
|
|
|
|
- if (type === 'scan_duplicates') {
|
|
|
|
|
- try {
|
|
|
|
|
- const duplicates = await scanDestinationForDuplicates(destination);
|
|
|
|
|
- const similars = await scanForSimilarNames(destination);
|
|
|
|
|
- parentPort?.postMessage({
|
|
|
|
|
- type: 'scan_result',
|
|
|
|
|
- dataset,
|
|
|
|
|
- destination,
|
|
|
|
|
- duplicates,
|
|
|
|
|
- similars,
|
|
|
|
|
- });
|
|
|
|
|
- } catch (error) {
|
|
|
|
|
- parentPort?.postMessage({
|
|
|
|
|
- type: 'error',
|
|
|
|
|
- error: error.message,
|
|
|
|
|
- });
|
|
|
|
|
|
|
+parentPort?.on('message', (message) => {
|
|
|
|
|
+ void (async () => {
|
|
|
|
|
+ const { type, destination, dataset } = message;
|
|
|
|
|
+
|
|
|
|
|
+ if (type === 'scan_duplicates') {
|
|
|
|
|
+ try {
|
|
|
|
|
+ const duplicates = await scanDestinationForDuplicates(destination);
|
|
|
|
|
+ const similars = await scanForSimilarNames(destination);
|
|
|
|
|
+ parentPort?.postMessage({
|
|
|
|
|
+ type: 'scan_result',
|
|
|
|
|
+ dataset,
|
|
|
|
|
+ destination,
|
|
|
|
|
+ duplicates,
|
|
|
|
|
+ similars,
|
|
|
|
|
+ });
|
|
|
|
|
+ } catch (error) {
|
|
|
|
|
+ parentPort?.postMessage({
|
|
|
|
|
+ type: 'error',
|
|
|
|
|
+ error: error.message,
|
|
|
|
|
+ });
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
- }
|
|
|
|
|
-});
|
|
|
|
|
|
|
+ })();
|
|
|
|
|
+});
|