| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- import crypto from 'crypto';
- import fs from 'fs';
- import fsPromises from 'fs/promises';
- import path from 'path';
- import { parentPort } from 'worker_threads';
/**
 * One group of files whose contents are considered identical.
 *
 * NOTE(review): the duplicate scan in this file only fills in `hash`, `size`
 * and `files`; `dataset` and `destination` are presumably attached by the
 * consumer of the worker message — confirm before relying on them.
 */
interface ScanResult {
  /** Dataset identifier associated with the scan request. */
  dataset: string;
  /** Root directory that was scanned. */
  destination: string;
  /** Hex digest shared by every file in the group. */
  hash: string;
  /** File size (as reported by `stat`) shared by every file in the group. */
  size: number;
  /** Paths of the files that share `hash` and `size`. */
  files: string[];
}
/** A group of files that share the same lower-cased, extension-less name. */
interface SimilarResult {
  /** Lower-cased file name without its extension. */
  baseName: string;
  /** Paths of every file whose name reduces to `baseName`. */
  files: string[];
}
/**
 * Collects every regular file reachable from `root` (iterative, stack-based
 * traversal; symlinks are followed because `statSync` resolves them).
 *
 * The walk is best-effort: entries that cannot be stat'ed are skipped
 * silently, and directories that cannot be resolved or listed are skipped
 * with a warning instead of aborting the whole scan. Each directory is
 * visited at most once, keyed by its real path, so symlink cycles and
 * directories reachable through several paths do not yield the same file
 * repeatedly.
 *
 * @param root - File or directory to start from.
 * @returns Paths of the regular files found, in traversal order. If `root`
 *   is itself a regular file the result is `[root]`; if it does not exist the
 *   result is empty.
 */
function walkFiles(root: string): string[] {
  const pending: string[] = [root];
  const files: string[] = [];
  // Real paths of directories already expanded; guards against symlink loops.
  const visitedDirs = new Set<string>();

  while (pending.length > 0) {
    const current = pending.pop();
    if (current === undefined) continue;

    let stat: fs.Stats;
    try {
      stat = fs.statSync(current);
    } catch {
      // Broken symlink, vanished entry, permission problem: skip it.
      continue;
    }

    if (stat.isFile()) {
      files.push(current);
      continue;
    }
    if (!stat.isDirectory()) continue;

    let children: string[];
    try {
      const realDir = fs.realpathSync(current);
      if (visitedDirs.has(realDir)) continue;
      visitedDirs.add(realDir);
      children = fs.readdirSync(current);
    } catch (error) {
      // One unreadable directory must not abort the rest of the walk.
      console.warn(`Worker: Skipping unreadable directory ${current}: ${error}`);
      continue;
    }

    for (const child of children) {
      pending.push(path.join(current, child));
    }
  }

  return files;
}
/**
 * Computes the SHA-1 hex digest of a file's contents.
 *
 * The file is read as a stream and fed to the hash chunk by chunk, so memory
 * use stays bounded regardless of file size (reading the whole file into a
 * single buffer is wasteful for large files and fails for files beyond the
 * maximum Buffer size).
 *
 * @param filePath - Path of the file to hash.
 * @returns The lowercase hex digest, or `null` if the file could not be read
 *   (the failure is logged as a warning rather than thrown).
 */
async function hashFileAsync(filePath: string): Promise<string | null> {
  try {
    const hash = crypto.createHash('sha1');
    // Async iteration destroys the stream on completion or error.
    for await (const chunk of fs.createReadStream(filePath)) {
      hash.update(chunk as Buffer);
    }
    return hash.digest('hex');
  } catch (error) {
    console.warn(`Hashing failed for ${filePath}: ${error}`);
    return null;
  }
}
/**
 * Finds groups of files under `destination` that share both a content hash
 * and a size.
 *
 * Files are processed one at a time. A file that fails to stat is logged and
 * skipped; a file whose hash comes back empty is counted as processed but not
 * grouped. Progress is logged every 100 processed files.
 *
 * NOTE(review): each returned object carries only `hash`, `size` and `files`;
 * the `as ScanResult` cast hides that `dataset` and `destination` are not set
 * here.
 *
 * @param destination - Root directory (or file) to scan.
 * @returns One entry per hash/size combination shared by more than one file.
 */
async function scanDestinationForDuplicates(
  destination: string,
): Promise<ScanResult[]> {
  const candidates = walkFiles(destination);
  console.log(
    `Worker: Found ${candidates.length} files to scan in ${destination}`,
  );

  // Keyed by "<hash>:<size>".
  const buckets = new Map<string, { size: number; files: string[] }>();
  let processed = 0;

  for (const filePath of candidates) {
    try {
      const info = await fsPromises.stat(filePath);
      if (info.isFile()) {
        const digest = await hashFileAsync(filePath);
        if (digest) {
          const bucketKey = `${digest}:${info.size}`;
          let bucket = buckets.get(bucketKey);
          if (!bucket) {
            bucket = { size: info.size, files: [] };
            buckets.set(bucketKey, bucket);
          }
          bucket.files.push(filePath);
        }

        processed += 1;
        if (processed % 100 === 0) {
          console.log(
            `Worker: Processed ${processed}/${candidates.length} files in ${destination}`,
          );
        }
      }
    } catch (failure) {
      console.warn(
        `Worker: Failed to process file for duplicate scan: ${filePath} (${failure})`,
      );
    }
  }

  console.log(
    `Worker: Completed scanning ${processed} files in ${destination}`,
  );

  const duplicates: ScanResult[] = [];
  for (const [bucketKey, bucket] of buckets) {
    if (bucket.files.length <= 1) continue;
    duplicates.push({
      hash: bucketKey.split(':')[0],
      size: bucket.size,
      files: bucket.files,
    } as ScanResult);
  }
  return duplicates;
}
/**
 * Groups the files under `destination` by their lower-cased name with the
 * extension removed, and reports every name used by more than one file.
 *
 * Files are processed one at a time; a file that fails to stat is logged and
 * skipped. Progress is logged every 100 processed files.
 *
 * @param destination - Root directory (or file) to scan.
 * @returns One entry per base name that occurs on more than one file.
 */
async function scanForSimilarNames(
  destination: string,
): Promise<SimilarResult[]> {
  const candidates = walkFiles(destination);
  console.log(
    `Worker: Checking ${candidates.length} files for similar names in ${destination}`,
  );

  const byName = new Map<string, string[]>();
  let processed = 0;

  for (const filePath of candidates) {
    try {
      const info = await fsPromises.stat(filePath);
      if (info.isFile()) {
        const extension = path.extname(filePath);
        const baseName = path.basename(filePath, extension).toLowerCase();

        let members = byName.get(baseName);
        if (!members) {
          members = [];
          byName.set(baseName, members);
        }
        members.push(filePath);

        processed += 1;
        if (processed % 100 === 0) {
          console.log(
            `Worker: Processed ${processed}/${candidates.length} files for similar names in ${destination}`,
          );
        }
      }
    } catch (failure) {
      console.warn(
        `Worker: Failed to process file for similar name scan: ${filePath} (${failure})`,
      );
    }
  }

  console.log(
    `Worker: Completed similar name check for ${processed} files in ${destination}`,
  );

  const similars: SimilarResult[] = [];
  for (const [baseName, members] of byName) {
    if (members.length > 1) {
      similars.push({ baseName, files: members });
    }
  }
  return similars;
}
/**
 * Worker entry point: handles `scan_duplicates` requests from the parent
 * thread.
 *
 * For a request `{ type: 'scan_duplicates', destination, dataset }` it runs
 * the duplicate scan followed by the similar-name scan and replies with a
 * `scan_result` message. Any failure is reported as an `error` message that
 * echoes `dataset` and `destination` so the parent can correlate it with the
 * request. Messages that are not objects, or that carry another `type`, are
 * ignored.
 */
parentPort?.on('message', (message: unknown) => {
  void (async () => {
    // Guard before destructuring: a null/primitive message would otherwise
    // throw inside this detached promise and surface as an unhandled rejection.
    if (typeof message !== 'object' || message === null) return;

    const { type, destination, dataset } = message as {
      type?: unknown;
      destination?: unknown;
      dataset?: unknown;
    };
    if (type !== 'scan_duplicates') return;

    try {
      if (typeof destination !== 'string') {
        // Without this check a missing destination would scan nothing and be
        // reported as a successful, empty result.
        throw new TypeError(
          'scan_duplicates message requires a string "destination"',
        );
      }
      const duplicates = await scanDestinationForDuplicates(destination);
      const similars = await scanForSimilarNames(destination);
      parentPort?.postMessage({
        type: 'scan_result',
        dataset,
        destination,
        duplicates,
        similars,
      });
    } catch (error: unknown) {
      parentPort?.postMessage({
        type: 'error',
        dataset,
        destination,
        // Thrown values are not guaranteed to be Error instances.
        error: error instanceof Error ? error.message : String(error),
      });
    }
  })();
});
|