duplicate-worker.ts

import crypto from 'crypto';
import fs from 'fs';
import fsPromises from 'fs/promises';
import path from 'path';
import { parentPort } from 'worker_threads';

// One group of byte-identical files, keyed by content hash and size.
// `dataset` and `destination` travel on the reply message, not on each group.
interface ScanResult {
  hash: string;
  size: number;
  files: string[];
}

// One group of files that share the same case-insensitive base name.
interface SimilarResult {
  baseName: string;
  files: string[];
}
// Iteratively walk a directory tree and return the paths of all regular
// files. Entries that cannot be stat'ed (permissions, races) are skipped.
function walkFiles(root: string): string[] {
  const pending = [root];
  const files: string[] = [];
  while (pending.length) {
    const current = pending.pop();
    if (!current) continue;
    let stat: fs.Stats;
    try {
      stat = fs.statSync(current);
    } catch {
      continue;
    }
    if (stat.isDirectory()) {
      for (const child of fs.readdirSync(current)) {
        pending.push(path.join(current, child));
      }
    } else if (stat.isFile()) {
      files.push(current);
    }
  }
  return files;
}
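
// Sketch of an async alternative to walkFiles (not wired into the worker):
// Dirent entries from `withFileTypes` need one readdir call per directory
// instead of a stat per path, and awaiting between calls keeps the worker's
// event loop responsive on very large trees. Unlike walkFiles, a root that
// is itself a plain file yields no results here.
async function walkFilesAsync(root: string): Promise<string[]> {
  const files: string[] = [];
  async function visit(dir: string): Promise<void> {
    let entries: fs.Dirent[];
    try {
      entries = await fsPromises.readdir(dir, { withFileTypes: true });
    } catch {
      return; // unreadable, or not a directory
    }
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        await visit(full);
      } else if (entry.isFile()) {
        files.push(full);
      }
    }
  }
  await visit(root);
  return files;
}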
// Hash a file's full contents with SHA-1. Returns null (and warns) on read
// failure so one unreadable file does not abort the whole scan. Note the
// entire file is buffered in memory.
async function hashFileAsync(filePath: string): Promise<string | null> {
  try {
    const data = await fsPromises.readFile(filePath);
    return crypto.createHash('sha1').update(data).digest('hex');
  } catch (error) {
    console.warn(`Hashing failed for ${filePath}: ${error}`);
    return null;
  }
}
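
// Streaming variant of hashFileAsync (a sketch, not used by the worker
// below): feeding the hash from a read stream keeps memory flat even for
// multi-gigabyte files.
function hashFileStream(filePath: string): Promise<string | null> {
  return new Promise((resolve) => {
    const hash = crypto.createHash('sha1');
    fs.createReadStream(filePath)
      .on('data', (chunk) => hash.update(chunk))
      .on('end', () => resolve(hash.digest('hex')))
      // Mirror hashFileAsync's contract: resolve null on failure.
      .on('error', () => resolve(null));
  });
}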
// Scan everything under `destination`, grouping files by (SHA-1, size).
// Groups with more than one member are reported as duplicates.
async function scanDestinationForDuplicates(
  destination: string,
): Promise<ScanResult[]> {
  const files = walkFiles(destination);
  console.log(`Worker: Found ${files.length} files to scan in ${destination}`);
  const groups = new Map<string, { size: number; files: string[] }>();
  let processed = 0;
  for (const filePath of files) {
    try {
      const stat = await fsPromises.stat(filePath);
      if (!stat.isFile()) continue;
      const hash = await hashFileAsync(filePath);
      if (hash) {
        // Key on hash and size together, so files of different sizes stay
        // apart even in the (vanishingly unlikely) event of a collision.
        const key = `${hash}:${stat.size}`;
        const group = groups.get(key) || { size: stat.size, files: [] };
        group.files.push(filePath);
        groups.set(key, group);
      }
      processed++;
      if (processed % 100 === 0) {
        console.log(
          `Worker: Processed ${processed}/${files.length} files in ${destination}`,
        );
      }
    } catch (error) {
      console.warn(
        `Worker: Failed to process file for duplicate scan: ${filePath} (${error})`,
      );
    }
  }
  console.log(
    `Worker: Completed scanning ${processed} files in ${destination}`,
  );
  return Array.from(groups.entries())
    .filter(([, group]) => group.files.length > 1)
    .map(([key, group]) => ({
      hash: key.split(':')[0],
      size: group.size,
      files: group.files,
    }));
}
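
// Optimization sketch (an assumption, not wired into the worker): files
// with a unique size can never be byte-identical, so a cheap size-only pass
// can shrink the set of files that actually need hashing.
async function filesWithSizeCollisions(files: string[]): Promise<string[]> {
  const bySize = new Map<number, string[]>();
  for (const filePath of files) {
    try {
      const stat = await fsPromises.stat(filePath);
      if (stat.isFile()) {
        const group = bySize.get(stat.size) ?? [];
        group.push(filePath);
        bySize.set(stat.size, group);
      }
    } catch {
      // Unreadable entries are simply excluded from hashing.
    }
  }
  return Array.from(bySize.values())
    .filter((group) => group.length > 1)
    .flat();
}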
// Group files under `destination` by lowercase base name with the extension
// stripped, so e.g. "Report.pdf" and "report.docx" land in one group.
async function scanForSimilarNames(
  destination: string,
): Promise<SimilarResult[]> {
  const files = walkFiles(destination);
  console.log(
    `Worker: Checking ${files.length} files for similar names in ${destination}`,
  );
  const nameGroups = new Map<string, string[]>();
  let processed = 0;
  for (const filePath of files) {
    try {
      const stat = await fsPromises.stat(filePath);
      if (!stat.isFile()) continue;
      const baseName = path
        .basename(filePath, path.extname(filePath))
        .toLowerCase();
      const group = nameGroups.get(baseName) || [];
      group.push(filePath);
      nameGroups.set(baseName, group);
      processed++;
      if (processed % 100 === 0) {
        console.log(
          `Worker: Processed ${processed}/${files.length} files for similar names in ${destination}`,
        );
      }
    } catch (error) {
      console.warn(
        `Worker: Failed to process file for similar name scan: ${filePath} (${error})`,
      );
    }
  }
  console.log(
    `Worker: Completed similar name check for ${processed} files in ${destination}`,
  );
  return Array.from(nameGroups.entries())
    .filter(([, files]) => files.length > 1)
    .map(([baseName, files]) => ({ baseName, files }));
}
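
// Sketch of a looser normalizer (an assumption, not part of the current
// grouping): also strip common copy suffixes such as "photo (1).jpg" or
// "report - Copy.docx", catching near-duplicates that exact base-name
// matching misses.
function normalizeBaseName(filePath: string): string {
  return path
    .basename(filePath, path.extname(filePath))
    .toLowerCase()
    .replace(/\s*(\(\d+\)|- copy(\s*\(\d+\))?)$/, '')
    .trim();
}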
// Message protocol: the parent posts { type: 'scan_duplicates', dataset,
// destination } and receives either a 'scan_result' or an 'error' reply.
parentPort?.on('message', (message) => {
  void (async () => {
    const { type, destination, dataset } = message;
    if (type === 'scan_duplicates') {
      try {
        const duplicates = await scanDestinationForDuplicates(destination);
        const similars = await scanForSimilarNames(destination);
        parentPort?.postMessage({
          type: 'scan_result',
          dataset,
          destination,
          duplicates,
          similars,
        });
      } catch (error) {
        parentPort?.postMessage({
          type: 'error',
          // A catch variable is `unknown`, so narrow it before reading
          // `.message`.
          error: error instanceof Error ? error.message : String(error),
        });
      }
    }
  })();
});
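
// Example of driving this worker from the parent thread (a sketch; the
// compiled filename and the dataset/destination values are hypothetical):
//
//   import { Worker } from 'worker_threads';
//
//   const worker = new Worker('./duplicate-worker.js');
//   worker.on('message', (msg) => {
//     if (msg.type === 'scan_result') {
//       console.log(`${msg.dataset}: ${msg.duplicates.length} duplicate groups`);
//     } else if (msg.type === 'error') {
//       console.error(`Scan failed: ${msg.error}`);
//     }
//   });
//   worker.postMessage({
//     type: 'scan_duplicates',
//     dataset: 'photos',
//     destination: '/mnt/archive/photos',
//   });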