example-duplicate-detection.js 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. #!/usr/bin/env node
  2. /**
  3. * Example script demonstrating the new duplicate detection optimization
  4. *
  5. * This shows how to:
  6. * 1. Index destination files for fast duplicate detection
  7. * 2. Query duplicate statistics
  8. * 3. Run duplicate scans using the database
  9. */
  // Base URL of the maintenance API. Override it with the API_BASE environment
  // variable. `||` (not `??`) is intentional so an empty value also falls back
  // to the default. Trailing slashes are stripped because the requests in this
  // file are built as `${API_BASE}/maintenance/...`, which would otherwise
  // produce a double slash.
  const API_BASE = (process.env.API_BASE || "http://localhost:3000").replace(
    /\/+$/,
    ""
  );
  /**
   * Request indexing of a destination directory for a dataset.
   *
   * Sends a POST to `/maintenance/index/destination` with `reindex: false`
   * and a batch size of 100, then logs the `indexed`, `skipped` and `errors`
   * fields of the JSON reply.
   *
   * @param {string} dataset - Dataset name sent in the request body.
   * @param {string} destination - Destination path sent in the request body.
   * @returns {Promise<object>} The parsed JSON body of the response.
   * @throws {Error} If the server answers with a non-2xx status.
   */
  async function indexDestination(dataset, destination) {
    console.log(`\n📁 Indexing ${dataset} destination: ${destination}`);

    const response = await fetch(`${API_BASE}/maintenance/index/destination`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        dataset,
        destination,
        reindex: false, // Set to true to rebuild index
        batchSize: 100,
      }),
    });

    // Fail loudly on HTTP errors instead of printing "Indexed: undefined" or
    // surfacing an opaque JSON parse error from a non-JSON error page.
    if (!response.ok) {
      throw new Error(
        `Index request failed: HTTP ${response.status} ${response.statusText}`
      );
    }

    const result = await response.json();
    console.log(
      `✅ Indexed: ${result.indexed}, Skipped: ${result.skipped}, Errors: ${result.errors}`
    );
    return result;
  }
  /**
   * Fetch and log the number of indexed files for a dataset.
   *
   * Sends a GET to `/maintenance/index/count` with `dataset` (and, when
   * given, `destination`) as URL-encoded query parameters.
   *
   * @param {string} dataset - Dataset name to count entries for.
   * @param {string} [destination] - Optional destination filter; omitted from
   *   the query when falsy.
   * @returns {Promise<object>} The parsed JSON body; its `count` field is logged.
   * @throws {Error} If the server answers with a non-2xx status.
   */
  async function getIndexCount(dataset, destination) {
    console.log(`\n📊 Getting index count for ${dataset}`);

    const params = new URLSearchParams({ dataset });
    if (destination) {
      params.append("destination", destination);
    }

    const response = await fetch(`${API_BASE}/maintenance/index/count?${params}`);

    // Surface HTTP failures instead of logging "Indexed files: undefined".
    if (!response.ok) {
      throw new Error(
        `Index count request failed: HTTP ${response.status} ${response.statusText}`
      );
    }

    const result = await response.json();
    console.log(`📈 Indexed files: ${result.count}`);
    return result;
  }
  /**
   * Fetch duplicate statistics from the index and print a summary.
   *
   * Logs the total number of duplicate groups, then the details (hash prefix,
   * size, file count and file list) of at most the first five groups, followed
   * by a note on how many further groups were not shown.
   *
   * @param {string} [dataset] - Dataset to restrict the statistics to; when
   *   falsy, no `dataset` query parameter is sent.
   * @returns {Promise<object>} The parsed JSON body of the response.
   * @throws {Error} If the server answers with a non-2xx status.
   */
  async function getDuplicateStats(dataset) {
    console.log(
      `\n🔍 Getting duplicate statistics for ${dataset || "all datasets"}`
    );

    // URLSearchParams percent-encodes the value, so dataset names containing
    // spaces, `&` or `#` cannot corrupt the query string.
    const query = dataset ? `?${new URLSearchParams({ dataset })}` : "";
    const response = await fetch(`${API_BASE}/maintenance/index/stats${query}`);

    if (!response.ok) {
      throw new Error(
        `Duplicate stats request failed: HTTP ${response.status} ${response.statusText}`
      );
    }

    const result = await response.json();
    console.log(`🔄 Total duplicate groups: ${result.totalDuplicates}`);

    const maxShown = 5;
    // Tolerate a payload without the group list rather than crashing on `.length`.
    const groups = result.duplicatesByDataset ?? [];

    if (groups.length > 0) {
      console.log("\nDuplicate groups:");
      groups.slice(0, maxShown).forEach((dup, idx) => {
        console.log(`\n Group ${idx + 1}:`);
        console.log(` Hash: ${dup.hash.substring(0, 16)}...`);
        // file_size is treated as a byte count and converted to MB for display.
        console.log(` Size: ${(dup.file_size / 1024 / 1024).toFixed(2)} MB`);
        console.log(` Count: ${dup.file_count} files`);
        console.log(` Files:`);
        dup.files.forEach((file) => {
          console.log(` - ${file}`);
        });
      });

      if (groups.length > maxShown) {
        console.log(`\n ... and ${groups.length - maxShown} more groups`);
      }
    }

    return result;
  }
  /**
   * Trigger a duplicate scan on the server.
   *
   * Sends a POST to `/maintenance/duplicates/scan` with `{ resetExisting }`
   * as the JSON body.
   *
   * @param {boolean} [resetExisting=false] - Forwarded unchanged to the API
   *   in the request body.
   * @returns {Promise<object>} The parsed JSON body of the response.
   * @throws {Error} If the server answers with a non-2xx status.
   */
  async function scanDuplicates(resetExisting = false) {
    console.log(`\n🔎 Scanning for duplicates (reset: ${resetExisting})`);

    const response = await fetch(`${API_BASE}/maintenance/duplicates/scan`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ resetExisting }),
    });

    // Without this check the success message below would be printed even
    // when the server rejected the scan.
    if (!response.ok) {
      throw new Error(
        `Duplicate scan request failed: HTTP ${response.status} ${response.statusText}`
      );
    }

    const result = await response.json();
    console.log("✅ Duplicate scan completed");
    return result;
  }
  /**
   * Delete index entries for a dataset, optionally limited to a destination.
   *
   * Sends a DELETE to `/maintenance/index/<dataset>`, adding a `destination`
   * query parameter when one is given, and logs the `cleared` field of the
   * JSON reply.
   *
   * @param {string} dataset - Dataset name; used as a URL path segment.
   * @param {string} [destination] - Optional destination filter; omitted from
   *   the query when falsy.
   * @returns {Promise<object>} The parsed JSON body of the response.
   * @throws {Error} If the server answers with a non-2xx status.
   */
  async function clearIndex(dataset, destination) {
    console.log(`\n🗑️ Clearing index for ${dataset}`);

    // Both values are user-supplied and the destination is a filesystem path,
    // so they must be percent-encoded to keep `/`, `&`, `#`, spaces, etc. from
    // changing the shape of the URL.
    const query = destination ? `?${new URLSearchParams({ destination })}` : "";
    const response = await fetch(
      `${API_BASE}/maintenance/index/${encodeURIComponent(dataset)}${query}`,
      {
        method: "DELETE",
      }
    );

    if (!response.ok) {
      throw new Error(
        `Clear index request failed: HTTP ${response.status} ${response.statusText}`
      );
    }

    const result = await response.json();
    console.log(`🗑️ Cleared ${result.cleared} entries`);
    return result;
  }
  /**
   * Request a forced rebuild of the index for a destination directory.
   *
   * Sends a POST to `/maintenance/index/destination` with `reindex: true`
   * and a batch size of 100, then logs the `indexed`, `skipped` and `errors`
   * fields of the JSON reply.
   *
   * @param {string} dataset - Dataset name sent in the request body.
   * @param {string} destination - Destination path sent in the request body.
   * @returns {Promise<object>} The parsed JSON body of the response.
   * @throws {Error} If the server answers with a non-2xx status.
   */
  async function reindexDestination(dataset, destination) {
    console.log(`\n🔄 Re-indexing ${dataset} destination: ${destination}`);

    const response = await fetch(`${API_BASE}/maintenance/index/destination`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        dataset,
        destination,
        reindex: true, // Force rebuild
        batchSize: 100,
      }),
    });

    // Fail loudly on HTTP errors instead of printing "Re-indexed: undefined".
    if (!response.ok) {
      throw new Error(
        `Re-index request failed: HTTP ${response.status} ${response.statusText}`
      );
    }

    const result = await response.json();
    console.log(
      `✅ Re-indexed: ${result.indexed}, Skipped: ${result.skipped}, Errors: ${result.errors}`
    );
    return result;
  }
  // Example workflow
  /**
   * Walk through the five example steps, printing a banner and a hint for each.
   *
   * Every API call is left commented out so that running the script as-is only
   * prints guidance; uncomment the calls (and adjust the paths) to execute
   * them. Errors thrown inside the steps are reported on stderr rather than
   * propagated.
   *
   * @returns {Promise<void>}
   */
  async function exampleWorkflow() {
    const divider = "═".repeat(60);

    // Print a three-line section banner; every banner except the first is
    // preceded by a blank line.
    const banner = (title, leadingBlank = true) => {
      console.log(leadingBlank ? "\n" + divider : divider);
      console.log(title);
      console.log(divider);
    };

    console.log("🚀 Duplicate Detection Optimization - Example Workflow\n");
    console.log(`Using API: ${API_BASE}\n`);

    try {
      // Example 1: Index a destination directory
      banner("Example 1: Index destination files", false);
      // Uncomment and modify these lines with your actual paths:
      // await indexDestination('movies', '/path/to/movies/destination');
      // await indexDestination('tvshows', '/path/to/tvshows/destination');
      console.log(
        "\nℹ️ Uncomment the indexDestination calls in the script to run this example"
      );

      // Example 2: Check index count
      banner("Example 2: Check index count");
      // await getIndexCount('movies');
      console.log(
        "\nℹ️ Uncomment the getIndexCount call in the script to run this example"
      );

      // Example 3: Get duplicate statistics
      banner("Example 3: Get duplicate statistics");
      // await getDuplicateStats('movies');
      console.log(
        "\nℹ️ Uncomment the getDuplicateStats call in the script to run this example"
      );

      // Example 4: Run duplicate scan (uses database)
      banner("Example 4: Run duplicate scan");
      // await scanDuplicates(false);
      console.log(
        "\nℹ️ Uncomment the scanDuplicates call in the script to run this example"
      );

      // Example 5: Re-index (clear and rebuild)
      banner("Example 5: Re-index destination");
      // await clearIndex('movies');
      // await indexDestination('movies', '/path/to/movies/destination');
      console.log(
        "\nℹ️ Uncomment the clearIndex and indexDestination calls in the script to run this example"
      );

      banner("✨ Workflow complete!");
    } catch (error) {
      console.error("\n❌ Error:", error.message);
      if (error.cause) {
        console.error("Cause:", error.cause);
      }
    }
  }
  // Run the workflow. Anything that escapes the workflow's own error handling
  // is printed to stderr instead of becoming an unhandled rejection.
  exampleWorkflow().catch((err) => console.error(err));