maintenance.service.ts

import { Injectable, Logger } from '@nestjs/common';
import { Cron, CronExpression } from '@nestjs/schedule';
import crypto from 'crypto';
import fs from 'fs';
import path from 'path';
import { DatasetsService } from './datasets.service';
import { DbService } from './db.service';

@Injectable()
export class MaintenanceService {
  private logger = new Logger('MaintenanceService');

  constructor(
    private readonly db: DbService,
    private readonly datasetsService: DatasetsService,
  ) {}

  // Soft-delete the DB record for a file that belongs to one of the watched
  // directories but no longer exists on disk.
  cleanup(file: string, dirs: string[]) {
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      if (file && dir && file.indexOf(dir) > -1) {
        const dataset = dir.replace(/.*\/(.*)/, '$1');
        const exists = fs.existsSync(file);
        if (!exists) this.db.removeFile(dataset, file, true);
      }
    }
  }

  // Permanently remove records that have been marked "deleted" for longer than
  // dayMs, then reschedule itself to run again after cleanerMs.
  purge(
    dirs: string[],
    dayMs = 24 * 60 * 60 * 1000,
    cleanerMs = 60 * 60 * 1000,
  ) {
    const ago = new Date(Date.now() - dayMs);
    this.logger.log(
      `Checking for "deleted" records older than ${ago.toISOString()}`,
    );
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      const dataset = dir.replace(/.*\/(.*)/, '$1');
      const files = this.db.getDeletedOlderThan(dataset, ago.toISOString());
      for (const file of files as { input: string }[]) {
        this.logger.log(
          `Purging "${file.input}" (${new Date().toISOString()})`,
        );
        if (file && file.input) this.db.removeFile(dataset, file.input, false);
      }
    }
    setTimeout(() => {
      this.purge(dirs, dayMs, cleanerMs);
    }, cleanerMs);
  }

  prune(dirs: string[]) {
    this.logger.log('Checking for any files that no longer exist on disk.');
    for (let i = 0, l = dirs.length; i < l; i++) {
      const dir = dirs[i];
      const dataset = dir.replace(/.*\/(.*)/, '$1');
      // Get all files for this dataset (not just successful ones)
      const allFiles = this.db.getAllFiles(dataset);
      for (const file of allFiles as { input: string; status: string }[]) {
        const exists = fs.existsSync(file.input);
        if (!exists) {
          // Only soft delete if not already marked as deleted
          // Note: Database schema changes must be done via migrations in data/migrations/
          if (file.status !== 'deleted') {
            this.logger.log(
              `Marking missing file as deleted: "${file.input}" (${new Date().toISOString()})`,
            );
            this.db.removeFile(dataset, file.input, true); // soft delete
          }
        }
      }
    }
  }

  // Scheduled task cleanup - runs periodically to prevent the task table from
  // growing too large.
  @Cron(CronExpression.EVERY_DAY_AT_2AM) // Run daily at 2 AM
  scheduledTaskCleanup() {
    this.logger.log('Running scheduled task cleanup');
    try {
      // Archive completed tasks older than 30 days
      const archiveResult = this.db.archiveOldTasks(30);
      this.logger.log(`Archived ${archiveResult?.changes || 0} old tasks`);
      // Delete failed tasks older than 7 days
      const failedResult = this.db.deleteTasksByStatus('failed', 7);
      this.logger.log(
        `Deleted ${failedResult.changes} failed tasks older than 7 days`,
      );
      // Delete skipped tasks older than 7 days
      const skippedResult = this.db.deleteTasksByStatus('skipped', 7);
      this.logger.log(
        `Deleted ${skippedResult.changes} skipped tasks older than 7 days`,
      );
      // Delete completed tasks older than 90 days
      const completedResult = this.db.deleteTasksByStatus('completed', 90);
      this.logger.log(
        `Deleted ${completedResult.changes} completed tasks older than 90 days`,
      );
    } catch (error) {
      this.logger.error(
        `Error during scheduled task cleanup: ${(error as Error).message}`,
      );
    }
  }
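
  // The @Cron job above only fires when the application has the Nest scheduler
  // registered. A minimal sketch of that wiring, assuming a standard NestJS
  // setup (module name and provider list here are illustrative, not taken from
  // this file):
  //
  //   // app.module.ts
  //   import { Module } from '@nestjs/common';
  //   import { ScheduleModule } from '@nestjs/schedule';
  //   import { MaintenanceService } from './maintenance.service';
  //
  //   @Module({
  //     imports: [ScheduleModule.forRoot()],
  //     providers: [MaintenanceService /* plus DbService, DatasetsService, ... */],
  //   })
  //   export class AppModule {}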

  /**
   * Scan destination folders (as defined in settings.datasets) for duplicate files.
   * Duplicates are recorded in the duplicate_files table for manual review.
   */
  findDuplicateFiles(options: { resetExisting?: boolean } = {}) {
    const { resetExisting = false } = options;
    this.logger.log('Starting duplicate file scan');
    if (resetExisting) {
      this.db.clearDuplicateGroups();
      this.logger.log('Cleared existing duplicate groups');
    }
    const existing = this.db.listDuplicateGroups();
    const existingMap = new Map<
      string,
      { id: number; status: string; files: string[] }
    >();
    for (const row of existing) {
      const key = `${row.dataset}|${row.destination}|${row.hash}|${row.size}`;
      existingMap.set(key, {
        id: row.id,
        status: row.status,
        files: row.files,
      });
    }
    const datasetConfig = this.datasetsService.getDatasetConfig();
    const duplicates: Array<{
      dataset: string;
      destination: string;
      hash: string;
      size: number;
      files: string[];
    }> = [];
    for (const [datasetName, datasetObj] of Object.entries(datasetConfig)) {
      if (
        !datasetObj ||
        datasetObj.enabled === false ||
        datasetObj.enabled === 'false'
      ) {
        this.logger.log(`Skipping disabled dataset: ${datasetName}`);
        continue;
      }
      this.logger.log(`Scanning dataset: ${datasetName}`);
      const destinations = this.collectDestinations(datasetObj);
      for (const destination of destinations) {
        if (!destination || !fs.existsSync(destination)) {
          this.logger.warn(
            `Destination not found for dataset ${datasetName}: ${destination}`,
          );
          continue;
        }
        this.logger.log(`Scanning destination: ${destination}`);
        const groups = this.scanDestinationForDuplicates(destination);
        this.scanForSimilarNames(destination);
        for (const group of groups) {
          const entry = {
            dataset: datasetName,
            destination,
            hash: group.hash,
            size: group.size,
            files: group.files,
          };
          const key = `${entry.dataset}|${entry.destination}|${entry.hash}|${entry.size}`;
          const existingEntry = existingMap.get(key);
          // Skip groups that were marked reviewed/ignored previously
          if (existingEntry && existingEntry.status === 'reviewed') {
            continue;
          }
          duplicates.push(entry);
          if (existingEntry) {
            this.db.updateDuplicateGroupFiles(
              existingEntry.id,
              entry.files,
              'pending',
            );
          } else {
            this.db.saveDuplicateGroup(entry);
          }
        }
        if (groups.length) {
          this.logger.warn(
            `Found ${groups.length} duplicate group(s) in destination ${destination} (dataset: ${datasetName})`,
          );
        }
      }
    }
    this.logger.log(
      `Duplicate scan completed. Processed ${duplicates.length} groups.`,
    );
    return duplicates;
  }
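
  // The dedupe key used above is `${dataset}|${destination}|${hash}|${size}`,
  // so a scanned group is matched against a previously stored one only when
  // all four parts agree. One returned/persisted entry has this shape (the
  // values are illustrative, not taken from a real scan):
  //
  //   {
  //     dataset: 'photos',
  //     destination: '/mnt/storage/photos',
  //     hash: '2fd4e1c67a2d28fced849ee1bb76e7391b93eb12',
  //     size: 1048576,
  //     files: ['/mnt/storage/photos/IMG_0001.jpg', '/mnt/storage/photos/IMG_0001 (copy).jpg'],
  //   }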

  private collectDestinations(datasetObj: Record<string, any>): Set<string> {
    const destinations = new Set<string>();
    if (datasetObj.destination && typeof datasetObj.destination === 'string') {
      destinations.add(datasetObj.destination);
    }
    for (const [pathKey, cfg] of Object.entries(datasetObj)) {
      if (pathKey === 'enabled') continue;
      if (cfg && typeof cfg === 'object' && cfg.destination) {
        destinations.add(cfg.destination);
      }
    }
    return destinations;
  }

  // Hash every file under the destination and group by content hash + size.
  // Only groups containing more than one file are returned.
  private scanDestinationForDuplicates(destination: string) {
    const files = this.walkFiles(destination);
    this.logger.log(`Found ${files.length} files to scan in ${destination}`);
    const groups = new Map<string, { size: number; files: string[] }>();
    let processed = 0;
    for (const filePath of files) {
      try {
        const stat = fs.statSync(filePath);
        if (!stat.isFile()) continue;
        const hash = this.hashFile(filePath);
        if (hash) {
          const key = `${hash}:${stat.size}`;
          const group = groups.get(key) || { size: stat.size, files: [] };
          group.files.push(filePath);
          groups.set(key, group);
        }
        processed++;
        if (processed % 100 === 0) {
          this.logger.log(
            `Processed ${processed}/${files.length} files in ${destination}`,
          );
        }
      } catch (error) {
        this.logger.warn(
          `Failed to process file for duplicate scan: ${filePath} (${error})`,
        );
      }
    }
    this.logger.log(`Completed scanning ${processed} files in ${destination}`);
    return Array.from(groups.entries())
      .filter(([, group]) => group.files.length > 1)
      .map(([key, group]) => ({
        hash: key.split(':')[0],
        size: group.size,
        files: group.files,
      }));
  }

  // Group files by lowercase basename (extension stripped) and report any
  // groups that share a name. Results are logged and returned, but not
  // persisted to the database.
  private scanForSimilarNames(destination: string) {
    const files = this.walkFiles(destination);
    this.logger.log(
      `Checking ${files.length} files for similar names in ${destination}`,
    );
    const nameGroups = new Map<string, string[]>();
    let processed = 0;
    for (const filePath of files) {
      try {
        const stat = fs.statSync(filePath);
        if (!stat.isFile()) continue;
        const baseName = path
          .basename(filePath, path.extname(filePath))
          .toLowerCase();
        const group = nameGroups.get(baseName) || [];
        group.push(filePath);
        nameGroups.set(baseName, group);
        processed++;
        if (processed % 100 === 0) {
          this.logger.log(
            `Processed ${processed}/${files.length} files for similar names in ${destination}`,
          );
        }
      } catch (error) {
        this.logger.warn(
          `Failed to process file for similar name scan: ${filePath} (${error})`,
        );
      }
    }
    this.logger.log(
      `Completed similar name check for ${processed} files in ${destination}`,
    );
    const similarGroups = Array.from(nameGroups.entries())
      .filter(([, files]) => files.length > 1)
      .map(([baseName, files]) => ({ baseName, files }));
    if (similarGroups.length) {
      this.logger.log(
        `Found ${similarGroups.length} groups of files with similar names in ${destination}`,
      );
      for (const group of similarGroups) {
        this.logger.log(
          `Similar: ${group.baseName} - ${group.files.join(', ')}`,
        );
      }
    }
    return similarGroups;
  }

  private walkFiles(root: string) {
    const pending = [root];
    const files: string[] = [];
    while (pending.length) {
      const current = pending.pop();
      if (!current) continue;
      let stat: fs.Stats;
      try {
        stat = fs.statSync(current);
      } catch {
        continue;
      }
      if (stat.isDirectory()) {
        const children = fs.readdirSync(current);
        for (const child of children) {
          pending.push(path.join(current, child));
        }
      } else if (stat.isFile()) {
        files.push(current);
      }
    }
    return files;
  }

  private hashFile(filePath: string): string | null {
    try {
      const hash = crypto.createHash('sha1');
      const data = fs.readFileSync(filePath);
      hash.update(data);
      return hash.digest('hex');
    } catch (error) {
      this.logger.warn(`Hashing failed for ${filePath}: ${error}`);
      return null;
    }
  }

  // Delete the selected files of a duplicate group from disk, then update the
  // group's remaining files and status in the database.
  purgeDuplicateFiles(id: number, files: string[], note?: string) {
    const group = this.db.getDuplicateGroup(id);
    if (!group) {
      throw new Error('Duplicate group not found');
    }
    const toDelete = files && files.length > 0 ? files : [];
    const deleted: string[] = [];
    const errors: Array<{ file: string; error: string }> = [];
    for (const filePath of toDelete) {
      try {
        if (fs.existsSync(filePath)) {
          fs.unlinkSync(filePath);
          deleted.push(filePath);
        } else {
          errors.push({ file: filePath, error: 'File not found' });
        }
      } catch (error) {
        errors.push({ file: filePath, error: (error as Error).message });
      }
    }
    const remaining = group.files.filter((f) => !toDelete.includes(f));
    const nextStatus = remaining.length > 1 ? 'pending' : 'purged';
    this.db.updateDuplicateGroupFiles(id, remaining, nextStatus, note);
    return { deleted, errors, remaining, status: nextStatus };
  }
}
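
// Usage sketch for the duplicate-file workflow. The controller, routes, and
// request shapes below are illustrative assumptions, not part of this service:
//
//   // maintenance.controller.ts
//   import { Body, Controller, Post } from '@nestjs/common';
//   import { MaintenanceService } from './maintenance.service';
//
//   @Controller('maintenance')
//   export class MaintenanceController {
//     constructor(private readonly maintenance: MaintenanceService) {}
//
//     // Re-scan all enabled datasets and record/update duplicate groups.
//     @Post('duplicates/scan')
//     scan(@Body() body: { resetExisting?: boolean }) {
//       return this.maintenance.findDuplicateFiles(body);
//     }
//
//     // Delete selected files from one duplicate group and update its status.
//     @Post('duplicates/purge')
//     purge(@Body() body: { id: number; files: string[]; note?: string }) {
//       return this.maintenance.purgeDuplicateFiles(body.id, body.files, body.note);
//     }
//   }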