db.service.ts

import { Injectable } from '@nestjs/common';
import Database from 'better-sqlite3';
import fs from 'fs';
import path from 'path';
import { MigrationRunner } from './migration-runner';

/** Shape of a row in the duplicate_files review table. */
interface DuplicateFileRow {
  id: number;
  dataset: string;
  destination: string;
  hash: string;
  size: number;
  files: string;
  status: string;
  note?: string;
  created_at: string;
  reviewed_at?: string;
}

/** Shape of a files-table row once hash metadata has been populated. */
interface HashedFileRow {
  dataset: string;
  input: string | null;
  output: string | null;
  destination_path: string | null;
  hash: string;
  file_size: number;
  date: string;
  status: string;
}

@Injectable()
export class DbService {
  private db: Database.Database;

  constructor() {
    // Use unified database for all settings/configuration.
    // Find the project root by traversing up from the current directory
    // until we find the root package.json.
    let projectRoot = process.cwd();
    while (projectRoot !== path.dirname(projectRoot)) {
      if (fs.existsSync(path.join(projectRoot, 'package.json'))) {
        try {
          const pkg = JSON.parse(
            fs.readFileSync(path.join(projectRoot, 'package.json'), 'utf-8'),
          );
          if (pkg.name === 'watch-finished-turbo') {
            break;
          }
        } catch (e) {
          // ignore
        }
      }
      projectRoot = path.dirname(projectRoot);
    }
    const rootDataPath = path.resolve(projectRoot, 'data/database.db');
    // Ensure the directory exists
    const dir = path.dirname(rootDataPath);
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
    try {
      this.db = new Database(rootDataPath);
      // Run migrations
      const migrationsDir = path.resolve(projectRoot, 'data/migrations');
      const migrationRunner = new MigrationRunner(this.db, migrationsDir);
      migrationRunner.applyPendingMigrations();
    } catch (error) {
      console.error('Failed to open database:', error);
      console.error('Database path:', rootDataPath);
      console.error('Directory exists:', fs.existsSync(dir));
      console.error(
        'Directory permissions:',
        fs.statSync(dir).mode.toString(8),
      );
      throw error;
    }
  }

  /**
   * Get the database instance for direct queries
   */
  getDb(): Database.Database {
    return this.db;
  }

  // List all files
  listAllFiles() {
    return this.db.prepare('SELECT * FROM files').all();
  }

  // List all files for a dataset
  listFilesForDataset(dataset: string) {
    return this.db
      .prepare('SELECT * FROM files WHERE dataset = ?')
      .all(dataset);
  }

  /**
   * Delete file records older than the given number of days
   * (autoExpireDays from settings).
   * @param days Days to keep (optional, overrides settings)
   */
  deleteExpiredFiles(days?: number): number {
    // Fall back to 180 if no setting is available. Use ?? rather than ||
    // so an explicit 0 is honored instead of silently becoming 180.
    const keepDays = days ?? 180;
    const cutoff = new Date(
      Date.now() - keepDays * 24 * 60 * 60 * 1000,
    ).toISOString();
    const stmt = this.db.prepare('DELETE FROM files WHERE date < ?');
    const info = stmt.run(cutoff);
    return info.changes;
  }

  /**
   * Migrate legacy JSON files to the SQLite database.
   * @param opts Options for migration.
   * @param dbOverride Optional override for db instance.
   */
  migrateJsonToSqlite(
    opts: { datasets?: string[]; dataDir?: string } = {},
    dbOverride?: Database.Database,
  ) {
    const datasets = opts.datasets || [
      'movies',
      'tvshows',
      'kids',
      'pr0n',
      'sports',
    ];
    const dataDir = opts.dataDir || path.join(process.cwd(), 'legacy/data');
    const dbInstance = dbOverride || this.db;
    dbInstance.exec(`
      CREATE TABLE IF NOT EXISTS files (
        dataset TEXT,
        input TEXT,
        output TEXT,
        date TEXT,
        status TEXT DEFAULT 'pending',
        PRIMARY KEY (dataset, input)
      );
    `);
    // OR IGNORE keeps the migration idempotent: re-running it skips rows
    // that already exist instead of throwing on the primary key.
    const insert = dbInstance.prepare(
      'INSERT OR IGNORE INTO files (dataset, input, output, date) VALUES (?, ?, ?, ?)',
    );
    for (const dataset of datasets) {
      const filePath = path.join(dataDir, `${dataset}.json`);
      if (!fs.existsSync(filePath)) continue;
      const json = JSON.parse(fs.readFileSync(filePath, 'utf8'));
      if (!json.files || !Array.isArray(json.files)) continue;
      for (const rec of json.files) {
        insert.run(
          dataset,
          rec.input || null,
          rec.output || null,
          rec.date || null,
        );
      }
    }
  }

  // Duplicate file review helpers
  clearDuplicateGroups() {
    this.db.prepare('DELETE FROM duplicate_files').run();
  }

  getDuplicateGroup(id: number) {
    const row = this.db
      .prepare('SELECT * FROM duplicate_files WHERE id = ?')
      .get(id) as DuplicateFileRow | undefined;
    if (!row) return undefined;
    return { ...row, files: this.safeParseFiles(row.files) };
  }

  getDuplicateGroupByKey(
    dataset: string,
    destination: string,
    hash: string,
    size: number,
  ) {
    const row = this.db
      .prepare(
        'SELECT * FROM duplicate_files WHERE dataset = ? AND destination = ? AND hash = ? AND size = ?',
      )
      .get(dataset, destination, hash, size) as DuplicateFileRow | undefined;
    if (!row) return undefined;
    return { ...row, files: this.safeParseFiles(row.files) };
  }
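
  /**
   * Insert or refresh a duplicate group keyed by
   * (dataset, destination, hash, size). Groups already marked as reviewed
   * are left untouched; an existing pending group is reset to 'pending'
   * with the new file list and size.
   */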
  saveDuplicateGroup(entry: {
    dataset: string;
    destination: string;
    hash: string;
    size: number;
    files: string[];
  }) {
    const existing = this.getDuplicateGroupByKey(
      entry.dataset,
      entry.destination,
      entry.hash,
      entry.size,
    );
    // Do not re-flag entries that were manually reviewed/ignored
    if (existing && existing.status === 'reviewed') {
      return existing;
    }
    const filesJson = JSON.stringify(entry.files);
    if (existing) {
      this.db
        .prepare(
          `UPDATE duplicate_files
           SET files = ?, size = ?, status = 'pending', reviewed_at = NULL
           WHERE id = ?`,
        )
        .run(filesJson, entry.size, existing.id);
      return {
        ...existing,
        files: entry.files,
        size: entry.size,
        status: 'pending',
      };
    }
    const result = this.db
      .prepare(
        `INSERT INTO duplicate_files (dataset, destination, hash, size, files)
         VALUES (?, ?, ?, ?, ?)`,
      )
      .run(entry.dataset, entry.destination, entry.hash, entry.size, filesJson);
    return {
      ...entry,
      id: result.lastInsertRowid as number,
      status: 'pending',
      note: null,
    };
  }
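
  /**
   * List duplicate groups, optionally filtered by status and/or dataset,
   * newest first, with the files column parsed back into an array.
   */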
  listDuplicateGroups(status?: string, dataset?: string) {
    let query = 'SELECT * FROM duplicate_files';
    const params: any[] = [];
    if (status) {
      query += ' WHERE status = ?';
      params.push(status);
    }
    if (dataset) {
      query += status ? ' AND dataset = ?' : ' WHERE dataset = ?';
      params.push(dataset);
    }
    query += ' ORDER BY created_at DESC';
    const rows = this.db.prepare(query).all(...params) as DuplicateFileRow[];
    return rows.map((row) => ({
      ...row,
      files: this.safeParseFiles(row.files),
    }));
  }
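
  /**
   * Set the review status of a duplicate group and stamp reviewed_at.
   * An existing note is kept when no new note is supplied.
   */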
  markDuplicateGroup(
    id: number,
    status: 'pending' | 'reviewed' | 'purged',
    note?: string,
  ) {
    return this.db
      .prepare(
        `UPDATE duplicate_files
         SET status = ?, note = COALESCE(?, note), reviewed_at = CURRENT_TIMESTAMP
         WHERE id = ?`,
      )
      .run(status, note || null, id);
  }
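
  /**
   * Replace the file list of a duplicate group. Status and note are only
   * overwritten when provided, and reviewed_at is refreshed only when a
   * new status is set.
   */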
  updateDuplicateGroupFiles(
    id: number,
    files: string[],
    status?: 'pending' | 'reviewed' | 'purged',
    note?: string,
  ) {
    return this.db
      .prepare(
        `UPDATE duplicate_files
         SET files = ?,
             status = COALESCE(?, status),
             note = COALESCE(?, note),
             reviewed_at = CASE WHEN ? IS NOT NULL THEN CURRENT_TIMESTAMP ELSE reviewed_at END
         WHERE id = ?`,
      )
      .run(
        JSON.stringify(files),
        status || null,
        note || null,
        status || null,
        id,
      );
  }

  deleteDuplicateGroup(id: number) {
    return this.db.prepare('DELETE FROM duplicate_files WHERE id = ?').run(id);
  }
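
  /**
   * Parse a JSON-encoded file list, returning [] for malformed or
   * non-array values instead of throwing.
   */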
  private safeParseFiles(value: string): string[] {
    try {
      const parsed = JSON.parse(value);
      return Array.isArray(parsed) ? parsed : [];
    } catch {
      return [];
    }
  }
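
  /** Look up a single file record by its (dataset, input) primary key. */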
  findFile(dataset: string, file: string) {
    return this.db
      .prepare('SELECT * FROM files WHERE dataset = ? AND input = ?')
      .get(dataset, file);
  }
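
  /**
   * Upsert a file record. Fields present in the payload win; missing
   * fields fall back to the existing row, or to defaults for a new row
   * (status 'pending', date now). Returns the row as stored.
   */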
  setFile(dataset: string, file: string, payload: any) {
    const existing = this.findFile(dataset, file) as
      | {
          dataset: string;
          input: string;
          output?: string;
          date?: string;
          status?: string;
          hash?: string;
          file_size?: number;
        }
      | undefined;
    const outputValue =
      payload && payload.output !== undefined
        ? payload.output
        : (existing?.output ?? null);
    const statusValue =
      payload && payload.status !== undefined
        ? payload.status
        : (existing?.status ?? 'pending');
    const dateValue =
      payload && payload.date !== undefined
        ? new Date(payload.date).toISOString()
        : existing?.date || new Date().toISOString();
    const hashValue =
      payload && payload.hash !== undefined
        ? payload.hash
        : (existing?.hash ?? null);
    const fileSizeValue =
      payload && payload.file_size !== undefined
        ? payload.file_size
        : (existing?.file_size ?? null);
    if (existing) {
      this.db
        .prepare(
          `UPDATE files
           SET output = COALESCE(?, output),
               date = COALESCE(?, date),
               status = COALESCE(?, status),
               hash = COALESCE(?, hash),
               file_size = COALESCE(?, file_size)
           WHERE dataset = ? AND input = ?`,
        )
        .run(
          outputValue,
          dateValue,
          statusValue,
          hashValue,
          fileSizeValue,
          dataset,
          file,
        );
    } else {
      this.db
        .prepare(
          'INSERT INTO files (dataset, input, output, date, status, hash, file_size) VALUES (?, ?, ?, ?, ?, ?, ?)',
        )
        .run(
          dataset,
          file,
          outputValue,
          dateValue,
          statusValue,
          hashValue,
          fileSizeValue,
        );
    }
    return this.findFile(dataset, file);
  }
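
  /**
   * Remove a file record. By default this is a soft delete (status set to
   * 'deleted', date refreshed); pass soft = false to delete the row.
   */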
  removeFile(dataset: string, file: string, soft = true) {
    if (soft) {
      this.db
        .prepare(
          'UPDATE files SET status = ?, date = ? WHERE dataset = ? AND input = ?',
        )
        .run('deleted', new Date().toISOString(), dataset, file);
    } else {
      this.db
        .prepare('DELETE FROM files WHERE dataset = ? AND input = ?')
        .run(dataset, file);
    }
  }

  clearAllFiles() {
    this.db.prepare('DELETE FROM files').run();
  }

  getAllFiles(dataset: string) {
    return this.db
      .prepare('SELECT * FROM files WHERE dataset = ?')
      .all(dataset);
  }
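
  /**
   * List soft-deleted records in a dataset whose date is older than the
   * given ISO timestamp.
   */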
  getDeletedOlderThan(dataset: string, isoDate: string) {
    // Filter on status = 'deleted' so this only returns rows soft-deleted
    // via removeFile(), matching the method name.
    return this.db
      .prepare(
        "SELECT * FROM files WHERE dataset = ? AND status = 'deleted' AND date < ?",
      )
      .all(dataset, isoDate);
  }

  // Task CRUD methods
  getAllTasks() {
    return this.db
      .prepare('SELECT * FROM tasks ORDER BY created_at DESC')
      .all();
  }

  getPendingTasks(limit: number = 10) {
    return this.db
      .prepare(
        'SELECT * FROM tasks WHERE status = ? ORDER BY priority DESC, created_at ASC LIMIT ?',
      )
      .all('pending', limit);
  }

  getTaskById(id: number) {
    return this.db.prepare('SELECT * FROM tasks WHERE id = ?').get(id);
  }

  getTaskByInput(input: string) {
    return this.db.prepare('SELECT * FROM tasks WHERE input = ?').get(input);
  }
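
  /**
   * Insert a new task. Defaults: status 'pending', progress 0, priority 0,
   * retry_count 0. Returns the task together with its new id.
   */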
  createTask(task: {
    type: string;
    status?: string;
    progress?: number;
    dataset?: string;
    input?: string;
    output?: string;
    preset?: string;
    priority?: number;
    error_message?: string;
    retry_count?: number;
    max_retries?: number;
  }) {
    const result = this.db
      .prepare(
        `INSERT INTO tasks (type, status, progress, dataset, input, output, preset, priority, retry_count, max_retries, error_message)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        task.type,
        task.status || 'pending',
        task.progress || 0,
        task.dataset || null,
        task.input || null,
        task.output || null,
        task.preset || null,
        task.priority || 0,
        task.retry_count || 0,
        task.max_retries || null,
        task.error_message || null,
      );
    return { id: result.lastInsertRowid, ...task };
  }

  deleteTask(id: number) {
    return this.db.prepare('DELETE FROM tasks WHERE id = ?').run(id);
  }
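
  /**
   * Update only the fields present in `updates`, building the SET clause
   * dynamically; updated_at is refreshed whenever anything changes.
   * Returns the task row as it stands after the update.
   */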
  updateTask(
    id: number,
    updates: {
      status?: string;
      progress?: number;
      dataset?: string;
      input?: string;
      output?: string;
      preset?: string;
      priority?: number;
      error_message?: string;
      retry_count?: number;
      max_retries?: number;
    },
  ) {
    const setParts: string[] = [];
    const values: any[] = [];
    if (updates.status !== undefined) {
      setParts.push('status = ?');
      values.push(updates.status);
    }
    if (updates.progress !== undefined) {
      setParts.push('progress = ?');
      values.push(updates.progress);
    }
    if (updates.dataset !== undefined) {
      setParts.push('dataset = ?');
      values.push(updates.dataset);
    }
    if (updates.input !== undefined) {
      setParts.push('input = ?');
      values.push(updates.input);
    }
    if (updates.output !== undefined) {
      setParts.push('output = ?');
      values.push(updates.output);
    }
    if (updates.preset !== undefined) {
      setParts.push('preset = ?');
      values.push(updates.preset);
    }
    if (updates.priority !== undefined) {
      setParts.push('priority = ?');
      values.push(updates.priority);
    }
    if (updates.error_message !== undefined) {
      setParts.push('error_message = ?');
      values.push(updates.error_message);
    }
    if (updates.retry_count !== undefined) {
      setParts.push('retry_count = ?');
      values.push(updates.retry_count);
    }
    if (updates.max_retries !== undefined) {
      setParts.push('max_retries = ?');
      values.push(updates.max_retries);
    }
    if (setParts.length > 0) {
      setParts.push('updated_at = CURRENT_TIMESTAMP');
      values.push(id);
      this.db
        .prepare(`UPDATE tasks SET ${setParts.join(', ')} WHERE id = ?`)
        .run(...values);
    }
    return this.db.prepare('SELECT * FROM tasks WHERE id = ?').get(id);
  }

  // Task maintenance methods
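  /**
   * Delete tasks with the given status, optionally only those created more
   * than `olderThanDays` days ago.
   */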
  deleteTasksByStatus(status: string, olderThanDays?: number) {
    let query = 'DELETE FROM tasks WHERE status = ?';
    const params: any[] = [status];
    if (olderThanDays !== undefined) {
      // Bind the datetime modifier instead of interpolating it into the SQL.
      query += " AND created_at < datetime('now', ?)";
      params.push(`-${olderThanDays} days`);
    }
    return this.db.prepare(query).run(...params);
  }

  deleteTasksOlderThan(days: number) {
    return this.db
      .prepare("DELETE FROM tasks WHERE created_at < datetime('now', ?)")
      .run(`-${days} days`);
  }

  getTaskStats() {
    const stats = this.db
      .prepare(
        `
        SELECT
          status,
          COUNT(*) as count,
          MIN(created_at) as oldest,
          MAX(created_at) as newest
        FROM tasks
        GROUP BY status
        `,
      )
      .all();
    const total = this.db
      .prepare('SELECT COUNT(*) as total FROM tasks')
      .get() as { total: number };
    return {
      total: total.total,
      byStatus: stats,
    };
  }

  archiveOldTasks(daysOld: number = 30): { changes?: number } {
    // Create archive table if it doesn't exist
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS tasks_archive (
        id INTEGER PRIMARY KEY,
        type TEXT NOT NULL,
        status TEXT NOT NULL,
        progress REAL DEFAULT 0,
        dataset TEXT,
        input TEXT,
        output TEXT,
        preset TEXT,
        priority INTEGER DEFAULT 0,
        retry_count INTEGER DEFAULT 0,
        max_retries INTEGER,
        error_message TEXT,
        created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
        archived_at DATETIME DEFAULT CURRENT_TIMESTAMP
      )
    `);
    // Move old completed/failed tasks to archive
    const insertResult = this.db
      .prepare(
        `
        INSERT INTO tasks_archive
        SELECT *, CURRENT_TIMESTAMP as archived_at
        FROM tasks
        WHERE status IN ('completed', 'failed', 'skipped')
          AND created_at < datetime('now', ?)
        `,
      )
      .run(`-${daysOld} days`);
    // Delete the archived tasks from the main table
    this.db
      .prepare(
        `
        DELETE FROM tasks
        WHERE status IN ('completed', 'failed', 'skipped')
          AND created_at < datetime('now', ?)
        `,
      )
      .run(`-${daysOld} days`);
    return { changes: insertResult.changes };
  }

  // Purge all tasks from the database
  purgeAllTasks() {
    const result = this.db.prepare('DELETE FROM tasks').run();
    return result;
  }

  // ============================================================
  // Hash-based duplicate detection methods
  // ============================================================

  /**
   * Store a destination file with its hash and size
   */
  storeDestinationFile(
    dataset: string,
    destinationPath: string,
    hash: string,
    fileSize: number,
  ) {
    // Use destination_path as both input and destination_path for
    // consistency with the indexing worker. This prevents duplicates by
    // using the (dataset, input) primary key.
    const existing = this.db
      .prepare('SELECT * FROM files WHERE dataset = ? AND input = ?')
      .get(dataset, destinationPath) as
      | {
          dataset: string;
          input: string | null;
          output: string | null;
          destination_path: string;
          hash: string | null;
          file_size: number | null;
        }
      | undefined;
    const now = new Date().toISOString();
    if (existing) {
      this.db
        .prepare(
          `UPDATE files
           SET destination_path = ?, hash = ?, file_size = ?, date = ?
           WHERE dataset = ? AND input = ?`,
        )
        .run(destinationPath, hash, fileSize, now, dataset, destinationPath);
    } else {
      // Use destination_path as input for the primary key
      this.db
        .prepare(
          `INSERT INTO files (dataset, input, destination_path, hash, file_size, date, status)
           VALUES (?, ?, ?, ?, ?, ?, 'indexed')`,
        )
        .run(dataset, destinationPath, destinationPath, hash, fileSize, now);
    }
  }

  /**
   * Find duplicate files by hash and size
   */
  findDuplicatesByHash(
    hash: string,
    fileSize: number,
    dataset?: string,
  ): HashedFileRow[] {
    let query = 'SELECT * FROM files WHERE hash = ? AND file_size = ?';
    const params: any[] = [hash, fileSize];
    if (dataset) {
      query += ' AND dataset = ?';
      params.push(dataset);
    }
    return this.db.prepare(query).all(...params) as HashedFileRow[];
  }

  /**
   * Get all duplicates from the view
   */
  getAllDuplicates(dataset?: string) {
    let query = 'SELECT * FROM file_duplicates';
    const params: any[] = [];
    if (dataset) {
      query += ' WHERE dataset = ?';
      params.push(dataset);
    }
    return this.db.prepare(query).all(...params) as Array<{
      hash: string;
      file_size: number;
      dataset: string;
      file_count: number;
      file_paths: string;
    }>;
  }

  /**
   * Get files in a destination that need hash indexing
   */
  getDestinationFilesWithoutHash(dataset: string, destinationPath?: string) {
    let query = `
      SELECT * FROM files
      WHERE dataset = ?
        AND destination_path IS NOT NULL
        AND hash IS NULL
    `;
    const params: any[] = [dataset];
    if (destinationPath) {
      query += ' AND destination_path LIKE ?';
      params.push(`${destinationPath}%`);
    }
    return this.db.prepare(query).all(...params);
  }

  /**
   * Remove all destination file entries (for re-indexing)
   */
  clearDestinationFiles(dataset: string, destinationPath?: string) {
    let query =
      'DELETE FROM files WHERE dataset = ? AND destination_path IS NOT NULL';
    const params: any[] = [dataset];
    if (destinationPath) {
      query += ' AND destination_path LIKE ?';
      params.push(`${destinationPath}%`);
    }
    const result = this.db.prepare(query).run(...params);
    return result.changes;
  }

  /**
   * Get count of indexed destination files
   */
  getDestinationFileCount(dataset: string, destinationPath?: string) {
    let query =
      'SELECT COUNT(*) as count FROM files WHERE dataset = ? AND destination_path IS NOT NULL';
    const params: any[] = [dataset];
    if (destinationPath) {
      query += ' AND destination_path LIKE ?';
      params.push(`${destinationPath}%`);
    }
    const result = this.db.prepare(query).get(...params) as { count: number };
    return result.count;
  }

  /**
   * Update hash and file_size for a file after processing.
   * Used by the handbrake service after encoding completes.
   */
  updateFileHash(
    dataset: string,
    filePath: string,
    hash: string,
    fileSize: number,
  ) {
    const now = new Date().toISOString();
    const stmt = this.db.prepare(`
      UPDATE files
      SET hash = ?, file_size = ?, date = ?
      WHERE dataset = ? AND input = ?
    `);
    return stmt.run(hash, fileSize, now, dataset, filePath);
  }

  /**
   * Remove duplicate file entries.
   * Removes:
   * 1. Legacy duplicates where input = NULL and a matching entry with input = destination_path exists
   * 2. Multiple entries with the same output path (keeps the most recent)
   * 3. Multiple entries with the same destination_path (keeps the most recent)
   */
  removeDuplicateFileEntries() {
    let totalRemoved = 0;
    // 1. Remove legacy duplicates (input IS NULL)
    const deleteLegacyStmt = this.db.prepare(`
      DELETE FROM files
      WHERE input IS NULL
        AND destination_path IS NOT NULL
        AND EXISTS (
          SELECT 1 FROM files f2
          WHERE f2.dataset = files.dataset
            AND f2.input = files.destination_path
            AND f2.destination_path = files.destination_path
        )
    `);
    const legacyResult = deleteLegacyStmt.run();
    totalRemoved += legacyResult.changes;
    // 2. Remove duplicate entries with the same output path, keeping the
    //    highest rowid (i.e. the most recently inserted row).
    const deleteOutputDupsStmt = this.db.prepare(`
      DELETE FROM files
      WHERE rowid NOT IN (
          SELECT MAX(rowid)
          FROM files
          WHERE output IS NOT NULL
          GROUP BY dataset, output
        )
        AND output IS NOT NULL
    `);
    const outputDupsResult = deleteOutputDupsStmt.run();
    totalRemoved += outputDupsResult.changes;
    // 3. Remove duplicate entries with the same destination_path, keeping
    //    the highest rowid.
    const deleteDestDupsStmt = this.db.prepare(`
      DELETE FROM files
      WHERE rowid NOT IN (
          SELECT MAX(rowid)
          FROM files
          WHERE destination_path IS NOT NULL
          GROUP BY dataset, destination_path
        )
        AND destination_path IS NOT NULL
    `);
    const destDupsResult = deleteDestDupsStmt.run();
    totalRemoved += destDupsResult.changes;
    return totalRemoved;
  }
}
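
/*
 * Usage sketch (illustrative, not part of the service). Assumes a NestJS
 * consumer that injects DbService; the worker class name and file paths
 * below are hypothetical.
 *
 *   @Injectable()
 *   export class IndexingWorker {
 *     constructor(private readonly dbService: DbService) {}
 *
 *     recordEncodedFile() {
 *       // Upsert a file record, then attach its hash for duplicate detection.
 *       this.dbService.setFile('movies', '/in/movie.mkv', {
 *         output: '/out/movie.mkv',
 *         status: 'completed',
 *       });
 *       this.dbService.updateFileHash('movies', '/in/movie.mkv', 'abc123', 1024);
 *       // Any rows sharing the same hash + size are duplicate candidates.
 *       const dupes = this.dbService.findDuplicatesByHash('abc123', 1024, 'movies');
 *     }
 *   }
 */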