2026-01-06T19-47-58_add_hash_and_destination_tracking.sql 1.2 KB

12345678910111213141516171819202122232425262728
  1. -- Migration: add_hash_and_destination_tracking
  2. -- Created at: 2026-01-06T19:47:58.000Z
  3. -- Add hash and file_size columns to files table for duplicate detection optimization
  4. ALTER TABLE files ADD COLUMN hash TEXT;
  5. ALTER TABLE files ADD COLUMN file_size INTEGER;
  6. -- Add destination_path column to track files in destination locations (vs source files)
  7. -- This helps us separate files that are being watched from files in destinations
  8. ALTER TABLE files ADD COLUMN destination_path TEXT;
  9. -- Create indexes for fast duplicate lookups
  10. CREATE INDEX IF NOT EXISTS idx_files_hash ON files(hash) WHERE hash IS NOT NULL;
  11. CREATE INDEX IF NOT EXISTS idx_files_hash_size ON files(hash, file_size) WHERE hash IS NOT NULL;
  12. CREATE INDEX IF NOT EXISTS idx_files_destination ON files(destination_path) WHERE destination_path IS NOT NULL;
  13. -- Create a view for easy duplicate detection
  14. CREATE VIEW IF NOT EXISTS file_duplicates AS
  15. SELECT
  16. hash,
  17. file_size,
  18. dataset,
  19. COUNT(*) as file_count,
  20. GROUP_CONCAT(CASE WHEN destination_path IS NOT NULL THEN destination_path ELSE input END, '|||') as file_paths
  21. FROM files
  22. WHERE hash IS NOT NULL
  23. GROUP BY hash, file_size, dataset
  24. HAVING COUNT(*) > 1;