-- Migration: add_hash_and_destination_tracking -- Created at: 2026-01-06T19:47:58.000Z -- Add hash and file_size columns to files table for duplicate detection optimization ALTER TABLE files ADD COLUMN hash TEXT; ALTER TABLE files ADD COLUMN file_size INTEGER; -- Add destination_path column to track files in destination locations (vs source files) -- This helps us separate files that are being watched from files in destinations ALTER TABLE files ADD COLUMN destination_path TEXT; -- Create indexes for fast duplicate lookups CREATE INDEX IF NOT EXISTS idx_files_hash ON files(hash) WHERE hash IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_files_hash_size ON files(hash, file_size) WHERE hash IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_files_destination ON files(destination_path) WHERE destination_path IS NOT NULL; -- Create a view for easy duplicate detection CREATE VIEW IF NOT EXISTS file_duplicates AS SELECT hash, file_size, dataset, COUNT(*) as file_count, GROUP_CONCAT(CASE WHEN destination_path IS NOT NULL THEN destination_path ELSE input END, '|||') as file_paths FROM files WHERE hash IS NOT NULL GROUP BY hash, file_size, dataset HAVING COUNT(*) > 1;