From df29d598d377fcc1de81d912a743c292c4689f99 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 14 Apr 2026 22:49:21 +0000 Subject: [PATCH] Phase 6a: transcripts mark organized in-place, skip filing Transcripts are derived text from PeerTube videos, not primary source files. They do not belong in library/Domain/Subdomain/ like PDFs. Change: transcript_processor.pre_flight() now sets organized_at = CURRENT_TIMESTAMP at the end of successful processing, marking the transcript as organized in place. The watch URL remains in catalogue.path and Qdrant download_url so users clicking search results go to the PeerTube video. The filing worker's path LIKE filter naturally excludes transcripts since their documents.path is the watch URL, not a filesystem path. No filing worker changes needed. Back-fills 2,260 drain items from Phase 5c-2 via one-time SQL. Co-Authored-By: Claude Opus 4.6 --- lib/processors/transcript_processor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/processors/transcript_processor.py b/lib/processors/transcript_processor.py index c5d8023..dbc3013 100644 --- a/lib/processors/transcript_processor.py +++ b/lib/processors/transcript_processor.py @@ -149,10 +149,16 @@ def pre_flight(content_path, meta_path, db, config): # Queue and advance to extracted db.queue_document(file_hash) - # Set text_dir and page_count on the documents row + # Set text_dir and page_count on the documents row. + # Transcripts are derived text from PeerTube videos, not primary sources. + # They don't get filed into library/Domain/Subdomain/ like PDFs -- instead, + # they're marked organized in-place. Their watch URL remains in catalogue.path + # and Qdrant download_url so users clicking search results go to PeerTube. + # The filing worker's path LIKE filter naturally excludes transcripts since + # their documents.path is the watch URL, not a filesystem path. conn = db._get_conn() conn.execute( - "UPDATE documents SET text_dir = ?, page_count = ? 
WHERE hash = ?", + "UPDATE documents SET text_dir = ?, page_count = ?, organized_at = CURRENT_TIMESTAMP WHERE hash = ?", (proc_dir, len(pages), file_hash) ) conn.commit()