mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Phase 6a: mark transcripts organized in-place, skip filing
Transcripts are derived text from PeerTube videos, not primary source files. They do not belong in library/Domain/Subdomain/ like PDFs. Change: transcript_processor.pre_flight() now sets organized_at = CURRENT_TIMESTAMP at the end of successful processing, marking the transcript as organized in place. The watch URL remains in catalogue.path and Qdrant download_url so users clicking search results go to the PeerTube video. The filing worker's path LIKE filter naturally excludes transcripts since their documents.path is the watch URL, not a filesystem path. No filing worker changes needed. Back-fills 2,260 drain items from Phase 5c-2 via one-time SQL. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9fa60f9c86
commit
df29d598d3
1 changed file with 8 additions and 2 deletions
|
|
@@ -149,10 +149,16 @@ def pre_flight(content_path, meta_path, db, config):
|
||||||
# Queue and advance to extracted
|
# Queue and advance to extracted
|
||||||
db.queue_document(file_hash)
|
db.queue_document(file_hash)
|
||||||
|
|
||||||
# Set text_dir and page_count on the documents row
|
# Set text_dir and page_count on the documents row.
|
||||||
|
# Transcripts are derived text from PeerTube videos, not primary sources.
|
||||||
|
# They don't get filed into library/Domain/Subdomain/ like PDFs -- instead,
|
||||||
|
# they're marked organized in-place. Their watch URL remains in catalogue.path
|
||||||
|
# and Qdrant download_url so users clicking search results go to PeerTube.
|
||||||
|
# The filing worker's path LIKE filter naturally excludes transcripts since
|
||||||
|
# their documents.path is the watch URL, not a filesystem path.
|
||||||
conn = db._get_conn()
|
conn = db._get_conn()
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"UPDATE documents SET text_dir = ?, page_count = ? WHERE hash = ?",
|
"UPDATE documents SET text_dir = ?, page_count = ?, organized_at = CURRENT_TIMESTAMP WHERE hash = ?",
|
||||||
(proc_dir, len(pages), file_hash)
|
(proc_dir, len(pages), file_hash)
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue