Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-14 14:57:23 +00:00
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions

508
lib/status.py Normal file
View file

@ -0,0 +1,508 @@
"""
RECON Status Tracker
SQLite operations for catalogue and documents tables. WAL mode, thread-local connections.
Status flow: catalogued -> queued -> extracting -> extracted -> enriching -> enriched -> embedding -> complete.
Config: paths.db
"""
import os
import sqlite3
import threading
from datetime import datetime, timezone
from .utils import get_config
_local = threading.local()
class StatusDB:
def __init__(self, db_path=None):
if db_path is None:
db_path = get_config()['paths']['db']
self.db_path = db_path
os.makedirs(os.path.dirname(db_path), exist_ok=True)
self._init_db()
def _get_conn(self):
if not hasattr(_local, 'conn') or _local.conn is None:
_local.conn = sqlite3.connect(self.db_path, timeout=30)
_local.conn.row_factory = sqlite3.Row
_local.conn.execute("PRAGMA journal_mode=WAL")
_local.conn.execute("PRAGMA busy_timeout=5000")
return _local.conn
def _init_db(self):
conn = self._get_conn()
conn.executescript("""
CREATE TABLE IF NOT EXISTS catalogue (
hash TEXT PRIMARY KEY,
filename TEXT NOT NULL,
path TEXT NOT NULL,
size_bytes INTEGER,
source TEXT,
category TEXT,
status TEXT DEFAULT 'catalogued',
discovered_at TEXT DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS documents (
hash TEXT PRIMARY KEY,
filename TEXT NOT NULL,
path TEXT,
size_bytes INTEGER,
page_count INTEGER,
book_title TEXT,
book_author TEXT,
collection TEXT DEFAULT 'survival',
status TEXT DEFAULT 'pending',
pages_extracted INTEGER DEFAULT 0,
concepts_extracted INTEGER DEFAULT 0,
vectors_inserted INTEGER DEFAULT 0,
discovered_at TEXT DEFAULT CURRENT_TIMESTAMP,
extracted_at TEXT,
enriched_at TEXT,
embedded_at TEXT,
error_message TEXT,
retry_count INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS intel (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT,
timestamp TEXT,
region TEXT,
category TEXT,
content TEXT,
summary TEXT,
key_facts TEXT,
credibility_score REAL,
verification_status TEXT,
vector_id INTEGER,
ingested_at TEXT DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS metrics_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT NOT NULL,
metric_type TEXT NOT NULL,
data TEXT NOT NULL,
UNIQUE(timestamp, metric_type)
);
CREATE INDEX IF NOT EXISTS idx_catalogue_status ON catalogue(status);
CREATE INDEX IF NOT EXISTS idx_catalogue_source ON catalogue(source);
CREATE INDEX IF NOT EXISTS idx_documents_status ON documents(status);
""")
# Migration: add path_updated_at column if missing
try:
conn.execute("ALTER TABLE catalogue ADD COLUMN path_updated_at TEXT")
except Exception:
pass # column already exists
# Migration: add organized_at column to documents if missing
try:
conn.execute("ALTER TABLE documents ADD COLUMN organized_at TEXT")
except Exception:
pass # column already exists
# Stream B: file_operations + duplicate_review tables
conn.executescript("""
CREATE TABLE IF NOT EXISTS file_operations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
doc_hash TEXT NOT NULL,
operation TEXT NOT NULL,
source_path TEXT NOT NULL,
target_path TEXT NOT NULL,
source_filename TEXT NOT NULL,
target_filename TEXT NOT NULL,
original_filename TEXT,
collision_step INTEGER,
qdrant_points_updated INTEGER DEFAULT 0,
performed_at TEXT DEFAULT CURRENT_TIMESTAMP,
reversed_at TEXT,
notes TEXT
);
CREATE INDEX IF NOT EXISTS idx_fileops_hash ON file_operations(doc_hash);
CREATE TABLE IF NOT EXISTS duplicate_review (
id INTEGER PRIMARY KEY AUTOINCREMENT,
doc_hash TEXT NOT NULL,
original_filename TEXT NOT NULL,
sanitized_filename TEXT NOT NULL,
collision_with_hash TEXT,
collision_path TEXT,
duplicate_path TEXT NOT NULL,
domain TEXT,
subdomain TEXT,
book_author TEXT,
book_title TEXT,
status TEXT DEFAULT 'pending',
resolution TEXT,
discovered_at TEXT DEFAULT CURRENT_TIMESTAMP,
resolved_at TEXT
);
CREATE INDEX IF NOT EXISTS idx_dupreview_status ON duplicate_review(status);
""")
conn.commit()
def add_to_catalogue(self, file_hash, filename, path, size_bytes, source, category):
conn = self._get_conn()
conn.execute(
"""INSERT INTO catalogue (hash, filename, path, size_bytes, source, category)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT(hash) DO UPDATE SET
path = excluded.path,
filename = excluded.filename,
source = excluded.source,
category = excluded.category,
path_updated_at = CASE
WHEN catalogue.path != excluded.path THEN CURRENT_TIMESTAMP
ELSE catalogue.path_updated_at
END""",
(file_hash, filename, path, size_bytes, source, category)
)
conn.commit()
def queue_document(self, file_hash):
conn = self._get_conn()
row = conn.execute("SELECT * FROM catalogue WHERE hash = ?", (file_hash,)).fetchone()
if not row:
return False
conn.execute("UPDATE catalogue SET status = 'queued' WHERE hash = ?", (file_hash,))
conn.execute(
"""INSERT INTO documents (hash, filename, path, size_bytes, status)
VALUES (?, ?, ?, ?, 'queued')
ON CONFLICT(hash) DO UPDATE SET
path = excluded.path,
filename = excluded.filename""",
(row['hash'], row['filename'], row['path'], row['size_bytes'])
)
conn.commit()
return True
def update_status(self, file_hash, status, **kwargs):
conn = self._get_conn()
sets = ["status = ?"]
vals = [status]
ts_field = {
'extracted': 'extracted_at',
'enriched': 'enriched_at',
'complete': 'embedded_at',
}.get(status)
if ts_field:
sets.append(f"{ts_field} = ?")
vals.append(datetime.now(timezone.utc).isoformat())
for k, v in kwargs.items():
sets.append(f"{k} = ?")
vals.append(v)
vals.append(file_hash)
conn.execute(f"UPDATE documents SET {', '.join(sets)} WHERE hash = ?", vals)
conn.commit()
def get_by_status(self, status, limit=None):
conn = self._get_conn()
q = "SELECT * FROM documents WHERE status = ? ORDER BY discovered_at"
if limit:
q += f" LIMIT {int(limit)}"
return [dict(r) for r in conn.execute(q, (status,)).fetchall()]
def get_catalogued(self, source=None, category=None, limit=None):
conn = self._get_conn()
q = "SELECT * FROM catalogue WHERE status = 'catalogued'"
params = []
if source:
q += " AND source = ?"
params.append(source)
if category:
q += " AND category = ?"
params.append(category)
q += " ORDER BY discovered_at"
if limit:
q += f" LIMIT {int(limit)}"
return [dict(r) for r in conn.execute(q, params).fetchall()]
def get_document(self, file_hash):
conn = self._get_conn()
row = conn.execute("SELECT * FROM documents WHERE hash = ?", (file_hash,)).fetchone()
return dict(row) if row else None
def get_status_counts(self):
conn = self._get_conn()
cat_counts = {}
for row in conn.execute("SELECT status, COUNT(*) as cnt FROM catalogue GROUP BY status"):
cat_counts[row['status']] = row['cnt']
doc_counts = {}
for row in conn.execute("SELECT status, COUNT(*) as cnt FROM documents GROUP BY status"):
doc_counts[row['status']] = row['cnt']
return {'catalogue': cat_counts, 'documents': doc_counts}
def get_failures(self):
conn = self._get_conn()
return [dict(r) for r in conn.execute(
"SELECT * FROM documents WHERE status = 'failed' ORDER BY discovered_at"
).fetchall()]
def mark_failed(self, file_hash, error_msg):
conn = self._get_conn()
conn.execute(
"UPDATE documents SET status = 'failed', error_message = ? WHERE hash = ?",
(str(error_msg)[:1000], file_hash)
)
conn.commit()
def increment_retry(self, file_hash):
conn = self._get_conn()
conn.execute(
"UPDATE documents SET retry_count = retry_count + 1, status = 'queued', error_message = NULL WHERE hash = ?",
(file_hash,)
)
conn.commit()
def get_sources(self):
conn = self._get_conn()
return [r[0] for r in conn.execute(
"SELECT DISTINCT source FROM catalogue ORDER BY source"
).fetchall()]
def get_categories(self, source=None):
conn = self._get_conn()
if source:
return [r[0] for r in conn.execute(
"SELECT DISTINCT category FROM catalogue WHERE source = ? ORDER BY category", (source,)
).fetchall()]
return [r[0] for r in conn.execute(
"SELECT DISTINCT category FROM catalogue ORDER BY category"
).fetchall()]
def get_all_documents(self, status=None, source=None, category=None, limit=None, offset=None):
conn = self._get_conn()
q = """SELECT d.*, c.source, c.category FROM documents d
LEFT JOIN catalogue c ON d.hash = c.hash WHERE 1=1"""
params = []
if status:
q += " AND d.status = ?"
params.append(status)
if source:
q += " AND c.source = ?"
params.append(source)
if category:
q += " AND c.category = ?"
params.append(category)
q += " ORDER BY d.discovered_at DESC"
if limit:
q += f" LIMIT {int(limit)}"
if offset:
q += f" OFFSET {int(offset)}"
return [dict(r) for r in conn.execute(q, params).fetchall()]
def count_documents(self, source=None, category=None):
"""Count documents matching optional source/category filters."""
conn = self._get_conn()
q = """SELECT COUNT(*) FROM documents d
LEFT JOIN catalogue c ON d.hash = c.hash WHERE 1=1"""
params = []
if source:
q += " AND c.source = ?"
params.append(source)
if category:
q += " AND c.category = ?"
params.append(category)
return conn.execute(q, params).fetchone()[0]
def catalogue_count(self):
conn = self._get_conn()
return conn.execute("SELECT COUNT(*) FROM catalogue").fetchone()[0]
def source_breakdown(self):
conn = self._get_conn()
return [dict(r) for r in conn.execute(
"SELECT source, COUNT(*) as count, SUM(size_bytes) as total_bytes FROM catalogue GROUP BY source ORDER BY count DESC"
).fetchall()]
def category_breakdown(self, source=None):
conn = self._get_conn()
if source:
return [dict(r) for r in conn.execute(
"SELECT category, COUNT(*) as count FROM catalogue WHERE source = ? GROUP BY category ORDER BY count DESC",
(source,)
).fetchall()]
return [dict(r) for r in conn.execute(
"SELECT source, category, COUNT(*) as count FROM catalogue GROUP BY source, category ORDER BY source, count DESC"
).fetchall()]
def get_path_updates(self):
"""Get catalogue entries where path was updated since last sync."""
conn = self._get_conn()
return [dict(r) for r in conn.execute(
"SELECT hash, filename, path, source, category FROM catalogue "
"WHERE path_updated_at IS NOT NULL"
).fetchall()]
def clear_path_update(self, file_hash):
"""Clear path_updated_at flag after Qdrant sync."""
conn = self._get_conn()
conn.execute(
"UPDATE catalogue SET path_updated_at = NULL WHERE hash = ?",
(file_hash,)
)
conn.commit()
def sync_document_path(self, file_hash, path, filename):
"""Update path and filename in documents table."""
conn = self._get_conn()
conn.execute(
"UPDATE documents SET path = ?, filename = ? WHERE hash = ?",
(path, filename, file_hash)
)
conn.commit()
def status_breakdown(self):
conn = self._get_conn()
rows = conn.execute(
"SELECT status, COUNT(*) as count FROM catalogue GROUP BY status ORDER BY count DESC"
).fetchall()
return [dict(r) for r in rows]
def get_unorganized(self, limit=None):
"""Get completed documents that haven't been organized yet."""
conn = self._get_conn()
q = "SELECT hash, filename, path FROM documents WHERE status = 'complete' AND organized_at IS NULL ORDER BY embedded_at"
if limit:
q += " LIMIT {}".format(int(limit))
return [dict(r) for r in conn.execute(q).fetchall()]
def get_ingest_pending(self, ingest_dir, limit=50):
"""Get completed docs in _ingest/ that haven't been organized."""
conn = self._get_conn()
pattern = ingest_dir + '%'
return [dict(r) for r in conn.execute(
"SELECT hash, filename, path FROM documents "
"WHERE status = 'complete' AND organized_at IS NULL AND path LIKE ? "
"ORDER BY embedded_at LIMIT ?",
(pattern, limit)
).fetchall()]
def mark_organized(self, file_hash):
"""Mark a document as organized (sets organized_at timestamp)."""
conn = self._get_conn()
conn.execute(
"UPDATE documents SET organized_at = CURRENT_TIMESTAMP WHERE hash = ?",
(file_hash,)
)
conn.commit()
def update_catalogue_path(self, file_hash, new_path, new_filename):
"""Update catalogue path/filename and flag for Qdrant sync."""
conn = self._get_conn()
conn.execute(
"UPDATE catalogue SET path = ?, filename = ?, path_updated_at = CURRENT_TIMESTAMP WHERE hash = ?",
(new_path, new_filename, file_hash)
)
conn.commit()
# ── Stream B: File Operations ───────────────────────────────────
def log_file_operation(self, doc_hash, operation, source_path, target_path,
source_filename, target_filename, original_filename=None,
collision_step=None, qdrant_points_updated=0, notes=None):
"""Log a file move/rename operation for audit trail and rollback."""
conn = self._get_conn()
conn.execute(
"""INSERT INTO file_operations
(doc_hash, operation, source_path, target_path,
source_filename, target_filename, original_filename,
collision_step, qdrant_points_updated, notes)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(doc_hash, operation, source_path, target_path,
source_filename, target_filename, original_filename,
collision_step, qdrant_points_updated, notes)
)
conn.commit()
return conn.execute("SELECT last_insert_rowid()").fetchone()[0]
def get_file_operations(self, doc_hash=None, limit=50):
"""Get file operations, optionally filtered by doc_hash."""
conn = self._get_conn()
if doc_hash:
return [dict(r) for r in conn.execute(
"SELECT * FROM file_operations WHERE doc_hash = ? ORDER BY performed_at DESC LIMIT ?",
(doc_hash, limit)
).fetchall()]
return [dict(r) for r in conn.execute(
"SELECT * FROM file_operations WHERE reversed_at IS NULL ORDER BY performed_at DESC LIMIT ?",
(limit,)
).fetchall()]
def get_file_operation(self, op_id):
"""Get a single file operation by ID."""
conn = self._get_conn()
row = conn.execute("SELECT * FROM file_operations WHERE id = ?", (op_id,)).fetchone()
return dict(row) if row else None
def mark_operation_reversed(self, op_id):
"""Mark a file operation as reversed."""
conn = self._get_conn()
conn.execute(
"UPDATE file_operations SET reversed_at = CURRENT_TIMESTAMP WHERE id = ?",
(op_id,)
)
conn.commit()
def queue_duplicate_review(self, doc_hash, original_filename, sanitized_filename,
collision_with_hash=None, collision_path=None,
duplicate_path='', domain=None, subdomain=None,
book_author=None, book_title=None):
"""Queue a file for human duplicate review."""
conn = self._get_conn()
conn.execute(
"""INSERT INTO duplicate_review
(doc_hash, original_filename, sanitized_filename,
collision_with_hash, collision_path, duplicate_path,
domain, subdomain, book_author, book_title)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(doc_hash, original_filename, sanitized_filename,
collision_with_hash, collision_path, duplicate_path,
domain, subdomain, book_author, book_title)
)
conn.commit()
def get_duplicate_reviews(self, status='pending', limit=50):
"""Get duplicate review queue."""
conn = self._get_conn()
return [dict(r) for r in conn.execute(
"SELECT * FROM duplicate_review WHERE status = ? ORDER BY discovered_at DESC LIMIT ?",
(status, limit)
).fetchall()]
def get_pipeline_stats(self):
"""Get Stream B pipeline statistics."""
conn = self._get_conn()
ops = conn.execute(
"SELECT operation, COUNT(*) as cnt FROM file_operations WHERE reversed_at IS NULL GROUP BY operation"
).fetchall()
dupes = conn.execute(
"SELECT status, COUNT(*) as cnt FROM duplicate_review GROUP BY status"
).fetchall()
acquired = 0
ingest = 0
try:
acquired_dir = get_config().get('new_pipeline', {}).get('acquired_dir', '')
ingest_dir = get_config().get('new_pipeline', {}).get('ingest_dir', '')
if acquired_dir and os.path.isdir(acquired_dir):
acquired = len([f for f in os.listdir(acquired_dir) if f.lower().endswith('.pdf')])
if ingest_dir and os.path.isdir(ingest_dir):
ingest = len([f for f in os.listdir(ingest_dir) if f.lower().endswith('.pdf')])
except Exception:
pass
return {
'operations': {dict(r)['operation']: dict(r)['cnt'] for r in ops},
'duplicates': {dict(r)['status']: dict(r)['cnt'] for r in dupes},
'acquired_pending': acquired,
'ingest_pending': ingest,
}