mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Phase 3: compute_assignment and run_tiebreaker_pass
Domain assigner module with two functions: - compute_assignment(): pass 1 concept domain count, inline per-document - run_tiebreaker_pass(): batch channel-level tiebreaker (pass 2 + pass 3 defensive re-run), mega-channel rule (>500 videos skip to tied_manual) Filters legacy domains (Sustainment Systems, Off-Grid Systems, Defense & Tactics) from concept counts automatically. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5f36c52fb1
commit
8ab1f8c82f
1 changed files with 284 additions and 0 deletions
284
lib/domain_assigner.py
Normal file
284
lib/domain_assigner.py
Normal file
|
|
@ -0,0 +1,284 @@
|
||||||
|
"""
|
||||||
|
RECON Domain Assigner
|
||||||
|
|
||||||
|
Computes per-video domain assignments from concept extraction results.
|
||||||
|
Two functions, two execution modes:
|
||||||
|
|
||||||
|
compute_assignment() — pass 1, inline from post-embed hook
|
||||||
|
run_tiebreaker_pass() — batch, resolves ties via channel concept scan
|
||||||
|
|
||||||
|
Status values written to documents.recon_domain_status:
|
||||||
|
assigned — clear winner from pass 1 concept count
|
||||||
|
tied_pass_1 — concept tie, awaiting channel tiebreaker
|
||||||
|
tied_pass_2 — resolved by channel tiebreaker
|
||||||
|
tied_manual — needs human review (dashboard)
|
||||||
|
needs_reprocess — missing concepts or only legacy domains
|
||||||
|
manual_assigned — human override from dashboard
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from .recon_domains import VALID_DOMAINS, DOMAIN_CATEGORY_MAP
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
logger = setup_logging('recon.domain_assigner')
|
||||||
|
|
||||||
|
# Channels with more than this many videos skip channel tiebreaking entirely
|
||||||
|
MEGA_CHANNEL_THRESHOLD = 500
|
||||||
|
|
||||||
|
|
||||||
|
def _count_concept_domains(concepts_dir, file_hash):
|
||||||
|
"""Read concept files and count valid domain occurrences.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
concepts_dir: Base concepts directory (e.g. /opt/recon/data/concepts)
|
||||||
|
file_hash: Document hash
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Counter of {domain_name: count} for valid domains only,
|
||||||
|
or None if no concept directory exists.
|
||||||
|
"""
|
||||||
|
doc_concepts_dir = os.path.join(concepts_dir, file_hash)
|
||||||
|
if not os.path.isdir(doc_concepts_dir):
|
||||||
|
return None
|
||||||
|
|
||||||
|
domain_counter = Counter()
|
||||||
|
|
||||||
|
for fname in os.listdir(doc_concepts_dir):
|
||||||
|
if not fname.startswith('window_') or not fname.endswith('.json'):
|
||||||
|
continue
|
||||||
|
fpath = os.path.join(doc_concepts_dir, fname)
|
||||||
|
try:
|
||||||
|
with open(fpath, 'r') as f:
|
||||||
|
concepts = json.load(f)
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not isinstance(concepts, list):
|
||||||
|
continue
|
||||||
|
|
||||||
|
for concept in concepts:
|
||||||
|
if not isinstance(concept, dict):
|
||||||
|
continue
|
||||||
|
dom = concept.get('domain')
|
||||||
|
if isinstance(dom, str) and dom in VALID_DOMAINS:
|
||||||
|
domain_counter[dom] += 1
|
||||||
|
elif isinstance(dom, list):
|
||||||
|
for d in dom:
|
||||||
|
if isinstance(d, str) and d in VALID_DOMAINS:
|
||||||
|
domain_counter[d] += 1
|
||||||
|
|
||||||
|
return domain_counter
|
||||||
|
|
||||||
|
|
||||||
|
def compute_assignment(file_hash, db, config):
|
||||||
|
"""Compute domain assignment for a single document (pass 1).
|
||||||
|
|
||||||
|
Counts domain occurrences across all concepts. If a single domain
|
||||||
|
wins, assigns it. If tied, defers to batch tiebreaker.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_hash: Document hash
|
||||||
|
db: StatusDB instance
|
||||||
|
config: RECON config dict
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(domain, status) tuple where domain is a string or None,
|
||||||
|
and status is one of: 'assigned', 'tied_pass_1', 'needs_reprocess'
|
||||||
|
"""
|
||||||
|
concepts_dir = config['paths']['concepts']
|
||||||
|
domain_counter = _count_concept_domains(concepts_dir, file_hash)
|
||||||
|
|
||||||
|
if domain_counter is None or len(domain_counter) == 0:
|
||||||
|
return (None, 'needs_reprocess')
|
||||||
|
|
||||||
|
top = domain_counter.most_common(2)
|
||||||
|
top_domain = top[0][0]
|
||||||
|
top_count = top[0][1]
|
||||||
|
|
||||||
|
if len(top) == 1 or top[1][1] < top_count:
|
||||||
|
return (top_domain, 'assigned')
|
||||||
|
|
||||||
|
# Tie — defer to tiebreaker pass
|
||||||
|
return (None, 'tied_pass_1')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_tied_domains(concepts_dir, file_hash):
|
||||||
|
"""Get the set of domains tied for first place in a document's concepts."""
|
||||||
|
domain_counter = _count_concept_domains(concepts_dir, file_hash)
|
||||||
|
if not domain_counter:
|
||||||
|
return []
|
||||||
|
|
||||||
|
top = domain_counter.most_common()
|
||||||
|
if not top:
|
||||||
|
return []
|
||||||
|
|
||||||
|
max_count = top[0][1]
|
||||||
|
return [dom for dom, cnt in top if cnt == max_count]
|
||||||
|
|
||||||
|
|
||||||
|
def _channel_video_hashes(db, channel_name, exclude_hash=None):
|
||||||
|
"""Get all document hashes belonging to a PeerTube channel.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
db: StatusDB instance
|
||||||
|
channel_name: catalogue.category (channel actor name)
|
||||||
|
exclude_hash: Hash to exclude (the document being resolved)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of document hashes
|
||||||
|
"""
|
||||||
|
conn = db._get_conn()
|
||||||
|
rows = conn.execute(
|
||||||
|
"SELECT hash FROM catalogue WHERE category = ? AND source = 'stream.echo6.co'",
|
||||||
|
(channel_name,)
|
||||||
|
).fetchall()
|
||||||
|
hashes = [r['hash'] for r in rows]
|
||||||
|
if exclude_hash:
|
||||||
|
hashes = [h for h in hashes if h != exclude_hash]
|
||||||
|
return hashes
|
||||||
|
|
||||||
|
|
||||||
|
def _channel_video_count(db, channel_name):
|
||||||
|
"""Count total videos in a channel."""
|
||||||
|
conn = db._get_conn()
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT COUNT(*) as cnt FROM catalogue WHERE category = ? AND source = 'stream.echo6.co'",
|
||||||
|
(channel_name,)
|
||||||
|
).fetchone()
|
||||||
|
return row['cnt'] if row else 0
|
||||||
|
|
||||||
|
|
||||||
|
def run_tiebreaker_pass(db, config):
|
||||||
|
"""Resolve tied domain assignments using channel-level concept analysis.
|
||||||
|
|
||||||
|
Processes all documents where recon_domain_status = 'tied_pass_1'.
|
||||||
|
|
||||||
|
Pass 2: For each tied document, reads concept files from all other
|
||||||
|
videos in the same channel and picks the tied domain with the highest
|
||||||
|
channel-wide count.
|
||||||
|
|
||||||
|
Pass 3 (defensive re-run): Re-reads the same channel concept files a
|
||||||
|
second time with identical logic. This catches concept-file changes
|
||||||
|
that occurred mid-run (e.g. concurrent enrichment writing new windows).
|
||||||
|
In steady state pass 3 produces the same result as pass 2, but under
|
||||||
|
concurrent writes it can resolve a tie that pass 2 missed.
|
||||||
|
|
||||||
|
Mega-channels (>500 videos) skip both passes and go straight to
|
||||||
|
'tied_manual' for dashboard review.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
db: StatusDB instance
|
||||||
|
config: RECON config dict
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with counts: resolved, manual, skipped, errors
|
||||||
|
"""
|
||||||
|
concepts_dir = config['paths']['concepts']
|
||||||
|
tied_items = db.get_items_by_domain_status('tied_pass_1')
|
||||||
|
|
||||||
|
stats = {'resolved': 0, 'manual': 0, 'skipped': 0, 'errors': 0, 'total': len(tied_items)}
|
||||||
|
logger.info(f"Tiebreaker pass: {len(tied_items)} items to resolve")
|
||||||
|
|
||||||
|
# Cache channel sizes to avoid repeated queries
|
||||||
|
channel_size_cache = {}
|
||||||
|
|
||||||
|
for item in tied_items:
|
||||||
|
file_hash = item['hash']
|
||||||
|
channel = item.get('category', '')
|
||||||
|
|
||||||
|
try:
|
||||||
|
tied_domains = _get_tied_domains(concepts_dir, file_hash)
|
||||||
|
if not tied_domains:
|
||||||
|
db.set_domain_assignment(file_hash, None, 'needs_reprocess')
|
||||||
|
stats['skipped'] += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if len(tied_domains) == 1:
|
||||||
|
# No longer tied (possibly re-enriched since pass 1)
|
||||||
|
db.set_domain_assignment(file_hash, tied_domains[0], 'assigned')
|
||||||
|
stats['resolved'] += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check mega-channel rule
|
||||||
|
if channel not in channel_size_cache:
|
||||||
|
channel_size_cache[channel] = _channel_video_count(db, channel)
|
||||||
|
|
||||||
|
if channel_size_cache[channel] > MEGA_CHANNEL_THRESHOLD:
|
||||||
|
fallback = sorted(tied_domains)[0]
|
||||||
|
db.set_domain_assignment(file_hash, fallback, 'tied_manual')
|
||||||
|
stats['manual'] += 1
|
||||||
|
logger.debug(f" {file_hash[:12]}: mega-channel '{channel}' "
|
||||||
|
f"({channel_size_cache[channel]} videos), → tied_manual")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Channel tiebreaker: count domains across all other videos in channel
|
||||||
|
other_hashes = _channel_video_hashes(db, channel, exclude_hash=file_hash)
|
||||||
|
channel_domain_counts = Counter()
|
||||||
|
|
||||||
|
for other_hash in other_hashes:
|
||||||
|
other_counts = _count_concept_domains(concepts_dir, other_hash)
|
||||||
|
if other_counts:
|
||||||
|
channel_domain_counts.update(other_counts)
|
||||||
|
|
||||||
|
# Among tied domains only, pick highest channel-wide count
|
||||||
|
best_domain = None
|
||||||
|
best_count = -1
|
||||||
|
for dom in tied_domains:
|
||||||
|
c = channel_domain_counts.get(dom, 0)
|
||||||
|
if c > best_count:
|
||||||
|
best_count = c
|
||||||
|
best_domain = dom
|
||||||
|
|
||||||
|
# Pass 2: check if channel tiebreaker resolved it
|
||||||
|
tied_at_channel = [d for d in tied_domains
|
||||||
|
if channel_domain_counts.get(d, 0) == best_count]
|
||||||
|
|
||||||
|
if len(tied_at_channel) == 1:
|
||||||
|
db.set_domain_assignment(file_hash, best_domain, 'tied_pass_2')
|
||||||
|
stats['resolved'] += 1
|
||||||
|
logger.debug(f" {file_hash[:12]}: resolved → {best_domain} (pass 2 channel tiebreaker)")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Pass 3: defensive re-run — re-count channel concepts to catch
|
||||||
|
# concept-file changes that occurred mid-run. Identical logic to
|
||||||
|
# pass 2; resolves races where files were written between the
|
||||||
|
# two reads.
|
||||||
|
channel_domain_counts_p3 = Counter()
|
||||||
|
for other_hash in other_hashes:
|
||||||
|
other_counts = _count_concept_domains(concepts_dir, other_hash)
|
||||||
|
if other_counts:
|
||||||
|
channel_domain_counts_p3.update(other_counts)
|
||||||
|
|
||||||
|
best_domain_p3 = None
|
||||||
|
best_count_p3 = -1
|
||||||
|
for dom in tied_domains:
|
||||||
|
c = channel_domain_counts_p3.get(dom, 0)
|
||||||
|
if c > best_count_p3:
|
||||||
|
best_count_p3 = c
|
||||||
|
best_domain_p3 = dom
|
||||||
|
|
||||||
|
tied_at_p3 = [d for d in tied_domains
|
||||||
|
if channel_domain_counts_p3.get(d, 0) == best_count_p3]
|
||||||
|
|
||||||
|
if len(tied_at_p3) == 1:
|
||||||
|
db.set_domain_assignment(file_hash, best_domain_p3, 'tied_pass_2')
|
||||||
|
stats['resolved'] += 1
|
||||||
|
logger.debug(f" {file_hash[:12]}: resolved → {best_domain_p3} (pass 3 defensive re-run)")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Still tied after pass 3 — mark for manual review
|
||||||
|
fallback = sorted(tied_domains)[0]
|
||||||
|
db.set_domain_assignment(file_hash, fallback, 'tied_manual')
|
||||||
|
stats['manual'] += 1
|
||||||
|
logger.debug(f" {file_hash[:12]}: still tied after pass 3, → tied_manual")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f" Tiebreaker error for {file_hash[:12]}: {e}")
|
||||||
|
stats['errors'] += 1
|
||||||
|
|
||||||
|
logger.info(f"Tiebreaker complete: {stats['resolved']} resolved, "
|
||||||
|
f"{stats['manual']} manual, {stats['skipped']} skipped, "
|
||||||
|
f"{stats['errors']} errors")
|
||||||
|
return stats
|
||||||
Loading…
Add table
Add a link
Reference in a new issue