From 299be21f4291cc196b99b357f51acf137bf8c4fc Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 28 Apr 2026 04:24:39 +0000 Subject: [PATCH] Replace mega-channel size rule with explicit skip list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The >500-video threshold was too aggressive — it skipped tiebreaking for legitimate large channels (1a-auto, forgotten-weapons, etc.) where channel context correctly resolves ties. Replace with an explicit MEGA_CHANNEL_SKIP_LIST in recon_domains.py. Only known non-topical catch-alls (currently just "Transcript") skip the tiebreaker. Removed _channel_video_count() helper and MEGA_CHANNEL_THRESHOLD constant (no longer used). Co-Authored-By: Claude Opus 4.6 --- docs/domain-assignment.md | 11 ++++++++--- lib/domain_assigner.py | 29 ++++++----------------------- lib/recon_domains.py | 12 ++++++++++++ 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/domain-assignment.md b/docs/domain-assignment.md index 8651419..f1f122f 100644 --- a/docs/domain-assignment.md +++ b/docs/domain-assignment.md @@ -30,15 +30,20 @@ For each `tied_pass_1` document: 1. Identify the tied domains from Qdrant 2. Look up the document's channel (`catalogue.category`) -3. **Mega-channel rule:** If channel has >500 videos, skip tiebreaking → `tied_manual` +3. **Skip-list check:** If channel is in `MEGA_CHANNEL_SKIP_LIST` (known non-topical catch-alls), skip tiebreaking → `tied_manual` 4. Query Qdrant for domain counts across all other videos in the same channel (single batch query with `MatchAny` filter) 5. Among the tied domains only, pick the one with the highest channel-wide concept count 6. If resolved → `tied_pass_2` 7. If still tied → `tied_manual` (alphabetical fallback assigned, flagged for review) -### Mega-Channel Rule +### Channel Skip List -Channels with >500 videos (like the "Transcript" catch-all with ~9,200 videos) are not topically coherent. Scanning their concepts produces meaningless aggregate data. These go straight to `tied_manual` for dashboard review. +Certain channels are known non-topical catch-alls where channel-wide concept aggregation produces meaningless noise. These are listed explicitly in `MEGA_CHANNEL_SKIP_LIST` (defined in `lib/recon_domains.py`) and skip tiebreaking entirely — their tied items go straight to `tied_manual` for dashboard review. + +Current skip list: +- `Transcript` — Legacy catch-all (~9,200 videos), no topical coherence + +This is intentionally an explicit list, not a size threshold. Legitimate large channels (e.g., 1a-auto, forgotten-weapons) run the tiebreaker normally because their content is topically coherent. Adding a channel to the skip list requires a code change and a documented reason. ## Status Values diff --git a/lib/domain_assigner.py b/lib/domain_assigner.py index a6f0f47..cbf69a1 100644 --- a/lib/domain_assigner.py +++ b/lib/domain_assigner.py @@ -25,13 +25,11 @@ from collections import Counter from qdrant_client import QdrantClient from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny -from .recon_domains import VALID_DOMAINS, DOMAIN_CATEGORY_MAP +from .recon_domains import VALID_DOMAINS, DOMAIN_CATEGORY_MAP, MEGA_CHANNEL_SKIP_LIST from .utils import setup_logging logger = setup_logging('recon.domain_assigner') -# Channels with more than this many videos skip channel tiebreaking entirely -MEGA_CHANNEL_THRESHOLD = 500 def _get_qdrant_client(config): @@ -222,14 +220,6 @@ def _channel_video_hashes(db, channel_name, exclude_hash=None): return hashes -def _channel_video_count(db, channel_name): - """Count total videos in a channel.""" - conn = db._get_conn() - row = conn.execute( - "SELECT COUNT(*) as cnt FROM catalogue WHERE category = ? AND source = 'stream.echo6.co'", - (channel_name,) - ).fetchone() - return row['cnt'] if row else 0 def run_tiebreaker_pass(db, config, qdrant=None): @@ -241,8 +231,8 @@ def run_tiebreaker_pass(db, config, qdrant=None): other videos in the same channel and picks the tied domain with the highest channel-wide count. - Mega-channels (>500 videos) skip tiebreaking and go straight to - 'tied_manual' for dashboard review. + Channels in MEGA_CHANNEL_SKIP_LIST (known non-topical catch-alls) skip + tiebreaking and go straight to 'tied_manual' for dashboard review. Args: db: StatusDB instance @@ -263,9 +253,6 @@ def run_tiebreaker_pass(db, config, qdrant=None): stats = {'resolved': 0, 'manual': 0, 'skipped': 0, 'errors': 0, 'total': len(tied_items)} logger.info(f"Tiebreaker pass: {len(tied_items)} items to resolve") - # Cache channel sizes to avoid repeated queries - channel_size_cache = {} - for item in tied_items: file_hash = item['hash'] channel = item.get('category', '') @@ -283,16 +270,12 @@ def run_tiebreaker_pass(db, config, qdrant=None): stats['resolved'] += 1 continue - # Check mega-channel rule - if channel not in channel_size_cache: - channel_size_cache[channel] = _channel_video_count(db, channel) - - if channel_size_cache[channel] > MEGA_CHANNEL_THRESHOLD: + # Skip-list check: known non-topical catch-all channels + if channel in MEGA_CHANNEL_SKIP_LIST: fallback = sorted(tied_domains)[0] db.set_domain_assignment(file_hash, fallback, 'tied_manual') stats['manual'] += 1 - logger.debug(f" {file_hash[:12]}: mega-channel '{channel}' " - f"({channel_size_cache[channel]} videos), → tied_manual") + logger.debug(f" {file_hash[:12]}: skip-list channel '{channel}' → tied_manual") continue # Channel tiebreaker: count domains across all other videos in channel diff --git a/lib/recon_domains.py b/lib/recon_domains.py index 350a05c..b1c8b3c 100644 --- a/lib/recon_domains.py +++ b/lib/recon_domains.py @@ -32,3 +32,15 @@ DOMAIN_CATEGORY_MAP = { VALID_DOMAINS = set(DOMAIN_CATEGORY_MAP.keys()) CATEGORY_DOMAIN_MAP = {v: k for k, v in DOMAIN_CATEGORY_MAP.items()} + +# Channels whose tiebreaker is skipped because their content is non-topical +# (catch-alls, miscellany dumps, etc.). Items in these channels with tied +# domain counts go straight to tied_manual without channel-context tiebreaker. +# +# This is intentionally a hardcoded explicit list, not a size threshold. +# Adding a channel here requires an explicit decision — only add channels +# that are genuinely non-topical catch-alls where channel-wide concept +# aggregation would produce meaningless noise. +MEGA_CHANNEL_SKIP_LIST = { + 'Transcript', # Legacy catch-all, ~9,200 videos, no topical coherence +}