Replace mega-channel size rule with explicit skip list

The >500-video threshold was too aggressive — it skipped tiebreaking
for legitimate large channels (1a-auto, forgotten-weapons, etc.) where
channel context correctly resolves ties. Replace with an explicit
MEGA_CHANNEL_SKIP_LIST in recon_domains.py. Only known non-topical
catch-alls (currently just "Transcript") skip the tiebreaker.

Removed _channel_video_count() helper and MEGA_CHANNEL_THRESHOLD
constant (no longer used).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-28 04:24:39 +00:00
commit 299be21f42
3 changed files with 26 additions and 26 deletions

View file

@@ -30,15 +30,20 @@ For each `tied_pass_1` document:
1. Identify the tied domains from Qdrant 1. Identify the tied domains from Qdrant
2. Look up the document's channel (`catalogue.category`) 2. Look up the document's channel (`catalogue.category`)
3. **Mega-channel rule:** If channel has >500 videos, skip tiebreaking → `tied_manual` 3. **Skip-list check:** If channel is in `MEGA_CHANNEL_SKIP_LIST` (known non-topical catch-alls), skip tiebreaking → `tied_manual`
4. Query Qdrant for domain counts across all other videos in the same channel (single batch query with `MatchAny` filter) 4. Query Qdrant for domain counts across all other videos in the same channel (single batch query with `MatchAny` filter)
5. Among the tied domains only, pick the one with the highest channel-wide concept count 5. Among the tied domains only, pick the one with the highest channel-wide concept count
6. If resolved → `tied_pass_2` 6. If resolved → `tied_pass_2`
7. If still tied → `tied_manual` (alphabetical fallback assigned, flagged for review) 7. If still tied → `tied_manual` (alphabetical fallback assigned, flagged for review)
### Mega-Channel Rule ### Channel Skip List
Channels with >500 videos (like the "Transcript" catch-all with ~9,200 videos) are not topically coherent. Scanning their concepts produces meaningless aggregate data. These go straight to `tied_manual` for dashboard review. Certain channels are known non-topical catch-alls where channel-wide concept aggregation produces meaningless noise. These are listed explicitly in `MEGA_CHANNEL_SKIP_LIST` (defined in `lib/recon_domains.py`) and skip tiebreaking entirely — their tied items go straight to `tied_manual` for dashboard review.
Current skip list:
- `Transcript` — Legacy catch-all (~9,200 videos), no topical coherence
This is intentionally an explicit list, not a size threshold. Legitimate large channels (e.g., 1a-auto, forgotten-weapons) run the tiebreaker normally because their content is topically coherent. Adding a channel to the skip list requires a code change and a documented reason.
## Status Values ## Status Values

View file

@@ -25,13 +25,11 @@ from collections import Counter
from qdrant_client import QdrantClient from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny
from .recon_domains import VALID_DOMAINS, DOMAIN_CATEGORY_MAP from .recon_domains import VALID_DOMAINS, DOMAIN_CATEGORY_MAP, MEGA_CHANNEL_SKIP_LIST
from .utils import setup_logging from .utils import setup_logging
logger = setup_logging('recon.domain_assigner') logger = setup_logging('recon.domain_assigner')
# Channels with more than this many videos skip channel tiebreaking entirely
MEGA_CHANNEL_THRESHOLD = 500
def _get_qdrant_client(config): def _get_qdrant_client(config):
@@ -222,14 +220,6 @@ def _channel_video_hashes(db, channel_name, exclude_hash=None):
return hashes return hashes
def _channel_video_count(db, channel_name):
"""Count total videos in a channel."""
conn = db._get_conn()
row = conn.execute(
"SELECT COUNT(*) as cnt FROM catalogue WHERE category = ? AND source = 'stream.echo6.co'",
(channel_name,)
).fetchone()
return row['cnt'] if row else 0
def run_tiebreaker_pass(db, config, qdrant=None): def run_tiebreaker_pass(db, config, qdrant=None):
@@ -241,8 +231,8 @@ def run_tiebreaker_pass(db, config, qdrant=None):
other videos in the same channel and picks the tied domain with the other videos in the same channel and picks the tied domain with the
highest channel-wide count. highest channel-wide count.
Mega-channels (>500 videos) skip tiebreaking and go straight to Channels in MEGA_CHANNEL_SKIP_LIST (known non-topical catch-alls) skip
'tied_manual' for dashboard review. tiebreaking and go straight to 'tied_manual' for dashboard review.
Args: Args:
db: StatusDB instance db: StatusDB instance
@@ -263,9 +253,6 @@ def run_tiebreaker_pass(db, config, qdrant=None):
stats = {'resolved': 0, 'manual': 0, 'skipped': 0, 'errors': 0, 'total': len(tied_items)} stats = {'resolved': 0, 'manual': 0, 'skipped': 0, 'errors': 0, 'total': len(tied_items)}
logger.info(f"Tiebreaker pass: {len(tied_items)} items to resolve") logger.info(f"Tiebreaker pass: {len(tied_items)} items to resolve")
# Cache channel sizes to avoid repeated queries
channel_size_cache = {}
for item in tied_items: for item in tied_items:
file_hash = item['hash'] file_hash = item['hash']
channel = item.get('category', '') channel = item.get('category', '')
@@ -283,16 +270,12 @@ def run_tiebreaker_pass(db, config, qdrant=None):
stats['resolved'] += 1 stats['resolved'] += 1
continue continue
# Check mega-channel rule # Skip-list check: known non-topical catch-all channels
if channel not in channel_size_cache: if channel in MEGA_CHANNEL_SKIP_LIST:
channel_size_cache[channel] = _channel_video_count(db, channel)
if channel_size_cache[channel] > MEGA_CHANNEL_THRESHOLD:
fallback = sorted(tied_domains)[0] fallback = sorted(tied_domains)[0]
db.set_domain_assignment(file_hash, fallback, 'tied_manual') db.set_domain_assignment(file_hash, fallback, 'tied_manual')
stats['manual'] += 1 stats['manual'] += 1
logger.debug(f" {file_hash[:12]}: mega-channel '{channel}' " logger.debug(f" {file_hash[:12]}: skip-list channel '{channel}' → tied_manual")
f"({channel_size_cache[channel]} videos), → tied_manual")
continue continue
# Channel tiebreaker: count domains across all other videos in channel # Channel tiebreaker: count domains across all other videos in channel

View file

@@ -32,3 +32,15 @@ DOMAIN_CATEGORY_MAP = {
VALID_DOMAINS = set(DOMAIN_CATEGORY_MAP.keys()) VALID_DOMAINS = set(DOMAIN_CATEGORY_MAP.keys())
CATEGORY_DOMAIN_MAP = {v: k for k, v in DOMAIN_CATEGORY_MAP.items()} CATEGORY_DOMAIN_MAP = {v: k for k, v in DOMAIN_CATEGORY_MAP.items()}
# Skip list for the channel-context tiebreaker.
#
# A channel belongs here only when it is a genuinely non-topical catch-all
# (a miscellany dump rather than a themed channel), so that aggregating
# concepts across the whole channel would produce meaningless noise. Items
# from these channels whose domain counts are tied go straight to
# tied_manual; the channel-context tiebreaker is not run for them.
#
# Membership is deliberately hardcoded and explicit rather than derived from
# a size threshold: every addition should be a conscious decision.
MEGA_CHANNEL_SKIP_LIST = {
    # Legacy catch-all, ~9,200 videos, no topical coherence
    'Transcript',
}