mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Replace mega-channel size rule with explicit skip list
The >500-video threshold was too aggressive — it skipped tiebreaking for legitimate large channels (1a-auto, forgotten-weapons, etc.) where channel context correctly resolves ties. Replace with an explicit MEGA_CHANNEL_SKIP_LIST in recon_domains.py. Only known non-topical catch-alls (currently just "Transcript") skip the tiebreaker.

Removed _channel_video_count() helper and MEGA_CHANNEL_THRESHOLD constant (no longer used).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d8196e60c7
commit
299be21f42
3 changed files with 26 additions and 26 deletions
|
|
@@ -30,15 +30,20 @@ For each `tied_pass_1` document:
|
||||||
|
|
||||||
1. Identify the tied domains from Qdrant
|
1. Identify the tied domains from Qdrant
|
||||||
2. Look up the document's channel (`catalogue.category`)
|
2. Look up the document's channel (`catalogue.category`)
|
||||||
3. **Mega-channel rule:** If channel has >500 videos, skip tiebreaking → `tied_manual`
|
3. **Skip-list check:** If channel is in `MEGA_CHANNEL_SKIP_LIST` (known non-topical catch-alls), skip tiebreaking → `tied_manual`
|
||||||
4. Query Qdrant for domain counts across all other videos in the same channel (single batch query with `MatchAny` filter)
|
4. Query Qdrant for domain counts across all other videos in the same channel (single batch query with `MatchAny` filter)
|
||||||
5. Among the tied domains only, pick the one with the highest channel-wide concept count
|
5. Among the tied domains only, pick the one with the highest channel-wide concept count
|
||||||
6. If resolved → `tied_pass_2`
|
6. If resolved → `tied_pass_2`
|
||||||
7. If still tied → `tied_manual` (alphabetical fallback assigned, flagged for review)
|
7. If still tied → `tied_manual` (alphabetical fallback assigned, flagged for review)
|
||||||
|
|
||||||
### Mega-Channel Rule
|
### Channel Skip List
|
||||||
|
|
||||||
Channels with >500 videos (like the "Transcript" catch-all with ~9,200 videos) are not topically coherent. Scanning their concepts produces meaningless aggregate data. These go straight to `tied_manual` for dashboard review.
|
Certain channels are known non-topical catch-alls where channel-wide concept aggregation produces meaningless noise. These are listed explicitly in `MEGA_CHANNEL_SKIP_LIST` (defined in `lib/recon_domains.py`) and skip tiebreaking entirely — their tied items go straight to `tied_manual` for dashboard review.
|
||||||
|
|
||||||
|
Current skip list:
|
||||||
|
- `Transcript` — Legacy catch-all (~9,200 videos), no topical coherence
|
||||||
|
|
||||||
|
This is intentionally an explicit list, not a size threshold. Legitimate large channels (e.g., 1a-auto, forgotten-weapons) run the tiebreaker normally because their content is topically coherent. Adding a channel to the skip list requires a code change and a documented reason.
|
||||||
|
|
||||||
## Status Values
|
## Status Values
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -25,13 +25,11 @@ from collections import Counter
|
||||||
from qdrant_client import QdrantClient
|
from qdrant_client import QdrantClient
|
||||||
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny
|
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny
|
||||||
|
|
||||||
from .recon_domains import VALID_DOMAINS, DOMAIN_CATEGORY_MAP
|
from .recon_domains import VALID_DOMAINS, DOMAIN_CATEGORY_MAP, MEGA_CHANNEL_SKIP_LIST
|
||||||
from .utils import setup_logging
|
from .utils import setup_logging
|
||||||
|
|
||||||
logger = setup_logging('recon.domain_assigner')
|
logger = setup_logging('recon.domain_assigner')
|
||||||
|
|
||||||
# Channels with more than this many videos skip channel tiebreaking entirely
|
|
||||||
MEGA_CHANNEL_THRESHOLD = 500
|
|
||||||
|
|
||||||
|
|
||||||
def _get_qdrant_client(config):
|
def _get_qdrant_client(config):
|
||||||
|
|
@@ -222,14 +220,6 @@ def _channel_video_hashes(db, channel_name, exclude_hash=None):
|
||||||
return hashes
|
return hashes
|
||||||
|
|
||||||
|
|
||||||
def _channel_video_count(db, channel_name):
|
|
||||||
"""Count total videos in a channel."""
|
|
||||||
conn = db._get_conn()
|
|
||||||
row = conn.execute(
|
|
||||||
"SELECT COUNT(*) as cnt FROM catalogue WHERE category = ? AND source = 'stream.echo6.co'",
|
|
||||||
(channel_name,)
|
|
||||||
).fetchone()
|
|
||||||
return row['cnt'] if row else 0
|
|
||||||
|
|
||||||
|
|
||||||
def run_tiebreaker_pass(db, config, qdrant=None):
|
def run_tiebreaker_pass(db, config, qdrant=None):
|
||||||
|
|
@@ -241,8 +231,8 @@ def run_tiebreaker_pass(db, config, qdrant=None):
|
||||||
other videos in the same channel and picks the tied domain with the
|
other videos in the same channel and picks the tied domain with the
|
||||||
highest channel-wide count.
|
highest channel-wide count.
|
||||||
|
|
||||||
Mega-channels (>500 videos) skip tiebreaking and go straight to
|
Channels in MEGA_CHANNEL_SKIP_LIST (known non-topical catch-alls) skip
|
||||||
'tied_manual' for dashboard review.
|
tiebreaking and go straight to 'tied_manual' for dashboard review.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
db: StatusDB instance
|
db: StatusDB instance
|
||||||
|
|
@@ -263,9 +253,6 @@ def run_tiebreaker_pass(db, config, qdrant=None):
|
||||||
stats = {'resolved': 0, 'manual': 0, 'skipped': 0, 'errors': 0, 'total': len(tied_items)}
|
stats = {'resolved': 0, 'manual': 0, 'skipped': 0, 'errors': 0, 'total': len(tied_items)}
|
||||||
logger.info(f"Tiebreaker pass: {len(tied_items)} items to resolve")
|
logger.info(f"Tiebreaker pass: {len(tied_items)} items to resolve")
|
||||||
|
|
||||||
# Cache channel sizes to avoid repeated queries
|
|
||||||
channel_size_cache = {}
|
|
||||||
|
|
||||||
for item in tied_items:
|
for item in tied_items:
|
||||||
file_hash = item['hash']
|
file_hash = item['hash']
|
||||||
channel = item.get('category', '')
|
channel = item.get('category', '')
|
||||||
|
|
@@ -283,16 +270,12 @@ def run_tiebreaker_pass(db, config, qdrant=None):
|
||||||
stats['resolved'] += 1
|
stats['resolved'] += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check mega-channel rule
|
# Skip-list check: known non-topical catch-all channels
|
||||||
if channel not in channel_size_cache:
|
if channel in MEGA_CHANNEL_SKIP_LIST:
|
||||||
channel_size_cache[channel] = _channel_video_count(db, channel)
|
|
||||||
|
|
||||||
if channel_size_cache[channel] > MEGA_CHANNEL_THRESHOLD:
|
|
||||||
fallback = sorted(tied_domains)[0]
|
fallback = sorted(tied_domains)[0]
|
||||||
db.set_domain_assignment(file_hash, fallback, 'tied_manual')
|
db.set_domain_assignment(file_hash, fallback, 'tied_manual')
|
||||||
stats['manual'] += 1
|
stats['manual'] += 1
|
||||||
logger.debug(f" {file_hash[:12]}: mega-channel '{channel}' "
|
logger.debug(f" {file_hash[:12]}: skip-list channel '{channel}' → tied_manual")
|
||||||
f"({channel_size_cache[channel]} videos), → tied_manual")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Channel tiebreaker: count domains across all other videos in channel
|
# Channel tiebreaker: count domains across all other videos in channel
|
||||||
|
|
|
||||||
|
|
@@ -32,3 +32,15 @@ DOMAIN_CATEGORY_MAP = {
|
||||||
VALID_DOMAINS = set(DOMAIN_CATEGORY_MAP.keys())
|
VALID_DOMAINS = set(DOMAIN_CATEGORY_MAP.keys())
|
||||||
|
|
||||||
CATEGORY_DOMAIN_MAP = {v: k for k, v in DOMAIN_CATEGORY_MAP.items()}
|
CATEGORY_DOMAIN_MAP = {v: k for k, v in DOMAIN_CATEGORY_MAP.items()}
|
||||||
|
|
||||||
|
# Channels whose tiebreaker is skipped because their content is non-topical
|
||||||
|
# (catch-alls, miscellany dumps, etc.). Items in these channels with tied
|
||||||
|
# domain counts go straight to tied_manual without channel-context tiebreaker.
|
||||||
|
#
|
||||||
|
# This is intentionally a hardcoded explicit list, not a size threshold.
|
||||||
|
# Adding a channel here requires an explicit decision — only add channels
|
||||||
|
# that are genuinely non-topical catch-alls where channel-wide concept
|
||||||
|
# aggregation would produce meaningless noise.
|
||||||
|
MEGA_CHANNEL_SKIP_LIST = {
|
||||||
|
'Transcript', # Legacy catch-all, ~9,200 videos, no topical coherence
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue