recon/lib/peertube_scraper.py
Matt 563c16bb71 Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 14:57:23 +00:00

580 lines
17 KiB
Python

"""
RECON PeerTube Scraper — Video transcript ingestion.
Fetches WebVTT captions from a PeerTube instance, converts to plain text,
chunks into pages, and feeds into the standard RECON enrichment pipeline.
Output format matches lib/web_scraper.py so the enricher and embedder
process transcript content identically to web content.
"""
import hashlib
import io
import json
import os
import bisect
import re
import time
from datetime import datetime, timezone
from urllib.parse import quote
import requests
import webvtt
from .utils import get_config, setup_logging
from .status import StatusDB
from .web_scraper import chunk_text
logger = setup_logging('recon.peertube_scraper')
# Shutdown probe shared by every loop in this module. It is either None (no
# probe registered) or a zero-argument callable installed by the service
# thread; the loops poll it between units of work to stop gracefully.
_stop_check = None


def set_stop_check(fn):
    """Install *fn* as the module-wide shutdown probe.

    Args:
        fn: Zero-argument callable that returns a truthy value once shutdown
            has been requested.
    """
    global _stop_check
    _stop_check = fn
# Defaults (overridden by config.yaml peertube section)
DEFAULT_API_BASE = 'http://192.168.1.170'
DEFAULT_PUBLIC_URL = 'https://stream.echo6.co'
DEFAULT_FETCH_TIMEOUT = 30
DEFAULT_RATE_LIMIT_DELAY = 0.5
def _get_pt_config(config=None):
"""Get PeerTube settings from config, with defaults."""
if config is None:
config = get_config()
pt = config.get('peertube', {})
return {
'api_base': pt.get('api_base', DEFAULT_API_BASE),
'public_url': pt.get('public_url', DEFAULT_PUBLIC_URL),
'fetch_timeout': pt.get('fetch_timeout', DEFAULT_FETCH_TIMEOUT),
'rate_limit_delay': pt.get('rate_limit_delay', DEFAULT_RATE_LIMIT_DELAY),
}
def _api_get(path, config=None, params=None):
    """Perform a GET against the PeerTube API and return the decoded JSON.

    Args:
        path: Request path, appended verbatim to the configured 'api_base'.
        config: RECON config dict (None loads the default config).
        params: Optional query parameters forwarded to requests.

    Raises:
        Whatever requests raises for transport errors, plus the error from
        raise_for_status() on a non-success HTTP status.
    """
    ptc = _get_pt_config(config)
    url = f"{ptc['api_base']}{path}"
    resp = requests.get(
        url,
        params=params,
        timeout=ptc['fetch_timeout'],
    )
    # Surface 4xx/5xx as exceptions rather than trying to parse an error body.
    resp.raise_for_status()
    return resp.json()
def get_videos(channel=None, since=None, config=None):
    """
    Paginate through the published videos on the PeerTube instance.

    Args:
        channel: Restrict the listing to this channel actor name
            (e.g., 'mental-outlaw'). None lists the whole instance.
        since: ISO date string. Videos whose 'publishedAt' is on or after it
            are returned. The comparison is a plain string comparison, so the
            value must use the same ISO-8601 layout as the API's timestamps.
        config: RECON config dict.

    Returns:
        List of dicts, newest first, with keys: uuid, name, duration,
        channel_name, channel_display, publishedAt and description
        (truncated to 500 characters). The list may be partial if shutdown
        was requested during pagination.
    """
    ptc = _get_pt_config(config)

    # The endpoint does not change between pages, so build it once.
    if channel:
        # Escape the handle so a stray '/', '?' or '#' cannot alter the
        # request path; '@' and ':' stay literal for `name@host:port` handles.
        path = f"/api/v1/video-channels/{quote(str(channel), safe='@:')}/videos"
    else:
        path = "/api/v1/videos"

    videos = []
    start = 0
    count = 100  # PeerTube supports up to 100 per page
    while True:
        data = _api_get(path, config, params={
            'count': count,
            'start': start,
            'sort': '-publishedAt',
        })
        total = data.get('total', 0)
        batch = data.get('data', [])
        if not batch:
            break
        for v in batch:
            # `or ''` also covers an explicit null, which would otherwise
            # raise TypeError in the string comparison below.
            published = v.get('publishedAt') or ''
            if since and published < since:
                # Results are sorted by publishedAt descending, so everything
                # after the first too-old video is older still — stop here.
                return videos
            # Tolerate a missing or null channel object.
            channel_info = v.get('channel') or {}
            videos.append({
                'uuid': v['uuid'],
                'name': v['name'],
                'duration': v.get('duration', 0),
                'channel_name': channel_info.get('name', ''),
                'channel_display': channel_info.get('displayName', ''),
                'publishedAt': published,
                'description': (v.get('description') or '')[:500],
            })
        start += count
        if start >= total:
            break
        # Check for shutdown during pagination
        if _stop_check and _stop_check():
            logger.info(f"Shutdown requested during video listing — returning {len(videos)} collected so far")
            return videos
        # Rate limit pagination requests
        time.sleep(ptc['rate_limit_delay'])
    return videos
def get_captions(uuid, config=None):
    """Return the caption entries the API lists for video *uuid*.

    The result is the 'data' array of the captions endpoint response, or an
    empty list when the response has no such key.
    """
    payload = _api_get(f"/api/v1/videos/{uuid}/captions", config)
    return payload.get('data', [])
def fetch_vtt(caption_path, config=None):
    """Fetch the raw VTT file content from PeerTube as text.

    Args:
        caption_path: Path of the caption file, appended verbatim to the
            configured 'api_base'.
        config: RECON config dict (None loads the default config).

    Returns:
        The caption file decoded as UTF-8.

    Raises:
        Whatever requests raises for transport errors, plus the error from
        raise_for_status() on a non-success HTTP status.
    """
    ptc = _get_pt_config(config)
    url = f"{ptc['api_base']}{caption_path}"
    resp = requests.get(url, timeout=ptc['fetch_timeout'])
    resp.raise_for_status()
    # WebVTT files are UTF-8 by specification. Without an explicit charset in
    # the Content-Type header, requests falls back to ISO-8859-1 for text/*
    # responses, which would turn every non-ASCII character into mojibake.
    resp.encoding = 'utf-8'
    return resp.text
def _parse_vtt_time(time_str):
"""Parse VTT timestamp string (HH:MM:SS.mmm or MM:SS.mmm) to seconds."""
parts = time_str.split(':')
if len(parts) == 3:
h, m, s = parts
return int(h) * 3600 + int(m) * 60 + float(s)
elif len(parts) == 2:
m, s = parts
return int(m) * 60 + float(s)
return 0.0
def vtt_to_text(vtt_content):
    """
    Convert WebVTT content to clean plain text with timestamp tracking.

    Each cue has its HTML-style tags removed and its whitespace collapsed.
    Cues that end up empty are dropped, as are cues identical to the one
    before (common with Whisper output). The surviving cues are joined with
    single spaces rather than newlines, because cues break mid-sentence.

    If the webvtt parser raises for any reason, the regex-based
    _vtt_to_text_fallback() is used instead.

    Returns (text, cue_timestamps) where:
        - text: clean prose string
        - cue_timestamps: list of (start_seconds, char_offset) tuples;
          char_offset is the index in *text* where that cue's words begin
    """
    buf = io.StringIO(vtt_content)
    try:
        captions = webvtt.read_buffer(buf)
    except Exception:
        # Fallback: manual regex parse if webvtt-py fails
        return _vtt_to_text_fallback(vtt_content)

    prev_text = None
    segments = []
    cue_timestamps = []
    pos = 0  # offset in the final text where the next segment will start
    for caption in captions:
        # Normalise BEFORE measuring. Offsets are derived from segment
        # lengths, so each stored segment must be exactly the text that ends
        # up in the output; otherwise every later offset drifts.
        text = re.sub(r'<[^>]+>', '', caption.text)
        text = re.sub(r'\s+', ' ', text).strip()
        if not text:
            # Covers blank cues and cues that contained only markup.
            continue
        # De-duplicate consecutive identical cues
        if text == prev_text:
            continue
        prev_text = text
        cue_timestamps.append((_parse_vtt_time(caption.start), pos))
        segments.append(text)
        pos += len(text) + 1  # +1 for the joining space

    return ' '.join(segments), cue_timestamps
def _vtt_to_text_fallback(vtt_content):
"""Regex-based VTT parser as fallback. Returns (text, cue_timestamps)."""
lines = vtt_content.split('\n')
prev_text = None
segments = []
raw_timestamps = []
last_time = 0.0
for line in lines:
line = line.strip()
if not line or line == 'WEBVTT':
continue
if '-->' in line:
# Parse start time from "00:01:23.456 --> 00:01:25.789"
time_part = line.split('-->')[0].strip()
last_time = _parse_vtt_time(time_part)
continue
if line.isdigit():
continue
text = re.sub(r'<[^>]+>', '', line)
if text == prev_text:
continue
prev_text = text
raw_timestamps.append((last_time, len(segments)))
segments.append(text)
raw = ' '.join(segments)
raw = re.sub(r'\s+', ' ', raw).strip()
# Compute char offsets
seg_offsets = []
pos = 0
for seg in segments:
seg_offsets.append(pos)
pos += len(seg) + 1
cue_timestamps = []
for start_secs, seg_idx in raw_timestamps:
if seg_idx < len(seg_offsets):
cue_timestamps.append((start_secs, seg_offsets[seg_idx]))
return raw, cue_timestamps
def _map_page_timestamps(pages, full_text, cue_timestamps):
"""
Map page numbers to video timestamps.
For each page, finds its approximate start position in the full text,
then looks up the nearest VTT cue timestamp via binary search.
Returns dict: {"page_0001": 0.0, "page_0002": 312.5, ...}
"""
if not cue_timestamps:
return {}
offsets = [ct[1] for ct in cue_timestamps]
times = [ct[0] for ct in cue_timestamps]
page_ts = {}
search_start = 0
for i, page_text in enumerate(pages):
page_name = f"page_{i+1:04d}"
# Find where this page starts in the full text
snippet = page_text[:200].strip()
pos = full_text.find(snippet, search_start)
if pos < 0:
pos = search_start # fallback
# Binary search for nearest cue at or before this position
idx = bisect.bisect_right(offsets, pos) - 1
if idx < 0:
idx = 0
page_ts[page_name] = round(times[idx], 1)
search_start = pos + len(snippet)
return page_ts
def _content_hash(text):
"""MD5 hash of text content — same as web_scraper."""
return hashlib.md5(text.encode('utf-8')).hexdigest()
def ingest_video(uuid, video_meta, config=None):
    """
    Ingest the transcript of one PeerTube video.

    Steps: list the captions, pick the English track (or the first one
    listed), download and flatten the VTT, split it into pages, write the
    pages and a meta.json under ``config['paths']['text']/<hash>/``, then
    register the document in the catalogue and advance it to 'extracted'.

    Args:
        uuid: Video UUID
        video_meta: Dict with name, duration, channel_name, channel_display,
            publishedAt, description
        config: RECON config dict

    Returns:
        - None when the video has no captions or the transcript is shorter
          than 50 characters
        - a dict with status 'duplicate' when the transcript hash is already
          in the catalogue (nothing is written in that case)
        - otherwise a dict with status 'extracted' plus hash, title,
          page_count, text_length, page_timestamps, channel, duration, url
    """
    config = get_config() if config is None else config
    ptc = _get_pt_config(config)
    db = StatusDB()

    # No caption tracks means there is nothing to ingest.
    captions = get_captions(uuid, config)
    if not captions:
        return None

    # English wins; otherwise fall back to whichever track is listed first.
    caption = next(
        (c for c in captions if c.get('language', {}).get('id') == 'en'),
        captions[0],
    )

    # Download and flatten to prose, keeping cue start times for later.
    vtt_content = fetch_vtt(caption['captionPath'], config)
    text, cue_timestamps = vtt_to_text(vtt_content)
    if not text or len(text) < 50:
        logger.warning(f"Transcript too short for {video_meta['name']} ({uuid}): {len(text)} chars")
        return None

    # The content hash doubles as the document id and the dedup key.
    doc_hash = _content_hash(text)
    catalogue_row = db._get_conn().execute(
        "SELECT * FROM catalogue WHERE hash = ?", (doc_hash,)
    ).fetchone()
    if catalogue_row:
        doc = db.get_document(doc_hash)
        existing_status = doc['status'] if doc else catalogue_row['status']
        logger.debug(f"Duplicate transcript (hash {doc_hash[:12]}...) — {video_meta['name']}")
        return {
            'hash': doc_hash,
            'status': 'duplicate',
            'title': video_meta['name'],
            'existing_status': existing_status,
        }

    # Page size is shared with the web scraper's configuration.
    words_per_page = config.get('web_scraper', {}).get('words_per_page', 2000)
    pages = chunk_text(text, words_per_page)
    page_timestamps = _map_page_timestamps(pages, text, cue_timestamps)

    # One text file per page: page_0001.txt, page_0002.txt, ...
    text_dir = os.path.join(config['paths']['text'], doc_hash)
    os.makedirs(text_dir, exist_ok=True)
    for i, page_text in enumerate(pages, 1):
        with open(os.path.join(text_dir, f"page_{i:04d}.txt"), 'w', encoding='utf-8') as f:
            f.write(page_text)

    # Sidecar metadata next to the pages.
    video_url = f"{ptc['public_url']}/w/{uuid}"
    meta = {
        'hash': doc_hash,
        'source_type': 'transcript',
        'url': video_url,
        'title': video_meta['name'],
        'author': video_meta.get('channel_display', ''),
        'channel': video_meta.get('channel_name', ''),
        'duration': video_meta.get('duration', 0),
        'date': video_meta.get('publishedAt', ''),
        'description': video_meta.get('description', ''),
        'sitename': 'stream.echo6.co',
        'page_count': len(pages),
        'text_length': len(text),
        'page_timestamps': page_timestamps,
        'fetched_at': datetime.now(timezone.utc).isoformat(),
    }
    with open(os.path.join(text_dir, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=2)

    # Catalogue display name: the title with unusual characters removed,
    # capped at 200 characters, falling back to the UUID if nothing is left.
    display_name = re.sub(r'[^\w\s._-]', '', video_meta['name'])[:200].strip() or uuid

    db.add_to_catalogue(
        doc_hash, display_name, video_url,
        len(text), 'stream.echo6.co', video_meta.get('channel_name', 'unknown')
    )

    # Queue the document, then mark it as already extracted.
    db.queue_document(doc_hash)
    db.update_status(
        doc_hash, 'extracted',
        page_count=len(pages),
        pages_extracted=len(pages),
        book_title=video_meta['name'],
        book_author=video_meta.get('channel_display', ''),
    )

    logger.info(
        f"Ingested transcript: {video_meta['name']} ({uuid[:8]}...) "
        f"-> {doc_hash[:12]}... ({len(pages)} pages, {len(text)} chars)"
    )
    return {
        'hash': doc_hash,
        'status': 'extracted',
        'title': video_meta['name'],
        'page_count': len(pages),
        'text_length': len(text),
        'page_timestamps': page_timestamps,
        'channel': video_meta.get('channel_name', ''),
        'duration': video_meta.get('duration', 0),
        'url': video_url,
    }
def ingest_channel(channel_name, config=None, since=None):
    """
    Ingest every captioned video of one channel.

    Args:
        channel_name: Channel actor name passed to get_videos().
        config: RECON config dict (None loads the default config).
        since: Optional ISO date string forwarded to get_videos().

    Returns the {'results': [...], 'summary': {...}} dict produced by
    _ingest_video_list().
    """
    config = get_config() if config is None else config
    ptc = _get_pt_config(config)
    logger.info(f"Ingesting channel: {channel_name}")
    channel_videos = get_videos(channel=channel_name, since=since, config=config)
    return _ingest_video_list(channel_videos, config, ptc)
def ingest_all(config=None, since=None):
    """
    Ingest every captioned video on the PeerTube instance.

    Args:
        config: RECON config dict (None loads the default config).
        since: Optional ISO date string forwarded to get_videos().

    Returns the {'results': [...], 'summary': {...}} dict produced by
    _ingest_video_list().
    """
    config = get_config() if config is None else config
    ptc = _get_pt_config(config)
    logger.info("Ingesting all PeerTube videos with captions")
    instance_videos = get_videos(since=since, config=config)
    return _ingest_video_list(instance_videos, config, ptc)
def _ingest_video_list(videos, config, ptc):
    """Run ingest_video() over *videos* and tally the outcomes.

    Shared by ingest_channel() and ingest_all(). A failure on one video is
    logged and counted, never propagated. The loop polls the shutdown probe
    before and after each video, and sleeps ``ptc['rate_limit_delay']``
    between videos.

    Returns:
        {'results': [<ingest_video result for each newly ingested video>],
         'summary': {total_checked, ingested, skipped_no_captions,
                     skipped_duplicate, failed, total_pages}}
    """
    def stop_requested():
        # True only when a probe is registered and it reports shutdown.
        return bool(_stop_check and _stop_check())

    total = len(videos)
    results = []
    ingested = skipped_no_captions = skipped_duplicate = failed = 0
    total_pages = 0
    logger.info(f"Found {total} videos to check for captions")

    for i, video in enumerate(videos, 1):
        if stop_requested():
            logger.info(f"Shutdown requested — stopping after {i-1}/{total} videos")
            break

        uuid = video['uuid']
        try:
            result = ingest_video(uuid, video, config)
            if result is None:
                skipped_no_captions += 1
            elif result['status'] != 'duplicate':
                ingested += 1
                total_pages += result.get('page_count', 0)
                results.append(result)
            else:
                skipped_duplicate += 1
        except Exception as e:
            logger.error(f"[{i}/{total}] Failed: {video['name']} ({uuid}) — {e}")
            failed += 1

        # Poll again: a single ingest can take long enough for shutdown to
        # have been requested in the meantime.
        if stop_requested():
            logger.info(f"Shutdown requested — stopping after {i}/{total} videos")
            break

        # No need to wait after the final video.
        if i < total:
            time.sleep(ptc['rate_limit_delay'])

        # Progress line every 50 videos.
        if i % 50 == 0:
            logger.info(
                f"Progress: {i}/{total} checked — "
                f"{ingested} ingested, {skipped_no_captions} no captions, "
                f"{skipped_duplicate} dupes, {failed} failed"
            )

    logger.info(
        f"PeerTube ingestion complete: {ingested} ingested ({total_pages} pages), "
        f"{skipped_no_captions} no captions, {skipped_duplicate} duplicates, "
        f"{failed} failed out of {total} videos"
    )
    summary = {
        'total_checked': total,
        'ingested': ingested,
        'skipped_no_captions': skipped_no_captions,
        'skipped_duplicate': skipped_duplicate,
        'failed': failed,
        'total_pages': total_pages,
    }
    return {'results': results, 'summary': summary}
def get_instance_stats(config=None):
    """Collect PeerTube ingestion statistics for the dashboard.

    Args:
        config: RECON config dict (None loads the default config).

    Returns:
        Dict with:
        - 'total_videos': the 'total' reported by the videos endpoint, or 0
          when the request fails (the failure is logged)
        - 'ingested': number of catalogue rows whose source is
          'stream.echo6.co'
        - 'status_breakdown': {status: count} over the documents joined to
          those catalogue rows
    """
    if config is None:
        config = get_config()
    db = StatusDB()

    # Total videos on instance. Asking for a single item is enough because
    # only the 'total' field of the response is used.
    try:
        data = _api_get("/api/v1/videos", config, params={'count': 1})
        total_videos = data.get('total', 0)
    except Exception as e:
        # The dashboard should still render when PeerTube is unreachable, but
        # a silent 0 is indistinguishable from an empty instance, so log it.
        logger.warning(f"Could not fetch PeerTube video total — reporting 0: {e}")
        total_videos = 0

    # Videos ingested into RECON (from catalogue)
    conn = db._get_conn()
    ingested = conn.execute(
        "SELECT count(*) FROM catalogue WHERE source = 'stream.echo6.co'"
    ).fetchone()[0]

    # Status breakdown
    status_rows = conn.execute(
        "SELECT d.status, count(*) as cnt FROM documents d "
        "JOIN catalogue c ON d.hash = c.hash "
        "WHERE c.source = 'stream.echo6.co' "
        "GROUP BY d.status"
    ).fetchall()
    status_breakdown = {row['status']: row['cnt'] for row in status_rows}

    return {
        'total_videos': total_videos,
        'ingested': ingested,
        'status_breakdown': status_breakdown,
    }