mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
563c16bb71
59 changed files with 18327 additions and 0 deletions
159
lib/ingester.py
Normal file
159
lib/ingester.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
"""
|
||||
RECON Intel Ingester
|
||||
|
||||
ARGUS intelligence feed intake. Embeds intel JSON and inserts into Qdrant
|
||||
with source_type='intel_feed'.
|
||||
|
||||
Dependencies: requests, qdrant-client
|
||||
Config: embedding, vector_db
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
|
||||
import requests as http_requests
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import PointStruct
|
||||
|
||||
from .utils import get_config, setup_logging
|
||||
from .status import StatusDB
|
||||
|
||||
logger = setup_logging('recon.ingester')
|
||||
|
||||
|
||||
def ingest_intel(intel_data, config=None):
|
||||
if config is None:
|
||||
config = get_config()
|
||||
|
||||
db = StatusDB()
|
||||
|
||||
required = ['source', 'category', 'content']
|
||||
for field in required:
|
||||
if field not in intel_data:
|
||||
logger.error(f"Missing required field: {field}")
|
||||
return None
|
||||
|
||||
try:
|
||||
conn = db._get_conn()
|
||||
cursor = conn.execute(
|
||||
"""INSERT INTO intel (source, timestamp, region, category, content,
|
||||
summary, key_facts, credibility_score, verification_status)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
(
|
||||
intel_data.get('source', 'unknown'),
|
||||
intel_data.get('timestamp', time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())),
|
||||
intel_data.get('region', 'unknown'),
|
||||
intel_data['category'],
|
||||
intel_data['content'],
|
||||
intel_data.get('summary', ''),
|
||||
json.dumps(intel_data.get('key_facts', [])),
|
||||
intel_data.get('credibility_score', 0.5),
|
||||
intel_data.get('verification_status', 'unverified'),
|
||||
)
|
||||
)
|
||||
intel_id = cursor.lastrowid
|
||||
conn.commit()
|
||||
|
||||
url = f"http://{config['embedding']['host']}:{config['embedding']['port']}/api/embed"
|
||||
resp = http_requests.post(url, json={
|
||||
"model": config['embedding']['model'],
|
||||
"input": intel_data['content']
|
||||
}, timeout=120)
|
||||
resp.raise_for_status()
|
||||
vector = resp.json()['embeddings'][0]
|
||||
|
||||
qdrant = QdrantClient(
|
||||
host=config['vector_db']['host'],
|
||||
port=config['vector_db']['port'],
|
||||
timeout=60
|
||||
)
|
||||
|
||||
point_id = intel_id + 2**60
|
||||
|
||||
payload = {
|
||||
'source_type': 'intel_feed',
|
||||
'intel_id': intel_id,
|
||||
'source': intel_data.get('source', 'unknown'),
|
||||
'region': intel_data.get('region', 'unknown'),
|
||||
'category': intel_data['category'],
|
||||
'content': intel_data['content'],
|
||||
'summary': intel_data.get('summary', ''),
|
||||
'key_facts': intel_data.get('key_facts', []),
|
||||
'credibility_score': intel_data.get('credibility_score', 0.5),
|
||||
'verification_status': intel_data.get('verification_status', 'unverified'),
|
||||
'timestamp': intel_data.get('timestamp', ''),
|
||||
'language': 'en',
|
||||
}
|
||||
|
||||
qdrant.upsert(
|
||||
collection_name=config['vector_db']['collection'],
|
||||
points=[PointStruct(id=point_id, vector=vector, payload=payload)]
|
||||
)
|
||||
|
||||
conn.execute("UPDATE intel SET vector_id = ? WHERE id = ?", (point_id, intel_id))
|
||||
conn.commit()
|
||||
|
||||
logger.info(f"Ingested intel #{intel_id} from {intel_data.get('source', 'unknown')}")
|
||||
return intel_id
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Intel ingestion failed: {e}\n{traceback.format_exc()}")
|
||||
return None
|
||||
|
||||
|
||||
def ingest_file(filepath, config=None):
|
||||
if config is None:
|
||||
config = get_config()
|
||||
|
||||
try:
|
||||
with open(filepath, encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
if isinstance(data, list):
|
||||
results = []
|
||||
for item in data:
|
||||
result = ingest_intel(item, config)
|
||||
results.append(result)
|
||||
success = sum(1 for r in results if r is not None)
|
||||
logger.info(f"Ingested {success}/{len(data)} items from {filepath}")
|
||||
return results
|
||||
else:
|
||||
return [ingest_intel(data, config)]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to ingest file {filepath}: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def run_ingestion(directory=None):
|
||||
config = get_config()
|
||||
intel_dir = directory or config['paths']['intel']
|
||||
|
||||
if not os.path.exists(intel_dir):
|
||||
logger.info(f"Intel directory does not exist: {intel_dir}")
|
||||
return 0
|
||||
|
||||
json_files = sorted([
|
||||
f for f in os.listdir(intel_dir)
|
||||
if f.endswith('.json') and not f.startswith('.')
|
||||
])
|
||||
|
||||
if not json_files:
|
||||
logger.info("No intel files to ingest")
|
||||
return 0
|
||||
|
||||
total = 0
|
||||
for jf in json_files:
|
||||
filepath = os.path.join(intel_dir, jf)
|
||||
results = ingest_file(filepath, config)
|
||||
ingested = sum(1 for r in results if r is not None)
|
||||
total += ingested
|
||||
|
||||
if ingested > 0:
|
||||
done_dir = os.path.join(intel_dir, 'processed')
|
||||
os.makedirs(done_dir, exist_ok=True)
|
||||
os.rename(filepath, os.path.join(done_dir, jf))
|
||||
|
||||
logger.info(f"Intel ingestion complete: {total} items ingested")
|
||||
return total
|
||||
Loading…
Add table
Add a link
Reference in a new issue