#!/usr/bin/env python3.12
"""
Compute frequency stats for design vocabulary across DESIGN.md files.

Features:
- Loads vocabulary from seeds.json (aligned with composite seeds)
- Filters stopwords and common English words
- TF-IDF scoring for better relevance
- Categorizes terms by type (aesthetic, layout, palette, etc.)
- Outputs frequency reports grouped by category
"""
from __future__ import annotations

import argparse
import json
import math
import pathlib
import re
from collections import Counter, defaultdict
from typing import Counter as CounterType, Dict, List, Set, Tuple

# Common English stopwords to filter out
STOPWORDS: Set[str] = {
    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
    "be", "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "shall", "can", "need", "this", "that",
    "these", "those", "it", "its", "they", "them", "their", "we", "our",
    "you", "your", "i", "my", "me", "he", "she", "his", "her", "all", "each",
    "every", "both", "few", "more", "most", "other", "some", "such", "no",
    "not", "only", "own", "same", "so", "than", "too", "very", "just", "also",
    "now", "here", "there", "when", "where", "why", "how", "what", "which",
    "who", "whom", "if", "then", "else", "because", "while", "although",
    "through", "during", "before", "after", "above", "below", "between",
    "into", "over", "under", "again", "further", "once", "any", "about",
    "design", "language", "site", "website", "page", "content", "section",
    "element", "component", "style", "use", "using", "used", "create",
    "creating", "make", "making", "ensure", "include", "including", "like",
    "based", "feature", "features", "provide", "providing", "allow", "allowing",
}

# Fallback categories if seeds.json is missing or unusable
DEFAULT_CATEGORIES: Dict[str, Set[str]] = {
    "colors": {
        "black", "white", "gray", "red", "blue", "green", "yellow", "orange",
        "purple", "pink", "brown", "monochrome", "gradient", "neon", "pastel"
    },
    "layout": {
        "grid", "flex", "column", "row", "asymmetric", "hero", "sidebar"
    },
    "typography": {
        "serif", "sans", "mono", "display", "bold", "italic", "heading"
    }
}
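# load_vocabulary() below expects seeds.json to provide a "vocabulary" mapping of
# category name -> list of terms; illustrative shape only (not the real file):
#   {"vocabulary": {"colors": ["sepia", "teal"], "layout": ["bento", "split"]}}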


def load_vocabulary(root: pathlib.Path) -> Dict[str, Set[str]]:
    """Load vocabulary from seeds.json and convert lists to sets."""
    seeds_file = root / "tools" / "design" / "seeds.json"
    vocab: Dict[str, Set[str]] = {}
    
    if seeds_file.exists():
        try:
            data = json.loads(seeds_file.read_text(encoding="utf-8"))
            raw_vocab = data.get("vocabulary", {})
            for category, terms in raw_vocab.items():
                vocab[category] = set(t.lower() for t in terms)
        except Exception:
            # Malformed or unreadable seeds.json: fall back to DEFAULT_CATEGORIES below.
            pass
            
    return vocab or DEFAULT_CATEGORIES


def tokenize(text: str) -> List[str]:
    """Extract words, filtering short and stopwords."""
    words = re.findall(r"[a-zA-Z][a-zA-Z0-9-]*[a-zA-Z0-9]|[a-zA-Z]", text.lower())
    return [w for w in words if len(w) > 2 and w not in STOPWORDS]
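# For example, tokenize("Bold sans-serif headings on a dark grid") returns
# ["bold", "sans-serif", "headings", "dark", "grid"]: "on" and "a" are too short,
# and stopwords such as "the" would be dropped.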


def categorize_term(term: str, categories: Dict[str, Set[str]]) -> str:
    """Categorize a term into design category."""
    for category, terms in categories.items():
        if term in terms:
            return category
    return "other"


def load_designs(root: pathlib.Path) -> Dict[str, str]:
    """Load all DESIGN.md files."""
    designs: Dict[str, str] = {}
    for fp in root.glob("sites/**/DESIGN.md"):
        try:
            designs[str(fp)] = fp.read_text(encoding="utf-8")
        except Exception:
            continue
    return designs


def compute_tf_idf(designs: Dict[str, str]) -> Dict[str, float]:
    """Compute TF-IDF scores for terms across documents."""
    doc_freq: CounterType[str] = Counter()
    term_freqs: List[CounterType[str]] = []

    for text in designs.values():
        tokens = tokenize(text)
        term_freq = Counter(tokens)
        term_freqs.append(term_freq)
        doc_freq.update(set(tokens))

    total_docs = len(designs)
    if total_docs == 0:
        return {}

    tf_idf_scores: Dict[str, float] = defaultdict(float)

    for term_freq in term_freqs:
        doc_length = sum(term_freq.values())
        for term, freq in term_freq.items():
            tf = freq / doc_length
            idf = math.log(total_docs / doc_freq[term])
            tf_idf_scores[term] += tf * idf

    for term in tf_idf_scores:
        tf_idf_scores[term] /= total_docs

    return dict(tf_idf_scores)


def frequency_analysis(
    designs: Dict[str, str],
    categories: Dict[str, Set[str]],
) -> Dict[str, List[Tuple[str, float]]]:
    """Analyze term frequency grouped by category."""
    total_docs = len(designs)
    if total_docs == 0:
        return {}

    # Count, for each term, how many documents contain it
    doc_presence: CounterType[str] = Counter()
    for text in designs.values():
        tokens = set(tokenize(text))
        doc_presence.update(tokens)

    # Group by category
    grouped: Dict[str, List[Tuple[str, float]]] = defaultdict(list)
    
    for term, count in doc_presence.items():
        freq = count / total_docs
        category = categorize_term(term, categories)
        if category != "other":
            grouped[category].append((term, freq))
            
    # Sort each group by frequency desc
    for cat in grouped:
        grouped[cat].sort(key=lambda x: x[1], reverse=True)
        
    return grouped
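# The grouped result looks like {"colors": [("sepia", 0.4), ...]}, where 0.4 means
# the term appears in 40% of the DESIGN.md files (values here are illustrative).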


def main() -> int:
    parser = argparse.ArgumentParser(
        description="Analyze design vocabulary frequency by category",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("--root", default=".", help="Root directory (default: .)")
    parser.add_argument("--threshold", type=float, default=0.0, help="Minimum frequency to show")
    parser.add_argument("--overused", type=float, default=None, help="Show terms more frequent than this (legacy compat)")
    parser.add_argument("--underused", type=float, default=None, help="Show terms less frequent than this (legacy compat)")
    parser.add_argument("--top", type=int, default=None, help="Limit number of items shown per category")
    parser.add_argument("--tfidf", action="store_true", help="Show TF-IDF scores instead")
    args = parser.parse_args()

    root = pathlib.Path(args.root).resolve()
    
    # Load dynamic categories
    categories = load_vocabulary(root)
    designs = load_designs(root)

    if not designs:
        print(f"FAIL:no_designs_found:{root}/sites/**/DESIGN.md")
        return 1

    print(f"Analyzed {len(designs)} DESIGN.md files using {len(categories)} categories\n")

    if args.tfidf:
        print("TOP DISTINGUISHING TERMS (TF-IDF)")
        print("=" * 60)
        scores = compute_tf_idf(designs)
        sorted_scores = sorted(scores.items(), key=lambda x: -x[1])
        limit = args.top if args.top else 20
        shown = 0
        # Filter uncategorized terms before applying the limit so the report
        # always shows up to `limit` categorized terms.
        for term, score in sorted_scores:
            cat = categorize_term(term, categories)
            if cat == "other":
                continue
            print(f"  {term:<20} {score:.4f}  [{cat}]")
            shown += 1
            if shown >= limit:
                break
        return 0

    # Standard frequency analysis
    grouped_stats = frequency_analysis(designs, categories)

    # Print report per category
    for category in sorted(categories.keys()):
        stats = grouped_stats.get(category, [])
        if not stats:
            continue
            
        print(f"CATEGORY: {category.upper()}")
        print("-" * 60)
        
        # Filter logic
        visible = stats
        if args.overused is not None:
            visible = [s for s in visible if s[1] >= args.overused]
        elif args.underused is not None:
            visible = [s for s in visible if s[1] <= args.underused]
        elif args.threshold > 0:
            visible = [s for s in visible if s[1] >= args.threshold]
            
        # Limit count
        if args.top:
            visible = visible[:args.top]
        
        if not visible:
            print("  (none matching criteria)")
        else:
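            # One "█" glyph per 5 percentage points, so a full-width bar is 20 glyphs.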
            for term, freq in visible:
                pct = int(freq * 100)
                bar = "█" * (pct // 5)
                print(f"  {term:<25} {pct:>3}% {bar}")
        print()

    return 0


if __name__ == "__main__":
    raise SystemExit(main())

