"""
AIFinder Feature Extraction
Optimized TF-IDF and stylometric features for AI model detection.
"""

import re
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MaxAbsScaler

from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS

# Pre-compiled regular expressions, looked up by key from the cleaning helpers
# (strip_cot / strip_markdown) and from StylometricFeatures._extract.
# Keys are part of the contract with those callers; do not rename them.
_RE_COMPILED = {
    # Chain-of-thought span: <think>...</think> or <thinking>...</thinking>,
    # non-greedy and spanning newlines (DOTALL).
    "cot": re.compile(r"<think(?:ing)?>.*?</think(?:ing)?>", re.DOTALL),
    # --- Patterns applied (in order) by strip_markdown ---
    "code_block": re.compile(r"```[\s\S]*?```"),
    "inline_code": re.compile(r"`[^`]+`"),
    "bold": re.compile(r"\*\*([^*]+)\*\*"),
    "italic_ast": re.compile(r"\*([^*]+)\*"),
    # NOTE: despite the key name, this matches DOUBLE-underscore emphasis.
    "italic_under": re.compile(r"__([^_]+)__"),
    # Single-underscore emphasis.
    "under": re.compile(r"_([^_]+)_"),
    "header": re.compile(r"^#{1,6}\s+", re.MULTILINE),
    "bullet": re.compile(r"^[\s]*[-*+]\s+", re.MULTILINE),
    "numbered": re.compile(r"^\s*\d+[.)]\s+", re.MULTILINE),
    # Markdown link; group 1 is the visible link text.
    "link": re.compile(r"\[([^\]]+)\]\([^)]+\)"),
    # Whole block-quote line (the quoted text is part of the match).
    "quote": re.compile(r"^>.*$", re.MULTILINE),
    # Horizontal rule; also read by _extract for the has_hr flag.
    "hr": re.compile(r"^---+$", re.MULTILINE),
    # --- Presence / counting patterns used by StylometricFeatures._extract ---
    # Only the literal "<think>" opener (not "<thinking>").
    "think_tag": re.compile(r"<think>"),
    "xml_tag": re.compile(r"<[^>]+>"),
    "url": re.compile(r"https?://"),
    # Word containing an internal apostrophe, e.g. "don't".
    "contraction": re.compile(r"\b\w+'\w+\b"),
    "markdown_header": re.compile(r"^#{1,6}\s", re.MULTILINE),
    "markdown_bold": re.compile(r"\*\*.*?\*\*"),
    # Counts individual fence markers, so one fenced block yields two matches.
    "markdown_code_block": re.compile(r"```"),
    "markdown_inline_code": re.compile(r"`[^`]+`"),
    "markdown_bullet": re.compile(r"^[\s]*[-*+]\s", re.MULTILINE),
    "markdown_numbered": re.compile(r"^\s*\d+[.)]\s", re.MULTILINE),
    # Any stretch between two pipes on the same line.
    "markdown_table": re.compile(r"\|.*\|"),
    "question_start": re.compile(
        r"^(who|what|when|where|why|how)\b", re.IGNORECASE | re.MULTILINE
    ),
    # Any code point outside the Basic Multilingual Plane (U+10000 and up).
    "emoji": re.compile(r"[\U00010000-\U0010ffff]"),
    # Characters in the range U+4E00-U+9FFF.
    "chinese": re.compile(r"[\u4e00-\u9fff]"),
    # NOTE: despite the key name, this matches Capitalized words (one
    # uppercase letter followed by lowercase letters), not ALL-CAPS words.
    "all_caps": re.compile(r"\b[A-Z][a-z]+\b"),
    # Non-overlapping runs of four whitespace-separated words.
    "four_word": re.compile(r"\b\w+\s+\w+\s+\w+\s+\w+\b"),
    # Terminator, whitespace, then an uppercase ASCII letter.
    "sent_boundary": re.compile(r"[.!?]\s+[A-Z]"),
    "paren": re.compile(r"\([^)]+\)"),
    # "word: word" constructions.
    "colon_def": re.compile(r"\b\w+:\s+\w+"),
    "double_quote": re.compile(r'"[^"]*"'),
    "single_quote": re.compile(r"'[^']*'"),
    "greeting": re.compile(
        r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b", re.IGNORECASE
    ),
    "conv_phrase": re.compile(
        r"\b(great|perfect|sure|definitely|certainly|absolutely|of course|no problem|sounds good|got it|understood|okay|alright)\b",
        re.IGNORECASE,
    ),
    "helpful": re.compile(
        r"\b(let me know|feel free|happy to|glad to|happy to help|don't hesitate|let me know if|please let me|reach out)\b",
        re.IGNORECASE,
    ),
    # Same idea as "helpful" but without word-boundary anchors.
    "closing_offer": re.compile(
        r"(let me know|feel free|happy to help|don't hesitate|hope this helps)",
        re.IGNORECASE,
    ),
    # "I'm / I am [a|an] AI / language model / assistant / chatbot".
    "self_id_ai": re.compile(
        r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b",
        re.IGNORECASE,
    ),
    # Model / vendor names mentioned in the text.
    "provider_mention": re.compile(
        r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM|深度求索)\b",
        re.IGNORECASE,
    ),
    # Any non-ASCII character.
    "special_unicode": re.compile(r"[^\x00-\x7F]"),
}

_PRONOUN_SETS = {
    "first": frozenset(
        {"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"}
    ),
    "second": frozenset({"you", "your", "yours", "yourself", "yourselves"}),
    "third": frozenset(
        {"he", "she", "it", "they", "them", "his", "her", "its", "their"}
    ),
}

_DISCOURSE_SETS = {
    "conjunctions": frozenset(
        {
            "and",
            "but",
            "or",
            "nor",
            "for",
            "yet",
            "so",
            "because",
            "although",
            "while",
            "if",
            "when",
            "where",
        }
    ),
    "discourse": frozenset(
        {
            "however",
            "therefore",
            "moreover",
            "furthermore",
            "nevertheless",
            "consequently",
            "thus",
            "hence",
        }
    ),
    "hedging": frozenset(
        {
            "perhaps",
            "maybe",
            "might",
            "could",
            "possibly",
            "seemingly",
            "apparently",
            "arguably",
            "potentially",
        }
    ),
    "certainty": frozenset(
        {
            "definitely",
            "certainly",
            "absolutely",
            "clearly",
            "obviously",
            "undoubtedly",
            "indeed",
            "surely",
        }
    ),
    "transition": frozenset(
        {
            "additionally",
            "meanwhile",
            "subsequently",
            "alternatively",
            "specifically",
            "notably",
            "importantly",
            "essentially",
        }
    ),
    "casual": frozenset(
        {
            "okay",
            "ok",
            "hey",
            "hi",
            "cool",
            "awesome",
            "wow",
            "basically",
            "actually",
            "literally",
            "right",
            "yeah",
        }
    ),
    "formal": frozenset(
        {
            "regarding",
            "concerning",
            "pertaining",
            "aforementioned",
            "respectively",
            "accordingly",
            "henceforth",
            "whereby",
            "notwithstanding",
            "pursuant",
        }
    ),
}

# Set of punctuation characters. NOTE(review): not referenced by the code
# visible in this file — StylometricFeatures._extract strips the same
# characters via an inline string literal instead. Confirm external users
# before removing.
_PUNC_STRIP = frozenset(".,!?;:'\"()[]{}")


def strip_cot(text):
    """Remove every ``<think>``/``<thinking>`` span from *text*.

    Matched spans (tags included) are deleted, then surrounding whitespace
    is trimmed from the result.
    """
    without_reasoning = _RE_COMPILED["cot"].sub("", text)
    return without_reasoning.strip()


def strip_markdown(text):
    """Return *text* with markdown syntax removed and whitespace trimmed.

    Code (fenced and inline), block-quote lines and horizontal rules are
    deleted outright; emphasis and links are reduced to their inner text;
    header, bullet and numbered-list prefixes are dropped.
    """
    # (pattern key, replacement) pairs. Order matters: e.g. "**bold**" must be
    # unwrapped before the single-asterisk rule sees it.
    steps = (
        ("code_block", ""),
        ("inline_code", ""),
        ("bold", r"\1"),
        ("italic_ast", r"\1"),
        ("italic_under", r"\1"),
        ("under", r"\1"),
        ("header", ""),
        ("bullet", ""),
        ("numbered", ""),
        ("link", r"\1"),
        ("quote", ""),
        ("hr", ""),
    )
    stripped = text
    for key, replacement in steps:
        stripped = _RE_COMPILED[key].sub(replacement, stripped)
    return stripped.strip()


class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns each text into a stylometric vector.

    Every document becomes one fixed-length row of hand-crafted statistics:
    word/sentence/paragraph lengths, punctuation densities, markdown usage,
    pronoun and discourse-marker rates, conversational cues and a few binary
    flags. The column order is defined by the list returned from ``_extract``
    and must stay stable, since anything fitted on the output (scalers,
    classifiers) addresses features by column position.
    """

    # Runs of sentence-ending punctuation. The capture group makes re.split
    # keep each terminator right after the sentence it closes.
    _SENT_SPLIT = re.compile(r"([.!?]+)")

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Extract features for every text in *X*.

        Args:
            X: iterable of strings.

        Returns:
            ``csr_matrix`` of dtype float32 with one row per text. An empty
            *X* yields a ``(0, n_features)`` matrix.
        """
        rows = [self._extract(t) for t in X]
        if not rows:
            # np.array([]) is 1-D and csr_matrix would read it as a single
            # empty row, i.e. shape (1, 0); build the right shape explicitly.
            n_features = len(self._extract(""))
            return csr_matrix((0, n_features), dtype=np.float32)
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the feature list for a single *text*.

        Counts used as denominators are clamped to at least 1, so empty or
        degenerate input produces zeros rather than division errors.

        Returns:
            list of numbers; its length and ordering are identical for every
            input.
        """
        n_chars = max(len(text), 1)
        words = text.split()
        n_words = max(len(words), 1)

        # Split into sentences while remembering each sentence's terminator.
        # parts alternates: [sentence, terminator, sentence, ..., sentence].
        parts = self._SENT_SPLIT.split(text)
        sentences = []
        excl_sentences = 0
        for idx in range(0, len(parts), 2):
            body = parts[idx].strip()
            if not body:
                continue
            sentences.append(body)
            # The sentence text itself never contains "!" (it was split on
            # it), so exclamations must be detected on the terminator.
            if idx + 1 < len(parts) and "!" in parts[idx + 1]:
                excl_sentences += 1
        n_sentences = max(len(sentences), 1)

        paragraphs = text.split("\n\n")
        non_empty_paras = [p for p in paragraphs if p.strip()]
        n_paragraphs = len(non_empty_paras)

        lines = text.split("\n")
        non_empty_lines = [ln for ln in lines if ln.strip()]
        n_lines = max(len(non_empty_lines), 1)

        word_lens = [len(w) for w in words]
        sent_lens = [len(s.split()) for s in sentences]

        _rc = _RE_COMPILED
        _ps = _PRONOUN_SETS
        _ds = _DISCOURSE_SETS

        # --- word / sentence length statistics ---
        avg_word_len = np.mean(word_lens) if words else 0.0
        word_len_std = np.std(word_lens) if len(words) > 1 else 0.0
        median_word_len = np.median(word_lens) if words else 0.0
        avg_sent_len = n_words / n_sentences

        # --- punctuation densities (per character) ---
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_dash = (text.count("—") + text.count("–") + text.count("--")) / n_chars
        n_parens = (text.count("(") + text.count(")")) / n_chars
        n_quotes = (text.count('"') + text.count("'")) / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_period = text.count(".") / n_chars
        n_ellipsis = (text.count("...") + text.count("…")) / n_chars

        # Small constant keeps the ratios finite when the denominator is 0.
        comma_colon_ratio = n_commas / (n_colons + 0.001)
        comma_period_ratio = n_commas / (n_period + 0.001)
        excl_question_ratio = n_exclaim / (n_question + 0.001)

        # --- markdown usage (per sentence) ---
        n_headers = len(_rc["markdown_header"].findall(text)) / n_sentences
        n_bold = len(_rc["markdown_bold"].findall(text)) / n_sentences
        n_code_blocks = len(_rc["markdown_code_block"].findall(text)) / n_sentences
        n_inline_code = len(_rc["markdown_inline_code"].findall(text)) / n_sentences
        n_bullet = len(_rc["markdown_bullet"].findall(text)) / n_sentences
        n_numbered = len(_rc["markdown_numbered"].findall(text)) / n_sentences
        n_tables = len(_rc["markdown_table"].findall(text)) / n_sentences

        # --- character-class densities ---
        newline_density = text.count("\n") / n_chars
        double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1)
        uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
        digit_ratio = sum(1 for c in text if c.isdigit()) / n_chars
        space_ratio = sum(1 for c in text if c.isspace()) / n_chars

        unique_chars = len(set(text)) / n_chars
        unique_chars_ratio = len(set(text.lower())) / n_chars

        sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0.0
        sent_len_max = max(sent_lens) if sent_lens else 0
        sent_len_min = min(sent_lens) if sent_lens else 0
        sent_len_median = np.median(sent_lens) if sent_lens else 0.0
        sent_len_range = sent_len_max - sent_len_min

        # --- binary markup flags ---
        has_think = 1.0 if _rc["think_tag"].search(text) else 0.0
        has_xml = 1.0 if _rc["xml_tag"].search(text) else 0.0
        has_hr = 1.0 if _rc["hr"].search(text) else 0.0
        has_url = 1.0 if _rc["url"].search(text) else 0.0

        # --- vocabulary-based rates over normalised tokens ---
        words_lower = [w.lower().strip(".,!?;:'\"()[]{}") for w in words]

        def word_rate(vocab):
            # Share of tokens that belong to *vocab*.
            return sum(1 for w in words_lower if w in vocab) / n_words

        first_person_ratio = word_rate(_ps["first"])
        second_person_ratio = word_rate(_ps["second"])
        third_person_ratio = word_rate(_ps["third"])

        # n_words is clamped to >= 1 above, so plain division is safe.
        ttr = len(set(words_lower)) / n_words
        word_counts = {}
        for w in words_lower:
            word_counts[w] = word_counts.get(w, 0) + 1
        hapax = sum(1 for c in word_counts.values() if c == 1)
        hapax_ratio = hapax / n_words

        contraction_count = len(_rc["contraction"].findall(text))
        contraction_ratio = contraction_count / n_words

        # --- sentence openers ---
        sentences_starters = [(s.split() or [""])[0].lower() for s in sentences]
        starter_vocab = len(set(sentences_starters)) / n_sentences

        and_starts = sum(1 for s in sentences_starters if s == "and") / n_sentences
        but_starts = sum(1 for s in sentences_starters if s == "but") / n_sentences
        so_starts = sum(1 for s in sentences_starters if s == "so") / n_sentences
        the_starts = sum(1 for s in sentences_starters if s == "the") / n_sentences
        it_starts = (
            sum(1 for s in sentences_starters if s in ("it", "it's")) / n_sentences
        )
        i_starts = (
            sum(1 for s in sentences_starters if s in ("i", "i'm", "i've"))
            / n_sentences
        )

        # --- word-length buckets ---
        short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words
        medium_word_ratio = sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words
        long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words
        very_long_word_ratio = sum(1 for w in words_lower if len(w) >= 10) / n_words

        para_lens = (
            [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0]
        )
        avg_para_len = np.mean(para_lens)
        para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0.0

        conjunction_ratio = word_rate(_ds["conjunctions"])
        discourse_ratio = word_rate(_ds["discourse"])
        hedging_ratio = word_rate(_ds["hedging"])
        certainty_ratio = word_rate(_ds["certainty"])
        transition_ratio = word_rate(_ds["transition"])

        # The pattern is compiled with IGNORECASE, so no lower-casing needed.
        question_starts = sum(
            1 for s in sentences if _rc["question_start"].search(s)
        )
        question_start_ratio = question_starts / n_sentences

        has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0
        list_items = n_bullet + n_numbered

        emoji_count = len(_rc["emoji"].findall(text))
        has_emoji = 1.0 if emoji_count > 0 else 0.0

        all_caps_words = sum(
            1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
        )
        all_caps_ratio = all_caps_words / n_words

        paren_count = len(_rc["paren"].findall(text))
        paren_ratio = paren_count / n_sentences

        # Lines (not sentences) that end in a question mark.
        rhetorical_q = sum(1 for ln in lines if ln.strip().endswith("?"))
        rhetorical_ratio = rhetorical_q / n_sentences

        casual_ratio = word_rate(_ds["casual"])
        formal_ratio = word_rate(_ds["formal"])

        chinese_chars = len(_rc["chinese"].findall(text))
        has_chinese = 1.0 if chinese_chars > 0 else 0.0
        chinese_ratio = chinese_chars / n_chars

        has_self_id_ai = 1.0 if _rc["self_id_ai"].search(text) else 0.0
        has_provider_mention = 1.0 if _rc["provider_mention"].search(text) else 0.0

        ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0
        has_closing_offer = 1.0 if _rc["closing_offer"].search(text) else 0.0

        commas_per_sentence = text.count(",") / n_sentences

        avg_line_len = (
            np.mean([len(ln) for ln in non_empty_lines]) if non_empty_lines else 0.0
        )
        short_lines_ratio = (
            sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines
        )

        # NOTE: the "all_caps" pattern matches Capitalized words, hence the
        # local names here.
        cap_words = len(_rc["all_caps"].findall(text))
        cap_word_ratio = cap_words / n_words

        four_word_phrases = len(_rc["four_word"].findall(text))
        phrase_ratio = four_word_phrases / n_sentences

        sent_boundaries = len(_rc["sent_boundary"].findall(text))
        sent_boundary_ratio = sent_boundaries / n_sentences

        has_checkmark = 1.0 if any(c in text for c in "✓✗✔✘") else 0.0
        has_arrow = 1.0 if any(c in text for c in "→←➡") else 0.0
        has_star = 1.0 if any(c in text for c in "⭐★☆") else 0.0
        special_unicode = len(_rc["special_unicode"].findall(text)) / n_chars

        colon_definitions = len(_rc["colon_def"].findall(text)) / n_sentences

        double_quote_pairs = len(_rc["double_quote"].findall(text)) / n_sentences
        single_quote_pairs = len(_rc["single_quote"].findall(text)) / n_sentences

        greeting_patterns = len(_rc["greeting"].findall(text))
        greeting_ratio = greeting_patterns / n_sentences

        # One-hot length bucket by word count.
        is_short = 1.0 if n_words < 100 else 0.0
        is_medium = 1.0 if 100 <= n_words < 500 else 0.0
        is_long = 1.0 if n_words >= 500 else 0.0

        excl_sentence_ratio = excl_sentences / n_sentences

        question_lines = [ln for ln in non_empty_lines if ln.strip().endswith("?")]
        question_line_ratio = len(question_lines) / n_lines

        conversational_phrases = len(_rc["conv_phrase"].findall(text))
        conv_phrase_ratio = conversational_phrases / n_words

        helpful_phrases = len(_rc["helpful"].findall(text))
        helpful_ratio = helpful_phrases / n_sentences

        # Column order below is the public feature layout — append only.
        return [
            avg_word_len,
            word_len_std,
            median_word_len,
            avg_sent_len,
            sent_len_std,
            sent_len_max,
            sent_len_min,
            sent_len_median,
            sent_len_range,
            commas_per_sentence,
            n_commas,
            n_semicolons,
            n_colons,
            n_dash,
            n_parens,
            n_quotes,
            n_exclaim,
            n_question,
            n_period,
            n_ellipsis,
            comma_colon_ratio,
            comma_period_ratio,
            excl_question_ratio,
            n_headers,
            n_bold,
            n_code_blocks,
            n_inline_code,
            n_bullet,
            n_numbered,
            n_tables,
            has_list,
            newline_density,
            double_newline_ratio,
            uppercase_ratio,
            digit_ratio,
            space_ratio,
            unique_chars,
            unique_chars_ratio,
            list_items,
            n_paragraphs,
            n_lines / n_sentences,
            has_think,
            has_xml,
            has_hr,
            has_url,
            first_person_ratio,
            second_person_ratio,
            third_person_ratio,
            ttr,
            hapax_ratio,
            contraction_ratio,
            short_word_ratio,
            medium_word_ratio,
            long_word_ratio,
            very_long_word_ratio,
            starter_vocab,
            and_starts,
            but_starts,
            so_starts,
            the_starts,
            it_starts,
            avg_para_len,
            para_len_std,
            conjunction_ratio,
            discourse_ratio,
            hedging_ratio,
            certainty_ratio,
            transition_ratio,
            question_start_ratio,
            emoji_count,
            has_emoji,
            special_unicode,
            all_caps_ratio,
            paren_ratio,
            rhetorical_ratio,
            casual_ratio,
            formal_ratio,
            has_chinese,
            chinese_ratio,
            has_self_id_ai,
            has_provider_mention,
            ends_with_question,
            has_closing_offer,
            has_checkmark,
            has_arrow,
            has_star,
            avg_line_len,
            short_lines_ratio,
            cap_word_ratio,
            phrase_ratio,
            sent_boundary_ratio,
            colon_definitions,
            double_quote_pairs,
            single_quote_pairs,
            i_starts,
            greeting_ratio,
            is_short,
            is_medium,
            is_long,
            excl_sentence_ratio,
            question_line_ratio,
            conv_phrase_ratio,
            helpful_ratio,
        ]


class StyleOnlyPipeline:
    """Stylometric-only feature pipeline (no TF-IDF blocks).

    Texts are stripped of chain-of-thought spans and markdown, run through
    ``StylometricFeatures`` and scaled with a ``MaxAbsScaler``.
    """

    def __init__(self):
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    @staticmethod
    def _clean_all(texts):
        # Chain-of-thought is removed first, then markdown.
        cleaned = []
        for raw in texts:
            cleaned.append(strip_markdown(strip_cot(raw)))
        return cleaned

    def fit_transform(self, texts):
        """Fit the scaler on *texts* and return their scaled feature matrix.

        Prints the feature count with extraction time, and the final shape.
        """
        import time

        cleaned = self._clean_all(texts)

        started = time.time()
        stylo_features = self.stylo.transform(cleaned)
        n_features = stylo_features.shape[1]
        print(
            f"    Stylometric: {n_features} features ({time.time() - started:.1f}s)"
        )

        result = self.scaler.fit_transform(stylo_features)
        print(f"    Final feature matrix: {result.shape}")
        return result

    def transform(self, texts):
        """Return the scaled feature matrix for *texts* using the fitted scaler."""
        matrix = self.stylo.transform(self._clean_all(texts))
        return self.scaler.transform(matrix)


class FeaturePipeline:
    """Word TF-IDF + char TF-IDF + stylometric features, max-abs scaled.

    Each TF-IDF block is configured by ``TFIDF_WORD_PARAMS`` /
    ``TFIDF_CHAR_PARAMS``. A block is switched off when its ``max_features``
    is 0 (or negative); a missing or ``None`` ``max_features`` means "no
    limit". Passing ``use_tfidf=False`` switches off both TF-IDF blocks, so
    only stylometric columns are produced.

    Attributes:
        use_word_tfidf: whether the word-level TF-IDF block is active.
        use_char_tfidf: whether the char-level TF-IDF block is active.
        use_tfidf: True when at least one TF-IDF block is active.
    """

    def __init__(self, use_tfidf=True):
        word_params = dict(TFIDF_WORD_PARAMS)
        char_params = dict(TFIDF_CHAR_PARAMS)

        word_enabled = self._block_enabled(word_params)
        char_enabled = self._block_enabled(char_params)

        # 0 is this project's "disabled" marker but is not a valid
        # TfidfVectorizer setting, so swap it for None before constructing.
        if not word_enabled:
            word_params["max_features"] = None
        if not char_enabled:
            char_params["max_features"] = None

        self.word_tfidf = TfidfVectorizer(**word_params)
        self.char_tfidf = TfidfVectorizer(**char_params)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

        # Each block needs both the caller's opt-in and a non-zero budget.
        self.use_word_tfidf = bool(use_tfidf and word_enabled)
        self.use_char_tfidf = bool(use_tfidf and char_enabled)
        self.use_tfidf = self.use_word_tfidf or self.use_char_tfidf

    @staticmethod
    def _block_enabled(params):
        """Return True unless ``params['max_features']`` is 0 or negative."""
        max_features = params.get("max_features", 1)
        return max_features is None or max_features > 0

    def _active_blocks(self):
        """Return ``(use_word, use_char)`` for this instance.

        Instances that lack the per-block flags (e.g. objects restored from
        an older serialized state) fall back to the previous gating rules so
        their already-fitted column layout stays valid.
        """
        use_word = getattr(self, "use_word_tfidf", None)
        if use_word is None:
            limit = self.word_tfidf.max_features
            use_word = limit is not None and limit > 0
        use_char = getattr(self, "use_char_tfidf", None)
        if use_char is None:
            use_char = self.use_tfidf
        return use_word, use_char

    def _clean_for_tfidf(self, text):
        """Strip chain-of-thought spans, then markdown, from *text*."""
        return strip_markdown(strip_cot(text))

    @staticmethod
    def _tfidf_block(vectorizer, enabled, texts, fit):
        """Run one TF-IDF block, or return a zero-column placeholder."""
        if not enabled:
            return csr_matrix((len(texts), 0), dtype=np.float32)
        if fit:
            return vectorizer.fit_transform(texts)
        return vectorizer.transform(texts)

    def fit_transform(self, texts):
        """Fit all active blocks and the scaler; return the scaled matrix.

        Columns are ordered word TF-IDF, char TF-IDF, stylometric. Progress
        (feature counts and timings) is printed for each active block.
        """
        import time

        print(f"    Input: {len(texts)} texts", flush=True)

        texts_clean = [self._clean_for_tfidf(t) for t in texts]
        use_word, use_char = self._active_blocks()

        t0 = time.time()
        word_features = self._tfidf_block(
            self.word_tfidf, use_word, texts_clean, fit=True
        )
        if use_word:
            print(
                f"    word tfidf: {word_features.shape[1]} features ({time.time() - t0:.1f}s)",
                flush=True,
            )

        t0 = time.time()
        char_features = self._tfidf_block(
            self.char_tfidf, use_char, texts_clean, fit=True
        )
        if use_char:
            print(
                f"    char tfidf: {char_features.shape[1]} features ({time.time() - t0:.1f}s)",
                flush=True,
            )

        t0 = time.time()
        stylo_features = self.stylo.transform(texts_clean)
        print(
            f"    stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
            flush=True,
        )

        combined = hstack([word_features, char_features, stylo_features])
        combined = self.scaler.fit_transform(combined)
        print(f"    Combined feature matrix: {combined.shape}", flush=True)
        return combined

    def transform(self, texts):
        """Return the scaled feature matrix for *texts* using fitted blocks.

        Uses the same block selection and column order as ``fit_transform``.
        """
        texts_clean = [self._clean_for_tfidf(t) for t in texts]
        use_word, use_char = self._active_blocks()

        word_features = self._tfidf_block(
            self.word_tfidf, use_word, texts_clean, fit=False
        )
        char_features = self._tfidf_block(
            self.char_tfidf, use_char, texts_clean, fit=False
        )
        stylo_features = self.stylo.transform(texts_clean)

        combined = hstack([word_features, char_features, stylo_features])
        return self.scaler.transform(combined)