""" AIFinder Feature Extraction Optimized TF-IDF and stylometric features for AI model detection. """ import re import numpy as np from scipy.sparse import csr_matrix, hstack from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import MaxAbsScaler from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS _RE_COMPILED = { "cot": re.compile(r".*?", re.DOTALL), "code_block": re.compile(r"```[\s\S]*?```"), "inline_code": re.compile(r"`[^`]+`"), "bold": re.compile(r"\*\*([^*]+)\*\*"), "italic_ast": re.compile(r"\*([^*]+)\*"), "italic_under": re.compile(r"__([^_]+)__"), "under": re.compile(r"_([^_]+)_"), "header": re.compile(r"^#{1,6}\s+", re.MULTILINE), "bullet": re.compile(r"^[\s]*[-*+]\s+", re.MULTILINE), "numbered": re.compile(r"^\s*\d+[.)]\s+", re.MULTILINE), "link": re.compile(r"\[([^\]]+)\]\([^)]+\)"), "quote": re.compile(r"^>.*$", re.MULTILINE), "hr": re.compile(r"^---+$", re.MULTILINE), "think_tag": re.compile(r""), "xml_tag": re.compile(r"<[^>]+>"), "url": re.compile(r"https?://"), "contraction": re.compile(r"\b\w+'\w+\b"), "markdown_header": re.compile(r"^#{1,6}\s", re.MULTILINE), "markdown_bold": re.compile(r"\*\*.*?\*\*"), "markdown_code_block": re.compile(r"```"), "markdown_inline_code": re.compile(r"`[^`]+`"), "markdown_bullet": re.compile(r"^[\s]*[-*+]\s", re.MULTILINE), "markdown_numbered": re.compile(r"^\s*\d+[.)]\s", re.MULTILINE), "markdown_table": re.compile(r"\|.*\|"), "question_start": re.compile( r"^(who|what|when|where|why|how)\b", re.IGNORECASE | re.MULTILINE ), "emoji": re.compile(r"[\U00010000-\U0010ffff]"), "chinese": re.compile(r"[\u4e00-\u9fff]"), "all_caps": re.compile(r"\b[A-Z][a-z]+\b"), "four_word": re.compile(r"\b\w+\s+\w+\s+\w+\s+\w+\b"), "sent_boundary": re.compile(r"[.!?]\s+[A-Z]"), "paren": re.compile(r"\([^)]+\)"), "colon_def": re.compile(r"\b\w+:\s+\w+"), "double_quote": re.compile(r'"[^"]*"'), "single_quote": re.compile(r"'[^']*'"), "greeting": re.compile( r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b", re.IGNORECASE ), "conv_phrase": re.compile( r"\b(great|perfect|sure|definitely|certainly|absolutely|of course|no problem|sounds good|got it|understood|okay|alright)\b", re.IGNORECASE, ), "helpful": re.compile( r"\b(let me know|feel free|happy to|glad to|happy to help|don't hesitate|let me know if|please let me|reach out)\b", re.IGNORECASE, ), "closing_offer": re.compile( r"(let me know|feel free|happy to help|don't hesitate|hope this helps)", re.IGNORECASE, ), "self_id_ai": re.compile( r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b", re.IGNORECASE, ), "provider_mention": re.compile( r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM|深度求索)\b", re.IGNORECASE, ), "special_unicode": re.compile(r"[^\x00-\x7F]"), } _PRONOUN_SETS = { "first": frozenset( {"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"} ), "second": frozenset({"you", "your", "yours", "yourself", "yourselves"}), "third": frozenset( {"he", "she", "it", "they", "them", "his", "her", "its", "their"} ), } _DISCOURSE_SETS = { "conjunctions": frozenset( { "and", "but", "or", "nor", "for", "yet", "so", "because", "although", "while", "if", "when", "where", } ), "discourse": frozenset( { "however", "therefore", "moreover", "furthermore", "nevertheless", "consequently", "thus", "hence", } ), "hedging": frozenset( { "perhaps", "maybe", "might", "could", "possibly", "seemingly", "apparently", "arguably", "potentially", } ), "certainty": frozenset( { "definitely", "certainly", "absolutely", "clearly", "obviously", "undoubtedly", "indeed", "surely", } ), "transition": frozenset( { "additionally", "meanwhile", "subsequently", "alternatively", "specifically", "notably", "importantly", "essentially", } ), "casual": frozenset( { "okay", "ok", "hey", "hi", "cool", "awesome", "wow", "basically", "actually", "literally", "right", "yeah", } ), "formal": frozenset( { "regarding", "concerning", "pertaining", "aforementioned", "respectively", "accordingly", "henceforth", "whereby", "notwithstanding", "pursuant", } ), } _PUNC_STRIP = frozenset(".,!?;:'\"()[]{}") def strip_cot(text): return _RE_COMPILED["cot"].sub("", text).strip() def strip_markdown(text): text = _RE_COMPILED["code_block"].sub("", text) text = _RE_COMPILED["inline_code"].sub("", text) text = _RE_COMPILED["bold"].sub(r"\1", text) text = _RE_COMPILED["italic_ast"].sub(r"\1", text) text = _RE_COMPILED["italic_under"].sub(r"\1", text) text = _RE_COMPILED["under"].sub(r"\1", text) text = _RE_COMPILED["header"].sub("", text) text = _RE_COMPILED["bullet"].sub("", text) text = _RE_COMPILED["numbered"].sub("", text) text = _RE_COMPILED["link"].sub(r"\1", text) text = _RE_COMPILED["quote"].sub("", text) text = _RE_COMPILED["hr"].sub("", text) return text.strip() class StylometricFeatures(BaseEstimator, TransformerMixin): def fit(self, X, y=None): return self def transform(self, X): return csr_matrix(np.array([self._extract(t) for t in X], dtype=np.float32)) def _extract(self, text): n_chars = max(len(text), 1) words = text.split() n_words = max(len(words), 1) sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()] n_sentences = max(len(sentences), 1) paragraphs = text.split("\n\n") non_empty_paras = [p for p in paragraphs if p.strip()] n_paragraphs = len(non_empty_paras) lines = text.split("\n") non_empty_lines = [ln for ln in lines if ln.strip()] n_lines = max(len(non_empty_lines), 1) word_lens = [len(w) for w in words] sent_lens = [len(s.split()) for s in sentences] _rc = _RE_COMPILED _ps = _PRONOUN_SETS _ds = _DISCOURSE_SETS avg_word_len = np.mean(word_lens) if words else 0.0 word_len_std = np.std(word_lens) if len(words) > 1 else 0.0 median_word_len = np.median(word_lens) if words else 0.0 avg_sent_len = n_words / n_sentences n_commas = text.count(",") / n_chars n_semicolons = text.count(";") / n_chars n_colons = text.count(":") / n_chars n_dash = (text.count("—") + text.count("–") + text.count("--")) / n_chars n_parens = (text.count("(") + text.count(")")) / n_chars n_quotes = (text.count('"') + text.count("'")) / n_chars n_exclaim = text.count("!") / n_chars n_question = text.count("?") / n_chars n_period = text.count(".") / n_chars n_ellipsis = (text.count("...") + text.count("…")) / n_chars comma_colon_ratio = n_commas / (n_colons + 0.001) comma_period_ratio = n_commas / (n_period + 0.001) excl_question_ratio = n_exclaim / (n_question + 0.001) n_headers = len(_rc["markdown_header"].findall(text)) / n_sentences n_bold = len(_rc["markdown_bold"].findall(text)) / n_sentences n_code_blocks = len(_rc["markdown_code_block"].findall(text)) / n_sentences n_inline_code = len(_rc["markdown_inline_code"].findall(text)) / n_sentences n_bullet = len(_rc["markdown_bullet"].findall(text)) / n_sentences n_numbered = len(_rc["markdown_numbered"].findall(text)) / n_sentences n_tables = len(_rc["markdown_table"].findall(text)) / n_sentences newline_density = text.count("\n") / n_chars double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1) uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars digit_ratio = sum(1 for c in text if c.isdigit()) / n_chars space_ratio = sum(1 for c in text if c.isspace()) / n_chars unique_chars = len(set(text)) / n_chars unique_chars_ratio = len(set(text.lower())) / n_chars sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0.0 sent_len_max = max(sent_lens) if sent_lens else 0 sent_len_min = min(sent_lens) if sent_lens else 0 sent_len_median = np.median(sent_lens) if sent_lens else 0.0 sent_len_range = sent_len_max - sent_len_min has_think = 1.0 if _rc["think_tag"].search(text) else 0.0 has_xml = 1.0 if _rc["xml_tag"].search(text) else 0.0 has_hr = 1.0 if _rc["hr"].search(text) else 0.0 has_url = 1.0 if _rc["url"].search(text) else 0.0 words_lower = [w.lower().strip(".,!?;:'\"()[]{}") for w in words] first_person_ratio = sum(1 for w in words_lower if w in _ps["first"]) / n_words second_person_ratio = ( sum(1 for w in words_lower if w in _ps["second"]) / n_words ) third_person_ratio = sum(1 for w in words_lower if w in _ps["third"]) / n_words unique_words = len(set(words_lower)) ttr = unique_words / n_words if n_words > 0 else 0.0 word_counts = {} for w in words_lower: word_counts[w] = word_counts.get(w, 0) + 1 hapax = sum(1 for c in word_counts.values() if c == 1) hapax_ratio = hapax / n_words if n_words > 0 else 0.0 contraction_count = len(_rc["contraction"].findall(text)) contraction_ratio = contraction_count / n_words if n_words > 0 else 0.0 sentences_starters = [ s.split()[0].lower() if s.split() else "" for s in sentences ] starter_vocab = ( len(set(sentences_starters)) / n_sentences if n_sentences > 0 else 0.0 ) and_starts = sum(1 for s in sentences_starters if s == "and") / n_sentences but_starts = sum(1 for s in sentences_starters if s == "but") / n_sentences so_starts = sum(1 for s in sentences_starters if s == "so") / n_sentences the_starts = sum(1 for s in sentences_starters if s == "the") / n_sentences it_starts = ( sum(1 for s in sentences_starters if s in ("it", "it's")) / n_sentences ) i_starts = ( sum(1 for s in sentences_starters if s in ("i", "i'm", "i've")) / n_sentences ) short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words medium_word_ratio = sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words very_long_word_ratio = sum(1 for w in words_lower if len(w) >= 10) / n_words para_lens = ( [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0] ) avg_para_len = np.mean(para_lens) para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0.0 conjunction_ratio = ( sum(1 for w in words_lower if w in _ds["conjunctions"]) / n_words ) discourse_ratio = sum(1 for w in words_lower if w in _ds["discourse"]) / n_words hedging_ratio = sum(1 for w in words_lower if w in _ds["hedging"]) / n_words certainty_ratio = sum(1 for w in words_lower if w in _ds["certainty"]) / n_words transition_ratio = ( sum(1 for w in words_lower if w in _ds["transition"]) / n_words ) question_starts = sum( 1 for s in sentences if s and _rc["question_start"].search(s.lower()) ) has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0 list_items = n_bullet + n_numbered emoji_count = len(_rc["emoji"].findall(text)) has_emoji = 1.0 if emoji_count > 0 else 0.0 all_caps_words = sum( 1 for w in words if len(w) > 1 and w.isupper() and w.isalpha() ) all_caps_ratio = all_caps_words / n_words paren_count = len(_rc["paren"].findall(text)) paren_ratio = paren_count / n_sentences rhetorical_q = sum(1 for s in text.split("\n") if s.strip().endswith("?")) rhetorical_ratio = rhetorical_q / n_sentences casual_ratio = sum(1 for w in words_lower if w in _ds["casual"]) / n_words formal_ratio = sum(1 for w in words_lower if w in _ds["formal"]) / n_words chinese_chars = len(_rc["chinese"].findall(text)) has_chinese = 1.0 if chinese_chars > 0 else 0.0 chinese_ratio = chinese_chars / n_chars has_self_id_ai = 1.0 if _rc["self_id_ai"].search(text) else 0.0 has_provider_mention = 1.0 if _rc["provider_mention"].search(text) else 0.0 ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0 has_closing_offer = 1.0 if _rc["closing_offer"].search(text) else 0.0 commas_per_sentence = text.count(",") / n_sentences avg_line_len = ( np.mean([len(ln) for ln in non_empty_lines]) if non_empty_lines else 0.0 ) short_lines_ratio = ( sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines ) cap_words = len(_rc["all_caps"].findall(text)) cap_word_ratio = cap_words / n_words four_word_phrases = len(_rc["four_word"].findall(text)) phrase_ratio = four_word_phrases / n_sentences sent_boundaries = len(_rc["sent_boundary"].findall(text)) sent_boundary_ratio = sent_boundaries / n_sentences has_checkmark = 1.0 if any(c in text for c in "✓✗✔✘") else 0.0 has_arrow = 1.0 if any(c in text for c in "→←➡") else 0.0 has_star = 1.0 if any(c in text for c in "⭐★☆") else 0.0 special_unicode = len(_rc["special_unicode"].findall(text)) / n_chars colon_definitions = len(_rc["colon_def"].findall(text)) / n_sentences double_quote_pairs = len(_rc["double_quote"].findall(text)) / n_sentences single_quote_pairs = len(_rc["single_quote"].findall(text)) / n_sentences greeting_patterns = len(_rc["greeting"].findall(text)) greeting_ratio = greeting_patterns / n_sentences is_short = 1.0 if n_words < 100 else 0.0 is_medium = 1.0 if 100 <= n_words < 500 else 0.0 is_long = 1.0 if n_words >= 500 else 0.0 excl_sentences = sum(1 for s in sentences if s.strip().endswith("!")) excl_sentence_ratio = excl_sentences / n_sentences question_lines = [ln for ln in non_empty_lines if ln.strip().endswith("?")] question_line_ratio = len(question_lines) / n_lines if n_lines > 0 else 0.0 conversational_phrases = len(_rc["conv_phrase"].findall(text)) conv_phrase_ratio = conversational_phrases / n_words helpful_phrases = len(_rc["helpful"].findall(text)) helpful_ratio = helpful_phrases / n_sentences return [ avg_word_len, word_len_std, median_word_len, avg_sent_len, sent_len_std, sent_len_max, sent_len_min, sent_len_median, sent_len_range, commas_per_sentence, n_commas, n_semicolons, n_colons, n_dash, n_parens, n_quotes, n_exclaim, n_question, n_period, n_ellipsis, comma_colon_ratio, comma_period_ratio, excl_question_ratio, n_headers, n_bold, n_code_blocks, n_inline_code, n_bullet, n_numbered, n_tables, has_list, newline_density, double_newline_ratio, uppercase_ratio, digit_ratio, space_ratio, unique_chars, unique_chars_ratio, list_items, n_paragraphs, n_lines / n_sentences, has_think, has_xml, has_hr, has_url, first_person_ratio, second_person_ratio, third_person_ratio, ttr, hapax_ratio, contraction_ratio, short_word_ratio, medium_word_ratio, long_word_ratio, very_long_word_ratio, starter_vocab, and_starts, but_starts, so_starts, the_starts, it_starts, avg_para_len, para_len_std, conjunction_ratio, discourse_ratio, hedging_ratio, certainty_ratio, transition_ratio, question_starts / n_sentences if n_sentences > 0 else 0, emoji_count, has_emoji, special_unicode, all_caps_ratio, paren_ratio, rhetorical_ratio, casual_ratio, formal_ratio, has_chinese, chinese_ratio, has_self_id_ai, has_provider_mention, ends_with_question, has_closing_offer, has_checkmark, has_arrow, has_star, avg_line_len, short_lines_ratio, cap_word_ratio, phrase_ratio, sent_boundary_ratio, colon_definitions, double_quote_pairs, single_quote_pairs, i_starts, greeting_ratio, is_short, is_medium, is_long, excl_sentence_ratio, question_line_ratio, conv_phrase_ratio, helpful_ratio, ] class StyleOnlyPipeline: """Feature pipeline using ONLY stylometric features — no TF-IDF.""" def __init__(self): self.stylo = StylometricFeatures() self.scaler = MaxAbsScaler() def fit_transform(self, texts): import time texts_clean = [strip_markdown(strip_cot(t)) for t in texts] t0 = time.time() stylo_features = self.stylo.transform(texts_clean) print( f" Stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)" ) result = self.scaler.fit_transform(stylo_features) print(f" Final feature matrix: {result.shape}") return result def transform(self, texts): texts_clean = [strip_markdown(strip_cot(t)) for t in texts] stylo_features = self.stylo.transform(texts_clean) return self.scaler.transform(stylo_features) class FeaturePipeline: def __init__(self, use_tfidf=True): word_params = dict(TFIDF_WORD_PARAMS) char_params = dict(TFIDF_CHAR_PARAMS) if word_params.get("max_features", 1) == 0: word_params["max_features"] = None if char_params.get("max_features", 1) == 0: char_params["max_features"] = None self.word_tfidf = TfidfVectorizer(**word_params) self.char_tfidf = TfidfVectorizer(**char_params) self.stylo = StylometricFeatures() self.scaler = MaxAbsScaler() self.use_tfidf = use_tfidf and ( TFIDF_WORD_PARAMS.get("max_features", 1) > 0 or TFIDF_CHAR_PARAMS.get("max_features", 1) > 0 ) def _clean_for_tfidf(self, text): return strip_markdown(strip_cot(text)) def fit_transform(self, texts): import time print(f" Input: {len(texts)} texts", flush=True) texts_clean = [strip_markdown(strip_cot(t)) for t in texts] texts_tfidf = texts_clean use_word_tfidf = ( self.word_tfidf.max_features is not None and self.word_tfidf.max_features > 0 ) if use_word_tfidf: t0 = time.time() word_features = self.word_tfidf.fit_transform(texts_tfidf) print( f" word tfidf: {word_features.shape[1]} features ({time.time() - t0:.1f}s)", flush=True, ) else: word_features = csr_matrix((len(texts), 0), dtype=np.float32) if self.use_tfidf: t0 = time.time() char_features = self.char_tfidf.fit_transform(texts_tfidf) print( f" char tfidf: {char_features.shape[1]} features ({time.time() - t0:.1f}s)", flush=True, ) else: char_features = csr_matrix((len(texts), 0), dtype=np.float32) t0 = time.time() stylo_features = self.stylo.transform(texts_clean) print( f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)", flush=True, ) combined = hstack([word_features, char_features, stylo_features]) combined = self.scaler.fit_transform(combined) print(f" Combined feature matrix: {combined.shape}", flush=True) return combined def transform(self, texts): texts_clean = [strip_markdown(strip_cot(t)) for t in texts] texts_tfidf = texts_clean use_word_tfidf = ( self.word_tfidf.max_features is not None and self.word_tfidf.max_features > 0 ) if use_word_tfidf: word_features = self.word_tfidf.transform(texts_tfidf) else: word_features = csr_matrix((len(texts), 0), dtype=np.float32) if self.use_tfidf: char_features = self.char_tfidf.transform(texts_tfidf) else: char_features = csr_matrix((len(texts), 0), dtype=np.float32) stylo_features = self.stylo.transform(texts_clean) combined = hstack([word_features, char_features, stylo_features]) return self.scaler.transform(combined)