"""
AIFinder Feature Extraction
Optimized TF-IDF and stylometric features for AI model detection.
"""
import re
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MaxAbsScaler
from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS
_RE_COMPILED = {
"cot": re.compile(r".*?", re.DOTALL),
"code_block": re.compile(r"```[\s\S]*?```"),
"inline_code": re.compile(r"`[^`]+`"),
"bold": re.compile(r"\*\*([^*]+)\*\*"),
"italic_ast": re.compile(r"\*([^*]+)\*"),
"italic_under": re.compile(r"__([^_]+)__"),
"under": re.compile(r"_([^_]+)_"),
"header": re.compile(r"^#{1,6}\s+", re.MULTILINE),
"bullet": re.compile(r"^[\s]*[-*+]\s+", re.MULTILINE),
"numbered": re.compile(r"^\s*\d+[.)]\s+", re.MULTILINE),
"link": re.compile(r"\[([^\]]+)\]\([^)]+\)"),
"quote": re.compile(r"^>.*$", re.MULTILINE),
"hr": re.compile(r"^---+$", re.MULTILINE),
"think_tag": re.compile(r""),
"xml_tag": re.compile(r"<[^>]+>"),
"url": re.compile(r"https?://"),
"contraction": re.compile(r"\b\w+'\w+\b"),
"markdown_header": re.compile(r"^#{1,6}\s", re.MULTILINE),
"markdown_bold": re.compile(r"\*\*.*?\*\*"),
"markdown_code_block": re.compile(r"```"),
"markdown_inline_code": re.compile(r"`[^`]+`"),
"markdown_bullet": re.compile(r"^[\s]*[-*+]\s", re.MULTILINE),
"markdown_numbered": re.compile(r"^\s*\d+[.)]\s", re.MULTILINE),
"markdown_table": re.compile(r"\|.*\|"),
"question_start": re.compile(
r"^(who|what|when|where|why|how)\b", re.IGNORECASE | re.MULTILINE
),
"emoji": re.compile(r"[\U00010000-\U0010ffff]"),
"chinese": re.compile(r"[\u4e00-\u9fff]"),
"all_caps": re.compile(r"\b[A-Z][a-z]+\b"),
"four_word": re.compile(r"\b\w+\s+\w+\s+\w+\s+\w+\b"),
"sent_boundary": re.compile(r"[.!?]\s+[A-Z]"),
"paren": re.compile(r"\([^)]+\)"),
"colon_def": re.compile(r"\b\w+:\s+\w+"),
"double_quote": re.compile(r'"[^"]*"'),
"single_quote": re.compile(r"'[^']*'"),
"greeting": re.compile(
r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b", re.IGNORECASE
),
"conv_phrase": re.compile(
r"\b(great|perfect|sure|definitely|certainly|absolutely|of course|no problem|sounds good|got it|understood|okay|alright)\b",
re.IGNORECASE,
),
"helpful": re.compile(
r"\b(let me know|feel free|happy to|glad to|happy to help|don't hesitate|let me know if|please let me|reach out)\b",
re.IGNORECASE,
),
"closing_offer": re.compile(
r"(let me know|feel free|happy to help|don't hesitate|hope this helps)",
re.IGNORECASE,
),
"self_id_ai": re.compile(
r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b",
re.IGNORECASE,
),
"provider_mention": re.compile(
r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM|深度求索)\b",
re.IGNORECASE,
),
"special_unicode": re.compile(r"[^\x00-\x7F]"),
}
_PRONOUN_SETS = {
"first": frozenset(
{"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"}
),
"second": frozenset({"you", "your", "yours", "yourself", "yourselves"}),
"third": frozenset(
{"he", "she", "it", "they", "them", "his", "her", "its", "their"}
),
}
_DISCOURSE_SETS = {
"conjunctions": frozenset(
{
"and",
"but",
"or",
"nor",
"for",
"yet",
"so",
"because",
"although",
"while",
"if",
"when",
"where",
}
),
"discourse": frozenset(
{
"however",
"therefore",
"moreover",
"furthermore",
"nevertheless",
"consequently",
"thus",
"hence",
}
),
"hedging": frozenset(
{
"perhaps",
"maybe",
"might",
"could",
"possibly",
"seemingly",
"apparently",
"arguably",
"potentially",
}
),
"certainty": frozenset(
{
"definitely",
"certainly",
"absolutely",
"clearly",
"obviously",
"undoubtedly",
"indeed",
"surely",
}
),
"transition": frozenset(
{
"additionally",
"meanwhile",
"subsequently",
"alternatively",
"specifically",
"notably",
"importantly",
"essentially",
}
),
"casual": frozenset(
{
"okay",
"ok",
"hey",
"hi",
"cool",
"awesome",
"wow",
"basically",
"actually",
"literally",
"right",
"yeah",
}
),
"formal": frozenset(
{
"regarding",
"concerning",
"pertaining",
"aforementioned",
"respectively",
"accordingly",
"henceforth",
"whereby",
"notwithstanding",
"pursuant",
}
),
}
_PUNC_STRIP = frozenset(".,!?;:'\"()[]{}")
def strip_cot(text):
    """Delete every match of the module-level ``"cot"`` pattern from *text*.

    Returns the remaining text with leading and trailing whitespace removed.
    """
    remainder = _RE_COMPILED["cot"].sub("", text)
    return remainder.strip()
def strip_markdown(text):
    """Remove markdown markup from *text* and return the stripped result.

    Code blocks, inline code, headers, list markers, block quotes and
    horizontal rules are deleted outright; bold/italic/underscore emphasis and
    links are replaced by their inner text (capture group 1).
    """
    # (pattern key, replacement) pairs.  The order matters: e.g. fenced code
    # must go before inline code, and "**bold**" before "*italic*".
    # NOTE(review): the asterisk-italic step runs before bullet removal and
    # its body may span newlines, so consecutive "* item" lines can be
    # consumed as emphasis -- confirm whether that is intended.
    substitutions = (
        ("code_block", ""),
        ("inline_code", ""),
        ("bold", r"\1"),
        ("italic_ast", r"\1"),
        ("italic_under", r"\1"),
        ("under", r"\1"),
        ("header", ""),
        ("bullet", ""),
        ("numbered", ""),
        ("link", r"\1"),
        ("quote", ""),
        ("hr", ""),
    )
    for pattern_key, replacement in substitutions:
        text = _RE_COMPILED[pattern_key].sub(replacement, text)
    return text.strip()
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer turning each text into a fixed-length vector of
    hand-crafted stylometric features.

    Nothing is learned in ``fit``; ``transform`` applies ``_extract`` to every
    document and stacks the rows into a sparse float32 matrix.  The order of
    the values returned by ``_extract`` defines the column layout, so it must
    not be changed without refitting anything trained on these features.
    """

    def fit(self, X, y=None):
        """No-op fit (there is nothing to learn). Returns ``self``."""
        return self

    def transform(self, X):
        """Extract features for every text in *X*.

        Args:
            X: iterable of strings.

        Returns:
            ``csr_matrix`` of dtype float32 with one row per text.  An empty
            *X* yields a matrix of shape ``(0, n_features)``.
        """
        rows = [self._extract(t) for t in X]
        if not rows:
            # np.array([]) is 1-D and csr_matrix would interpret it as one
            # empty row (shape (1, 0)).  Build the empty matrix explicitly so
            # the column count still matches non-empty batches.
            n_features = len(self._extract(""))
            return csr_matrix((0, n_features), dtype=np.float32)
        return csr_matrix(np.array(rows, dtype=np.float32))

    def _extract(self, text):
        """Compute the feature list for a single *text*.

        Denominators (characters, words, sentences, lines) are clamped to at
        least 1 so that empty input produces zeros instead of dividing by
        zero.

        Returns:
            list of numbers, always of the same length and order.
        """
        # ---- basic segmentation ----------------------------------------
        n_chars = max(len(text), 1)
        words = text.split()
        n_words = max(len(words), 1)
        # Sentences are the non-empty pieces between runs of . ! ?  -- the
        # terminators themselves are discarded by the split.
        sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
        n_sentences = max(len(sentences), 1)
        paragraphs = text.split("\n\n")
        non_empty_paras = [p for p in paragraphs if p.strip()]
        n_paragraphs = len(non_empty_paras)
        lines = text.split("\n")
        non_empty_lines = [ln for ln in lines if ln.strip()]
        n_lines = max(len(non_empty_lines), 1)
        word_lens = [len(w) for w in words]
        sent_lens = [len(s.split()) for s in sentences]
        # Local aliases for the module-level lookup tables.
        _rc = _RE_COMPILED
        _ps = _PRONOUN_SETS
        _ds = _DISCOURSE_SETS

        # ---- word / sentence length statistics -------------------------
        avg_word_len = np.mean(word_lens) if words else 0.0
        word_len_std = np.std(word_lens) if len(words) > 1 else 0.0
        median_word_len = np.median(word_lens) if words else 0.0
        avg_sent_len = n_words / n_sentences

        # ---- punctuation densities (per character) ---------------------
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_dash = (text.count("—") + text.count("–") + text.count("--")) / n_chars
        n_parens = (text.count("(") + text.count(")")) / n_chars
        n_quotes = (text.count('"') + text.count("'")) / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_period = text.count(".") / n_chars
        n_ellipsis = (text.count("...") + text.count("…")) / n_chars
        # The 0.001 offset keeps the ratios finite when the denominator is 0.
        comma_colon_ratio = n_commas / (n_colons + 0.001)
        comma_period_ratio = n_commas / (n_period + 0.001)
        excl_question_ratio = n_exclaim / (n_question + 0.001)

        # ---- markdown usage (per sentence) -----------------------------
        n_headers = len(_rc["markdown_header"].findall(text)) / n_sentences
        n_bold = len(_rc["markdown_bold"].findall(text)) / n_sentences
        n_code_blocks = len(_rc["markdown_code_block"].findall(text)) / n_sentences
        n_inline_code = len(_rc["markdown_inline_code"].findall(text)) / n_sentences
        n_bullet = len(_rc["markdown_bullet"].findall(text)) / n_sentences
        n_numbered = len(_rc["markdown_numbered"].findall(text)) / n_sentences
        n_tables = len(_rc["markdown_table"].findall(text)) / n_sentences

        # ---- character-level ratios ------------------------------------
        newline_density = text.count("\n") / n_chars
        double_newline_ratio = text.count("\n\n") / (text.count("\n") + 1)
        uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
        digit_ratio = sum(1 for c in text if c.isdigit()) / n_chars
        space_ratio = sum(1 for c in text if c.isspace()) / n_chars
        unique_chars = len(set(text)) / n_chars
        unique_chars_ratio = len(set(text.lower())) / n_chars

        # ---- sentence length spread ------------------------------------
        sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0.0
        sent_len_max = max(sent_lens) if sent_lens else 0
        sent_len_min = min(sent_lens) if sent_lens else 0
        sent_len_median = np.median(sent_lens) if sent_lens else 0.0
        sent_len_range = sent_len_max - sent_len_min

        # ---- presence flags --------------------------------------------
        has_think = 1.0 if _rc["think_tag"].search(text) else 0.0
        has_xml = 1.0 if _rc["xml_tag"].search(text) else 0.0
        has_hr = 1.0 if _rc["hr"].search(text) else 0.0
        has_url = 1.0 if _rc["url"].search(text) else 0.0

        # ---- vocabulary features ---------------------------------------
        # Normalised tokens: lower-cased with edge punctuation removed.
        words_lower = [w.lower().strip(".,!?;:'\"()[]{}") for w in words]
        first_person_ratio = sum(1 for w in words_lower if w in _ps["first"]) / n_words
        second_person_ratio = (
            sum(1 for w in words_lower if w in _ps["second"]) / n_words
        )
        third_person_ratio = sum(1 for w in words_lower if w in _ps["third"]) / n_words
        unique_words = len(set(words_lower))
        # Type/token ratio.
        ttr = unique_words / n_words if n_words > 0 else 0.0
        word_counts = {}
        for w in words_lower:
            word_counts[w] = word_counts.get(w, 0) + 1
        # Hapax legomena: tokens occurring exactly once.
        hapax = sum(1 for c in word_counts.values() if c == 1)
        hapax_ratio = hapax / n_words if n_words > 0 else 0.0
        contraction_count = len(_rc["contraction"].findall(text))
        contraction_ratio = contraction_count / n_words if n_words > 0 else 0.0

        # ---- sentence openers ------------------------------------------
        sentences_starters = [
            s.split()[0].lower() if s.split() else "" for s in sentences
        ]
        starter_vocab = (
            len(set(sentences_starters)) / n_sentences if n_sentences > 0 else 0.0
        )
        and_starts = sum(1 for s in sentences_starters if s == "and") / n_sentences
        but_starts = sum(1 for s in sentences_starters if s == "but") / n_sentences
        so_starts = sum(1 for s in sentences_starters if s == "so") / n_sentences
        the_starts = sum(1 for s in sentences_starters if s == "the") / n_sentences
        it_starts = (
            sum(1 for s in sentences_starters if s in ("it", "it's")) / n_sentences
        )
        i_starts = (
            sum(1 for s in sentences_starters if s in ("i", "i'm", "i've"))
            / n_sentences
        )

        # ---- word length buckets ---------------------------------------
        # Note: "long" (>= 7) includes the "very long" (>= 10) bucket.
        short_word_ratio = sum(1 for w in words_lower if len(w) <= 2) / n_words
        medium_word_ratio = sum(1 for w in words_lower if 3 <= len(w) <= 6) / n_words
        long_word_ratio = sum(1 for w in words_lower if len(w) >= 7) / n_words
        very_long_word_ratio = sum(1 for w in words_lower if len(w) >= 10) / n_words

        # ---- paragraph lengths (in words) ------------------------------
        para_lens = (
            [len(p.split()) for p in non_empty_paras] if non_empty_paras else [0]
        )
        avg_para_len = np.mean(para_lens)
        para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0.0

        # ---- discourse marker ratios -----------------------------------
        conjunction_ratio = (
            sum(1 for w in words_lower if w in _ds["conjunctions"]) / n_words
        )
        discourse_ratio = sum(1 for w in words_lower if w in _ds["discourse"]) / n_words
        hedging_ratio = sum(1 for w in words_lower if w in _ds["hedging"]) / n_words
        certainty_ratio = sum(1 for w in words_lower if w in _ds["certainty"]) / n_words
        transition_ratio = (
            sum(1 for w in words_lower if w in _ds["transition"]) / n_words
        )
        question_starts = sum(
            1 for s in sentences if s and _rc["question_start"].search(s.lower())
        )

        # ---- lists, emoji, capitalisation ------------------------------
        has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0
        list_items = n_bullet + n_numbered
        emoji_count = len(_rc["emoji"].findall(text))
        has_emoji = 1.0 if emoji_count > 0 else 0.0
        all_caps_words = sum(
            1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
        )
        all_caps_ratio = all_caps_words / n_words
        paren_count = len(_rc["paren"].findall(text))
        paren_ratio = paren_count / n_sentences
        # Counts lines (not sentences) whose last non-space char is "?".
        rhetorical_q = sum(1 for s in text.split("\n") if s.strip().endswith("?"))
        rhetorical_ratio = rhetorical_q / n_sentences
        casual_ratio = sum(1 for w in words_lower if w in _ds["casual"]) / n_words
        formal_ratio = sum(1 for w in words_lower if w in _ds["formal"]) / n_words

        # ---- language / identity cues ----------------------------------
        chinese_chars = len(_rc["chinese"].findall(text))
        has_chinese = 1.0 if chinese_chars > 0 else 0.0
        chinese_ratio = chinese_chars / n_chars
        has_self_id_ai = 1.0 if _rc["self_id_ai"].search(text) else 0.0
        has_provider_mention = 1.0 if _rc["provider_mention"].search(text) else 0.0
        ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0
        has_closing_offer = 1.0 if _rc["closing_offer"].search(text) else 0.0

        # ---- line-level layout -----------------------------------------
        commas_per_sentence = text.count(",") / n_sentences
        avg_line_len = (
            np.mean([len(ln) for ln in non_empty_lines]) if non_empty_lines else 0.0
        )
        short_lines_ratio = (
            sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines
        )
        # The "all_caps" pattern matches Capitalized words, hence the name
        # of the derived feature.
        cap_words = len(_rc["all_caps"].findall(text))
        cap_word_ratio = cap_words / n_words
        four_word_phrases = len(_rc["four_word"].findall(text))
        phrase_ratio = four_word_phrases / n_sentences
        sent_boundaries = len(_rc["sent_boundary"].findall(text))
        sent_boundary_ratio = sent_boundaries / n_sentences

        # ---- special symbols -------------------------------------------
        has_checkmark = 1.0 if any(c in text for c in "✓✗✔✘") else 0.0
        has_arrow = 1.0 if any(c in text for c in "→←➡") else 0.0
        has_star = 1.0 if any(c in text for c in "⭐★☆") else 0.0
        special_unicode = len(_rc["special_unicode"].findall(text)) / n_chars
        colon_definitions = len(_rc["colon_def"].findall(text)) / n_sentences
        double_quote_pairs = len(_rc["double_quote"].findall(text)) / n_sentences
        single_quote_pairs = len(_rc["single_quote"].findall(text)) / n_sentences

        # ---- conversational tone ---------------------------------------
        greeting_patterns = len(_rc["greeting"].findall(text))
        greeting_ratio = greeting_patterns / n_sentences
        # One-hot length bucket based on the (clamped) word count.
        is_short = 1.0 if n_words < 100 else 0.0
        is_medium = 1.0 if 100 <= n_words < 500 else 0.0
        is_long = 1.0 if n_words >= 500 else 0.0
        # NOTE(review): `sentences` was produced by splitting on [.!?]+, so no
        # element can contain "!" and this count is always 0.  Left unchanged
        # so feature values stay stable for anything already trained on this
        # layout -- confirm before fixing.
        excl_sentences = sum(1 for s in sentences if s.strip().endswith("!"))
        excl_sentence_ratio = excl_sentences / n_sentences
        question_lines = [ln for ln in non_empty_lines if ln.strip().endswith("?")]
        question_line_ratio = len(question_lines) / n_lines if n_lines > 0 else 0.0
        conversational_phrases = len(_rc["conv_phrase"].findall(text))
        conv_phrase_ratio = conversational_phrases / n_words
        helpful_phrases = len(_rc["helpful"].findall(text))
        helpful_ratio = helpful_phrases / n_sentences

        # The order below is the feature-column order; do not reorder.
        return [
            avg_word_len,
            word_len_std,
            median_word_len,
            avg_sent_len,
            sent_len_std,
            sent_len_max,
            sent_len_min,
            sent_len_median,
            sent_len_range,
            commas_per_sentence,
            n_commas,
            n_semicolons,
            n_colons,
            n_dash,
            n_parens,
            n_quotes,
            n_exclaim,
            n_question,
            n_period,
            n_ellipsis,
            comma_colon_ratio,
            comma_period_ratio,
            excl_question_ratio,
            n_headers,
            n_bold,
            n_code_blocks,
            n_inline_code,
            n_bullet,
            n_numbered,
            n_tables,
            has_list,
            newline_density,
            double_newline_ratio,
            uppercase_ratio,
            digit_ratio,
            space_ratio,
            unique_chars,
            unique_chars_ratio,
            list_items,
            n_paragraphs,
            n_lines / n_sentences,
            has_think,
            has_xml,
            has_hr,
            has_url,
            first_person_ratio,
            second_person_ratio,
            third_person_ratio,
            ttr,
            hapax_ratio,
            contraction_ratio,
            short_word_ratio,
            medium_word_ratio,
            long_word_ratio,
            very_long_word_ratio,
            starter_vocab,
            and_starts,
            but_starts,
            so_starts,
            the_starts,
            it_starts,
            avg_para_len,
            para_len_std,
            conjunction_ratio,
            discourse_ratio,
            hedging_ratio,
            certainty_ratio,
            transition_ratio,
            question_starts / n_sentences if n_sentences > 0 else 0,
            emoji_count,
            has_emoji,
            special_unicode,
            all_caps_ratio,
            paren_ratio,
            rhetorical_ratio,
            casual_ratio,
            formal_ratio,
            has_chinese,
            chinese_ratio,
            has_self_id_ai,
            has_provider_mention,
            ends_with_question,
            has_closing_offer,
            has_checkmark,
            has_arrow,
            has_star,
            avg_line_len,
            short_lines_ratio,
            cap_word_ratio,
            phrase_ratio,
            sent_boundary_ratio,
            colon_definitions,
            double_quote_pairs,
            single_quote_pairs,
            i_starts,
            greeting_ratio,
            is_short,
            is_medium,
            is_long,
            excl_sentence_ratio,
            question_line_ratio,
            conv_phrase_ratio,
            helpful_ratio,
        ]
class StyleOnlyPipeline:
    """Stylometric-only feature pipeline (no TF-IDF).

    Texts are cleaned with ``strip_cot`` then ``strip_markdown``, converted to
    stylometric features and scaled with a ``MaxAbsScaler``.
    """

    def __init__(self):
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

    def fit_transform(self, texts):
        """Fit the scaler on *texts* and return their scaled feature matrix.

        Prints the feature count, extraction time and final matrix shape.
        """
        import time

        cleaned = []
        for raw in texts:
            cleaned.append(strip_markdown(strip_cot(raw)))

        started = time.time()
        features = self.stylo.transform(cleaned)
        n_features = features.shape[1]
        elapsed = time.time() - started
        print(f" Stylometric: {n_features} features ({elapsed:.1f}s)")

        scaled = self.scaler.fit_transform(features)
        print(f" Final feature matrix: {scaled.shape}")
        return scaled

    def transform(self, texts):
        """Return scaled stylometric features using the already-fitted scaler."""
        cleaned = []
        for raw in texts:
            cleaned.append(strip_markdown(strip_cot(raw)))
        features = self.stylo.transform(cleaned)
        return self.scaler.transform(features)
class FeaturePipeline:
    """Combine word TF-IDF, character TF-IDF and stylometric features.

    The three blocks are horizontally stacked (word, char, stylometric) and
    scaled with a ``MaxAbsScaler``.

    A TF-IDF block is produced only when BOTH hold:
      * ``use_tfidf`` is on for the pipeline, and
      * that vectorizer has a positive ``max_features``.
    A configured ``max_features`` of 0 means "disabled"; it is stored on the
    vectorizer as ``None`` because scikit-learn does not accept 0.  Disabled
    blocks contribute zero columns.

    NOTE(review): earlier revisions ran word TF-IDF even with
    ``use_tfidf=False`` and ran char TF-IDF with an unlimited vocabulary when
    only its ``max_features`` was 0.  Pipelines fitted under those
    configurations will have a different column count -- refit them.
    """

    def __init__(self, use_tfidf=True):
        """Build the vectorizers from the config dictionaries.

        Args:
            use_tfidf: set to False to disable both TF-IDF blocks.
        """
        word_params = dict(TFIDF_WORD_PARAMS)
        char_params = dict(TFIDF_CHAR_PARAMS)
        # 0 is this module's "off" marker; TfidfVectorizer needs None or a
        # positive int, so translate before constructing.
        for params in (word_params, char_params):
            if params.get("max_features", 1) == 0:
                params["max_features"] = None
        self.word_tfidf = TfidfVectorizer(**word_params)
        self.char_tfidf = TfidfVectorizer(**char_params)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()
        # None-safe comparison: a configured max_features of None no longer
        # raises TypeError on `None > 0`.
        self.use_tfidf = use_tfidf and (
            self._is_positive(TFIDF_WORD_PARAMS.get("max_features", 1))
            or self._is_positive(TFIDF_CHAR_PARAMS.get("max_features", 1))
        )

    @staticmethod
    def _is_positive(value):
        """Return True when *value* is a number greater than zero (None -> False)."""
        return value is not None and value > 0

    def _vectorizer_enabled(self, vectorizer):
        """Return True when *vectorizer* should contribute feature columns."""
        # Derived from existing attributes only, so instances created before
        # this helper existed keep working after unpickling.
        return bool(self.use_tfidf) and self._is_positive(vectorizer.max_features)

    def _clean_for_tfidf(self, text):
        """Strip chain-of-thought and markdown markup from a single text."""
        return strip_markdown(strip_cot(text))

    def _tfidf_block(self, vectorizer, texts, fit):
        """Return the TF-IDF matrix for *texts*, or an (n, 0) matrix if disabled."""
        if not self._vectorizer_enabled(vectorizer):
            return csr_matrix((len(texts), 0), dtype=np.float32)
        if fit:
            return vectorizer.fit_transform(texts)
        return vectorizer.transform(texts)

    def fit_transform(self, texts):
        """Fit vectorizers and scaler on *texts*; return the scaled matrix.

        Progress (input size, per-block feature counts and timings, final
        shape) is printed to stdout.
        """
        import time

        print(f" Input: {len(texts)} texts", flush=True)
        texts_clean = [self._clean_for_tfidf(t) for t in texts]

        blocks = []
        for label, vectorizer in (
            ("word tfidf", self.word_tfidf),
            ("char tfidf", self.char_tfidf),
        ):
            t0 = time.time()
            features = self._tfidf_block(vectorizer, texts_clean, fit=True)
            # Only report blocks that actually ran.
            if self._vectorizer_enabled(vectorizer):
                print(
                    f" {label}: {features.shape[1]} features ({time.time() - t0:.1f}s)",
                    flush=True,
                )
            blocks.append(features)

        t0 = time.time()
        stylo_features = self.stylo.transform(texts_clean)
        print(
            f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
            flush=True,
        )
        blocks.append(stylo_features)

        combined = self.scaler.fit_transform(hstack(blocks))
        print(f" Combined feature matrix: {combined.shape}", flush=True)
        return combined

    def transform(self, texts):
        """Return the scaled feature matrix for *texts* using fitted components."""
        texts_clean = [self._clean_for_tfidf(t) for t in texts]
        blocks = [
            self._tfidf_block(self.word_tfidf, texts_clean, fit=False),
            self._tfidf_block(self.char_tfidf, texts_clean, fit=False),
            self.stylo.transform(texts_clean),
        ]
        return self.scaler.transform(hstack(blocks))