# Early Release
# This evaluator reflects early-stage work. We’re continuously improving its accuracy and reliability.
import ast
import asyncio
import json
from typing import Any, List
import numpy as np
import pandas as pd
from langchain_core.messages import SystemMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from textstat import textstat as ts
# Prompts and Rubrics from the Prompts section are saved as a separate file
from prompts import sent_str_prompts as prompts
# Set your OpenAI API key in the environment before running, for example:
# os.environ['OPENAI_API_KEY'] = 'YOUR API KEY'
# (the line above needs `import os`, which is not among this file's imports)

# Chat model shared by the sentence-analysis chain and the
# complexity-classification chain defined in this file.
MODEL_NAME = "gpt-4o"
# Sampling temperature handed to ChatOpenAI.
TEMPERATURE = 0
# NOTE(review): `timeout` is presumably expressed in seconds by the OpenAI
# client; 15000 would then be roughly four hours. Confirm whether
# milliseconds (i.e. 15 seconds) were intended.
model = ChatOpenAI(model=MODEL_NAME, temperature=TEMPERATURE, timeout=15000)
### Models for Sentence analysis and Complexity prediction outputs
# Schema of the JSON object the sentence-analysis prompt asks the model to
# return. The field descriptions below are rendered into the prompt's format
# instructions, so they are runtime text: edit them only on purpose.
# NOTE: documentation is kept in `#` comments rather than a class docstring,
# because pydantic copies a class docstring into the generated JSON schema
# and that would change the instructions sent to the model.
class SentenceAnalysesEvaluatorOutput(BaseModel):
    reasoning: str = Field(
        description="Your step-by-step reasoning as you analyze the text, with a line break between each sentence analyzed."
    )

    # Foundational
    num_sentences: int = Field(description="Total number of sentences in the text.")
    num_words: int = Field(description="Total number of words in the text.")
    flesch_kincaid_grade: float = Field(
        description="Flesch-Kincaid Grade Level number from Computational Counts, rounded to two decimal places."
    )

    # Sentence Type
    num_simple_sentences: int = Field(
        description="Number of simple sentences in the text."
    )
    num_compound_sentences: int = Field(
        description="Number of compound sentences in the text."
    )
    num_complex_sentences: int = Field(
        description="Number of complex sentences in the text."
    )
    num_compound_complex_sentences: int = Field(
        description="Number of compound-complex sentences in the text."
    )
    num_other_sentences: int = Field(
        description="Number of sentences that do not fit the four canonical types (e.g., fragments, run-ons, elliptical answers, headlines, or stylized dialogue tags)."
    )

    # Subordination
    num_independent_clauses: int = Field(
        description="Total count of all independent clauses in the text."
    )
    num_subordinate_clauses: int = Field(
        description="Total count of all subordinate clauses in the text. A single sentence can have multiple."
    )
    num_total_clauses: int = Field(
        description="The sum of all independent and subordinate clauses in the text."
    )
    num_sentences_with_subordinate: int = Field(
        description="Number of sentences that contain a subordinate clause."
    )
    num_sentences_with_multiple_subordinates: int = Field(
        description="Number of sentences that contain two or more subordinate clauses."
    )
    num_sentences_with_embedded_clauses: int = Field(
        description="Number of sentences with an embedded clause (a subordinate clause inside another clause)."
    )

    # Informational Phrases
    num_prepositional_phrases: int = Field(
        description="Number of prepositional phrases in the text."
    )
    num_participle_phrases: int = Field(
        description="Number of participle phrases in the text."
    )
    num_appositive_phrases: int = Field(
        description="Number of appositive phrases in the text."
    )

    # Cohesion
    num_simple_transitions: int = Field(
        description="Number of simple transitions in the text."
    )
    num_sophisticated_transitions: int = Field(
        description="Number of sophisticated transitions in the text."
    )

    # Sentence Type Density
    words_in_simple_sentences: int = Field(
        description="Total number of words in all sentences classified as simple."
    )
    words_in_compound_sentences: int = Field(
        description="Total number of words in all sentences classified as compound."
    )
    words_in_complex_sentences: int = Field(
        description="Total number of words in all sentences classified as complex."
    )
    words_in_compound_complex_sentences: int = Field(
        description="Total number of words in all sentences classified as compound-complex."
    )
    words_in_other_sentences: int = Field(
        description="Total number of words in all sentences classified as other."
    )

    # Additional Features
    sentence_word_counts: List[int] = Field(
        description="A list containing the word count of each individual sentence in the order they appear."
    )
    num_one_concept_sentences: int = Field(
        description="Number of sentences with a single main idea: no subordinate clause and no transition word/phrase."
    )
    num_multi_concept_sentences: int = Field(
        description="Number of sentences with multiple main ideas: either a subordinate clause or a transition word/phrase or both."
    )
    num_cleft_sentences: int = Field(
        description='Number of sentences with cleft constructions (e.g., "It was X that...", "What X did was...").'
    )
    max_clauses_in_any_sentence: int = Field(
        description="Max number of clauses (independent + subordinate) found in a single sentence."
    )
# Schema of the JSON object the complexity-classification prompt asks the
# model to return. The descriptions are rendered into the prompt's format
# instructions, so they are runtime text.
# NOTE: kept as `#` comments instead of a class docstring, because pydantic
# copies a class docstring into the JSON schema shown to the model.
# NOTE(review): "approriate" below is misspelled; it is left unchanged here
# because altering it alters the prompt. Fix deliberately if desired.
class ComplexityClassificationOutput(BaseModel):
    reasoning: str = Field(
        description="Detailed reasoning that is pedagogically approriate, and helpful for K-12 educators."
    )
    answer: str = Field(
        description="The final complexity category, which must be one of ['Slightly Complex', 'Moderately Complex', 'Very Complex', 'Exceedingly Complex']."
    )
# Function to do sentence analysis
async def execute_sentence_analysis(text: str) -> dict:
    """Run the LLM sentence-structure analysis for one text.

    Deterministic counts (sentences, words, characters, syllables and the
    Flesch-Kincaid grade) are computed with textstat and passed to the
    prompt as ``ground_truth_counts`` alongside the text itself.

    Args:
        text: The passage to analyse.

    Returns:
        The JSON object parsed from the model's reply. The format
        instructions are derived from ``SentenceAnalysesEvaluatorOutput``,
        but the final parser is built without that model, so the reply is
        not validated against the schema here.
    """
    # Computed counts handed to the model as reference values.
    gt_sentence_count = ts.sentence_count(text)
    gt_word_count = ts.lexicon_count(text, removepunct=True)
    gt_char_count = ts.char_count(text, ignore_spaces=True)
    gt_syllable_count = ts.syllable_count(text)
    flesch_kincaid_grade = round(ts.flesch_kincaid_grade(text), 2)

    gt_counts_str = (
        f"num_sentences: {gt_sentence_count}\n"
        f"num_words: {gt_word_count}\n"
        f"num_char: {gt_char_count}\n"
        f"num_syllable: {gt_syllable_count}\n"
        f"flesch_kincaid_grade: {flesch_kincaid_grade}"
    )

    format_instructions = JsonOutputParser(
        pydantic_object=SentenceAnalysesEvaluatorOutput
    ).get_format_instructions()

    prompt_template = ChatPromptTemplate(
        messages=[
            SystemMessage(content=prompts.SYSTEM_PROMPT_ANALYSIS),
            HumanMessagePromptTemplate.from_template(prompts.USER_PROMPT_ANALYSIS),
        ],
        input_variables=["text", "ground_truth_counts"],
        partial_variables={"format_instructions": format_instructions},
    )

    chain = prompt_template | model | JsonOutputParser()
    result = await chain.ainvoke(
        {
            "text": text,
            "ground_truth_counts": gt_counts_str,
        }
    )
    return result
# Engineered sentence-structure features, in the order they are serialised
# by row_to_features_json (which reindexes each row by this list).
# Every name except "flesch_kincaid_grade" and "max_clauses_in_any_sentence"
# is a column created by add_engineered_features; those two come straight
# from the sentence-analysis output.
FEATURE_COLS = [
# Foundational & Distributional
"avg_words_per_sentence",
"sentence_length_variation",
"percent_short_sentences",
"percent_medium_sentences",
"percent_long_sentences",
"percent_very_long_sentences",
"flesch_kincaid_grade",
# Sentence Structure (Grammatical Type)
"percent_simple_sentences",
"percent_compound_sentences",
"percent_complex_sentences",
"percent_compound_complex_sentences",
"percent_other_sentences",
# Word Distribution
"percent_words_in_simple_sentences",
"percent_words_in_complex_sentences",
"percent_words_in_compound_sentences",
"percent_words_in_compound_complex_sentences",
"percent_words_in_other_sentences",
# Clausal & Subordination
"avg_subordinates_per_sentence",
"avg_clauses_per_sentence",
"percent_sentences_with_subordinate",
"percent_sentences_with_multiple_subordinates",
"percent_sentences_with_embedded_clauses",
# Phrase Density
"prep_phrase_density",
"participle_phrase_density",
"appositive_phrase_density",
# Cohesion & Transitions
"avg_transitions_per_sentence",
"percent_sophisticated_transitions",
# Conceptual & Other
"percent_sentences_w_one_concept",
"percent_sentences_w_multi_concept",
"percent_cleft_sentences",
"max_clauses_in_any_sentence",
]
def safe_literal_eval(s):
    """Parse a Python-literal string, falling back to an empty list.

    Args:
        s: Usually the string form of a list (e.g. ``"[12, 7, 20]"``).
            A value that is already a list is returned unchanged.

    Returns:
        The parsed literal, the input list itself, or ``[]`` when the
        value cannot be parsed.
    """
    # A column can mix strings with already-parsed lists; ast.literal_eval
    # rejects non-string input, which previously turned real data into [].
    if isinstance(s, list):
        return s
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # An empty list is a safe default for the downstream calculations.
        return []
def safe_division(numerator, denominator):
    """Divide two Series element-wise, yielding 0 instead of NaN.

    Args:
        numerator: Series of dividends (a scalar also works with ``/``).
        denominator: Series of divisors; must provide ``.replace``.

    Returns:
        A Series of quotients in which every NaN has been replaced by 0.
        That covers zero divisors and also any NaN already present in
        either operand.
    """
    # Zero divisors become NaN so the division never yields +/-inf.
    denominator_safe = denominator.replace(0, np.nan)
    return (numerator / denominator_safe).fillna(0)
def categorize_sentence_lengths(word_counts):
    """Return the percentage of sentences in each length bucket for one text.

    Buckets by word count: short (<= 10), medium (11-20), long (21-30) and
    very long (> 30).

    Args:
        word_counts: List with one word count per sentence.

    Returns:
        A Series with the four ``percent_*_sentences`` labels. All four
        values are 0 when ``word_counts`` is not a list or is empty.
    """
    labels = [
        "percent_short_sentences",
        "percent_medium_sentences",
        "percent_long_sentences",
        "percent_very_long_sentences",
    ]

    if not isinstance(word_counts, list) or not word_counts:
        return pd.Series([0, 0, 0, 0], index=labels)

    short_count = medium_count = long_count = very_long_count = 0
    for count in word_counts:
        if count <= 10:
            short_count += 1
        elif count <= 20:
            medium_count += 1
        elif count <= 30:
            long_count += 1
        else:
            very_long_count += 1

    total_sentences = len(word_counts)
    bucket_counts = (short_count, medium_count, long_count, very_long_count)
    return pd.Series(
        [(bucket / total_sentences) * 100 for bucket in bucket_counts],
        index=labels,
    )
def add_engineered_features(df):
    """Derive ratio, percentage and density features from raw analysis counts.

    Works on a copy of ``df``. Columns whose names start with ``num_`` or
    ``words_in_`` are coerced to numeric (unparseable values become NaN),
    string-encoded ``sentence_word_counts`` are parsed into lists, and the
    engineered columns are appended in a fixed order.

    Args:
        df: DataFrame holding the raw count columns read below
            (``num_sentences``, ``num_words``, ``sentence_word_counts``,
            the ``num_*`` / ``words_in_*`` counts, ...).

    Returns:
        A new DataFrame with the original columns plus the engineered ones.

    Raises:
        KeyError: If a required source column is missing.
    """
    df_normalized = df.copy()

    # Coerce the raw count columns so arithmetic below never sees strings.
    for col in df_normalized.columns:
        if col.startswith(("num_", "words_in_")):
            df_normalized[col] = pd.to_numeric(df_normalized[col], errors="coerce")

    # Lists arrive as strings when the frame was round-tripped through text;
    # the first non-null entry decides whether parsing is needed.
    if "sentence_word_counts" in df_normalized.columns:
        non_null_counts = df_normalized["sentence_word_counts"].dropna()
        if not non_null_counts.empty and isinstance(non_null_counts.iloc[0], str):
            df_normalized["sentence_word_counts"] = df_normalized[
                "sentence_word_counts"
            ].apply(safe_literal_eval)

    # Foundational metrics
    df_normalized["avg_words_per_sentence"] = safe_division(
        df_normalized["num_words"], df_normalized["num_sentences"]
    )
    df_normalized["sentence_length_variation"] = df_normalized[
        "sentence_word_counts"
    ].apply(lambda x: np.std(x) if isinstance(x, list) and len(x) > 1 else 0)
    length_dist_df = df_normalized["sentence_word_counts"].apply(
        categorize_sentence_lengths
    )
    df_normalized = df_normalized.join(length_dist_df)

    # (new column, numerator column, denominator column, multiplier).
    # A multiplier of 100 gives a percentage or a per-100-words density;
    # 1 leaves a plain per-sentence average. Order fixes the column order.
    count_ratio_features = (
        # Sentence structure percentages
        ("percent_simple_sentences", "num_simple_sentences", "num_sentences", 100),
        ("percent_compound_sentences", "num_compound_sentences", "num_sentences", 100),
        ("percent_complex_sentences", "num_complex_sentences", "num_sentences", 100),
        (
            "percent_compound_complex_sentences",
            "num_compound_complex_sentences",
            "num_sentences",
            100,
        ),
        ("percent_other_sentences", "num_other_sentences", "num_sentences", 100),
        # Word distribution percentages (share of total words)
        (
            "percent_words_in_simple_sentences",
            "words_in_simple_sentences",
            "num_words",
            100,
        ),
        (
            "percent_words_in_compound_sentences",
            "words_in_compound_sentences",
            "num_words",
            100,
        ),
        (
            "percent_words_in_complex_sentences",
            "words_in_complex_sentences",
            "num_words",
            100,
        ),
        (
            "percent_words_in_compound_complex_sentences",
            "words_in_compound_complex_sentences",
            "num_words",
            100,
        ),
        (
            "percent_words_in_other_sentences",
            "words_in_other_sentences",
            "num_words",
            100,
        ),
        # Subordination and clausal complexity
        ("avg_subordinates_per_sentence", "num_subordinate_clauses", "num_sentences", 1),
        ("avg_clauses_per_sentence", "num_total_clauses", "num_sentences", 1),
        (
            "percent_sentences_with_subordinate",
            "num_sentences_with_subordinate",
            "num_sentences",
            100,
        ),
        (
            "percent_sentences_with_multiple_subordinates",
            "num_sentences_with_multiple_subordinates",
            "num_sentences",
            100,
        ),
        (
            "percent_sentences_with_embedded_clauses",
            "num_sentences_with_embedded_clauses",
            "num_sentences",
            100,
        ),
        # Phrase density (per 100 words)
        ("prep_phrase_density", "num_prepositional_phrases", "num_words", 100),
        ("participle_phrase_density", "num_participle_phrases", "num_words", 100),
        ("appositive_phrase_density", "num_appositive_phrases", "num_words", 100),
    )
    for feature, numerator, denominator, multiplier in count_ratio_features:
        df_normalized[feature] = (
            safe_division(df_normalized[numerator], df_normalized[denominator])
            * multiplier
        )

    # Cohesion and transitions: the denominator here is a derived total,
    # so these two features are computed outside the tables.
    total_transitions = df_normalized["num_simple_transitions"].add(
        df_normalized["num_sophisticated_transitions"], fill_value=0
    )
    df_normalized["avg_transitions_per_sentence"] = safe_division(
        total_transitions, df_normalized["num_sentences"]
    )
    df_normalized["percent_sophisticated_transitions"] = (
        safe_division(df_normalized["num_sophisticated_transitions"], total_transitions)
        * 100
    )

    # Conceptual & other (all percentages of the sentence count)
    concept_features = (
        ("percent_sentences_w_one_concept", "num_one_concept_sentences"),
        ("percent_sentences_w_multi_concept", "num_multi_concept_sentences"),
        ("percent_cleft_sentences", "num_cleft_sentences"),
    )
    for feature, numerator in concept_features:
        df_normalized[feature] = (
            safe_division(df_normalized[numerator], df_normalized["num_sentences"])
            * 100
        )

    return df_normalized
def normalize_label(s: Any) -> str | None:
if s is None:
return None
m = {
"slightly complex": "Slightly Complex",
"moderately complex": "Moderately Complex",
"very complex": "Very Complex",
"exceedingly complex": "Exceedingly Complex",
"extremely complex": "Exceedingly Complex",
}
return m.get(str(s).strip().lower())
def row_to_features_json(
    row: pd.Series, decimals: int = 1, cast_to_int: bool = True
) -> str:
    """Serialise one row's engineered features as an indented JSON string.

    The row is reindexed by ``FEATURE_COLS``, so missing features appear
    as ``null`` and the key order follows that list.

    Args:
        row: A row holding (a superset of) the ``FEATURE_COLS`` values.
        decimals: Decimal places used when rounding each value.
        cast_to_int: When true, each rounded value is converted with
            ``int``; otherwise with ``float``.

    Returns:
        A JSON object mapping feature name to number or ``null``.
    """
    s = row.reindex(FEATURE_COLS)
    s_rounded = pd.to_numeric(s, errors="coerce").round(decimals)

    # NOTE(review): with the defaults, values are rounded to one decimal and
    # then truncated by int() (2.9 -> 2, 2.96 -> 3). Kept as is because the
    # downstream prompts may be calibrated against these values; confirm
    # whether rounding to the nearest integer was intended.
    convert = int if cast_to_int else float
    payload = {k: (None if pd.isna(v) else convert(v)) for k, v in s_rounded.items()}
    return json.dumps(payload, indent=2)
### Define text complexity evaluation function
async def classify_complexity_with_grade_level(sentence_features, grade, excerpt):
    """Ask the LLM for the sentence-structure complexity level of an excerpt.

    Args:
        sentence_features: Feature payload inserted into the prompt (the
            callers in this file pass the JSON string built by
            ``row_to_features_json``).
        grade: Grade level; used as the key into
            ``prompts.GRADE_SPECIFIC_RUBRICS`` and inserted into the prompt.
        excerpt: The text being classified.

    Returns:
        The JSON object parsed from the model's reply. The format
        instructions are derived from ``ComplexityClassificationOutput``;
        the reply is not validated against that model here.
    """
    # Grades without a dedicated rubric fall back to a generic instruction.
    # NOTE(review): the lookup depends on `grade` having the same type as the
    # rubric keys (int vs. str); verify against the prompts module.
    rubric = prompts.GRADE_SPECIFIC_RUBRICS.get(
        grade,
        "No specific rubric available for this grade. Use general linguistic principles.",
    )

    format_instructions = JsonOutputParser(
        pydantic_object=ComplexityClassificationOutput
    ).get_format_instructions()

    prompt_template = ChatPromptTemplate(
        messages=[
            SystemMessage(content=prompts.SYSTEM_PROMPT_COMPLEXITY),
            HumanMessagePromptTemplate.from_template(prompts.USER_PROMPT_COMPLEXITY),
        ],
        input_variables=["sentence_features", "grade", "rubric", "excerpt"],
        partial_variables={"format_instructions": format_instructions},
    )

    chain = prompt_template | model | JsonOutputParser()
    return await chain.ainvoke(
        {
            "sentence_features": sentence_features,
            "grade": grade,
            "rubric": rubric,
            "excerpt": excerpt,
        }
    )
### Functions to run the evaluation over a dataframe
async def analyze_df(input_df: pd.DataFrame, concurrency: int = 5) -> pd.DataFrame:
    """Run the sentence analysis for every row of ``input_df``.

    Args:
        input_df: DataFrame with a ``text`` column.
        concurrency: Maximum number of analyses in flight at once.

    Returns:
        ``input_df`` joined with one column per analysis field, where
        ``reasoning`` is renamed to ``grammatical_reasoning``, plus an
        ``error`` column. ``error`` is ``None`` on success; on failure it
        holds the exception text and every analysis field is ``None``.
    """
    sem = asyncio.Semaphore(concurrency)

    # Placeholder record so failed rows still carry every schema column.
    error_payload: dict = dict.fromkeys(SentenceAnalysesEvaluatorOutput.model_fields)

    async def _analyze_one_row(text_to_process: str):
        async with sem:
            try:
                result = await execute_sentence_analysis(text_to_process)
                result["error"] = None
                return result
            except Exception as e:
                # Deliberate best-effort: one bad row must not abort the
                # whole batch, so the failure is recorded on the row.
                return {**error_payload, "error": str(e)}

    tasks = [_analyze_one_row(t) for t in input_df["text"].tolist()]
    results = await asyncio.gather(*tasks)

    # gather() preserves task order, so results line up with the input index.
    llm_features_df = pd.DataFrame.from_records(results, index=input_df.index)
    llm_features_df = llm_features_df.rename(
        columns={"reasoning": "grammatical_reasoning"}
    )
    return input_df.join(llm_features_df)
async def evaluate_df(df: pd.DataFrame, concurrency: int = 5):
    """Classify the complexity level of every row of ``df``.

    Args:
        df: DataFrame with ``grade`` and ``text`` columns plus the
            engineered feature columns read by ``row_to_features_json``.
        concurrency: Maximum number of classifications in flight at once.

    Returns:
        A DataFrame indexed by the original row labels (index name
        ``"index"``) with columns ``answer``, ``reasoning`` and ``error``.
        On failure ``answer`` and ``reasoning`` are ``None`` and ``error``
        holds the exception text. An input with no rows yields an empty
        frame with the same columns.
    """
    sem = asyncio.Semaphore(concurrency)

    async def _classify_one_row(idx: Any, row: pd.Series):
        async with sem:
            try:
                feats = row_to_features_json(row)
                res = await classify_complexity_with_grade_level(
                    sentence_features=feats, grade=row["grade"], excerpt=row["text"]
                )
                return {
                    "index": idx,
                    "answer": normalize_label(res.get("answer")),
                    "reasoning": res.get("reasoning"),
                    "error": None,
                }
            except Exception as e:
                # Deliberate best-effort: record the failure on the row
                # instead of aborting the whole batch.
                return {
                    "index": idx,
                    "answer": None,
                    "reasoning": None,
                    "error": str(e),
                }

    tasks = [_classify_one_row(idx, row) for idx, row in df.iterrows()]
    results = await asyncio.gather(*tasks)

    # Naming the columns keeps set_index() valid when there are no rows;
    # without it an empty result list has no "index" column (KeyError).
    pred_df = pd.DataFrame(
        results, columns=["index", "answer", "reasoning", "error"]
    ).set_index("index")
    return pred_df
### Function to predict sentence structure complexity level for a given text and grade
async def predict_text_complexity_level(text: str, grade: int):
    """Predict the sentence-structure complexity level of one text.

    Wraps the single input in a one-row DataFrame and runs the pipeline
    defined in this file: sentence analysis, feature engineering, then
    complexity classification.

    Args:
        text: The passage to evaluate.
        grade: The grade level to evaluate the passage for.

    Returns:
        A dict with keys ``answer`` and ``reasoning``. Both are ``None``
        when the classification step failed, because ``evaluate_df``
        records failures instead of raising.
    """
    # Named `record` rather than `input` so the builtin is not shadowed.
    record = {"grade": grade, "text": text}

    # Convert to a dataframe for feature engineering and processing
    input_df = pd.DataFrame.from_records([record])

    # Run analysis, engineered features, and evaluation over the dataframe
    processed_df = await analyze_df(input_df)
    final_features_df = add_engineered_features(processed_df)
    predictions_df = await evaluate_df(final_features_df)

    # Return the results, in this case for the single row of input
    return predictions_df.iloc[0][["answer", "reasoning"]].to_dict()
# Example usage: set the text and the grade level you want to evaluate for
# sentence structure complexity.
# CLEAR ID=2181
text = """
Lasers have found many uses in everyday life as well as in industry. Lasers are found in CD and DVD players, where they read the code from the disk that stores a song or movie. A laser is often used to read the bar codes or SQR codes on things sold in a store, to identify a product and give its price. Lasers are used in medicine, particularly in LASIK eye surgery, where the laser is used to repair the shape of the cornea. It is used in chemistry with spectroscopy to identify materials, to find out what kind of gases, solids or liquids something is made of. Stronger lasers can be used to cut metal.
Lasers are used to measure the distance of the Moon from Earth by reflecting off reflectors left by the Apollo missions. By measuring the time it takes for the light to travel to the Moon and back again we can find out exactly how far away the moon is.
"""
grade_level = 3
# NOTE(review): top-level `await` and `display` are not defined anywhere in
# this file; presumably this cell runs in a Jupyter/IPython session. In a
# plain script, call asyncio.run(predict_text_complexity_level(...)) and
# print the result instead.
output = await predict_text_complexity_level(text, grade_level)
display(output)