"""SATE / morpheme.py — inflectional morpheme annotation.

Author: Shuwei Hou (branch: fix_morpheme_index, commit 9e45db3)
"""
import os
import json
import stanza
import re
# Module-level Stanza pipeline, built once at import time (downloads the
# English model on first use, which can be slow).
# tokenize_pretokenized=False means Stanza tokenizes the raw string itself,
# so its token positions need not align with whitespace-split word positions
# (see calculate_adjusted_index for the approximate mapping back).
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,pos,lemma",
    tokenize_pretokenized=False,
)
# Canonical surface suffixes accepted for each inflection type.  A suffix
# outside the expected set is reported as irregular ("<IRR>") by
# extract_inflectional_morphemes.
_EXPECTED_SUFFIXES = {
    "Plural": {"s", "es"},
    "Possessive": {"'s", "s"},
    "Comparative": {"er"},
    "Superlative": {"est"},
    "3rd Person Singular": {"s", "es"},
    "Past Tense": {"ed"},
    "Past Participle": {"ed", "en", "n"},
    "Progressive": {"ing"},
    "Gerund": {"ing"},
}
# Clitic particles treated as contractions, mapped to a gloss of the full
# form they stand for.  The gloss is informational only — the emitted
# morpheme_form is the particle text itself.
_CONTRACTION_PARTICLES = {
    "'ll": "will",   # we'll, he'll
    "'d": "would/had",  # I'd, she'd
    "'ve": "have",   # we've, they've
    "'re": "are",    # you're, they're
    "'m": "am",      # I'm
    "n't": "not",    # isn't, didn't
    "'s": "is/has",  # what's, she's
}
# "'s" clitic token in both straight- and curly-apostrophe spellings.
_S_TOKENS = {"'s", "’s"}
def is_possessive_candidate(tok):
    """True when *tok* is a standalone "'s" clitic tagged as a particle (PART)."""
    if tok.upos != "PART":
        return False
    return tok.text in _S_TOKENS
def lcp(a: str, b: str) -> str:
    """Case-insensitive longest common prefix of *a* and *b*.

    The returned string is a prefix slice of *a*, so it keeps *a*'s casing.
    """
    length = 0
    for ch_a, ch_b in zip(a, b):
        if ch_a.lower() != ch_b.lower():
            break
        length += 1
    return a[:length]
def strip_doubling(lemma: str, suf: str) -> str:
    """Undo consonant doubling at the lemma/suffix boundary (run+ning -> ing).

    If *suf* begins with a repeat of the lemma's final letter and the
    remainder is one of the known inflectional suffixes, the doubled letter
    is dropped; otherwise *suf* is returned unchanged.

    Fix: guard against an empty *lemma* — the original indexed lemma[-1]
    unconditionally and raised IndexError.  Also dropped the redundant
    truthiness check on *suf* (len(suf) >= 2 already implies it).
    """
    if lemma and len(suf) >= 2 and suf[0] == lemma[-1]:
        candidate = suf[1:]
        # Only strip when the de-doubled remainder is a recognized suffix,
        # so e.g. "ss" in "glass" is not mangled.
        if any(candidate in forms for forms in _EXPECTED_SUFFIXES.values()):
            return candidate
    return suf
def get_suffix(lemma: str, surface: str) -> str:
    """Suffix left on *surface* after removing its common prefix with *lemma*.

    Consonant doubling at the boundary is undone via strip_doubling, so
    get_suffix("run", "running") yields "ing".
    """
    shared = lcp(lemma, surface)
    remainder = surface[len(shared):]
    return strip_doubling(lemma, remainder)
def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
if raw_suf in expected_set:
return raw_suf
if lemma.lower().endswith("y") and raw_suf.startswith("i"):
alt = raw_suf[1:]
if alt in expected_set:
return alt
return None
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Remove bracketed filler words (e.g. "[UH]") and keep a position map.

    Returns (cleaned_text, position_map) where position_map[original_index]
    is the word's index within the cleaned text, or -1 for removed fillers.
    """
    kept_words: list[str] = []
    position_map: list[int] = []
    for word in text.split():
        if re.match(r'\[.*\]', word):
            # Filler token: recorded as removed, not copied to output.
            position_map.append(-1)
        else:
            position_map.append(len(kept_words))
            kept_words.append(word)
    return ' '.join(kept_words), position_map
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Map a cleaned-text word index back to its original-text position.

    Inverse lookup over preprocess_text's position_map; falls back to
    *cleaned_index* itself when no original position maps to it.
    """
    return next(
        (orig_idx for orig_idx, cleaned_pos in enumerate(position_map)
         if cleaned_pos == cleaned_index),
        cleaned_index,
    )
def extract_inflectional_morphemes(text: str):
    """Extract inflectional morphemes (plural, possessive, tense, ...) from *text*.

    Bracketed fillers such as "[UH]" are stripped before parsing; each hit is
    a dict with keys word, lemma, index, inflectional_morpheme, morpheme_form,
    where index is mapped back to the original whitespace-split text.

    Fix: the original passed the *sentence-local* token index ``i`` to
    calculate_adjusted_index, which expects a whole-text position — every
    sentence after the first got wrong indices.  A running ``offset`` now
    accumulates token counts across sentences.

    NOTE(review): the mapping is still approximate when Stanza's tokenization
    (punctuation, clitics split off) does not align one-to-one with
    whitespace-split words — confirm against downstream consumers.
    """
    cleaned_text, position_map = preprocess_text(text)
    doc = nlp(cleaned_text)
    results = []
    offset = 0  # tokens consumed by previous sentences (the index fix)
    for sent in doc.sentences:
        words = sent.words
        i = 0
        while i < len(words):
            w = words[i]
            surf, pos = w.text, w.upos
            # Stanza may yield lemma=None for some tokens; fall back to the
            # surface form so .lower() below cannot raise.
            lem = w.lemma or w.text
            feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
            low_txt = surf.lower()
            # --- "'s" clitic: possessive after a noun, contraction otherwise ---
            if is_possessive_candidate(w) and i > 0:
                prev = words[i - 1]
                kind = "Possessive" if prev.upos in {"NOUN", "PROPN"} else "Contraction"
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": calculate_adjusted_index(offset + i - 1, position_map),
                    "inflectional_morpheme": kind,
                    "morpheme_form": "'/s",
                })
                i += 1
                continue
            # --- other contraction particles ('ll, n't, 've, ...) ---
            if low_txt in _CONTRACTION_PARTICLES and i > 0:
                prev = words[i - 1]
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": calculate_adjusted_index(offset + i - 1, position_map),
                    "inflectional_morpheme": "Contraction",
                    "morpheme_form": low_txt,
                })
                i += 1
                continue
            # --- possessive pronouns / determiners (feats Poss=Yes) ---
            if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                low_lem, low_surf = lem.lower(), surf.lower()
                suf = get_suffix(low_lem, low_surf)
                # Only a clean lemma+s/es concatenation counts as regular.
                morpheme_form = "/s" if suf in {"s", "es"} and low_lem + suf == low_surf else "<IRR>"
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": calculate_adjusted_index(offset + i, position_map),
                    "inflectional_morpheme": "Possessive",
                    "morpheme_form": morpheme_form,
                })
                i += 1
                continue
            # --- regular inflection from UPOS + morphological features ---
            inflect_type = None
            if pos == "NOUN" and feats.get("Number") == "Plur":
                inflect_type = "Plural"
            elif pos == "ADJ" and feats.get("Degree") == "Cmp":
                inflect_type = "Comparative"
            elif pos == "ADJ" and feats.get("Degree") == "Sup":
                inflect_type = "Superlative"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Pres" and feats.get("Person") == "3":
                inflect_type = "3rd Person Singular"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Past":
                inflect_type = "Past Tense"
            elif pos == "VERB" and feats.get("VerbForm") == "Part":
                # xpos (VBN/VBG) is the fallback when Tense is absent.
                if feats.get("Tense") == "Past" or w.xpos == "VBN":
                    inflect_type = "Past Participle"
                elif feats.get("Tense") == "Pres" or w.xpos == "VBG":
                    inflect_type = "Progressive"
            if inflect_type:
                # Skip zero-marked forms (e.g. irregular plurals like "sheep"
                # whose surface equals the lemma).
                if surf.lower() == lem.lower() and inflect_type not in {"Possessive", "Comparative", "Superlative"}:
                    i += 1
                    continue
                raw_suffix = get_suffix(lem, low_txt)
                canon = normalize_suffix(lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type])
                morpheme_form = f"/{canon}" if canon else "<IRR>"
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": calculate_adjusted_index(offset + i, position_map),
                    "inflectional_morpheme": inflect_type,
                    "morpheme_form": morpheme_form,
                })
            i += 1
        offset += len(words)
    return results
def annotate_morpheme(session_id, base_dir="session_data"):
    """Annotate each segment of a session's transcription JSON with morphemes.

    Reads {base_dir}/{session_id}/{session_id}_transcriptionCW.json, attaches
    a "morphemes" list to every segment, and rewrites the file in place.

    Raises FileNotFoundError when the transcription file is missing.
    """
    root = base_dir or os.getcwd()
    json_file = os.path.join(root, f"{session_id}/{session_id}_transcriptionCW.json")
    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found – make sure transcription step ran first.")
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # A dict payload carries segments under "segments"; a bare list IS the segments.
    if isinstance(data, dict):
        segments = data.get("segments", data)
    else:
        segments = data
    for seg in segments:
        seg["morphemes"] = extract_inflectional_morphemes(seg.get("text", ""))
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
if __name__ == "__main__":
    # Smoke test: bracketed fillers should be stripped before parsing.
    sample = "And he [UH] [UM] the rabbit [UH] makes [UH] sand castle."
    print(extract_inflectional_morphemes(sample))