import os import json import stanza import re nlp = stanza.Pipeline( lang="en", processors="tokenize,pos,lemma", tokenize_pretokenized=False, ) _EXPECTED_SUFFIXES = { "Plural": {"s", "es"}, "Possessive": {"'s", "s"}, "Comparative": {"er"}, "Superlative": {"est"}, "3rd Person Singular": {"s", "es"}, "Past Tense": {"ed"}, "Past Participle": {"ed", "en", "n"}, "Progressive": {"ing"}, "Gerund": {"ing"}, } _CONTRACTION_PARTICLES = { "'ll": "will", # we'll, he'll "'d": "would/had", # I'd, she'd "'ve": "have", # we've, they've "'re": "are", # you're, they're "'m": "am", # I'm "n't": "not", # isn't, didn't "'s": "is/has", # what's, she's } _S_TOKENS = {"'s", "’s"} def is_possessive_candidate(tok): return tok.text in _S_TOKENS and tok.upos == "PART" def lcp(a: str, b: str) -> str: i = 0 while i < min(len(a), len(b)) and a[i].lower() == b[i].lower(): i += 1 return a[:i] def strip_doubling(lemma: str, suf: str) -> str: if suf and len(suf) >= 2 and suf[0] == lemma[-1]: cand = suf[1:] if any(cand in v for v in _EXPECTED_SUFFIXES.values()): return cand return suf def get_suffix(lemma: str, surface: str) -> str: return strip_doubling(lemma, surface[len(lcp(lemma, surface)):]) def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None: if raw_suf in expected_set: return raw_suf if lemma.lower().endswith("y") and raw_suf.startswith("i"): alt = raw_suf[1:] if alt in expected_set: return alt return None def preprocess_text(text: str) -> tuple[str, list[int]]: original_words = text.split() position_map = [] # position_map[original_index] = cleaned_index cleaned_words = [] for i, word in enumerate(original_words): if re.match(r'\[.*\]', word): position_map.append(-1) else: position_map.append(len(cleaned_words)) cleaned_words.append(word) cleaned_text = ' '.join(cleaned_words) return cleaned_text, position_map def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int: for original_index, cleaned_pos in enumerate(position_map): if cleaned_pos == cleaned_index: return original_index return cleaned_index def extract_inflectional_morphemes(text: str): cleaned_text, position_map = preprocess_text(text) doc = nlp(cleaned_text) results = [] for sent in doc.sentences: words = sent.words i = 0 while i < len(words): w = words[i] surf, lem, pos = w.text, w.lemma, w.upos feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)} low_txt = surf.lower() if is_possessive_candidate(w) and i > 0: prev = words[i - 1] results.append({ "word": prev.text + surf, "lemma": prev.lemma, "index": calculate_adjusted_index(i - 1, position_map), "inflectional_morpheme": "Possessive" if prev.upos in {"NOUN", "PROPN"} else "Contraction", "morpheme_form": "'/s", }) i += 1 continue if low_txt in _CONTRACTION_PARTICLES and i > 0: prev = words[i - 1] results.append({ "word": prev.text + surf, "lemma": prev.lemma, "index": calculate_adjusted_index(i - 1, position_map), "inflectional_morpheme": "Contraction", "morpheme_form": low_txt, }) i += 1 continue if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}: low_lem, low_surf = lem.lower(), surf.lower() suf = get_suffix(low_lem, low_surf) morpheme_form = "/s" if suf in {"s", "es"} and low_lem + suf == low_surf else "" results.append({ "word": surf, "lemma": lem, "index": calculate_adjusted_index(i, position_map), "inflectional_morpheme": "Possessive", "morpheme_form": morpheme_form, }) i += 1 continue inflect_type = None if pos == "NOUN" and feats.get("Number") == "Plur": inflect_type = "Plural" elif pos == "ADJ" and feats.get("Degree") == "Cmp": inflect_type = "Comparative" elif pos == "ADJ" and feats.get("Degree") == "Sup": inflect_type = "Superlative" elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Pres" and feats.get("Person") == "3": inflect_type = "3rd Person Singular" elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Past": inflect_type = "Past Tense" elif pos == "VERB" and feats.get("VerbForm") == "Part": if feats.get("Tense") == "Past" or w.xpos == "VBN": inflect_type = "Past Participle" elif feats.get("Tense") == "Pres" or w.xpos == "VBG": inflect_type = "Progressive" if inflect_type: if surf.lower() == lem.lower() and inflect_type not in {"Possessive", "Comparative", "Superlative"}: i += 1 continue raw_suffix = get_suffix(lem, low_txt) canon = normalize_suffix(lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type]) morpheme_form = f"/{canon}" if canon else "" results.append({ "word": surf, "lemma": lem, "index": calculate_adjusted_index(i, position_map), "inflectional_morpheme": inflect_type, "morpheme_form": morpheme_form, }) i += 1 return results def extract_morpheme_omissions(text: str): cleaned_text, position_map = preprocess_text(text) doc = nlp(cleaned_text) omissions = [] for sent in doc.sentences: words = sent.words i = 0 while i < len(words): w = words[i] surf, lem, pos = w.text, w.lemma, w.upos feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)} inflect_type = None if pos == "NOUN" and feats.get("Number") == "Plur": inflect_type = "Plural" elif pos == "ADJ" and feats.get("Degree") == "Cmp": inflect_type = "Comparative" elif pos == "ADJ" and feats.get("Degree") == "Sup": inflect_type = "Superlative" elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Pres" and feats.get("Person") == "3": inflect_type = "3rd Person Singular" elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Past": inflect_type = "Past Tense" elif pos == "VERB" and feats.get("VerbForm") == "Part": if feats.get("Tense") == "Past" or w.xpos == "VBN": inflect_type = "Past Participle" elif feats.get("Tense") == "Pres" or w.xpos == "VBG": inflect_type = "Progressive" if inflect_type and surf.lower() == lem.lower() and inflect_type not in {"Possessive", "Comparative", "Superlative"}: omissions.append({ "word": surf, "lemma": lem, "index": calculate_adjusted_index(i, position_map), "inflectional_morpheme": inflect_type, "morpheme_form": "", }) i += 1 return omissions def annotate_morpheme_omission(session_id, base_dir="session_data"): base_dir = base_dir or os.getcwd() json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json") if not os.path.exists(json_file): raise FileNotFoundError(f"{json_file} not found, make sure transcription step ran first.") with open(json_file, "r", encoding="utf-8") as f: data = json.load(f) segments = data.get("segments", data) if isinstance(data, dict) else data for seg in segments: text = seg.get("text", "") seg["morpheme_omissions"] = extract_morpheme_omissions(text) with open(json_file, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=2) if __name__ == "__main__": sample = "And he [UM] [UM] the rabbit [UM] [UH] [UH] make [UH] sand castle." print("Inflectional Morphemes:") print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False)) print("\nMorpheme Omissions:") print(json.dumps(extract_morpheme_omissions(sample), indent=2, ensure_ascii=False))