|
import os |
|
import json |
|
import stanza |
|
import re |
|
|
|
# Module-level Stanza pipeline, loaded once at import time (model download /
# load is expensive, so it is shared by all calls in this module).
# tokenize_pretokenized=False: input is raw text, let Stanza tokenize it.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,pos,lemma",
    tokenize_pretokenized=False,
)
|
|
|
# Canonical English inflectional suffixes, keyed by the morpheme label this
# module emits. Used by normalize_suffix()/strip_doubling() to decide whether
# a surface/lemma difference is a regular inflection or irregular ("<IRR>").
_EXPECTED_SUFFIXES = {
    "Plural": {"s", "es"},
    "Possessive": {"'s", "s"},
    "Comparative": {"er"},
    "Superlative": {"est"},
    "3rd Person Singular": {"s", "es"},
    "Past Tense": {"ed"},
    "Past Participle": {"ed", "en", "n"},
    "Progressive": {"ing"},
    "Gerund": {"ing"},
}
|
|
|
# Clitic particles that Stanza splits off as separate tokens, mapped to the
# full word(s) they contract (the values are informational; only the keys are
# used for membership tests in extract_inflectional_morphemes).
_CONTRACTION_PARTICLES = {
    "'ll": "will",
    "'d": "would/had",
    "'ve": "have",
    "'re": "are",
    "'m": "am",
    "n't": "not",
    "'s": "is/has",
}

# Surface forms of the "'s" clitic: straight and curly apostrophe variants.
_S_TOKENS = {"'s", "’s"}
|
|
|
|
|
def is_possessive_candidate(tok):
    """Return True when *tok* is a clitic "'s" particle.

    A candidate is a token tagged PART whose surface form is a straight or
    curly apostrophe-s ("'s" / "’s"); whether it is possessive or a
    contraction is decided later from the preceding token's POS.
    """
    if tok.upos != "PART":
        return False
    return tok.text in _S_TOKENS
|
|
|
|
|
def lcp(a: str, b: str) -> str:
    """Case-insensitive longest common prefix of *a* and *b*.

    The returned prefix keeps *a*'s original casing.
    """
    n = 0
    for ch_a, ch_b in zip(a, b):
        if ch_a.lower() != ch_b.lower():
            break
        n += 1
    return a[:n]
|
|
|
|
|
def strip_doubling(lemma: str, suf: str) -> str:
    """Undo consonant doubling at the start of a raw inflectional suffix.

    E.g. lemma "run" + surface "running" yields raw suffix "ning"; the
    leading "n" repeats the lemma's final letter, so it is dropped when the
    remainder ("ing") is a known inflectional suffix.

    Returns *suf* unchanged when no doubling is detected.
    """
    # Guard `lemma` being non-empty: the original indexed lemma[-1]
    # unconditionally and would raise IndexError on an empty lemma.
    # (The redundant `suf and` truthiness test was dropped: len(suf) >= 2
    # already implies a non-empty string.)
    if len(suf) >= 2 and lemma and suf[0] == lemma[-1]:
        cand = suf[1:]
        # Only strip when the remainder is a recognized suffix; otherwise
        # keep the raw suffix so irregular forms surface as "<IRR>".
        if any(cand in v for v in _EXPECTED_SUFFIXES.values()):
            return cand
    return suf
|
|
|
|
|
def get_suffix(lemma: str, surface: str) -> str:
    """Return the inflectional suffix of *surface* relative to *lemma*.

    Removes the case-insensitive common prefix, then strips consonant
    doubling (e.g. lemma "run", surface "running" -> "ing").
    """
    shared = lcp(lemma, surface)
    raw_suffix = surface[len(shared):]
    return strip_doubling(lemma, raw_suffix)
|
|
|
|
|
def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None: |
|
if raw_suf in expected_set: |
|
return raw_suf |
|
if lemma.lower().endswith("y") and raw_suf.startswith("i"): |
|
alt = raw_suf[1:] |
|
if alt in expected_set: |
|
return alt |
|
return None |
|
|
|
|
|
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Strip bracketed annotation tokens (e.g. "[UH]", "[UM]") from *text*.

    Returns (cleaned_text, position_map) where position_map has one entry
    per original whitespace-separated word: its index in the cleaned word
    sequence, or -1 when the word was a dropped bracketed token.
    """
    bracket_pat = re.compile(r'\[.*\]')
    kept: list[str] = []
    mapping: list[int] = []

    for word in text.split():
        if bracket_pat.match(word):
            # Bracketed filler: drop it, but record the gap.
            mapping.append(-1)
        else:
            mapping.append(len(kept))
            kept.append(word)

    return ' '.join(kept), mapping
|
|
|
|
|
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Map an index in the cleaned word sequence back to the original text.

    *position_map* comes from preprocess_text(). When the index is not
    present in the map, the cleaned index itself is returned as a fallback.
    """
    try:
        # First occurrence wins, matching a left-to-right scan of the map.
        return position_map.index(cleaned_index)
    except ValueError:
        return cleaned_index
|
|
|
|
|
def extract_inflectional_morphemes(text: str) -> list[dict]:
    """Tag *text* with Stanza and collect inflectional morphemes.

    Bracketed fillers (e.g. "[UH]") are stripped before tagging; each
    result's "index" is mapped back toward the original text via the
    position map from preprocess_text().

    Returns a list of dicts with keys: "word", "lemma", "index",
    "inflectional_morpheme", "morpheme_form" ("/<suffix>" for regular
    forms, "<IRR>" for irregular ones).

    NOTE(review): the index passed to calculate_adjusted_index is the
    word's position *within its sentence* as tokenized by Stanza, while
    position_map is built over whitespace-split words of the *whole* text —
    these can diverge for multi-sentence input or when Stanza splits
    punctuation into separate tokens; confirm against callers.
    """
    cleaned_text, position_map = preprocess_text(text)

    doc = nlp(cleaned_text)
    results = []

    for sent in doc.sentences:
        words = sent.words
        i = 0
        while i < len(words):
            w = words[i]
            surf, lem, pos = w.text, w.lemma, w.upos
            # Parse the UD feature string ("Number=Plur|Tense=Past|...")
            # into a dict; w.feats may be None.
            feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
            low_txt = surf.lower()

            # Case 1: clitic "'s" split off by the tokenizer. After a noun it
            # is treated as possessive, otherwise as a contraction ("he's").
            # Both branches attach the particle to the previous token.
            if is_possessive_candidate(w) and i > 0:
                prev = words[i - 1]

                if prev.upos in {"NOUN", "PROPN"}:
                    results.append({
                        "word": prev.text + surf,
                        "lemma": prev.lemma,
                        "index": calculate_adjusted_index(i - 1, position_map),
                        "inflectional_morpheme": "Possessive",
                        "morpheme_form": "'/s",
                    })
                else:
                    results.append({
                        "word": prev.text + surf,
                        "lemma": prev.lemma,
                        "index": calculate_adjusted_index(i - 1, position_map),
                        "inflectional_morpheme": "Contraction",
                        "morpheme_form": "'/s",
                    })
                i += 1
                continue

            # Case 2: other contraction particles ("n't", "'ll", "'ve", ...),
            # likewise reported against the previous token.
            if low_txt in _CONTRACTION_PARTICLES and i > 0:
                prev = words[i - 1]
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": calculate_adjusted_index(i - 1, position_map),
                    "inflectional_morpheme": "Contraction",
                    "morpheme_form": low_txt,
                })
                i += 1
                continue

            # Case 3: possessive pronouns/determiners (Poss=Yes feature).
            # Regular "s"/"es" suffixes are reported as "/s"; anything else
            # (his, her, my, ...) is irregular.
            if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                low_lem, low_surf = lem.lower(), surf.lower()
                suf = get_suffix(low_lem, low_surf)
                morpheme_form = "/s" if suf in {"s", "es"} and low_lem + suf == low_surf else "<IRR>"
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": calculate_adjusted_index(i, position_map),
                    "inflectional_morpheme": "Possessive",
                    "morpheme_form": morpheme_form,
                })
                i += 1
                continue

            # Case 4: regular inflections classified from UD features
            # (plural nouns, comparative/superlative adjectives, finite and
            # participial verb forms). xpos (Penn tags VBN/VBG) is a fallback
            # when the Tense feature is absent on participles.
            inflect_type = None
            if pos == "NOUN" and feats.get("Number") == "Plur":
                inflect_type = "Plural"
            elif pos == "ADJ" and feats.get("Degree") == "Cmp":
                inflect_type = "Comparative"
            elif pos == "ADJ" and feats.get("Degree") == "Sup":
                inflect_type = "Superlative"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Pres" and feats.get("Person") == "3":
                inflect_type = "3rd Person Singular"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Past":
                inflect_type = "Past Tense"
            elif pos == "VERB" and feats.get("VerbForm") == "Part":
                if feats.get("Tense") == "Past" or w.xpos == "VBN":
                    inflect_type = "Past Participle"
                elif feats.get("Tense") == "Pres" or w.xpos == "VBG":
                    inflect_type = "Progressive"

            if inflect_type:
                # Skip zero-marked forms (surface == lemma, e.g. "sheep"
                # tagged plural): there is no overt suffix to report.
                if surf.lower() == lem.lower() and inflect_type not in {"Possessive", "Comparative", "Superlative"}:
                    i += 1
                    continue

                raw_suffix = get_suffix(lem, low_txt)
                canon = normalize_suffix(lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type])
                morpheme_form = f"/{canon}" if canon else "<IRR>"
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": calculate_adjusted_index(i, position_map),
                    "inflectional_morpheme": inflect_type,
                    "morpheme_form": morpheme_form,
                })

            i += 1

    return results
|
|
|
|
|
def annotate_morpheme(session_id, base_dir="session_data"):
    """Annotate a session's transcription JSON with inflectional morphemes.

    Reads <base_dir>/<session_id>/<session_id>_transcriptionCW.json, adds a
    "morphemes" list to every segment, and rewrites the file in place.

    Raises FileNotFoundError when the transcription file is missing.
    """
    # Fall back to the current directory when base_dir is explicitly falsy.
    root = base_dir or os.getcwd()
    json_file = os.path.join(root, f"{session_id}/{session_id}_transcriptionCW.json")

    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found – make sure transcription step ran first.")

    with open(json_file, "r", encoding="utf-8") as fh:
        data = json.load(fh)

    # Accept either {"segments": [...]} or a bare list of segments.
    if isinstance(data, dict):
        segments = data.get("segments", data)
    else:
        segments = data

    for seg in segments:
        seg["morphemes"] = extract_inflectional_morphemes(seg.get("text", ""))

    with open(json_file, "w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: bracketed fillers ([UH]/[UM]) should be stripped before
    # tagging, while surviving words are indexed against the original text.
    print(extract_inflectional_morphemes("And he [UH] [UM] the rabbit [UH] makes [UH] sand castle."))