Spaces:

Sven33
/

SATE

Sleeping

File size: 9,398 Bytes

import os
import json
import stanza
import re

# Shared Stanza pipeline for English, constructed once when this module is
# imported. Only the tokenize, POS and lemma processors are enabled, and the
# input is treated as raw text (tokenize_pretokenized=False), so Stanza does
# its own tokenization and sentence splitting.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,pos,lemma",
    tokenize_pretokenized=False,
)

# Regular suffix spellings accepted for each inflection category. The keys are
# the category labels assigned to `inflect_type` in
# extract_inflectional_morphemes, which indexes this table to decide whether a
# surface form is regular. strip_doubling() additionally scans the union of
# all value sets, so every entry here (including categories that are never
# looked up by key in this file) influences doubled-consonant detection.
_EXPECTED_SUFFIXES = {
    "Plural":              {"s", "es"},
    "Possessive":          {"'s", "s"},
    "Comparative":         {"er"},
    "Superlative":         {"est"},
    "3rd Person Singular": {"s", "es"},
    "Past Tense":          {"ed"},
    "Past Participle":     {"ed", "en", "n"},
    "Progressive":         {"ing"},
    "Gerund":              {"ing"},
}

# Clitic tokens treated as contractions, keyed by their lowercase surface form
# with a straight (ASCII) apostrophe. Only key membership is tested in this
# file; the values are human-readable expansions kept for reference.
_CONTRACTION_PARTICLES = {
    "'ll": "will",          # we'll, he'll
    "'d":  "would/had",     # I'd, she'd
    "'ve": "have",          # we've, they've
    "'re": "are",           # you're, they're
    "'m":  "am",            # I'm
    "n't": "not",           # isn't, didn't
    "'s":  "is/has",        # what's, she's
}

# Surface forms of the "'s" clitic, written with either a straight or a curly
# (U+2019) apostrophe; used by is_possessive_candidate().
_S_TOKENS = {"'s", "’s"}


def is_possessive_candidate(tok):
    """Tell whether *tok* is an "'s" clitic that was tagged as a particle.

    *tok* must expose ``text`` and ``upos`` attributes. The text has to be one
    of the spellings in ``_S_TOKENS`` and the universal POS tag has to be
    ``"PART"``.
    """
    if tok.text not in _S_TOKENS:
        return False
    return tok.upos == "PART"


def lcp(a: str, b: str) -> str:
    """Return the longest prefix of *a* that also starts *b*, ignoring case.

    The result is sliced out of *a*, so it keeps *a*'s own casing.
    """
    shared = 0
    for left, right in zip(a, b):
        if left.lower() != right.lower():
            break
        shared += 1
    return a[:shared]


def strip_doubling(lemma: str, suf: str) -> str:
    """Undo consonant doubling at the stem/suffix boundary.

    When the raw suffix starts with the lemma's final letter (``run`` +
    ``ning``) and dropping that letter leaves a known regular suffix
    (``ing``), the shortened suffix is returned. Otherwise *suf* is returned
    unchanged.

    Args:
        lemma: Dictionary form of the word; may be empty.
        suf: Raw suffix left after removing the prefix shared with the lemma.

    Returns:
        The suffix with a duplicated boundary letter removed, or *suf* as is.
    """
    # An empty lemma has no final letter to compare against; a suffix shorter
    # than two characters cannot hold a doubled letter plus a real suffix.
    if not lemma or not suf or len(suf) < 2:
        return suf
    # Compare case-insensitively, in line with lcp(): callers may pass a
    # lemma whose casing differs from the (lowercased) surface form.
    if suf[0].lower() != lemma[-1].lower():
        return suf
    cand = suf[1:]
    if any(cand in v for v in _EXPECTED_SUFFIXES.values()):
        return cand
    return suf


def get_suffix(lemma: str, surface: str) -> str:
    """Return the suffix that turns *lemma* into *surface*.

    The suffix is whatever follows the case-insensitive common prefix of the
    two strings, adjusted for two regular spelling changes:

    * silent-e stems: ``bake`` + ``ed`` is spelled ``baked``, so the common
      prefix swallows the suffix's ``e`` and only ``d`` is left over. When the
      whole lemma matched, the lemma ends in ``e``, the leftover is not itself
      a known suffix and ``"e" + leftover`` is, the ``e`` is restored.
    * consonant doubling (``run`` -> ``running``), handled by
      ``strip_doubling``.

    Args:
        lemma: Dictionary form of the word.
        surface: Inflected form as it appears in the text.

    Returns:
        The (possibly adjusted) suffix; may be empty.
    """
    stem = lcp(lemma, surface)
    remainder = surface[len(stem):]

    # Requiring the full lemma to match keeps true irregulars such as
    # have/had or flee/fled from being mistaken for regular "-ed" forms.
    if remainder and lemma and len(stem) == len(lemma) and lemma[-1].lower() == "e":
        restored = "e" + remainder
        suffix_sets = list(_EXPECTED_SUFFIXES.values())
        already_known = any(remainder in s for s in suffix_sets)
        if not already_known and any(restored in s for s in suffix_sets):
            return restored

    return strip_doubling(lemma, remainder)


def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
    if raw_suf in expected_set:
        return raw_suf
    if lemma.lower().endswith("y") and raw_suf.startswith("i"):
        alt = raw_suf[1:]
        if alt in expected_set:
            return alt
    return None


def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Drop bracketed tags from *text* and record where each word went.

    The text is split on whitespace. Any word that starts with a ``[...]``
    group (for example ``[UM]``) is removed; the remaining words are re-joined
    with single spaces.

    Returns:
        ``(cleaned_text, position_map)`` where ``position_map[k]`` is the
        index of original word ``k`` within the cleaned word list, or ``-1``
        when that word was removed.
    """
    kept: list[str] = []
    mapping: list[int] = []

    for token in text.split():
        if re.match(r'\[.*\]', token):
            mapping.append(-1)
            continue
        mapping.append(len(kept))
        kept.append(token)

    return ' '.join(kept), mapping


def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Translate an index in the cleaned word list back to the original text.

    *position_map* is the list produced by ``preprocess_text``. The first
    original position whose mapped value equals *cleaned_index* is returned;
    when no entry matches, *cleaned_index* itself is returned unchanged.
    """
    try:
        return position_map.index(cleaned_index)
    except ValueError:
        return cleaned_index


def extract_inflectional_morphemes(text: str):
    """Find inflected words in *text* and describe the morpheme each carries.

    Bracketed tags are stripped with ``preprocess_text`` before the text is
    run through the module-level Stanza pipeline. Each word is then checked,
    in this order, for: an "'s" clitic, another contraction particle, a
    possessive pronoun/determiner, and finally a regular inflection category
    derived from its POS tag and morphological features.

    Args:
        text: One utterance; whitespace-separated words, optionally containing
            bracketed tags such as ``[UM]``.

    Returns:
        A list of dicts with the keys ``word``, ``lemma``, ``index``,
        ``inflectional_morpheme`` and ``morpheme_form``. ``index`` is the
        position of the word in ``text.split()``. ``morpheme_form`` is
        ``"'/s"`` for the "'s" clitic, the particle itself for other
        contractions, ``"/<suffix>"`` for a recognised regular suffix and
        ``"<IRR>"`` otherwise. Words whose surface equals their lemma are
        skipped for every category except comparative and superlative.
    """
    cleaned_text, position_map = preprocess_text(text)

    doc = nlp(cleaned_text)
    results = []

    def _original_index(word, fallback: int) -> int:
        # position_map is keyed on whitespace-word positions of the whole
        # cleaned text, whereas Stanza numbers tokens per sentence and splits
        # punctuation and clitics into their own tokens. The cleaned text is
        # single-space separated, so the number of spaces before the token's
        # first character is exactly its whitespace-word position.
        start = getattr(getattr(word, "parent", None), "start_char", None)
        cleaned_idx = fallback if start is None else cleaned_text.count(" ", 0, start)
        return calculate_adjusted_index(cleaned_idx, position_map)

    for sent in doc.sentences:
        words = sent.words
        for i, w in enumerate(words):
            surf, lem, pos = w.text, w.lemma, w.upos
            feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
            # Fold the curly apostrophe so "’ll" / "n’t" hit the same table
            # entries as their ASCII spellings.
            low_txt = surf.lower().replace("’", "'")

            # "'s" tagged as a particle: possessive after a noun, otherwise a
            # contraction. Reported on the host word that precedes it.
            if i > 0 and is_possessive_candidate(w):
                prev = words[i - 1]
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": _original_index(prev, i - 1),
                    "inflectional_morpheme": "Possessive"
                    if prev.upos in {"NOUN", "PROPN"} else "Contraction",
                    "morpheme_form": "'/s",
                })
                continue

            # Any other contraction particle, also reported on its host word.
            if i > 0 and low_txt in _CONTRACTION_PARTICLES:
                prev = words[i - 1]
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": _original_index(prev, i - 1),
                    "inflectional_morpheme": "Contraction",
                    "morpheme_form": low_txt,
                })
                continue

            # Possessive pronouns / determiners (his, their, ...).
            if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                low_lem, low_surf = lem.lower(), surf.lower()
                suf = get_suffix(low_lem, low_surf)
                morpheme_form = "/s" if suf in {"s", "es"} and low_lem + suf == low_surf else "<IRR>"
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": _original_index(w, i),
                    "inflectional_morpheme": "Possessive",
                    "morpheme_form": morpheme_form,
                })
                continue

            inflect_type = None
            if pos == "NOUN" and feats.get("Number") == "Plur":
                inflect_type = "Plural"
            elif pos == "ADJ" and feats.get("Degree") == "Cmp":
                inflect_type = "Comparative"
            elif pos == "ADJ" and feats.get("Degree") == "Sup":
                inflect_type = "Superlative"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Pres" and feats.get("Person") == "3":
                inflect_type = "3rd Person Singular"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Past":
                inflect_type = "Past Tense"
            elif pos == "VERB" and feats.get("VerbForm") == "Part":
                if feats.get("Tense") == "Past" or w.xpos == "VBN":
                    inflect_type = "Past Participle"
                elif feats.get("Tense") == "Pres" or w.xpos == "VBG":
                    inflect_type = "Progressive"

            if not inflect_type:
                continue

            # A surface form identical to the lemma carries no visible
            # morpheme; those cases are left to extract_morpheme_omissions.
            if surf.lower() == lem.lower() and inflect_type not in {"Possessive", "Comparative", "Superlative"}:
                continue

            raw_suffix = get_suffix(lem, low_txt)
            canon = normalize_suffix(lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type])
            morpheme_form = f"/{canon}" if canon else "<IRR>"
            results.append({
                "word": surf,
                "lemma": lem,
                "index": _original_index(w, i),
                "inflectional_morpheme": inflect_type,
                "morpheme_form": morpheme_form,
            })

    return results


def extract_morpheme_omissions(text: str):
    """Find words whose tagged inflection is not visible in their spelling.

    Bracketed tags are stripped with ``preprocess_text`` and the rest is run
    through the module-level Stanza pipeline. A word is reported when its POS
    tag and features assign it a plural, 3rd-person-singular, past-tense,
    past-participle or progressive reading while its surface form is
    identical (ignoring case) to its lemma. Comparative and superlative
    readings are never reported.

    Args:
        text: One utterance; whitespace-separated words, optionally containing
            bracketed tags such as ``[UM]``.

    Returns:
        A list of dicts with the keys ``word``, ``lemma``, ``index``,
        ``inflectional_morpheme`` and ``morpheme_form`` (always ``"<OMI>"``).
        ``index`` is the position of the word in ``text.split()``.
    """
    cleaned_text, position_map = preprocess_text(text)

    doc = nlp(cleaned_text)
    omissions = []

    def _original_index(word, fallback: int) -> int:
        # position_map is keyed on whitespace-word positions of the whole
        # cleaned text, whereas Stanza numbers tokens per sentence and splits
        # punctuation and clitics into their own tokens. The cleaned text is
        # single-space separated, so the number of spaces before the token's
        # first character is exactly its whitespace-word position.
        start = getattr(getattr(word, "parent", None), "start_char", None)
        cleaned_idx = fallback if start is None else cleaned_text.count(" ", 0, start)
        return calculate_adjusted_index(cleaned_idx, position_map)

    for sent in doc.sentences:
        for i, w in enumerate(sent.words):
            surf, lem, pos = w.text, w.lemma, w.upos
            feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}

            inflect_type = None
            if pos == "NOUN" and feats.get("Number") == "Plur":
                inflect_type = "Plural"
            elif pos == "ADJ" and feats.get("Degree") == "Cmp":
                inflect_type = "Comparative"
            elif pos == "ADJ" and feats.get("Degree") == "Sup":
                inflect_type = "Superlative"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Pres" and feats.get("Person") == "3":
                inflect_type = "3rd Person Singular"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Past":
                inflect_type = "Past Tense"
            elif pos == "VERB" and feats.get("VerbForm") == "Part":
                if feats.get("Tense") == "Past" or w.xpos == "VBN":
                    inflect_type = "Past Participle"
                elif feats.get("Tense") == "Pres" or w.xpos == "VBG":
                    inflect_type = "Progressive"

            # The tagger assigned an inflected reading but the spelling is the
            # bare lemma: record it as an omitted morpheme.
            if inflect_type and surf.lower() == lem.lower() and inflect_type not in {"Possessive", "Comparative", "Superlative"}:
                omissions.append({
                    "word": surf,
                    "lemma": lem,
                    "index": _original_index(w, i),
                    "inflectional_morpheme": inflect_type,
                    "morpheme_form": "<OMI>",
                })

    return omissions


def annotate_morpheme_omission(session_id, base_dir="session_data"):
    """Add a ``morpheme_omissions`` list to every segment of a transcript.

    Reads ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json``, runs
    ``extract_morpheme_omissions`` on each segment's ``text`` and writes the
    updated JSON back to the same file.

    The file may hold either a list of segment dicts or a dict with a
    ``"segments"`` list. A dict without ``"segments"`` has nothing to annotate
    and is written back unchanged.

    Args:
        session_id: Session identifier used for both the sub-directory and
            the file name prefix.
        base_dir: Directory holding the session folders; a falsy value means
            the current working directory.

    Raises:
        FileNotFoundError: If the transcript file does not exist.
    """
    base_dir = base_dir or os.getcwd()
    json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")

    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found, make sure transcription step ran first.")

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Falling back to the dict itself would iterate its string keys and crash
    # on `.get`; an absent or null "segments" entry means no segments.
    if isinstance(data, dict):
        segments = data.get("segments") or []
    else:
        segments = data

    for seg in segments:
        # A segment may carry "text": null; treat that like an empty string.
        text = seg.get("text") or ""
        seg["morpheme_omissions"] = extract_morpheme_omissions(text)

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    # Manual smoke run: push one filler-heavy utterance through both
    # extractors and pretty-print what each returns.
    sample = "And he [UM] [UM] the rabbit [UM] [UH] [UH] make [UH] sand castle."
    demos = (
        ("Inflectional Morphemes:", extract_inflectional_morphemes),
        ("\nMorpheme Omissions:", extract_morpheme_omissions),
    )
    for heading, extractor in demos:
        print(heading)
        print(json.dumps(extractor(sample), indent=2, ensure_ascii=False))