"""Annotate a session's transcription JSON with syllable-level information.

Each word is looked up in a custom IPA dictionary, split into syllables and
phonemes, and labelled with a consonant/vowel (CV) pattern.
"""
import os
import json
import re
import string


def load_custom_dict(dict_path):
    """Load the word -> IPA pronunciation dictionary from a JSON file."""
    with open(dict_path, 'r', encoding='utf-8') as f:
        return json.load(f)


custom_dict_path = "./syllable_dict_ENNI_refine.json"
custom_dict = load_custom_dict(custom_dict_path)

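# Illustrative example of the expected dictionary format (the actual entries
# come from syllable_dict_ENNI_refine.json): a flat mapping from lowercase word
# to an IPA string whose syllables are delimited by "." or by the stress marks
# "ˈ" / "ˌ", e.g. {"rabbit": "ˈræb.ɪt", "water": "ˈwɔː.tə"}.
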
# IPA vowel inventory (long vowels, short vowels, and diphthongs) used to
# classify phonemes as vowels.
vowels_phonemes = [
    "iː", "uː", "ɜː", "ɔː", "ɑː",
    "ɪ", "ʊ", "e", "ə", "æ", "ʌ", "ɛ", "ɒ",
    "eɪ", "aɪ", "ɔɪ", "aʊ", "əʊ", "ɪə", "eə", "ʊə"
]


def phoneme_type(phoneme):
    """Classify a phoneme as vowel ('V') or consonant ('C')."""
    return 'V' if phoneme in vowels_phonemes else 'C'


def get_pronunciation_from_dict(word):
    """Return the IPA pronunciation of a word, or '' if it is not in the dictionary."""
    clean_word = word.strip(string.punctuation).lower()
    return custom_dict.get(clean_word, "")


def split_ipa_into_syllables(ipa_str):
    """Split an IPA string into syllables, treating stress marks as syllable boundaries."""
    ipa_str = ipa_str.replace("ˈ", ".").replace("ˌ", ".")
    return [s for s in ipa_str.split('.') if s.strip()]

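# Example: split_ipa_into_syllables("ˈwɔː.tə") returns ["wɔː", "tə"]; the
# leading stress mark becomes an empty segment and is filtered out.
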
def split_syllable_into_phonemes(syllable):
    """Segment a syllable string into phonemes.

    Vowels (including diphthongs) are matched greedily, longest first; every
    remaining character is treated as a single consonant phoneme.
    """
    vowels_sorted = sorted(vowels_phonemes, key=len, reverse=True)
    phonemes = []
    i = 0
    while i < len(syllable):
        matched = None
        for v in vowels_sorted:
            if syllable[i:i + len(v)] == v:
                matched = v
                break
        if matched:
            phonemes.append(matched)
            i += len(matched)
        else:
            phonemes.append(syllable[i])
            i += 1
    return phonemes

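# Example: split_syllable_into_phonemes("straʊ") returns ["s", "t", "r", "aʊ"],
# giving the CV pattern "CCCV". Note that multi-character consonants such as
# "tʃ" or "dʒ" are split into two single-character phonemes by this rule.
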
def analyze_word_syllables(word):
    """Return per-syllable data (IPA, phoneme list, CV pattern) for a word.

    Returns an empty list when the word is not in the custom dictionary.
    """
    ipa_str = get_pronunciation_from_dict(word)
    if not ipa_str:
        return []
    syllables_ipa = split_ipa_into_syllables(ipa_str)
    syllable_data = []
    for syl in syllables_ipa:
        phs = split_syllable_into_phonemes(syl)
        CV = ''.join(phoneme_type(p) for p in phs)
        syllable_data.append({
            "syllable": ''.join(phs),
            "phonemes": phs,
            "CV_pattern": CV
        })
    return syllable_data

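# Example (assuming the dictionary maps "rabbit" to "ˈræb.ɪt"):
# analyze_word_syllables("rabbit") returns
#   [{"syllable": "ræb", "phonemes": ["r", "æ", "b"], "CV_pattern": "CVC"},
#    {"syllable": "ɪt", "phonemes": ["ɪ", "t"], "CV_pattern": "VC"}]
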
def annotate_syllables(session_id, base_dir="session_data"):
    """Annotate every segment of a session's transcription JSON with syllables, in place."""
    json_file = os.path.join(base_dir, session_id, f"{session_id}_transcriptionCW.json")
    if not os.path.exists(json_file):
        print(f"[Error] Cannot find file: {json_file}")
        return None

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for segment in data.get("segments", []):
        words_info = segment.get("words", [])
        syllables = []

        for idx, word_obj in enumerate(words_info):
            word = word_obj.get("word", "")
            # Skip bracketed annotation tokens (e.g. "[laughs]") rather than real words.
            if re.fullmatch(r"\[.*?\]", word):
                continue
            word_syllables = analyze_word_syllables(word)
            # Record which word in the segment each syllable belongs to.
            for syl in word_syllables:
                syl["word_index"] = idx
            syllables.extend(word_syllables)

        segment["syllables"] = syllables

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Session {session_id} syllable annotation done: {json_file}")
    return data

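# After annotation, each segment carries a "syllables" list; illustrative shape:
#   "syllables": [
#       {"syllable": "ræb", "phonemes": ["r", "æ", "b"],
#        "CV_pattern": "CVC", "word_index": 0},
#       ...
#   ]
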
if __name__ == "__main__":
    annotate_syllables("000030")