SATE / syllable.py
Shuwei Hou
init
67d6834
import os
import json
import re
import string
# Load Dict
def load_custom_dict(dict_path):
    """Load a UTF-8 encoded JSON file and return its parsed content.

    Args:
        dict_path: Path of the JSON file to read.

    Returns:
        Whatever object ``json.load`` produces for the file.
    """
    with open(dict_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
# Path of the pronunciation lookup table; "./" makes it relative to the
# current working directory.
custom_dict_path = "./syllable_dict_ENNI_refine.json"
# Loaded once at import time; get_pronunciation_from_dict() reads from it.
custom_dict = load_custom_dict(custom_dict_path)
# IPA symbols that phoneme_type() classifies as vowels ('V').
vowels_phonemes = (
    ["iː", "uː", "ɜː", "ɔː", "ɑː"]  # long monophthongs
    + ["ɪ", "ʊ", "e", "ə", "æ", "ʌ", "ɛ", "ɒ"]  # short monophthongs
    + ["eɪ", "aɪ", "ɔɪ", "aʊ", "əʊ", "ɪə", "eə", "ʊə"]  # diphthongs
)
def phoneme_type(phoneme):
    """Return 'V' if ``phoneme`` is listed in ``vowels_phonemes``, else 'C'."""
    if phoneme in vowels_phonemes:
        return 'V'
    return 'C'
def get_pronunciation_from_dict(word):
    """Look up the dictionary entry for ``word``.

    The word is normalised before the lookup: surrounding punctuation and
    whitespace are removed and the result is lower-cased.

    Args:
        word: Raw token text, possibly padded with spaces or punctuation.

    Returns:
        The value stored in ``custom_dict`` for the normalised word, or an
        empty string when there is no entry.
    """
    # str.strip(chars) stops at the first character that is not in `chars`,
    # so whitespace has to be part of the set; otherwise a token such as
    # " hello," keeps its leading space and the lookup misses.
    clean_word = word.strip(string.punctuation + string.whitespace).lower()
    return custom_dict.get(clean_word, "")
def split_ipa_into_syllables(ipa_str):
    """Split an IPA string into syllable chunks.

    A dot and both stress marks (primary "ˈ" and secondary "ˌ") all act as
    syllable boundaries. Chunks that are empty or whitespace-only are
    dropped; the remaining chunks are returned unmodified.
    """
    pieces = re.split(r"[.ˈˌ]", ipa_str)
    return [piece for piece in pieces if piece.strip()]
def split_syllable_into_phonemes(syllable):
    """Tokenise one syllable string into a list of phonemes.

    At each position the longest entry of ``vowels_phonemes`` that matches is
    taken as one phoneme; if no entry matches, the single character at that
    position becomes a phoneme on its own.
    """
    # Longest candidates first, so multi-character vowels are tried before
    # their shorter prefixes (e.g. "eɪ" before "e").
    candidates = sorted(vowels_phonemes, key=len, reverse=True)
    tokens = []
    pos = 0
    end = len(syllable)
    while pos < end:
        hit = next((v for v in candidates if syllable.startswith(v, pos)), None)
        # No (non-empty) vowel matched: consume exactly one character.
        token = hit or syllable[pos]
        tokens.append(token)
        pos += len(token)
    return tokens
def analyze_word_syllables(word):
    """Describe every syllable of ``word`` using its dictionary pronunciation.

    Returns:
        A list with one dict per syllable holding "syllable" (the phonemes
        joined back together), "phonemes" (list of phoneme strings) and
        "CV_pattern" (one 'C' or 'V' per phoneme). The list is empty when
        the word has no dictionary entry.
    """
    pronunciation = get_pronunciation_from_dict(word)
    if not pronunciation:
        return []
    phoneme_lists = [
        split_syllable_into_phonemes(chunk)
        for chunk in split_ipa_into_syllables(pronunciation)
    ]
    return [
        {
            "syllable": ''.join(units),
            "phonemes": units,
            "CV_pattern": ''.join(map(phoneme_type, units)),
        }
        for units in phoneme_lists
    ]
def annotate_syllables(session_id, base_dir="session_data"):
    """Add a "syllables" list to every segment of a session transcription.

    Reads ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json``,
    analyses each entry of every segment's "words" list, stores the collected
    syllable dicts under the segment's "syllables" key and rewrites the same
    file in place.

    Args:
        session_id: Session identifier; used as directory name and file prefix.
        base_dir: Directory that contains the per-session folders.

    Returns:
        The updated transcription data, or ``None`` when the file does not
        exist (an error message is printed in that case).
    """
    json_file = os.path.join(base_dir, session_id, f"{session_id}_transcriptionCW.json")
    if not os.path.exists(json_file):
        print(f"[Error] Cannot find file: {json_file}")
        return None

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Compiled once instead of being re-resolved for every word.
    filler_pattern = re.compile(r"\[.*?\]")

    # `or []` also covers keys that are present but null in the JSON, which
    # would otherwise raise a TypeError when iterated.
    for segment in data.get("segments") or []:
        syllables = []
        for idx, word_obj in enumerate(segment.get("words") or []):
            word = word_obj.get("word", "")
            if filler_pattern.fullmatch(word):  # skip bracketed filler tokens
                continue
            word_syllables = analyze_word_syllables(word)
            # Record which entry of "words" each syllable came from; skipped
            # fillers still count towards the index.
            for syl in word_syllables:
                syl["word_index"] = idx
            syllables.extend(word_syllables)
        segment["syllables"] = syllables

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"Session {session_id} syllable annotation done: {json_file}")
    return data
if __name__ == "__main__":
    import sys

    # An optional first command-line argument selects the session to
    # annotate; with no argument the original default session is used.
    annotate_syllables(sys.argv[1] if len(sys.argv) > 1 else "000030")