# SATE / annotation.py
# Shuwei Hou
# init
# 67d6834
import os
import json
# Largest allowed difference between a word's "end" value and a pause's
# "start" value for the pause to be attached to that word.
_PAUSE_MATCH_TOLERANCE = 0.01


def _build_pause_map(words, pauses):
    """Map a word index to the "(duration)" markers of pauses starting at its end."""
    pause_map = {}
    for pause in pauses:
        pause_start = pause.get("start")
        if pause_start is None:
            # Without a start value the pause cannot be aligned to any word;
            # skipping avoids a TypeError in the subtraction below.
            continue
        duration = pause.get("duration")
        for idx, token in enumerate(words):
            if abs(token.get("end", 0) - pause_start) < _PAUSE_MATCH_TOLERANCE:
                pause_map.setdefault(idx, []).append(f"({duration})")
                break  # each pause is attached to the first matching word only
    return pause_map


def _build_repetition_map(repetitions):
    """Map the first word index of each repetition to (last word index, content)."""
    rep_map = {}
    for rep in repetitions:
        indices = rep.get("words") or []
        if not indices:
            continue
        start_idx = indices[0]
        # Clamp the end so a span can never finish before it starts; otherwise
        # the token loop would step backwards and never terminate.
        end_idx = max(indices[-1], start_idx)
        rep_map[start_idx] = (end_idx, rep.get("content", ""))
    return rep_map


def _annotate_segment(segment):
    """Return the space-joined annotated transcript text for one segment."""
    words = segment.get("words") or []
    pause_map = _build_pause_map(words, segment.get("pauses") or [])
    rep_map = _build_repetition_map(segment.get("repetitions") or [])

    annotated_tokens = []
    i = 0
    while i < len(words):
        if i in rep_map:
            # Replace the whole repeated span with a single "<content> [/]"
            # token. Only pauses attached to the span's last word are kept;
            # pauses attached to earlier words of the span are not emitted.
            rep_end, rep_content = rep_map[i]
            annotated_tokens.append(f"<{rep_content}> [/]")
            annotated_tokens.extend(pause_map.get(rep_end, []))
            i = rep_end + 1
        else:
            annotated_tokens.append(words[i].get("word", ""))
            annotated_tokens.extend(pause_map.get(i, []))
            i += 1
    return " ".join(annotated_tokens)


def annotate_transcript(session_id, base_dir="session_data"):
    """Write an annotated, one-line-per-segment transcript for a session.

    Reads ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json`` and
    writes ``<base_dir>/<session_id>/annotation_result.txt``. Each entry of
    the JSON ``"segments"`` list produces one output line of the form
    ``*<speaker><TAB><transcript>``, where the transcript is the segment's
    words joined by spaces with two kinds of markers inserted:

    * ``(duration)`` after a word whose ``"end"`` value lies within 0.01 of a
      pause's ``"start"`` value;
    * ``<content> [/]`` in place of the words spanned by a repetition (from
      the first to the last index listed in the repetition's ``"words"``).

    NOTE(review): the notation resembles CHAT transcription conventions, and
    an earlier comment described the line as ``*SPEAKER:\\ttranscript``, but
    no colon is written after the speaker. Confirm which form is intended.

    Args:
        session_id: Name of the session directory and prefix of the JSON file.
        base_dir: Directory that contains the per-session directories.

    Returns:
        The path of the written annotation file, or ``None`` if the input
        JSON file does not exist (an error message is printed in that case).
    """
    session_dir = os.path.join(base_dir, session_id)
    json_file = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
    output_file = os.path.join(session_dir, "annotation_result.txt")

    if not os.path.exists(json_file):
        print(f"Error: could not find {json_file}")
        return None

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    annotated_lines = []
    # "or []" also covers an explicit JSON null for "segments".
    for segment in data.get("segments") or []:
        speaker = segment.get("speaker", "UNKNOWN")
        transcript = _annotate_segment(segment)
        annotated_lines.append(f"*{speaker}\t{transcript}")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in annotated_lines)

    print(f"Annotation done in {output_file}")
    return output_file
if __name__ == "__main__":
    # Script entry point: annotate the hard-coded session "000030" using the
    # default base directory.
    annotate_transcript(session_id="000030")