|
import os |
|
import json |
|
|
|
# Maximum difference between a word's "end" and a pause's "start" for the
# pause to be attached to that word (same unit as the JSON timestamps).
_PAUSE_ALIGN_TOLERANCE = 0.01


def _map_pauses_to_words(words, pauses):
    """Return ``{word_index: ["(duration)", ...]}`` for pauses aligned to a word."""
    pause_map = {}
    for pause in pauses:
        pause_start = pause.get("start")
        if pause_start is None:
            # Without a start time the pause cannot be aligned to any word;
            # skip it instead of failing on ``number - None``.
            continue
        duration = pause.get("duration")
        for idx, token in enumerate(words):
            word_end = token.get("end", 0)
            if word_end is None:
                continue
            if abs(word_end - pause_start) < _PAUSE_ALIGN_TOLERANCE:
                pause_map.setdefault(idx, []).append(f"({duration})")
                break  # attach each pause to the first matching word only
    return pause_map


def _map_repetitions(repetitions):
    """Return ``{first_word_index: (last_word_index, content)}`` per repetition."""
    rep_map = {}
    for rep in repetitions:
        indices = rep.get("words", [])
        if not indices:
            continue
        # min/max (rather than first/last element) guarantees last >= first,
        # so the walk in _annotate_segment always moves forward even when the
        # index list is not in ascending order.
        rep_map[min(indices)] = (max(indices), rep.get("content", ""))
    return rep_map


def _annotate_segment(segment):
    """Build the ``*SPEAKER<TAB>transcript`` line for one segment dict."""
    words = segment.get("words", [])
    pause_map = _map_pauses_to_words(words, segment.get("pauses", []))
    rep_map = _map_repetitions(segment.get("repetitions", []))

    tokens = []
    i = 0
    while i < len(words):
        if i in rep_map:
            # The whole repeated span collapses into one "<content> [/]"
            # token; the words it covers are not emitted individually.
            last_idx, rep_content = rep_map[i]
            tokens.append(f"<{rep_content}> [/]")
        else:
            last_idx = i
            tokens.append(words[i].get("word", ""))
        # Only pauses attached to the last consumed word are emitted; pauses
        # attached to earlier words inside a repetition span are not.
        tokens.extend(pause_map.get(last_idx, []))
        i = last_idx + 1

    speaker = segment.get("speaker", "UNKNOWN")
    transcript = " ".join(tokens)
    return f"*{speaker}\t{transcript}"


def annotate_transcript(session_id, base_dir="session_data"):
    """Write an annotated transcript for one session and return its path.

    Reads ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json`` and
    writes one line per entry of its ``"segments"`` list to
    ``<base_dir>/<session_id>/annotation_result.txt``. Each line has the form
    ``*SPEAKER<TAB>tokens``, where pauses appear as ``(duration)`` after the
    word whose ``"end"`` matches the pause ``"start"``, and repetitions appear
    as ``<content> [/]`` in place of the words they cover.

    Args:
        session_id: Name of the session sub-directory; also the prefix of the
            input JSON file name.
        base_dir: Directory that contains the session sub-directories.

    Returns:
        The path of the written annotation file, or ``None`` when the input
        JSON file does not exist (an error message is printed in that case).
    """
    session_dir = os.path.join(base_dir, session_id)
    json_file = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
    output_file = os.path.join(session_dir, "annotation_result.txt")

    if not os.path.exists(json_file):
        print(f"Error: could not find {json_file}")
        return None

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    annotated_lines = [
        _annotate_segment(segment) for segment in data.get("segments", [])
    ]

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in annotated_lines)

    print(f"Annotation done in {output_file}")
    return output_file
|
|
|
if __name__ == "__main__":
    # Script entry point: annotate the hard-coded sample session using the
    # default base directory.
    annotate_transcript(session_id="000030")
|
|