File size: 2,677 Bytes
67d6834 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import json
def _build_pause_map(words, pauses, tolerance):
    """Map a word index to the pause markers that directly follow that word.

    A pause is attached to the first word whose ``end`` time lies within
    ``tolerance`` of the pause's ``start`` time. Pauses that match no word,
    or that carry no ``start`` value, are ignored.
    """
    pause_map = {}
    for pause in pauses:
        pause_start = pause.get("start")
        if pause_start is None:
            # Without a start time the pause cannot be aligned to any word;
            # skipping it avoids a TypeError in the subtraction below.
            continue
        marker = f"({pause.get('duration')})"
        for idx, token in enumerate(words):
            if abs(token.get("end", 0) - pause_start) < tolerance:
                pause_map.setdefault(idx, []).append(marker)
                break
    return pause_map


def _build_repetition_map(repetitions):
    """Map the first word index of each repetition to ``(last_index, content)``."""
    rep_map = {}
    for rep in repetitions:
        indices = rep.get("words") or []
        if not indices:
            continue
        # min/max instead of first/last element: identical for ascending
        # index lists, but guarantees last >= first so the scan in
        # _annotate_segment always moves forward (an unordered list used to
        # make it loop forever).
        rep_map[min(indices)] = (max(indices), rep.get("content", ""))
    return rep_map


def _annotate_segment(segment, tolerance):
    """Return the annotated transcript text for one segment."""
    words = segment.get("words") or []
    pause_map = _build_pause_map(words, segment.get("pauses") or [], tolerance)
    rep_map = _build_repetition_map(segment.get("repetitions") or [])

    tokens = []
    i = 0
    while i < len(words):
        if i in rep_map:
            # The whole repeated span collapses into a single "<content> [/]"
            # token; only pauses attached to the span's last word are kept.
            last_idx, rep_content = rep_map[i]
            tokens.append(f"<{rep_content}> [/]")
        else:
            last_idx = i
            tokens.append(words[i].get("word", ""))
        tokens.extend(pause_map.get(last_idx, ()))
        i = last_idx + 1
    return " ".join(tokens)


def annotate_transcript(session_id, base_dir="session_data", *, pause_tolerance=0.01):
    """Write an annotated transcript for one session and return its path.

    Reads ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json`` and
    writes one line per entry of its ``segments`` list to
    ``<base_dir>/<session_id>/annotation_result.txt``. Each line has the form
    ``*<speaker>\\t<tokens>``, where the tokens are the segment's words joined
    by spaces, with

    * each repetition replaced by ``<content> [/]``, and
    * each pause inserted as ``(<duration>)`` after the word it follows.

    Args:
        session_id: Name of the session directory and prefix of the JSON file.
        base_dir: Directory that contains the session directories.
        pause_tolerance: Largest difference between a pause's ``start`` and a
            word's ``end`` for the pause to be attached to that word.

    Returns:
        The path of the written annotation file, or ``None`` if the input
        JSON file does not exist (an error message is printed in that case).
    """
    session_dir = os.path.join(base_dir, session_id)
    json_file = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
    output_file = os.path.join(session_dir, "annotation_result.txt")

    # Open directly instead of checking existence first: no window in which
    # the file can disappear between the check and the read.
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: could not find {json_file}")
        return None

    # NOTE(review): the original comment described the line as
    # "*SPEAKER:\ttranscript", but no colon has ever been emitted after the
    # speaker. The output format is kept unchanged here; confirm which form
    # downstream consumers expect.
    annotated_lines = []
    for segment in data.get("segments") or []:
        speaker = segment.get("speaker", "UNKNOWN")
        transcript = _annotate_segment(segment, pause_tolerance)
        annotated_lines.append(f"*{speaker}\t{transcript}\n")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(annotated_lines)

    print(f"Annotation done in {output_file}")
    return output_file
if __name__ == "__main__":
    import sys

    # The session id may be passed as the first command-line argument; with
    # no argument the previously hard-coded session "000030" is used, so a
    # bare invocation behaves as before.
    annotate_transcript(sys.argv[1] if len(sys.argv) > 1 else "000030")
|