Shuwei Hou committed on
Commit
37ea16b
·
1 Parent(s): a04f574

refine_sentence_segment

Browse files
Files changed (2) hide show
  1. morpheme_omission.py +1 -1
  2. preprocess.py +74 -2
morpheme_omission.py CHANGED
@@ -223,7 +223,7 @@ def annotate_morpheme_omission(session_id, base_dir="session_data"):
223
 
224
 
225
  if __name__ == "__main__":
226
- sample = "His is more better than mine, he get up in the water. He is take the buses."
227
  print("Inflectional Morphemes:")
228
  print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
229
  print("\nMorpheme Omissions:")
 
223
 
224
 
225
  if __name__ == "__main__":
226
+ sample = "His is more better than mine. He's going to play. He get up in the water. He is take the buses."
227
  print("Inflectional Morphemes:")
228
  print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
229
  print("\nMorpheme Omissions:")
preprocess.py CHANGED
@@ -68,6 +68,75 @@ def load_audio_for_split(input_audio_file):
68
  else:
69
  return sf.read(input_audio_file)
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def process_audio_file(input_audio_file, num_speakers, device="cuda"):
72
 
73
  print("Loading WhisperX model (English)...")
@@ -237,7 +306,10 @@ def process_audio_file(input_audio_file, num_speakers, device="cuda"):
237
  "text": text,
238
  "words": words_info
239
  }
240
- segments_cw.append(segment_entry)
 
 
 
241
 
242
  segments_cw = sorted(segments_cw, key=lambda x: x["start"])
243
  cw_json_path = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
@@ -256,4 +328,4 @@ def process_audio_file(input_audio_file, num_speakers, device="cuda"):
256
 
257
  if __name__ == "__main__":
258
  session = process_audio_file("/home/easgrad/shuweiho/workspace/volen/SATE_docker_test/input/454.mp3", num_speakers=2, device="cuda")
259
- print("Processing complete. Session ID:", session)
 
68
  else:
69
  return sf.read(input_audio_file)
70
 
71
+ def split_segment_by_sentences(segment):
72
+
73
+ text = segment["text"]
74
+ words = segment["words"]
75
+ start_time = segment["start"]
76
+ end_time = segment["end"]
77
+ speaker = segment["speaker"]
78
+
79
+ sentences = [s.strip() for s in text.split('.') if s.strip()]
80
+
81
+ if len(sentences) <= 1:
82
+ return [segment]
83
+
84
+ new_segments = []
85
+ word_index = 0
86
+
87
+ for i, sentence in enumerate(sentences):
88
+ if not sentence:
89
+ continue
90
+
91
+ sentence_words = []
92
+ sentence_text_clean = re.sub(r'[^\w\s]', '', sentence.lower())
93
+ sentence_word_tokens = sentence_text_clean.split()
94
+
95
+ matched_words = 0
96
+ sentence_start = None
97
+ sentence_end = None
98
+
99
+ temp_word_index = word_index
100
+ while temp_word_index < len(words) and matched_words < len(sentence_word_tokens):
101
+ word_obj = words[temp_word_index]
102
+ word_text_clean = re.sub(r'[^\w\s]', '', word_obj["word"].lower())
103
+
104
+ if word_text_clean == sentence_word_tokens[matched_words]:
105
+ if sentence_start is None:
106
+ sentence_start = word_obj["start"]
107
+ sentence_end = word_obj["end"]
108
+ sentence_words.append(word_obj)
109
+ matched_words += 1
110
+ elif word_text_clean in sentence_word_tokens[matched_words:]:
111
+ sentence_words.append(word_obj)
112
+ if sentence_start is None:
113
+ sentence_start = word_obj["start"]
114
+ sentence_end = word_obj["end"]
115
+
116
+ temp_word_index += 1
117
+
118
+ if sentence_start is None or sentence_end is None:
119
+ total_duration = end_time - start_time
120
+ sentence_duration = total_duration / len(sentences)
121
+ sentence_start = start_time + i * sentence_duration
122
+ sentence_end = start_time + (i + 1) * sentence_duration
123
+
124
+ if i == len(sentences) - 1:
125
+ sentence_end = end_time
126
+
127
+ word_index = temp_word_index
128
+
129
+ new_segment = {
130
+ "start": round(sentence_start, 3),
131
+ "end": round(sentence_end, 3),
132
+ "speaker": speaker,
133
+ "text": sentence + ".",
134
+ "words": sentence_words
135
+ }
136
+ new_segments.append(new_segment)
137
+
138
+ return new_segments
139
+
140
  def process_audio_file(input_audio_file, num_speakers, device="cuda"):
141
 
142
  print("Loading WhisperX model (English)...")
 
306
  "text": text,
307
  "words": words_info
308
  }
309
+
310
+ print(f"Post-processing: splitting segment by sentences...")
311
+ split_segments = split_segment_by_sentences(segment_entry)
312
+ segments_cw.extend(split_segments)
313
 
314
  segments_cw = sorted(segments_cw, key=lambda x: x["start"])
315
  cw_json_path = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
 
328
 
if __name__ == "__main__":
    # Demo entry point: run the full transcription pipeline on a fixed
    # test recording with two expected speakers.
    result_session = process_audio_file(
        "/home/easgrad/shuweiho/workspace/volen/SATE_docker_test/input/454.mp3",
        num_speakers=2,
        device="cuda",
    )
    print("Processing complete. Session ID:", result_session)