Shuwei Hou committed on
Commit
37ea16b
·
1 Parent(s): a04f574

refine_sentence_segment

Browse files
Files changed (2) hide show
  1. morpheme_omission.py +1 -1
  2. preprocess.py +74 -2
morpheme_omission.py CHANGED
@@ -223,7 +223,7 @@ def annotate_morpheme_omission(session_id, base_dir="session_data"):
223
 
224
 
225
  if __name__ == "__main__":
226
- sample = "His is more better than mine, he get up in the water. He is take the buses."
227
  print("Inflectional Morphemes:")
228
  print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
229
  print("\nMorpheme Omissions:")
 
223
 
224
 
225
  if __name__ == "__main__":
226
+ sample = "His is more better than mine. He's going to play. He get up in the water. He is take the buses."
227
  print("Inflectional Morphemes:")
228
  print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
229
  print("\nMorpheme Omissions:")
preprocess.py CHANGED
@@ -68,6 +68,75 @@ def load_audio_for_split(input_audio_file):
68
  else:
69
  return sf.read(input_audio_file)
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def process_audio_file(input_audio_file, num_speakers, device="cuda"):
72
 
73
  print("Loading WhisperX model (English)...")
@@ -237,7 +306,10 @@ def process_audio_file(input_audio_file, num_speakers, device="cuda"):
237
  "text": text,
238
  "words": words_info
239
  }
240
- segments_cw.append(segment_entry)
 
 
 
241
 
242
  segments_cw = sorted(segments_cw, key=lambda x: x["start"])
243
  cw_json_path = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
@@ -256,4 +328,4 @@ def process_audio_file(input_audio_file, num_speakers, device="cuda"):
256
 
257
  if __name__ == "__main__":
258
  session = process_audio_file("/home/easgrad/shuweiho/workspace/volen/SATE_docker_test/input/454.mp3", num_speakers=2, device="cuda")
259
- print("Processing complete. Session ID:", session)
 
68
  else:
69
  return sf.read(input_audio_file)
70
 
71
+ def split_segment_by_sentences(segment):
72
+
73
+ text = segment["text"]
74
+ words = segment["words"]
75
+ start_time = segment["start"]
76
+ end_time = segment["end"]
77
+ speaker = segment["speaker"]
78
+
79
+ sentences = [s.strip() for s in text.split('.') if s.strip()]
80
+
81
+ if len(sentences) <= 1:
82
+ return [segment]
83
+
84
+ new_segments = []
85
+ word_index = 0
86
+
87
+ for i, sentence in enumerate(sentences):
88
+ if not sentence:
89
+ continue
90
+
91
+ sentence_words = []
92
+ sentence_text_clean = re.sub(r'[^\w\s]', '', sentence.lower())
93
+ sentence_word_tokens = sentence_text_clean.split()
94
+
95
+ matched_words = 0
96
+ sentence_start = None
97
+ sentence_end = None
98
+
99
+ temp_word_index = word_index
100
+ while temp_word_index < len(words) and matched_words < len(sentence_word_tokens):
101
+ word_obj = words[temp_word_index]
102
+ word_text_clean = re.sub(r'[^\w\s]', '', word_obj["word"].lower())
103
+
104
+ if word_text_clean == sentence_word_tokens[matched_words]:
105
+ if sentence_start is None:
106
+ sentence_start = word_obj["start"]
107
+ sentence_end = word_obj["end"]
108
+ sentence_words.append(word_obj)
109
+ matched_words += 1
110
+ elif word_text_clean in sentence_word_tokens[matched_words:]:
111
+ sentence_words.append(word_obj)
112
+ if sentence_start is None:
113
+ sentence_start = word_obj["start"]
114
+ sentence_end = word_obj["end"]
115
+
116
+ temp_word_index += 1
117
+
118
+ if sentence_start is None or sentence_end is None:
119
+ total_duration = end_time - start_time
120
+ sentence_duration = total_duration / len(sentences)
121
+ sentence_start = start_time + i * sentence_duration
122
+ sentence_end = start_time + (i + 1) * sentence_duration
123
+
124
+ if i == len(sentences) - 1:
125
+ sentence_end = end_time
126
+
127
+ word_index = temp_word_index
128
+
129
+ new_segment = {
130
+ "start": round(sentence_start, 3),
131
+ "end": round(sentence_end, 3),
132
+ "speaker": speaker,
133
+ "text": sentence + ".",
134
+ "words": sentence_words
135
+ }
136
+ new_segments.append(new_segment)
137
+
138
+ return new_segments
139
+
140
  def process_audio_file(input_audio_file, num_speakers, device="cuda"):
141
 
142
  print("Loading WhisperX model (English)...")
 
306
  "text": text,
307
  "words": words_info
308
  }
309
+
310
+ print(f"Post-processing: splitting segment by sentences...")
311
+ split_segments = split_segment_by_sentences(segment_entry)
312
+ segments_cw.extend(split_segments)
313
 
314
  segments_cw = sorted(segments_cw, key=lambda x: x["start"])
315
  cw_json_path = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
 
328
 
if __name__ == "__main__":
    # Demo entry point: run the full transcription pipeline on a fixed
    # test recording with two expected speakers.
    result_session = process_audio_file(
        "/home/easgrad/shuweiho/workspace/volen/SATE_docker_test/input/454.mp3",
        num_speakers=2,
        device="cuda",
    )
    print("Processing complete. Session ID:", result_session)