Shuwei Hou committed on
Commit
9e45db3
·
1 Parent(s): 37ea16b

fix_morpheme_index

Browse files
Files changed (2) hide show
  1. morpheme.py +38 -7
  2. morpheme_omission.py +41 -8
morpheme.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  import stanza
 
4
 
5
  nlp = stanza.Pipeline(
6
  lang="en",
@@ -66,8 +67,38 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
66
  return None
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def extract_inflectional_morphemes(text: str):
70
- doc = nlp(text)
 
 
71
  results = []
72
 
73
  for sent in doc.sentences:
@@ -86,7 +117,7 @@ def extract_inflectional_morphemes(text: str):
86
  results.append({
87
  "word": prev.text + surf,
88
  "lemma": prev.lemma,
89
- "index": i - 1,
90
  "inflectional_morpheme": "Possessive",
91
  "morpheme_form": "'/s",
92
  })
@@ -94,7 +125,7 @@ def extract_inflectional_morphemes(text: str):
94
  results.append({
95
  "word": prev.text + surf,
96
  "lemma": prev.lemma,
97
- "index": i - 1,
98
  "inflectional_morpheme": "Contraction",
99
  "morpheme_form": "'/s",
100
  })
@@ -106,7 +137,7 @@ def extract_inflectional_morphemes(text: str):
106
  results.append({
107
  "word": prev.text + surf,
108
  "lemma": prev.lemma,
109
- "index": i - 1,
110
  "inflectional_morpheme": "Contraction",
111
  "morpheme_form": low_txt,
112
  })
@@ -120,7 +151,7 @@ def extract_inflectional_morphemes(text: str):
120
  results.append({
121
  "word": surf,
122
  "lemma": lem,
123
- "index": i,
124
  "inflectional_morpheme": "Possessive",
125
  "morpheme_form": morpheme_form,
126
  })
@@ -155,7 +186,7 @@ def extract_inflectional_morphemes(text: str):
155
  results.append({
156
  "word": surf,
157
  "lemma": lem,
158
- "index": i,
159
  "inflectional_morpheme": inflect_type,
160
  "morpheme_form": morpheme_form,
161
  })
@@ -187,4 +218,4 @@ def annotate_morpheme(session_id, base_dir="session_data"):
187
 
188
 
189
  if __name__ == "__main__":
190
- print(extract_inflectional_morphemes("His is more better than mine, he get up in the water. He is take the buses. I like his books."))
 
1
  import os
2
  import json
3
  import stanza
4
+ import re
5
 
6
  nlp = stanza.Pipeline(
7
  lang="en",
 
67
  return None
68
 
69
 
70
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Strip bracketed filler tokens and record where each word ended up.

    The text is split on whitespace. Any token that begins with a
    ``[...]`` group (for example ``[UH]``) is dropped; every other token is
    kept in its original order.

    Returns:
        A ``(cleaned_text, position_map)`` pair. ``cleaned_text`` is the kept
        tokens joined by single spaces. ``position_map[k]`` is the position of
        original token ``k`` among the kept tokens, or ``-1`` if that token
        was dropped.
    """
    kept_tokens = []
    position_map = []

    for token in text.split():
        # re.match anchors at the start only, so "[UH]," is also dropped.
        is_filler = re.match(r'\[.*\]', token) is not None
        if is_filler:
            position_map.append(-1)
            continue
        position_map.append(len(kept_tokens))
        kept_tokens.append(token)

    return ' '.join(kept_tokens), position_map
86
+
87
+
88
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Translate an index in the cleaned text back to the original text.

    Args:
        cleaned_index: Position of a token among the tokens that were kept.
        position_map: List where entry ``k`` holds the cleaned position of
            original token ``k``, or ``-1`` when that token was removed.

    Returns:
        The first original index whose entry equals ``cleaned_index``. If no
        entry matches, or ``cleaned_index`` is negative, ``cleaned_index`` is
        returned unchanged.
    """
    # -1 marks removed tokens inside position_map, so a negative query must
    # never be looked up: it would "match" the first removed token.
    if cleaned_index < 0:
        return cleaned_index

    try:
        return position_map.index(cleaned_index)
    except ValueError:
        # Not present in the map: fall back to the index as given.
        return cleaned_index
96
+
97
+
98
  def extract_inflectional_morphemes(text: str):
99
+ cleaned_text, position_map = preprocess_text(text)
100
+
101
+ doc = nlp(cleaned_text)
102
  results = []
103
 
104
  for sent in doc.sentences:
 
117
  results.append({
118
  "word": prev.text + surf,
119
  "lemma": prev.lemma,
120
+ "index": calculate_adjusted_index(i - 1, position_map),
121
  "inflectional_morpheme": "Possessive",
122
  "morpheme_form": "'/s",
123
  })
 
125
  results.append({
126
  "word": prev.text + surf,
127
  "lemma": prev.lemma,
128
+ "index": calculate_adjusted_index(i - 1, position_map),
129
  "inflectional_morpheme": "Contraction",
130
  "morpheme_form": "'/s",
131
  })
 
137
  results.append({
138
  "word": prev.text + surf,
139
  "lemma": prev.lemma,
140
+ "index": calculate_adjusted_index(i - 1, position_map),
141
  "inflectional_morpheme": "Contraction",
142
  "morpheme_form": low_txt,
143
  })
 
151
  results.append({
152
  "word": surf,
153
  "lemma": lem,
154
+ "index": calculate_adjusted_index(i, position_map),
155
  "inflectional_morpheme": "Possessive",
156
  "morpheme_form": morpheme_form,
157
  })
 
186
  results.append({
187
  "word": surf,
188
  "lemma": lem,
189
+ "index": calculate_adjusted_index(i, position_map),
190
  "inflectional_morpheme": inflect_type,
191
  "morpheme_form": morpheme_form,
192
  })
 
218
 
219
 
220
  if __name__ == "__main__":
221
+ print(extract_inflectional_morphemes("And he [UH] [UM] the rabbit [UH] makes [UH] sand castle."))
morpheme_omission.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  import stanza
 
4
 
5
  nlp = stanza.Pipeline(
6
  lang="en",
@@ -66,9 +67,39 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
66
  return None
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def extract_inflectional_morphemes(text: str):
70
 
71
- doc = nlp(text)
 
 
72
  results = []
73
 
74
  for sent in doc.sentences:
@@ -85,7 +116,7 @@ def extract_inflectional_morphemes(text: str):
85
  results.append({
86
  "word": prev.text + surf,
87
  "lemma": prev.lemma,
88
- "index": i - 1,
89
  "inflectional_morpheme": "Possessive"
90
  if prev.upos in {"NOUN", "PROPN"} else "Contraction",
91
  "morpheme_form": "'/s",
@@ -98,7 +129,7 @@ def extract_inflectional_morphemes(text: str):
98
  results.append({
99
  "word": prev.text + surf,
100
  "lemma": prev.lemma,
101
- "index": i - 1,
102
  "inflectional_morpheme": "Contraction",
103
  "morpheme_form": low_txt,
104
  })
@@ -112,7 +143,7 @@ def extract_inflectional_morphemes(text: str):
112
  results.append({
113
  "word": surf,
114
  "lemma": lem,
115
- "index": i,
116
  "inflectional_morpheme": "Possessive",
117
  "morpheme_form": morpheme_form,
118
  })
@@ -147,7 +178,7 @@ def extract_inflectional_morphemes(text: str):
147
  results.append({
148
  "word": surf,
149
  "lemma": lem,
150
- "index": i,
151
  "inflectional_morpheme": inflect_type,
152
  "morpheme_form": morpheme_form,
153
  })
@@ -159,7 +190,9 @@ def extract_inflectional_morphemes(text: str):
159
 
160
  def extract_morpheme_omissions(text: str):
161
 
162
- doc = nlp(text)
 
 
163
  omissions = []
164
 
165
  for sent in doc.sentences:
@@ -191,7 +224,7 @@ def extract_morpheme_omissions(text: str):
191
  omissions.append({
192
  "word": surf,
193
  "lemma": lem,
194
- "index": i,
195
  "inflectional_morpheme": inflect_type,
196
  "morpheme_form": "<OMI>",
197
  })
@@ -223,7 +256,7 @@ def annotate_morpheme_omission(session_id, base_dir="session_data"):
223
 
224
 
225
  if __name__ == "__main__":
226
- sample = "His is more better than mine. He's going to play. He get up in the water. He is take the buses."
227
  print("Inflectional Morphemes:")
228
  print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
229
  print("\nMorpheme Omissions:")
 
1
  import os
2
  import json
3
  import stanza
4
+ import re
5
 
6
  nlp = stanza.Pipeline(
7
  lang="en",
 
67
  return None
68
 
69
 
70
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Strip bracketed filler tokens and record where each word ended up.

    The text is split on whitespace. Any token that begins with a
    ``[...]`` group (for example ``[UH]``) is dropped; every other token is
    kept in its original order.

    Returns:
        A ``(cleaned_text, position_map)`` pair. ``cleaned_text`` is the kept
        tokens joined by single spaces. ``position_map[k]`` is the position of
        original token ``k`` among the kept tokens, or ``-1`` if that token
        was dropped.
    """
    kept_tokens = []
    position_map = []

    for token in text.split():
        # re.match anchors at the start only, so "[UH]," is also dropped.
        is_filler = re.match(r'\[.*\]', token) is not None
        if is_filler:
            position_map.append(-1)
            continue
        position_map.append(len(kept_tokens))
        kept_tokens.append(token)

    return ' '.join(kept_tokens), position_map
86
+
87
+
88
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Translate an index in the cleaned text back to the original text.

    Args:
        cleaned_index: Position of a token among the tokens that were kept.
        position_map: List where entry ``k`` holds the cleaned position of
            original token ``k``, or ``-1`` when that token was removed.

    Returns:
        The first original index whose entry equals ``cleaned_index``. If no
        entry matches, or ``cleaned_index`` is negative, ``cleaned_index`` is
        returned unchanged.
    """
    # -1 marks removed tokens inside position_map, so a negative query must
    # never be looked up: it would "match" the first removed token.
    if cleaned_index < 0:
        return cleaned_index

    try:
        return position_map.index(cleaned_index)
    except ValueError:
        # Not present in the map: fall back to the index as given.
        return cleaned_index
96
+
97
+
98
  def extract_inflectional_morphemes(text: str):
99
 
100
+ cleaned_text, position_map = preprocess_text(text)
101
+
102
+ doc = nlp(cleaned_text)
103
  results = []
104
 
105
  for sent in doc.sentences:
 
116
  results.append({
117
  "word": prev.text + surf,
118
  "lemma": prev.lemma,
119
+ "index": calculate_adjusted_index(i - 1, position_map),
120
  "inflectional_morpheme": "Possessive"
121
  if prev.upos in {"NOUN", "PROPN"} else "Contraction",
122
  "morpheme_form": "'/s",
 
129
  results.append({
130
  "word": prev.text + surf,
131
  "lemma": prev.lemma,
132
+ "index": calculate_adjusted_index(i - 1, position_map),
133
  "inflectional_morpheme": "Contraction",
134
  "morpheme_form": low_txt,
135
  })
 
143
  results.append({
144
  "word": surf,
145
  "lemma": lem,
146
+ "index": calculate_adjusted_index(i, position_map),
147
  "inflectional_morpheme": "Possessive",
148
  "morpheme_form": morpheme_form,
149
  })
 
178
  results.append({
179
  "word": surf,
180
  "lemma": lem,
181
+ "index": calculate_adjusted_index(i, position_map),
182
  "inflectional_morpheme": inflect_type,
183
  "morpheme_form": morpheme_form,
184
  })
 
190
 
191
  def extract_morpheme_omissions(text: str):
192
 
193
+ cleaned_text, position_map = preprocess_text(text)
194
+
195
+ doc = nlp(cleaned_text)
196
  omissions = []
197
 
198
  for sent in doc.sentences:
 
224
  omissions.append({
225
  "word": surf,
226
  "lemma": lem,
227
+ "index": calculate_adjusted_index(i, position_map),
228
  "inflectional_morpheme": inflect_type,
229
  "morpheme_form": "<OMI>",
230
  })
 
256
 
257
 
258
  if __name__ == "__main__":
259
+ sample = "And he [UM] [UM] the rabbit [UM] [UH] [UH] make [UH] sand castle."
260
  print("Inflectional Morphemes:")
261
  print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
262
  print("\nMorpheme Omissions:")