Shuwei Hou
committed on
Commit
·
9e45db3
1
Parent(s):
37ea16b
fix_morpheme_index
Browse files- morpheme.py +38 -7
- morpheme_omission.py +41 -8
morpheme.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import json
|
3 |
import stanza
|
|
|
4 |
|
5 |
nlp = stanza.Pipeline(
|
6 |
lang="en",
|
@@ -66,8 +67,38 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
|
|
66 |
return None
|
67 |
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def extract_inflectional_morphemes(text: str):
|
70 |
-
|
|
|
|
|
71 |
results = []
|
72 |
|
73 |
for sent in doc.sentences:
|
@@ -86,7 +117,7 @@ def extract_inflectional_morphemes(text: str):
|
|
86 |
results.append({
|
87 |
"word": prev.text + surf,
|
88 |
"lemma": prev.lemma,
|
89 |
-
"index": i - 1,
|
90 |
"inflectional_morpheme": "Possessive",
|
91 |
"morpheme_form": "'/s",
|
92 |
})
|
@@ -94,7 +125,7 @@ def extract_inflectional_morphemes(text: str):
|
|
94 |
results.append({
|
95 |
"word": prev.text + surf,
|
96 |
"lemma": prev.lemma,
|
97 |
-
"index": i - 1,
|
98 |
"inflectional_morpheme": "Contraction",
|
99 |
"morpheme_form": "'/s",
|
100 |
})
|
@@ -106,7 +137,7 @@ def extract_inflectional_morphemes(text: str):
|
|
106 |
results.append({
|
107 |
"word": prev.text + surf,
|
108 |
"lemma": prev.lemma,
|
109 |
-
"index": i - 1,
|
110 |
"inflectional_morpheme": "Contraction",
|
111 |
"morpheme_form": low_txt,
|
112 |
})
|
@@ -120,7 +151,7 @@ def extract_inflectional_morphemes(text: str):
|
|
120 |
results.append({
|
121 |
"word": surf,
|
122 |
"lemma": lem,
|
123 |
-
"index": i,
|
124 |
"inflectional_morpheme": "Possessive",
|
125 |
"morpheme_form": morpheme_form,
|
126 |
})
|
@@ -155,7 +186,7 @@ def extract_inflectional_morphemes(text: str):
|
|
155 |
results.append({
|
156 |
"word": surf,
|
157 |
"lemma": lem,
|
158 |
-
"index": i,
|
159 |
"inflectional_morpheme": inflect_type,
|
160 |
"morpheme_form": morpheme_form,
|
161 |
})
|
@@ -187,4 +218,4 @@ def annotate_morpheme(session_id, base_dir="session_data"):
|
|
187 |
|
188 |
|
189 |
if __name__ == "__main__":
|
190 |
-
print(extract_inflectional_morphemes("
|
|
|
1 |
import os
|
2 |
import json
|
3 |
import stanza
|
4 |
+
import re
|
5 |
|
6 |
nlp = stanza.Pipeline(
|
7 |
lang="en",
|
|
|
67 |
return None
|
68 |
|
69 |
|
70 |
+
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Strip bracketed tags (e.g. ``[UH]``) from *text* and record positions.

    The text is split on whitespace. A token is dropped when it begins with
    ``[`` and contains a later ``]`` (the pattern is anchored at the start of
    the token only, so trailing characters after the ``]`` are dropped with it).

    Returns:
        A ``(cleaned_text, position_map)`` pair. ``cleaned_text`` is the kept
        tokens joined by single spaces. ``position_map`` has one entry per
        original token: the token's index among the kept tokens, or ``-1`` if
        the token was dropped.
    """
    position_map: list[int] = []  # position_map[original_index] = cleaned_index
    cleaned_words: list[str] = []

    for word in text.split():
        if re.match(r'\[.*\]', word):
            # Dropped token: -1 is the "no cleaned position" sentinel.
            position_map.append(-1)
        else:
            position_map.append(len(cleaned_words))
            cleaned_words.append(word)

    return ' '.join(cleaned_words), position_map
|
86 |
+
|
87 |
+
|
88 |
+
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Translate an index in the cleaned text back to the original text.

    Args:
        cleaned_index: Position of a token among the kept (cleaned) tokens.
        position_map: One entry per original token holding its cleaned
            position, with ``-1`` marking tokens that were removed.

    Returns:
        The original index whose entry equals ``cleaned_index``. If there is
        no such entry, or ``cleaned_index`` is negative, ``cleaned_index`` is
        returned unchanged.
    """
    # NOTE(review): assumes cleaned_index counts whitespace-separated tokens of
    # the cleaned text; confirm the indices passed by callers use that scheme.

    # -1 is the "removed token" sentinel inside position_map, so a negative
    # query must never be matched against it (it would otherwise resolve to
    # the position of the first removed token).
    if cleaned_index < 0:
        return cleaned_index

    try:
        return position_map.index(cleaned_index)
    except ValueError:
        # Unknown position: fall back to the index as given.
        return cleaned_index
|
96 |
+
|
97 |
+
|
98 |
def extract_inflectional_morphemes(text: str):
|
99 |
+
cleaned_text, position_map = preprocess_text(text)
|
100 |
+
|
101 |
+
doc = nlp(cleaned_text)
|
102 |
results = []
|
103 |
|
104 |
for sent in doc.sentences:
|
|
|
117 |
results.append({
|
118 |
"word": prev.text + surf,
|
119 |
"lemma": prev.lemma,
|
120 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
121 |
"inflectional_morpheme": "Possessive",
|
122 |
"morpheme_form": "'/s",
|
123 |
})
|
|
|
125 |
results.append({
|
126 |
"word": prev.text + surf,
|
127 |
"lemma": prev.lemma,
|
128 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
129 |
"inflectional_morpheme": "Contraction",
|
130 |
"morpheme_form": "'/s",
|
131 |
})
|
|
|
137 |
results.append({
|
138 |
"word": prev.text + surf,
|
139 |
"lemma": prev.lemma,
|
140 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
141 |
"inflectional_morpheme": "Contraction",
|
142 |
"morpheme_form": low_txt,
|
143 |
})
|
|
|
151 |
results.append({
|
152 |
"word": surf,
|
153 |
"lemma": lem,
|
154 |
+
"index": calculate_adjusted_index(i, position_map),
|
155 |
"inflectional_morpheme": "Possessive",
|
156 |
"morpheme_form": morpheme_form,
|
157 |
})
|
|
|
186 |
results.append({
|
187 |
"word": surf,
|
188 |
"lemma": lem,
|
189 |
+
"index": calculate_adjusted_index(i, position_map),
|
190 |
"inflectional_morpheme": inflect_type,
|
191 |
"morpheme_form": morpheme_form,
|
192 |
})
|
|
|
218 |
|
219 |
|
220 |
if __name__ == "__main__":
|
221 |
+
print(extract_inflectional_morphemes("And he [UH] [UM] the rabbit [UH] makes [UH] sand castle."))
|
morpheme_omission.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import json
|
3 |
import stanza
|
|
|
4 |
|
5 |
nlp = stanza.Pipeline(
|
6 |
lang="en",
|
@@ -66,9 +67,39 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
|
|
66 |
return None
|
67 |
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def extract_inflectional_morphemes(text: str):
|
70 |
|
71 |
-
|
|
|
|
|
72 |
results = []
|
73 |
|
74 |
for sent in doc.sentences:
|
@@ -85,7 +116,7 @@ def extract_inflectional_morphemes(text: str):
|
|
85 |
results.append({
|
86 |
"word": prev.text + surf,
|
87 |
"lemma": prev.lemma,
|
88 |
-
"index": i - 1,
|
89 |
"inflectional_morpheme": "Possessive"
|
90 |
if prev.upos in {"NOUN", "PROPN"} else "Contraction",
|
91 |
"morpheme_form": "'/s",
|
@@ -98,7 +129,7 @@ def extract_inflectional_morphemes(text: str):
|
|
98 |
results.append({
|
99 |
"word": prev.text + surf,
|
100 |
"lemma": prev.lemma,
|
101 |
-
"index": i - 1,
|
102 |
"inflectional_morpheme": "Contraction",
|
103 |
"morpheme_form": low_txt,
|
104 |
})
|
@@ -112,7 +143,7 @@ def extract_inflectional_morphemes(text: str):
|
|
112 |
results.append({
|
113 |
"word": surf,
|
114 |
"lemma": lem,
|
115 |
-
"index": i,
|
116 |
"inflectional_morpheme": "Possessive",
|
117 |
"morpheme_form": morpheme_form,
|
118 |
})
|
@@ -147,7 +178,7 @@ def extract_inflectional_morphemes(text: str):
|
|
147 |
results.append({
|
148 |
"word": surf,
|
149 |
"lemma": lem,
|
150 |
-
"index": i,
|
151 |
"inflectional_morpheme": inflect_type,
|
152 |
"morpheme_form": morpheme_form,
|
153 |
})
|
@@ -159,7 +190,9 @@ def extract_inflectional_morphemes(text: str):
|
|
159 |
|
160 |
def extract_morpheme_omissions(text: str):
|
161 |
|
162 |
-
|
|
|
|
|
163 |
omissions = []
|
164 |
|
165 |
for sent in doc.sentences:
|
@@ -191,7 +224,7 @@ def extract_morpheme_omissions(text: str):
|
|
191 |
omissions.append({
|
192 |
"word": surf,
|
193 |
"lemma": lem,
|
194 |
-
"index": i,
|
195 |
"inflectional_morpheme": inflect_type,
|
196 |
"morpheme_form": "<OMI>",
|
197 |
})
|
@@ -223,7 +256,7 @@ def annotate_morpheme_omission(session_id, base_dir="session_data"):
|
|
223 |
|
224 |
|
225 |
if __name__ == "__main__":
|
226 |
-
sample = "
|
227 |
print("Inflectional Morphemes:")
|
228 |
print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
|
229 |
print("\nMorpheme Omissions:")
|
|
|
1 |
import os
|
2 |
import json
|
3 |
import stanza
|
4 |
+
import re
|
5 |
|
6 |
nlp = stanza.Pipeline(
|
7 |
lang="en",
|
|
|
67 |
return None
|
68 |
|
69 |
|
70 |
+
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Strip bracketed tags (e.g. ``[UM]``) from *text* and record positions.

    The text is split on whitespace. A token is dropped when it begins with
    ``[`` and contains a later ``]`` (the pattern is anchored at the start of
    the token only, so trailing characters after the ``]`` are dropped with it).

    Returns:
        A ``(cleaned_text, position_map)`` pair. ``cleaned_text`` is the kept
        tokens joined by single spaces. ``position_map`` has one entry per
        original token: the token's index among the kept tokens, or ``-1`` if
        the token was dropped.
    """
    position_map: list[int] = []  # position_map[original_index] = cleaned_index
    cleaned_words: list[str] = []

    for word in text.split():
        if re.match(r'\[.*\]', word):
            # Dropped token: -1 is the "no cleaned position" sentinel.
            position_map.append(-1)
        else:
            position_map.append(len(cleaned_words))
            cleaned_words.append(word)

    return ' '.join(cleaned_words), position_map
|
86 |
+
|
87 |
+
|
88 |
+
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Translate an index in the cleaned text back to the original text.

    Args:
        cleaned_index: Position of a token among the kept (cleaned) tokens.
        position_map: One entry per original token holding its cleaned
            position, with ``-1`` marking tokens that were removed.

    Returns:
        The original index whose entry equals ``cleaned_index``. If there is
        no such entry, or ``cleaned_index`` is negative, ``cleaned_index`` is
        returned unchanged.
    """
    # NOTE(review): assumes cleaned_index counts whitespace-separated tokens of
    # the cleaned text; confirm the indices passed by callers use that scheme.

    # -1 is the "removed token" sentinel inside position_map, so a negative
    # query must never be matched against it (it would otherwise resolve to
    # the position of the first removed token).
    if cleaned_index < 0:
        return cleaned_index

    try:
        return position_map.index(cleaned_index)
    except ValueError:
        # Unknown position: fall back to the index as given.
        return cleaned_index
|
96 |
+
|
97 |
+
|
98 |
def extract_inflectional_morphemes(text: str):
|
99 |
|
100 |
+
cleaned_text, position_map = preprocess_text(text)
|
101 |
+
|
102 |
+
doc = nlp(cleaned_text)
|
103 |
results = []
|
104 |
|
105 |
for sent in doc.sentences:
|
|
|
116 |
results.append({
|
117 |
"word": prev.text + surf,
|
118 |
"lemma": prev.lemma,
|
119 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
120 |
"inflectional_morpheme": "Possessive"
|
121 |
if prev.upos in {"NOUN", "PROPN"} else "Contraction",
|
122 |
"morpheme_form": "'/s",
|
|
|
129 |
results.append({
|
130 |
"word": prev.text + surf,
|
131 |
"lemma": prev.lemma,
|
132 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
133 |
"inflectional_morpheme": "Contraction",
|
134 |
"morpheme_form": low_txt,
|
135 |
})
|
|
|
143 |
results.append({
|
144 |
"word": surf,
|
145 |
"lemma": lem,
|
146 |
+
"index": calculate_adjusted_index(i, position_map),
|
147 |
"inflectional_morpheme": "Possessive",
|
148 |
"morpheme_form": morpheme_form,
|
149 |
})
|
|
|
178 |
results.append({
|
179 |
"word": surf,
|
180 |
"lemma": lem,
|
181 |
+
"index": calculate_adjusted_index(i, position_map),
|
182 |
"inflectional_morpheme": inflect_type,
|
183 |
"morpheme_form": morpheme_form,
|
184 |
})
|
|
|
190 |
|
191 |
def extract_morpheme_omissions(text: str):
|
192 |
|
193 |
+
cleaned_text, position_map = preprocess_text(text)
|
194 |
+
|
195 |
+
doc = nlp(cleaned_text)
|
196 |
omissions = []
|
197 |
|
198 |
for sent in doc.sentences:
|
|
|
224 |
omissions.append({
|
225 |
"word": surf,
|
226 |
"lemma": lem,
|
227 |
+
"index": calculate_adjusted_index(i, position_map),
|
228 |
"inflectional_morpheme": inflect_type,
|
229 |
"morpheme_form": "<OMI>",
|
230 |
})
|
|
|
256 |
|
257 |
|
258 |
if __name__ == "__main__":
|
259 |
+
sample = "And he [UM] [UM] the rabbit [UM] [UH] [UH] make [UH] sand castle."
|
260 |
print("Inflectional Morphemes:")
|
261 |
print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
|
262 |
print("\nMorpheme Omissions:")
|