Citelab / test.py
SHEN1017's picture
Upload 97 files
96b6673 verified
import re
from bs4 import BeautifulSoup
def parse_html_prompt(input_str):
soup = BeautifulSoup(input_str, "html.parser")
# 处理 <p></p> 内的内容
p_content = soup.find("p").decode_contents().replace("<br>", "\n")
p_content = re.sub(r'<span[^>]*>(.*?)</span>', r'<\1>', p_content)
template = p_content.strip().replace(' <br/>', '').replace(' ', '').replace('<br/>', '')
# 解析 component-item
components = {}
for item in soup.find_all("div", class_="component-item"):
key_span = item.find("div", class_="component-key").find("span")
key = key_span.get_text(strip=True) if key_span else ""
value_div = item.find("div", class_="component-value")
value_content = value_div.decode_contents()
value_content = re.sub(r'<span[^>]*>(.*?)</span>', r'{\1}', value_content)
components[key] = value_content.strip().replace(' <br/>', '').replace('<br/>', '')
# 解析 self-info-item
self_prompt = {}
for item in soup.find_all("div", class_="self-info-item"):
key_span = item.find("div", class_="component-key").find("span")
key = key_span.get_text(strip=True) if key_span else ""
value_div = item.find("div", class_="component-value")
value = value_div.get_text(strip=True) if value_div else ""
self_prompt[key] = value.replace(' <br/>', '').replace('<br/>', '')
return {
'template': template,
'components': components,
'self_prompt': self_prompt
}
#print(parse_html(info))