面试题答案
一键面试
import re
import html
# Matches one <custom - tag ...>...</custom - tag> element and captures its
# inner text. Quoted attribute values are consumed as whole units so that a
# '>' inside a value (e.g. attr='<x>') does not terminate the opening tag.
# The three alternatives start with disjoint characters, which keeps the
# attribute scan free of ambiguous backtracking.
_CUSTOM_TAG_RE = re.compile(
    r"<custom - tag"
    r"""(?:"[^"]*"|'[^']*'|[^'">])*"""
    r">(.*?)</custom - tag>",
    re.DOTALL,
)


def extract_custom_tag_text(html_code: str):
    """Return the inner text of every ``<custom - tag>`` element.

    Attributes on the opening tag are skipped; they may be unquoted,
    single-quoted or double-quoted, may contain spaces, ``=`` or ``>`` inside
    quotes, and may be bare (no ``=``). HTML entities in the inner text are
    decoded with ``html.unescape``. Inner text may span multiple lines.

    Args:
        html_code: Markup to scan.

    Returns:
        A list of decoded inner-text strings, in document order. Empty when
        no element matches.

    Note:
        An opening tag with an unterminated quote in its attributes is treated
        as malformed and is not matched.
    """
    # findall yields the single capture group (the inner text) per element.
    return [html.unescape(text) for text in _CUSTOM_TAG_RE.findall(html_code)]
你可以这样调用函数:
# Usage example: a sample document holding two custom tags, one of which has
# a quoted attribute value containing '<' and '>'. The list returned by
# extract_custom_tag_text is printed.
html_code = (
    "<custom - tag attr1='value1' attr2='<special char>'>text1</custom - tag>"
    "<custom - tag attr3='&'>text2</custom - tag>"
)
print(extract_custom_tag_text(html_code))