import htmlentitydefs as html
import re
def unescapeHTML(str):
    """Strip HTML tags from ``str`` and decode its character references.

    Decimal (``&#65;``), hexadecimal (``&#x41;``) and named (``&amp;``)
    references are decoded in a single pass, so text produced by one
    replacement is never decoded a second time (``&#38;lt;`` becomes
    ``&lt;``, not ``<``).  References that cannot be decoded -- unknown
    names or code points outside the valid range -- are left untouched
    instead of raising.

    Args:
        str: The HTML fragment.  Text input yields text.  A Python 2 byte
            string yields a byte string whose decoded characters are UTF-8
            encoded.

    Returns:
        The fragment with tags removed and references decoded.
    """
    # Local imports keep the function usable on Python 2 and Python 3,
    # where the entity table lives in differently named modules.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    # Never splice encoded bytes into text (or the reverse): the replacement
    # type follows the input type.
    wants_bytes = isinstance(str, bytes)

    def decode(match):
        decimal, hexadecimal, name = match.groups()
        try:
            if decimal:
                codepoint = int(decimal)
            elif hexadecimal:
                codepoint = int(hexadecimal, 16)
            else:
                codepoint = name2codepoint[name]
            char = to_char(codepoint)
            return char.encode('utf-8') if wants_bytes else char
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or invalid code point: keep the source text.
            return match.group(0)

    # "<.+?>" already matches closing tags, so no separate "</.+?>" branch.
    str = re.sub(r"<.+?>", '', str)
    return re.sub(r'&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(\w+));', decode, str)
# ======================================
# Alternative implementation using BeautifulSoup
import re, copy
from BeautifulSoup import BeautifulSoup
# Markup-massage rules handed to BeautifulSoup by unescapeHTML2: a copy of the
# parser's default rules plus one that rewrites hexadecimal character
# references (&#x41;) as decimal ones (&#65;).
hexentityMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
# "+=" extends the copy; a plain "=" would discard the defaults copied above.
# The replacement keeps the trailing ";" so a following digit cannot merge
# into the number ("&#x41;5" must not turn into "&#655").
hexentityMassage += [
    (re.compile(r'&#[xX]([0-9a-fA-F]+);'),
     lambda m: '&#%d;' % int(m.group(1), 16)),
]
def unescapeHTML2(str):
    """Strip HTML tags from ``str`` and decode entities via BeautifulSoup.

    The tag-stripped text is parsed with HTML entity conversion enabled and
    with ``hexentityMassage`` as the markup-massage rules; the ``.string`` of
    the first parsed node is returned.

    Args:
        str: The HTML fragment to clean.

    Returns:
        The ``.string`` of the first parsed node, or the tag-stripped input
        when parsing or reading that node fails (for example when there is
        no first node).
    """
    str = re.sub(r"<.+?>|</.+?>", '', str)
    try:
        soup = BeautifulSoup(
            str,
            convertEntities=BeautifulSoup.HTML_ENTITIES,
            markupMassage=hexentityMassage,
        )
        return soup.contents[0].string
    except Exception:
        # Best effort: fall back to the tag-stripped text.  Exception (not a
        # bare except) lets KeyboardInterrupt and SystemExit propagate.
        return str
import re
def unescapeHTML(str):
    """Strip HTML tags from ``str`` and decode its character references.

    Decimal (``&#65;``), hexadecimal (``&#x41;``) and named (``&amp;``)
    references are decoded in a single pass, so text produced by one
    replacement is never decoded a second time (``&#38;lt;`` becomes
    ``&lt;``, not ``<``).  References that cannot be decoded -- unknown
    names or code points outside the valid range -- are left untouched
    instead of raising.

    Args:
        str: The HTML fragment.  Text input yields text.  A Python 2 byte
            string yields a byte string whose decoded characters are UTF-8
            encoded.

    Returns:
        The fragment with tags removed and references decoded.
    """
    # Local imports keep the function usable on Python 2 and Python 3,
    # where the entity table lives in differently named modules.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    # Never splice encoded bytes into text (or the reverse): the replacement
    # type follows the input type.
    wants_bytes = isinstance(str, bytes)

    def decode(match):
        decimal, hexadecimal, name = match.groups()
        try:
            if decimal:
                codepoint = int(decimal)
            elif hexadecimal:
                codepoint = int(hexadecimal, 16)
            else:
                codepoint = name2codepoint[name]
            char = to_char(codepoint)
            return char.encode('utf-8') if wants_bytes else char
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or invalid code point: keep the source text.
            return match.group(0)

    # "<.+?>" already matches closing tags, so no separate "</.+?>" branch.
    str = re.sub(r"<.+?>", '', str)
    return re.sub(r'&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(\w+));', decode, str)
# ======================================
# Alternative implementation using BeautifulSoup
import re, copy
from BeautifulSoup import BeautifulSoup
# Markup-massage rules handed to BeautifulSoup by unescapeHTML2: a copy of the
# parser's default rules plus one that rewrites hexadecimal character
# references (&#x41;) as decimal ones (&#65;).
hexentityMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
# "+=" extends the copy; a plain "=" would discard the defaults copied above.
# The replacement keeps the trailing ";" so a following digit cannot merge
# into the number ("&#x41;5" must not turn into "&#655").
hexentityMassage += [
    (re.compile(r'&#[xX]([0-9a-fA-F]+);'),
     lambda m: '&#%d;' % int(m.group(1), 16)),
]
def unescapeHTML2(str):
    """Strip HTML tags from ``str`` and decode entities via BeautifulSoup.

    The tag-stripped text is parsed with HTML entity conversion enabled and
    with ``hexentityMassage`` as the markup-massage rules; the ``.string`` of
    the first parsed node is returned.

    Args:
        str: The HTML fragment to clean.

    Returns:
        The ``.string`` of the first parsed node, or the tag-stripped input
        when parsing or reading that node fails (for example when there is
        no first node).
    """
    str = re.sub(r"<.+?>|</.+?>", '', str)
    try:
        soup = BeautifulSoup(
            str,
            convertEntities=BeautifulSoup.HTML_ENTITIES,
            markupMassage=hexentityMassage,
        )
        return soup.contents[0].string
    except Exception:
        # Best effort: fall back to the tag-stripped text.  Exception (not a
        # bare except) lets KeyboardInterrupt and SystemExit propagate.
        return str
Nhận xét
Đăng nhận xét