Convert HTML Entities to normal text

import htmlentitydefs as html
import re

def unescapeHTML(str):
    """Strip HTML tags from *str* and decode its character entities.

    Decimal (``&#233;``), hexadecimal (``&#xE9;``) and named (``&eacute;``)
    references are decoded in a single pass, so text produced by decoding
    one entity is never decoded a second time (``&amp;lt;`` becomes
    ``&lt;``, not ``<``).  Unknown entity names and code points that cannot
    be represented are left in the text unchanged instead of raising.

    Runs on Python 2 and Python 3.  Text input yields text; a Python 2 byte
    string yields a byte string in which decoded characters are encoded as
    UTF-8.  Python 3 ``bytes`` input is not supported.

    Note: the parameter name shadows the ``str`` builtin; it is kept so
    existing keyword callers keep working.
    """
    # Function-local imports keep the block independent of which entity
    # module the running interpreter provides.
    try:
        from html.entities import name2codepoint  # Python 3
    except ImportError:
        from htmlentitydefs import name2codepoint  # Python 2

    try:
        to_char = unichr  # Python 2: chr() there only covers 0-255
    except NameError:
        to_char = chr  # Python 3

    wants_bytes = isinstance(str, bytes)

    def decode_entity(match):
        hex_digits, dec_digits, name = match.groups()
        try:
            if hex_digits:
                codepoint = int(hex_digits, 16)
            elif dec_digits:
                codepoint = int(dec_digits)
            else:
                codepoint = name2codepoint[name]
            char = to_char(codepoint)
        except (KeyError, ValueError, OverflowError):
            # Unknown name or unrepresentable code point: keep the raw text.
            return match.group(0)
        return char.encode('utf-8') if wants_bytes else char

    # '<.+?>' already matches closing tags, so no separate '</.+?>' branch.
    without_tags = re.sub(r"<.+?>", '', str)
    return re.sub(
        r"&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(\w+));",
        decode_entity,
        without_tags,
    )

======================================
# Alternative implementation using BeautifulSoup
import re, copy
from BeautifulSoup import BeautifulSoup

# Markup "massage" rules passed to BeautifulSoup: the library's default rules
# plus one extra rule that rewrites hexadecimal character references
# (e.g. &#xE9;) as decimal ones (&#233;).
# Work on a copy so BeautifulSoup.MARKUP_MASSAGE itself is never mutated.
hexentityMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
hexentityMassage.append(
    # Only real hex digits are captured, so int(..., 16) cannot fail, and the
    # trailing ';' keeps the rewritten reference properly terminated.
    (re.compile('&#[xX]([0-9a-fA-F]+);'), lambda m: '&#%d;' % int(m.group(1), 16))
)

def unescapeHTML2(str):
    """Strip HTML tags from *str* and decode its entities via BeautifulSoup.

    The tag-stripped text is parsed with ``convertEntities`` set to
    ``BeautifulSoup.HTML_ENTITIES`` and the module-level ``hexentityMassage``
    rules; the ``.string`` of the first top-level node is returned.

    This is best-effort: if parsing fails for any reason (including empty
    input, which has no first node), the tag-stripped text is returned
    unchanged.

    Note: the parameter name shadows the ``str`` builtin; it is kept so
    existing keyword callers keep working.
    """
    # '<.+?>' already matches closing tags, so no separate '</.+?>' branch.
    str = re.sub(r"<.+?>", '', str)
    try:
        soup = BeautifulSoup(
            str,
            convertEntities=BeautifulSoup.HTML_ENTITIES,
            markupMassage=hexentityMassage,
        )
        return soup.contents[0].string
    except Exception:
        # Deliberate fallback, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return str

Long Hoàng Giang

Tìm kiếm Blog này

Convert HTML Entities to normal text

Nhận xét

Đăng nhận xét

Bài đăng phổ biến từ blog này

firefox 4 get tabs in title bar | remove title caption firefox ubuntu

Tạo SVN Server & import project lên svn

unicode with python