from lxml.html import tostring
import lxml.html
import re
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
from .compat import str_
# Module-level lxml HTML parser configured for UTF-8 encoded input.
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
# Parse HTML text into a document tree
def build_doc(page):
    """Parse *page* into an lxml HTML document.

    Returns a ``(doc, encoding)`` tuple. ``encoding`` is ``None`` when *page*
    is already text (an instance of ``str_``); otherwise it is whatever
    ``get_encoding`` reports, or ``"utf-8"`` when that yields nothing.
    """
    if not isinstance(page, str_):
        # Byte input: look up its encoding (UTF-8 by default) and decode it,
        # replacing undecodable bytes.
        encoding = get_encoding(page) or "utf-8"
        text = page.decode(encoding, "replace")
    else:
        # Text input is used as-is; no encoding detection takes place.
        encoding = None
        text = page

    # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
    utf8_bytes = text.encode("utf-8", "replace")
    doc = lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser)
    return doc, encoding
# JS-style regex replacement helper
def js_re(src, pattern, flags, repl):
    """Replace matches of *pattern* in *src* using a JS-style replacement template.

    Args:
        src: the string to search.
        pattern: regular expression source.
        flags: ``re`` flags handed to ``re.compile``.
        repl: replacement template; every ``$`` is rewritten to a backslash
            so that ``$1`` becomes the Python back-reference ``\\1``. Other
            JS template forms (``$&``, ``$$``) get no special treatment.

    Returns:
        *src* with every match substituted.
    """
    # Pattern.sub takes (repl, string): the replacement template goes first,
    # the subject string second.
    return re.compile(pattern, flags).sub(repl.replace("$", "\\"), src)
# Normalize entities:
# replace some Unicode characters and HTML entities with ASCII equivalents
def normalize_entities(cur_title):
    """Replace typographic characters and a few HTML entities with plain ASCII.

    Dashes (U+2014, U+2013, ``&mdash;``, ``&ndash;``) become ``-``, the
    non-breaking space (U+00A0) becomes a regular space, and guillemets
    (U+00AB, U+00BB) as well as ``&quot;`` become a double quote.
    """
    # The entity keys are spelled out as entity text. Written as the decoded
    # characters they would merely duplicate the escapes above and, for the
    # double quote, yield an invalid string literal.
    entities = {
        u"\u2014": "-",
        u"\u2013": "-",
        u"&mdash;": "-",
        u"&ndash;": "-",
        u"\u00A0": " ",
        u"\u00AB": '"',
        u"\u00BB": '"',
        u"&quot;": '"',
    }
    for needle, replacement in entities.items():
        cur_title = cur_title.replace(needle, replacement)
    return cur_title
# Normalize a title = normalize whitespace, then normalize entities
def norm_title(title):
    """Return *title* passed through ``normalize_spaces`` and then ``normalize_entities``."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)
def get_title(doc):
    """Return the normalized text of the document's ``<title>``.

    Falls back to the placeholder ``"[no-title]"`` when there is no
    ``<title>`` element or it has no text.
    """
    # Look up the `<title>` node
    node = doc.find(".//title")
    # Missing element, or missing/empty text: return the placeholder
    if node is None or not node.text:
        return "[no-title]"
    # Normalize the title and return it
    return norm_title(node.text)
# Get the author
def get_author(doc):
    """Return the ``content`` of ``<meta name='author'>``.

    Falls back to the placeholder ``"[no-author]"`` when the element is
    missing, has no ``content`` attribute, or the attribute is empty.
    """
    # Look up `<meta name='author'>`
    meta = doc.find(".//meta[@name='author']")
    if meta is None:
        return "[no-author]"
    # An absent attribute (None) and an empty string both mean "no author"
    content = meta.get('content')
    if not content:
        return "[no-author]"
    # Return the value of the `content` attribute
    return content
def add_match(collection, text, orig):
    """Add the normalized *text* to *collection* if it matches part of *orig*.

    The normalized text is accepted only when it has at least two
    whitespace-separated words and at least 15 characters and, with double
    quotes ignored on both sides, is a substring of *orig*.
    """
    candidate = norm_title(text)
    # Too short to be a meaningful title fragment
    if len(candidate) < 15 or len(candidate.split()) < 2:
        return
    # Must occur inside the original title (double quotes ignored)
    if candidate.replace('"', "") not in orig.replace('"', ""):
        return
    collection.add(candidate)
# CSS selectors for elements in the page body that are candidates for holding the title
TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]
# Get a shortened title
def shorten_title(doc):
    """Return a shortened version of the document's ``<title>``.

    Returns ``""`` when there is no ``<title>`` or it has no text. Otherwise
    the text of ``<h1>``-``<h3>`` elements and of elements matching
    ``TITLE_CSS_HEURISTICS`` is offered to ``add_match``; if any candidate is
    collected, the longest one wins. With no candidates, the normalized title
    is split on common "title | site name" delimiters. The result is used
    only if its length is strictly between 15 and 150 characters; otherwise
    the full normalized title is returned.
    """
    # Look up the `<title>` node; with no node or no text, return an empty string
    title_node = doc.find(".//title")
    if title_node is None or not title_node.text:
        return ""

    # Normalized title; `orig` stays untouched as the fallback
    orig = norm_title(title_node.text)
    title = orig

    # Set of title candidates
    candidates = set()

    def consider(element):
        # Offer both the element's own leading text and its full text content
        if element.text:
            add_match(candidates, element.text, orig)
        full_text = element.text_content()
        if full_text:
            add_match(candidates, full_text, orig)

    # Every `<h1>`, `<h2>` and `<h3>`
    for xpath in (".//h1", ".//h2", ".//h3"):
        for element in doc.iterfind(xpath):
            consider(element)

    # Every element matched by a title heuristic selector
    for selector in TITLE_CSS_HEURISTICS:
        for element in doc.cssselect(selector):
            consider(element)

    if candidates:
        # Candidates found: take the longest one as the title
        title = sorted(candidates, key=len)[-1]
    else:
        # Separate the article title from the site name, as in
        # `<title>text title | site name</title>`
        picked = None
        for delimiter in (" | ", " - ", " :: ", " / "):
            if delimiter not in orig:
                continue
            pieces = orig.split(delimiter)
            # Decide whether the site name comes before or after the title:
            # take the first piece if it has four or more words, otherwise
            # the last piece if that one does
            if len(pieces[0].split()) >= 4:
                picked = pieces[0]
                break
            if len(pieces[-1].split()) >= 4:
                picked = pieces[-1]
                break

        if picked is not None:
            title = picked
        elif ": " in orig:
            # No delimiter produced a title: fall back to splitting on ": "
            tail = orig.split(": ")[-1]
            if len(tail.split()) >= 4:
                title = tail
            else:
                title = orig.split(": ", 1)[1]

    # Only accept a shortened title of reasonable length
    return title if 15 < len(title) < 150 else orig
# Get a tidied-up version of the body
# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
    """Return the serialized document body with scripts, styles and unwanted attributes removed.

    ``<script>``, ``<link>`` and ``<style>`` elements are dropped from *doc*
    in place. Then ``doc.body`` (or *doc* itself when ``doc.body`` is
    ``None``) is serialized with ``tostring`` and the markup is passed
    through ``clean_attributes``.
    """
    # Remove `<script>`, `<link>` and `<style>`
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()

    # Compare against None explicitly: an lxml element with no child elements
    # is falsy, so `doc.body or doc` would skip a body holding only text and
    # serialize the whole document instead.
    body = doc.body
    root = doc if body is None else body

    # FIXME: isn't better to use tounicode?
    raw_html = tostring(root)
    # tostring() may hand back bytes; convert to text before cleaning
    if isinstance(raw_html, bytes):
        raw_html = raw_html.decode()

    # Strip unwanted attributes from the serialized markup.
    # NOTE: removing attributes before serializing would arguably be cleaner.
    return clean_attributes(raw_html)