# -*- coding: utf-8 -*-
"""
:mod:`webstruct.feature_extraction` contains classes that help
with:
- converting HTML pages into lists of feature dicts and
- extracting annotations.
Usually, the approach is the following:
1. Extract text from the webpage and tokenize it, preserving information
about token position in original HTML tree
(token + its tree position = :class:`HtmlToken`).
Information about annotations (if present) is split from the rest
of data at this stage. :class:`HtmlTokenizer` is used for extracting
HTML tokens and annotation tags.
2. Run a number of "token feature functions" that return bits of information
about each token: token text, token shape (uppercased/lowercased/...),
whether token is in ``<a>`` HTML element, etc. For each token information
is combined into a single feature dictionary.
Use :class:`HtmlFeatureExtractor` at this stage. There are a number of
predefined token feature functions in :mod:`webstruct.features`.
3. Run a number of "global feature functions" that can modify token feature
dicts inplace (insert new features, change, remove them) using "global"
information - information about all other tokens in a document and their
existing token-level feature dicts. Global feature functions are applied
sequentially: subsequent global feature functions get feature dicts updated
by previous feature functions.
This is also done by :class:`HtmlFeatureExtractor`.
:class:`~webstruct.features.utils.LongestMatchGlobalFeature` can be used
to create features that capture multi-token patterns. Some predefined
global feature functions can be found in :mod:`webstruct.gazetteers`.
"""
from __future__ import absolute_import
import re
import copy
from itertools import chain, groupby
from collections import namedtuple, Counter
from lxml.etree import XPathEvaluator
from sklearn.base import BaseEstimator, TransformerMixin
from webstruct.sequence_encoding import IobEncoder
from webstruct.tokenizers import tokenize
from webstruct.features import CombinedFeatures
from webstruct.utils import replace_html_tags, kill_html_tags, smart_join
_HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail')


class HtmlToken(_HtmlToken):
    """
    HTML token info.

    Attributes:

    * :attr:`index` is a token index (in the :attr:`tokens` list)
    * :attr:`tokens` is a list of all tokens in current html block
    * :attr:`elem` is the current html block (as lxml's Element) - most
      likely you want :attr:`parent` instead of it
    * :attr:`is_tail` flag indicates that token belongs to element tail

    Computed properties:

    * :attr:`token` is the current token (as text);
    * :attr:`parent` is token's parent HTML element (as lxml's Element);
    * :attr:`root` is an ElementTree this token belongs to.
    """

    @property
    def token(self):
        """The token at position :attr:`index` of :attr:`tokens`."""
        return self.tokens[self.index]

    @property
    def parent(self):
        """
        Element that contains this token: :attr:`elem` itself for
        tokens taken from the element text, ``elem.getparent()`` for
        tokens taken from the element tail.
        """
        if not self.is_tail:
            return self.elem
        # Tail text follows the element's closing tag, so it is contained
        # in the parent of ``elem`` rather than in ``elem`` itself.
        return self.elem.getparent()

    @property
    def root(self):
        """Result of ``elem.getroottree()`` for this token's element."""
        return self.elem.getroottree()
class HtmlTokenizer(object):
    """
    Class for converting HTML trees (returned by one of the
    :mod:`webstruct.loaders`) into lists of :class:`HtmlToken` instances
    and associated tags. Also, it can do the reverse conversion.

    Use :meth:`tokenize_single` to convert a single tree and :meth:`tokenize`
    to convert multiple trees.

    Use :meth:`detokenize_single` to get an annotated tree out of a list
    of :class:`HtmlToken` instances and a list of tags.

    Parameters
    ----------

    tagset : set, optional
        A set of entity types to keep. If not passed, all entity types are kept.
        Use this argument to discard some entity types from training data.
    sequence_encoder : object, optional
        Sequence encoder object. If not passed,
        :class:`~webstruct.sequence_encoding.IobEncoder` instance is created.
    text_tokenize_func : callable, optional
        Function used for tokenizing text inside HTML elements.
        By default, :class:`HtmlTokenizer` uses
        :func:`webstruct.tokenizers.tokenize`.
    kill_html_tags: set, optional
        A set of HTML tags which should be removed. Contents inside
        removed tags is not removed. See :func:`webstruct.utils.kill_html_tags`
    replace_html_tags: dict, optional
        A mapping ``{'old_tagname': 'new_tagname'}``. It defines how tags
        should be renamed. See :func:`webstruct.utils.replace_html_tags`
    ignore_html_tags: set, optional
        A set of HTML tags which won't produce :class:`HtmlToken` instances,
        but will be kept in a tree. Default is ``{'script', 'style'}``.

    """

    # Text type of the running interpreter: ``unicode`` on Python 2,
    # ``str`` on Python 3. Used instead of the Python-2-only ``unicode`` name.
    _text_type = type(u'')

    def __init__(self, tagset=None, sequence_encoder=None, text_tokenize_func=None,
                 kill_html_tags=None, replace_html_tags=None, ignore_html_tags=None):
        self.tagset = set(tagset) if tagset is not None else None
        self.text_tokenize_func = text_tokenize_func or tokenize
        self.kill_html_tags = kill_html_tags
        self.replace_html_tags = replace_html_tags

        if ignore_html_tags is not None:
            self.ignore_html_tags = set(ignore_html_tags)
        else:
            self.ignore_html_tags = {'script', 'style'}

        # FIXME: don't use shared instance of sequence encoder
        # because sequence encoder is stateful
        self.sequence_encoder = sequence_encoder or IobEncoder()

        # Regex matching a special annotation token delimited by whitespace
        # or by the string boundaries; built from the encoder's own tag regex.
        tag_pattern = self.sequence_encoder.token_processor.tag_re.pattern.strip()
        self._tag_re = re.compile(r"(^|\s)%s(\s|$)" % tag_pattern)

    def tokenize_single(self, tree):
        """
        Return two lists:

        * a list of HtmlToken tokens;
        * a list of associated tags.

        For unannotated HTML all tags will be "O" - they may be ignored.

        The passed ``tree`` is not modified: processing is done on a deep copy.

        Example:

            >>> from webstruct import GateLoader, HtmlTokenizer
            >>> loader = GateLoader(known_entities={'PER'})
            >>> html_tokenizer = HtmlTokenizer(replace_html_tags={'b': 'strong'})
            >>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>")
            >>> html_tokens, tags = html_tokenizer.tokenize_single(tree)
            >>> html_tokens  # doctest: +ELLIPSIS
            [HtmlToken(index=0, tokens=[u'hello', u'John'], elem=<Element p at ...>, is_tail=False), HtmlToken...]
            >>> tags
            ['O', u'B-PER', u'I-PER', u'B-PER', 'O']
            >>> for tok, iob_tag in zip(html_tokens, tags):
            ...     print("%5s %s %s %s" % (iob_tag, tok.token, tok.elem.tag, tok.parent.tag))
                O hello p p
            B-PER John p p
            I-PER Doe strong strong
            B-PER Mary br p
                O said br p

        For HTML without text it returns empty lists::

            >>> html_tokenizer.tokenize_single(loader.loadbytes(b'<p></p>'))
            ([], [])

        """
        tree = copy.deepcopy(tree)
        self.sequence_encoder.reset()
        self._prepare_tree(tree)

        # Materialize the pairs first: ``zip`` may return an iterator, which
        # is always truthy and cannot be indexed.
        pairs = list(self._process_tree(tree))
        if not pairs:
            return ([], [])
        html_tokens, tags = zip(*pairs)
        return list(html_tokens), list(tags)

    def tokenize(self, trees):
        """
        Apply :meth:`tokenize_single` to each tree in ``trees``.

        Return ``(X, y)`` where ``X`` is a list of HtmlToken lists and ``y``
        is a list of tag lists, one entry per input tree.
        """
        X, y = [], []
        for tree in trees:
            html_tokens, tags = self.tokenize_single(tree)
            X.append(html_tokens)
            y.append(tags)
        return X, y

    def detokenize_single(self, html_tokens, tags):
        """
        Build annotated ``lxml.etree.ElementTree`` from
        ``html_tokens`` (a list of :class:`.HtmlToken` instances)
        and ``tags`` (a list of their tags).

        Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__``
        text tokens (this is the format :mod:`webstruct.loaders` use).

        The tree the tokens belong to is not modified; a deep copy is
        annotated and returned. ``None`` is returned when ``html_tokens``
        is empty.

        Raises :class:`ValueError` if ``html_tokens`` and ``tags`` have
        different lengths.
        """
        if len(html_tokens) != len(tags):
            raise ValueError("len(html_tokens) must be equal to len(tags)")

        if not html_tokens:
            return None

        orig_tree = html_tokens[0].root
        tree = copy.deepcopy(orig_tree)
        xpatheval = XPathEvaluator(tree)

        # find starts/ends of token groups
        token_groups = self.sequence_encoder.group(list(zip(html_tokens, tags)))
        starts, ends = set(), set()
        pos = 0
        for gr_tokens, gr_tag in token_groups:
            n_tokens = len(gr_tokens)
            if gr_tag != 'O':
                starts.add(pos)
                ends.add(pos + n_tokens - 1)
            pos += n_tokens

        # mark starts/ends with special tokens
        data = zip(html_tokens, tags, range(len(html_tokens)))
        keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail)

        for (orig_elem, is_tail), group in groupby(data, keyfunc):
            group = list(group)
            changed = False
            # Work on a copy so the token lists shared by HtmlToken
            # instances stay intact.
            tokens = group[0][0].tokens[:]
            for token, tag, token_idx in group:
                # ``tag[2:]`` drops the two-character IOB prefix ("B-"/"I-").
                if token_idx in starts:
                    tokens[token.index] = ' __START_%s__ %s' % (tag[2:], tokens[token.index])
                    changed = True
                if token_idx in ends:
                    tokens[token.index] = '%s __END_%s__ ' % (tokens[token.index], tag[2:])
                    changed = True

            if changed:
                # Locate the counterpart of ``orig_elem`` in the copied tree
                # through its XPath in the original tree.
                xpath = orig_tree.getpath(orig_elem)
                elem = xpatheval(xpath)[0]
                if is_tail:
                    elem.tail = smart_join(tokens)
                else:
                    elem.text = smart_join(tokens)

        return tree

    def _prepare_tree(self, tree):
        """Apply the configured tag removal and tag renaming to ``tree``."""
        if self.kill_html_tags:
            kill_html_tags(tree, self.kill_html_tags, keep_child=True)

        if self.replace_html_tags:
            replace_html_tags(tree, self.replace_html_tags)

    def _process_tree(self, tree):
        """
        Yield ``(HtmlToken, tag)`` pairs for ``tree`` in document order:
        element text, then children (recursively), then element tail.

        Special annotation tokens are removed from the element's text and
        tail once the element has been fully processed.
        """
        if tree.tag in self.ignore_html_tags:
            # NOTE(review): returning here also skips the tail text of the
            # ignored element - confirm this is intended.
            return

        head_tokens, head_tags = self._tokenize_and_split(tree.text)
        for index, (_token, tag) in enumerate(zip(head_tokens, head_tags)):
            yield HtmlToken(index, head_tokens, tree, False), tag

        for child in tree:  # where is my precious "yield from"?
            for html_token, tag in self._process_tree(child):
                yield html_token, tag

        tail_tokens, tail_tags = self._tokenize_and_split(tree.tail)
        for index, (_token, tag) in enumerate(zip(tail_tokens, tail_tags)):
            yield HtmlToken(index, tail_tokens, tree, True), tag

        self._cleanup_elem(tree)

    def _cleanup_elem(self, elem):
        """ Remove special tokens from elem """
        if elem.text:
            elem.text = self._tag_re.sub("", elem.text)
        if elem.tail:
            elem.tail = self._tag_re.sub("", elem.tail)

    def _tokenize_and_split(self, text):
        """
        Tokenize ``text`` (``None`` is treated as an empty string), drop
        annotation tokens excluded by ``tagset``, convert tokens to the
        text type and return ``sequence_encoder.encode_split(tokens)``.
        """
        input_tokens = self._limit_tags(self.text_tokenize_func(text or ''))
        input_tokens = [self._text_type(tok) for tok in input_tokens]
        return self.sequence_encoder.encode_split(input_tokens)

    def _limit_tags(self, input_tokens):
        """
        Return ``input_tokens`` without the start/end annotation tokens
        whose entity type is not in ``tagset``.

        When ``tagset`` is ``None`` the input is returned unchanged.
        """
        if self.tagset is None:
            return input_tokens

        # The tokens are traversed twice below, so a one-shot iterable
        # (e.g. a generator returned by the tokenizer) must be materialized.
        input_tokens = list(input_tokens)
        proc = self.sequence_encoder.token_processor
        token_classes = [proc.classify(tok) for tok in input_tokens]
        return [
            tok for (tok, (typ, value)) in zip(input_tokens, token_classes)
            if not (typ in {'start', 'end'} and value not in self.tagset)
        ]