"""
:mod:`webstruct.webannotator` provides functions for working with HTML
pages annotated with the WebAnnotator_ Firefox extension.

.. _WebAnnotator: https://github.com/xtannier/WebAnnotator
"""
from __future__ import absolute_import
import re
import warnings
import random
from copy import deepcopy
from collections import defaultdict, OrderedDict
import xml.sax.handler
import lxml.sax
from lxml import html
from lxml.etree import Element, LXML_VERSION
from webstruct.utils import html_document_fromstring
# Predefined (foreground, background) colour pairs; ``_get_colors`` picks
# one of them by position.
DEFAULT_COLORS = [
    # black foreground
    ("#000000", "#33CCFF"),
    ("#000000", "#FF0000"),
    ("#000000", "#33FF33"),
    ("#000000", "#CC66CC"),
    ("#000000", "#FF9900"),
    ("#000000", "#99FFFF"),
    ("#000000", "#FF6666"),
    ("#000000", "#66FF99"),
    # white foreground
    ("#FFFFFF", "#3333FF"),
    ("#FFFFFF", "#660000"),
    ("#FFFFFF", "#006600"),
    ("#FFFFFF", "#663366"),
    ("#FFFFFF", "#993300"),
    ("#FFFFFF", "#336666"),
    ("#FFFFFF", "#666600"),
    ("#FFFFFF", "#009900"),
]
def _get_colors(index):
    """
    Return a ``(foreground, background)`` colour pair for position ``index``.

    The pair is taken from ``DEFAULT_COLORS`` when that lookup succeeds.
    When it raises ``IndexError`` a random pair is generated instead: the
    foreground is black or white and the background is a random
    ``#RRGGBB`` value.
    """
    try:
        return DEFAULT_COLORS[index]
    except IndexError:
        fg = random.choice(["#000000", "#FFFFFF"])
        # Each hex digit must be listed exactly once; a duplicated digit
        # would be drawn twice as often as the others.
        bg = "#" + "".join(random.choice("0123456789ABCDEF") for _ in range(6))
        return fg, bg
class EntityColors(defaultdict):
    """
    ``{"entity_name": ("fg_color", "bg_color", entity_index)}`` mapping
    that generates entries for new entities on first access.

    Keyword arguments given to the constructor become the initial entries.
    ``next_index`` holds the index that the next generated entry receives;
    it starts at the number of initial entries.
    """

    def __init__(self, **kwargs):
        self.next_index = len(kwargs)
        super(EntityColors, self).__init__(self._new_item_factory, **kwargs)

    def _new_item_factory(self):
        """ Build the ``(fg, bg, index)`` entry for a not-yet-seen entity. """
        index = self.next_index
        fg, bg = _get_colors(index)
        self.next_index = index + 1
        return fg, bg, index

    @classmethod
    def from_htmlfile(cls, path, encoding=None):
        """ Load the color mapping from WebAnnotator-annotated HTML file """
        with open(path, 'rb') as f:
            return cls.from_htmlbytes(f.read(), encoding=encoding)

    @classmethod
    def from_htmlbytes(cls, html_bytes, encoding=None):
        """
        Load the color mapping from the ``<wa-color>`` elements of
        WebAnnotator-annotated HTML passed as bytes.

        Each ``<wa-color>`` contributes one entry keyed by its ``type``
        attribute, with the ``fg`` and ``bg`` attributes as colours and the
        number following ``WA-color-`` in its ``id`` as the entity index.

        Raises ``ValueError`` if an ``id`` attribute is missing, does not
        start with ``WA-color-`` (case-insensitive) or does not end with
        an integer.
        """
        prefix = 'wa-color-'
        colors = cls()
        tree = html_document_fromstring(html_bytes, encoding=encoding)
        for wa_color in tree.xpath('//wa-color'):
            color_id = wa_color.get('id') or ''
            if not color_id.lower().startswith(prefix):
                raise ValueError(
                    "unexpected <wa-color> id: %r" % wa_color.get('id')
                )
            idx = int(color_id[len(prefix):])
            fg = wa_color.get('fg')
            bg = wa_color.get('bg')
            typ = wa_color.get('type')
            colors[typ] = (fg, bg, idx)
            # Entries stored directly bypass the factory, so advance the
            # counter by hand; otherwise the next generated entity would
            # reuse an index that was just loaded.
            colors.next_index = max(colors.next_index, idx + 1)
        return colors
def apply_wa_title(tree):
    """
    Replace page's ``<title>`` contents with a contents of
    ``<wa-title>`` element and remove ``<wa-title>`` tag.

    WebAnnotator > 1.14 allows annotation of ``<title>`` contents;
    it is stored after body in ``<wa-title>`` elements.

    Only the first ``<wa-title>`` found is used. If the page has no
    ``<title>``, that ``<wa-title>`` is simply dropped. The tree is
    modified in place and nothing is returned.
    """
    wa_titles = tree.xpath('//wa-title')
    if not wa_titles:
        return
    wa_title = wa_titles[0]

    titles = tree.xpath('//title')
    if not titles:
        wa_title.drop_tree()
        return

    # Move <wa-title> into the slot of the original <title>, then rename it.
    old_title = titles[0]
    parent = old_title.getparent()
    position = parent.index(old_title)
    parent.insert(position, wa_title)
    old_title.drop_tree()
    wa_title.tag = 'title'
class _WaContentHandler(xml.sax.handler.ContentHandler):
    """
    SAX handler that copies the incoming document into a new lxml tree
    (``self.out``) and replaces ``__START_<entity>__`` / ``__END_<entity>__``
    text markers with WebAnnotator ``<span>`` elements.

    Text is buffered in :meth:`characters` and processed at the next element
    start or end. While an entity is open, the current ``<span>`` is closed
    before every element boundary and a new one is opened right after it,
    so an annotation that crosses element boundaries is emitted as several
    spans carrying the same ``wa-id``.
    """

    # Splits text around markers, swallowing at most one whitespace
    # character on each side of a marker.
    TAG_SPLIT_RE = re.compile(r'\s?(__(?:START|END)_(?:\w+)__)\s?')
    # Extracts the event (START/END) and the entity name from a marker.
    TAG_PARSE_RE = re.compile(r'__(START|END)_(\w+)__')

    def __init__(self, entity_colors=None):
        self.idx = 0            # value of "wa-id" for the current annotation
        self.entity = None      # name of the entity currently open, if any
        self.text_buf = []      # character data not yet written to self.out
        self.out = lxml.sax.ElementTreeContentHandler(makeelement=html.Element)
        self.entity_colors = (
            EntityColors() if entity_colors is None else entity_colors
        )

    def startElementNS(self, name, qname, attributes):
        self._flush()
        self._closeSpan()
        self.out.startElementNS(name, qname, attributes)
        self._openSpan()

    def endElementNS(self, name, qname):
        self._flush()
        self._closeSpan()
        self.out.endElementNS(name, qname)
        self._openSpan()

    def characters(self, data):
        self.text_buf.append(data)

    def startDocument(self):
        self.out.startDocument()

    def endDocument(self):
        self.out.endDocument()

    def _flush(self):
        """ Write buffered text to the output, acting on any markers in it. """
        self.text = ''.join(self.text_buf)
        self.text_buf = []
        if not self.text:
            return

        for piece in self.TAG_SPLIT_RE.split(self.text):
            marker = self.TAG_PARSE_RE.match(piece.strip())
            if marker is None:
                # ordinary text
                self.out.characters(piece)
                continue

            event, entity = marker.groups()
            if event == 'START':
                self.entity = entity
                self._openSpan()
            elif event == 'END':
                assert entity == self.entity
                self._closeSpan()
                self.idx += 1
                self.entity = None

    def _closeSpan(self):
        """ Close the current ``<span>`` if an entity is open. """
        if not self.entity:
            return
        self.out.endElement('span')

    def _openSpan(self):
        """ Open a ``<span>`` for the current entity, if there is one. """
        if not self.entity:
            return
        fg, bg, _entity_idx = self.entity_colors[self.entity]
        span_attrs = OrderedDict([
            ('wa-id', str(self.idx)),
            ('wa-type', str(self.entity)),
            ('wa-subtypes', ''),
            ('style', 'color:%s; background-color:%s;' % (fg, bg)),
            ('class', 'WebAnnotator_%s' % self.entity),
        ])
        self.out.startElement('span', _fix_sax_attributes(span_attrs))
def _fix_sax_attributes(attrs):
    """
    Fix sax startElement attributes for lxml < 3.1.2

    For those lxml versions every key is rewritten as ``(None, key)`` and a
    new ``OrderedDict`` is returned; otherwise ``attrs`` is returned as is.
    """
    if LXML_VERSION < (3, 1, 2):
        return OrderedDict(
            ((None, name), value) for name, value in attrs.items()
        )
    return attrs
def _add_wacolor_elements(tree, entity_colors):
    """
    Add <wa-color> elements after <body>::

        <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG">

    Any ``<wa-color>`` elements already in the tree are removed first.
    If the tree has no ``<body>`` a warning is issued and nothing changes.
    """
    body = tree.find('.//body')
    if body is None:
        warnings.warn("html has no <body>, <wa-color> elements are not added")
        return

    for stale in tree.xpath('//wa-color'):
        stale.drop_tree()

    def by_descending_index(item):
        # item is (entity_name, (fg, bg, idx))
        return -item[1][2]

    # Every new element is inserted directly after <body>, so processing
    # the highest index first leaves the elements in ascending index order.
    for ent, (fg, bg, idx) in sorted(entity_colors.items(),
                                     key=by_descending_index):
        wa_color = Element("wa-color", OrderedDict([
            ('id', "WA-color-%s" % idx),
            ('bg', bg),
            ('fg', fg),
            ('class', "WebAnnotator_%s" % ent),
            ('type', ent),
        ]))
        body.addnext(wa_color)
def _copy_title(tree):
    """
    Copy ``<title>`` into a styled ``<wa-title>`` element placed right after
    ``<body>``, then reduce the original ``<title>`` to its plain text::

        <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;">Contact</wa-title>

    Existing ``<wa-title>`` elements are removed first. Nothing happens when
    the tree has no ``<title>``; when it has no ``<body>`` a warning is
    issued and the tree is left unchanged.
    """
    title = tree.find('.//title')
    if title is None:
        return

    body = tree.find('.//body')
    if body is None:
        warnings.warn("html has no <body>, <wa-title> element is not added")
        return

    for stale in tree.xpath('//wa-title'):
        stale.drop_tree()

    annotated = deepcopy(title)
    annotated.tag = 'wa-title'
    annotated.set(
        'style',
        'box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;',
    )
    body.addnext(annotated)

    # The copy keeps any markup; the original <title> keeps only the text.
    plain_text = title.xpath('string()')
    title.clear()
    title.text = plain_text
def to_webannotator(tree, entity_colors=None):
    """
    Convert a tree loaded by one of WebStruct loaders to WebAnnotator format.

    The document is rebuilt into a new tree in which ``__START_<name>__`` /
    ``__END_<name>__`` text markers are replaced by WebAnnotator ``<span>``
    elements; a ``<wa-title>`` copy of the title and ``<wa-color>`` elements
    are then added after ``<body>``. The new tree is returned.

    If you want a predictable colors assignment use ``entity_colors`` argument;
    it should be a mapping ``{'entity_name': (fg, bg, entity_idx)}``.
    Entity names are looked up exactly as they appear in the markers, so the
    keys must match them including case. You can use :class:`EntityColors`
    to generate this mapping automatically:

    >>> from webstruct.webannotator import EntityColors, to_webannotator
    >>> # trees = ...
    >>> entity_colors = EntityColors()
    >>> wa_trees = [to_webannotator(tree, entity_colors) for tree in trees]  # doctest: +SKIP

    """
    handler = _WaContentHandler(entity_colors)
    lxml.sax.saxify(tree, handler)
    wa_tree = handler.out.etree
    _copy_title(wa_tree)
    # Use the handler's mapping: it is the one that received entries for
    # every entity seen, even when ``entity_colors`` was not supplied.
    _add_wacolor_elements(wa_tree, handler.entity_colors)
    return wa_tree