# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import re
import collections
TextToken = collections.namedtuple('TextToken', 'chars, position, length')
[docs]class WordTokenizer(object):
r"""This tokenizer is copy-pasted version of TreebankWordTokenizer
that doesn't split on @ and ':' symbols and doesn't split contractions.
It supports span_tokenize(in terms of nltk tokenizers) method - :meth:`segment_words`::
>>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
>>> WordTokenizer().segment_words(s)
[TextToken(chars='Good', position=0, length=4),
TextToken(chars='muffins', position=5, length=7),
TextToken(chars='cost', position=13, length=4),
TextToken(chars='$', position=18, length=1),
TextToken(chars='3.88', position=19, length=4),
TextToken(chars='in', position=24, length=2),
TextToken(chars='New', position=27, length=3),
TextToken(chars='York.', position=31, length=5),
TextToken(chars='Email:', position=37, length=6),
TextToken(chars='muffins@gmail.com', position=44, length=17)]
>>> s = '''Shelbourne Road,'''
>>> WordTokenizer().segment_words(s)
[TextToken(chars='Shelbourne', position=0, length=10),
TextToken(chars='Road', position=11, length=4),
TextToken(chars=',', position=15, length=1)]
>>> s = '''population of 100,000'''
>>> WordTokenizer().segment_words(s)
[TextToken(chars='population', position=0, length=10),
TextToken(chars='of', position=11, length=2),
TextToken(chars='100,000', position=14, length=7)]
>>> s = '''Hello|World'''
>>> WordTokenizer().segment_words(s)
[TextToken(chars='Hello', position=0, length=5),
TextToken(chars='|', position=5, length=1),
TextToken(chars='World', position=6, length=5)]
>>> s2 = '"We beat some pretty good teams to get here," Slocum said.'
>>> WordTokenizer().segment_words(s2) # doctest: +NORMALIZE_WHITESPACE
[TextToken(chars='``', position=0, length=1),
TextToken(chars='We', position=1, length=2),
TextToken(chars='beat', position=4, length=4),
TextToken(chars='some', position=9, length=4),
TextToken(chars='pretty', position=14, length=6),
TextToken(chars='good', position=21, length=4),
TextToken(chars='teams', position=26, length=5),
TextToken(chars='to', position=32, length=2),
TextToken(chars='get', position=35, length=3),
TextToken(chars='here', position=39, length=4),
TextToken(chars=',', position=43, length=1),
TextToken(chars="''", position=44, length=1),
TextToken(chars='Slocum', position=46, length=6),
TextToken(chars='said', position=53, length=4),
TextToken(chars='.', position=57, length=1)]
>>> s3 = '''Well, we couldn't have this predictable,
... cliche-ridden, \"Touched by an
... Angel\" (a show creator John Masius
... worked on) wanna-be if she didn't.'''
>>> WordTokenizer().segment_words(s3) # doctest: +NORMALIZE_WHITESPACE
[TextToken(chars='Well', position=0, length=4),
TextToken(chars=',', position=4, length=1),
TextToken(chars='we', position=6, length=2),
TextToken(chars="couldn't", position=9, length=8),
TextToken(chars='have', position=18, length=4),
TextToken(chars='this', position=23, length=4),
TextToken(chars='predictable', position=28, length=11),
TextToken(chars=',', position=39, length=1),
TextToken(chars='cliche-ridden', position=41, length=13),
TextToken(chars=',', position=54, length=1),
TextToken(chars='``', position=56, length=1),
TextToken(chars='Touched', position=57, length=7),
TextToken(chars='by', position=65, length=2),
TextToken(chars='an', position=68, length=2),
TextToken(chars='Angel', position=71, length=5),
TextToken(chars="''", position=76, length=1),
TextToken(chars='(', position=78, length=1),
TextToken(chars='a', position=79, length=1),
TextToken(chars='show', position=81, length=4),
TextToken(chars='creator', position=86, length=7),
TextToken(chars='John', position=94, length=4),
TextToken(chars='Masius', position=99, length=6),
TextToken(chars='worked', position=106, length=6),
TextToken(chars='on', position=113, length=2),
TextToken(chars=')', position=115, length=1),
TextToken(chars='wanna-be', position=117, length=8),
TextToken(chars='if', position=126, length=2),
TextToken(chars='she', position=129, length=3),
TextToken(chars="didn't", position=133, length=6),
TextToken(chars='.', position=139, length=1)]
>>> WordTokenizer().segment_words('"')
[TextToken(chars='``', position=0, length=1)]
>>> WordTokenizer().segment_words('" a')
[TextToken(chars='``', position=0, length=1),
TextToken(chars='a', position=2, length=1)]
>>> WordTokenizer().segment_words('["a')
[TextToken(chars='[', position=0, length=1),
TextToken(chars='``', position=1, length=1),
TextToken(chars='a', position=2, length=1)]
Some issues:
>>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
[TextToken(chars='Copyright', position=0, length=9),
TextToken(chars=u'\xa9', position=10, length=1),
TextToken(chars='2014', position=12, length=4),
TextToken(chars='Foo', position=17, length=3),
TextToken(chars='Bar', position=21, length=3),
TextToken(chars='and', position=25, length=3),
TextToken(chars='Buzz', position=29, length=4),
TextToken(chars='Spam.', position=34, length=5),
TextToken(chars='All', position=40, length=3),
TextToken(chars='Rights', position=44, length=6),
TextToken(chars='Reserved', position=51, length=8),
TextToken(chars='.', position=59, length=1)]
"""
# regex, token
# if token is None - regex match group is taken
rules = [
(re.compile(r'\s+', re.UNICODE), ''),
(re.compile(r'“'), "``"),
(re.compile(r'["”]'), "''"),
(re.compile(r'``'), None),
(re.compile(r'…|\.\.\.'), '...'),
(re.compile(r'--'), None),
(re.compile(r',(?=\D|$)'), None),
(re.compile(r'\.$'), None),
(re.compile(r'[;#$£%&|!?[\](){}<>]'), None),
(re.compile(r"'(?=\s)|''", re.UNICODE), None),
]
open_quotes = re.compile(r'(^|[\s(\[{<])"')
def _segment_words(self, text):
# this one cannot be placed in the loop of internal function because it requires
# position check (beginning of the string) or previous char value
start = 0
for quote in self.open_quotes.finditer(text):
quote_pos = quote.end() - 1
for token in self._segment_words_nonquote(text[start:quote_pos]):
yield TextToken(chars=token.chars,
position=token.position + start,
length=token.length)
yield TextToken(chars='``', position=quote_pos, length=1)
start = quote.end()
for token in self._segment_words_nonquote(text[start:]):
yield TextToken(chars=token.chars,
position=token.position + start,
length=token.length)
def _segment_words_nonquote(self, text):
i = 0
token_start = 0
while 1:
if i >= len(text):
yield TextToken(chars=text[token_start:],
position=token_start,
length=len(text) - token_start)
break
shift = 1
partial_text = text[i:]
for regex, token in self.rules:
match = regex.match(partial_text)
if match:
yield TextToken(chars=text[token_start:i],
position=token_start,
length=i - token_start)
shift = match.end() - match.start()
token_start = i + shift
if token is None:
yield TextToken(chars=match.group(),
position=i + match.start(),
length=shift)
else:
yield TextToken(chars=token,
position=i + match.start(),
length=shift)
break
i += shift
[docs] def segment_words(self, text):
return [t for t in self._segment_words(text) if t.chars]
[docs] def tokenize(self, text):
return [t.chars for t in self.segment_words(text)]
[docs]class DefaultTokenizer(WordTokenizer):
[docs] def segment_words(self, text):
tokens = super(DefaultTokenizer, self).segment_words(text)
# remove standalone commas and semicolons
# as they broke tag sets,
# e.g. PERSON->FUNCTION in case "PERSON, FUNCTION"
# but it has negative consequences, e.g.
# etalon: [PER-B, PER-I, FUNC-B]
# predicted: [PER-B, PER-I, PER-I ]
# because we removed punctuation
# FIXME: remove as token, but save as feature left/right_punct:","
return [t for t in tokens if t.chars not in {',', ';'}]
tokenize = DefaultTokenizer().segment_words