Source code for webstruct.text_tokenizers

# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import re


class WordTokenizer(object):
    r"""This tokenizer is a copy-pasted version of TreebankWordTokenizer
    that doesn't split on '@' and ':' symbols and doesn't split contractions::

        >>> from nltk.tokenize.treebank import TreebankWordTokenizer  # doctest: +SKIP
        >>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
        >>> TreebankWordTokenizer().tokenize(s)  # doctest: +SKIP
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com']
        >>> WordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com']

        >>> s = '''Shelbourne Road,'''
        >>> WordTokenizer().tokenize(s)
        ['Shelbourne', 'Road', ',']

        >>> s = '''population of 100,000'''
        >>> WordTokenizer().tokenize(s)
        ['population', 'of', '100,000']

        >>> s = '''Hello|World'''
        >>> WordTokenizer().tokenize(s)
        ['Hello', '|', 'World']

        >>> s2 = '"We beat some pretty good teams to get here," Slocum said.'
        >>> WordTokenizer().tokenize(s2)  # doctest: +NORMALIZE_WHITESPACE
        ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to',
         'get', 'here', ',', "''", 'Slocum', 'said', '.']
        >>> s3 = '''Well, we couldn't have this predictable,
        ... cliche-ridden, \"Touched by an
        ... Angel\" (a show creator John Masius
        ... worked on) wanna-be if she didn't.'''
        >>> WordTokenizer().tokenize(s3)  # doctest: +NORMALIZE_WHITESPACE
        ['Well', ',', 'we', "couldn't", 'have', 'this', 'predictable',
         ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an',
         'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius',
         'worked', 'on', ')', 'wanna-be', 'if', 'she', "didn't", '.']

    Some issues:

        >>> WordTokenizer().tokenize("Phone:855-349-1914")  # doctest: +SKIP
        ['Phone', ':', '855-349-1914']
        >>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")  # doctest: +SKIP
        ['Copyright', '\xc2\xa9', '2014', 'Foo', 'Bar', 'and', 'Buzz', 'Spam', '.', 'All', 'Rights', 'Reserved', '.']
        >>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077")  # doctest: +SKIP
        ['Powai', 'Campus', ',', 'Mumbai', '-', '400077']
        >>> WordTokenizer().tokenize("1 5858/ 1800")  # doctest: +SKIP
        ['1', '5858', '/', '1800']
        >>> WordTokenizer().tokenize("Saudi Arabia-")  # doctest: +SKIP
        ['Saudi', 'Arabia', '-']

    """

    # (regex, token) pairs;
    # if token is None, the regex match itself is used as the token
    rules = [
        (re.compile(r'\s+', re.UNICODE), ''),
        (re.compile(r'“'), "``"),
        (re.compile(r'["”]'), "''"),
        (re.compile(r'``'), None),
        (re.compile(r'…|\.\.\.'), '...'),
        (re.compile(r'--'), None),
        (re.compile(r',(?=\D|$)'), None),
        (re.compile(r'\.$'), None),
        (re.compile(r'[;#$£%&|!?[\](){}<>]'), None),
        (re.compile(r"'(?=\s)|''", re.UNICODE), None),
    ]

    open_quotes = re.compile(r'(^|[\s(\[{<])"')

    def _tokenize(self, text):
        # this substitution cannot be placed in the rules loop because it
        # requires a position check (beginning of the string) or access to
        # the previous character
        text = self.open_quotes.sub(r'\1``', text)

        i = 0
        token_start = 0
        while True:
            if i >= len(text):
                yield text[token_start:]
                break
            shift = 1
            partial_text = text[i:]
            for regex, token in self.rules:
                match = regex.match(partial_text)
                if match:
                    yield text[token_start:i]
                    shift = match.end() - match.start()
                    token_start = i + shift
                    if token is None:
                        yield match.group()
                    else:
                        yield token
                    break
            i += shift
    def tokenize(self, text):
        return [t for t in self._tokenize(text) if t]
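

# Illustrative sketch, not part of the original module: the ``rules`` table is
# the natural extension point of WordTokenizer. Each entry pairs a compiled
# regex with a replacement token (``None`` means "emit the matched text
# itself"), so a hypothetical subclass that also splits on standalone slashes
# only needs to append one rule:
class SlashAwareTokenizer(WordTokenizer):
    rules = WordTokenizer.rules + [
        (re.compile(r'/'), None),  # emit '/' as a separate token
    ]

# SlashAwareTokenizer().tokenize("1 5858/ 1800")
# -> ['1', '5858', '/', '1800']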


class DefaultTokenizer(WordTokenizer):

    def tokenize(self, text):
        tokens = super(DefaultTokenizer, self).tokenize(text)
        # remove standalone commas and semicolons
        return [t for t in tokens if t not in {',', ';'}]


tokenize = DefaultTokenizer().tokenize
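

# Minimal usage sketch (illustrative, not part of the original module):
# the module-level ``tokenize`` shortcut uses DefaultTokenizer, so standalone
# commas and semicolons are dropped, while WordTokenizer keeps them.
if __name__ == '__main__':
    s = 'Good muffins cost $3.88 in New York. Email: muffins@gmail.com'
    print(WordTokenizer().tokenize(s))
    # ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.',
    #  'Email:', 'muffins@gmail.com']
    print(WordTokenizer().tokenize('Shelbourne Road,'))
    # ['Shelbourne', 'Road', ',']
    print(tokenize('Shelbourne Road,'))
    # ['Shelbourne', 'Road']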