Source code for webstruct.text_tokenizers

# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import re

class WordTokenizer(object):
    r"""Split text into word and punctuation tokens.

    This tokenizer is a copy-pasted version of NLTK's
    ``TreebankWordTokenizer`` that doesn't split on ``@`` and ``:`` symbols
    and doesn't split contractions::

        >>> from nltk.tokenize.treebank import TreebankWordTokenizer  # doctest: +SKIP
        >>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@example.com'''
        >>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'example.com']
        >>> WordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@example.com']

        >>> s = '''Shelbourne Road,'''
        >>> WordTokenizer().tokenize(s)
        ['Shelbourne', 'Road', ',']

        >>> s = '''population of 100,000'''
        >>> WordTokenizer().tokenize(s)
        ['population', 'of', '100,000']

        >>> s = '''Hello|World'''
        >>> WordTokenizer().tokenize(s)
        ['Hello', '|', 'World']

        >>> s2 = '"We beat some pretty good teams to get here," Slocum said.'
        >>> WordTokenizer().tokenize(s2)  # doctest: +NORMALIZE_WHITESPACE
        ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get',
         'here', ',', "''", 'Slocum', 'said', '.']

        >>> s3 = '''Well, we couldn't have this predictable,
        ... cliche-ridden, \"Touched by an
        ... Angel\" (a show creator John Masius
        ... worked on) wanna-be if she didn't.'''
        >>> WordTokenizer().tokenize(s3)  # doctest: +NORMALIZE_WHITESPACE
        ['Well', ',', 'we', "couldn't", 'have', 'this', 'predictable', ',',
         'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''",
         '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')',
         'wanna-be', 'if', 'she', "didn't", '.']

    Some issues (the outputs below are the desired ones; the current rules
    do not produce them, hence the skipped doctests)::

        >>> WordTokenizer().tokenize("Phone:855-349-1914")  # doctest: +SKIP
        ['Phone', ':', '855-349-1914']

        >>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")  # doctest: +SKIP
        ['Copyright', '\xc2\xa9', '2014', 'Foo', 'Bar', 'and', 'Buzz', 'Spam', '.', 'All', 'Rights', 'Reserved', '.']

        >>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077")  # doctest: +SKIP
        ['Powai', 'Campus', ',', 'Mumbai', '-', '400077']

        >>> WordTokenizer().tokenize("1 5858/ 1800")  # doctest: +SKIP
        ['1', '5858', '/', '1800']

        >>> WordTokenizer().tokenize("Saudi Arabia-")  # doctest: +SKIP
        ['Saudi', 'Arabia', '-']

    """

    # (regex, token) pairs, tried in order at every position of the text;
    # the first rule that matches wins.
    # If token is None, the matched text itself is emitted as the token;
    # otherwise the given replacement token is emitted.
    rules = [
        # whitespace only separates tokens: the empty replacement token is
        # dropped later by tokenize()
        (re.compile(r'\s+', re.UNICODE), ''),
        (re.compile(r'“'), "``"),
        (re.compile(r'["”]'), "''"),
        (re.compile(r'``'), None),
        (re.compile(r'…|\.\.\.'), '...'),
        (re.compile(r'--'), None),
        # a comma followed by a digit (e.g. "100,000") is not split
        (re.compile(r',(?=\D|$)'), None),
        # only a period at the very end of the text is split off
        (re.compile(r'\.$'), None),
        # the literal "[" is escaped so that `re` does not emit its
        # "possible nested set" FutureWarning; the set of matched
        # characters is unchanged
        (re.compile(r'[;#$£%&|!?\[\](){}<>]'), None),
        (re.compile(r"'(?=\s)|''", re.UNICODE), None),
    ]

    # A double quote at the start of the text or right after whitespace or
    # an opening bracket is treated as an opening quote.
    open_quotes = re.compile(r'(^|[\s(\[{<])"')

    def _tokenize(self, text):
        """Yield the raw pieces of *text* in order.

        The pieces alternate between the text accumulated before a rule
        match and the token produced by that match. Some pieces are empty
        strings (e.g. the replacement for whitespace); :meth:`tokenize`
        filters those out.
        """
        # this one cannot be placed in the loop because it requires
        # position check (beginning of the string) or previous char value
        text = self.open_quotes.sub(r'\1``', text)

        i = 0
        token_start = 0
        while i < len(text):
            shift = 1
            # Rules are matched against the remaining text so that they
            # see exactly the same string start as before.
            partial_text = text[i:]
            for regex, token in self.rules:
                match = regex.match(partial_text)
                if match:
                    # flush the ordinary text collected before this match
                    yield text[token_start:i]
                    shift = match.end() - match.start()
                    token_start = i + shift
                    if token is None:
                        # No replacement given: the matched text is the
                        # token. (A bare ``yield`` here would produce None
                        # and the punctuation token would be lost.)
                        yield match.group()
                    else:
                        yield token
                    break
            i += shift

        # whatever is left after the last match is the final token
        yield text[token_start:]

    def tokenize(self, text):
        """Return the list of non-empty tokens found in *text*."""
        return [t for t in self._tokenize(text) if t]
class DefaultTokenizer(WordTokenizer):
    """:class:`WordTokenizer` variant that discards ``,`` and ``;`` tokens."""

    def tokenize(self, text):
        """Tokenize *text* and drop tokens that are exactly ``,`` or ``;``."""
        # standalone commas and semicolons are removed from the result
        dropped = {',', ';'}
        kept = []
        for tok in super(DefaultTokenizer, self).tokenize(text):
            if tok in dropped:
                continue
            kept.append(tok)
        return kept
# Module-level shortcut: the bound ``tokenize`` method of a single
# DefaultTokenizer instance created at import time.
tokenize = DefaultTokenizer().tokenize