Source code for webstruct.tokenizers

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re

class WordTokenizer(object):
    """
    Treebank-style word tokenizer.

    This tokenizer is a copy-pasted version of NLTK's
    ``TreebankWordTokenizer`` that doesn't split on ``@`` and ``:``
    symbols, doesn't split contractions, and additionally splits
    on ``|``::

        >>> from nltk.tokenize.treebank import TreebankWordTokenizer
        >>> s = u'''Good muffins cost $3.88\\nin New York. Email: muffins@example.com'''
        >>> TreebankWordTokenizer().tokenize(s)
        [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York.', u'Email', u':', u'muffins', u'@', u'example.com']
        >>> WordTokenizer().tokenize(s)
        [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York.', u'Email:', u'muffins@example.com']

        >>> s = u'''Shelbourne Road,'''
        >>> WordTokenizer().tokenize(s)
        [u'Shelbourne', u'Road', u',']

        >>> s = u'''population of 100,000'''
        >>> WordTokenizer().tokenize(s)
        [u'population', u'of', u'100,000']

        >>> s = u'''Hello|World'''
        >>> WordTokenizer().tokenize(s)
        [u'Hello', u'|', u'World']

    Double quotes (ASCII and the unicode curly ones) are converted to
    Treebank-style tokens: `````` for a starting quote and ``''`` for an
    ending quote::

        >>> s = u'''say "hi" now'''
        >>> WordTokenizer().tokenize(s)
        [u'say', u'``', u'hi', u"''", u'now']

    """

    def tokenize(self, text):
        """
        Split ``text`` into a list of tokens.

        The text is rewritten step by step so that every token ends up
        surrounded by whitespace; the result is then split on whitespace.
        The order of the steps matters, since later rules see the output
        of earlier ones.

        :param text: the string to tokenize.
        :return: a list of token strings (empty for empty or
            whitespace-only input).
        """
        # starting quotes: a double quote at the very beginning of the
        # text or right after a space / opening bracket becomes ``
        text = re.sub(u'^["“]', r'``', text)  # +unicode quotes
        text = text.replace('``', " `` ")
        text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

        # punctuation
        # CHANGED: only a comma that is not followed by a digit is split
        # (so "100,000" survives); ':' is not split at all.
        text = re.sub(r'(,)(\D|\Z)', r' \1 \2', text)
        text = text.replace("...", " ... ")
        text = text.replace(u"…", u" ... ")
        # CHANGED: '@' is not in this class (keeps e-mails together),
        # '|' is.
        text = re.sub(r'[;#$%&|]', r' \g<0> ', text)
        # only the final period of the text (optionally followed by
        # closing brackets/quotes) is detached from the preceding word
        text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
        text = re.sub(r'[?!]', r' \g<0> ', text)

        # a single quote followed by a space is detached from the word
        text = re.sub(r"([^'])' ", r"\1 ' ", text)

        # parens, brackets, etc.
        text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
        text = text.replace("--", " -- ")

        # add extra space to make things easier
        text = " " + text + " "

        # ending quotes: every double quote still left at this point is
        # an ending one. This must be a regex substitution - the pattern
        # is a character class, str.replace would look for it literally.
        text = re.sub(u'["”]', " '' ", text)  # +unicode quotes
        text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

        # XXX: contractions handling is removed

        return text.split()
class DefaultTokenizer(WordTokenizer):
    """
    :class:`WordTokenizer` variant whose output contains no standalone
    comma or semicolon tokens.
    """

    def tokenize(self, text):
        """
        Tokenize ``text`` with the parent tokenizer and return the tokens
        in their original order, leaving out every token that is exactly
        ``,`` or ``;``.
        """
        separators = (',', ';')
        kept = []
        for token in super(DefaultTokenizer, self).tokenize(text):
            if token in separators:
                # standalone comma / semicolon: skip it
                continue
            kept.append(token)
        return kept
# Module-level shortcut: the bound ``tokenize`` method of a single
# DefaultTokenizer instance created at import time.
tokenize = DefaultTokenizer().tokenize