0001
0002
0003"""Infoset serialization format styles.
0004
0005This module provides methods assisting the serialization module
0006in formatting the text content of serialized infosets.
0007
0008The methods for "educating" and "stupefying" typographic characters
0009have been inspired by John Gruber's "SmartyPants" project
0010(http://daringfireball.net/projects/smartypants/,
0011see also http://web.chad.org/projects/smartypants.py/).
0012
0013"""
0014
0015__revision__ = "$Rev: 492 $"
0016__date__ = "$Date: 2007-07-06 21:38:45 -0400 (Fri, 06 Jul 2007) $"
0017__author__ = "Christoph Zwerschke (cito@online.de)"
0018__copyright__ = "Copyright 2006, Christoph Zwerschke"
0019__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0020
0021import re
0022
0023__all__ = ['Format', 'output_formats']
0024
0025
0026class Format(object):
0027 """Formatting details for Serializers."""
0028
0029
0030
# Class-level defaults.  __init__ falls back to these when the matching
# keyword is passed as True (wrap, indent) or is not passed at all
# (min_level, max_level, typographic characters).
wrap = 80  # line width selected by wrap=True
indent = '\t'  # indentation string selected by indent=True
min_level, max_level = 1, 8  # default bounds for the indentation level
tabsize = 8  # not referenced in the visible part of this class

# Typographic replacement characters.  The two-character strings hold
# the left/first variant at index 0 and the right/second at index 1.
apostrophe = u'\u2019'
squotes = u'\u2018\u2019'  # left and right single quote
dquotes = u'\u201c\u201d'  # left and right double quote
dashes = u'\u2013\u2014'  # en-dash and em-dash
ellipsis = u'\u2026'

# Whitespace patterns ("blanks" are spaces and tabs only).
re_whitespace = re.compile(r'[ \t\n\r]+')  # any run of XML whitespace
re_leading_blanks = re.compile(r'^[ \t]+', re.MULTILINE)  # per line
re_trailing_blanks = re.compile(r'[ \t]+$', re.MULTILINE)  # per line
re_duplicate_blanks = re.compile(r'[ \t]{2,}')  # two or more blanks
# Two newlines separated only by whitespace, i.e. empty lines.
re_duplicate_newlines = re.compile(r'\n[ \t\n\r]*\n')
# A newline together with the blanks before and the whitespace after it.
re_whitespace_with_newline = re.compile(r'[ \t]*\n[ \t\n\r]*')
re_indentation = re.compile(r'\n[ \t]*')  # newline plus following blanks

# Plain ASCII quote characters and backticks.
re_squotes = re.compile(r"'")
re_dquotes = re.compile(r'"')
re_sbackticks = re.compile(r"`")
# A double backtick that is not preceded by a word character or a
# backtick and not followed by another backtick.
re_dbackticks = re.compile(r"(?<![\w`])``(?!`)")
# A single quote after a non-word character and before two digits
# followed by a non-digit, as in the decade abbreviation '90s.
re_squote_decade = re.compile(r"(?<=\W)'(?=\d\d\D)")
# Quotes in opening position: between a non-word and a word character,
# or between whitespace and non-whitespace.
re_squote_left = re.compile(
    r"((?<=\W)'(?=\w))|((?<=\s)'(?=\S))", re.UNICODE)
# Quotes in closing position: between a word and a non-word character,
# between non-whitespace and whitespace, or after any non-word character.
re_squote_right = re.compile(
    r"((?<=\w)'(?=\W))|((?<=\S)'(?=\s))|((?<=\W)')", re.UNICODE)
re_dquote_left = re.compile(
    r'((?<=\W)"(?=\w))|((?<=\s)"(?=\S))', re.UNICODE)
re_dquote_right = re.compile(
    r'((?<=\w)"(?=\W))|((?<=\S)"(?=\s))|((?<=\W)")', re.UNICODE)

# Exactly two resp. exactly three hyphens (no further hyphen adjacent).
re_endash = re.compile(r'(?<!-)--(?!-)')
re_emdash = re.compile(r'(?<!-)---(?!-)')
# A single hyphen with no non-whitespace character on either side.
re_hyphen_between_blanks = re.compile(r'(?<!\S)-(?!\S)', re.UNICODE)
# Three dots, written either as "..." or as ". . .".
re_ellipses = re.compile(
    r'((?<!\.)\.\.\.(?!\.\.))|((?<!\. )\. \. \.(?! \. \.))')
0069
0070 def __init__(self, *args, **kw):
0071 """Create an output format with given parameters.
0072
0073 You can pass one or more text filter functions
0074 for processing text content in the output stream.
0075
0076 You can also set keyword parameters for using some
0077 standard text filter operations. The following parameters
0078 must be set to True to activate the operation:
0079
0080 strip_lines: strip blanks in all text lines
0081 lstrip_lines: left strip blanks in all text lines
0082 rstrip_lines: right strip blanks in all text lines
0083 simple_blanks: remove all duplicate blanks
0084 no_empty_lines: remove all empty lines
0085 simple_whitespace: remove all duplicate whitespace
0086 wrap: wrap text lines to a maximum width
0087
0088 You can also specify the exact width using the wrap parameter.
0089
0090 There are some more operations which you should use
0091 with caution, since they may remove significant whitespace:
0092
0093 strip: strip whitespace
0094 lstrip: left strip whitespace
0095 rstrip: right strip whitespace
0096 strip_blanks: strip blanks
0097 lstrip_blanks: left strip blanks
0098 rstrip_blanks: right strip blanks
0099
0100 The following parameters control typographic punctuation.
0101
0102 educate_quotes: use typographic quotes
0103 educate_backticks: replace backticks with opening quotes
0104 educate_dashes: replace en-dashes and em-dashes
0105 educate_ellipses: replace ellipses
0106 educate (or nice): all of the above
0107 stupefy (or ugly): reverse operation of educate
0108
0109 apostrophe: character to be used for the apostrophe
0110 squotes: left and right single quote characters
0111 dquotes: left and right double quote characters
0112 dashes: characters to be used for en-dash and em-dash
0113 ellipsis: character to be used for the ellipsis
0114
0115 The following parameters control indentation.
0116 This will insert newlines and level-dependent indentation,
0117 paying regard to inline and whitespace senstive tags:
0118
0119 indent: string or number of blanks for indentation
0120 min_level: minimum level for indentation
0121 max_level: maximum level for indentation
0122
0123 Note that this formatting has some limitations since
0124 it processes only text content in a stream (no look-ahead,
0125 no paying regard to the format of the serialized tags).
0126
0127 The following parameters are passed to the Serializer
0128 (see there for the possible values of these parameters):
0129
0130 decl: add xml declaration at the beginning
0131 doctype: add document type at the beginning
0132 entity_map (or named): entity map for named entities
0133 transpose: how to transpose html tags
0134 inject_type: inject meta tag with content-type
0135
0136 """
0137 wrap = kw.get('wrap')
0138 if wrap:
0139 if isinstance(wrap, bool):
0140 wrap = wrap and self.wrap or None
0141 elif isinstance(wrap, int):
0142 wrap = wrap or None
0143 else:
0144 wrap = None
0145 self.wrap = wrap
0146 if wrap:
0147 simple_whitespace = None
0148 simple_blanks = simple_newlines = None
0149 else:
0150 simple_whitespace = kw.get('simple_whitespace')
0151 if simple_whitespace:
0152 simple_blanks = simple_newlines = None
0153 else:
0154 simple_blanks = kw.get('simple_blanks')
0155 simple_newlines = kw.get('simple_newlines',
0156 kw.get('no_empty_lines'))
0157 if simple_blanks and simple_newlines:
0158 simple_whitespace = True
0159 simple_blanks = simple_newlines = None
0160 strip = kw.get('strip')
0161 if strip:
0162 lstrip = rstrip = None
0163 else:
0164 lstrip = kw.get('lstrip')
0165 rstrip = kw.get('rstrip')
0166 if lstrip and rstrip:
0167 strip = True
0168 lstrip = rstrip = None
0169 strip_lines = kw.get('strip_lines')
0170 if strip_lines:
0171 lstrip_lines = rstrip_lines = None
0172 else:
0173 lstrip_lines = kw.get('lstrip_lines')
0174 rstrip_lines = kw.get('rstrip_lines')
0175 if lstrip_lines and rstrip_lines:
0176 strip_lines = True
0177 lstrip_lines = rstrip_lines = None
0178 if strip or strip_lines:
0179 lstrip_blanks = rstrip_blanks = strip_blanks = None
0181 else:
0182 strip_blanks = kw.get('strip_blanks')
0183 if strip_blanks:
0184 lstrip_blanks = rstrip_blanks = None
0185 else:
0186 if lstrip or lstrip_lines:
0187 lstrip_blanks = None
0188 else:
0189 lstrip_blanks = kw.get('lstrip_blanks')
0190 if rstrip or rstrip_lines:
0191 rstrip_blanks = None
0192 else:
0193 rstrip_blanks = kw.get('rstrip_blanks')
0194 if lstrip_blanks and rstrip_blanks:
0195 strip_blanks = True
0196 lstrip_blanks = rstrip_blanks = None
0197 indent = kw.get('indent')
0198 if indent:
0199 if isinstance(indent, bool):
0200 indent = indent and self.indent or None
0201 elif isinstance(indent, int):
0202 indent = ' ' * indent
0203 elif isinstance(indent, basestring):
0204 pass
0205 else:
0206 indent = None
0207 min_level = kw.get('min_level', self.min_level)
0208 max_level = kw.get('max_level', self.max_level)
0209 else:
0210 min_level = max_level = None
0211 self.indent = indent
0212 self.min_level, self.max_level = min_level, max_level
0213
0214 stupefy = kw.get('stupefy', kw.get('ugly'))
0215 educate = not stupefy and kw.get('educate',
0216 kw.get('educated', kw.get('nice')))
0217 educate_quotes = kw.get('educate_quotes', educate)
0218 educate_backticks = kw.get('educate_backticks', educate)
0219 self.with_backticks = bool(educate_backticks)
0220 educate_dashes = kw.get('educate_dashes', educate)
0221 educate_ellipses = kw.get('educate_ellipses', educate)
0222 self.apostrophe = kw.get('apostrophe', self.apostrophe)
0223 self.squotes = kw.get('squotes', self.squotes)
0224 self.dquotes = kw.get('dquotes', self.dquotes)
0225 self.dashes = kw.get('dashes', self.dashes)
0226 self.ellipsis = kw.get('ellipsis', self.ellipsis)
0227
0228 filters = []
0229 if simple_whitespace:
0230 filters.append(self.simple_whitespace)
0231 else:
0232 if simple_blanks:
0233 filters.append(self.simple_blanks)
0234 elif simple_newlines:
0235 filters.append(self.simple_newlines)
0236 if strip:
0237 filters.append(self.strip)
0238 elif lstrip:
0239 filters.append(self.lstrip)
0240 elif rstrip:
0241 filters.append(self.rstrip)
0242 if strip_lines:
0243 filters.append(self.strip_lines)
0244 elif lstrip_lines:
0245 filters.append(self.lstrip_lines)
0246 elif rstrip_lines:
0247 filters.append(self.rstrip_lines)
0248 if strip_blanks:
0249 filters.append(self.strip_blanks)
0250 elif lstrip_blanks:
0251 filters.append(self.lstrip_blanks)
0252 elif rstrip_blanks:
0253 filters.append(self.rstrip_blanks)
0254 if stupefy:
0255 filters.append(self.stupefy)
0256 if educate_backticks and not educate_quotes:
0257 filters.append(self.educate_backticks)
0258 if educate_dashes:
0259 filters.append(self.educate_dashes)
0260 if educate_ellipses:
0261 filters.append(self.educate_ellipses)
0262
0263 self.custom_filters = []
0264 for f in args:
0265 if callable(f):
0266 if f not in self.custom_filters and f not in filters:
0267 self.custom_filters.append(f)
0268 self.text_filters = self.custom_filters + filters
0269
0270 self.context_filters = []
0271 if educate_quotes:
0272 self.context_filters.append(self.educate_quotes)
0273
0274 self.decl = kw.get('decl')
0275 self.doctype = kw.get('doctype')
0276 self.entity_map = kw.get('entity_map',
0277 kw.get('named_entities', kw.get('named')))
0278 self.transpose = kw.get('transpose')
0279 self.inject_type = kw.get('inject_type')
0280
0281 def __repr__(self):
0282 args = {}
0283 attrs = self.__dict__.keys()
0284 for attr in attrs:
0285 if attr.endswith('_filters'):
0286 continue
0287 value = getattr(self, attr)
0288 if value is not None:
0289 try:
0290 if value == getattr(Format, attr):
0291 value = attr in ('wrap', 'indent') and True or None
0292 except AttributeError:
0293 pass
0294 if value is not None:
0295 args[attr] = value
0296 for f in self.text_filters + self.context_filters:
0297 attr = f.__name__
0298 if attr == 'indent_lines':
0299 if 'indent' not in args:
0300 args['indent'] = True
0301 elif attr == 'wrap_lines':
0302 if 'wrap' not in args:
0303 args['wrap'] = True
0304 else:
0305 args[attr] = True
0306 if 'with_backticks' in args:
0307 if args['with_backticks']:
0308 args['educate_backticks'] = True
0309 del args['with_backticks']
0310 if ('educate_quotes' in args
0311 and 'educate_backticks' in args
0312 and 'educate_dashes' in args
0313 and 'educate_ellipses' in args):
0314 del args['educate_quotes']
0315 del args['educate_backticks']
0316 del args['educate_dashes']
0317 del args['educate_ellipses']
0318 args['educate'] = True
0319 attrs = args.keys()
0320 attrs.sort()
0321 args = [f.__name__ for f in self.custom_filters] + ['%s=%r' % (attr, args[attr]) for attr in attrs]
0323 return "%s(%s)" % (self.__class__.__name__, ', '.join(args))
0324
def filter(self, s, last_char=None, next_char=None):
    """Apply the context-free filters, then the context filters.

    The context characters are handed on to context_filter unchanged.

    """
    filtered = self.text_filter(s)
    return self.context_filter(filtered, last_char, next_char)
0329
0330
0331
def text_filter(self, s):
    """Pipe the string through every context-free filter in order."""
    result = s
    for apply_filter in self.text_filters:
        result = apply_filter(result)
    return result
0337
0338
0339
0340
def lstrip(s):
    """Return the string without leading XML whitespace.

    XML whitespace is blank, tab, newline and carriage return.

    """
    xml_whitespace = ' \t\n\r'
    return s.lstrip(xml_whitespace)
lstrip = staticmethod(lstrip)
0345
def rstrip(s):
    """Return the string without trailing XML whitespace.

    XML whitespace is blank, tab, newline and carriage return.

    """
    xml_whitespace = ' \t\n\r'
    return s.rstrip(xml_whitespace)
rstrip = staticmethod(rstrip)
0350
def strip(s):
    """Return the string without leading and trailing XML whitespace.

    XML whitespace is blank, tab, newline and carriage return.

    """
    xml_whitespace = ' \t\n\r'
    return s.strip(xml_whitespace)
strip = staticmethod(strip)
0355
def lstrip_blanks(s):
    """Return the string without leading blanks and tabs.

    Newlines and carriage returns are left alone.

    """
    blanks = ' \t'
    return s.lstrip(blanks)
lstrip_blanks = staticmethod(lstrip_blanks)
0360
def rstrip_blanks(s):
    """Return the string without trailing blanks and tabs.

    Newlines and carriage returns are left alone.

    """
    blanks = ' \t'
    return s.rstrip(blanks)
rstrip_blanks = staticmethod(rstrip_blanks)
0365
def strip_blanks(s):
    """Return the string without leading and trailing blanks and tabs.

    Newlines and carriage returns are left alone.

    """
    blanks = ' \t'
    return s.strip(blanks)
strip_blanks = staticmethod(strip_blanks)
0370
def lstrip_lines(cls, s):
    """Remove leading blanks and tabs from every line in the string."""
    leading = cls.re_leading_blanks
    return leading.sub('', s)
lstrip_lines = classmethod(lstrip_lines)
0375
def rstrip_lines(cls, s):
    """Remove trailing blanks and tabs from every line in the string."""
    trailing = cls.re_trailing_blanks
    return trailing.sub('', s)
rstrip_lines = classmethod(rstrip_lines)
0380
def strip_lines(cls, s):
    """Remove blanks and tabs from both ends of every line.

    The lines are right stripped first and left stripped afterwards.

    """
    right_stripped = cls.rstrip_lines(s)
    return cls.lstrip_lines(right_stripped)
strip_lines = classmethod(strip_lines)
0385
def simple_blanks(cls, s):
    """Collapse every run of two or more blanks or tabs to one blank."""
    blank_runs = cls.re_duplicate_blanks
    return blank_runs.sub(' ', s)
simple_blanks = classmethod(simple_blanks)
0390
def simple_newlines(cls, s):
    """Collapse newlines separated only by whitespace to one newline.

    This removes empty and whitespace-only lines inside the string.

    """
    newline_runs = cls.re_duplicate_newlines
    return newline_runs.sub('\n', s)
simple_newlines = classmethod(simple_newlines)
0395
def simple_newline_whitespace(cls, s):
    """Replace whitespace around newlines with a single newline.

    Each newline is replaced together with the blanks and tabs before
    it and all whitespace after it.

    """
    newline_whitespace = cls.re_whitespace_with_newline
    return newline_whitespace.sub('\n', s)
simple_newline_whitespace = classmethod(simple_newline_whitespace)
0400
def simple_whitespace(cls, s):
    """Simplify all whitespace in the string.

    Whitespace containing newlines is simplified first, duplicate
    blanks are removed afterwards.

    """
    without_newline_whitespace = cls.simple_newline_whitespace(s)
    return cls.simple_blanks(without_newline_whitespace)
simple_whitespace = classmethod(simple_whitespace)
0405
def clean_whitespace(cls, s):
    """Simplify all whitespace in the string, then strip both ends."""
    simplified = cls.simple_whitespace(s)
    return cls.strip(simplified)
clean_whitespace = classmethod(clean_whitespace)
0410
0411
0412
def educate_backticks(self, s):
    """Turn backtick-style quoting into typographic quotes.

    Backticks are taken as opening quotes, so the plain quote
    characters can only be closing quotes or apostrophes.

    """
    # The order of the substitutions matters: each step works on
    # what the previous steps have left over.
    substitutions = (
        # double backticks open a double quotation ...
        (self.re_dbackticks, self.dquotes[0]),
        # ... and every plain double quote closes one
        (self.re_dquotes, self.dquotes[1]),
        # remaining backticks open a single quotation
        (self.re_sbackticks, self.squotes[0]),
        # abbreviated decades get an apostrophe
        (self.re_squote_decade, self.apostrophe),
        # plain single quotes in closing position
        (self.re_squote_right, self.squotes[1]),
        # every other plain single quote is an apostrophe
        (self.re_squotes, self.apostrophe),
    )
    for pattern, replacement in substitutions:
        s = pattern.sub(replacement, s)
    return s
0428
def educate_dashes(self, s):
    """Replace hyphen sequences with typographic dashes.

    A lone hyphen between blanks and a double hyphen (--) become
    an en-dash, a triple hyphen (---) becomes an em-dash.

    """
    en_dash, em_dash = self.dashes[0], self.dashes[1]
    substitutions = (
        (self.re_hyphen_between_blanks, en_dash),
        (self.re_emdash, em_dash),
        (self.re_endash, en_dash),
    )
    for pattern, dash in substitutions:
        s = pattern.sub(dash, s)
    return s
0435
def educate_ellipses(self, s):
    """Replace three dots ("..." or ". . .") with the ellipsis character."""
    pattern, replacement = self.re_ellipses, self.ellipsis
    return pattern.sub(replacement, s)
0439
def stupefy(self, s):
    """Replace typographic punctuation with its plain ASCII form.

    Single and double quotes become ' and ", the en-dash and em-dash
    become -- and ---, and the ellipsis becomes three dots.

    """
    plain_forms = (
        (self.squotes[0], "'"),
        (self.squotes[1], "'"),
        (self.dquotes[0], '"'),
        (self.dquotes[1], '"'),
        (self.dashes[0], '--'),
        (self.dashes[1], '---'),
        (self.ellipsis, '...'),
    )
    for fancy, plain in plain_forms:
        s = s.replace(fancy, plain)
    return s
0446
0447
0448
def context_filter(self, s, last_char=None, next_char=None):
    """Pipe the string through every context filter in order.

    Each filter is called with the string and both context characters.

    """
    result = s
    for apply_filter in self.context_filters:
        result = apply_filter(result, last_char, next_char)
    return result
0454
0455
0456
0457 def educate_quotes(self, s, last_char=None, next_char=None):
0458 """Use proper typographic quotes in the text.
0459
0460 You should at least pass the last character
0461 of the text content if used as a stream filter.
0462
0463 """
0464 if self.with_backticks:
0465
0466 s = self.re_dbackticks.sub(self.dquotes[0], s)
0467 s = self.re_sbackticks.