0001
0002
0003"""Infoset serialization formats (XML, XHTML, HTML, etc)"""
0004
0005__revision__ = "$Rev: 492 $"
0006__date__ = "$Date: 2007-07-06 21:38:45 -0400 (Fri, 06 Jul 2007) $"
0007__author__ = "Ryan Tomayko (rtomayko@gmail.com)"
0008__copyright__ = "Copyright 2004-2005, Ryan Tomayko"
0009__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0010
0011import htmlentitydefs
0012
0013try:
0014 set
0015except NameError:
0016 from sets import Set as set
0017
0018from kid.element import Element, Comment, ProcessingInstruction, Fragment, QName, namespaces, encode_entity, raise_serialization_error
0020import kid.namespace as namespace
0021from kid.parser import START, END, TEXT, COMMENT, PI, _coalesce
0022from kid.format import Format, output_formats
0023
__all__ = ['doctypes', 'Serializer', 'XMLSerializer', 'HTMLSerializer']


# Namespace URIs taken from the kid.namespace module.
xml_uri = namespace.xml.uri
xhtml_uri = namespace.xhtml.uri


# Maps every character listed in htmlentitydefs.codepoint2name to its
# named entity reference ("&name;").
default_entity_map = dict([
    (unichr(k), "&%s;" % v)
    for k, v in htmlentitydefs.codepoint2name.items()])
0034
0035
0036
# Document types known by short name.  Each value is a tuple of
# (root element name, public identifier, system identifier); the two
# "quirks" entries carry no system identifier.
doctypes = dict([
    ('wml', (
        'wml',
        '-//WAPFORUM//DTD WML 1.1//EN',
        'http://www.wapforum.org/DTD/wml_1.1.xml')),
    ('xhtml-strict', (
        'html',
        '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd')),
    ('xhtml', (
        'html',
        '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')),
    ('xhtml-frameset', (
        'html',
        '-//W3C//DTD XHTML 1.0 Frameset//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd')),
    ('html-strict', (
        'HTML',
        '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd')),
    ('html', (
        'HTML',
        '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd')),
    ('html-frameset', (
        'HTML',
        '-//W3C//DTD HTML 4.01 Frameset//EN',
        'http://www.w3.org/TR/html4/frameset.dtd')),
    ('html-quirks', (
        'HTML',
        '-//W3C//DTD HTML 4.01 Transitional//EN')),
    ('html-frameset-quirks', (
        'HTML',
        '-//W3C//DTD HTML 4.01 Frameset//EN')),
])
0055
0056
class Serializer(object):
    """Base class for event stream serializers.

    Stores the settings shared by all serializers and provides the
    serialize() and write() drivers as well as text formatting of an
    event stream.  generate() does nothing here and is meant to be
    overridden (XMLSerializer does so).

    """

    # Class-level defaults; __init__ overrides them per instance.
    namespaces = namespace.namespaces
    src_encoding = encoding = 'utf-8'
    format = output_formats['default']
    formatted = False
    inline = False

    def __init__(self, encoding=None, src_encoding=None,
            formatted=None, inline=None, format=None):
        """Initialize Serializer.

        You can change the following parameters:

        encoding: the output encoding
        src_encoding: the source encoding
        formatted: whether all tags should be considered formatted
        inline: whether all tags should be considered inline
        format: format to be applied (string or instance of Format)

        Parameters left as None keep the class-level default.

        """
        if encoding is not None:
            self.encoding = encoding
        if src_encoding is not None:
            self.src_encoding = src_encoding
        if formatted is not None:
            self.formatted = formatted
        if inline is not None:
            self.inline = inline
        # _get_format() resolves format names and falls back to the
        # current default when format is None.
        self.format = self._get_format(format)

    def _get_format(self, format):
        """Return the format to use for the given format argument.

        None selects self.format, a string is looked up in
        output_formats, anything else is returned unchanged.

        """
        if format is None:
            return self.format
        elif isinstance(format, basestring):
            return output_formats[format]
        else:
            return format

    def is_formatted(self, tagname):
        """Return the serializer-wide 'formatted' flag (tagname is unused)."""
        return self.formatted

    def is_inline(self, tagname):
        """Return the serializer-wide 'inline' flag (tagname is unused)."""
        return self.inline

    def serialize(self, stream, encoding=None,
            fragment=False, format=None):
        """Serialize the stream and return the result as one string.

        The chunks yielded by generate() are joined; unless fragment is
        true, the result is passed through Format.strip().

        """
        # Collect the chunks exactly once before joining.  This lets an
        # exception raised inside generate() propagate unchanged and
        # avoids re-running generate() on a partly consumed stream,
        # which could silently produce truncated output.
        chunks = list(self.generate(stream, encoding, fragment, format))
        text = ''.join(chunks)
        if not fragment:
            text = Format.strip(text)
        return text

    def write(self, stream, file, encoding=None,
            fragment=False, format=None):
        """Serialize the stream and write the chunks to a file.

        file is either an object with a write() method or something
        that is passed to open(file, 'wb').  A file opened here is
        closed again; a file object passed in is left open.

        """
        needs_closed = False
        if not hasattr(file, 'write'):
            needs_closed = True
            file = open(file, 'wb')
        try:
            write = file.write
            for text in self.generate(stream, encoding, fragment, format):
                write(text)
        finally:
            # only close what was opened by this method
            if needs_closed:
                file.close()

    def generate(self, stream, encoding=None,
            fragment=False, format=None):
        """Produce the serialized chunks; a no-op in this base class."""
        pass

    def apply_filters(self, stream, format=None):
        """Return the stream after the standard filters were applied.

        The stream is passed through _coalesce() with the source
        encoding and, if a format is given, through format_stream().

        """
        stream = _coalesce(stream, self.src_encoding)
        if format:
            stream = self.format_stream(stream, format)
        return stream

    def format_stream(self, stream, format):
        """Apply format to stream.

        Note that this method is unaware of the serialization of the tags
        and does only take into account the text inside the stream. So the
        results may sometimes differ from what you expect when formatting
        the complete serialized output.

        Text events are accumulated and emitted as a single TEXT event in
        front of the next START or END event (and once more at the end of
        the stream if text is left over).

        """
        filter_text = format.filter
        indent, wrap = format.indent, format.wrap
        if indent is not None:
            indent_lines = format.indent_lines
            lstrip_blanks = format.lstrip_blanks
            rstrip_blanks = format.rstrip_blanks
            lstrip_lines = format.lstrip_lines
            min_level, max_level = format.min_level, format.max_level
            # one entry per open, non-formatted block element; the flag
            # records whether the element's content spans several lines
            indent_level = []
            new_line = False

            def clamp_level(depth):
                # limit the nesting depth to max_level, then shift it
                # down by min_level without going below zero
                if max_level and depth > max_level:
                    depth = max_level
                if min_level:
                    depth -= min_level
                    if depth < 0:
                        depth = 0
                return depth

        if wrap is not None:
            wrap_lines = format.wrap_lines
            indent_width, new_offset = format.indent_width, format.new_offset
            offset = 0
        formatted = 0 # nesting depth of formatted elements
        text = last_char = ''
        for ev, item in stream:
            if ev == TEXT:
                text += item
            else:
                if ev in (START, END):
                    tag = item.tag
                    if not formatted:
                        text = filter_text(text, last_char)
                        if indent is None:
                            if wrap is not None:
                                text = wrap_lines(text, wrap, offset)
                        else:
                            level = clamp_level(len(indent_level))
                            if wrap is not None:
                                text = wrap_lines(text, wrap, offset,
                                    indent_width(level*indent))
                            if '\n' in text and indent_level:
                                indent_level[-1] = True
                            if new_line:
                                if lstrip_blanks(text)[:1] != '\n':
                                    text = '\n' + lstrip_blanks(text)
                                    offset = 0
                                new_line = False
                            if tag == Comment or not self.is_inline(tag):
                                if ev == START:
                                    if indent_level:
                                        if rstrip_blanks(text)[-1:] != '\n':
                                            text = rstrip_blanks(text) + '\n'
                                        text = indent_lines(text,
                                            level*indent)
                                        indent_level[-1] = True
                                    elif text:
                                        text = lstrip_lines(text)
                                    if tag != Comment \
                                            and not self.is_formatted(tag):
                                        indent_level.append(False)
                                else:
                                    if indent_level:
                                        if indent_level.pop():
                                            if rstrip_blanks(
                                                    text)[-1:] == '\n':
                                                text = rstrip_blanks(
                                                    text)[:-1]
                                            text = indent_lines(text,
                                                level*indent)
                                            text = rstrip_blanks(text) + '\n'
                                            # the closing tag goes on the
                                            # level of its parent
                                            level = clamp_level(
                                                len(indent_level))
                                            text += level*indent
                                    elif text:
                                        text = lstrip_lines(text)
                                    new_line = True
                            elif text:
                                text = indent_lines(text, level*indent)
                    if tag == Comment or self.is_formatted(tag):
                        if ev == START:
                            formatted += 1
                        elif formatted:
                            formatted -= 1
                            new_line = True
                    yield TEXT, text
                    if wrap is not None:
                        offset = new_offset(text, offset)
                    last_char = text[-1:]
                    text = ''
                yield ev, item
        if text:
            if not formatted:
                text = filter_text(text, last_char)
                # Wrap exactly once, as in the loop above; wrapping twice
                # (first without, then with the indentation width) could
                # break lines at inconsistent positions.
                if indent is None:
                    if wrap is not None:
                        text = wrap_lines(text, wrap, offset)
                else:
                    level = clamp_level(len(indent_level))
                    if wrap is not None:
                        text = wrap_lines(text, wrap, offset,
                            indent_width(level*indent))
                    # NOTE(review): the loop above strips via
                    # rstrip_blanks(text)[:-1] in the similar case;
                    # confirm that text[:-1] is intended here.
                    if rstrip_blanks(text)[-1:] == '\n':
                        text = text[:-1]
                    text = indent_lines(text, level*indent)
            yield TEXT, text
0262
0263
0264class XMLSerializer(Serializer):
0265
0266 decl = True
0267 doctype = None
0268 entity_map = None
0269
0270 def __init__(self, encoding=None,
0271 decl=None, doctype=None, entity_map=None, namespaces=None,
0272 formatted=None, inline=None, format=None):
0273 """Initialize XMLSerializer.
0274
0275 You can change the following parameters:
0276
0277 encoding: the output encoding
0278 decl: add xml declaration at the beginning (True/False)
0279 doctype: add doctype (None, string, tuple)
0280 entity_map: use named entities for output (True/False or mapping)
0281 namespaces: mapping of namespaces
0282 formatted: whether all tags should be considered formatted
0283 inline: whether all tags should be considered inline
0284 format: format to be applied (string or instance of Format)
0285
0286 """
0287 Serializer.__init__(self, encoding=encoding,
0288 format=format, formatted=formatted, inline=inline)
0289 if decl is not None:
0290 self.decl = decl
0291 if doctype is not None:
0292 self.doctype = doctype
0293 if entity_map is not None:
0294 self.entity_map = entity_map
0295 if namespaces is not None:
0296 self.namespaces = namespaces
0297
0298 def can_be_empty_element(self, item_name):
0299 return True
0300
0301 def generate(self, stream, encoding=None,
0302 fragment=False, format=None):
0303 """Serializes an event stream to bytes of the specified encoding.
0304
0305 This function yields an encoded string over and over until the
0306 stream is exhausted.
0307
0308 """
0309 decl = self.decl
0310 doctype = self.doctype
0311 encoding = encoding or self.encoding or 'utf-8'
0312 entity_map = self.entity_map
0313 format = self._get_format(format)
0314 if format:
0315 if format.decl is not None:
0316 decl = format.decl
0317 if format.doctype is not None:
0318 doctype = format.doctype
0319 if format.entity_map is not None:
0320 entity_map = format.entity_map
0321 if entity_map == True:
0322
0323 entity_map = default_entity_map
0324 elif entity_map == False:
0325 entity_map = None
0326 if isinstance(doctype, basestring):
0327
0328 doctype = doctypes[self.doctype]
0329
0330 escape_cdata = XMLSerializer.escape_cdata
0331 escape_attrib = XMLSerializer.escape_attrib
0332
0333 lastev = None
0334 stream = iter(stream)
0335 names = NamespaceStack(self.namespaces)
0336 if not fragment:
0337 if decl:
0338 yield '<?xml version="1.0" encoding="%s"?>\n' % encoding
0339 if doctype is not None:
0340 yield serialize_doctype(doctype) + '\n'
0341 text = None
0342 for ev, item in self.apply_filters(stream, format):
0343 if ev in (START, END) and item.tag == Fragment:
0344 continue
0345 elif ev == TEXT:
0346 if text is not None:
0347 text = u''.join([text, item])
0348 else:
0349 text = item
0350 continue
0351 if lastev == START:
0352 if ev == END and (not text or not (Format.strip(text)
0353 or self.is_formatted(item.tag))) and self.can_be_empty_element(item.tag):
0355 yield ' />'
0356 lastev = END
0357 text = None
0358 names.pop()
0359 continue
0360 yield ">"
0361 if text:
0362 yield escape_cdata(text, encoding, entity_map)
0363 text = None
0364 if ev == START:
0365 if item.tag == Comment:
0366 yield "<!--%s-->" % item.text.encode(encoding)
0367 lastev = COMMENT
0368 continue
0369 elif item.tag == ProcessingInstruction:
0370 yield "<?%s?>" % item.text.encode(encoding)
0371 lastev = PI
0372 continue
0373 else:
0374 current_names = names.current
0375 names.push(namespaces(item, remove=True))
0376 qname = names.qname(item.tag, default=True)
0377 yield "<" + qname.encode(encoding)
0378 for k, v in item.attrib.items():
0379 k = names.qname(k, default=False).encode(encoding)
0380 v = escape_attrib(v, encoding)
0381 yield ' %s="%s"' % (k, v)
0382 for prefix, uri in names.current.items():
0383 if prefix not in current_names or current_names[prefix] != uri:
0385 v = escape_attrib(uri, encoding)
0386 if prefix:
0387 k = 'xmlns:' + prefix.encode(encoding)
0388 else:
0389 k = 'xmlns'
0390 yield ' %s="%s"' % (k, v)
0391 elif ev == END and item.tag not in (
0392 Comment, ProcessingInstruction):
0393 qname = names.qname(item.tag, default=True)
0394 yield "</%s>" % qname.encode(encoding)
0395 names.pop()
0396 lastev = ev
0397 if fragment and text:
0398 yield escape_cdata(text, encoding, entity_map)
0399 return
0400<