0001# -*- coding: utf-8 -*-
0002
0003"""Infoset serialization formats (XML, XHTML, HTML, etc)"""
0004
0005__revision__ = "$Rev: 492 $"
0006__date__ = "$Date: 2007-07-06 21:38:45 -0400 (Fri, 06 Jul 2007) $"
0007__author__ = "Ryan Tomayko (rtomayko@gmail.com)"
0008__copyright__ = "Copyright 2004-2005, Ryan Tomayko"
0009__license__ = "MIT <http://www.opensource.org/licenses/mit-license.php>"
0010
0011import htmlentitydefs
0012
0013try:
0014    set
0015except NameError: # fallback for Python 2.3
0016    from sets import Set as set
0017
0018from kid.element import Element, Comment, ProcessingInstruction,       Fragment, QName, namespaces, encode_entity, raise_serialization_error
0020import kid.namespace as namespace
0021from kid.parser import START, END, TEXT, COMMENT, PI, _coalesce
0022from kid.format import Format, output_formats
0023
0024__all__ = ['doctypes', 'Serializer', 'XMLSerializer', 'HTMLSerializer']
0025
# URIs of the XML and XHTML namespaces, taken from kid.namespace
xml_uri = namespace.xml.uri
xhtml_uri = namespace.xhtml.uri

# Default entity map: one "&name;" reference for every code point
# listed in htmlentitydefs.codepoint2name, keyed by the character.
default_entity_map = dict([(unichr(k), "&%s;" % v)
    for k, v in htmlentitydefs.codepoint2name.items()])
0034
# Frequently used doctypes, keyed by a short name.
# Every value is a tuple of the doctype name and the public identifier,
# followed by the DTD URL except for the "quirks" variants.
# Serializers take either a key of this mapping or such a tuple.
doctypes = {
    'wml': (
        'wml',
        "-//WAPFORUM//DTD WML 1.1//EN",
        "http://www.wapforum.org/DTD/wml_1.1.xml",
    ),
    'xhtml-strict': (
        'html',
        "-//W3C//DTD XHTML 1.0 Strict//EN",
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
    ),
    'xhtml': (
        'html',
        "-//W3C//DTD XHTML 1.0 Transitional//EN",
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
    ),
    'xhtml-frameset': (
        'html',
        "-//W3C//DTD XHTML 1.0 Frameset//EN",
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",
    ),
    'html-strict': (
        'HTML',
        "-//W3C//DTD HTML 4.01//EN",
        "http://www.w3.org/TR/html4/strict.dtd",
    ),
    'html': (
        'HTML',
        "-//W3C//DTD HTML 4.01 Transitional//EN",
        "http://www.w3.org/TR/html4/loose.dtd",
    ),
    'html-frameset': (
        'HTML',
        "-//W3C//DTD HTML 4.01 Frameset//EN",
        "http://www.w3.org/TR/html4/frameset.dtd",
    ),
    'html-quirks': (
        'HTML',
        '-//W3C//DTD HTML 4.01 Transitional//EN',
    ),
    'html-frameset-quirks': (
        'HTML',
        "-//W3C//DTD HTML 4.01 Frameset//EN",
    ),
}
0055
0056
class Serializer(object):
    """Base class for event stream serializers.

    Holds the settings shared by all serializers (encodings, output
    format, formatted/inline flags) and implements serialize() and
    write() on top of generate(), which subclasses have to provide.

    """

    # namespace mapping taken from kid.namespace
    namespaces = namespace.namespaces
    # source and output encoding share the same default
    src_encoding = encoding = 'utf-8'
    # format used when no format is passed explicitly
    format = output_formats['default']
    # values returned by is_formatted() and is_inline() for every tag
    formatted = False
    inline = False

    def __init__(self, encoding=None, src_encoding=None,
            formatted=None, inline=None, format=None):
        """Initialize Serializer.

        You can change the following parameters:

        encoding: the output encoding
        src_encoding: the source encoding
        formatted: whether all tags should be considered formatted
        inline: whether all tags should be considered inline
        format: format to be applied (string or instance of Format)

        Parameters left at None keep the class level default.

        """
        if encoding is not None:
            self.encoding = encoding
        if src_encoding is not None:
            self.src_encoding = src_encoding
        if formatted is not None:
            self.formatted = formatted
        if inline is not None:
            self.inline = inline
        # _get_format() resolves format names and falls back to the
        # current default for None, so a single assignment is enough.
        self.format = self._get_format(format)

    def _get_format(self, format):
        """Resolve a format argument.

        None yields the format stored on the serializer, a string is
        looked up in output_formats (KeyError if unknown) and any
        other object is returned unchanged.

        """
        if format is None:
            return self.format
        elif isinstance(format, basestring):
            return output_formats[format]
        else:
            return format

    def is_formatted(self, tagname):
        """Tell whether the tag is formatted (same answer for all tags)."""
        return self.formatted

    def is_inline(self, tagname):
        """Tell whether the tag is inline (same answer for all tags)."""
        return self.inline

    def serialize(self, stream, encoding=None,
            fragment=False, format=None):
        """Serialize the stream and return the result as one string.

        The arguments are handed over to generate(). Unless fragment
        is true, the joined text is passed through Format.strip().

        """
        # Collect the pieces before joining: str.join() in Python < 2.5
        # hides a TypeError raised inside a generator (bug 905389).
        # Running generate() a second time to work around that could
        # operate on a partly consumed stream, so generate only once.
        pieces = list(self.generate(stream, encoding, fragment, format))
        text = ''.join(pieces)
        if not fragment:
            text = Format.strip(text)
        return text

    def write(self, stream, file, encoding=None,
            fragment=False, format=None):
        """Serialize the stream and write the result to a file.

        file is either an object with a write() method or something
        that is passed to open() with mode 'wb'; in the latter case
        the file is closed again when writing is done or has failed.

        """
        needs_closed = False
        if not hasattr(file, 'write'):
            needs_closed = True
            file = open(file, 'wb')
        try:
            write = file.write
            for text in self.generate(stream, encoding, fragment, format):
                write(text)
        finally:
            # only close a file if it was opened locally
            if needs_closed:
                file.close()

    def generate(self, stream, encoding=None,
            fragment=False, format=None):
        """Produce the serialized pieces; to be overridden by subclasses.

        This base implementation does nothing and returns None.

        """
        pass

    def apply_filters(self, stream, format=None):
        """Prepare the stream for serialization.

        The stream is passed through _coalesce() with the source
        encoding and, if a format is given, through format_stream().

        """
        stream = _coalesce(stream, self.src_encoding)
        if format:
            stream = self.format_stream(stream, format)
        return stream

    def format_stream(self, stream, format):
        """Apply format to stream.

        Note that this method is unaware of the serialization of the tags
        and does only take into account the text inside the stream. So the
        results may sometimes differ from what you expect when formatting
        the complete serialized output.

        This is a generator yielding (event, item) pairs. The text
        collected before each START or END event is yielded as one
        reworked TEXT event in front of that event.

        """
        filter_text = format.filter
        indent, wrap = format.indent, format.wrap
        if indent is not None:
            indent_lines = format.indent_lines
            lstrip_blanks = format.lstrip_blanks
            rstrip_blanks = format.rstrip_blanks
            lstrip_lines = format.lstrip_lines
            min_level, max_level = format.min_level, format.max_level
            # One flag per open, indentable tag; the flag is set when
            # the content of the tag spans more than one line.
            indent_level = []
            new_line = False

            def current_level():
                # nesting depth limited by max_level, shifted by min_level
                level = len(indent_level)
                if max_level and level > max_level:
                    level = max_level
                if min_level:
                    level -= min_level
                    if level < 0:
                        level = 0
                return level

        if wrap is not None:
            wrap_lines = format.wrap_lines
            indent_width, new_offset = format.indent_width, format.new_offset
            offset = 0
        # number of formatted tags (or comments) currently open;
        # text is passed through untouched while this is non-zero
        formatted = 0
        text = last_char = ''
        for ev, item in stream:
            if ev == TEXT:
                # collect text until the next START or END event
                text += item
            else:
                if ev in (START, END):
                    tag = item.tag
                    if not formatted:
                        text = filter_text(text, last_char)
                        if indent is None:
                            if wrap is not None:
                                text = wrap_lines(text, wrap, offset)
                        else:
                            level = current_level()
                            if wrap is not None:
                                text = wrap_lines(text, wrap, offset,
                                    indent_width(level*indent))
                            if '\n' in text and indent_level:
                                indent_level[-1] = True
                            if new_line:
                                # a line break is due after the last tag
                                if lstrip_blanks(text)[:1] != '\n':
                                    text = '\n' + lstrip_blanks(text)
                                    offset = 0
                                new_line = False
                            if tag == Comment or not self.is_inline(tag):
                                if ev == START:
                                    if indent_level:
                                        # start the tag on a line of its own
                                        if rstrip_blanks(text)[-1:] != '\n':
                                            text = rstrip_blanks(text) + '\n'
                                        text = indent_lines(text, level*indent)
                                        indent_level[-1] = True
                                    elif text:
                                        text = lstrip_lines(text)
                                    if (tag != Comment
                                            and not self.is_formatted(tag)):
                                        indent_level.append(False)
                                else:
                                    if indent_level:
                                        if indent_level.pop():
                                            # multi-line content: the end
                                            # tag gets its own line at the
                                            # level of the enclosing tag
                                            if rstrip_blanks(text)[-1:] == '\n':
                                                text = rstrip_blanks(text)[:-1]
                                            text = indent_lines(text,
                                                level*indent)
                                            text = rstrip_blanks(text) + '\n'
                                            level = current_level()
                                            text += level*indent
                                    elif text:
                                        text = lstrip_lines(text)
                                    new_line = True
                            elif text:
                                text = indent_lines(text, level*indent)
                    if tag == Comment or self.is_formatted(tag):
                        if ev == START:
                            formatted += 1
                        elif formatted:
                            formatted -= 1
                            new_line = True
                    yield TEXT, text
                    if wrap is not None:
                        offset = new_offset(text, offset)
                    last_char = text[-1:]
                    text = ''
                yield ev, item
        if text:
            # text left over after the last event of the stream
            if not formatted:
                text = filter_text(text, last_char)
                # Wrap exactly once, the same way as inside the loop.
                if indent is None:
                    if wrap is not None:
                        text = wrap_lines(text, wrap, offset)
                else:
                    level = current_level()
                    if wrap is not None:
                        text = wrap_lines(text, wrap, offset,
                            indent_width(level*indent))
                    if rstrip_blanks(text)[-1:] == '\n':
                        # drop the newline together with the blanks
                        # following it, as done for END events above
                        text = rstrip_blanks(text)[:-1]
                    text = indent_lines(text, level*indent)
            yield TEXT, text
0262
0263
0264class XMLSerializer(Serializer):
0265
0266    decl = True
0267    doctype = None
0268    entity_map = None
0269
0270    def __init__(self, encoding=None,
0271            decl=None, doctype=None, entity_map=None, namespaces=None,
0272            formatted=None, inline=None, format=None):
0273        """Initialize XMLSerializer.
0274
0275        You can change the following parameters:
0276
0277        encoding: the output encoding
0278        decl: add xml declaration at the beginning (True/False)
0279        doctype: add doctype (None, string, tuple)
0280        entity_map: use named entities for output (True/False or mapping)
0281        namespaces: mapping of namespaces
0282        formatted: whether all tags should be considered formatted
0283        inline: whether all tags should be considered inline
0284        format: format to be applied (string or instance of Format)
0285
0286        """
0287        Serializer.__init__(self, encoding=encoding,
0288            format=format, formatted=formatted, inline=inline)
0289        if decl is not None:
0290            self.decl = decl
0291        if doctype is not None:
0292            self.doctype = doctype
0293        if entity_map is not None:
0294            self.entity_map = entity_map
0295        if namespaces is not None:
0296            self.namespaces = namespaces
0297
0298    def can_be_empty_element(self, item_name):
0299        return True
0300
0301    def generate(self, stream, encoding=None,
0302            fragment=False, format=None):
0303        """Serializes an event stream to bytes of the specified encoding.
0304
0305        This function yields an encoded string over and over until the
0306        stream is exhausted.
0307
0308        """
0309        decl = self.decl
0310        doctype = self.doctype
0311        encoding = encoding or self.encoding or 'utf-8'
0312        entity_map = self.entity_map
0313        format = self._get_format(format)
0314        if format:
0315            if format.decl is not None:
0316                decl = format.decl
0317            if format.doctype is not None:
0318                doctype = format.doctype
0319            if format.entity_map is not None:
0320                entity_map = format.entity_map
0321        if entity_map == True:
0322            # if True, use default HTML entity map
0323            entity_map = default_entity_map
0324        elif entity_map == False:
0325            entity_map = None
0326        if isinstance(doctype, basestring):
0327            # allow doctype strings
0328            doctype = doctypes[self.doctype]
0329
0330        escape_cdata = XMLSerializer.escape_cdata
0331        escape_attrib = XMLSerializer.escape_attrib
0332
0333        lastev = None
0334        stream = iter(stream)
0335        names = NamespaceStack(self.namespaces)
0336        if not fragment:
0337            if decl:
0338                yield '<?xml version="1.0" encoding="%s"?>\n' % encoding
0339            if doctype is not None:
0340                yield serialize_doctype(doctype) + '\n'
0341        text = None
0342        for ev, item in self.apply_filters(stream, format):
0343            if ev in (START, END) and item.tag == Fragment:
0344                continue
0345            elif ev == TEXT:
0346                if text is not None:
0347                    text = u''.join([text, item])
0348                else:
0349                    text = item
0350                continue
0351            if lastev == START:
0352                if ev == END and (not text or not (Format.strip(text)
0353                        or self.is_formatted(item.tag)))                           and self.can_be_empty_element(item.tag):
0355                    yield ' />'
0356                    lastev = END
0357                    text = None
0358                    names.pop()
0359                    continue
0360                yield ">"
0361            if text:
0362                yield escape_cdata(text, encoding, entity_map)
0363                text = None
0364            if ev == START:
0365                if item.tag == Comment:
0366                    yield "<!--%s-->" % item.text.encode(encoding)
0367                    lastev = COMMENT
0368                    continue
0369                elif item.tag == ProcessingInstruction:
0370                    yield "<?%s?>" % item.text.encode(encoding)
0371                    lastev = PI
0372                    continue
0373                else:
0374                    current_names = names.current
0375                    names.push(namespaces(item, remove=True))
0376                    qname = names.qname(item.tag, default=True)
0377                    yield "<" + qname.encode(encoding)
0378                    for k, v in item.attrib.items():
0379                        k = names.qname(k, default=False).encode(encoding)
0380                        v = escape_attrib(v, encoding)
0381                        yield ' %s="%s"' % (k, v)
0382                    for prefix, uri in names.current.items():
0383                        if prefix not in current_names                                   or current_names[prefix] != uri:
0385                            v = escape_attrib(uri, encoding)
0386                            if prefix:
0387                                k = 'xmlns:' + prefix.encode(encoding)
0388                            else:
0389                                k = 'xmlns'
0390                            yield ' %s="%s"' % (k, v)
0391            elif ev == END and item.tag not in (
0392                    Comment, ProcessingInstruction):
0393                qname = names.qname(item.tag, default=True)
0394                yield "</%s>" % qname.encode(encoding)
0395                names.pop()
0396            lastev = ev
0397        if fragment and text:
0398            yield escape_cdata(text, encoding, entity_map)
0399        return
0400<