mako/filters.py

23 months ago

author
zzzeek
date
Tue Jun 22 17:38:45 2010 -0400
changeset 436
7c0d449fb0aa
parent 384
e2e48b721d99
child 443
3c97a5d3dbd3
permissions
-rw-r--r--

ensure .mako goes in here

     1 # filters.py
     2 # Copyright (C) 2006, 2007, 2008, 2009, 2010 Geoffrey T. Dairiki <dairiki@dairiki.org> and Michael Bayer <mike_mp@zzzcomputing.com>
     3 #
     4 # This module is part of Mako and is released under
     5 # the MIT License: http://www.opensource.org/licenses/mit-license.php
     8 import re, urllib, htmlentitydefs, codecs
     9 from StringIO import StringIO
    10 from mako import util
    11 import markupsafe
    13 xml_escapes = {
    14     '&' : '&amp;',
    15     '>' : '&gt;', 
    16     '<' : '&lt;', 
    17     '"' : '&#34;',   # also &quot; in html-only
    18     "'" : '&#39;'    # also &apos; in html-only    
    19 }
    21 # XXX: &quot; is valid in HTML and XML
    22 #      &apos; is not valid HTML, but is valid XML
    24 def html_escape(string):
    25     return markupsafe.escape(string)
    27 def legacy_html_escape(string):
    28     """legacy HTML escape for non-unicode mode."""
    30     return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)
    32 def xml_escape(string):
    33     return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)
    35 def url_escape(string):
    36     # convert into a list of octets
    37     string = string.encode("utf8")
    38     return urllib.quote_plus(string)
    40 def url_unescape(string):
    41     text = urllib.unquote_plus(string)
    42     if not is_ascii_str(text):
    43         text = text.decode("utf8")
    44     return text
    46 def trim(string):
    47     return string.strip()
    50 class Decode(object):
    51     def __getattr__(self, key):
    52         def decode(x):
    53             if isinstance(x, unicode):
    54                 return x
    55             elif not isinstance(x, str):
    56                 return unicode(str(x), encoding=key)
    57             else:
    58                 return unicode(x, encoding=key)
    59         return decode
    60 decode = Decode()
    63 _ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')
    65 def is_ascii_str(text):
    66     return isinstance(text, str) and _ASCII_re.match(text)
    68 ################################################################    
    70 class XMLEntityEscaper(object):
    71     def __init__(self, codepoint2name, name2codepoint):
    72         self.codepoint2entity = dict([(c, u'&%s;' % n)
    73                                       for c,n in codepoint2name.iteritems()])
    74         self.name2codepoint = name2codepoint
    76     def escape_entities(self, text):
    77         """Replace characters with their character entity references.
    79         Only characters corresponding to a named entity are replaced.
    80         """
    81         return unicode(text).translate(self.codepoint2entity)
    83     def __escape(self, m):
    84         codepoint = ord(m.group())
    85         try:
    86             return self.codepoint2entity[codepoint]
    87         except (KeyError, IndexError):
    88             return '&#x%X;' % codepoint
    91     __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')
    93     def escape(self, text):
    94         """Replace characters with their character references.
    96         Replace characters by their named entity references.
    97         Non-ASCII characters, if they do not have a named entity reference,
    98         are replaced by numerical character references.
   100         The return value is guaranteed to be ASCII.
   101         """
   102         return self.__escapable.sub(self.__escape, unicode(text)
   103                                     ).encode('ascii')
   105     # XXX: This regexp will not match all valid XML entity names__.
   106     # (It punts on details involving involving CombiningChars and Extenders.)
   107     #
   108     # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
   109     __characterrefs = re.compile(r'''& (?:
   110                                           \#(\d+)
   111                                           | \#x([\da-f]+)
   112                                           | ( (?!\d) [:\w] [-.:\w]+ )
   113                                           ) ;''',
   114                                  re.X | re.UNICODE)
   116     def __unescape(self, m):
   117         dval, hval, name = m.groups()
   118         if dval:
   119             codepoint = int(dval)
   120         elif hval:
   121             codepoint = int(hval, 16)
   122         else:
   123             codepoint = self.name2codepoint.get(name, 0xfffd)
   124             # U+FFFD = "REPLACEMENT CHARACTER"
   125         if codepoint < 128:
   126             return chr(codepoint)
   127         return unichr(codepoint)
   129     def unescape(self, text):
   130         """Unescape character references.
   132         All character references (both entity references and numerical
   133         character references) are unescaped.
   134         """
   135         return self.__characterrefs.sub(self.__unescape, text)
   138 _html_entities_escaper = XMLEntityEscaper(htmlentitydefs.codepoint2name,
   139                                           htmlentitydefs.name2codepoint)
   141 html_entities_escape = _html_entities_escaper.escape_entities
   142 html_entities_unescape = _html_entities_escaper.unescape
   145 def htmlentityreplace_errors(ex):
   146     """An encoding error handler.
   148     This python `codecs`_ error handler replaces unencodable
   149     characters with HTML entities, or, if no HTML entity exists for
   150     the character, XML character references.
   152     >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
   153     'The cost was &euro;12.'
   154     """
   155     if isinstance(ex, UnicodeEncodeError):
   156         # Handle encoding errors
   157         bad_text = ex.object[ex.start:ex.end]
   158         text = _html_entities_escaper.escape(bad_text)
   159         return (unicode(text), ex.end)
   160     raise ex
   162 codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
   165 # TODO: options to make this dynamic per-compilation will be added in a later release
   166 DEFAULT_ESCAPES = {
   167     'x':'filters.xml_escape',
   168     'h':'filters.html_escape',
   169     'u':'filters.url_escape',
   170     'trim':'filters.trim',
   171     'entity':'filters.html_entities_escape',
   172     'unicode':'unicode',
   173     'decode':'decode',
   174     'str':'str',
   175     'n':'n'
   176 }
   178 if util.py3k:
   179     DEFAULT_ESCAPES.update({
   180         'unicode':'str'
   181     })
   183 NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy()
   184 NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape'

mercurial