23 months ago
ensure .mako goes in here
1 # filters.py
2 # Copyright (C) 2006, 2007, 2008, 2009, 2010 Geoffrey T. Dairiki <dairiki@dairiki.org> and Michael Bayer <mike_mp@zzzcomputing.com>
3 #
4 # This module is part of Mako and is released under
5 # the MIT License: http://www.opensource.org/licenses/mit-license.php
8 import re, urllib, htmlentitydefs, codecs
9 from StringIO import StringIO
10 from mako import util
11 import markupsafe
# Replacement table used by the regex-based escapers: maps each markup
# special character to the character reference that stands in for it.
# The two quote characters use numeric references.
xml_escapes = {
    '&': '&amp;',
    '>': '&gt;',
    '<': '&lt;',
    '"': '&#34;',  # numeric form of &quot;
    "'": '&#39;',  # numeric form of &apos; (the named form is XML-only)
}

# XXX: &quot; is valid in HTML and XML
#      &apos; is not valid HTML, but is valid XML
def html_escape(string):
    """Escape ``string`` for HTML by delegating to ``markupsafe.escape``."""
    escaped = markupsafe.escape(string)
    return escaped
def legacy_html_escape(string):
    """Legacy HTML escape for non-unicode mode.

    Each of ``&``, ``<``, ``"``, ``'`` and ``>`` found in ``string`` is
    replaced by its entry in ``xml_escapes``.
    """
    special_chars = re.compile(r'([&<"\'>])')

    def _replacement(match):
        return xml_escapes[match.group()]

    return special_chars.sub(_replacement, string)
def xml_escape(string):
    """Escape the XML special characters in ``string``.

    The characters ``&``, ``<``, ``"``, ``'`` and ``>`` are looked up in
    ``xml_escapes`` and substituted with the reference found there.
    """
    def _lookup(found):
        return xml_escapes[found.group()]

    return re.sub(r'([&<"\'>])', _lookup, string)
35 def url_escape(string):
36 # convert into a list of octets
37 string = string.encode("utf8")
38 return urllib.quote_plus(string)
def url_unescape(string):
    """Reverse ``quote_plus`` quoting of ``string``.

    A result that ``is_ascii_str`` accepts is returned unchanged; any
    other result is decoded from UTF-8 before being returned.
    """
    unquoted = urllib.unquote_plus(string)
    if is_ascii_str(unquoted):
        return unquoted
    return unquoted.decode("utf8")
def trim(string):
    """Return ``string`` with leading and trailing whitespace stripped."""
    stripped = string.strip()
    return stripped
50 class Decode(object):
51 def __getattr__(self, key):
52 def decode(x):
53 if isinstance(x, unicode):
54 return x
55 elif not isinstance(x, str):
56 return unicode(str(x), encoding=key)
57 else:
58 return unicode(x, encoding=key)
59 return decode
60 decode = Decode()
# Matches a string made up solely of characters in the range 0x00-0x7f
# (the empty string included).
_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')


def is_ascii_str(text):
    """Tell whether ``text`` is a ``str`` holding only 7-bit characters.

    Returns ``False`` when ``text`` is not a ``str``; otherwise returns the
    result of the regex match (a match object, or ``None`` on no match), so
    callers should rely on truthiness only.
    """
    if not isinstance(text, str):
        return False
    return _ASCII_re.match(text)
68 ################################################################
class XMLEntityEscaper(object):
    """Convert between characters and XML/HTML character references.

    :param codepoint2name: mapping of integer code point to entity name.
    :param name2codepoint: mapping of entity name to integer code point.
    """

    def __init__(self, codepoint2name, name2codepoint):
        # Pre-render every named entity as ``&name;``, keyed by code point,
        # so the table can be passed straight to ``unicode.translate``.
        self.codepoint2entity = dict([(c, u'&%s;' % n)
                                      for c, n in codepoint2name.iteritems()])
        self.name2codepoint = name2codepoint

    def escape_entities(self, text):
        """Replace characters with their character entity references.

        Only characters corresponding to a named entity are replaced.
        """
        return unicode(text).translate(self.codepoint2entity)

    def __escape(self, m):
        """Return the reference for the single character matched by ``m``.

        The named entity is preferred; a hexadecimal numeric reference is
        used when the code point has no name.
        """
        codepoint = ord(m.group())
        try:
            return self.codepoint2entity[codepoint]
        except (KeyError, IndexError):
            return '&#x%X;' % codepoint

    # Characters rewritten by escape(): the markup specials plus everything
    # outside the 7-bit range.
    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')

    def escape(self, text):
        """Replace characters with their character references.

        Replace characters by their named entity references.
        Non-ASCII characters, if they do not have a named entity reference,
        are replaced by numerical character references.

        The return value is guaranteed to be ASCII.
        """
        return self.__escapable.sub(self.__escape, unicode(text)
                                    ).encode('ascii')

    # XXX: This regexp will not match all valid XML entity names__.
    # (It punts on details involving CombiningChars and Extenders.)
    #
    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
    #
    # Hexadecimal digits are accepted in either case: escape() above emits
    # upper-case digits (via '%X'), so a lower-case-only pattern could not
    # undo this class's own output.
    __characterrefs = re.compile(r'''& (?:
                                          \#(\d+)
                                          | \#x([\da-fA-F]+)
                                          | ( (?!\d) [:\w] [-.:\w]+ )
                                          ) ;''',
                                 re.X | re.UNICODE)

    def __unescape(self, m):
        """Return the character for the reference matched by ``m``.

        Exactly one of the three groups is set: decimal digits, hexadecimal
        digits, or an entity name.
        """
        dval, hval, name = m.groups()
        if dval:
            codepoint = int(dval)
        elif hval:
            codepoint = int(hval, 16)
        else:
            codepoint = self.name2codepoint.get(name, 0xfffd)
            # U+FFFD = "REPLACEMENT CHARACTER"
        # 7-bit code points are returned through chr(), all others through
        # unichr().
        if codepoint < 128:
            return chr(codepoint)
        return unichr(codepoint)

    def unescape(self, text):
        """Unescape character references.

        All character references (both entity references and numerical
        character references) are unescaped.  Entity names missing from
        ``name2codepoint`` become U+FFFD.
        """
        return self.__characterrefs.sub(self.__unescape, text)
# Module-wide escaper built from the HTML entity tables in htmlentitydefs.
_html_entities_escaper = XMLEntityEscaper(
    codepoint2name=htmlentitydefs.codepoint2name,
    name2codepoint=htmlentitydefs.name2codepoint,
)

# Function-style aliases for the escaper's bound methods.
html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape
def htmlentityreplace_errors(ex):
    """An encoding error handler for the python `codecs`_ machinery.

    Unencodable characters are replaced with HTML entities, or, if no HTML
    entity exists for the character, with XML character references.
    Exceptions other than ``UnicodeEncodeError`` are re-raised.

    >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
    'The cost was &euro;12.'
    """
    # Only encoding failures can be repaired here.
    if not isinstance(ex, UnicodeEncodeError):
        raise ex
    unencodable = ex.object[ex.start:ex.end]
    replacement = _html_entities_escaper.escape(unencodable)
    # Hand back the substitute text and the position to resume encoding at.
    return (unicode(replacement), ex.end)


codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
# TODO: options to make this dynamic per-compilation will be added in a
# later release
#
# Maps each short filter name to a string naming the callable behind it.
# NOTE(review): how these strings are resolved is not visible in this
# module -- confirm against the code that consumes the table.
DEFAULT_ESCAPES = {
    'x': 'filters.xml_escape',
    'h': 'filters.html_escape',
    'u': 'filters.url_escape',
    'trim': 'filters.trim',
    'entity': 'filters.html_entities_escape',
    'unicode': 'unicode',
    'decode': 'decode',
    'str': 'str',
    'n': 'n',
}
# When util.py3k is set, the ``unicode`` filter name maps to ``str``.
if util.py3k:
    DEFAULT_ESCAPES['unicode'] = 'str'

# Copy of the default table in which ``h`` names the legacy HTML escaper.
NON_UNICODE_ESCAPES = dict(DEFAULT_ESCAPES, h='filters.legacy_html_escape')