-
Notifications
You must be signed in to change notification settings - Fork 0
/
typogrify.py
310 lines (241 loc) · 11.5 KB
/
typogrify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# -*- coding: utf-8 -*-
import re
import sys
import six
from six.moves.html_parser import HTMLParser
from six.moves.html_entities import name2codepoint
# Used to represent any tag
class any_tag(object):
    """Sentinel class standing for "any tag".

    The class object itself (never an instance) is used as the dictionary
    key under which ignore rules that are not bound to a specific tag name
    (for example ``.test`` or ``#test``) are stored.
    """
class Typogrify(object):
    """Apply typographic filters to the text nodes of an HTML document.

    The document is walked with a small ``HTMLParser`` subclass. Text found
    outside of ignored elements is passed through ``initial_quotes``,
    ``smarty_pants``, ``amp`` and ``caps``; ``widont`` is applied when an
    element is closed. Elements are excluded from filtering through the
    write-only ``ignores`` property.
    """

    # class variables
    __ignores = None
    __default_ignores = ['pre', 'code', 'script', 'kbd']
    __filters = ['smartypants', 'widont', 'caps', 'amp', 'initial_quotes']

    class _HTMLParser(HTMLParser):
        """Typogrify HTML Parser: A very simple parser, it determines when
        HTML text is being processed (as opposed to HTML tags) and applies
        the typogrify filters to the text.

        The output is rebuilt in ``data_buffer``: every time a text node is
        seen, the raw markup between the previous text node and this one is
        copied verbatim from the input, followed by the (possibly filtered)
        text.
        """

        # Elements without an end tag. They must not take part in the
        # nesting count, otherwise a ``<br>`` inside an ignored element
        # would keep filtering switched off for the rest of the document.
        void_tags = frozenset([
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
            'link', 'meta', 'param', 'source', 'track', 'wbr'])

        def __init__(self, typogrify, html_doc):
            """Parse ``html_doc`` immediately.

            :param typogrify: the ``Typogrify`` instance providing the
                filters and the ignore rules
            :param html_doc: the HTML source; surrounding whitespace is
                stripped before parsing
            """
            self.html_doc = html_doc.strip()
            self.typogrify = typogrify

            # All parsing state is per instance. In particular the line
            # offset table must not be a class attribute, or every parser
            # would share (and keep growing) the same dictionary.
            self.current_pos = 0           # input consumed so far
            self.filtering = True          # default is to filter everything
            self.intermediate_tags = 0     # nesting depth inside an ignored tag
            self.data_buffer = ''          # output produced so far
            self.filtered_data_length = 0  # length of the widont-eligible tail
            self.new_line_pos = {1: 0}     # line number -> absolute offset

            try:
                # Python 3.4+
                HTMLParser.__init__(self, convert_charrefs=False)
            except TypeError:
                HTMLParser.__init__(self)

            # Mark the new line positions - needed to turn the
            # (line, column) pair reported by getpos() into an index
            # within the input string
            line_number = 1
            for index, char in enumerate(self.html_doc):
                if char == "\n":
                    line_number += 1
                    # Add one due to index being zero based
                    self.new_line_pos[line_number] = index + 1

            self.feed(self.html_doc)  # start parsing

        def handle_starttag(self, tag, attrs):
            """Records the current tag and determines if filters should be
            applied. If intermediate_tags > 0 then this tag is already
            being ignored (not filtered) because a parent was specified to
            be ignored, and only the nesting depth is updated."""
            if tag in self.void_tags:
                return
            if self.intermediate_tags > 0:
                self.intermediate_tags += 1
                return
            self.filtering = self.typogrify._should_be_filtered(tag, attrs)
            self.intermediate_tags = 0 if self.filtering else 1

        def handle_data(self, data):
            """Copies the markup preceding this text node to the output and
            appends the text, filtered unless it is being ignored."""
            line_num, offset = self.getpos()
            new_pos = self.new_line_pos[line_num] + offset
            self.data_buffer += self.html_doc[self.current_pos:new_pos]
            if self.filtering:
                content = self.typogrify._apply_filters(data, self.lasttag)
                self.filtered_data_length = len(content)
            else:
                content = data
                # Ignored text must never be picked up by widont later on
                self.filtered_data_length = 0
            self.data_buffer += content
            self.current_pos = new_pos + len(data)

        def handle_endtag(self, tag):
            """Tracks the end of ignored elements and applies the widont
            filter to the most recent filtered text node."""
            if tag in self.void_tags:
                return
            if self.intermediate_tags > 0:
                self.intermediate_tags -= 1
                if self.intermediate_tags == 0:
                    # The ignored element is closed: resume filtering
                    self.filtering = True
                return
            # Widont filter needs to be handled here. A length of zero
            # means there is no eligible text (and ``[-0:]`` would select
            # the whole buffer instead of nothing).
            if self.filtering and self.filtered_data_length > 0:
                cut = len(self.data_buffer) - self.filtered_data_length
                content = self.typogrify.widont(tag, self.data_buffer[cut:])
                self.data_buffer = self.data_buffer[:cut] + content
                # Keep the length in sync in case another end tag follows
                self.filtered_data_length = len(content)

        def get_output(self):
            """Returns the processed document. If current_pos has not
            reached the end of the document, the remainder gets appended
            here."""
            if self.current_pos < len(self.html_doc):
                self.data_buffer += self.html_doc[self.current_pos:]
                self.current_pos = len(self.html_doc)
            return self.data_buffer

    def __init__(self):
        """Class constructor"""
        # Set default variables
        self.ignores = []  # sets ignores to defaults

    @property
    def ignores(self):
        """Write-only property: reading it always yields ``None`` so that
        the processed ignore rules stay private."""
        return None

    @ignores.setter
    def ignores(self, value):
        """The setter of the ignore list, the format is as follows:
        ['div','span.test','#test'] would ignore: the tag div, the tag span
        if it has a class of test, all id's set to test.

        The default ignores are always added. The sequence passed in is
        not modified.
        """
        combined = list(value) + self.__default_ignores
        tags, attributes = self._process_ignores(combined)
        self.__ignores = [tags, attributes]

    def _process_ignores(self, ignores):
        """User specified HTML tags or attributes can be ignored. This
        method classifies the different ignores into three categories:
        1) Tags to be ignored (e.g. span, div)
        2) Attributes to be ignored, with # representing an id, and .
           representing a class (e.g. #test - ignore all id's that
           are set to test)
        3) Attributes on tags, using the same attribute format as
           mentioned above (e.g. span.test - ignore all span elements
           that have class set to test)

        :returns: a tuple ``(ignored_tags, ignored_attributes)`` where
            ``ignored_tags`` is a set of tag names and
            ``ignored_attributes`` maps a tag name (or ``any_tag``) to a
            set of ``.class`` / ``#id`` strings
        """
        # strip ws and make unique
        ignores = set(item.strip() for item in ignores)

        # classify ignores into categories
        tags = set(item for item in ignores
                   if '.' not in item and '#' not in item)
        generic_filters = set(item for item in ignores
                              if item.startswith(('.', '#')))
        tag_filters = ignores - (tags | generic_filters)

        # attributes that are to be ignored whatever the tag is
        ignored_attributes = {any_tag: set(generic_filters)}

        for item in tag_filters:
            tag_name, marker, attr_value = re.split(r'([.#])', item,
                                                    maxsplit=1)
            # Do not process if tag is already being ignored
            if tag_name not in tags:
                ignored_attributes.setdefault(tag_name, set()).add(
                    marker + attr_value)

        return (set(tags), ignored_attributes)

    def _should_be_filtered(self, tag, attrs):
        """Determines if the current html node should be filtered.
        A node should be filtered if its tag or its class or id
        attribute was not specified to be ignored by the user.

        :param tag: the tag name
        :param attrs: list of ``(name, value)`` attribute pairs
        :returns: ``False`` if the node is ignored, ``True`` otherwise
        """
        ignored_tags, ignored_attributes = self.__ignores

        # Test if the node's tag should be filtered
        if tag in ignored_tags:
            return False

        # Test if the node's attributes should be filtered. A new set is
        # built on purpose: an in-place union would add the tag specific
        # rules to the generic ones for every later node.
        filters = ignored_attributes[any_tag] | ignored_attributes.get(
            tag, set())

        for name, value in attrs:
            if name == 'class':
                marker = '.'
                # The whole value plus each of its space separated classes
                candidates = [value] + (value.split() if value else [])
            elif name == 'id':
                marker = '#'
                candidates = [value]
            else:
                continue
            if any('%s%s' % (marker, item) in filters
                   for item in candidates):
                return False
        return True

    #
    # Typogrify Filters
    #

    _amp_finder = re.compile(r"""
        (\s|&nbsp;)             # Group 1: prefixed whitespace
        (?:&amp;|&|&\#38;)      # The actual ampersand (non capturing group)
        (\s|&nbsp;)             # Group 2: suffixed whitespace
        """, re.VERBOSE)

    def amp(self, text):
        """Wraps ampersands in HTML with ``<span class="amp">`` so they can
        be styled with CSS. Ampersands are also normalized to ``&amp;``.
        Requires ampersands to have whitespace or an ``&nbsp;`` on both
        sides."""
        return self._amp_finder.sub(
            lambda match: '%s<span class="amp">&amp;</span>%s'
            % match.group(1, 2),
            text)

    _cap_finder = re.compile(r"""
        (                       # Start group capture
        (?=(?:\d*[A-Z]){2})     # Positive look ahead: At least two caps interspersed with any amount of digits must exist
        (?:[A-Z\d']*)           # Any amount of caps, digits or dumb apostrophes
        |                       # Or
        (?:[A-Z]+\.\s??){2,}    # Caps followed by '.' must be present at least twice (note \s?? which is non-greedy)
        )                       # End group capture
        """, re.VERBOSE)

    def caps(self, text):
        """Wraps multiple capital letters in ``<span class="caps">``
        so they can be styled with CSS."""
        return self._cap_finder.sub(
            lambda match: '<span class="caps">%s</span>' % match.group(1),
            text)

    _widont_tags = frozenset([
        'a', 'em', 'span', 'strong', 'i', 'b', 'p', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'dt', 'dd'])

    _widont_finder = re.compile(r"""
        \s+                     # The final whitespace before the last word
        (?![^<>]*>)             # which must not be whitespace inside a tag
        ((?:<[^>]*>|[^\s<])+)   # Group 1: the last word, with any inline markup
        (\s*)\Z                 # Group 2: trailing whitespace (kept as is)
        """, re.VERBOSE)

    def widont(self, tag, text):
        """Replaces the space between the last two words in a string with
        ``&nbsp;``. Works in these block tags ``(h1-h6, p, li, dd, dt)``
        and also in the inline elements ``a, em, strong, span, b, i``.

        Whitespace inside markup added by the other filters (for example
        ``<span class="caps">``) is never touched.

        :param tag: name of the element being closed
        :param text: the text to process
        :returns: the processed text, or ``text`` unchanged when ``tag``
            is not an approved tag or there is only one word
        """
        # Must be inside an approved tag
        if tag not in self._widont_tags:
            return text
        return self._widont_finder.sub(
            lambda match: '&nbsp;%s%s' % match.group(1, 2), text, 1)

    _quote_finder = re.compile(r"""
        \A(\s*)                 # Group 1: leading whitespace
        (                       # Group 2: the initial quote
        ("|&ldquo;|&\#8220;)    # Group 3: A double quote
        |                       # Or
        ('|&lsquo;|&\#8216;)    # Group 4: A single quote
        )                       # End group capture
        """, re.VERBOSE)

    def initial_quotes(self, text):
        """Wraps an initial quote in ``class="dquo"`` for double quotes or
        ``class="quo"`` for single quotes. Only a quote at the very start
        of the text (leading whitespace aside) is wrapped; quotes and
        apostrophes further into the text are left alone."""
        def wrap(match):
            css_class = 'dquo' if match.group(3) else 'quo'
            return '%s<span class="%s">%s</span>' % (
                match.group(1), css_class, match.group(2))

        return self._quote_finder.sub(wrap, text, 1)

    def smarty_pants(self, text):
        """Applies smarty pants to html text. When the optional
        ``smartypants`` package is not installed the text is returned
        unchanged and the fact is logged at debug level."""
        # Try to load smartypants; only the import itself is guarded so
        # that errors raised while converting are not hidden
        try:
            import smartypants
        except ImportError:
            import logging
            logging.getLogger(__name__).debug(
                'smartypants is not installed, skipping the filter')
            return text
        return smartypants.smartypants(text)

    def _apply_filters(self, text, tag):
        """Applies the above filters to the text nodes of the HTML doc.

        :param text: content of the text node
        :param tag: the most recently opened tag (currently unused)
        """
        # The order of the filters below is important
        # and should not be changed

        # initial_quotes needs to happen at this point so that
        # attribute values introduced later on do not get affected
        text = self.initial_quotes(text)
        text = self.smarty_pants(text)
        text = self.amp(text)
        text = self.caps(text)
        return text

    def filter(self, html_doc, tags=None, session_ignores=None, session_filters=None):
        """Produces Typogryfied html for the Pelican static project.

        :param html_doc: the HTML source to process
        :param tags: accepted for compatibility, currently unused
        :param session_ignores: accepted for compatibility, currently unused
        :param session_filters: accepted for compatibility, currently unused
        :returns: the processed HTML as a string
        """
        parser = self._HTMLParser(self, html_doc)
        return parser.get_output()