1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """\ |
---|
4 | This tool regenerates and replaces the ToC in an HTML file from the actual |
---|
5 | structure of <div>s and <h[2345]>s present in the body of the document. |
---|
6 | The section to be overwritten is identified as the XML subtree |
---|
7 | rooted at <ol id="toc">. |
---|
8 | |
---|
9 | Usage: ./toctool.py filename... |
---|
10 | |
---|
11 | This file is a copy of r37798 from the Subversion repository. |
---|
12 | |
---|
13 | """ |
---|
14 | |
---|
15 | import sys |
---|
16 | import os |
---|
17 | import xml.parsers.expat |
---|
18 | |
---|
19 | |
---|
class Index:
    """In-memory table of contents built while scanning a document.

    ``tree`` is a list of nodes.  Every node is a list whose first item is
    an ``(id, title)`` tuple and whose remaining items are child nodes of
    the same shape.  ``title`` holds the document title, or None until one
    has been assigned.
    """

    def __init__(self):
        self.title = None
        self.tree = []
        # Stack of the lists currently being filled; the last entry is the
        # innermost open level, the first is always ``self.tree``.
        self._ptr_stack = [self.tree]

    def addLevel(self, id, title):
        """Append a new node under the current level and make it current."""
        entry = [(id, title)]
        parent = self._ptr_stack[-1]
        parent.append(entry)
        self._ptr_stack.append(entry)

    def upLevel(self):
        """Leave the current level, returning to its parent."""
        self._ptr_stack.pop()

    def prettyString(self):
        """Return a plain-text dump: one ``(id, title)`` tuple per line,
        indented according to its nesting depth."""
        lines = []

        def walk(depth, node):
            # Leaves are the (id, title) tuples; everything else is a list
            # that is descended into one level deeper.
            if not isinstance(node, list):
                lines.append("%s%s" % (" " * depth, node))
                return
            for child in node:
                walk(depth + 1, child)

        # Starting at -2 puts the top-level tuples at depth 0: one level is
        # consumed by the tree list and one by each node list.
        walk(-2, self.tree)
        return "\n".join(lines)

    def renderXML(self):
        """Return the ToC as an ``<ol id="toc">`` subtree of nested lists."""
        lines = ['<ol id="toc">']

        def emit(depth, node):
            pad = ' ' * depth
            head = node[0]
            link = '%s<li><a href="#%s">%s</a>' % (pad, head[0], head[1])
            if len(node) == 1:
                # No children: open and close the item on a single line.
                lines.append(link + '</li>')
                return
            lines.append(link)
            lines.append('%s<ol>' % pad)
            for child in node[1:]:
                emit(depth + 1, child)
            lines.append('%s</ol>' % pad)
            # The trailing comment names the section that is being closed.
            lines.append('%s</li> <!-- %s -->' % (pad, head[0]))

        for top in self.tree:
            emit(1, top)
        lines.append('</ol>')
        return "\n".join(lines)
---|
64 | |
---|
65 | |
---|
class ExpatParseJob:
    """Base class that runs an expat parse using the object's own handlers.

    The object being parsed with must provide:

    * ``_ordered_attributes`` -- copied to the parser's
      ``ordered_attributes`` flag (True: attributes arrive as a flat
      ``[name, value, ...]`` list; False: as a dict).
    * any number of attributes whose names end in ``Handler``; each one is
      installed on the parser under the same name.
    """

    def parse(self, file):
        """Parse ``file`` with a fresh expat parser.

        ``file`` is handed to ``ParseFile`` unchanged, so it must provide a
        ``read(nbytes)`` method.  Errors raised by expat or by the handlers
        propagate to the caller.
        """
        p = xml.parsers.expat.ParserCreate()
        p.ordered_attributes = self._ordered_attributes
        try:
            # Python 2 only: ask for byte strings instead of unicode objects.
            p.returns_unicode = False
        except AttributeError:
            # Parsers without this attribute (Python 3) always pass str to
            # the handlers, so there is nothing to configure.
            pass
        p.specified_attributes = True
        for name in dir(self):
            if name.endswith('Handler'):
                setattr(p, name, getattr(self, name))
        p.ParseFile(file)
---|
76 | |
---|
77 | |
---|
class IndexBuildParse(ExpatParseJob):
    """First pass over the document: collect the title and section tree.

    A section is a ``<div>`` whose ``class`` attribute is one of ``keys``.
    The first element inside it whose tag name equals that class value
    supplies the section title, and the div's ``id`` attribute supplies the
    anchor.  The result is accumulated in ``self.index``.
    """

    # Recognised section classes.  Only membership is tested; the values
    # are unused.
    keys = {'h2': None, 'h3': None, 'h4': None, 'h5': None}

    def __init__(self):
        self.index = Index()
        self.keyptr = 0  # not read anywhere in this class
        # False, or the name of the element whose text is being collected.
        self.collecting_text = False
        self.text = ''
        # Heading element expected inside the most recent section div.
        self.waiting_for_elt = None
        # ``id`` attribute of the most recent section div.
        self.saved_id = None
        # One (element name, is_section_div) pair per open element.
        self.elt_stack = []
        # Attributes are delivered as a dict (see ExpatParseJob.parse).
        self._ordered_attributes = False

    def StartElementHandler(self, name, attrs):
        """Track open elements and notice section divs, titles and headings."""
        if name == 'div':
            cl = attrs.get('class')
            if cl in self.keys:
                self.waiting_for_elt = cl
                self.saved_id = attrs.get('id')
                # True marks this div as one that owns an index level.
                self.elt_stack.append((name, True))
                return
        elif name == 'title':
            self.collecting_text = name
            self.text = ''
        elif name == self.waiting_for_elt:
            self.waiting_for_elt = None
            self.collecting_text = name
            self.text = ''
        self.elt_stack.append((name, False))

    def EndElementHandler(self, name):
        """Finish a title or heading and close index levels as divs end.

        Raises RuntimeError if any element other than the one being
        collected ends while text is being collected, i.e. when a title or
        heading contains nested markup.
        """
        if self.collecting_text:
            if name == self.collecting_text:
                if name == 'title':
                    self.index.title = self.text
                else:
                    self.index.addLevel(self.saved_id, self.text)
                    self.saved_id = None
                self.collecting_text = False
            else:
                raise RuntimeError(
                    'unexpected </%s> while collecting the text of <%s>; '
                    'nested markup inside a title or heading is not supported'
                    % (name, self.collecting_text))
        eltinfo = self.elt_stack.pop(-1)
        assert eltinfo[0] == name
        if eltinfo[1]:
            # A section div ended: step back out of its index level.
            self.index.upLevel()

    def DefaultHandler(self, data):
        """Accumulate raw document text while inside a title or heading."""
        if self.collecting_text:
            self.text += data
---|
127 | |
---|
128 | |
---|
def attrlist_to_dict(l):
    """Turn a flat ``[name, value, name, value, ...]`` sequence into a dict.

    A name that appears more than once keeps its last value.  An
    odd-length sequence raises IndexError, because its final name has no
    value to pair with.
    """
    return {l[pos]: l[pos + 1] for pos in range(0, len(l), 2)}
---|
134 | |
---|
135 | |
---|
def escape_entities(s):
    """Return ``s`` with XML markup characters replaced by entity references.

    ``&`` is replaced first so that the ampersands introduced by the later
    replacements are not escaped a second time.  The double quote is
    escaped as well, which keeps the result safe to place inside a
    double-quoted attribute value.
    """
    return (s.replace('&', '&amp;')
             .replace('<', '&lt;')
             .replace('>', '&gt;')
             .replace('"', '&quot;'))
---|
138 | |
---|
139 | |
---|
class IndexInsertParse(ExpatParseJob):
    """Second pass: copy the document to ``outfp`` with a regenerated ToC.

    Everything outside ``<ol id="toc">`` is re-serialised from the parse
    events; the ``<ol id="toc">`` subtree itself is dropped and replaced by
    ``index.renderXML()``.  Output is assembled one line at a time so that
    over-long runs of tags can be wrapped (see ``done_line``).
    """

    # Indentation written after a forced line break in done_line().  The
    # column accounting there is derived from this same string so the two
    # cannot drift apart.
    _WRAP_INDENT = '  '

    def __init__(self, index, outfp):
        # Attributes arrive as a flat [name, value, ...] list, which keeps
        # them in document order (see ExpatParseJob.parse).
        self._ordered_attributes = True
        self.index = index
        self.outfp = outfp
        # One (element name, is_toc_root) pair per open element.
        self.elt_stack = []
        self.skipping_toc = False

        # (token, tag_name) pairs for the current output line; tag_name is
        # None for text that did not come from a start or end tag.
        self._line_in_progress = []
        # Name of a start tag whose closing '>' has not been emitted yet.
        self._element_open = None
        self.linepos = 0    # not read anywhere in this class
        self.indentpos = 0  # not read anywhere in this class

        # Dicts used as sets; only membership is tested.
        self.do_not_minimize = {'script': None}
        self.do_not_indent = {'div': None, 'a': None, 'strong': None, 'em': None}
        self.do_not_wrap = {'div': None, 'strong': None, 'em': None, 'li': None}

        if self.index.title == 'Subversion Design':
            self.do_not_wrap['a'] = None

    def parse(self, file):
        """Parse ``file``, then flush whatever is left of the last line.

        Lines are only written when a newline is seen, so without this
        flush the final line of a document that does not end in a newline
        would never reach ``outfp``.  The flushed line is terminated with a
        newline like every other line.
        """
        ExpatParseJob.parse(self, file)
        if self._line_in_progress:
            self.done_line()

    def put_token(self, token, tag_name):
        """Queue ``token`` for the current line; ``tag_name`` is the element
        it belongs to, or None for non-tag text."""
        self._line_in_progress.append((token, tag_name))

    def done_line(self):
        """Write the queued tokens as one logical line and reset the queue.

        A line break is inserted before a tag token when the running column
        exceeds 79 and both this token and the previous one belong to
        wrappable tags.  Tags in ``do_not_wrap`` and non-tag text never
        trigger a break.  The continuation line is indented unless the tag
        is listed in ``do_not_indent``.
        """
        linepos = 0
        last_was_tag = False
        outq = []
        for token, tag_name in self._line_in_progress:
            is_tag = tag_name is not None and tag_name not in self.do_not_wrap
            linepos += len(token)
            if linepos > 79 and is_tag and last_was_tag:
                # Leading blanks would otherwise start the new line.
                token = token.lstrip(' ')
                if tag_name in self.do_not_indent:
                    linepos = len(token)
                    outq.append('\n')
                else:
                    linepos = len(token) + len(self._WRAP_INDENT)
                    outq.append('\n' + self._WRAP_INDENT)
            outq.append(token)
            last_was_tag = is_tag
        outq.append('\n')
        self.outfp.write(''.join(outq))
        del self._line_in_progress[:]

    def _finish_pending(self, minimized_form):
        """Close the start tag left open by StartElementHandler, if any.

        Returns True only when a pending tag was closed in the minimized
        `` />`` form, in which case no separate end tag must be written.
        """
        if self._element_open is not None:
            name = self._element_open
            self._element_open = None
            if minimized_form:
                self.put_token(' />', name)
                return True
            else:
                self.put_token('>', name)
        return False

    def StartElementHandler(self, name, attrs):
        """Emit a start tag, or swap in the new ToC at ``<ol id="toc">``."""
        self._finish_pending(False)
        if name == 'ol' and attrlist_to_dict(attrs).get('id') == 'toc':
            # NOTE(review): this is written straight to outfp, bypassing the
            # token queue, so tokens already queued on the current line are
            # emitted after the ToC rather than before it.
            self.outfp.write(self.index.renderXML())
            self.skipping_toc = True
            # True marks the element whose end tag stops the skipping.
            self.elt_stack.append((name, True))
            return
        if not self.skipping_toc:
            self.put_token("<%s" % name, name)
            for pos in range(0, len(attrs), 2):
                aval = escape_entities(attrs[pos + 1])
                self.put_token(' %s="%s"' % (attrs[pos], aval), name)
            # Leave the tag open: it is closed with '>' or ' />' once it is
            # known whether the element has any content.
            self._element_open = name
        self.elt_stack.append((name, False))

    def EndElementHandler(self, name):
        """Emit an end tag, or leave ToC-skipping mode when the ToC ends."""
        if not self.skipping_toc:
            # An element with no content is minimized to ' />' unless it is
            # listed in do_not_minimize; otherwise write a full end tag.
            if not self._finish_pending(name not in self.do_not_minimize):
                self.put_token("</%s>" % name, name)
        eltinfo = self.elt_stack.pop(-1)
        assert eltinfo[0] == name
        if eltinfo[1]:
            self.skipping_toc = False

    def DefaultHandler(self, data):
        """Copy text that no other handler receives, ending lines at newlines."""
        if self.skipping_toc:
            return
        self._finish_pending(False)
        # Do not rely on expat delivering each '\n' as a separate call:
        # split on embedded newlines so every one of them ends a line.
        for pos, piece in enumerate(data.split('\n')):
            if pos:
                self.done_line()
            if piece:
                self.put_token(piece, None)
---|
233 | |
---|
234 | |
---|
def process(fn):
    """Regenerate the ToC of the file named ``fn`` in place.

    The file is parsed twice: once to build the index, once to write a
    copy with the new ToC to ``fn + '.new'``.  On success the original is
    renamed to ``fn + '.toctool-backup~'`` and the new file takes its
    place.  If either parse raises, both files are closed, the original is
    left untouched and the exception propagates.
    """
    new_fn = fn + '.new'
    # On Python 3 the handlers receive str, so encode the output as UTF-8;
    # on Python 2 they receive UTF-8 byte strings that are written as-is.
    if sys.version_info[0] >= 3:
        out_kwargs = {'encoding': 'utf-8'}
    else:
        out_kwargs = {}

    # Binary mode: expat's ParseFile wants the raw bytes of the document.
    with open(fn, 'rb') as infp:
        builder = IndexBuildParse()
        builder.parse(infp)

        # Rewind so the second pass sees the same document.
        infp.seek(0)
        with open(new_fn, 'w', **out_kwargs) as outfp:
            inserter = IndexInsertParse(builder.index, outfp)
            inserter.parse(infp)

    # Both files are closed here; only now swap the new file into place.
    os.rename(fn, fn + '.toctool-backup~')
    os.rename(new_fn, fn)
---|
249 | |
---|
250 | |
---|
def main():
    """Process every file named on the command line, in order."""
    filenames = sys.argv[1:]
    for path in filenames:
        process(path)
---|
254 | |
---|
255 | |
---|
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
---|