| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | """\ |
|---|
| 4 | This tool regenerates and replaces the ToC in an HTML file from the actual |
|---|
| 5 | structure of <div>s and <h[2345]>s present in the body of the document. |
|---|
| 6 | The section to be overwritten is identified as the XML subtree |
|---|
| 7 | rooted at <ol id="toc">. |
|---|
| 8 | |
|---|
| 9 | Usage: ./toctool.py filename... |
|---|
| 10 | |
|---|
| 11 | This file is a copy of r37798 from the Subversion repository. |
|---|
| 12 | |
|---|
| 13 | """ |
|---|
| 14 | |
|---|
| 15 | import sys |
|---|
| 16 | import os |
|---|
| 17 | import xml.parsers.expat |
|---|
| 18 | |
|---|
| 19 | |
|---|
class Index:
    """Nested table-of-contents tree built up while walking a document.

    `tree` is a list of nodes.  Each node is a list whose first item is an
    (id, title) tuple and whose remaining items are child nodes.
    `_ptr_stack` tracks the chain of lists currently open for appending.
    """

    def __init__(self):
        self.title = None
        self.tree = []
        # The innermost open list is always the last entry.
        self._ptr_stack = [self.tree]

    def addLevel(self, id, title):
        """Open a new node under the current one and descend into it."""
        entry = [(id, title)]
        parent = self._ptr_stack[-1]
        parent.append(entry)
        self._ptr_stack.append(entry)

    def upLevel(self):
        """Close the current node, returning to its parent."""
        self._ptr_stack.pop()

    def prettyString(self):
        """Return the tree as indented text, one (id, title) tuple per line."""
        lines = []

        def walk(node, depth):
            if not isinstance(node, list):
                # A leaf is an (id, title) tuple; format it as-is.
                lines.append("%s%s" % (" " * depth, node))
                return
            for child in node:
                walk(child, depth + 1)

        # Starting at -2 puts the top-level tuples at depth 0.
        walk(self.tree, -2)
        return "\n".join(lines)

    def renderXML(self):
        """Return the tree rendered as an <ol id="toc"> XML fragment."""
        lines = ['<ol id="toc">']

        def emit(node, depth):
            pad = ' ' * depth
            head = node[0]
            children = node[1:]
            opening = '%s<li><a href="#%s">%s</a>' % (pad, head[0], head[1])
            if not children:
                # Leaf entry: open and close the <li> on a single line.
                lines.append(opening + '</li>')
                return
            lines.append(opening)
            lines.append('%s<ol>' % pad)
            for child in children:
                emit(child, depth + 1)
            lines.append('%s</ol>' % pad)
            # The trailing comment names the section being closed.
            lines.append('%s</li> <!-- %s -->' % (pad, head[0]))

        for top in self.tree:
            emit(top, 1)
        lines.append('</ol>')
        return "\n".join(lines)
|---|
| 64 | |
|---|
| 65 | |
|---|
class ExpatParseJob:
    """Base class that runs an expat parse using the subclass's handlers.

    Every attribute of the instance whose name ends in ``Handler`` is
    installed on the parser under the same name.  Subclasses must set
    ``self._ordered_attributes`` before calling parse().
    """

    def parse(self, file):
        """Parse `file` with a fresh expat parser.

        file -- an open file object, passed straight to expat's ParseFile().
        """
        p = xml.parsers.expat.ParserCreate()
        p.ordered_attributes = self._ordered_attributes
        try:
            # Python 2 only: ask for byte strings instead of unicode objects.
            p.returns_unicode = False
        except AttributeError:
            # Python 3's pyexpat does not document this attribute; tolerate
            # its absence so the class stays usable there.
            pass
        p.specified_attributes = True
        for name in dir(self):
            if name.endswith('Handler'):
                setattr(p, name, getattr(self, name))
        p.ParseFile(file)
|---|
| 76 | |
|---|
| 77 | |
|---|
class IndexBuildParse(ExpatParseJob):
    """First parse pass: build an Index from the document's headings.

    A <div> whose class attribute is one of `keys` opens a section.  The next
    element whose tag name equals that class supplies the section title, and
    the div's id attribute becomes the link target.  The text of the <title>
    element is stored as the index title.
    """

    # Heading classes that open a section; used as a set (values unused).
    keys = {'h2': None, 'h3': None, 'h4': None, 'h5': None}

    def __init__(self):
        self.index = Index()
        self.keyptr = 0
        # False, or the tag name whose text is currently being collected.
        self.collecting_text = False
        self.text = ''
        # Tag name of the heading expected inside the most recent section div.
        self.waiting_for_elt = None
        self.saved_id = None
        # Stack of (tag name, opens_section) pairs for the open elements.
        self.elt_stack = []
        self._ordered_attributes = False

    def StartElementHandler(self, name, attrs):
        """Track open elements and notice section divs, headings and <title>."""
        if name == 'div':
            cl = attrs.get('class')
            if cl in self.keys:
                self.waiting_for_elt = cl
                self.saved_id = attrs.get('id')
                self.elt_stack.append((name, True))
                return
        elif name == 'title':
            self.collecting_text = name
            self.text = ''
        elif name == self.waiting_for_elt:
            self.waiting_for_elt = None
            self.collecting_text = name
            self.text = ''
        self.elt_stack.append((name, False))

    def EndElementHandler(self, name):
        """Finish a collected title or heading and close section levels.

        Raises RuntimeError if some other element ends while title or heading
        text is still being collected.
        """
        if self.collecting_text:
            if name != self.collecting_text:
                raise RuntimeError(
                    'unexpected </%s> while collecting the text of <%s>'
                    % (name, self.collecting_text))
            if name == 'title':
                self.index.title = self.text
            else:
                self.index.addLevel(self.saved_id, self.text)
                self.saved_id = None
            self.collecting_text = False
        eltinfo = self.elt_stack.pop(-1)
        assert eltinfo[0] == name
        if eltinfo[1]:
            # Leaving a section div: step back out of its index level.
            self.index.upLevel()

    def DefaultHandler(self, data):
        """Accumulate data passed by expat while text is being collected."""
        if self.collecting_text:
            self.text += data
|---|
| 127 | |
|---|
| 128 | |
|---|
def attrlist_to_dict(l):
    """Turn a flat [name, value, name, value, ...] list into a dict.

    Later duplicates of a name override earlier ones.  An odd-length list
    raises IndexError.
    """
    return dict((l[pos], l[pos + 1]) for pos in range(0, len(l), 2))
|---|
| 134 | |
|---|
| 135 | |
|---|
def escape_entities(s):
    """Return `s` with '&', '<' and '>' replaced by their XML entities.

    The ampersand is replaced first so that the '&' introduced by the other
    two replacements is not escaped a second time.
    """
    return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
|---|
| 138 | |
|---|
| 139 | |
|---|
class IndexInsertParse(ExpatParseJob):
    """Second parse pass: copy the document to `outfp`, replacing the ToC.

    Everything outside the <ol id="toc"> subtree is re-serialised token by
    token, one input line at a time; the subtree itself is dropped and the
    freshly rendered index is written in its place.
    """

    def __init__(self, index, outfp):
        """index -- the Index to render; outfp -- object with a write() method."""
        self._ordered_attributes = True
        self.index = index
        self.outfp = outfp
        # Stack of (tag name, is_toc_root) pairs for the open elements.
        self.elt_stack = []
        self.skipping_toc = False

        # (token, tag_name) pairs for the current line; tag_name is None for
        # data that is not part of a tag.
        self._line_in_progress = []
        # Name of a start tag whose closing '>' or ' />' is not yet emitted.
        self._element_open = None
        self.linepos = 0
        self.indentpos = 0

        # These dicts are used as sets; the values are unused.
        self.do_not_minimize = {'script': None}
        self.do_not_indent = {'div': None, 'a': None, 'strong': None,
                              'em': None}
        self.do_not_wrap = {'div': None, 'strong': None, 'em': None,
                            'li': None}

        if self.index.title == 'Subversion Design':
            self.do_not_wrap['a'] = None

    def put_token(self, token, tag_name):
        """Queue a piece of output text for the current line."""
        self._line_in_progress.append((token, tag_name))

    def done_line(self):
        """Write the queued tokens followed by a newline, wrapping long lines.

        A line break is inserted before a token when the running width passes
        79 columns and both that token and the previous one belong to tags
        that are not listed in do_not_wrap.
        """
        linepos = 0
        last_was_tag = False
        outq = []
        for token, tag_name in self._line_in_progress:
            is_tag = tag_name is not None and tag_name not in self.do_not_wrap
            no_indent_if_wrap = tag_name in self.do_not_indent
            linepos += len(token)
            if linepos > 79 and is_tag and last_was_tag:
                token = token.lstrip(' ')
                if no_indent_if_wrap:
                    linepos = len(token)
                    outq.append('\n')
                else:
                    # NOTE(review): the "+ 2" suggests the continuation indent
                    # was meant to be two spaces -- confirm against upstream.
                    linepos = len(token) + 2
                    outq.append('\n ')
            outq.append(token)
            last_was_tag = is_tag
        outq.append('\n')
        for piece in outq:
            self.outfp.write(piece)
        # Clear in place so the same list object keeps being reused.
        del self._line_in_progress[:]

    def _finish_pending(self, minimized_form):
        """Close a start tag that is still open, if there is one.

        Returns True only when the tag was closed in minimized ' />' form, in
        which case the caller must not emit a separate end tag.
        """
        if self._element_open is not None:
            name = self._element_open
            self._element_open = None
            if minimized_form:
                self.put_token(' />', name)
                return True
            self.put_token('>', name)
        return False

    def StartElementHandler(self, name, attrs):
        """Queue a start tag, or swap in the new ToC at <ol id="toc">."""
        self._finish_pending(False)
        if name == 'ol' and attrlist_to_dict(attrs).get('id') == 'toc':
            self.outfp.write(self.index.renderXML())
            self.skipping_toc = True
            self.elt_stack.append((name, True))
            return
        if not self.skipping_toc:
            self.put_token("<%s" % name, name)
            # attrs is a flat [name, value, name, value, ...] list.
            for pos in range(0, len(attrs), 2):
                aname = attrs[pos]
                aval = escape_entities(attrs[pos + 1])
                self.put_token(' %s="%s"' % (aname, aval), name)
            # Leave the tag open so an immediate end tag can minimize it.
            self._element_open = name
        self.elt_stack.append((name, False))

    def EndElementHandler(self, name):
        """Queue an end tag, and stop skipping once the old ToC has closed."""
        if not self.skipping_toc:
            if not self._finish_pending(name not in self.do_not_minimize):
                self.put_token("</%s>" % name, name)
        eltinfo = self.elt_stack.pop(-1)
        assert eltinfo[0] == name
        if eltinfo[1]:
            self.skipping_toc = False

    def DefaultHandler(self, data):
        """Queue data from expat, ending the output line at each newline."""
        if self.skipping_toc:
            return
        self._finish_pending(False)
        # Do not rely on expat delivering each '\n' as a call of its own:
        # split the data so that every newline ends the current line.
        pieces = data.split('\n')
        for pos, piece in enumerate(pieces):
            if pos:
                self.done_line()
            if piece:
                self.put_token(piece, None)
|---|
| 233 | |
|---|
| 234 | |
|---|
def process(fn):
    """Regenerate the ToC of the file named `fn`, in place.

    The file is parsed twice: once to build the index, and once to write a
    copy with the new ToC to `fn + '.new'`.  On success the original is
    renamed to `fn + '.toctool-backup~'` and the new file takes its place.
    If parsing raises, both files are closed and no rename takes place.
    """
    new_fn = fn + '.new'
    infp = open(fn, 'r')
    try:
        builder = IndexBuildParse()
        builder.parse(infp)

        # Rewind so the second pass reads the same input from the start.
        infp.seek(0)
        outfp = open(new_fn, 'w')
        try:
            inserter = IndexInsertParse(builder.index, outfp)
            inserter.parse(infp)
        finally:
            outfp.close()
    finally:
        infp.close()

    os.rename(fn, fn + '.toctool-backup~')
    os.rename(new_fn, fn)
|---|
| 249 | |
|---|
| 250 | |
|---|
def main():
    """Run process() on every filename given on the command line."""
    filenames = sys.argv[1:]
    for filename in filenames:
        process(filename)


if __name__ == '__main__':
    main()
|---|