source: public/content/toctool.py

Last change on this file was 12008, checked in by ehuelsmann, 15 years ago

Send in the new site.

File size: 6.8 KB
Line 
1#!/usr/bin/env python
2
3"""\
4This tool regenerates and replaces the ToC in an HTML file from the actual
5structure of <div>s and <h[2345]>s present in the body of the document.
6The section to be overwritten is identified as the XML subtree
7rooted at <ol id="toc">.
8
9Usage: ./toctool.py filename...
10
11This file is a copy of r37798 from the Subversion repository.
12
13"""
14
15import sys
16import os
17import xml.parsers.expat
18
19
class Index:
  """In-memory table of contents built from nested heading sections.

  `tree` is a list of nodes.  Each node is a list whose first item is an
  (id, title) tuple and whose remaining items are child nodes of the same
  shape.  `_ptr_stack` tracks the chain of currently open nodes, starting
  with `tree` itself.
  """

  def __init__(self):
    self.title = None
    self.tree = []
    self._ptr_stack = [self.tree]

  def addLevel(self, id, title):
    """Open a new section under the innermost open one and descend into it."""
    section = [(id, title)]
    parent = self._ptr_stack[-1]
    parent.append(section)
    self._ptr_stack.append(section)

  def upLevel(self):
    """Close the innermost open section."""
    self._ptr_stack.pop()

  def prettyString(self):
    """Return the (id, title) tuples, one per line, indented by depth."""
    def render(depth, item):
      # Leaves are the (id, title) tuples; lists are containers to descend.
      if not isinstance(item, list):
        return ["%s%s" % ("  " * depth, item)]
      lines = []
      for child in item:
        lines.extend(render(depth + 1, child))
      return lines
    # Starting at -2 puts the top-level tuples at depth 0: one level is
    # consumed by the tree list and one by each node list.
    return "\n".join(render(-2, self.tree))

  def renderXML(self):
    """Return the ToC as a nested <ol id="toc"> markup string."""
    lines = ['<ol id="toc">']

    def emit(depth, entry):
      pad = '  ' * depth
      children = entry[1:]
      if not children:
        # Leaf section: the whole list item fits on one line.
        lines.append('%s<li><a href="#%s">%s</a></li>'
                     % (pad, entry[0][0], entry[0][1]))
        return
      lines.append('%s<li><a href="#%s">%s</a>'
                   % (pad, entry[0][0], entry[0][1]))
      lines.append('%s<ol>' % pad)
      for child in children:
        emit(depth + 1, child)
      lines.append('%s</ol>' % pad)
      # The trailing comment names the section this </li> closes.
      lines.append('%s</li> <!-- %s -->' % (pad, entry[0][0]))

    for entry in self.tree:
      emit(1, entry)
    lines.append('</ol>')
    return "\n".join(lines)
64
65
class ExpatParseJob:
  """Base class that drives an expat parser with handlers defined on self.

  Subclasses must set `self._ordered_attributes` and define methods whose
  names end in 'Handler' (e.g. StartElementHandler); each such attribute is
  installed on the parser under the same name.
  """

  def parse(self, file):
    """Parse `file` (an object with a read() method) through expat."""
    p = xml.parsers.expat.ParserCreate()
    p.ordered_attributes = self._ordered_attributes
    # returns_unicode only exists on Python 2 parsers; it was removed in
    # Python 3.3, where assigning it raises AttributeError.
    if hasattr(p, 'returns_unicode'):
      p.returns_unicode = False
    p.specified_attributes = True
    for name in dir(self):
      if name.endswith('Handler'):
        setattr(p, name, getattr(self, name))
    p.ParseFile(file)
76
77
class IndexBuildParse(ExpatParseJob):
  """Parse pass that builds an Index from <div class="hN"> sections.

  A <div> whose class is one of `keys` opens a section; the text of the
  first following element named like that class (e.g. <h2>) becomes the
  section title and the div's id becomes its anchor.  The text of <title>
  is stored as the index title.
  """
  keys = {'h2':None, 'h3':None, 'h4':None, 'h5':None}

  def __init__(self):
    self.index = Index()
    self.keyptr = 0
    # False, or the name of the element whose text is being accumulated.
    self.collecting_text = False
    self.text = ''
    # Heading element name expected for the most recently opened section.
    self.waiting_for_elt = None
    self.saved_id = None
    # Stack of (element name, opened_a_section) for every open element.
    self.elt_stack = []
    self._ordered_attributes = False

  def StartElementHandler(self, name, attrs):
    if name == 'div':
      cl = attrs.get('class')
      if cl in self.keys:
        # Section div: remember its id and wait for the matching heading.
        self.waiting_for_elt = cl
        self.saved_id = attrs.get('id')
        self.elt_stack.append((name, True))
        return
    elif name == 'title':
      self.collecting_text = name
      self.text = ''
    elif name == self.waiting_for_elt:
      self.waiting_for_elt = None
      self.collecting_text = name
      self.text = ''
    self.elt_stack.append((name, False))

  def EndElementHandler(self, name):
    if self.collecting_text:
      if name != self.collecting_text:
        # Some other element closed while heading/title text was being
        # collected; nested markup there is not handled by this tool.
        raise RuntimeError(
            "unexpected </%s> while collecting the text of <%s>: nested "
            "markup inside titles and headings is not supported"
            % (name, self.collecting_text))
      if name == 'title':
        self.index.title = self.text
      else:
        self.index.addLevel(self.saved_id, self.text)
        self.saved_id = None
      self.collecting_text = False
    eltinfo = self.elt_stack.pop(-1)
    assert eltinfo[0] == name
    if eltinfo[1]:
      # NOTE(review): this assumes the section's heading was seen (so that
      # addLevel ran) before the section div closed -- confirm for inputs
      # with heading-less section divs.
      self.index.upLevel()

  def DefaultHandler(self, data) :
    # No CharacterDataHandler is defined here, so text reaches this handler.
    if self.collecting_text:
      self.text += data
127
128
def attrlist_to_dict(l):
  """Turn a flat [name, value, name, value, ...] list into a dict.

  Later duplicates of a name win.  An odd-length list raises IndexError.
  """
  return {l[pos]: l[pos + 1] for pos in range(0, len(l), 2)}
134
135
def escape_entities(s):
  """Escape `s` for use in XML text or a double-quoted attribute value.

  '&' is replaced first so the entities produced by the later replacements
  are not escaped a second time.  '"' is escaped as well because the result
  is written between double quotes when attributes are re-serialized; an
  unescaped quote there would end the attribute value early.
  """
  return (s.replace('&', '&amp;')
           .replace('<', '&lt;')
           .replace('>', '&gt;')
           .replace('"', '&quot;'))
138
139
class IndexInsertParse(ExpatParseJob):
  """Parse pass that re-serializes the document with a regenerated ToC.

  Everything outside the <ol id="toc"> subtree is written back out token
  by token (with long runs of tags wrapped); the subtree itself is dropped
  and replaced by index.renderXML().
  """

  def __init__(self, index, outfp):
    self._ordered_attributes = True
    self.index = index
    self.outfp = outfp
    # Stack of (element name, is_toc_root) for every open element.
    self.elt_stack = []
    self.skipping_toc = False

    # Tokens of the output line being assembled, as (text, tag name) pairs;
    # the tag name is None for non-markup data.
    self._line_in_progress = []
    # Name of a start tag whose closing '>' or ' />' is still unwritten.
    self._element_open = None
    self.linepos = 0
    self.indentpos = 0

    self.do_not_minimize = dict.fromkeys(['script'])
    self.do_not_indent = dict.fromkeys(['div', 'a', 'strong', 'em'])
    self.do_not_wrap = dict.fromkeys(['div', 'strong', 'em', 'li'])

    if self.index.title == 'Subversion Design':
      self.do_not_wrap['a'] = None

  def put_token(self, token, tag_name):
    """Queue one piece of output for the current line."""
    self._line_in_progress.append((token, tag_name))

  def done_line(self):
    """Write out the queued line, wrapping between tags past column 79."""
    column = 0
    prev_wrappable = False
    pieces = []
    for text, tag in self._line_in_progress:
      wrappable = not (tag is None or tag in self.do_not_wrap)
      column += len(text)
      # Only break between two consecutive wrappable tag tokens.
      if prev_wrappable and wrappable and column > 79:
        text = text.lstrip(' ')
        if tag in self.do_not_indent:
          lead, column = '\n', len(text)
        else:
          lead, column = '\n  ', len(text) + 2
        pieces.append(lead)
      pieces.append(text)
      prev_wrappable = wrappable
    pieces.append('\n')
    for piece in pieces:
      self.outfp.write(piece)
    del self._line_in_progress[:]

  def _finish_pending(self, minimized_form):
    """Terminate a pending start tag, if any.

    Returns True only when the tag was closed in minimized (' />') form.
    """
    name = self._element_open
    if name is None:
      return False
    self._element_open = None
    if not minimized_form:
      self.put_token('>', name)
      return False
    self.put_token(' />', name)
    return True

  def StartElementHandler(self, name, attrs):
    self._finish_pending(False)
    is_toc_root = (name == 'ol'
                   and attrlist_to_dict(attrs).get('id') == 'toc')
    if is_toc_root:
      # Emit the regenerated ToC and suppress the old subtree.
      self.outfp.write(self.index.renderXML())
      self.skipping_toc = True
      self.elt_stack.append((name, True))
      return
    if not self.skipping_toc:
      self.put_token("<%s" % name, name)
      # attrs is a flat [name, value, ...] list; consume it pairwise.
      while attrs:
        attr_name, attr_value = attrs.pop(0), attrs.pop(0)
        self.put_token(' %s="%s"' % (attr_name, escape_entities(attr_value)),
                       name)
      self._element_open = name
    self.elt_stack.append((name, False))

  def EndElementHandler(self, name):
    if not self.skipping_toc:
      may_minimize = name not in self.do_not_minimize
      closed_minimized = self._finish_pending(may_minimize)
      if not closed_minimized:
        self.put_token("</%s>" % name, name)
    open_name, was_toc_root = self.elt_stack.pop()
    assert open_name == name
    if was_toc_root:
      self.skipping_toc = False

  def DefaultHandler(self, data):
    if self.skipping_toc:
      return
    self._finish_pending(False)
    # This relies on expat handing each '\n' to this function as its own
    # chunk, which is not guaranteed.  It works at the moment but is an
    # unsafe assumption.
    if data != '\n':
      self.put_token(data, None)
      return
    self.done_line()
233
234
def process(fn):
  """Regenerate the ToC of the file named `fn`, replacing it in place.

  The file is parsed twice: once to build the index, once to write a copy
  with the new ToC to `fn + '.new'`.  The original is then renamed to
  `fn + '.toctool-backup~'` and the new file takes its place.  If parsing
  raises, `fn` is left untouched.
  """
  # Binary mode: expat's ParseFile needs read() to return bytes on
  # Python 3.  Both files are closed even if a parse raises.
  with open(fn, 'rb') as infp:
    builder = IndexBuildParse()
    builder.parse(infp)

    infp.seek(0)
    with open(fn + '.new', 'w') as outfp:
      inserter = IndexInsertParse(builder.index, outfp)
      inserter.parse(infp)

  # Rename only after both handles are closed.
  os.rename(fn, fn + '.toctool-backup~')
  os.rename(fn + '.new', fn)
249
250
def main():
  """Run process() on every filename given on the command line."""
  filenames = sys.argv[1:]
  for filename in filenames:
    process(filename)


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
  main()
Note: See TracBrowser for help on using the repository browser.