Module q3html
[frames] | no frames]

Source Code for Module q3html

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3   
  4   
  5  """This program takes files named *.q in the current directory, and converts them to HTML. 
  6  To do that, it looks for a python script called 'headfoot.py' or '../headfoot.py' or ... . 
  7  This script needs to define one variable ( server ) and two functions (header and footer). 
  8   
  9  The .q files have the following format:: 
 10   
 11          TITLE title string 
 12          OTHER_HEADER_KEYWORD header info 
 13           
 14          P 
 15                  Text is 3>5 ? 
 16                  A ref="foo.html" No! 
 17                          More text inside the link. 
 18                  IMG ref="foo.gif" 
 19                  UL 
 20                          LI List item 
 21                          LI Another List item 
 22   
 23   
 24  The header info is separated from the body with a blank line. 
 25  HTML tags are always the first thing on a line. 
 26  Line continuations and enclosures are indicated with indentation. 
 27  Tags automatically close when the indentation gets smaller. 
 28   
 29  The PRE tag is the only exception. 
 30  You have to close it with a line beginning '/PRE'. 
 31  No indenting is needed inside a PRE /PRE pair. 
 32   
 33  The user needs to define one thing: 
 34  the address of the web server. 
 35  This address is prepended to 
 36  all hyperlinks and image references that aren't absolute. 
 37   
 38  All the header information gets put into a dictionary, and is 
 39  passed into the header() and footer() functions. 
 40  That dictionary has 'filename' and '_server_root' entries added. 
 41   
 42  The user can also define three things: 
 43   
 44  DEFAULT_HEADER, a dictionary passed to the header() and footer() functions. 
 45  Useful things to put in DEFAULT_HEADER are: 
 46          - 'lang': 'en'  # appears in the <html lang="en"> tag 
 47          - 'stylesheet': url becomes a link to a stylesheet: <link rel="stylesheet" type="text/css" href="url"> 
 48          - Anything in the form M.x:y becomes a <meta name="x" content="y"> 
 49          - M.description and M.keywords are useful. 
 50          - Anything in the form HTTP.x:y becomes a <meta http-equiv="x" content="y"> 
 51   
 52  Header(thd, hinfo) takes two arguments, a file descriptor 
 53  to which it should write the HTML, and a dictionary of header 
 54  information. 
 55   
 56  Footer(thd, hinfo) takes the same arguments. 
 57   
 58  See _header() and _footer() below, for examples. 
 59  """ 
 60   
 61  import os 
 62  import re 
 63  import time 
 64  import urlparse 
 65  import htmlentitydefs as ED 
 66  from gmisclib import die 
 67  from gmisclib import g_pipe 
 68   
# When true, process_tag() appends '/' to tags marked POINTTAG ('<BR />').
XML = 0

# Value in _anytag marking a tag for which no closing tag is written.
POINTTAG = 1

# Every tag name recognised at the start of a line.
# A value equal to POINTTAG means process_tag() puts nothing on the stack of
# open tags; note that 'INPUT' is written as a literal 1, which compares equal
# to POINTTAG.  'PRE' carries the string 'SPECIAL'; process() handles lines
# beginning with 'pre' itself, before any tag lookup.
_anytag = {'P':0, 'STYLE': 0,
                'UL':0, 'OL':0, 'DL':0,
                'LI':0, 'DT': 0, 'DD': 0,
                'PRE':'SPECIAL',
                'TABLE':0, 'TR':0, 'TD':0,
                'CAPTION':0,
                'TBODY':0, 'THEAD':0, 'TFOOT':0,
                'FORM':0, 'INPUT':1, 'SELECT':0, 'OPTION':0, 'TEXTAREA':0,
                'A':0, 'NOP': 0,
                'H1':0, 'H2':0, 'H3':0, 'H4':0,
                'EM':0, 'STRONG':0, 'CITE':0, 'DFN':0,
                'CODE':0, 'SAMP':0, 'KBD':0, 'VAR':0, 'ABBR':0, 'ACRONYM':0,
                'OBJECT': 0, 'DIV': 0, 'SPAN': 0,
                'FONT': 0,
                'IMG': POINTTAG, 'BR': POINTTAG, 'HR': POINTTAG
                }

# Case-insensitive match for a handful of keywords at the start of a line;
# _checkfile() uses it to decide whether a file looks like input.
_ck = re.compile(r'\b(H1|PRE|P|TITLE|IMG)\b', re.IGNORECASE)
 90   
def _checkfile(f):
    """Look at the beginning of a file to see if it is plausibly input for this program.

    Only the first ten lines of the file named ``f`` are examined.

    Returns 1 as soon as one of those lines starts with a keyword matched by
    ``_ck``, and 0 otherwise.
    """
    fd = open(f, "r")
    try:
        # Read line by line so that no more than ten lines are ever loaded.
        for _unused in range(10):
            l = fd.readline()
            if not l:
                break    # end of file
            if _ck.match(l):
                return 1
        return 0
    finally:
        # Always release the file handle, whichever way we leave.
        fd.close()
98 99 100
def qfiles(dir):
    """Return the names of the ``*.q`` files in ``dir`` that look like input.

    A name qualifies when, after stripping surrounding whitespace, it ends in
    '.q' and _checkfile() accepts the file.  The returned names are bare
    file names (not joined with ``dir``), in os.listdir() order.
    """
    minname = re.compile('^.*[.]q$')
    o = []
    for t in os.listdir(dir):
        ts = t.strip()
        if not minname.match(ts):
            continue
        # Probe the file inside ``dir``: the bare name only resolves when
        # ``dir`` happens to be the current working directory.
        if not _checkfile(os.path.join(dir, ts)):
            continue
        o.append(ts)
    return o
114 115 116 117
def av(k, v):
    """Format one HTML attribute as ``k="v"``.

    ``v`` is wrapped in double quotes unless it already both starts and ends
    with a double quote, in which case it is used as it is.
    """
    if v.startswith('"') and v.endswith('"'):
        return "%s=%s" % (k, v)
    return '%s="%s"' % (k, v)
# NOTE(review): LL, DEFAULT_TAG and DEFAULT_INDENT are not referenced by any
# code visible in this file -- confirm before removing.
LL = 60
# Distance between tab stops, used by measure_indent().
TABW = 8

DEFAULT_TAG = 'P'
DEFAULT_INDENT = -1


def measure_indent(s):
    """Return the indentation of ``s`` in columns.

    A space counts as one column; a tab advances to the next multiple of
    TABW.  A line with no leading whitespace (including the empty string)
    has indent 0.

    For a non-empty line made only of whitespace, or when a CR/LF is found
    inside the leading whitespace, the tuple ``(None, '')`` is returned
    instead of an integer.  That return value is kept as it is because
    callers compare the result directly.

    Raises ValueError for any other kind of leading whitespace character.
    """
    if s.isspace():
        return (None, '')

    indent = 0
    for ch in s:
        if not ch.isspace():
            break
        if ch == ' ':
            indent += 1
        elif ch == '\t':
            # Floor division: the result must stay an integer column even
            # where '/' means true division.
            indent = TABW * ((indent + TABW) // TABW)
        elif ch == '\r' or ch == '\n':
            return (None, '')
        else:
            raise ValueError("Unrecognized whitespace: %d" % ord(ch))
    return indent
147 148
def prepare_text(l):
    """Return the text of a non-tag line.

    Surrounding whitespace is stripped.  If the result starts with '%', that
    one character and any whitespace following it are removed as well.
    """
    stripped = l.strip()
    if not stripped.startswith('%'):
        return stripped
    return stripped[1:].lstrip()
154 155
def starts_with_a_tag(l):
    """Tell whether the line ``l`` should be treated as a tag line.

    The first whitespace-delimited word must be a key of ``_anytag`` (the
    comparison is case sensitive).  If anything follows that word, the
    remainder must also contain an '=' somewhere.

    NOTE(review): because of the '=' requirement a line such as
    'LI List item' is not treated as a tag -- confirm this is intended.
    """
    a = l.split(None, 1)
    # 'in' replaces dict.has_key(), which no longer exists in Python 3.
    if not a or a[0] not in _anytag:
        return False
    if len(a) == 1:
        return True
    return '=' in a[1]
164 165 166 _eqtoken = re.compile("""\s*([a-zA-Z0-9_:]+)\s*=\s*(("[^"]*")|(\S+))\s*""")
def tokenize(l):
    """Split a tag line into ``(tag, args, txt)``.

    ``tag`` is the first word of ``l``.  ``args`` maps attribute names to
    their values exactly as written (double quotes included), collected from
    the ``name=value`` pairs that directly follow the tag.  ``txt`` is
    whatever remains after the last such pair, or '' if the line holds only
    the tag.

    The result is also reported through die.info().
    """
    pieces = l.split(None, 1)
    tag = pieces[0]
    args = {}
    txt = ''
    if len(pieces) > 1:
        txt = pieces[1]
        # Peel name=value pairs off the front until none is left.
        m = _eqtoken.match(txt)
        while m:
            args[m.group(1)] = m.group(2)
            txt = txt[m.end():]
            m = _eqtoken.match(txt)
    die.info("TOKENIZED:%s %s %s" % (tag, args, txt))
    return (tag, args, txt)
185 186
class lineC:
    """One parsed input line.

    Attributes set by the constructor:
        indent -- result of measure_indent() on the raw line
        tag    -- tag name, or None for a plain text line
        args   -- dictionary of tag attributes, or None for a text line
        txt    -- text following the tag, or the prepared text of a text line
    """

    def __init__(self, l):
        # The indentation has to be measured before the line is stripped.
        self.indent = measure_indent(l)
        l = l.strip()
        if starts_with_a_tag(l):
            self.tag, self.args, self.txt = tokenize(l)
        else:
            self.tag = None
            self.args = None
            self.txt = prepare_text(l)

    def getarg(self, k, defv):
        """Return attribute ``k`` of this line, or ``defv`` if it has none.

        Text lines carry no attributes (``args`` is None); they yield
        ``defv`` instead of raising AttributeError.
        """
        if self.args is None:
            return defv
        return self.args.get(k, defv)
200 201 202 203
def format_dict(d, drop=None):
    """Return the items of ``d`` as a list of ``key="value"`` strings.

    Each pair is formatted with av().  If ``drop`` is given, keys starting
    with that prefix are left out.  The order is the iteration order of
    ``d``.
    """
    if drop is None:
        pairs = d.items()
    else:
        pairs = [(k, v) for (k, v) in d.items() if not k.startswith(drop)]
    return [av(k, v) for (k, v) in pairs]
214 215
def dequote(a):
    """Remove one pair of enclosing double quotes from ``a``, if present.

    Strings shorter than two characters, or not quoted at both ends, are
    returned unchanged.
    """
    quoted = len(a) > 1 and a[0] == '"' and a[-1] == '"'
    if not quoted:
        return a
    return a[1:-1]
220
def urlq(s):
    """Return ``s`` unchanged.

    prepend() passes both halves of every URL through this function.
    NOTE(review): presumably a placeholder for URL quoting -- confirm.
    """
    return s
223 224
def prepend(a, b):
    """Process a URL.  Pathnames are prepended with the server root.
    Complete URLs are untouched.

    a: header dictionary; '_abs_root' is required and '_cwd' is used when
       present.
    b: the reference as written in the source, with or without enclosing
       double quotes.

    Returns the joined URL enclosed in double quotes.
    """
    base = a['_abs_root'] + '/'
    # go() only sets '_cwd' when its environment has a 'cwd' entry, so do
    # not fail with KeyError when it is missing: join against the root.
    cwd = a.get('_cwd')
    if cwd is not None:
        base = base + cwd + '/'
    return '"%s"' % urlparse.urljoin(urlq(base), urlq(dequote(b)))
232 233 234
def process_tag(d, od, hinfo, eol):
    """Write the opening tag for the parsed line ``d`` to ``od``.

    For IMG, OBJECT and (when it has a 'ref' argument) A, the 'ref' argument
    is replaced by 'src', 'data' or 'href' respectively, holding the URL
    produced by prepend().  Arguments whose name starts with '_' are not
    written.  The text of the line follows the tag without being escaped,
    and ``eol`` ends the output.

    Returns ``(references, tag_to_close)``: the list of URLs referenced by
    this tag, and the tag name that must be closed later, or None for a tag
    marked POINTTAG.
    """
    # Name of the attribute that takes over the value of 'ref', if any.
    if d.tag == 'IMG':
        target = 'src'
    elif d.tag == 'OBJECT':
        target = 'data'
    elif d.tag == 'A' and 'ref' in d.args:
        target = 'href'
    else:
        target = None

    references = []
    if target is not None:
        d.args[target] = prepend(hinfo, d.args['ref'])
        references = [d.args[target]]
        del d.args['ref']

    words = [d.tag] + format_dict(d.args, '_')
    if XML and _anytag[d.tag] == POINTTAG:
        words.append('/')
    od.write('<%s>%s%s' % (' '.join(words), d.txt, eol))

    if _anytag[d.tag] == POINTTAG:
        return (references, None)
    return (references, d.tag)
264 265 _escape = dict([(unichr(q), nm) for (q, nm) in ED.codepoint2name.items()]) 266 _escape.update( {'~': '&nbsp;', '>':'&gt;', '<':'&lt;', '&':'&amp;', 267 '"': '&quot;' 268 } ) 269 #_epat = '&[0-9a-zA-Z]{,5};|' + ( '|'.join(_escape.keys()) ) 270 _epat = '[%s]' % (''.join(_escape.keys())) 271 272 # print "EPAT=", _epat 273 _ere = re.compile(_epat)
def _esc1(x):
    """Converts a character in a MatchObject to a &xx; (HTML) escape sequence.

    A single matched character is looked up in ``_escape``; a longer match
    is returned unchanged.
    """
    matched = x.group(0)
    if len(matched) != 1:
        return matched
    return _escape[matched]
280 281 282
def escape(s):
    """Return ``s`` with every match of ``_ere`` replaced by what ``_esc1`` returns for it."""
    return _ere.sub(_esc1, s)
285 286
def get_logical_line(lines):
    """Remove and return the next line of ``lines`` that is not a comment.

    Lines are taken from the front of the list, which is modified in place.
    A line is a comment if it starts with '#' in the first column.  Returns
    None when the list runs out.
    """
    while lines:
        candidate = lines.pop(0)
        if candidate.startswith('#'):
            continue
        return candidate
    return None
295 296 297
def process(lines, od, hinfo):
    """Convert the body ``lines`` to HTML and write it to ``od``.

    lines: list of body lines; it is consumed (emptied) as it is processed.
    od: object with a write() method that receives the HTML.
    hinfo: header dictionary, handed on to process_tag().

    Returns the list of URLs referenced by the tags that were processed.
    Raises ValueError, with a running line count, if a line cannot be parsed.
    """
    # Open tags with the indentation they were opened at: [ (tag, indent), ... ]
    # The bottom entry is closed by the final loop as well, so '</BODY>' is
    # written here.
    stack = [('BODY', -1000)]

    references = []
    eol = '\n'
    n = 0    # counts logical lines only: comments and PRE content are not included
    while 1:
        n += 1
        l = get_logical_line(lines)
        if l is None:
            break

        # NOTE(review): any line beginning with 'pre' in the first column,
        # in any letter case, starts a PRE block -- not only the word PRE.
        if l.lower().startswith('pre'):
            od.write('<pre>' + l[len('pre'):])
            # Copy everything verbatim up to the closing '/PRE' line.
            while len(lines) > 0:
                l = lines.pop(0).rstrip()
                if l.lower().startswith('/pre'):
                    break
                od.write(l + '\n')
            od.write("</pre>\n")
            # Start over with the next logical line.  Fetching it here and
            # falling through would hand None to lineC() when only comments
            # remain, and would not recognise a second PRE block.
            continue

        try:
            d = lineC(l)
        except ValueError as x:
            raise ValueError("Bad Parse, line %d: %s" % (n, x))

        # Close every tag opened at this indentation or deeper.
        while d.indent <= stack[-1][1]:
            cltag, oindent = stack.pop(-1)
            od.write("</%s>%s" % (cltag, eol))

        if d.tag is not None:
            some_refs, cltag = process_tag(d, od, hinfo, eol)
            if cltag is not None:
                stack.append((cltag, d.indent))
            references.extend(some_refs)
        elif d.txt is not None:
            od.write('%s%s' % (escape(d.txt), eol))

    # Close whatever is still open, innermost first.
    while len(stack) > 0:
        od.write('</%s>%s' % (stack.pop(-1)[0], eol))

    return references
349 350
def swapend(a, e):
    """Replace the extension of the file name ``a`` with ``e``.

    Everything from the last '.' of ``a`` onwards is dropped and ``e`` is
    appended, so ``e`` should include its own dot.  Raises ValueError if
    ``a`` contains no '.'.
    """
    stem, dot, _old_ext = a.rpartition('.')
    if not dot:
        raise ValueError('substring not found')
    return stem + e
354 355
def aget(dic, alist):
    """Format selected entries of ``dic`` as HTML attributes.

    Each name in ``alist`` is looked up as given, then in lower case, then
    in upper case; the first spelling found is written as ``name="value"``
    using that spelling.  Names that are not found are skipped.

    Returns the attributes joined by spaces with one leading space, or ''
    if none was found.
    """
    o = []
    for a in alist:
        for ac in (a, a.lower(), a.upper()):
            # 'in' replaces dict.has_key(), which no longer exists in Python 3.
            if ac in dic:
                o.append('%s="%s"' % (ac, dic[ac]))
                break
    if not o:
        return ''
    return " " + ' '.join(o)
366 367
def sw(s, prefix):
    """Tell whether ``s`` starts with ``prefix`` as given, in lower case, or in upper case."""
    spellings = (prefix, prefix.lower(), prefix.upper())
    return s.startswith(spellings)
370
def dpre(s, prefix):
    """Return ``s`` without its first ``len(prefix)`` characters.

    Only the length of ``prefix`` is used; ``s`` is not checked to actually
    start with it.
    """
    skip = len(prefix)
    return s[skip:]
373
def agp(dic, prefix):
    """Format the entries of ``dic`` whose key starts with ``prefix`` as HTML attributes.

    The prefix is matched with sw(), and a key must be longer than the
    prefix to count.  The prefix is removed from the attribute name.

    Returns the attributes joined by spaces with one leading space, or ''
    if there is none.
    """
    lp = len(prefix)
    o = ['%s="%s"' % (k[lp:], v)
            for (k, v) in dic.items()
            if len(k) > lp and sw(k, prefix)]
    if o:
        return " " + ' '.join(o)
    return ''
385 386
def get(dic, key, dfl):
    """Look up ``key`` in ``dic`` without regard to simple case differences.

    The key is tried as given, then in lower case, then in upper case.
    Returns the first value found, or ``dfl`` if none of them is present.
    """
    for k in (key, key.lower(), key.upper()):
        # 'in' replaces dict.has_key(), which no longer exists in Python 3.
        if k in dic:
            return dic[k]
    return dfl
392 393
def _header(thd, hinfo):
    """Write the default start of an HTML page, up to and including <body>.

    thd: object with a write() method that receives the HTML.
    hinfo: header dictionary.  Entries used: 'DOCTYPE', 'lang',
        'stylesheet', '_mod_time', 'TITLE', 'filename' (required; it is the
        fallback title), keys starting with 'M.' (written as <meta name=...>),
        keys starting with 'HTTP.' (written as <meta http-equiv=...>), and
        keys starting with 'B.' (written as attributes of <body>).
    """
    doctype = get(hinfo, 'DOCTYPE',
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">')
    thd.write(doctype + '\n')
    thd.write('<html%s>\n' % aget(hinfo, ['lang']))
    thd.write("<head>\n")
    sty = get(hinfo, 'stylesheet', '')
    if sty != '':
        # Ends with a newline, like every other element written here.
        thd.write('<link rel="stylesheet" type="text/css" href="%s">\n' % sty)
    if '_mod_time' in hinfo:
        thd.write('<meta http-equiv="Last-Modified" content="%s">\n'
                % time.asctime(time.gmtime(hinfo['_mod_time'])))
    for (k, v) in hinfo.items():
        if sw(k, 'M.'):
            thd.write('<meta name="%s" content="%s">\n' % (dpre(k, 'M.'), v))
        if sw(k, 'HTTP.'):
            # Strip the same prefix that was tested for.  The previous
            # 'HTTP:' only worked because it has the same length.
            thd.write('<meta http-equiv="%s" content="%s">\n' % (dpre(k, 'HTTP.'), v))
    thd.write("<title>%s</title>\n" % get(hinfo, 'TITLE', hinfo['filename']))
    thd.write("</head>\n")
    thd.write("<body%s>\n" % agp(hinfo, 'B.'))
415 416
def _common(lines, hinfo):
    """Default 'common' hook used by go(): return ``lines`` unchanged; ``hinfo`` is ignored."""
    return lines
419 420 423 424
def readheader(fd, t):
    """Read the header section of a .q file.

    fd: open file positioned at the start of the header.  It is read with
        readline() only, up to and including the first blank line (or the
        end of the file), so the caller can carry on reading the body.
    t: file name, stored under the key 'filename'.

    Every header line is 'KEYWORD value'.  Lines starting with '#' are
    skipped.

    Returns the dictionary of header entries.
    Raises ValueError for a header line that has a keyword but no value.
    """
    h = {'filename': t}
    while 1:
        # readline() returns '' at end of file, which ends the header just
        # as a blank line does.
        l = fd.readline().rstrip()
        if l == '':
            break
        if l[0] == '#':
            continue
        a = l.split(None, 1)
        if len(a) != 2:
            # An explicit error, so that the check survives 'python -O'.
            raise ValueError("Header line has a keyword but no value: %r" % l)
        h[a[0].strip()] = a[1].strip()
    return h
# Index of the modification time in the tuple form of an os.stat() result.
MTIME = 8


def needcopy(a, b, force=None):
    """Do we need to reconstruct file b from
    file a?   Also reconstruct b if it is older
    than the date in force.

    Returns 1 if b does not exist, is older than ``force`` (when given), or
    is older than a; returns 0 otherwise.  File a must exist and be readable.
    """
    assert os.access(a, os.F_OK | os.R_OK)
    src_mtime = os.stat(a)[MTIME]
    if not os.access(b, os.F_OK):
        return 1
    dst_mtime = os.stat(b)[MTIME]
    forced = force is not None and dst_mtime < force
    stale = dst_mtime < src_mtime
    if forced or stale:
        return 1
    return 0
459 460
def file_only(url):
    """Given a URL, just return the file part,
    not the target specifier inside the file.

    A URL holding more than one '#' cannot be split; a warning is issued and
    the URL is returned unchanged.
    """
    if url.count('#') > 1:
        die.warn("URL has more than one '#': can't split into file and target.")
        return url
    return url.partition('#')[0]
470 471
def del_fin_sl(s):
    """Return ``s`` without its final '/', if it ends with one.  At most one slash is removed."""
    return s[:-1] if s.endswith('/') else s
476 477
def go(env, force=None):
    """Convert every out-of-date ``*.q`` file in the current directory to HTML.

    env: dictionary of settings.  Entries read here: 'header', 'footer' and
        'common' (functions, defaulting to _header, _footer and _common),
        'DEFAULT_HEADER' (dictionary, default empty), 'server' and
        'rel_root' (default '.'), and 'cwd' (optional).
    force: optional modification time handed to needcopy(); HTML files older
        than this are rebuilt as well.

    The HTML for 'name.q' is piped through /usr/bin/tidy, which is told to
    write 'name.html'.

    Returns the set of URLs referenced by the files that were processed.
    """
    htmlheader = env.get('header', _header)
    htmlfooter = env.get('footer', _footer)
    htmlcommon = env.get('common', _common)
    default_header = env.get('DEFAULT_HEADER', {})
    references = set()
    for t in qfiles('.'):
        th = swapend(t, '.html')
        if not needcopy(t, th, force):
            # A single formatted argument prints the same text whether
            # print is a statement or a function.
            print("# No need to process %s" % t)
            continue
        die.info("# processing:%s -> %s\n" % (t, th))
        fd = open(t, "r")
        try:
            # NOTE(review): presumably thd is the write end of a pipe into
            # tidy; thdo is never used -- verify against gmisclib.g_pipe.
            thd, thdo = g_pipe.popen2("/usr/bin/tidy", ['tidy', '-asxhtml', '-c', '-i', '-q', '-o', th])
            hinfo = default_header.copy()
            hinfo.update(readheader(fd, t))
            hinfo['_mod_time'] = os.fstat(fd.fileno()).st_mtime
            hinfo['_server_root'] = del_fin_sl(env.get('server', '.'))
            hinfo['_abs_root'] = del_fin_sl(env.get('server', '.'))
            hinfo['_rel_root'] = del_fin_sl(env.get('rel_root', '.'))
            if 'cwd' in env:
                hinfo['_cwd'] = env['cwd']
                hinfo['_local_root'] = env['cwd']
            htmlheader(thd, hinfo)
            # readheader() has consumed the header; what is left is the body.
            lines = htmlcommon([x.rstrip() for x in fd.readlines()], hinfo)
            references.update(process(lines, thd, hinfo))
        finally:
            # Release the input file even if the conversion fails.
            fd.close()
        htmlfooter(thd, hinfo)
        thd.close()

    return references
# Name of the settings script that easygo() looks for in the current
# directory and in the directories above it.
DEFF = 'headfoot.py'
def easygo():
    """Find the settings script, load it, and convert the current directory.

    DEFF is looked for in the current directory and then in up to nine
    levels of parent directories.  The first readable one is executed, and
    the names it defines become the environment handed to go(); it must
    define 'server'.  Its modification time is passed to go() as ``force``,
    so that HTML older than the script is rebuilt.  If no script is found,
    go() runs with an empty environment.

    Returns whatever go() returns.
    """
    force = None
    env = {}
    cwd = os.getcwd().split('/') + ['.']
    for i in range(10):
        chkd = './' + ( '../' * i )
        chk = chkd + DEFF
        # A single formatted argument prints the same text whether print is
        # a statement or a function.
        print("checking %s" % chk)
        if os.access(chk, os.R_OK):
            env['rel_root'] = chkd[:-1]
            # Path of the current directory relative to the directory the
            # script was found in.
            env['cwd'] = '/'.join(cwd[-i-1:])
            # SECURITY: this runs whatever code the discovered file holds.
            # Only use this in directory trees whose contents are trusted.
            execfile(chk, env)
            assert 'server' in env
            force = os.stat(chk)[MTIME]
            break
    return go(env, force)
if __name__ == '__main__':
    # Convert the current directory, then list every URL the pages refer to.
    o = easygo()
    print("# Files to upload:")
    for t in o:
        print(t)