#!/usr/local/bin/python
"""
NAME

  chk_xhtml.py -  Validate XHTML using the xml.sax parser

SYNOPSIS

  python chk_xhtml.py [-v] [-f catalog_file] file [file ...]

  where:
    -v               verbose
    -f catalog_file  define location of local DTD catalog file
    file             name of file to be checked

DESCRIPTION

  Validates XHTML file or files using the python xml.sax engine.
  Errors in the XHTML file are written to sys.stderr and cause
  chk_xhtml.py to return a non-zero exit code.

  Note that chk_xhtml.py only validates that the XHTML is well-formed.
  It does not check that the XHTML conforms to the DTD.

  If you have local copies of the DTDs and other supporting files,
  specify the catalog file name to chk_xhtml.py via the -f argument.
  This makes chk_xhtml.py much faster.

NOTES

  Uses catalog.py to build an in-memory copy of the local entity definitions,
  which may be flaky.

MODIFICATION HISTORY
  Mnemonic   Rel   Date     Who
  chk_xhtml  1.0   20050922 mpw
     Created.

$Id: chk_xhtml.py,v 1.1 2014/04/18 19:45:45 mark Exp $
"""

import xml.sax
import xml.sax.handler
import xml.sax.saxutils
import sys
import getopt
import os
import catalog

# Local resolver class; requires catalog.py to build in-memory copy of local
# entity resolution definitions.
#
# You can use this class to stop the sax parser reading DTDs, if required,
# by passing an argument of "/dev/null" to prepare_input_source.
class myER(xml.sax.handler.EntityResolver):
    """SAX entity resolver that prefers local copies named by the catalog.

    Lookups go through the module-level ``cat`` object.  When the catalog
    resolves an entity, the directory of the resolved file is remembered in
    ``wd`` so that later entities the catalog does not know about can be
    looked for next to it.
    """

    def __init__(self):
        # Directory of the most recent catalog hit, stored with a trailing
        # path separator; "" until the catalog has resolved something.
        self.wd = ""

    def resolveEntity(self, ent_name, loc):
        """Return an input source for the entity ``ent_name``.

        ent_name -- identifier passed to the catalog's resolveEntity()
        loc      -- location given in the document/DTD, used as a fallback

        Resolution order:
          1. the file name the catalog returns for ``ent_name``;
          2. ``loc`` taken relative to the directory of the last catalog
             hit, if such a file exists;
          3. ``loc`` as given (this will probably fail for remote DTDs).

        The chosen name is handed to xml.sax.saxutils.prepare_input_source.
        """
        filename = cat.resolveEntity(ent_name)
        if filename is None:
            # Not in the catalog: it is probably defined relative to the
            # parent DTD, so look beside the last successful resolution.
            candidate = os.path.join(self.wd, loc)
            if os.path.exists(candidate):
                filename = candidate
            else:
                filename = loc
        else:
            # join(d, "") appends a separator only when d is non-empty, so a
            # catalog entry without a directory part leaves wd as "" (the
            # current directory) rather than turning it into "/".
            self.wd = os.path.join(os.path.dirname(filename), "")
        return xml.sax.saxutils.prepare_input_source(filename)
    
# ---------------------------------------------------------------------------
# Script body: parse options, optionally load the catalog, then check each
# file named on the command line.  Processing stops with exit status 1 at the
# first unreadable or badly-formed file.
# ---------------------------------------------------------------------------

verbose = False
catfile = ""
# Module-global on purpose: myER.resolveEntity looks this object up by name.
cat = catalog.Catalog()

# Only files containing this DOCTYPE prefix are parsed; everything else is
# skipped.  A bytes literal because files are read in binary mode.
XHTML_MARKER = b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML'

try:
    opts, args = getopt.getopt(sys.argv[1:], 'vf:')
except getopt.GetoptError as e:
    sys.stderr.write("%s: illegal argument: %s\n" % (sys.argv[0], e.opt))
    sys.exit(1)

for o, v in opts:
    if o == '-v':
        verbose = True
    elif o == '-f':
        catfile = v

# The catalog is only built when -f was given; otherwise it stays empty.
if catfile and not cat.build(catfile):
    sys.stderr.write("%s: unable to build catalog from %s\n"
                     % (sys.argv[0], catfile))
    sys.exit(1)

for f in args:
    try:
        # Binary mode: the XML parser works out the document encoding itself,
        # and the context manager closes the handle promptly.
        with open(f, "rb") as fh:
            contents = fh.read()
    except IOError:
        sys.stderr.write("%s: unreadable file: %s\n" % (sys.argv[0], f))
        sys.exit(1)

    # Progress is written without a newline; the verdict completes the line.
    if verbose:
        sys.stdout.write("Processing %s" % f)

    # is file XHTML?
    if XHTML_MARKER not in contents:
        if verbose:
            sys.stdout.write(" (not XHTML)\n")
        continue

    p = xml.sax.make_parser()
    # The default ContentHandler ignores every event: only well-formedness
    # errors raised by the parser matter here.
    p.setContentHandler(xml.sax.ContentHandler())
    p.setEntityResolver(myER())
    try:
        p.feed(contents)
        p.close()
    except xml.sax.SAXParseException as e:
        if verbose:
            # Finish the pending "Processing ..." line before the error.
            sys.stdout.write("\n")
            sys.stdout.flush()
        sys.stderr.write("%s: invalid XHTML: %s\n" % (f, e))
        sys.exit(1)

    if verbose:
        sys.stdout.write(" (OK)\n")
