iwannamassage.py v0.1
# Fetch and parse an HTML page into a DOM-like tree of elements.
# Uses my new favorite XML parser, ElementTree by the effbot.
# Rather than use ElementTree's interface to the Tidy command
# (which runs Tidy in a subshell), I'm using M.-A. Lemburg's version
# that turns Tidy into a Python extension.

from __future__ import generators

import os
import sys
import tempfile
import urllib2

from mx.Tidy import Tidy
from elementtree import ElementTree, HTMLTreeBuilder


def main(url):
    """Fetch *url*, tidy it into XHTML, parse it and dump the element tree.

    The page is downloaded into a temp file, run through mx.Tidy into a
    second temp file, and that output is parsed with ElementTree's HTML
    tree builder.  Any Tidy errors or warnings are reported on stdout
    before the parsed tree is dumped to stdout.

    url -- the address of the HTML page to fetch.

    Returns None.  Exceptions raised by urllib2, mx.Tidy or ElementTree
    are not caught here; the temp files and the URL handle are closed
    either way.
    """
    # mx.Tidy.tidy() will take only true file objects for input and
    # output, so we set up two temp file objects for it.
    inTempFileObj = os.tmpfile()
    outTempFileObj = os.tmpfile()
    try:
        # Open our target URL, read it all in, and write it all out to
        # a temp file.  Close the connection as soon as we have the data.
        response = urllib2.urlopen(url)
        try:
            inTempFileObj.writelines(response.readlines())
        finally:
            response.close()

        # Make sure the temp file we just output to is ready to be
        # an input source.
        inTempFileObj.flush()
        inTempFileObj.seek(0, 0)

        # Use mx.Tidy.tidy to convert our input HTML into clean
        # output XHTML.
        nerrors, nwarnings, outputdata, errordata = Tidy.tidy(
            inTempFileObj, outTempFileObj, output_xml=1)

        # Make sure the temp file we just output to is ready to be
        # an input source.
        outTempFileObj.flush()
        outTempFileObj.seek(0, 0)

        # I want to know about any tidy errors/warnings.
        # (print with a single parenthesised argument behaves the same
        # as the bare print statement.)
        if nerrors > 0 or nwarnings > 0:
            print("Tidy had %d errors, %d warnings" % (nerrors, nwarnings))
            print("Errors: %s" % errordata)

        # Get an ElementTree HTML parser.
        htmlParserObj = HTMLTreeBuilder.TreeBuilder()

        # Tell ElementTree to parse our input XHTML source, using an
        # HTML parser.  This will give us a top-level ElementTree object.
        treeObj = ElementTree.parse(outTempFileObj, parser=htmlParserObj)

        # For now, just dump out the tree so we can see it.
        ElementTree.dump(treeObj)
    finally:
        # Release both temp files even if fetching, tidying or parsing
        # failed part-way through.
        inTempFileObj.close()
        outTempFileObj.close()
    return


if __name__ == "__main__":
    # Fail with a usage message rather than an IndexError traceback
    # when no URL was given on the command line.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s URL\n" % sys.argv[0])
        sys.exit(2)
    main(sys.argv[1])
|
© Copyright
2003
Michael Kent.
Last update:
6/26/2003; 12:14:26 PM.
This theme is based on the SoundWaves
(blue) Manila theme. |
|