Trivial Thoughts
Thoughts and discussion on programming projects using the Python language.



Subscribe to "Trivial Thoughts" in Radio UserLand.

Click to see the XML version of this web page.

Click here to send an email to the editor of this weblog.
 

 

iwannamassage.py v0.1

# Fetch and parse an HTML page into a DOM-like tree of elements.
# Uses my new favorite XML parser, ElementTree by the effbot.
# Rather than use ElementTree's interface to the Tidy command
# (which runs Tidy in a subshell), I'm using M.-A. Lemburg's version
# that turns Tidy into a Python extension.
from __future__ import generators
import sys, os, tempfile
import urllib2
from mx.Tidy import Tidy
from elementtree import ElementTree, HTMLTreeBuilder
def main(url):
    """Fetch the page at *url*, tidy it into XML, parse it, and dump the tree.

    The raw page is written to one temp file, mx.Tidy writes its cleaned
    output to a second temp file, and that second file is parsed with
    ElementTree's HTML tree builder.  The resulting tree is dumped with
    ElementTree.dump().  If Tidy reports any errors or warnings, the counts
    and the error text are printed first.

    url -- the address of the HTML page to fetch (passed to urllib2.urlopen).

    Returns None.  Exceptions raised while fetching, tidying or parsing are
    not caught here; the temp files are closed either way.
    """
    # mx.Tidy.tidy() will take only true file objects for input and
    # output, so we set up two temp file objects for it.
    inTempFileObj = os.tmpfile()
    outTempFileObj = os.tmpfile()
    try:
        # Open our target URL, read it all in, and write it all out to
        # a temp file.  Close the connection as soon as we have the data.
        response = urllib2.urlopen(url)
        try:
            inTempFileObj.writelines(response.readlines())
        finally:
            response.close()

        # Make sure the temp file we just output to is ready to be
        # an input source.
        inTempFileObj.flush()
        inTempFileObj.seek(0, 0)

        # Use mx.Tidy.tidy to convert our input HTML into clean output.
        # The call is wrapped inside the parentheses so the statement can
        # span two lines without a backslash continuation.
        (nerrors, nwarnings, outputdata, errordata) = Tidy.tidy(
            inTempFileObj, outTempFileObj, output_xml=1)

        # Make sure the temp file we just output to is ready to be
        # an input source.
        outTempFileObj.flush()
        outTempFileObj.seek(0, 0)

        # I want to know about any tidy errors/warnings.
        # (Single parenthesised argument: prints the same text as the
        # plain print statement.)
        if nerrors > 0 or nwarnings > 0:
            print("Tidy had %d errors, %d warnings" % (nerrors, nwarnings))
            print("Errors: %s" % errordata)

        # Get an ElementTree HTML parser.
        htmlParserObj = HTMLTreeBuilder.TreeBuilder()

        # Tell ElementTree to parse our tidied source, using an
        # HTML parser.  This will give us a top-level ElementTree object.
        treeObj = ElementTree.parse(outTempFileObj, parser=htmlParserObj)

        # For now, just dump out the tree so we can see it.
        ElementTree.dump(treeObj)
    finally:
        # Release both temp files even if fetching, tidying or parsing
        # raised.
        inTempFileObj.close()
        outTempFileObj.close()
if __name__ == "__main__":
    # The script needs the URL to fetch as its first argument.  Without
    # it, print a usage line instead of dying with an IndexError.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s URL\n" % sys.argv[0])
        sys.exit(2)
    main(sys.argv[1])


Click here to visit the Radio UserLand website. © Copyright 2003 Michael Kent.
Last update: 6/26/2003; 12:14:26 PM.
This theme is based on the SoundWaves (blue) Manila theme.