"""
to create the html docs of this script run it thru epydoc
python yahoo_stripper.py produces the example file clean.html
(basically you are doing a search on yahoo for "fravia searching").
There was a discussion on the board on how to parse html.
One person suggested using regular expressions. That is not a good idea;
it is too prone to mistakes. Anyway, here is how to scrub the links and
descriptions out of a yahoo search. And generate your own nice and clean
webpage ;). Lots of room for improvement, meant to be a learning tool.

references
http://www.voidspace.org.uk/python/articles/urllib2.shtml
http://diveintopython.org/html_processing/
http://epydoc.sourceforge.net

XML parsers come in two different flavors: DOM and SAX.
HTML and XML are both descendants of SGML.
sgmllib is an event-driven parser for html, i.e. it is of the SAX flavor.
The parser in javascript is a DOM parser, since it parses the entire webpage and makes a "tree".
A SAX parser is usually faster, but you have less control.

Anyway, the page is fetched from yahoo using urllib2.
Then it is fed thru YahooHTMLParser.
Lo and behold, self.clean_links ends up holding a list of the links on yahoo,
along with the abstract (if it is there).

Then I take the list and make my own webpage ;)
Again I am not evangelizing python, you can do the same thing in Perl, or Scheme or any other language
that has a html parser. Even in python there are other html parsers besides sgml.
"""
import urllib2
import urllib
from sgmllib import SGMLParser

class YahooHTMLParser(SGMLParser):
	"""Our Parser which we have created to strip the garbage from yahoo"""
	class CleanLink(object):
		"""convenience class to hold the cleaned links from yahoo"""
		def __init__(self):
			self.link = None
			self.abstract = []
	
	def feed(self,page):
		"""This is where we initialize the I{state variables} think of it as the constructor"""
		self.in_link = False
		self.in_abstract = False
		self.clean_links = []
		self.current_link = None
		SGMLParser.feed(self,page)
		
	def start_div(self,attrs):
		"""catch div attributes. I{Abstracts} in yahoo are stored in a div with a class attribute of "yschabstr"
		"""
		attrs = dict(attrs)
		try :
			cls = attrs["class"]
			if cls == "yschabstr" :
				self.in_abstract = True
		except KeyError :
			pass
			
	def end_div(self):
		"""In a more complex parser we should count the number of divs we have entered.
			Here we just blindly assume that if a div has ended and we are in an abstract we should just leave.
			Notice I save the strings as a list then at the end join the strings.
			This is because of how python handles strings and an optimization trick. (search on web for optimizing python)
		"""
		if self.in_abstract :
			self.in_abstract = False
			self.current_link.abstract="".join(self.current_link.abstract)
		
	def start_a(self,attrs):
		"""look for the anchor which has class attribute of "yschttl" """
		attrs = dict(attrs)
		try :
			if attrs["class"] == "yschttl" :
				self.current_link = self.CleanLink()
				self.clean_links.append(self.current_link)
				self.current_link.link = attrs["href"]
				self.in_link = True
		except KeyError:
			pass
			
	def end_a(self):
		if self.in_link :
			self.in_link = False
			
	def unknown_starttag(self,tag,attrs) :
		"""if we are in an abstract just save the html tag so it looks the same"""
		if self.in_abstract :
			self.current_link.abstract.append(self.get_starttag_text())
	
	def unknown_endtag(self,tag) :
		if self.in_abstract :
			self.current_link.abstract.append("</%s>"%(tag))
		
	def handle_data(self,text):
		"""If we are in abstract save the text"""
		if self.in_abstract :
			self.current_link.abstract.append(text)

def main(query="fravia searching", out_path="clean.html"):
	"""main procedure fetch the search query, feed it into the parser and then create our own simple
		web page (called "clean.html" unless told otherwise)

	@param query: the text to search yahoo for
	@param out_path: name of the html file that is written
	"""
	# only needed here, so the import lives next to its use
	import cgi

	yahoo_url = "http://search.yahoo.com/search?%s"
	params = {"p" : query,
		"ei" : "UTF-8",
		"fr" : "sfp"}
	url_params = urllib.urlencode(params)

	f = urllib2.urlopen(yahoo_url%(url_params))
	try :
		page = f.read()
	finally :
		# close the connection even if reading fails
		f.close()

	yahParser = YahooHTMLParser()
	yahParser.feed(page)
	yahParser.close()

	link_html='<li><a href="%s">%d %s</a></li><li>%s</li>'
	f = open(out_path,"w")
	try :
		f.write("<html><body><ul>")
		for index, link in enumerate(yahParser.clean_links) :
			abstract = link.abstract
			if isinstance(abstract, list) :
				# abstract still in pieces: glue them together instead of
				# writing the repr of a list into the page
				abstract = "".join(abstract)
			# the link comes from a remote page, so escape it (quotes too)
			# before it goes into the attribute and the link text
			href = cgi.escape(link.link or "", True)
			# the abstract is html on purpose, it is written as it is
			f.write(link_html%(href,index+1,href,abstract))
		f.write("</ul></body></html>")
	finally :
		f.close()
	
# only run the search when started as a script, not when imported (e.g. by epydoc)
if __name__ == "__main__" :
	main()

