#!/usr/bin/env python
''' Created by Antonello Cicchese 
    http://www.antonellocicchese.com
'''

import sys
import string
from urllib2 import Request, urlopen, URLError, HTTPError
import urlparse
import getopt
import htmllib
import formatter
import string


class Parser(htmllib.HTMLParser):
    # return a dictionary mapping anchor texts to lists
    # of associated hyperlinks

    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
    
    

def find_links(page, base):
    """Return the links found on ``page`` that belong to the site ``base``.

    The page is downloaded only when its URL path has no file extension
    or an extension that normally denotes an HTML-like document. Every
    href collected by ``Parser`` is resolved against ``base``, stripped
    of its ``#fragment`` and kept when the result starts with ``base``.

    Args:
        page: absolute URL of the page to download and scan.
        base: root URL of the site; used both to resolve relative links
            and to discard links that point outside the site.

    Returns:
        A list of absolute URLs (possibly with duplicates). The list is
        empty when the page is skipped or cannot be fetched; fetch
        errors are reported on standard output, not raised.
    """
    good_extensions = ("html", "htm", "php", "asp", "xml", "php3", "php4")

    # Only the last path segment can carry the file extension; taking it
    # from the right handles names with several dots and dotted folders.
    filename = urlparse.urlparse(page)[2].rsplit("/", 1)[-1]
    if "." in filename:
        extension = filename.rsplit(".", 1)[1].lower()
        if extension not in good_extensions:
            # Not an HTML-like resource: do not even request it.
            return []

    try:
        conn = urlopen(page)
    except HTTPError as e:
        print("The server couldn't fulfill the request.")
        print("Error code:  %s" % (e.code,))
        return []
    except URLError as e:
        print("We failed to reach a server.")
        print("Reason:  %s" % (e.reason,))
        return []

    try:
        content = conn.read()
    finally:
        # Always release the connection, even if reading fails.
        conn.close()

    parser = Parser()
    parser.feed(content)
    parser.close()

    found = []
    # Several anchors may share the same text, so every href of every
    # entry is considered, not just the first one.
    for hrefs in parser.anchors.values():
        for href in hrefs:
            # Drop the fragment: "page.html#top" is the same page.
            target = urlparse.urldefrag(urlparse.urljoin(base, href))[0]
            # Prefix test: a foreign URL that merely contains ``base``
            # (e.g. in its query string) is not part of the site.
            if target.startswith(base):
                found.append(target)
    return found

def build_dictionary(sep):
    """Crawl the site breadth-first starting from ``sep``.

    Pages are taken from the front of a FIFO queue, so they are visited
    in order of discovery. Progress is printed for every visited page
    and for every newly discovered link.

    Args:
        sep: URL of the starting page; it is also passed to
            ``find_links`` as the base used to keep the crawl on-site.

    Returns:
        A dictionary whose keys are all the URLs discovered. A value of
        1 means visited and 0 means queued; every entry is 1 on return.
    """
    from collections import deque

    dizio = {}
    queue = deque([sep])
    while queue:
        # popleft() gives FIFO order, i.e. a real breadth-first visit.
        currpage = queue.popleft()
        dizio[currpage] = 1  # sets it to visited
        links = find_links(currpage, sep)
        print("%s %d \n" % (currpage, len(links)))
        for link in links:
            # Direct dict membership is O(1); no key list is built.
            if link not in dizio:
                dizio[link] = 0
                queue.append(link)
                print("\t---> %s \n" % link)
    return dizio

def output(links):
    """Write every URL in ``links`` to ``sitemap.txt``, one per line.

    The file is created in the current working directory and any
    previous content is overwritten.

    Args:
        links: dictionary whose keys are the URLs to write (any iterable
            of strings works as well).
    """
    # The context manager flushes and closes the file even if a write
    # fails, so the handle is never left open.
    with open('sitemap.txt', 'w') as fout:
        for url in links:
            fout.write(url + "\n")
	
    


if __name__ == "__main__":
    # Command-line entry point: crawl the site given as first argument
    # and write the discovered URLs to the sitemap file.
    if len(sys.argv) < 2:
        print("Please specify a website url")
        # Non-zero status lets shells and scripts detect the usage error.
        sys.exit(1)
    site = sys.argv[1]
    print("Creating index ...")
    lista = build_dictionary(site)
    print("Producing sitemap ...")
    output(lista)
    print("Job finished! Bye")
