Tuesday 24 September 2013

This is a small program to find all the documents and files on a website using Python.

# This is a small program to find all the documents on a website using Python. If you extend this program it will give many more results. Just try it once.

import urllib2
from urlparse import urlparse
from posixpath import dirname

# Shared queue of "url,referrer-directory" strings still to be crawled.
url_queue = []

# Base website URL to start crawling from:
base_url = "http://localhost/python_docs/"

#### Uncomment this block to route requests through an HTTP proxy:
#opener = urllib2.build_opener(
#                urllib2.HTTPHandler(),
#                urllib2.HTTPSHandler(),
#                urllib2.ProxyHandler({'http': 'http://127.0.0.1:1233'}))
#urllib2.install_opener(opener)

class website_analyzer:
    def __init__(self, website):
        self.website = website
        # Seed the queue; the trailing comma marks an empty referrer directory.
        url_queue.append(self.website + ",")
        print "The given website is being analyzed by the web robot:", self.website
        # url_queue grows while we iterate over it, so every link that
        # selected_url() discovers gets crawled in turn.
        for url in url_queue:
            selected_url(url)
        print url_queue
       

class selected_url:
    def __init__(self, link):
        # Each queue entry has the form "url,referrer-directory".
        self.link = link.split(",")[0]
        url_ref = link.split(",")[1]
        self.ret_data(self.link, url_ref)

    def ret_data(self, link, url_ref):
        self.url_ref = url_ref  # referrer directory (stored, not used further)
        try:
            page_data = urllib2.urlopen(self.link).read()
            # Crude HTML parsing: split the page on href= and take the
            # quoted value that follows each occurrence.
            href_split = page_data.split("href=")
            for single_link in href_split:
                try:
                    single_url = single_link.split("\"")[1]
                    url_split = urlparse(single_url)
                    if url_split.netloc == "":
                        # Relative link: prefix it with the base URL.
                        append_url = base_url + url_split.geturl() + "," + dirname(url_split.path)
                    else:
                        # Absolute link: keep it as-is.
                        append_url = url_split.geturl() + "," + dirname(url_split.path)
                    if (append_url not in url_queue) and '#' not in append_url:
                        print append_url
                        url_queue.append(append_url)
                except IndexError:
                    # No quoted URL after this href= occurrence; skip it.
                    pass
        except ValueError, err:
            print "Error link", self.link, "error is", err
        except urllib2.URLError:
            print "URL not fetched:", self.link


if __name__ == "__main__":
    website_analyzer("http://localhost/python_docs")
