This is a small program that finds all the documents and files on a website using Python.
# A small web robot that finds every document linked from a website. Extending this program will yield many more results; give it a try.
import urllib2
from urlparse import urlparse
from posixpath import dirname
# Shared queue of "url,referrer-directory" strings discovered so far.
url_queue = []
# Base website URL to crawl:
base_url = "http://localhost/python_docs/"
# Proxy handling: uncomment the block below to route requests through a proxy.
#opener = urllib2.build_opener(
# urllib2.HTTPHandler(),
# urllib2.HTTPSHandler(),
# urllib2.ProxyHandler({'http': 'http://127.0.0.1:1233'}))
#urllib2.install_opener(opener)
class website_analyzer:
    """Seeds the queue with the start URL and drives the crawl."""
    def __init__(self, website):
        self.website = website
        # Each queue entry is a "url,referrer-directory" pair; the seed has no referrer.
        url_queue.append(self.website + ",")
        print "The given website is analyzed by the web robot:", self.website
        # url_queue grows while we iterate, so newly discovered links get crawled as well.
        for url in url_queue:
            selected_url(url)
        print url_queue
class selected_url:
    """Fetches one queued URL and pushes any new links it finds onto the queue."""
    def __init__(self, link):
        # Split the "url,referrer-directory" pair back apart.
        self.link = link.split(",")[0]
        url_ref = link.split(",")[1]
        self.ret_data(self.link, url_ref)

    def ret_data(self, link, url_ref):
        self.url_ref = url_ref
        try:
            page_data = urllib2.urlopen(self.link).read()
            # Crude link extraction: split the page on href= and take the quoted value.
            href_split = page_data.split("href=")
            for single_link in href_split:
                try:
                    single_url = single_link.split("\"")[1]
                    url_split = urlparse(single_url)
                    if url_split.netloc == "":
                        # Relative link: resolve it against the base URL.
                        append_url = base_url + url_split.geturl() + "," + dirname(url_split.path)
                    else:
                        # Absolute link: keep it as it is.
                        append_url = url_split.geturl() + "," + dirname(url_split.path)
                    if (append_url not in url_queue) and '#' not in append_url:
                        print append_url
                        url_queue.append(append_url)
                except IndexError:
                    # No quoted href value in this fragment; skip it.
                    pass
        except ValueError, err:
            print "Error link", self.link, "error is", err
        except urllib2.URLError:
            print "URL not fetched:", self.link
if __name__ == "__main__":
    website_analyzer("http://localhost/python_docs")
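The listing above targets Python 2 (urllib2, urlparse, print statements). As a rough point of comparison, here is a minimal sketch of the same crawl-the-growing-queue idea on Python 3, using urllib.request and the standard html.parser module instead of splitting the page on "href=". The names LinkParser and crawl, and the same-host restriction, are illustrative assumptions and not part of the original program.

# Minimal Python 3 sketch of the same idea (illustrative adaptation, not the original program).
import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse

class LinkParser(HTMLParser):
    # Collects href attribute values from <a> tags.
    def __init__(self):
        super().__init__()
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value and "#" not in value:
                    self.links.append(value)

def crawl(base_url):
    queue = [base_url]
    seen = set(queue)
    for url in queue:  # the queue grows while we iterate, just like url_queue above
        try:
            page = urllib.request.urlopen(url).read().decode("utf-8", "replace")
        except Exception as err:
            print("URL not fetched:", url, err)
            continue
        parser = LinkParser()
        parser.feed(page)
        for link in parser.links:
            absolute = urljoin(url, link)
            # Keep the crawl on the base host and skip links already seen
            # (a simplification: the original queues external links as well).
            if urlparse(absolute).netloc == urlparse(base_url).netloc and absolute not in seen:
                seen.add(absolute)
                queue.append(absolute)
                print(absolute)

if __name__ == "__main__":
    crawl("http://localhost/python_docs/")

In both versions, iterating over a list while appending to it gives a simple breadth-first crawl without an explicit worklist loop; for anything beyond a toy crawler you would also want to respect robots.txt and bound the crawl depth.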