'''
Function: get_links(url)
parameter: url
urlparse.urlparse: parse a URL into six components, returning a 6-tuple (scheme, netloc, path, params, query, fragment); please refer to http://docs.python.org/library/urlparse.html for more info about urlparse.
HTTPConnection.request(method, url[, body[, headers]])
'''
import urllib, urllister
import urlparse
import httplib
import time
import urllib2
def get_links(url):
usock=urllib.urlopen(url)
parser=urllister.URLLister() #Create a instance
parser.feed(usock.read()) #Put the resource(html) into parser,and get the relevent segments from the resource.
usock.close()
parser.close()
uhost=urlparse.urlparse(url)
for url in parser.urls:
print url
up=urlparse.urlparse(url)
if up.netloc=="": #Some link may not contain 'http:'(called absolute path')
conn=httplib.HTTPConnection(uhost.netloc)
conn.request("GET","/"+up.path+"?"+up.params+up.query+up.fragment)
res=conn.getresponse()
status=res.status
reason=res.reason
#data=res.read()
conn.close()
else:
conn=httplib.HTTPConnection(uhost.netloc)
conn.request("GET",up.path+"?"+up.params+up.query+up.fragment)
res=conn.getresponse()
status=res.status
reason=res.reason
#data=res.read()
conn.close()
print url,status,reason
if __name__ == '__main__':
    # Prompt for a starting URL and check every link found on that page.
    # (Removed a stray "|" character that followed this block — it was a
    # module-level SyntaxError.)
    url = raw_input("Please enter the url you want to check:\n")
    get_links(url)