import urllib
from HTMLParser import HTMLParser
import re
class HtmlParserTool():
    """Fetch the raw body of a URL and keep it in ``urlcontent``."""

    def __init__(self):
        # Body of the most recently fetched URL; empty until set_url() runs.
        self.urlcontent = ''

    def set_url(self, url):
        """Download *url* and store the response body in ``self.urlcontent``.

        A diagnostic line is printed when the body comes back empty.
        Errors raised by ``urllib.urlopen`` or ``read`` propagate to the
        caller.
        """
        response = urllib.urlopen(url)
        try:
            self.urlcontent = response.read()
        finally:
            # Release the underlying handle even when read() fails.
            response.close()
        if not self.urlcontent:
            # Single-argument print keeps the output identical under the
            # Python 2 print statement.
            print('HtmlParserTool:read url %s error\n' % (url,))
class HtmlParserGetUrl(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` tag fed to the parser.

    ``url`` receives one entry per anchor, in document order.  Anchors
    without an ``href`` (or with a valueless one) contribute an empty string.
    """

    def __init__(self):
        self.url = []      # href of each anchor seen so far
        self.urltag = 0    # 1 after an <a> start tag, 0 after any end tag
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor start tag."""
        if 'a' != tag:
            return
        hrefv = ""
        for name, value in attrs:
            if name == 'href':
                # HTMLParser reports a valueless attribute (<a href>) as
                # None; store '' so consumers can call str methods on it.
                hrefv = value or ""
        self.url.append(hrefv)
        self.urltag = 1

    def handle_data(self, data):
        """Text content is ignored by this parser."""
        return

    def handle_endtag(self, tag):
        """Clear the anchor flag on the first end tag after an anchor."""
        if 1 == self.urltag:
            self.urltag = 0
class HtmlSavePage():
    """Download a page, make its root-relative links absolute, and save it."""

    def __init__(self):
        # Anchor collector used by urlset().
        self.Parser = HtmlParserGetUrl()

    def urlset(self, subsite, page):
        """Feed *page* to the anchor parser and scan it for relative links.

        For every collected link that does not contain ``'http://'`` the
        method looks the link text up in *page* and, when it judges the match
        to sit right after an ``href``/``HREF``, inserts *subsite* in front
        of it.

        NOTE(review): the modified ``page`` is only rebound locally and is
        never returned or stored, so the only lasting effect is that
        ``self.Parser`` has consumed *page*.
        NOTE(review): the source reached review with its indentation
        stripped; the nesting below is a reconstruction - confirm against the
        author's intent before relying on it.
        """
        self.Parser.feed(page)
        for link in self.Parser.url:
            # Absolute links need no prefix; an empty href has nothing to
            # locate in the page.
            if not link or -1 != link.find('http://'):
                continue
            index = 0
            find = 0
            for _ in self.Parser.url:
                # NOTE(review): find() on a slice returns an offset relative
                # to the slice start, yet the value is used below as an
                # offset into the whole page.
                index = page[index:].find(link)
                if -1 != index:
                    index2 = page[0:index].rfind('href')
                    if -1 == index2:
                        index2 = page[0:index].rfind('HREF')
                    if -1 != index2:
                        if index - index2 > 2:
                            # Too far from the attribute name: skip this
                            # occurrence and look for the next one.
                            index = index + len(link)
                            continue
                        else:
                            find = 1
                else:
                    find = 0
                break
            if 1 == find:
                page = page[0:index] + subsite + page[index:]

    def save(self, path, url, site):
        """Fetch *url*, absolutize its root-relative links, write to *path*.

        path -- destination file name
        url  -- address of the page to download
        site -- scheme and host (no trailing slash) used as the link prefix
        """
        print('%s \n%s' % (path, url))
        webtool = HtmlParserTool()
        webtool.set_url(url)
        page = self._absolutize_links(webtool.urlcontent, site)
        # The context manager closes the file even if write() raises.
        with open(path, 'w') as savefile:
            savefile.write(page)

    @staticmethod
    def _absolutize_links(page, site):
        """Return *page* with ``src="/..."`` and ``href="/..."`` prefixed by *site*.

        Each attribute keeps its own name; only the run of leading slashes
        is replaced by ``site + '/'``.
        """
        for attr in ('src', 'href'):
            replacement = attr + '="' + site + '/'
            # A function replacement inserts the text literally, so
            # backslashes in *site* are not treated as regex escapes.
            page = re.sub(attr + '="//*', lambda match: replacement, page)
        return page
class HtmlParserTP1(HTMLParser):
    """Collect article links (hrefs containing ``'showart'``) and their text.

    ``url`` and ``urltitle`` are parallel lists: ``urltitle[i]`` is the text
    captured for ``url[i]``.  Anchors that carry a non-empty ``title``
    attribute, and hrefs already collected, are skipped.
    """

    def __init__(self):
        self.title = ''
        self.url = []        # accepted hrefs, without duplicates
        self.urltitle = []   # captured text, one entry per accepted href
        self.urltag = 0      # 1 while text is captured for the newest href
        self.urlTP1 = 0      # set to 1 once any <a> tag has been seen
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Accept an anchor when it is a new, untitled 'showart' link."""
        if 'a' != tag.lower():
            return
        self.urlTP1 = 1
        titlev = ""
        hrefv = ""
        for name, value in attrs:
            # Valueless attributes are reported as None; treat them as ''.
            if name == 'href':
                hrefv = value or ""
            if name == 'title':
                titlev = value or ""
        if titlev:
            return
        if 'showart' not in hrefv or hrefv in self.url:
            return
        self.url.append(hrefv)
        # Reserve the title slot right away so both lists always have the
        # same length, even for anchors that contain no text.
        self.urltitle.append('')
        self.urltag = 1

    def handle_data(self, data):
        """Append text to the title of the link currently being captured."""
        if 1 == self.urltag and self.urltitle:
            # Text may arrive in several chunks; keep it in a single entry.
            self.urltitle[-1] += data

    def handle_endtag(self, tag):
        """Stop capturing text at the first end tag after an accepted link."""
        if 1 == self.urltag:
            self.urltag = 0

    def set_url(self, url):
        """Fetch *url* and discard the body.

        NOTE(review): the downloaded content is not fed to the parser or
        stored anywhere - confirm whether this method is still needed.
        """
        response = urllib.urlopen(url)
        try:
            response.read()
        finally:
            response.close()
class HtmlTool():
    """Crawl an index page, log the article links it contains, save each one."""

    def __init__(self):
        self.HtmlTp = ''         # template name; only 'TP1' selects a parser
        self.urlfile = ''        # URL of the index page
        self.site = ''           # scheme + host of urlfile
        self.subsite = ''        # urlfile up to (excluding) its last '/'
        self.savepath = ''       # prefix for saved article files
        self.file = None         # link log, opened by open_file()
        self.HtmlParser = None   # created by set_url() for a known template
        self.UrlTool = HtmlParserTool()

    def set_savepath(self, path):
        """Set the prefix prepended to each saved article's file name."""
        self.savepath = path

    def get_site(self):
        """Derive ``site`` and ``subsite`` from ``urlfile`` and print both.

        ``site`` is everything before the first '/' that follows the
        ``scheme://`` part; ``subsite`` is everything before the last '/'.
        When the URL has no path at all, both are the URL itself.
        """
        url = self.urlfile
        scheme_end = url.find('://')
        # Search for the path only after the scheme separator, so 'http://'
        # and 'https://' are handled alike.
        host_start = scheme_end + 3 if -1 != scheme_end else 0
        path_start = url.find('/', host_start)
        if -1 == path_start:
            self.site = url
            self.subsite = url
        else:
            self.site = url[:path_start]
            self.subsite = url[:url.rfind('/')]
        print('site: %s' % (self.site,))
        print('sub site: %s' % (self.subsite,))

    def set_tp(self, tp):
        """Select the page template; 'TP1' is the only one handled."""
        self.HtmlTp = tp

    def open_file(self, file):
        """Open (truncating) the link log at path *file*."""
        self.file = open(file, 'w+')

    def close_file(self):
        """Close the link log if one has been opened."""
        if self.file is not None:
            self.file.close()

    def set_url(self, url):
        """Fetch the index page at *url*, log its links and save each article.

        For every link the parser collected, ``'<link> <- <title>'`` is
        written to the log and the linked page is saved under
        ``savepath + link``.  open_file() must have been called first.
        """
        self.urlfile = url
        self.get_site()
        if 'TP1' == self.HtmlTp:
            self.HtmlParser = HtmlParserTP1()
        # For any other template no parser is created here; the feed() call
        # below then fails with AttributeError unless a parser from an
        # earlier call is still present.
        self.UrlTool.set_url(url)
        self.HtmlParser.feed(self.UrlTool.urlcontent)
        links = self.HtmlParser.url
        titles = self.HtmlParser.urltitle
        if links:
            for i, link in enumerate(links):
                # A link may have no captured title; write '' instead of
                # failing on a short title list.
                title = titles[i] if i < len(titles) else ''
                self.file.write(link)
                self.file.write(' <- ')
                self.file.write(title)
                fileurl = self.subsite + '/' + link
                print(fileurl)
                savetool = HtmlSavePage()
                savetool.save(self.savepath + link, fileurl, self.site)
                self.file.write("\r\n")
            # NOTE(review): placement of this trailing blank line (inside
            # the 'if', after the loop) is reconstructed from a source whose
            # indentation was lost - confirm.
            self.file.write("\r\n")
# Small demonstration: prefix the value of a single-quoted src attribute.
# The pattern contains no regex metacharacters, so a plain string replace
# yields the same text.
subject = '<a src=\'/aaaaa\'>'
result = subject.replace('src=\'', "src=\'http:/bbb")
print(result)

# Crawl one blog index page with the TP1 template: the link log goes to
# d:/web.txt and each linked article is saved under d:/.
webtool = HtmlTool()
webtool.set_tp('TP1')
webtool.set_savepath('d:/')
webtool.open_file('d:/web.txt')
webtool.set_url('http://blog.chinaunix.net/u1/44067/index.php')
#webtool.set_url('http://blog.chinaunix.net/u2/62235/index.html')
webtool.close_file()
|