#!/usr/bin/python
#BaiDu Blog Backuper v2
import urllib
import string
import re
def Save2File(url, fn):
    """Download *url* and write the response body to a local file.

    Characters that are not allowed in Windows file names
    (``/ \\ : * ? " < > |``) are each replaced by a single space in *fn*
    before the file is created, so a page title can be passed as *fn*.

    Args:
        url: Address of the resource to download.
        fn: Desired file name; it is sanitised as described above and the
            file is created relative to the current working directory.

    Returns:
        None. If the URL cannot be opened, a message is printed and no file
        is written.
    """
    # Function-scope import so the block works on Python 2 and Python 3
    # without touching the file's top-level imports.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    print("Retrieving:  %s" % (url,))
    print("Save as:  %s" % (fn,))
    try:
        response = urlopen(url)
    except IOError:
        print("\nCan not retrieve  %s !\nThe connection cannot be made!\n"
              % (url,))
        return

    # Close the connection even when reading the body fails.
    try:
        body = response.read()
    finally:
        response.close()

    for forbidden in '/\\:*?"<>|':
        fn = fn.replace(forbidden, " ")

    # Binary mode: the body is stored exactly as received, with no newline
    # translation on Windows.
    with open(fn, "wb") as out:
        out.write(body)
def GetContent(url):
    """Fetch *url* and return the response body.

    Args:
        url: Address of the resource to download.

    Returns:
        The body exactly as read from the connection (a byte string). If the
        URL cannot be opened, a message is printed and an empty byte string
        is returned, so callers can always treat the result as page content.
    """
    # Function-scope import so the block works on Python 2 and Python 3
    # without touching the file's top-level imports.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    try:
        response = urlopen(url)
    except IOError:
        print("\nCan not retrieve  %s !\nThe connection cannot be made!\n"
              % (url,))
        return b""

    # Close the connection even when reading the body fails.
    try:
        return response.read()
    finally:
        response.close()
def main(fetch=None):
    """Print an HTML page linking to every article found on the blog index.

    Index pages are requested as ``<list_base>0``, ``<list_base>1``, ... and
    scanned line by line for article links. Crawling stops at the first page
    that yields no link, or for which nothing could be fetched.

    Args:
        fetch: Callable that takes a URL and returns the page text. Defaults
            to this module's ``GetContent``.
    """
    if fetch is None:
        fetch = GetContent

    list_base = "http://hi.baidu.com/linxhchina/blog/index/"
    # No trailing slash: the paths captured below already start with "/".
    article_base = "http://hi.baidu.com"
    link_re = re.compile(
        r'<a href="(/linxhchina/blog/item/.*?html)".*?>(.*?)</a></div>')

    articles = []  # (path, title) pairs in the order they were found
    page = 0
    while True:
        content = fetch("%s%d" % (list_base, page))
        page += 1
        if not content:
            # Nothing fetched: treat it as the end of the listing.
            break
        # Only the first link on each line is considered.
        matches = (link_re.search(line) for line in content.split("\n"))
        found = [m.groups() for m in matches if m]
        if not found:
            break
        articles.extend(found)

    print("<html>")
    print("<head>")
    print("<title>Baidu Blog List</title>")
    print("</head>")
    print("<body>")
    print("<font face=\"Arial, Helvetica\" size=4>")
    for path, title in articles:
        article_url = "%s%s" % (article_base, path)
        print("<a target=\"_blank\" href=\"%s\">%s</a><br>"
              % (article_url, title))
        # Saving a local copy of each article is currently disabled:
        # Save2File(article_url, title + ".html")
    print("</font>")
    print("</body>")
    print("</html>")


if __name__ == "__main__":
    main()
|