备份百度空间Blog的Python程序

时间：2006-11-07 来源：linxh

#!/usr/bin/python

#BaiDu Blog Backuper v2
import urllib
import string
import re

def Save2File(url,fn):
    print "Retrieving: ",url;
    print "Save as: ",fn
    try:
        URLFile=urllib.urlopen(url)
    except IOError:
        print "\nCan not retrieve ",url,"!\nThe connection cannot be made!\n"
    else:
        HTMLText=URLFile.read()
        URLFile.close()
        flist=fn.split("/")
        fn=string.join(flist)
        flist=fn.split("\\")
        fn=string.join(flist)
        flist=fn.split(":")
        fn=string.join(flist)
        flist=fn.split("*")
        fn=string.join(flist)
        flist=fn.split("?")
        fn=string.join(flist)
        flist=fn.split("\"")
        fn=string.join(flist)
        flist=fn.split("<")
        fn=string.join(flist)
        flist=fn.split(">")
        fn=string.join(flist)
        HTMLFile=open(fn,"w")
        HTMLFile.write(HTMLText)
        HTMLFile.close()

def GetContent (url):
    try:
        URLFile=urllib.urlopen(url)
    except IOError:
        print "\nCan not retrieve ",url,"!\nThe connection cannot be made!\n"
    else:
        HTMLText=URLFile.read()
        URLFile.close()
        return HTMLText


if(__name__=="__main__"):
    list_base="http://hi.baidu.com/linxhchina/blog/index/"
    artical_base="http://hi.baidu.com/"
    rexp=re.compile(r'<a href="(/linxhchina/blog/item/.*?html)".*?>(.*?)</a></div>')
    queue=[];
    cond=True;
    i=0;
    while cond:
        cond=False
        list_url="%s%d" %(list_base,i)
        i=i+1
        #print list_url
        content=GetContent(list_url)
        #print content
        lines=content.split("\n")
        for line in lines:
            #print line
            a=rexp.search(line)
            if(a):
                cond=True
                queue.append(a.groups())
    print "<html>"
    print "<head>"
    print "<title>Baidu Blog List</title>"
    print "</head>"
    print "<body>"
    print "<font face=\"Arial, Helvetica\" size=4>"

    for q in queue:
        artical_url="%s%s" %(artical_base,q[0])
        fname=q[1]+".html"
        print "<a target=\"_blank\" href=\"%s\">%s</a><br>" %(artical_url,q[1])
        #Save2File(artical_url,fname)

    print "</font>"
    print "</body>"
    print "</html>"