Downloading VOA MP3s automatically with Python
Date: 2010-08-19 Source: Jerry.Kuan
Because I need them for studying English, I often download VOA MP3s from the web, and the site I use most is http://www.51voa.com/.
To download an MP3 from that site, you have to pick an article by hand, open it, and then pick the MP3 file inside it. That is fine for a single file, but downloading every MP3 from a given period means repeating the same dull clicks over and over. Can Python be used to build a tool that downloads the VOA MP3s automatically?
The design is as follows (see the sketch after this list):
1. Open the http://www.51voa.com/ homepage, parse the HTML, extract the homepage's list of recently updated VOA listening files, and build a <file name, download URL> dictionary.
2. Filter that dictionary by the current date, keeping only the VOA MP3s published today.
3. Iterate over the filtered dictionary and download each file.
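In outline, the whole flow looks like the sketch below; it assumes the functions and constants defined later in this article (getURLContent, getVOAURLs, filterbyDate, findMP3FileInURL, downLoadFile, VOA_URL, DOWNLOAD_DIR) are in scope:

content = getURLContent(VOA_URL)        # fetch the homepage HTML
pages = getVOAURLs(content)             # step 1: <file name, page url> dict
todays = filterbyDate(pages, None)      # step 2: keep only today's entries
for name, page in todays.items():       # step 3: locate and fetch each MP3
    for mp3 in findMP3FileInURL(page):
        multiThreadDownloadTool.downLoadFile(mp3, os.path.join(DOWNLOAD_DIR, name + '.mp3'))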
The techniques used:
1. HTML parsing. The standard library offers HTMLParser and SGMLParser, and there are third-party parsers such as BeautifulSoup (which handles both HTML and XML well); this article uses BeautifulSoup (see the first sketch below).
2. Downloading the MP3s with urllib. To improve throughput the download runs on several threads, and the Range parameter in the request header lets each thread fetch its own slice of the file, so the parts can be downloaded cooperatively (see the second sketch below).
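As a taste of BeautifulSoup (version 3, the one used in the code below), here is a minimal sketch; the HTML fragment is made up for illustration:

from BeautifulSoup import BeautifulSoup

# parse a small fragment and list every <a> tag's link and text
fragment = '<span id="list"><li><a href="/a.html">A</a></li><li><a href="/b.html">B</a></li></span>'
soup = BeautifulSoup(fragment)
for a in soup.findAll('a'):
    print a['href'], a.string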
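And a minimal sketch of a ranged request with urllib2 (the MP3 URL is the same one that appears in the test code below):

import urllib2

# fetch only the first kilobyte of the file via the HTTP Range header
req = urllib2.Request(r'http://www.51voa.com/path.asp?url=/201008/hennessy_africa_wildlife_18aug10-32b.mp3')
req.add_header('Range', 'bytes=0-1023')
resp = urllib2.urlopen(req)
print 'got %d bytes' % len(resp.read())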
The code follows:
1. The multi-threaded download code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A multi-threaded downloading tool.
"""
import sys
import os
import time
import urllib
from threading import Thread

class MyWorkThread(Thread, urllib.FancyURLopener):
    """
    Multi-threaded downloading class.
    run() is a virtual method of Thread.
    """
    def __init__(self, threadname, url, filename, ranges = 0):
        Thread.__init__(self, name = threadname)
        urllib.FancyURLopener.__init__(self)
        self.name = threadname
        self.url = url
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0

    def run(self):
        """
        Virtual function in Thread.
        """
        try:
            # resume from whatever this part has already written
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        # rebuild the start point
        self.startpoint = self.ranges[0] + self.downloaded
        # if this part is already complete
        if self.startpoint >= self.ranges[1]:
            print 'Part %s has been downloaded over.' % self.filename
            return
        self.oneTimeSize = 8 * 1024  # 8 KB per read
        print 'task %s will download from %d to %d' % (self.name, self.startpoint, self.ranges[1])
        self.addheader('Range', 'bytes=%d-%d' % (self.startpoint, self.ranges[1]))
        self.urlhandle = self.open(self.url)
        data = self.urlhandle.read(self.oneTimeSize)
        while data:
            filehandle = open(self.filename, 'ab+')
            filehandle.write(data)
            filehandle.close()
            self.downloaded += len(data)
            data = self.urlhandle.read(self.oneTimeSize)

def GetUrlFileSize(url):
    """Read the Content-Length header to learn the file size."""
    urlHandler = urllib.urlopen(url)
    headers = urlHandler.info().headers
    length = 0
    for header in headers:
        if header.find('Length') != -1:
            length = int(header.split(':')[-1].strip())
    return length

def SpliteBlocks(totalsize, blocknumber):
    """Split totalsize bytes into blocknumber (start, end) ranges."""
    blocksize = totalsize / blocknumber
    ranges = []
    for i in range(0, blocknumber - 1):
        ranges.append((i * blocksize, i * blocksize + blocksize - 1))
    # the last block takes whatever remains
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges

def isLive(tasks):
    for task in tasks:
        if task.isAlive():
            return True
    return False

def downLoadFile(url, output, blocks = 6):
    sys.stdout.write('Begin to download from %s\n' % url)
    sys.stdout.flush()
    size = GetUrlFileSize(url)
    ranges = SpliteBlocks(size, blocks)
    threadname = ["thread_%d" % i for i in range(0, blocks)]
    filename = ["tmpfile_%d" % i for i in range(0, blocks)]
    tasks = []
    for i in range(0, blocks):
        task = MyWorkThread(threadname[i], url, filename[i], ranges[i])
        task.setDaemon(True)
        task.start()
        tasks.append(task)
    time.sleep(2)
    while isLive(tasks):
        downloaded = sum([task.downloaded for task in tasks])
        process = downloaded / float(size) * 100
        show = u'\rFilesize: %d Downloaded: %d Completed: %.2f%%' % (size, downloaded, process)
        sys.stdout.write(show)
        sys.stdout.flush()
        time.sleep(1)
    # stitch the temporary part files together into the final output
    output = formatFileName(output)
    filehandle = open(output, 'wb+')
    for i in filename:
        f = open(i, 'rb')
        filehandle.write(f.read())
        f.close()
        os.remove(i)
    filehandle.close()
    sys.stdout.write("Completed!\n")
    sys.stdout.flush()

def formatFileName(filename):
    """Strip characters that are illegal in Windows file names."""
    if isinstance(filename, str):
        header, tail = os.path.split(filename)
        if tail != '':
            illegal = ('\\', '/', ':', '*', '?', '"', '<', '>', '|')
            for char in illegal:
                if tail.find(char) != -1:
                    tail = tail.replace(char, '')
        filename = os.path.join(header, tail)
        return filename
    else:
        return 'None'

if __name__ == '__main__':
    url = r'http://www.51voa.com/path.asp?url=/201008/hennessy_africa_wildlife_18aug10-32b.mp3'
    output = r"D:\Voa\Study:'Shoot to Kill' Policy in Africa's Parks Abuses Human Rights.mp3"
    downLoadFile(url, output, blocks = 4)
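A quick check of how SpliteBlocks() carves a file into ranges, with the function above in scope:

# a 1000-byte file split into 4 blocks of 250 bytes each
print SpliteBlocks(1000, 4)
# [(0, 249), (250, 499), (500, 749), (750, 999)]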
2. The VOA page-parsing code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File downloading from the web: parse 51voa.com, pick out today's
MP3 files and download them.
"""
import urllib2
import os
import time
import re
import sys
from BeautifulSoup import BeautifulSoup
import multiThreadDownloadTool

VOA_URL = r'http://www.51voa.com'
DOWNLOAD_DIR = r'D:/Voa'

def getURLContent(url):
    """
    Get the content of the url, starting at the <html> tag so that the
    DOCTYPE declaration (which trips up BeautifulSoup) is skipped.
    """
    page = urllib2.urlopen(url)
    data = page.read()
    page.close()
    index = data.find('<html')
    return data[index:]

def getVOAURLs(content):
    """
    Find the VOA article urls in the homepage content and
    return a <file name, page url> dictionary.
    """
    urls = {}
    soup = BeautifulSoup(content)
    divs = soup.findAll('div', {'id': 'rightContainer'})
    neededDiv = None
    if len(divs) >= 1:
        neededDiv = divs[0]
    if neededDiv != None:
        neededSpan = neededDiv.find('span', {'id': 'list'})
        lis = neededSpan.findAll('li')
        for li in lis:
            needAs = li.findAll('a')
            # the last <a> of each <li> carries the article name and link
            urls[needAs[-1].string] = VOA_URL + needAs[-1]['href']
    print "getVOAURLs() urls count is", len(urls)
    return urls

def filterbyDate(urls, date):
    """
    Filter the urls by date; passing None means today.
    """
    neededURLs = {}
    currentDate = time.localtime(time.time())
    currentDateStr = "%s-%s-%s" % (currentDate.tm_year, currentDate.tm_mon, currentDate.tm_mday)
    if date != None:
        currentDateStr = date
    for url in urls.keys():
        name = url.strip()
        publishDate = name[-10:-1]  # each name ends with "(YYYY-M-DD)"
        if publishDate == currentDateStr:
            neededURLs[name] = urls[url]
            print 'find', name
    print 'After filter, the count is', len(neededURLs)
    return neededURLs

def findMP3FileInURL(url):
    """
    Find the MP3 file links inside an article page.
    """
    print 'parse the content of', url
    urls = []
    # the MP3 links all have the form /path.asp?url=/.../xxx.mp3
    p = re.compile(r'/path.asp\?url=[-\w/]*\.mp3')
    content = getURLContent(url)
    matchLinks = p.findall(content)
    for link in matchLinks:
        tmp = VOA_URL + link
        if tmp not in urls:  # skip duplicates
            urls.append(tmp)
    print 'Current count of mp3 files is', len(urls)
    return urls

def getHTMLFile(url, file_name):
    """Save the raw page at url into a local file (handy for debugging)."""
    ifile = urllib2.urlopen(url)
    content = ifile.read()
    local_file = open(file_name, 'w')
    local_file.write(content)
    local_file.close()

def downloadFile(url, fileName2Store):
    """
    Download the file from url and store it in DOWNLOAD_DIR under
    the name fileName2Store (single-threaded alternative).
    """
    try:
        full_path = os.path.join(DOWNLOAD_DIR, fileName2Store)
        print 'begin to download url to', full_path
        if os.path.isfile(full_path):
            print 'the file', full_path, 'already exists, so just skip it!'
        else:
            print '\tDownloading the mp3 file...',
            data = urllib2.urlopen(url).read()
            print 'Done'
            print '\tWriting data into file...',
            f = open(full_path, 'wb')
            f.write(data)
            f.close()
            print 'Done'
    except Exception, ex:
        print 'some exceptions occur when downloading', ex

if __name__ == "__main__":
    try:
        context = getURLContent(VOA_URL)
        print 'Begin to get download information, it may cost some minutes, please wait...'
        files2download = getVOAURLs(context)
        neededDownload = filterbyDate(files2download, None)
        neededDownloadMp3s = {}
        for name in neededDownload.keys():
            fullURL = neededDownload[name]
            formatedName = name[:-11].strip()  # drop the trailing "(YYYY-M-DD)"
            mp3Names = findMP3FileInURL(fullURL)
            if len(mp3Names) == 1:
                # only one mp3 file on this page, so use the formatted name
                neededDownloadMp3s[formatedName] = mp3Names[0]
            else:
                # several mp3 files: name each one after its own file name
                for mp3 in mp3Names:
                    index_begin = mp3.rfind('/')
                    index_end = mp3.rfind('.')
                    tmpName = mp3[index_begin + 1 : index_end]
                    neededDownloadMp3s[tmpName] = mp3
        print 'Now, the mp3 files are:'
        print neededDownloadMp3s
        # download the files
        for filename in neededDownloadMp3s.keys():
            try:
                full_path = os.path.join(DOWNLOAD_DIR, filename) + '.mp3'
                multiThreadDownloadTool.downLoadFile(neededDownloadMp3s[filename], full_path)
            except Exception, ex:
                print 'Some exceptions occur when downloading file from %s, exception messages are %s' % (neededDownloadMp3s[filename], ex)
    except Exception, ex:
        print 'Exception caught, tracebacks are:', sys.exc_info(), ex
    print 'download all completed!'
    raw_input("Press any key to continue...")
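Note that the MP3 links themselves are picked out with a regular expression rather than BeautifulSoup. A small demonstration of the pattern used in findMP3FileInURL(), run on a hand-made fragment:

import re

p = re.compile(r'/path.asp\?url=[-\w/]*\.mp3')
fragment = '<a href="/path.asp?url=/201008/mercer_australia_election_16aug10-32b.mp3">Download</a>'
print p.findall(fragment)
# ['/path.asp?url=/201008/mercer_australia_election_16aug10-32b.mp3']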
Things to note:
While parsing the HTML with BeautifulSoup, I found that it does not cope well with the DOCTYPE declaration

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

and often fails to parse the page. So, for convenience, the raw source is pre-processed first, and only the data from the <html> tag onward is handed to BeautifulSoup. I have not yet worked out exactly why BeautifulSoup trips over the DOCTYPE; if anyone knows, please drop me a line.
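The workaround in isolation, with an illustrative page:

# chop off everything before the <html> tag so BeautifulSoup never
# sees the DOCTYPE declaration (this is what getURLContent() does)
data = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html><body>hello</body></html>'
data = data[data.find('<html'):]
print data  # <html><body>hello</body></html>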