BBS图片自动下载的脚本实现(v1.0)
时间:2008-11-16 来源:taoyuliang
今天Graduate版聚归来,已是晚上10点过了,昨天刚写了一个在水源上自动发文的脚本,所以又突然想写一个自动下载图片的Python脚本。
虽然类似的工具早有人写过(叫BBSPicSpider),不过它是用.NET来做的。反正就当学习Python的练手,于是开始试着编写。过程还是比较顺利,不过当中遇到很多小问题,查了很多Python的库文档加上百度,得以解决。今天太晚了,过两天再总结一下。
先把代码贴上了,不过这个是1.0版本,基本功能已经可以实现了,不过有一点小BUG,有待进一步研究改进,有兴趣的朋友可以交流一下:)
1 #!/usr/bin/python
2
3 # Download pictures from PPPerson @ bbs.sjtu.edu.cn
4
5 import re
6 import os
7 from urllib import ContentTooShortError
8 from urllib2 import URLError
9 import urllib, urllib2
10
11 def requestURL( url, datas, headers = None ) :
12 """Request a url"""
13
14 if not headers :
15 headers = { 'User-Agent' : 'Mozilla/3.0' }
16 # request
17 req = urllib2.Request( url, datas, headers )
18 # open url
19 try :
20 res = urllib2.urlopen( req )
21 except URLError, e :
22 if hasattr( e, 'reason' ) :
23 print "Failed to reach server: ", e.reason
24 elif hasattr( e, 'code' ) :
25 print "Can't fulfill the requset: ",e.code
26 else :
27 pass #print "Requset is successful\n"
28
29 return res
30
def getSubjectsURL( cmpPattern ) :
    """Fetch the PPPerson board index page and return the list of
    matches produced by `cmpPattern` (subject id / bbstcon query pairs).

    Returns an empty list when the board page cannot be fetched.
    """

    board_url = 'http://bbs.sjtu.edu.cn/bbstdoc'

    # POST data selecting the board to list
    datas = urllib.urlencode( { 'board' : 'PPPerson' } )

    res = requestURL( board_url, datas )
    if res is None :
        # request failed -- nothing to parse
        return []

    # extract every subject link from the board page
    return cmpPattern.findall( res.read() )
50
def getImagesURL( url, cmpPattern ) :
    """Return the unique image URLs found on one subject page.

    `url` is the bbstcon query string identifying the subject;
    `cmpPattern` is the compiled regex that captures image SRC paths.
    Returns an empty list when the page cannot be fetched.  Note that
    deduplicating via set() does not preserve the page order of images.
    """

    subject_url = 'http://bbs.sjtu.edu.cn/bbstcon'

    # the subject query string is sent as the POST body
    res = requestURL( subject_url, url )
    if res is None :
        # request failed -- no images to report
        return []

    # set() drops images that appear more than once on the page
    # (replaces the original manual dedup loop)
    return list( set( cmpPattern.findall( res.read() ) ) )
75
76 def downloadImage( imageURL, subID ) :
77 """Download images"""
78
79 # image url
80 image_url = 'http://bbs.sjtu.edu.cn' + imageURL
81
82 # create the directory to store images
83 # if not os.path.exists( './download' ) :
84 try :
85 os.makedirs( './download/' + subID )
86 except OSError :
87 pass
88 #print "Failed to create directories"
89
90
91 # get filename of image
92 filename = 'download/' + subID + '/' + imageURL.split( '/' )[-1]
93
94 # clear the cache that may have been built up
95 # by previous calls to urlretrieve()
96 urllib.urlcleanup()
97
98 # retrieve the image
99 try :
100 urllib.urlretrieve( image_url, filename )
101 except ContentTooShortError :
102 print "The data available was less than that of expected"
103 print "Downloading file %s was interrupted" \
104 % os.path.basename( filename )
105 else :
106 # get the size of file
107 size = os.path.getsize( filename ) / 1024
108 print ">>>File %s (%s Kb) was done..." % ( filename, size )
109
110
111 if __name__ == '__main__' :
112
113 # create compiled regular expression pattern
114 findSubjectsPattern = re.compile( \
115 r'<td>(\d+)<td>.*?<a\shref=bbstcon\?(.*?)>', re.I | re.DOTALL )
116 findImagesPattern = re.compile( r'<IMG\sSRC="(.*?)"\s>', re.I )
117
118 # get subjects' url list
119 subjectsList = getSubjectsURL( findSubjectsPattern )
120
121 print "Downloading begins...\n"
122
123 filecount = 1
124
125 for i in range( len( subjectsList ) ) :
126 # get images url list
127 print "\nSubject %s begins..." % subjectsList[i][0]
128 imagesList = getImagesURL( subjectsList[i][1], findImagesPattern )
129 # download all iamges
130 for j in range( len(imagesList) ) :
131 downloadImage( imagesList[j], subjectsList[i][0] )
132 filecount += 1
133
134 print "\nAll downloads were done"
135 print "%d files were downloaded totally\n" % filecount
136
虽然类似的工具早有人写过(叫BBSPicSpider),不过它是用.NET来做的。反正就当学习Python的练手,于是开始试着编写。过程还是比较顺利,不过当中遇到很多小问题,查了很多Python的库文档加上百度,得以解决。今天太晚了,过两天再总结一下。
先把代码贴上了,不过这个是1.0版本,基本功能已经可以实现了,不过有一点小BUG,有待进一步研究改进,有兴趣的朋友可以交流一下:)
1 #!/usr/bin/python
2
3 # Download pictures from PPPerson @ bbs.sjtu.edu.cn
4
5 import re
6 import os
7 from urllib import ContentTooShortError
8 from urllib2 import URLError
9 import urllib, urllib2
10
11 def requestURL( url, datas, headers = None ) :
12 """Request a url"""
13
14 if not headers :
15 headers = { 'User-Agent' : 'Mozilla/3.0' }
16 # request
17 req = urllib2.Request( url, datas, headers )
18 # open url
19 try :
20 res = urllib2.urlopen( req )
21 except URLError, e :
22 if hasattr( e, 'reason' ) :
23 print "Failed to reach server: ", e.reason
24 elif hasattr( e, 'code' ) :
25 print "Can't fulfill the requset: ",e.code
26 else :
27 pass #print "Requset is successful\n"
28
29 return res
30
def getSubjectsURL( cmpPattern ) :
    """Fetch the PPPerson board index page and return the list of
    matches produced by `cmpPattern` (subject id / bbstcon query pairs).

    Returns an empty list when the board page cannot be fetched.
    """

    board_url = 'http://bbs.sjtu.edu.cn/bbstdoc'

    # POST data selecting the board to list
    datas = urllib.urlencode( { 'board' : 'PPPerson' } )

    res = requestURL( board_url, datas )
    if res is None :
        # request failed -- nothing to parse
        return []

    # extract every subject link from the board page
    return cmpPattern.findall( res.read() )
50
def getImagesURL( url, cmpPattern ) :
    """Return the unique image URLs found on one subject page.

    `url` is the bbstcon query string identifying the subject;
    `cmpPattern` is the compiled regex that captures image SRC paths.
    Returns an empty list when the page cannot be fetched.  Note that
    deduplicating via set() does not preserve the page order of images.
    """

    subject_url = 'http://bbs.sjtu.edu.cn/bbstcon'

    # the subject query string is sent as the POST body
    res = requestURL( subject_url, url )
    if res is None :
        # request failed -- no images to report
        return []

    # set() drops images that appear more than once on the page
    # (replaces the original manual dedup loop)
    return list( set( cmpPattern.findall( res.read() ) ) )
75
76 def downloadImage( imageURL, subID ) :
77 """Download images"""
78
79 # image url
80 image_url = 'http://bbs.sjtu.edu.cn' + imageURL
81
82 # create the directory to store images
83 # if not os.path.exists( './download' ) :
84 try :
85 os.makedirs( './download/' + subID )
86 except OSError :
87 pass
88 #print "Failed to create directories"
89
90
91 # get filename of image
92 filename = 'download/' + subID + '/' + imageURL.split( '/' )[-1]
93
94 # clear the cache that may have been built up
95 # by previous calls to urlretrieve()
96 urllib.urlcleanup()
97
98 # retrieve the image
99 try :
100 urllib.urlretrieve( image_url, filename )
101 except ContentTooShortError :
102 print "The data available was less than that of expected"
103 print "Downloading file %s was interrupted" \
104 % os.path.basename( filename )
105 else :
106 # get the size of file
107 size = os.path.getsize( filename ) / 1024
108 print ">>>File %s (%s Kb) was done..." % ( filename, size )
109
110
111 if __name__ == '__main__' :
112
113 # create compiled regular expression pattern
114 findSubjectsPattern = re.compile( \
115 r'<td>(\d+)<td>.*?<a\shref=bbstcon\?(.*?)>', re.I | re.DOTALL )
116 findImagesPattern = re.compile( r'<IMG\sSRC="(.*?)"\s>', re.I )
117
118 # get subjects' url list
119 subjectsList = getSubjectsURL( findSubjectsPattern )
120
121 print "Downloading begins...\n"
122
123 filecount = 1
124
125 for i in range( len( subjectsList ) ) :
126 # get images url list
127 print "\nSubject %s begins..." % subjectsList[i][0]
128 imagesList = getImagesURL( subjectsList[i][1], findImagesPattern )
129 # download all iamges
130 for j in range( len(imagesList) ) :
131 downloadImage( imagesList[j], subjectsList[i][0] )
132 filecount += 1
133
134 print "\nAll downloads were done"
135 print "%d files were downloaded totally\n" % filecount
136
相关阅读 更多 +