图片提取程序(python)

时间：2006-12-16 来源：vinge

这个程序可以把html文件里的图片提取出来，重新生成一个按原顺序排列的只有图片的html文件，相信对那些爱收集图片的人会有用的。本来可以把它写成直接在网上抓图片的，不过怕网络连接处理不好，因此只对本地文件操作。感兴趣的人可以把htmldata模块抓下来看看，我放上资源中心。
写这个东西的时候，url的quote问题把我挡了很久：url会自动把特殊字符转成%xx的形式，常见的是空格变成%20。如果文件名里头有空格，在html里头的url就会是xxx%20xxx的形式；有个文件刚好有个%在名字里，变成了%25，弄了一整天，以为是编码的问题，最后还是mailing list上的人告诉我的，菜啊！

import urllib, htmldata, time,string,sys,shutil,re,os,stat
'''
   this program extracts the jpg files referenced by an html file and
   creates a simple html page to view the graphics.
   the arguments must be absolute directories.
'''
def extractjpg(url,targetpath,size,action='0'):
    """Build an images-only copy of a locally saved HTML page.

    Reads the page at ``url``, collects every ``<img>`` whose unquoted URL
    ends in ``.jpg``/``.jpeg``, exists on disk and is larger than ``size``
    bytes, and logs what was extracted and what was ignored.  Unless
    ``action`` is ``'0'`` it then writes a new HTML file of the same name
    into ``targetpath`` and copies the selected images into
    ``targetpath/<page>_files``.

    Parameters:
        url        -- path of the saved HTML file (opened with
                      ``urllib.urlopen``; this is Python 2 code).
        targetpath -- existing directory that receives the new page.
        size       -- minimum image size in bytes; smaller or missing
                      images are reported as ignored.
        action     -- ``'0'`` means dry run: log only, write nothing.

    Returns None.

    NOTE(review): the HTML fragments in the published source were stripped
    to empty strings by the page it was pasted into.  The markup emitted
    here (title, charset meta, ``<img>`` tags, dashed separators) is a
    reconstruction of the evident intent -- confirm against the author's
    original if it is available.
    """
    print('')
    print('----------------------------------------------------------')  # for log format

    pagename = url.split('/')[-1]
    # A browser-saved "page.html" keeps its resources in "page_files".
    # splitext only touches the real extension; the old regex '.htm.*?$'
    # matched the first "htm" anywhere in the path.
    jpgpath = os.path.splitext(url)[0] + '_files'
    filesubpath = jpgpath.split('/')[-1]
    newpath = targetpath + '/' + filesubpath

    contents = urllib.urlopen(url).read()
    title, charset = _head_info(contents)

    newfile = ['<html>\n<head>\n<title>' + title + '</title>\n']
    if charset:
        newfile.append('<meta http-equiv="Content-Type" content="'
                       + charset + '">\n')
    newfile.append('</head>\n<body>\n')

    jpgs = []      # base names of accepted images (logged)
    sources = []   # full path of each accepted image, parallel to jpgs
    ignore = []    # paths that were missing or not larger than `size`

    # now find all jpgs
    for u in htmldata.urlextract(contents, url):
        if u.tag_name != 'img':
            continue
        # unquote the url to the real "path/file" (%20 -> space, %25 -> %)
        filename = urllib.unquote(u.url)
        if filename.split('.')[-1] not in ('jpg', 'jpeg'):
            continue
        realname = filename.split('/')[-1]
        if os.path.exists(filename) and os.stat(filename)[stat.ST_SIZE] > size:
            jpgs.append(realname)
            sources.append(filename)
            # Quote again so names containing spaces or '%' stay valid URLs.
            newfile.append('<img src="'
                           + urllib.quote(filesubpath + '/' + realname)
                           + '">\n')
            newfile.append('<br>\n'
                           '---------------------------------------------------------\n'
                           '<br>\n')
        else:
            ignore.append(filename)
    newfile.append('</body>\n</html>\n')

    # print the log message
    print('@file||extracting from:: ' + pagename)
    print('@info||extracted ' + str(len(jpgs)) + ' image::')
    print(jpgs)
    if len(ignore) > 0:
        print('@wanning||ignore ' + str(len(ignore)) + ' image::')
        print(ignore)
    if action == '0':
        print("just test" + pagename)
        return

    # write the htmlfile and copy the jpgs to dist dir
    newhtmlname = targetpath + '/' + pagename
    htmlfile = open(newhtmlname, 'w', 102400)
    try:
        htmlfile.writelines(newfile)
    finally:
        # close even when the write fails
        htmlfile.close()

    if not os.path.exists(newpath):
        os.mkdir(newpath)
    # Copy from the path that was actually checked above rather than
    # assuming every image lives directly inside the "_files" directory.
    for source, realname in zip(sources, jpgs):
        shutil.copyfile(source, newpath + '/' + realname)


def _head_info(contents):
    """Return ``(title, charset_content)`` scanned from the page, '' if absent.

    ``title`` is the text item that follows the first ``title`` tag;
    ``charset_content`` is the ``content`` attribute of the first ``meta``
    tag whose content mentions ``charset``.
    """
    title = ''
    charset = ''
    title_seen = False
    expect_title = False
    for item in htmldata.tagextract(contents):
        if expect_title:
            # The item right after <title> is its text, unless the title
            # is empty and a tag tuple follows immediately.
            expect_title = False
            if not isinstance(item, tuple):
                title = item
                continue
        if isinstance(item, tuple):
            if item[0] == 'title' and not title_seen:
                title_seen = True
                expect_title = True
                continue
            if item[0] == 'meta' and not charset:
                try:
                    content = item[1]['content']
                except (KeyError, IndexError, TypeError):
                    content = ''
                if 'charset' in content:
                    charset = content
        if title_seen and charset:
            # both header pieces found; no need to scan the body
            break
    return title, charset

#------------------------------------------------------------------------------
if __name__ == '__main__':
    # Usage: <script> SRCDIR DISTDIR [ACTION]
    # Runs extractjpg on every .htm/.html file directly inside SRCDIR.
    if len(sys.argv) < 3:
        print('usage: ' + sys.argv[0] + ' srcdir distdir [action]')
        sys.exit(1)
    srcdir = sys.argv[1]
    disdir = sys.argv[2]
    # The third argument is passed straight through as extractjpg's
    # `action`; when omitted, fall back to that function's own default
    # ('0', the log-only dry run) instead of raising IndexError.
    if len(sys.argv) > 3:
        test = sys.argv[3]
    else:
        test = '0'
    size = 1024  # minimum image size in bytes handed to extractjpg

    if not os.path.isdir(srcdir):
        print('source path is incorrect' + sys.argv[1])
        sys.exit(1)
    if not os.path.isdir(disdir):
        # report the target argument, not the source one
        print('target path is incorrect' + sys.argv[2])
        sys.exit(1)

    targetfiles = []
    for elem in os.listdir(srcdir):
        fullname = os.path.join(srcdir, elem)
        # Test the joined path: the bare name is relative to the current
        # working directory, not to srcdir.
        if os.path.isfile(fullname) and elem.split('.')[-1] in ('htm', 'html'):
            targetfiles.append(fullname)

    for i, elem in enumerate(targetfiles):
        print('@process file ' + str(i + 1))
        extractjpg(elem, disdir, size, test)