寻找建设银行的挂马网站

时间：2008-05-26 来源：CUDev

通过 Google 的搜索结果来查找建设银行的挂马网站。
做法是：以建设银行首页的标题（title）作为关键词进行检索，用下面的脚本提取搜索结果中的主机名，去重后写入 blacklist.txt，然后再进行人工排除。

#!/usr/bin/env python

import urllib2
import re

# Google advanced-search query.  ``as_q`` is the percent-encoded title of the
# China Construction Bank home page, ``as_occt=title`` asks for matches in the
# page title only, and ``num=100`` asks for up to 100 results on one page.
URL = "http://www.google.cn/search?as_q=%E6%AC%A2%E8%BF%8E%E8%AE%BF%E9%97%AE%E4%B8%AD%E5%9B%BD%E5%BB%BA%E8%AE%BE%E9%93%B6%E8%A1%8C%E7%BD%91%E7%AB%99&complete=1&hl=zh-CN&newwindow=1&num=100&btnG=Google+%E6%90%9C%E7%B4%A2&as_epq=&as_oq=&as_eq=&lr=&cr=&as_ft=i&as_filetype=&as_qdr=all&as_occt=title&as_dt=i&as_sitesearch=&as_rights="

# Captures the href of every result link in the result-page markup.
LINK_PATTERN = re.compile(r'<h2 class=r><a href="(.+?)" target')

# Captures the host part of an href.  The greedy ``.*`` prefix makes the
# search settle on the *last* "http://...​/" in the string, so for a redirect
# style link (http://redirector/...?q=http://target/...) the target host wins.
HOST_PATTERN = re.compile(r'.*http://(.+?)/')

# File that receives one host name per line.
OUTPUT_FILE = 'blacklist.txt'


def extract_hosts(htmlsource):
    """Return the distinct host names linked from a result page.

    htmlsource -- the raw HTML text of the search result page.

    Hosts are returned in the order they first appear.  Links whose href does
    not contain an "http://host/" part are skipped instead of aborting the
    whole run.
    """
    hosts = []
    seen = set()
    for href in LINK_PATTERN.findall(htmlsource):
        match = HOST_PATTERN.search(href)
        if match is None:
            # No "http://.../" in this href; nothing to blacklist.
            continue
        host = match.group(1)
        if host not in seen:
            seen.add(host)
            hosts.append(host)
    return hosts


def fetch_page(url):
    """Download *url* with a browser-like User-agent and return the body."""
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    sock = opener.open(url)
    # The urllib2 response object is not a context manager on Python 2, so
    # close it explicitly even when read() fails.
    try:
        return sock.read()
    finally:
        sock.close()


def main():
    """Fetch the search results and write the linked hosts to OUTPUT_FILE."""
    hosts = extract_hosts(fetch_page(URL))
    with open(OUTPUT_FILE, 'w') as outfile:
        outfile.writelines(host + '\n' for host in hosts)


if __name__ == '__main__':
    main()

相关阅读更多 +