#!/usr/bin/python
# -*- coding=utf8 -*-
# 用来看天天中文 小说网 的小说更新章节
# 第一次写这种的,感觉有点乱,没有规划好,模块什么的也乱
# 外面带一个readChapter 文件和novelName 文件
# readChapter 文件可以为空,默认就载入 novelName中的前五章最新
# 更新,novelName 中为 书本号,书本名 书本号在快眼看书里查出来的
#
from sgmllib import SGMLParser
import urllib
import os
class htmlParser(SGMLParser):
    """Minimal HTML filter built on SGMLParser.

    It does two things only:
      * records the ``src`` attribute of the first <frame> tag seen
        (``self.src``), and
      * collects the text of the ``num``-th <script> element on the page
        into ``self.data`` (callers assign ``parser.num`` after
        construction to pick which script block they want).
    """

    def reset(self):
        SGMLParser.reset(self)
        self.data = []       # text chunks of the selected <script> block
        self.process = 0     # 1 while we are inside a <script> element
        # Bug fix: ``num`` was never initialized, so handle_data raised
        # AttributeError whenever the caller did not set parser.num
        # explicitly (getNovel's first parse never does).  countNum is
        # incremented before the comparison, so it is always >= 1 there
        # and a default of 0 matches nothing — callers that assign
        # parser.num keep their exact previous behavior.
        self.num = 0
        self.src = None      # src of the first <frame>, if any
        self.countNum = 0    # how many <script> tags have been opened

    def start_frame(self, attrs):
        # Take the first src attribute, if present.  The original
        # ``[v for k, v in attrs if k == 'src'][0]`` raised IndexError
        # on a src-less frame; now ``src`` simply stays None.
        for k, v in attrs:
            if k == 'src':
                self.src = v
                break

    def start_script(self, attrs):
        self.process = 1
        self.countNum += 1

    def handle_data(self, text):
        # Only keep text from the one script block the caller asked for.
        if self.process == 1 and self.countNum == self.num:
            self.data.append(text)

    def end_script(self):
        self.process = 0
def newChapterList(bookId):
    """Check whether *bookId* has new chapters and print/record them.

    Scrapes the qbooksky book-detail page, compares the latest five
    chapter entries against the last-read chapter stored in the
    ``readChapter`` file, prints any new chapter titles, and fills the
    module-global ``getNovelList[bookId]`` with the new chapter ids.
    Also updates the module-global ``readlist`` (book id -> last read
    chapter id).  Relies on globals ``namelist``, ``readlist`` and
    ``getNovelList`` being set up by the main script.
    """
    url="http://s1.qbooksky.com/Bookdetail.aspx?BookID="+bookId+"&SiteID=167&Level=0&History=6"
    urlfd = urllib.urlopen(url)
    parser = htmlParser()
    # The chapter index lives in the 2nd <script> block of the page.
    parser.num=2
    parser.feed(urlfd.read())
    newIndex=parser.data[0]
    fd=open('readChapter')
    hadRead=fd.read()
    # Strip the JS wrapper and split into the 5 most recent entries;
    # each entry looks like "chapterId,'title',..." after the '(' split.
    # NOTE(review): the [3:-3] / [1:6] offsets are tied to the exact JS
    # emitted by the site — confirm against a live page before changing.
    newIndex=newIndex[3:-3].split('(')[1:6]
    #print newIndex
    print '\n'+10*'*'+namelist[bookId]+10*'*'
    getNovelList[bookId]=[]
    if len(hadRead):
        # readChapter format: "bookId,chapterId\n" per line; rebuild the
        # readlist mapping from it.
        hadRead=hadRead.replace('\n',',').split(',')[:-1]
        for i in range(0,len(hadRead),2):
            readlist[hadRead[i]]=hadRead[i+1]
        lastChapter=readlist[bookId]
        # Find where the last-read chapter sits in the fresh index; the
        # loop variable ``i`` deliberately leaks out and is reused below
        # as "number of new entries".
        # NOTE(review): if lastChapter is not found at all, ``i`` ends at
        # len(newIndex)-1 (4), not 5, so one new chapter would be
        # silently skipped — confirm whether that case can occur.
        for i in range(len(newIndex)):
            if newIndex[i].find(lastChapter)!=-1:
                break
        if i==0:
            print u'\n 没有新的章节更新!'
        else:
            # Remember the newest chapter id (redundantly overwritten by
            # the loop below, which ends on the same newest entry).
            readlist[bookId]=newIndex[0].split(',')[0]
            print readlist[bookId]
    else:
        # No read history at all: treat every fetched entry as new.
        i=6
    #print i
    #print lastChapter
    if i:
        # Keep only the ``i`` unseen entries, oldest first.
        newIndex=newIndex[:i]
        newIndex.reverse()
        for i in range(len(newIndex)):
            # entry -> [chapterId, "'title'"]; print title without quotes.
            newIndex[i]=newIndex[i].split(',')[:2]
            print newIndex[i][1][1:-1]
            # print newIndex[i][0]
            readlist[bookId]=newIndex[i][0]
            getNovelList[bookId].append(newIndex[i][0])
    #print readlist[bookId]
    fd.close()
    urlfd.close()
    #return url
def getNovel(bookId,chapterId,filefd):
    """Download one chapter of *bookId* and append it to *filefd*.

    Follows the viewer page's <frame> to the real content page, reads
    its 5th <script> block, and either fetches the chapter text (gbk,
    re-encoded to utf-8) or — for image-only chapters — downloads the
    images with wget and writes a placeholder note instead.
    """
    url="http://v1.bookintake.com/ViewBook.aspx?SiteID=167&BookID="+bookId+"&ChapterID="+chapterId
    urlfd=urllib.urlopen(url)
    parser = htmlParser()
    parser.feed(urlfd.read())
    # The viewer page is just a frameset; the frame src is the real page.
    url=parser.src
    urlfd.close()
    urlfd=urllib.urlopen(url)
    parser= htmlParser()
    # The chapter payload lives in the 5th <script> block.
    parser.num=5
    parser.feed(urlfd.read())
    s=parser.data[0]
    #print s
    # Split on double quotes: s[0] is the JS call name, odd slots are
    # the quoted URL arguments.
    s=s.split('"')
    # print s
    if s[0][:9]=='outputTxt':
        # Text chapter: fetch the .txt resource referenced by the script.
        txturl="http://www.365zw.com/novel"+s[1]
        txturlfd=urllib.urlopen(txturl)
        stxt=txturlfd.read()
        stxt=stxt.replace('<br/><br/>','\n')
        # Site serves gbk; normalize to utf-8 for the output file.
        stxt=stxt.decode('gbk').encode('utf-8')
        #print stxt
        # NOTE(review): [16:-3] trims the JS wrapper around the text —
        # offsets are site-specific; confirm against a live response.
        stxt='\n**********start**********\n'+'\n'+stxt[16:-3]+'\n'
        txturlfd.close()
    else:
        # Image chapter: every odd slot of ``s`` is an image URL;
        # download each via wget into the current directory.
        s=s[:-1]
        for i in range(1,len(s),2):
            imageurl="wget http://www.365zw.com/novel"+s[i]
            os.system(imageurl)
        stxt='\n**********start**********\n'+'\n 此章为图片版 已下载\n'
    #print stxt
    # writelines on a str writes it char by char in Python 2; kept as-is.
    filefd.writelines(stxt)
    urlfd.close()
##########################main###################
# Script entry: clean old output, load the book list from ``novelName``
# (format: "bookId,bookName" per line), check each book for updates,
# append any new chapters to tempnovel.txt, and rewrite ``readChapter``
# with the newest chapter id per book.
try:
    # Best-effort cleanup of previous run's output and downloaded images.
    os.system('rm tempnovel.txt 2>/dev/null')
    os.system('rm *_*.gif 2>/dev/null')
    os.system('clear')
except :
    pass
readlist={}       # book id -> last read chapter id
namelist={}       # book id -> book name
getNovelList={}   # book id -> list of newly found chapter ids
fd=open('novelName')
# NOTE(review): mode 'wa' is nonstandard — on CPython 2 it behaves as
# plain 'w' (extra chars ignored by fopen); presumably 'w' was meant.
filefd=open('tempnovel.txt','wa')
name=fd.read()
# Flatten "id,name\n" lines into [id, name, id, name, ...].
name=name.replace('\n',',').split(',')[:-1]
for i in range(0,len(name),2):
    namelist[name[i]]=name[i+1]
# For each book: detect new chapters, then download each one.
# (Python 2 dict.keys() returns a list, so indexing works here.)
for i in range(len(namelist)):
    newChapterList(namelist.keys()[i])
    if len(getNovelList[namelist.keys()[i]]):
        for j in range(len(getNovelList[namelist.keys()[i]])):
            getNovel(namelist.keys()[i],getNovelList[namelist.keys()[i]][j],filefd)
# Persist the updated read positions, one "bookId,chapterId" per line.
chapterfd=open('readChapter','w')
for i in range(len(readlist)):
    chapterfd.writelines(readlist.keys()[i]+','+readlist[readlist.keys()[i]]+'\n')
chapterfd.close()
filefd.close()
fd.close()