QQ消息记录统计程序
时间:2007-07-27 来源:wibrst
目前功能:分析每个发言者的发言次数并排序列出
根据开关sInputType来决定从拷贝记录还是导出记录的形式分析。
主文件:organizeQQMsg.py
排序模块:sortDict.py
---------------------------------------------------------------------------
# organizeQQMsg.py
import sys
import os
import re
import sortDict
def file2List(_uf):
    """Read a chat-log file and report whether it began with a UTF-8 BOM.

    _uf -- path of the file to read.

    Returns a tuple (lines, bUTF8).  lines is the list produced by
    readlines(), line endings included.  bUTF8 is 1 when the first line
    started with the three characters '\\xef\\xbb\\xbf' (the UTF-8 byte-order
    mark as seen in a byte string); that prefix is removed from the first
    line.  Otherwise bUTF8 is 0.  An empty file yields ([], 0).
    """
    bUTF8 = 0
    f = open(_uf, 'r')
    try:
        l = f.readlines()
    finally:
        # Release the handle even when reading fails.
        f.close()
    # An empty file has no first line to inspect.
    if l and l[0][:3] == '\xef\xbb\xbf':
        bUTF8 = 1
        l[0] = l[0][3:]
    return l, bUTF8
def getFilePath():
    """Prompt on standard input until the answer names an existing file.

    Returns the path exactly as it was typed.
    """
    while True:
        uInput = raw_input("input source file path:")
        if os.path.isfile(uInput):
            return uInput
def getMsgsDictExport(aRecordRaw):
    """Count messages per author in an exported chat log.

    aRecordRaw -- sequence of text lines.  A line that starts with
    'YYYY-MM-DD HH:MM:SS <author>' is a message title; every other line is
    ignored.

    Returns a dict mapping each author (the text after the timestamp, up to
    the end of the line) to the number of title lines carrying that author.
    """
    cpTitle = re.compile(r'\d{4}-\d{2}-\d{2}\ \d{2}\:\d{2}\:\d{2}\ (.*)')
    dAuthorWords = {}
    for sLine in aRecordRaw:
        m = cpTitle.match(sLine)
        if not m:
            continue
        sAuthor = m.group(1)
        # Start from 0 so an author's first message counts once, not twice.
        dAuthorWords[sAuthor] = dAuthorWords.get(sAuthor, 0) + 1
    return dAuthorWords
def getMsgsDictHistory(aRecordRaw):
    """Group the messages of a copied chat history by author.

    aRecordRaw -- sequence of text lines.  A line of the form
    '<author> HH:MM:SS' is a message title.  The line right after a title
    is always taken as message text and is never tested as a title.

    Returns a dict mapping each author to a list of records
    [iIdx, iStart, iEnd]:
      iIdx   -- 1-based running number of the message over the whole log,
      iStart -- index of the line following the title,
      iEnd   -- index of the line before the next title, or the index of
                the last input line for the final message.
    Input without any title line (including empty input) yields {}.
    """
    cpTitle = re.compile(r'(.*)\ \d{2}\:\d{2}\:\d{2}')
    dAuthorWords = {}
    aWords = None       # record of the message whose end is not known yet
    bSkipNext = False   # true for the single line that follows a title
    iIdx = 0
    for i, sLine in enumerate(aRecordRaw):
        if bSkipNext:
            bSkipNext = False
            continue
        m = cpTitle.match(sLine)
        if not m:
            continue
        # A new title closes the previous message, if there is one.
        if aWords is not None:
            aWords.append(i - 1)
        iIdx += 1
        aWords = [iIdx, i + 1]
        dAuthorWords.setdefault(m.group(1), []).append(aWords)
        bSkipNext = True
    # Close the final message; nothing to close when no title was found.
    if aWords is not None:
        aWords.append(len(aRecordRaw) - 1)
    return dAuthorWords
def appendElement(e):
    """Append one line of output to the module-level list aResult.

    e -- text of the line.  When the module-level flag bUTF8 is true the
    text is converted from UTF-8 to GBK first.  Leading and trailing
    whitespace is stripped in both cases.

    Reads the globals aResult and bUTF8, which the script's __main__
    section assigns before calling this function.
    """
    if bUTF8:
        e = e.decode('utf8').encode('gbk')
    aResult.append(e.strip())
def writeResult(aResult, uInput):
    """Write the result lines next to the input file.

    aResult -- list of strings; they are joined with '\\n' (no trailing
               newline is added).
    uInput  -- path of the analysed file.  The output path is its absolute
               path with '_stat' inserted before the extension, e.g.
               'sample.txt' -> 'sample_stat.txt'.
    """
    b, e = os.path.splitext(os.path.abspath(uInput))
    uOutput = b + '_stat' + e
    f = open(uOutput, 'w')
    try:
        f.write('\n'.join(aResult))
    finally:
        # Close the handle even when the write fails.
        f.close()
if __name__ == "__main__":
    # Script entry point: read the log, build per-author statistics and
    # write them to '<input>_stat<ext>'.
    bModeOnlyTitle = 1  # do not display msgs
    sInputType = 'export'  # export history
    uInput = 'sample.txt'
    # uInput = getFilePath()  # enable to ask for the path interactively
    # aResult and bUTF8 are module globals read by appendElement().
    aRecordRaw, bUTF8 = file2List(uInput)
    aResult = []
    if sInputType == 'export':
        dAuthorCounts = getMsgsDictExport(aRecordRaw)
        # Each sorted item is [author, count].
        for item in sortDict.getListSortDict(dAuthorCounts, 1):
            appendElement('%-20s [%d]' % (item[0], item[1]))
    elif sInputType == 'history':
        dAuthorWords = getMsgsDictHistory(aRecordRaw)
        # Each sorted item is [author, records, count].
        for item in sortDict.getListSortDict(dAuthorWords, 2):
            sSuffix = '' if bModeOnlyTitle else ':'
            appendElement('%-20s [%d]' % (item[0], item[2]) + sSuffix)
            if bModeOnlyTitle:
                continue
            # Each record r is [message number, first line index, end index].
            for r in item[1]:
                appendElement('%02d:\t%s' % (r[0], aRecordRaw[r[1]]))
                # The end index r[2] itself is not printed.
                for j in range(r[1] + 1, r[2]):
                    appendElement('\t' + aRecordRaw[j])
            # Separator after each author's messages.
            appendElement('-------------------' + os.linesep)
    writeResult(aResult, uInput)
---------------------------------------------------------------------------
# sortDict.py
import random
def getRdmDict():
    """Return a demo dict mapping 'a'..'m' to random integers in [1, 100]."""
    iCodeBase = ord('a')
    return dict((chr(iCodeBase + i), random.randint(1, 100))
                for i in range(13))
def getListSortDict(d, iMethod=1):
    """Turn an author dict into a list sorted by ascending message count.

    d       -- dict keyed by author.
    iMethod -- 1 (default): values are counts; items are [author, count].
               2: values are lists of message records; items are
               [author, records, len(records)].
               Any other value returns an empty list.

    Authors with equal counts keep the order in which the dict yields them
    (list.sort is stable).
    """
    if iMethod == 1:
        aSort = [[sAuthor, d[sAuthor]] for sAuthor in d]
        aSort.sort(key=lambda aItem: aItem[1])
    elif iMethod == 2:
        aSort = [[sAuthor, d[sAuthor], len(d[sAuthor])] for sAuthor in d]
        aSort.sort(key=lambda aItem: aItem[2])
    else:
        aSort = []
    return aSort
def insertElement1(aSort, sAuthor, iTimes):  # [sAuthor,iTimes] format
    """Insert [sAuthor, iTimes] into aSort, keeping ascending count order.

    aSort is modified in place.  The new item goes before the first
    existing item whose count is strictly greater, so items with equal
    counts stay in insertion order; with no such item it is appended.
    """
    aEntry = [sAuthor, iTimes]
    for i, aOther in enumerate(aSort):
        if iTimes < aOther[1]:
            aSort.insert(i, aEntry)
            return
    aSort.append(aEntry)
def insertElement2(aSort, sAuthor, aWords):  # [sAuthor,aWords,iTimes] format
    """Insert [sAuthor, aWords, len(aWords)] into aSort by ascending count.

    aSort is modified in place.  The new item goes before the first
    existing item whose count (third field) is strictly greater, so items
    with equal counts stay in insertion order; with no such item it is
    appended.
    """
    iTimes = len(aWords)
    aEntry = [sAuthor, aWords, iTimes]
    for i, aOther in enumerate(aSort):
        if iTimes < aOther[2]:
            aSort.insert(i, aEntry)
            return
    aSort.append(aEntry)
if __name__ == '__main__':
    # Self-test: sort a random demo dict by count and show both forms.
    d = getRdmDict()
    # The sort method must be given; 1 selects the [key, count] format.
    a = getListSortDict(d, 1)
    print(d)
    print(a)
根据开关sInputType来决定从拷贝记录还是导出记录的形式分析。
主文件:organizeQQMsg.py
排序模块:sortDict.py
---------------------------------------------------------------------------
# organizeQQMsg.py
import sys
import os
import re
import sortDict
def file2List(_uf):
    """Read a chat-log file and report whether it began with a UTF-8 BOM.

    _uf -- path of the file to read.

    Returns a tuple (lines, bUTF8).  lines is the list produced by
    readlines(), line endings included.  bUTF8 is 1 when the first line
    started with the three characters '\\xef\\xbb\\xbf' (the UTF-8 byte-order
    mark as seen in a byte string); that prefix is removed from the first
    line.  Otherwise bUTF8 is 0.  An empty file yields ([], 0).
    """
    bUTF8 = 0
    f = open(_uf, 'r')
    try:
        l = f.readlines()
    finally:
        # Release the handle even when reading fails.
        f.close()
    # An empty file has no first line to inspect.
    if l and l[0][:3] == '\xef\xbb\xbf':
        bUTF8 = 1
        l[0] = l[0][3:]
    return l, bUTF8
def getFilePath():
    """Prompt on standard input until the answer names an existing file.

    Returns the path exactly as it was typed.
    """
    while True:
        uInput = raw_input("input source file path:")
        if os.path.isfile(uInput):
            return uInput
def getMsgsDictExport(aRecordRaw):
    """Count messages per author in an exported chat log.

    aRecordRaw -- sequence of text lines.  A line that starts with
    'YYYY-MM-DD HH:MM:SS <author>' is a message title; every other line is
    ignored.

    Returns a dict mapping each author (the text after the timestamp, up to
    the end of the line) to the number of title lines carrying that author.
    """
    cpTitle = re.compile(r'\d{4}-\d{2}-\d{2}\ \d{2}\:\d{2}\:\d{2}\ (.*)')
    dAuthorWords = {}
    for sLine in aRecordRaw:
        m = cpTitle.match(sLine)
        if not m:
            continue
        sAuthor = m.group(1)
        # Start from 0 so an author's first message counts once, not twice.
        dAuthorWords[sAuthor] = dAuthorWords.get(sAuthor, 0) + 1
    return dAuthorWords
def getMsgsDictHistory(aRecordRaw):
    """Group the messages of a copied chat history by author.

    aRecordRaw -- sequence of text lines.  A line of the form
    '<author> HH:MM:SS' is a message title.  The line right after a title
    is always taken as message text and is never tested as a title.

    Returns a dict mapping each author to a list of records
    [iIdx, iStart, iEnd]:
      iIdx   -- 1-based running number of the message over the whole log,
      iStart -- index of the line following the title,
      iEnd   -- index of the line before the next title, or the index of
                the last input line for the final message.
    Input without any title line (including empty input) yields {}.
    """
    cpTitle = re.compile(r'(.*)\ \d{2}\:\d{2}\:\d{2}')
    dAuthorWords = {}
    aWords = None       # record of the message whose end is not known yet
    bSkipNext = False   # true for the single line that follows a title
    iIdx = 0
    for i, sLine in enumerate(aRecordRaw):
        if bSkipNext:
            bSkipNext = False
            continue
        m = cpTitle.match(sLine)
        if not m:
            continue
        # A new title closes the previous message, if there is one.
        if aWords is not None:
            aWords.append(i - 1)
        iIdx += 1
        aWords = [iIdx, i + 1]
        dAuthorWords.setdefault(m.group(1), []).append(aWords)
        bSkipNext = True
    # Close the final message; nothing to close when no title was found.
    if aWords is not None:
        aWords.append(len(aRecordRaw) - 1)
    return dAuthorWords
def appendElement(e):
    """Append one line of output to the module-level list aResult.

    e -- text of the line.  When the module-level flag bUTF8 is true the
    text is converted from UTF-8 to GBK first.  Leading and trailing
    whitespace is stripped in both cases.

    Reads the globals aResult and bUTF8, which the script's __main__
    section assigns before calling this function.
    """
    if bUTF8:
        e = e.decode('utf8').encode('gbk')
    aResult.append(e.strip())
def writeResult(aResult, uInput):
    """Write the result lines next to the input file.

    aResult -- list of strings; they are joined with '\\n' (no trailing
               newline is added).
    uInput  -- path of the analysed file.  The output path is its absolute
               path with '_stat' inserted before the extension, e.g.
               'sample.txt' -> 'sample_stat.txt'.
    """
    b, e = os.path.splitext(os.path.abspath(uInput))
    uOutput = b + '_stat' + e
    f = open(uOutput, 'w')
    try:
        f.write('\n'.join(aResult))
    finally:
        # Close the handle even when the write fails.
        f.close()
if __name__ == "__main__":
    # Script entry point: read the log, build per-author statistics and
    # write them to '<input>_stat<ext>'.
    bModeOnlyTitle = 1  # do not display msgs
    sInputType = 'export'  # export history
    uInput = 'sample.txt'
    # uInput = getFilePath()  # enable to ask for the path interactively
    # aResult and bUTF8 are module globals read by appendElement().
    aRecordRaw, bUTF8 = file2List(uInput)
    aResult = []
    if sInputType == 'export':
        dAuthorCounts = getMsgsDictExport(aRecordRaw)
        # Each sorted item is [author, count].
        for item in sortDict.getListSortDict(dAuthorCounts, 1):
            appendElement('%-20s [%d]' % (item[0], item[1]))
    elif sInputType == 'history':
        dAuthorWords = getMsgsDictHistory(aRecordRaw)
        # Each sorted item is [author, records, count].
        for item in sortDict.getListSortDict(dAuthorWords, 2):
            sSuffix = '' if bModeOnlyTitle else ':'
            appendElement('%-20s [%d]' % (item[0], item[2]) + sSuffix)
            if bModeOnlyTitle:
                continue
            # Each record r is [message number, first line index, end index].
            for r in item[1]:
                appendElement('%02d:\t%s' % (r[0], aRecordRaw[r[1]]))
                # The end index r[2] itself is not printed.
                for j in range(r[1] + 1, r[2]):
                    appendElement('\t' + aRecordRaw[j])
            # Separator after each author's messages.
            appendElement('-------------------' + os.linesep)
    writeResult(aResult, uInput)
---------------------------------------------------------------------------
# sortDict.py
import random
def getRdmDict():
    """Return a demo dict mapping 'a'..'m' to random integers in [1, 100]."""
    iCodeBase = ord('a')
    return dict((chr(iCodeBase + i), random.randint(1, 100))
                for i in range(13))
def getListSortDict(d, iMethod=1):
    """Turn an author dict into a list sorted by ascending message count.

    d       -- dict keyed by author.
    iMethod -- 1 (default): values are counts; items are [author, count].
               2: values are lists of message records; items are
               [author, records, len(records)].
               Any other value returns an empty list.

    Authors with equal counts keep the order in which the dict yields them
    (list.sort is stable).
    """
    if iMethod == 1:
        aSort = [[sAuthor, d[sAuthor]] for sAuthor in d]
        aSort.sort(key=lambda aItem: aItem[1])
    elif iMethod == 2:
        aSort = [[sAuthor, d[sAuthor], len(d[sAuthor])] for sAuthor in d]
        aSort.sort(key=lambda aItem: aItem[2])
    else:
        aSort = []
    return aSort
def insertElement1(aSort, sAuthor, iTimes):  # [sAuthor,iTimes] format
    """Insert [sAuthor, iTimes] into aSort, keeping ascending count order.

    aSort is modified in place.  The new item goes before the first
    existing item whose count is strictly greater, so items with equal
    counts stay in insertion order; with no such item it is appended.
    """
    aEntry = [sAuthor, iTimes]
    for i, aOther in enumerate(aSort):
        if iTimes < aOther[1]:
            aSort.insert(i, aEntry)
            return
    aSort.append(aEntry)
def insertElement2(aSort, sAuthor, aWords):  # [sAuthor,aWords,iTimes] format
    """Insert [sAuthor, aWords, len(aWords)] into aSort by ascending count.

    aSort is modified in place.  The new item goes before the first
    existing item whose count (third field) is strictly greater, so items
    with equal counts stay in insertion order; with no such item it is
    appended.
    """
    iTimes = len(aWords)
    aEntry = [sAuthor, aWords, iTimes]
    for i, aOther in enumerate(aSort):
        if iTimes < aOther[2]:
            aSort.insert(i, aEntry)
            return
    aSort.append(aEntry)
if __name__ == '__main__':
    # Self-test: sort a random demo dict by count and show both forms.
    d = getRdmDict()
    # The sort method must be given; 1 selects the [key, count] format.
    a = getListSortDict(d, 1)
    print(d)
    print(a)
相关阅读 更多 +