编码还是编码
时间:2010-08-18 来源:stlaw
某个文本从 MySQL 的 latin1 表中导出,需要先按 gb18030 解码再转成 utf8;如果某处解码失败,就把失败位置的那一个字节改用 latin1 解码后转成 utf8,其余部分继续按 gb18030 转换:
===================
#!/usr/local/python/bin/python
#-*- encoding: utf-8 -*-
import traceback,sys,re
# Input: the raw dump, read as bytes so no implicit decoding takes place.
fh = open('1.out', 'rb')
# Output: receives the converted lines, written as bytes.
ofh = open('1.out.1', 'wb')
class MyException(Exception):
    """Base class for this script's errors; carries a ``(code, msg)`` pair."""

    def __init__(self, code, msg):
        """Store *code* and *msg* as the exception's ``args`` tuple."""
        # Delegating to Exception sets ``self.args`` to ``(code, msg)``.
        Exception.__init__(self, code, msg)
class NoPositionError(MyException):
    """Signals that no byte position could be extracted from a decode error."""
class NotUnicodeError(MyException):
    """Signals a conversion failure that is not a UnicodeDecodeError."""
def decodestr(todostr):
    """Re-encode a GB18030 byte string as UTF-8, salvaging undecodable bytes.

    The input is decoded as GB18030.  Whenever the decoder rejects a byte,
    that single byte is decoded as Latin-1 instead and decoding resumes with
    the byte that follows it, so the result always covers the whole input.

    Every failure prints the module-level counter ``i`` (the input line
    number maintained by the caller) so the affected lines can be located;
    ``None`` is printed when no such counter exists.

    Args:
        todostr: byte string to convert.

    Returns:
        The UTF-8 encoded byte string.

    Raises:
        NotUnicodeError: if the conversion fails for any reason other than
            a UnicodeDecodeError.
    """
    pieces = []
    rest = todostr
    # Iterate instead of recursing once per bad byte, so a line with many
    # undecodable bytes cannot exhaust the recursion limit.
    while True:
        try:
            pieces.append(rest.decode('gb18030').encode('utf8'))
            break
        except UnicodeDecodeError as err:
            print(globals().get('i'))  # record the line number of the failure
            # ``start`` is the offset of the first offending byte; no need to
            # parse it back out of the formatted error message.
            bad = err.start
            pieces.append(rest[:bad].decode('gb18030').encode('utf8'))
            # Slice (not index) so this is a byte string on Python 3 as well.
            pieces.append(rest[bad:bad + 1].decode('latin_1').encode('utf8'))
            rest = rest[bad + 1:]
        except Exception:
            print(globals().get('i'))
            kind, value = sys.exc_info()[:2]
            errstr = traceback.format_exception_only(kind, value)[0]
            raise NotUnicodeError('', errstr)
    return b''.join(pieces)
# Convert the input line by line.  ``i`` is the 1-based number of the line
# being processed; decodestr() reads it to report lines it had to salvage.
i = 0
# The ``with`` blocks close (and thereby flush) both files even when a
# conversion raises part-way through.
with fh:
    with ofh:
        for l in fh:
            i += 1
            # NOTE(review): strip() removes leading/trailing spaces and tabs
            # as well as the newline -- confirm that is wanted for this dump.
            nl = decodestr(l.strip())
            ofh.write(nl + b'\n')