python正则表达式提取中文字符串(转)
时间:2010-07-20 来源:yemuda
因为想留作以后研究 Python 中文编码之用,所以转贴于此。原文的代码貌似不完整,以后如果研究透彻,会加以补充。转贴如下:
=========================================
要做国际化的版本,需要把中文字符串都提取出来翻译,写了这个python脚本。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
##
# @file match-chinese.py
# @brief 利用正则表达式提取中文字符串
# @author Jesse
# @version 1.0
# @date 2009-11-20
'''
import os,string
import re
# Directory setting. Its consumer is not visible in this excerpt;
# presumably the root directory that gets scanned -- TODO confirm.
directory = "."
# Output file name. Presumably the file opened as the global ``fd_output``
# that match_chinese() writes to -- TODO confirm against the rest of the script.
output = "chinese.txt"
def match_chinese(s, f, i):
    """Report double-quoted literals in *s* that contain high-bit bytes.

    A literal is reported when, between a pair of double quotes, it holds
    at least three consecutive characters in the range \\x80-\\xff (the
    article uses this to spot UTF-8 encoded Chinese text in byte strings).

    s -- the text to search (one line of a source file in the caller).
    f -- file name, used only in the report line.
    i -- line number, used only in the report line.

    Each hit is written to the module-level ``fd_output`` object as
    ``"<f> ( <i> ): <literal>\\n"``. ``fd_output`` is not defined in this
    function; it must be an open, writable file-like object.
    """
    global fd_output
    quoted_high_bytes = re.compile('\"[^\"]*[\x80-\xff]{3}[^\"]*\"')
    for literal in quoted_high_bytes.findall(s):
        fd_output.write("%s ( %d ): %s\n" % (f, i, literal))
def istextfile(filename, blocksize=512):
    """Return istext() applied to the first *blocksize* characters of a file.

    filename  -- path of the file to probe.
    blocksize -- how much of the file head to sample (default 512).

    The file is opened in a ``with`` block so the handle is closed
    deterministically instead of being left to the garbage collector.
    """
    with open(filename) as fd:
        head = fd.read(blocksize)
    return istext(head)
def istext(s):
    """Heuristically decide whether the string *s* looks like text.

    Returns 0 (not text) when *s* contains a NUL character or when more
    than 30% of its characters fall outside the "text" set (printable
    ASCII 32..126 plus newline, carriage return, tab and backspace).
    Returns 1 otherwise; an empty string counts as text.
    """
    if "\0" in s:
        return 0
    if not s:
        return 1
    text_characters = frozenset(
        "".join(map(chr, range(32, 127))) + "\n\r\t\b"
    )
    non_text = sum(1 for ch in s if ch not in text_characters)
    # Force true division: under Python 2 ``int / int`` truncates to 0 or 1,
    # which made the 30% threshold fire only when *every* character was
    # non-text.
    if float(non_text) / len(s) > 0.30:
        return 0
    return 1
def read_file(f):
if not istextfile(f):
print "%s is NOT a text file" % (f)
return
#if not re.match(r".*\.[c|h]$", f):
# return
i = 0
fd = open(f,'r')
buff = fd.readlines()
for line in buff:
i + 1 2
=========================================
要做国际化的版本,需要把中文字符串都提取出来翻译,写了这个python脚本。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
##
# @file match-chinese.py
# @brief 利用正则表达式提取中文字符串
# @author Jesse
# @version 1.0
# @date 2009-11-20
'''
import os,string
import re
# Directory setting. Its consumer is not visible in this excerpt;
# presumably the root directory that gets scanned -- TODO confirm.
directory = "."
# Output file name. Presumably the file opened as the global ``fd_output``
# that match_chinese() writes to -- TODO confirm against the rest of the script.
output = "chinese.txt"
def match_chinese(s, f, i):
    """Report double-quoted literals in *s* that contain high-bit bytes.

    A literal is reported when, between a pair of double quotes, it holds
    at least three consecutive characters in the range \\x80-\\xff (the
    article uses this to spot UTF-8 encoded Chinese text in byte strings).

    s -- the text to search (one line of a source file in the caller).
    f -- file name, used only in the report line.
    i -- line number, used only in the report line.

    Each hit is written to the module-level ``fd_output`` object as
    ``"<f> ( <i> ): <literal>\\n"``. ``fd_output`` is not defined in this
    function; it must be an open, writable file-like object.
    """
    global fd_output
    quoted_high_bytes = re.compile('\"[^\"]*[\x80-\xff]{3}[^\"]*\"')
    for literal in quoted_high_bytes.findall(s):
        fd_output.write("%s ( %d ): %s\n" % (f, i, literal))
def istextfile(filename, blocksize=512):
    """Return istext() applied to the first *blocksize* characters of a file.

    filename  -- path of the file to probe.
    blocksize -- how much of the file head to sample (default 512).

    The file is opened in a ``with`` block so the handle is closed
    deterministically instead of being left to the garbage collector.
    """
    with open(filename) as fd:
        head = fd.read(blocksize)
    return istext(head)
def istext(s):
    """Heuristically decide whether the string *s* looks like text.

    Returns 0 (not text) when *s* contains a NUL character or when more
    than 30% of its characters fall outside the "text" set (printable
    ASCII 32..126 plus newline, carriage return, tab and backspace).
    Returns 1 otherwise; an empty string counts as text.
    """
    if "\0" in s:
        return 0
    if not s:
        return 1
    text_characters = frozenset(
        "".join(map(chr, range(32, 127))) + "\n\r\t\b"
    )
    non_text = sum(1 for ch in s if ch not in text_characters)
    # Force true division: under Python 2 ``int / int`` truncates to 0 or 1,
    # which made the 30% threshold fire only when *every* character was
    # non-text.
    if float(non_text) / len(s) > 0.30:
        return 0
    return 1
def read_file(f):
if not istextfile(f):
print "%s is NOT a text file" % (f)
return
#if not re.match(r".*\.[c|h]$", f):
# return
i = 0
fd = open(f,'r')
buff = fd.readlines()
for line in buff:
i + 1 2
相关阅读 更多 +