#!/usr/bin/env python

成都创新互联是一家以成都网站建设、网页设计、品牌设计、软件运维、seo优化排名、小程序App开发等移动开发为一体互联网公司。已累计为成都混凝土搅拌罐等众行业中小客户提供优质的互联网建站和软件开发服务。
# -*- coding:GBK -*-
"""汉字处理的工具:
判断unicode是否是汉字,数字,英文,或者其他字符。
全角符号转半角符号。"""
__author__="internetsweeper zhengbin0713@gmail.com"
__date__="2007-08-04"
def is_chinese(uchar):
"""判断一个unicode是否是汉字"""
if uchar = u'\u4e00' and uchar=u'\u9fa5':
return True
else:
return False
def is_number(uchar):
"""判断一个unicode是否是数字"""
if uchar = u'\u0030' and uchar=u'\u0039':
return True
else:
return False
def is_alphabet(uchar):
"""判断一个unicode是否是英文字母"""
if (uchar = u'\u0041' and uchar=u'\u005a') or (uchar = u'\u0061' and uchar=u'\u007a'):
return True
else:
return False
def is_other(uchar):
"""判断是否非汉字,数字和英文字符"""
if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
return True
else:
return False
def B2Q(uchar):
"""半角转全角"""
inside_code=ord(uchar)
if inside_code0x0020 or inside_code0x7e: #不是半角字符就返回原来的字符
return uchar
if inside_code==0x0020: #除了空格其他的全角半角的公式为:半角=全角-0xfee0
inside_code=0x3000
else:
inside_code+=0xfee0
return unichr(inside_code)
def Q2B(uchar):
"""全角转半角"""
inside_code=ord(uchar)
if inside_code==0x3000:
inside_code=0x0020
else:
inside_code-=0xfee0
if inside_code0x0020 or inside_code0x7e: #转完之后不是半角字符返回原来的字符
return uchar
return unichr(inside_code)
def stringQ2B(ustring):
"""把字符串全角转半角"""
return "".join([Q2B(uchar) for uchar in ustring])
def uniform(ustring):
"""格式化字符串,完成全角转半角,大写转小写的工作"""
return stringQ2B(ustring).lower()
def string2List(ustring):
"""将ustring按照中文,字母,数字分开"""
retList=[]
utmp=[]
for uchar in ustring:
if is_other(uchar):
if len(utmp)==0:
continue
else:
retList.append("".join(utmp))
utmp=[]
else:
utmp.append(uchar)
if len(utmp)!=0:
retList.append("".join(utmp))
return retList
if __name__=="__main__":
#test Q2B and B2Q
for i in range(0x0020,0x007F):
print Q2B(B2Q(unichr(i))),B2Q(unichr(i))
#test uniform
ustring=u'中国 人名a高频A'
ustring=uniform(ustring)
ret=string2List(ustring)
print ret
以上转自
这个问题是做 MkIV 预处理程序时搞定的,就是把一个混合了中英文混合字串分离为英文与中文的子字串,譬如,将 ”我的 English 学的不好“ 分离为 “我的"、" English ” 与 "学的不好" 三个子字串。
1. 中英文混合字串的统一编码表示中英文混合字串处理最省力的办法就是把它们的编码都转成 Unicode,让一个汉字与一个英文字母的内存位宽都是相等的。这个工作用 Python 来做,比较合适,因为 Python 内码采用的是 Unicode,并且为了支持 Unicode 字串的操作,Python 做了一个 Unicode 内建模块,把 string 对象的全部方法重新实现了一遍,另外提供了 Codecs 对象,解决各种编码类型的字符串解码与编码问题。
譬如下面的 Python 代码,可实现 UTF-8 编码的中英文混合字串向 Unicode 编码的转换:# -*-
coding:utf-8 -*-
a = "我的 English 学的不好"
print type(a),len (a), a
b = unicode (a, "utf-8")
print type(b), len (b), b字符串 a 是 utf-8 编码,使用 python 的内建对象 unicode 可将其转换为 Unicode 编码的字符串 b。上述代码执行后的输出结果如下所示,比较字串 a 与字串 b 的长度,显然 len (b) 的输出结果是合理的。type 'str' 27 我的 English 学的不好
type 'unicode' 15 我的 English 学的不好要注意的一个问题是 Unicode 虽然号称是“统一码”,不过也是存在着两种形式,即:
UCS-2:为 16 位码,具有 2^16 = 65536 个码位; UCS-4:为 32 位码,目前的规定是其首字节的首位为 0,因此具有 2^31 = 2147483648 个码位,不过现在的只使用了 0x00000000 - 0x0010FFFF 之间的码位,共 1114112 个。
使用Python sys 模块提供的一个变量 maxunicode 的值可以判断当前 Python 所使用的 Unicode 类型是 UCS-2 的还是 UCS-4 的。import sys
print sys.maxunicode若 sys.maxunicode 的值为 1114111,即为 UCS-4;若为 65535,则为 UCS-2。
2. 中英文混合字串的分离一旦中英文字串的编码获得统一,那么对它们进行分裂就是很简单的事情了。首先要为中文字串与英文字串分别准备一个收集器,使用两个空的字串对象即可,譬如 zh_gather 与 en_gather;然后要准备一个列表对象,负责按分离次序存储 zh_gather 与 en_gather 的值。下面这个 Python 函数接受一个中英文混合的 Unicode 字串,并返回存储中英文子字串的列表。def split_zh_en (zh_en_str):
zh_en_group = []
zh_gather = ""
en_gather = ""
zh_status = False
for c in zh_en_str:
if not zh_status and is_zh (c):
zh_status = True
if en_gather != "":
zh_en_group.append ([mark["en"],en_gather])
en_gather = ""
elif not is_zh (c) and zh_status:
zh_status = False
if zh_gather != "":
zh_en_group.append ([mark["zh"], zh_gather])
if zh_status:
zh_gather += c
else:
en_gather += c
zh_gather = ""
if en_gather != "":
zh_en_group.append ([mark["en"],en_gather])
elif zh_gather != "":
zh_en_group.append ([mark["zh"],zh_gather])
return zh_en_group上述代码所实现的功能细节是:对中英文混合字串 zh_en_str 的遍历过程中进行逐字识别,若当前字符为中文,则将其添加到 zh_gather 中;若当前字符为英文,则将其添加到 en_gather 中。zh_status 表示中英文字符的切换状态,当 zh_status 的值发生突变时,就将所收集的中文子字串或英文子字串添加到 zh_en_group 中去。
判断字串 zh_en_str 中是否包含中文字符的条件语句中出现了一个 is_zh () 函数,它的实现如下:def is_zh (c):
x = ord (c)
# Punct Radicals
if x = 0x2e80 and x = 0x33ff:
return True
# Fullwidth Latin Characters
elif x = 0xff00 and x = 0xffef:
return True
# CJK Unified Ideographs
# CJK Unified Ideographs Extension A
elif x = 0x4e00 and x = 0x9fbb:
return True
# CJK Compatibility Ideographs
elif x = 0xf900 and x = 0xfad9:
return True
# CJK Unified Ideographs Extension B
elif x = 0x20000 and x = 0x2a6d6:
return True
# CJK Compatibility Supplement
elif x = 0x2f800 and x = 0x2fa1d:
return True
else:
return False这段代码来自 jjgod 写的 XeTeX 预处理程序。
对于分离出来的中文子字串与英文子字串,为了使用方便,在将它们存入 zh_en_group 列表时,我对它们分别做了标记,即 mark["zh"] 与 mark["en"]。mark 是一个 dict 对象,其定义如下:mark = {"en":1, "zh":2}如果要对 zh_en_group 中的英文字串或中文字串进行处理时,标记的意义在于快速判定字串是中文的,还是英文的,譬如:for str in zh_en_group:
if str[0] = mark["en"]:
do somthing
else:
do somthing
1、首选运行工具 makepy.py。
2、这样就可以查看 C# dll的 com导出的 py文件了。
Python编程makepy.py代码如下:
# -*- coding: mbcs -*-
# Created by makepy.py version 0.5.00
# By python version 2.5.4 (r254:67916, Dec 23 2008, 15:10:54) [MSC v.1310 32 bit (Intel)]
# From type library 'XGSharpLib.tlb'
""""""
makepy_version = '0.5.00'
python_version = 0x20504f0
import win32com.client.CLSIDToClass, pythoncom, pywintypes
import win32com.client.util
from pywintypes import IID
from win32com.client import Dispatch
# The following 3 lines may need tweaking for the particular server
# Candidates are pythoncom.Missing, .Empty and .ArgNotFound
defaultNamedOptArg=pythoncom.Empty
defaultNamedNotOptArg=pythoncom.Empty
defaultUnnamedArg=pythoncom.Empty
CLSID = IID('{B38EF2FA-4639-40BC-B97C-7908CED04FF9}')
MajorVersion = 1
MinorVersion = 0
LibraryFlags = 8
LCID = 0x0
from win32com.client import DispatchBaseClass
class IComXGSharpLib(DispatchBaseClass):
CLSID = IID('{D1965A94-0271-4C48-8AF6-2A56E256808B}')
coclass_clsid = IID('{2302D874-18FE-4281-B329-9517F1BC8311}')
def EncryptMd5(self, orignPwd=defaultNamedNotOptArg):
# Result is a Unicode object
return self._oleobj_.InvokeTypes(2, LCID, 1, (8, 0), ((8, 1),),orignPwd
)
def EncryptSHA1(self, orignPwd=defaultNamedNotOptArg):
# Result is a Unicode object
return self._oleobj_.InvokeTypes(1, LCID, 1, (8, 0), ((8, 1),),orignPwd
)
_prop_map_get_ = {
}
_prop_map_put_ = {
}
class _Object(DispatchBaseClass):
CLSID = IID('{65074F7F-63C0-304E-AF0A-D51741CB4A8D}')
coclass_clsid = IID('{2302D874-18FE-4281-B329-9517F1BC8311}')
def Equals(self, obj=defaultNamedNotOptArg):
return self._oleobj_.InvokeTypes(1610743809, LCID, 1, (11, 0), ((12, 1),),obj
)
def GetHashCode(self):
return self._oleobj_.InvokeTypes(1610743810, LCID, 1, (3, 0), (),)
# Result is of type _Type
def GetType(self):
ret = self._oleobj_.InvokeTypes(1610743811, LCID, 1, (13, 0), (),)
if ret is not None:
# See if this IUnknown is really an IDispatch
try:
ret = ret.QueryInterface(pythoncom.IID_IDispatch)
except pythoncom.error:
return ret
ret = Dispatch(ret, u'GetType', '{BCA8B44D-AAD6-3A86-8AB7-03349F4F2DA2}')
return ret
_prop_map_get_ = {
"ToString": (0, 2, (8, 0), (), "ToString", None),
}
_prop_map_put_ = {
}
# Default property for this class is 'ToString'
def __call__(self):
return self._ApplyTypes_(*(0, 2, (8, 0), (), "ToString", None))
def __unicode__(self, *args):
try:
return unicode(self.__call__(*args))
except pythoncom.com_error:
return repr(self)
def __str__(self, *args):
return str(self.__unicode__(*args))
def __int__(self, *args):
return int(self.__call__(*args))
from win32com.client import CoClassBaseClass
# This CoClass is known by the name 'XGSharpLib.Security'
class Security(CoClassBaseClass): # A CoClass
CLSID = IID('{2302D874-18FE-4281-B329-9517F1BC8311}')
coclass_sources = [
]
coclass_interfaces = [
IComXGSharpLib,
_Object,
]
default_interface = IComXGSharpLib
IComXGSharpLib_vtables_dispatch_ = 1
IComXGSharpLib_vtables_ = [
(( u'EncryptSHA1' , u'orignPwd' , u'pRetVal' , ), 1, (1, (), [ (8, 1, None, None) ,
(16392, 10, None, None) , ], 1 , 1 , 4 , 0 , 28 , (3, 0, None, None) , 0 , )),
(( u'EncryptMd5' , u'orignPwd' , u'pRetVal' , ), 2, (2, (), [ (8, 1, None, None) ,
(16392, 10, None, None) , ], 1 , 1 , 4 , 0 , 32 , (3, 0, None, None) , 0 , )),
]
_Object_vtables_dispatch_ = 1
_Object_vtables_ = [
(( u'ToString' , u'pRetVal' , ), 0, (0, (), [ (16392, 10, None, None) , ], 1 , 2 , 4 , 0 , 28 , (3, 0, None, None) , 0 , )),
(( u'Equals' , u'obj' , u'pRetVal' , ), 1610743809, (1610743809, (), [ (12, 1, None, None) ,
(16395, 10, None, None) , ], 1 , 1 , 4 , 0 , 32 , (3, 0, None, None) , 0 , )),
(( u'GetHashCode' , u'pRetVal' , ), 1610743810, (1610743810, (), [ (16387, 10, None, None) , ], 1 , 1 , 4 , 0 , 36 , (3, 0, None, None) , 0 , )),
(( u'GetType' , u'pRetVal' , ), 1610743811, (1610743811, (), [ (16397, 10, None, "IID('{BCA8B44D-AAD6-3A86-8AB7-03349F4F2DA2}')") , ], 1 , 1 , 4 , 0 , 40 , (3, 0, None, None) , 0 , )),
]
RecordMap = {
}
CLSIDToClassMap = {
'{D1965A94-0271-4C48-8AF6-2A56E256808B}' : IComXGSharpLib,
'{2302D874-18FE-4281-B329-9517F1BC8311}' : Security,
'{65074F7F-63C0-304E-AF0A-D51741CB4A8D}' : _Object,
}
CLSIDToPackageMap = {}
win32com.client.CLSIDToClass.RegisterCLSIDsFromDict( CLSIDToClassMap )
VTablesToPackageMap = {}
VTablesToClassMap = {
'{D1965A94-0271-4C48-8AF6-2A56E256808B}' : 'IComXGSharpLib',
'{65074F7F-63C0-304E-AF0A-D51741CB4A8D}' : '_Object',
}
NamesToIIDMap = {
'_Object' : '{65074F7F-63C0-304E-AF0A-D51741CB4A8D}',
'IComXGSharpLib' : '{D1965A94-0271-4C48-8AF6-2A56E256808B}',
}
3、举例:Python调用C# dll中的 MD5加密方法。
Python:常用函数封装:
def is_chinese(uchar):
"""判断一个unicode是否是汉字"""
if uchar = u'\u4e00' and uchar=u'\u9fa5':
return True
else:
return False
def is_number(uchar):
"""判断一个unicode是否是数字"""
if uchar = u'\u0030' and uchar=u'\u0039':
return True
else:
return False
def is_alphabet(uchar):
"""判断一个unicode是否是英文字母"""
if (uchar = u'\u0041' and uchar=u'\u005a') or (uchar = u'\u0061' and uchar=u'\u007a'):
return True
else:
return False
def is_other(uchar):
"""判断是否非汉字,数字和英文字符"""
if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
return True
else:
return False
def B2Q(uchar):
"""半角转全角"""
inside_code=ord(uchar)
if inside_code0x0020 or inside_code0x7e: #不是半角字符就返回原来的字符
return uchar
if inside_code==0x0020: #除了空格其他的全角半角的公式为:半角=全角-0xfee0
inside_code=0x3000
else:
inside_code+=0xfee0
return unichr(inside_code)
def Q2B(uchar):
"""全角转半角"""
inside_code=ord(uchar)
if inside_code==0x3000:
inside_code=0x0020
else:
inside_code-=0xfee0
if inside_code0x0020 or inside_code0x7e: #转完之后不是半角字符返回原来的字符
return uchar
return unichr(inside_code)
def stringQ2B(ustring):
"""把字符串全角转半角"""
return "".join([Q2B(uchar) for uchar in ustring])
def uniform(ustring):
"""格式化字符串,完成全角转半角,大写转小写的工作"""
return stringQ2B(ustring).lower()
def string2List(ustring):
"""将ustring按照中文,字母,数字分开"""
retList=[]
utmp=[]
for uchar in ustring:
if is_other(uchar):
if len(utmp)==0:
continue
else:
retList.append("".join(utmp))
utmp=[]
else:
utmp.append(uchar)
if len(utmp)!=0:
retList.append("".join(utmp))
return retList
#encoding=utf-8
import time
from collections import defaultdict
from operator import itemgetter
# Data in BC.txt:
# a b
# a h
# b c
# b h
# h i
# h g
# g i
# g f
# c f
# c i
# c d
# d f
# d e
# f e
class Graph:
def __init__(self):
self.Graph = defaultdict(set)
self.NodesNum = 0
def MakeLink(self,filename,separator):
with open(filename,'r') as graphfile:
for line in graphfile:
nodeA,nodeB = line.strip().split(separator)
self.Graph[nodeA].add(nodeB)
self.Graph[nodeB].add(nodeA)
self.NodesNum = len(self.Graph)
def BetweennessCentrality(self):
betweenness = dict.fromkeys(self.Graph,0.0)
for s in self.Graph:
# 1. compute the length and number of shortest paths from node s
S = []
P = {}
for v in self.Graph:
P[v]=[]
Sigma = dict.fromkeys(self.Graph,0.0)
Sigma[s] = 1.0
D = {}
D[s] = 0
Q = [s]
# use BFS to find single source shortest paths
while Q:
v = Q.pop(0)
S.append(v)
Dv = D[v]
for w in self.Graph[v]:
# w found for the first time?
if w not in D:
Q.append(w)
D[w] = D[v] + 1
# shortest path to w via v
if D[w] == D[v] + 1:
Sigma[w] += Sigma[v]
P[w].append(v)
# 2. sum all pair-dependencies of node s
delta = dict.fromkeys(self.Graph,0.0)
# S returns vertices in order of non-increasing distance from s
while S:
w = S.pop()
coeff = (1.0+delta[w])/Sigma[w]
for v in P[w]:
delta[v] += Sigma[v]*coeff
if w != s:
betweenness[w] += delta[w]
scale = 1.0/((self.NodesNum-1)*(self.NodesNum-2))
for v in betweenness:
betweenness[v] *= scale
betweenness = [(node,bc) for node,bc in betweenness.iteritems()]
betweenness = sorted(betweenness,key=itemgetter(1),reverse=True)
return betweenness
if __name__=='__main__':
separator = '\t'
file = 'C:\\Users\\Administrator\\Desktop\\BC.txt'
begin = time.time()
myGraph = Graph()
myGraph.MakeLink(file,separator)
print myGraph.BetweennessCentrality()
print 'Time:',time.time()-begin,' seconds'