forked from zhanghe06/python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchar.py
More file actions
211 lines (163 loc) · 4.35 KB
/
char.py
File metadata and controls
211 lines (163 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# coding=utf-8
__author__ = 'zhanghe'
"""
字符处理的工具:
判断unicode是否是汉字,数字,英文,或者其他字符。
全角符号转半角符号。
"""
def is_alphabet(uchar):
"""
判断一个unicode是否是英文字母
"""
if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
return True
else:
return False
def is_chinese(uchar):
"""
判断一个unicode是否是汉字
"""
if u'\u4e00' <= uchar <= u'\u9fa5':
return True
else:
return False
def is_number(uchar):
"""
判断一个unicode是否是数字
"""
if u'\u0030' <= uchar <= u'\u0039':
return True
else:
return False
def is_other(uchar):
"""
判断是否非汉字,数字和英文字符
"""
if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
return True
else:
return False
def b2q(uchar):
"""
半角转全角
"""
inside_code = ord(uchar)
if inside_code < 0x0020 or inside_code > 0x7e: # 不是半角字符就返回原来的字符
return uchar
if inside_code == 0x0020: # 除了空格其他的全角半角的公式为:半角=全角-0xfee0
inside_code = 0x3000
else:
inside_code += 0xfee0
return unichr(inside_code)
def q2b(uchar):
"""
全角转半角
"""
inside_code = ord(uchar)
if inside_code == 0x3000:
inside_code = 0x0020
else:
inside_code -= 0xfee0
if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符
return uchar
return unichr(inside_code)
def string_q2b(ustring):
"""
把字符串全角转半角
"""
return "".join([q2b(uchar) for uchar in ustring])
def uniform(ustring):
"""
格式化字符串,完成全角转半角,大写转小写的工作
"""
return string_q2b(ustring).lower()
def string2list(ustring):
"""
将ustring按照中文,字母,数字分开
"""
ret_list = []
u_tmp = []
for uchar in ustring:
if is_other(uchar):
if len(u_tmp) == 0:
continue
else:
ret_list.append("".join(u_tmp))
u_tmp = []
else:
u_tmp.append(uchar)
if len(u_tmp) != 0:
ret_list.append("".join(u_tmp))
return ret_list
def get_first_char(s):
"""
获取字符串首字母数 (如果是中文 获取拼音首字母)
"""
if s is None or s == '':
return ''
s = s.decode('utf-8', 'ignore')
# 如果字符串首字符是字母,直接返回
first_char = ord(s[0].upper())
# print first_char
if ord('A') <= first_char <= ord('Z'):
return s[0].upper()
# 处理汉字情况
s = s.encode('gb18030', 'ignore')
# print len(s)
if len(s) < 2:
return ''
asc = ord(s[0])*256 + ord(s[1])-65536
if -20319 <= asc <= -20284:
return 'A'
if -20283 <= asc <= -19776:
return 'B'
if -19775 <= asc <= -19219:
return 'C'
if -19218 <= asc <= -18711:
return 'D'
if -18710 <= asc <= -18527:
return 'E'
if -18526 <= asc <= -18240:
return 'F'
if -18239 <= asc <= -17923:
return 'G'
if -17922 <= asc <= -17418:
return 'H'
if -17417 <= asc <= -16475:
return 'J'
if -16474 <= asc <= -16213:
return 'K'
if -16212 <= asc <= -15641:
return 'L'
if -15640 <= asc <= -15166:
return 'M'
if -15165 <= asc <= -14923:
return 'N'
if -14922 <= asc <= -14915:
return 'O'
if -14914 <= asc <= -14631:
return 'P'
if -14630 <= asc <= -14150:
return 'Q'
if -14149 <= asc <= -14091:
return 'R'
if -14090 <= asc <= -13319:
return 'S'
if -13318 <= asc <= -12839:
return 'T'
if -12838 <= asc <= -12557:
return 'W'
if -12556 <= asc <= -11848:
return 'X'
if -11847 <= asc <= -11056:
return 'Y'
if -11055 <= asc <= -10247:
return 'Z'
return ''
if __name__ == '__main__':
print is_chinese(u'你好啊') # True
print is_chinese(u'你abc') # True
print is_chinese(u'abc你') # False
print get_first_char('abc你') # A
print get_first_char('q') # Q
print get_first_char('你好') # N