python3-cookbook/exts/smallseg.py at master · pythoningLearning/python3-cookbook

History

144 lines (138 loc) · 4.66 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

# -*- coding: utf-8 -*-

import re

import os

import sys

class SEG(object):
    """Dictionary-driven word segmenter that scans text from right to left.

    Dictionary words are stored reversed in a nested-dict trie (``self.d``):
    following the characters of a word from its last to its first leads to a
    node that contains the marker key ``u"\\x0b"``.  A key whose subtree is
    still empty maps to ``''`` instead of a dict.

    ``self.specialwords`` is a set of single characters loaded from
    ``suffix.dic``; ``cut`` treats a short match that ends in one of them as
    tentative and may undo it.

    The code runs unchanged on Python 2 and Python 3.
    """

    # Trie key marking "a dictionary word starts here" (words are stored
    # reversed, so the marker is the last key on a word's path).
    _WORD_START = u"\x0b"

    # Longest accepted dictionary entry *including* the marker character.
    _MAX_KEY_LEN = 5

    # Punctuation that is turned into a separator before unknown text is
    # split.  Runtime value is identical to the original pattern; the
    # backslashes are doubled so the literal contains no invalid escapes and
    # the trailing ideographic space is spelled out as \u3000.
    _PUNCT_RE = re.compile(
        u"。|，|,|！|…|!|《|》|<|>|\"|'|:|：|？|\\?|、|\\||“|”|‘|’|；|—|（|）|·|\\(|\\)|\u3000"
    )

    # A run of ASCII letters/digits and a few symbols is kept as one token.
    _TOKEN_RE = re.compile(r"([0-9A-Za-z\-\+#@_\.]+)")

    # ASCII word character or a code point in U+2E80..U+9FFF.  Written out
    # explicitly (instead of \w) so Python 3 keeps the ASCII-only meaning
    # that \w had in the original Python 2 pattern.
    _WORDISH_RE = re.compile(u"[0-9A-Za-z_\u2E80-\u9FFF]")

    def __init__(self):
        """Load ``main.dic`` and ``suffix.dic`` from this module's directory.

        Progress messages are written to ``sys.stderr``.
        """
        _localDir = os.path.dirname(__file__)
        curpath = os.path.normpath(os.path.join(os.getcwd(), _localDir))
        self.d = {}
        sys.stderr.write("loading dict...\n")
        self.set(self._read_lines(os.path.join(curpath, "main.dic")))
        # ``set`` below is the builtin: method names are not visible as bare
        # names inside other methods.
        self.specialwords = set(
            self._read_lines(os.path.join(curpath, "suffix.dic"))
        )
        sys.stderr.write("dict ok.\n")

    @staticmethod
    def _read_lines(path):
        """Return the right-stripped, UTF-8 decoded lines of *path*.

        The file is opened in binary mode and closed before returning, so the
        result is text on both Python 2 and Python 3.
        """
        with open(path, "rb") as handle:
            return [line.rstrip().decode("utf-8") for line in handle]

    def set(self, keywords):
        """Add every word of *keywords* to the trie ``self.d``.

        :param keywords: iterable of words, each either UTF-8 encoded bytes
            or text.  Words longer than four characters are skipped.
        """
        for word in keywords:
            if isinstance(word, bytes):
                word = word.decode('utf-8')
            word = self._WORD_START + word
            if len(word) > self._MAX_KEY_LEN:
                continue
            node = self.d
            parent = None
            key = None
            # Insert last character first, so the marker ends up deepest.
            for char in reversed(word):
                char = char.lower()
                if node == '':
                    # The previous key was an empty leaf; give it a real
                    # subtree now that something follows it.
                    node = parent[key] = {}
                if char not in node:
                    node[char] = ''
                parent = node
                key = char
                node = node[char]

    def _binary_seg(self, s):
        """Return the overlapping two-character slices of *s*, last first.

        A one-character string is returned as a single-element list; an
        empty string yields an empty list.
        """
        ln = len(s)
        if ln == 1:
            return [s]
        return [s[end - 2:end] for end in range(ln, 1, -1)]

    def _pro_unreg(self, piece):
        """Split text that matched no dictionary word into fallback tokens.

        Punctuation is replaced by spaces and the text is split on
        whitespace.  Within each chunk, ASCII runs matched by ``_TOKEN_RE``
        are kept whole and everything else is cut into overlapping pairs by
        ``_binary_seg``.  Chunks and their parts are emitted right to left.

        :param piece: text to split.
        :returns: list of tokens, rightmost first.
        """
        tokens = []
        chunks = self._PUNCT_RE.sub(" ", piece).split()
        for chunk in reversed(chunks):
            for part in reversed(self._TOKEN_RE.split(chunk)):
                if self._TOKEN_RE.search(part) is not None:
                    tokens.append(part)
                else:
                    # Also covers the empty strings re.split() produces
                    # around a match: _binary_seg('') contributes nothing.
                    tokens.extend(self._binary_seg(part))
        return tokens

    def cut(self, text):
        """Segment *text* and return the pieces in right-to-left order.

        :param text: UTF-8 encoded bytes (undecodable bytes are ignored) or
            text.
        :returns: list of segments; the segment nearest the end of *text*
            comes first.  Dictionary words are returned as they appear in
            *text* (lookup is case-insensitive); stretches without a
            dictionary match are split by ``_pro_unreg``.
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')
        root = self.d
        marker = self._WORD_START
        p = root            # current trie node
        ln = len(text)
        i = ln              # exclusive right edge of the current candidate
        j = 0               # characters matched leftwards from i
        z = ln              # exclusive right edge of not-yet-emitted text
        recognised = []
        mem = None          # (i, j, z) of the last short (<= 2 chars) match
        mem2 = None         # (i, j, z, len(recognised)) of a tentative match
                            # ending in a specialwords character
        while i - j > 0:
            t = text[i - j - 1].lower()
            if t not in p:
                if mem is None and mem2 is None:
                    # No match to fall back on: step over one character.
                    j = 0
                    i -= 1
                    p = root
                    continue
                if mem is not None:
                    # Fall back to the remembered short match.
                    i, j, z = mem
                    mem = None
                else:
                    delta = mem2[0] - i
                    if (1 <= delta < 5
                            and self._WORDISH_RE.search(t) is not None
                            and text[i - j] not in self.specialwords):
                        # Re-instate the tentative match and discard what
                        # was emitted after it was set aside.
                        i, j, z, mark = mem2
                        del recognised[mark:]
                    # Always cleared, so this branch cannot be re-entered
                    # with the same state.
                    mem2 = None
                p = root
                if i < ln and i < z:
                    recognised.extend(self._pro_unreg(text[i:z]))
                recognised.append(text[i - j:i])
                i -= j
                z = i
                j = 0
                continue
            p = p[t]
            j += 1
            if marker in p:
                if j <= 2:
                    # Short match: remember it and keep looking for a
                    # longer one.
                    mem = i, j, z
                    if (z - i < 2
                            and text[i - 1] in self.specialwords
                            and (mem2 is None or mem2[0] - i > 1)):
                        # The match ends in a specialwords character: set it
                        # aside and retry one character further left.
                        mem = None
                        mem2 = i, j, z, len(recognised)
                        p = root
                        i -= 1
                        j = 0
                    continue
                # Match longer than two characters: emit immediately.
                p = root
                if i < ln and i < z:
                    recognised.extend(self._pro_unreg(text[i:z]))
                recognised.append(text[i - j:i])
                i -= j
                z = i
                j = 0
                mem = None
                mem2 = None
        if mem is not None:
            i, j, z = mem
            recognised.extend(self._pro_unreg(text[i:z]))
            recognised.append(text[i - j:i])
        else:
            recognised.extend(self._pro_unreg(text[i - j:z]))
        return recognised

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

smallseg.py

Latest commit

History

smallseg.py

File metadata and controls