Original: The Reverse Maximum Matching Algorithm for Chinese Word Segmentation

Reverse maximum matching is the most basic algorithm in mechanical (dictionary-based) Chinese word segmentation, and an entry-level one at that.

Even so, among mechanical segmentation methods its results are very good.

It performs especially well on large texts, where it pays to try a longer candidate span on each match: the probability that a long span forms a dictionary word is far higher in large texts than in small ones.
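To make the idea concrete before diving into the IK-based code, here is a minimal, self-contained Java sketch of reverse maximum matching. The Set<String> dictionary and the MAX_LEN constant of 7 are illustrative assumptions, not IK's actual data structures:

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

public class ReverseMaxMatch {
    // assumed maximum word length; the IK-based code below also uses 7
    private static final int MAX_LEN = 7;

    public static List<String> segment(String text, Set<String> dict) {
        LinkedList<String> result = new LinkedList<String>();
        int end = text.length();
        while (end > 0) {
            // start with the longest window ending at 'end'
            int len = Math.min(MAX_LEN, end);
            // shrink the window from the left until it hits a dictionary word
            while (len > 1 && !dict.contains(text.substring(end - len, end))) {
                len--;
            }
            // either a dictionary word or a single leftover character
            result.addFirst(text.substring(end - len, end));
            end -= len;
        }
        return result;
    }

    public static void main(String[] args) {
        Set<String> dict = new HashSet<String>(Arrays.asList("中文", "分词", "算法"));
        System.out.println(segment("中文分词算法", dict)); // [中文, 分词, 算法]
    }
}

Because the scan runs from the end of the text toward the beginning, each matched word is prepended with addFirst; this backward direction is what distinguishes reverse maximum matching from its forward counterpart.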

In the field of Chinese word segmentation, IK Analyzer only scratches the surface; it is little more than a shell, hardly real segmentation at all.

Within Chinese word segmentation, using CRFs for disambiguation is the mainstream approach. In the wider NLP field, RNNs are the dominant technique: as of 2016 they had been applied successfully throughout NLP, and they play an important role even in computer vision.

At present the open-source NLP community offers the HanLP source package, which includes both an ultra-fast segmenter and a disambiguating segmenter with excellent performance.

The code below is taken from part of the IK Analyzer source package, which I reworked in my spare time to use the reverse maximum matching algorithm; consider it entry-level segmentation.

package org.wltea.analyzer.core;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
/**
* Chinese word segmentation context
* @author TongXueQiang
* @date 2016/01/22
* @since 1.7
*/
class AnalyzeContext {
private char[] segmentBuff; // character buffer being segmented
private int[] charTypes; // type of each character in segmentBuff
private int buffOffset; // offset of the buffer relative to the full input
private int cursor; // current position in the buffer
private int available; // number of valid characters in the buffer
private Set<String> buffLocker; // names of segmenters currently locking the buffer
private QuickSortSet orgLexemes; // raw lexeme results
private Map<Integer, LexemePath> pathMap; // lexeme paths keyed by begin position
private LinkedList<Lexeme> results; // final segmentation results
private Configuration cfg; // segmenter configuration
private Integer moveIndex; // backward step used by moveCursor()
public AnalyzeContext(Configuration cfg) {
this.cfg = cfg;
this.segmentBuff = new char[4096];
this.charTypes = new int[4096];
this.buffLocker = new HashSet<String>();
this.orgLexemes = new QuickSortSet();
this.pathMap = new HashMap<Integer, LexemePath>();
this.results = new LinkedList<Lexeme>();
}
int getCursor() {
return this.cursor;
}
char[] getSegmentBuff() {
return this.segmentBuff;
}
char getCurrentChar() {
return this.segmentBuff[this.cursor];
}
int getCurrentCharType() {
return this.charTypes[this.cursor];
}
int getBufferOffset() {
return this.buffOffset;
}
/**
* Fill the buffer with characters from the reader
* @param reader
* @return the number of characters available after filling
* @throws IOException
*/
int fillBuffer(Reader reader) throws IOException {
int readCount = 0;
if (this.buffOffset == 0) {
readCount = reader.read(this.segmentBuff);
} else {
int offset = this.available - this.cursor;
if (offset > 0) {
System.arraycopy(this.segmentBuff, this.cursor,
this.segmentBuff, 0, offset);
readCount = offset;
}
readCount += reader.read(this.segmentBuff, offset, this.segmentBuff.length - offset);
}
this.available = readCount;
this.cursor = 0;
return readCount;
}
void initCursor() {
this.cursor = this.available-1;
// regularize the character
this.segmentBuff[this.cursor] = CharacterUtil
.regularize(this.segmentBuff[this.cursor]);
// identify the character type, e.g. Arabic numeral, Latin letter, etc.
this.charTypes[this.cursor] = CharacterUtil
.identifyCharType(this.segmentBuff[this.cursor]);
}
boolean moveCursor() {
if ((this.cursor-moveIndex) > 0) {
this.cursor -= (moveIndex+1);
//System.out.println("cursor position after move: " + cursor);
// after moving the cursor, regularize the current character again
this.segmentBuff[this.cursor] = CharacterUtil
.regularize(this.segmentBuff[this.cursor]);
// identify the type of the current character
this.charTypes[this.cursor] = CharacterUtil
.identifyCharType(this.segmentBuff[this.cursor]);
return true;
}
return false;
}
void lockBuffer(String segmenterName) {
this.buffLocker.add(segmenterName);
}
void unlockBuffer(String segmenterName) {
this.buffLocker.remove(segmenterName);
}
boolean isBufferLocked() {
return (this.buffLocker.size() > 0);
}
boolean isBufferConsumed() {
return (this.cursor == this.available - 1);
}
boolean needRefillBuffer() {
return ((this.available == 4096) && (this.cursor < this.available - 1) && (this.cursor > this.available - 100) && (!(isBufferLocked())));
}
void markBufferOffset() {
this.buffOffset += this.cursor;
}
void addLexeme(Lexeme lexeme) {
this.orgLexemes.addLexeme(lexeme);
}
void addLexemePath(LexemePath path) {
if (path != null)
this.pathMap.put(Integer.valueOf(path.getPathBegin()), path);
}
QuickSortSet getOrgLexemes() {
return this.orgLexemes;
}
/**
* Output the result set
*/
void outputToResult() {
int index = 0;
while (index <= this.cursor) {
LexemePath path = (LexemePath) this.pathMap.get(Integer
.valueOf(index));
if (path != null) {
Lexeme l = path.pollFirst();
if (l != null) {
this.results.add(l);
index = l.getBegin() + l.getLength();
this.cursor = index;
}
} else {
outputSingleCJK(index);
++index;
}
}
this.pathMap.clear();
}
private void outputSingleCJK(int index) {
Lexeme singleCharLexeme;
if (4 == this.charTypes[index]) {
singleCharLexeme = new Lexeme(this.buffOffset, index, 1, 64);
this.results.add(singleCharLexeme);
} else if (8 == this.charTypes[index]) {
singleCharLexeme = new Lexeme(this.buffOffset, index, 1, 8);
this.results.add(singleCharLexeme);
}
}
/**
* Fetch the next lexeme and assign its text
* @return
*/
Lexeme getNextLexeme() {
Lexeme result = (Lexeme) this.results.pollFirst();
while (result != null) {
compound(result); // merge numerals with quantifiers
// filter out stop words
if (Dictionary.getSingleton().isStopWord(this.segmentBuff,
result.getBegin(), result.getLength())) {
//System.out.println(Dictionary.getSingleton().isStopWord(this.segmentBuff, //result.getBegin(), result.getLength()));
result = (Lexeme) this.results.pollFirst();
} else {
// assign the matched text to the Lexeme
result.setLexemeText(String.valueOf(this.segmentBuff, result.getBegin(), result.getLength()));
break;
}
}
return result;
}
void reset() {
this.buffLocker.clear();
this.orgLexemes = new QuickSortSet();
this.available = 0;
this.buffOffset = 0;
this.charTypes = new int[4096];
this.cursor = 0;
this.results.clear();
this.segmentBuff = new char[4096];
this.pathMap.clear();
}
/**
* Merge numeral and quantifier lexemes
* @param result
*/
private void compound(Lexeme result) {
if (!this.cfg.useSmart()) {
return;
}
if (this.results.isEmpty())
return;
Lexeme nextLexeme;
boolean appendOk;
if (2 == result.getLexemeType()) {
nextLexeme = (Lexeme) this.results.peekFirst();
appendOk = false;
if (16 == nextLexeme.getLexemeType()) {
appendOk = result.append(nextLexeme, 16);
} else if (32 == nextLexeme.getLexemeType()) {
appendOk = result.append(nextLexeme, 48);
}
if (appendOk) {
this.results.pollFirst();
}
}
if ((16 == result.getLexemeType()) && (!(this.results.isEmpty()))) {
nextLexeme = (Lexeme) this.results.peekFirst();
appendOk = false;
if (32 == nextLexeme.getLexemeType()) {
appendOk = result.append(nextLexeme, 48);
}
if (!(appendOk))
return;
this.results.pollFirst();
}
}
public void setMoveIndex(Integer moveIndex) {
this.moveIndex = moveIndex;
}
}
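Note that the context is set up for backward scanning: initCursor() places the cursor at the end of the buffer (available - 1), and moveCursor() moves it toward the beginning, which is what makes the matching below reverse rather than forward.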
The CJK reverse maximum matching algorithm follows:
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
/**
* CJK (Chinese, Japanese, Korean) segmenter using reverse maximum matching
*
* @author TongXueQiang
* @date 2016/01/20
* @since 1.7
*/
class CJKSegmenter implements ISegmenter {
static final String SEGMENTER_NAME = "CJK_SEGMENTER";
static Integer MATCH_LEN = 7; // maximum length of the matching window
static Integer moveIndex = MATCH_LEN - 1; // backward offset from the cursor to the window start
CJKSegmenter() {
}
/*
* Reverse maximum matching algorithm
*
* @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.
* AnalyzeContext)
*/
public void analyze(AnalyzeContext context) {
if (context.getCursor() < moveIndex) {
moveIndex = context.getCursor();
MATCH_LEN = context.getCursor() + 1;
}
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(
context.getSegmentBuff(), context.getCursor() - moveIndex,
MATCH_LEN);
if (singleCharHit.isMatch() || MATCH_LEN == 1) {
Lexeme newLexeme = new Lexeme(context.getBufferOffset(),
context.getCursor() - moveIndex, MATCH_LEN, 4);
context.addLexeme(newLexeme);
context.setMoveIndex(moveIndex);
init();
} else {
// no dictionary hit: shrink the matching window from the left by one
// character and retry
--moveIndex;
--MATCH_LEN;
analyze(context);
}
}
private void init() {
moveIndex = 6;
MATCH_LEN = 7;
}
@Override
public void reset() {
}
}
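To trace analyze() on a hypothetical input: suppose the cursor sits on the last character of 中文分词 and MATCH_LEN has been clamped to 4 (it starts at 7, but near the beginning of the buffer it is reduced to cursor + 1). The segmenter first probes the dictionary with the full window 中文分词; on a miss it decrements both moveIndex and MATCH_LEN and recurses, so the window shrinks from the left to 文分词, then 分词, and so on, until it either finds a dictionary word or is left with the single character under the cursor, which is emitted as a one-character lexeme. After each emission, init() restores the full 7-character window for the next position.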
The revolution is not yet complete; comrades, we must keep striving! Focus on machine learning theory, pursue theoretical breakthroughs, and turn them into code. Drill the lowest-level fundamentals with perseverance, balance theory and programming, and become an indispensable talent; that is how to become a first-rate expert!
